Hi, I am new to PyTorch's DistributedDataParallel. I am training an object detection model: it reaches 79 mAP with nn.DataParallel, but with DistributedDataParallel it only reaches 50+ mAP.
Every hyperparameter is identical in both setups except the batch size. With nn.DataParallel I use a batch size of 32 across 4 GPUs; with DistributedDataParallel I use a batch size of 8 per GPU on 4 GPUs, so the total batch size is the same. Is my approach correct? Why does DistributedDataParallel perform worse?
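To make the comparison concrete, this is the batch-size arithmetic I have in mind (a minimal sketch; world_size here just stands for the number of DDP processes):

# nn.DataParallel: one process, one DataLoader, each batch split across 4 GPUs
dp_global_batch = 32

# DistributedDataParallel: 4 processes, each loading its own batches
ddp_batch_per_gpu = 8
world_size = 4
ddp_global_batch = ddp_batch_per_gpu * world_size  # 8 * 4 = 32

assert dp_global_batch == ddp_global_batch  # same number of samples per optimizer step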
Here is train.py with nn.DataParallel:
# train.py
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
import torch
from model_dla34 import get_pose_net
from loss import CtdetLoss, AverageMeter
from torch.utils.data import DataLoader
from datasets import My_certernet_datasets
import time

datasets = 'pascal'
batch_size = 32
start_epoch = 0
end_epoch = 70
init_lr = 1.25e-4
device = torch.device('cuda')  # all training runs on the visible GPUs
save_dir = './checkpoints'  # checkpoint output directory (exact path assumed)
def main():
    train_data = My_certernet_datasets(mode='train', datasets=datasets)
    print('there are {} train images'.format(len(train_data)))
    train_data_loader = DataLoader(dataset=train_data,
                                   num_workers=16,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   pin_memory=True)
    model = get_pose_net()
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model = model.to(device)
    criterion = CtdetLoss()
    optimizer = torch.optim.Adam(model.parameters(), init_lr)
    for epoch in range(start_epoch + 1, end_epoch + 1):
        adjust_lr(optimizer, epoch, init_lr)
        train(train_data_loader, model, criterion, optimizer, device, epoch, end_epoch)
        if epoch % 10 == 0:
            save_model(save_dir + '/model_last.pth', epoch, model, optimizer=None)
def train(train_data_loader, model, criterion, optimizer, device, epoch, end_epoch):
    losses = AverageMeter()
    model.train()
    for i, batch in enumerate(train_data_loader):
        start_time = time.time()
        # move every tensor in the batch to the GPU, except the metadata entry
        for k in batch:
            if k != 'meta':
                batch[k] = batch[k].to(device)
        output = model(batch['input'])
        loss_stats = criterion(output, batch)
        loss = loss_stats['loss']
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        end_time = time.time()
        # rough ETA: duration of this iteration times the number of remaining iterations
        ELA_time = (end_time - start_time) * (end_epoch - epoch) * len(train_data_loader)
        ELA_time = time.strftime('%H:%M:%S', time.gmtime(ELA_time))
        losses.update(loss.item())
        print('[epoch:{},{}/{}]'.format(epoch, i, len(train_data_loader)),
              'current_loss:%.4f' % losses.current,
              'average_loss:%.4f' % losses.avg, 'ELA_time:', ELA_time)
def adjust_lr(optimizer, epoch, init_lr, lr_step=[45, 60]):
    # drop the learning rate by 10x at each milestone epoch
    if epoch in lr_step:
        lr = init_lr * (0.1 ** (lr_step.index(epoch) + 1))
        print('Drop LR to', lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
def save_model(path, epoch, model, optimizer=None):
    # unwrap DataParallel so the checkpoint keys carry no 'module.' prefix
    if isinstance(model, torch.nn.DataParallel):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    data = {'epoch': epoch, 'state_dict': state_dict}
    if optimizer is not None:
        data['optimizer'] = optimizer.state_dict()
    torch.save(data, path)

if __name__ == '__main__':
    main()
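In case it matters, this is roughly how I load a checkpoint for the mAP evaluation (a minimal sketch only; my real eval script is longer and is not shown here):

# load_checkpoint.py -- sketch of the loading step used before evaluation
import torch
from model_dla34 import get_pose_net

model = get_pose_net()
checkpoint = torch.load('model_last.pth', map_location='cpu')  # {'epoch': ..., 'state_dict': ...}
model.load_state_dict(checkpoint['state_dict'])
model.eval()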
And here is train_DDP.py with DistributedDataParallel:
# train_DDP.py
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
import torch
from model_dla34 import get_pose_net
from loss import CtdetLoss, AverageMeter
from torch.utils.data import DataLoader
from datasets import My_certernet_datasets
import time
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel

torch.distributed.init_process_group(backend='nccl')
datasets = 'pascal'
batch_size = 8  # per-GPU batch size; 4 processes give a total of 32
start_epoch = 0
end_epoch = 70
init_lr = 1.25e-4
save_dir = './checkpoints'  # checkpoint output directory (exact path assumed)
local_rank = torch.distributed.get_rank()  # global rank; equals the local rank on a single node
torch.cuda.set_device(local_rank)
device = torch.device('cuda', local_rank)
def main():
    model = get_pose_net()
    model.to(device)
    model_val = model  # keep a handle to the unwrapped model
    if torch.cuda.device_count() > 1:
        print("let's use GPU{}!!!".format(local_rank))
        model = DistributedDataParallel(model, device_ids=[local_rank],
                                        output_device=local_rank,
                                        find_unused_parameters=True)
    train_data = My_certernet_datasets(mode='train', datasets=datasets)
    print('there are {} train images'.format(len(train_data)))
    train_data_loader = DataLoader(dataset=train_data,
                                   num_workers=16,
                                   batch_size=batch_size,
                                   sampler=DistributedSampler(train_data))
    criterion = CtdetLoss()
    optimizer = torch.optim.Adam(model.parameters(), init_lr)
    for epoch in range(start_epoch + 1, end_epoch + 1):
        DistributedSampler(train_data).set_epoch(epoch)
        adjust_lr(optimizer, epoch, init_lr)
        train(train_data_loader, model, criterion, optimizer, device, epoch, end_epoch)
        if epoch % 10 == 0 and local_rank == 0:
            save_model(save_dir + '/model_last_{}.pth'.format(epoch), epoch, model, optimizer=None)
def train(train_data_loader, model, criterion, optimizer, device, epoch, end_epoch):
    losses = AverageMeter()
    model.train()
    for i, batch in enumerate(train_data_loader):
        start_time = time.time()
        for k in batch:
            if k != 'meta':
                batch[k] = batch[k].to(device)
        output = model(batch['input'])
        loss_stats = criterion(output, batch)
        loss = loss_stats['loss']
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        end_time = time.time()
        ELA_time = (end_time - start_time) * (end_epoch - epoch) * len(train_data_loader)
        ELA_time = time.strftime('%H:%M:%S', time.gmtime(ELA_time))
        losses.update(loss.item())
        if local_rank == 0:  # only the first process prints
            print('[epoch:{},{}/{}]'.format(epoch, i, len(train_data_loader)),
                  'current_loss:%.4f' % losses.current,
                  'average_loss:%.4f' % losses.avg, 'ELA_time:', ELA_time)
def adjust_lr(optimizer, epoch, init_lr, lr_step=[45, 60]):
    if epoch in lr_step:
        lr = init_lr * (0.1 ** (lr_step.index(epoch) + 1))
        print('Drop LR to', lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
def save_model(path, epoch, model, optimizer=None):
    if isinstance(model, torch.nn.DataParallel):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    data = {'epoch': epoch, 'state_dict': state_dict}
    if optimizer is not None:
        data['optimizer'] = optimizer.state_dict()
    torch.save(data, path)

if __name__ == '__main__':
    main()
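For completeness, train_DDP.py relies on the rank/world-size environment variables that the launch utility sets, so it is started with one process per GPU, e.g. python -m torch.distributed.launch --nproc_per_node=4 train_DDP.py (torchrun --nproc_per_node=4 train_DDP.py does the same on newer PyTorch versions).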
Can someone help me? This has been bothering me for a few days. Thank you very much!