I have been trying to use a TCP connection and mp.spawn to run model parallelism across two GPUs. But after executing a few batches (about 4) on CIFAR10, the outputs of both sub-models become nan. Why does this happen? When I train on a single GPU, it is fine.
code for send and recv
import torch
import torch.autograd as autograd
import torch.distributed as dist

class RemoteSend(autograd.Function):
    @staticmethod
    def forward(ctx, input: torch.Tensor, to_rank: int, from_rank: int, self_rank: int):
        # send the number of dimensions, then the shape, then the tensor itself
        dist.send((torch.tensor(input.dim()) * torch.tensor(1.0)).to(self_rank), to_rank)
        dist.send((torch.tensor(input.size()) * torch.tensor(1.0)).to(self_rank), to_rank)
        print("forward send size", torch.tensor(input.size()) * 1.0)
        dist.send(input, to_rank)
        to_rank = torch.tensor(to_rank)
        from_rank = torch.tensor(from_rank)
        ctx.save_for_backward(to_rank, from_rank)
        return input

    @staticmethod
    def backward(ctx, grad_output):
        to_rank, from_rank = ctx.saved_tensors
        # receive the gradient of the sent tensor from the other rank
        dist.recv(grad_output, int(from_rank))
        return grad_output


class RemoteReceive(autograd.Function):
    @staticmethod
    def forward(ctx, to_rank: int = 0, from_rank: int = 1, self_rank: int = 0):
        # receive the number of dimensions, then the shape, then the tensor itself
        dim = torch.tensor(1.0).to(self_rank)
        dist.recv(dim, from_rank)
        #print("forward recv dim", dim)
        size = torch.rand(int(dim)).to(self_rank)
        dist.recv(size, from_rank)
        #print("forward recv size", size)
        x = torch.zeros(tuple(size.int())).to(self_rank)
        #print("try to get input with size", x.size())
        dist.recv(x, from_rank)
        #print("forward recv finish")
        to_rank = torch.tensor(to_rank)
        from_rank = torch.tensor(from_rank)
        ctx.save_for_backward(to_rank, from_rank)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        to_rank, from_rank = ctx.saved_tensors
        # send the gradient of the received tensor back to the sender
        dist.send(torch.tensor(grad_output.dim() * 1.0), int(to_rank))
        dist.send(torch.tensor(grad_output.size()) * 1.0, int(to_rank))
        dist.send(grad_output, int(to_rank))
        return None
When applied, these functions pass activations to the other GPU in the forward pass and exchange the corresponding gradients in the backward pass.
The training code is below.
import torch
import torch.nn as nn
import torch.distributed as dist
import torchvision
import torchvision.transforms as transforms

def main_worker(gpu, world_size):
    dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:1224',
                            world_size=2, rank=gpu)
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=16,
                                              shuffle=True, num_workers=12)
    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=16,
                                             shuffle=False, num_workers=12)
    model = torchvision.models.MobileNetV2(num_classes=10)
    if gpu == 0:
        # first part of the network runs on GPU 0
        model1 = model.features[0:5]
        criterion = nn.CrossEntropyLoss().cuda(0)
        optimizer1 = torch.optim.SGD(model1.parameters(), lr=0.1, weight_decay=1e-4)
        model1.cuda(0)
        for i, (images, target) in enumerate(trainloader):
            images = images.cuda(0, non_blocking=True)
            output = model1(images)
            #print(output)
            RemoteSend.apply(output, 1, 1, 0)
            optimizer1.zero_grad()
            output.backward(output)
            optimizer1.step()
    if gpu == 1:
        # second part of the network plus classifier runs on GPU 1
        # Reshape is a small custom module (defined elsewhere) that reshapes to (-1, 1280)
        model2 = nn.Sequential(model.features[5:], Reshape(-1, 1280), model.classifier)
        criterion = nn.CrossEntropyLoss().cuda(1)
        optimizer2 = torch.optim.SGD(model2.parameters(), lr=0.1, weight_decay=1e-4)
        model2.cuda(1)
        for i, (images, target) in enumerate(trainloader):
            target = target.cuda(1, non_blocking=True)
            input = torch.tensor([16, 32, 4, 4])
            input = RemoteReceive.apply(0, 0, 1)
            input = input.cuda(1)
            #print(input)
            output = model2(input)
            #print(output)
            loss = criterion(output, target)
            optimizer2.zero_grad()
            loss.backward()
            optimizer2.step()
            print(loss)
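The two workers are launched with mp.spawn; the launcher is roughly the following (a simplified sketch, matching the world_size=2 and TCP init_method used above):

import torch.multiprocessing as mp

if __name__ == '__main__':
    world_size = 2  # one process per GPU
    # mp.spawn passes the process index (0 or 1) as the first argument,
    # which main_worker uses directly as the GPU id / rank
    mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True)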
I divided MobileNetV2 into two parts.
For the first 4 batches the model gets normal results; after that, the loss and the model output become nan.
Here are the settings:
Dataset: CIFAR10
lr: 0.1
batch size: 512
image size: 32*32
I don’t believe these settings are related to the nan, since I have trained with the same settings on one GPU and it works fine.
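For comparison, the single-GPU run I mean is essentially the following (a minimal sketch, reusing the same data pipeline, loss, and optimizer settings as above):

# rough sketch of the single-GPU baseline that trains without nan
model = torchvision.models.MobileNetV2(num_classes=10).cuda(0)
criterion = nn.CrossEntropyLoss().cuda(0)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, weight_decay=1e-4)

for i, (images, target) in enumerate(trainloader):
    images = images.cuda(0, non_blocking=True)
    target = target.cuda(0, non_blocking=True)
    output = model(images)
    loss = criterion(output, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss)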