Thanks a lot.
I have managed to add distributed training to my code, but I get an error when I call backward(). My code can be simplified like this:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import argparse


def parse_args():
    parse = argparse.ArgumentParser()
    parse.add_argument(
        '--local_rank',
        dest='local_rank',
        type=int,
        default=-1,
    )
    return parse.parse_args()


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(64, 256, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
        self.linear = nn.Linear(512, 10)

    def forward(self, x):
        H, W = x.size()[2:]
        x = self.conv1(x)
        x = self.conv2(x)
        logits = self.conv3(x)
        logits = F.interpolate(logits, (H, W), mode='bilinear')
        return logits


def train():
    args = parse_args()
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(
        backend='nccl',
        init_method='env://',
    )
    net = Net()
    net.train()
    net.cuda()
    net = nn.parallel.DistributedDataParallel(
        net,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
    )
    optim = torch.optim.SGD(
        net.parameters(),
        lr=1e-3,
        momentum=0.9,
        weight_decay=5e-4,
    )
    criteria = nn.CrossEntropyLoss()
    for i in range(10000):
        img = torch.randn(2, 3, 768, 768).cuda()
        lb = torch.randint(0, 19, [2, 768, 768]).cuda()
        optim.zero_grad()
        out = net(img)
        loss = criteria(out, lb)
        loss.backward()
        optim.step()


if __name__ == "__main__":
    train()
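For completeness, I launch the script with torch.distributed.launch (which sets the env:// variables and passes --local_rank); two processes here is just an example:

python -m torch.distributed.launch --nproc_per_node=2 playdist.py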
Then this error appears:
/home/zhangzy/.local/lib/python3.5/site-packages/torch/nn/functional.py:2375: UserWarning: Default upsampling behavior when mode=bilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
  "See the documentation of nn.Upsample for details.".format(mode))
Traceback (most recent call last):
  File "playdist.py", line 85, in <module>
    train()
  File "playdist.py", line 80, in train
    loss.backward()
  File "/home/zhangzy/.local/lib/python3.5/site-packages/torch/tensor.py", line 102, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/zhangzy/.local/lib/python3.5/site-packages/torch/autograd/__init__.py", line 90, in backward
    allow_unreachable=True)  # allow_unreachable flag
  File "/home/zhangzy/.local/lib/python3.5/site-packages/torch/nn/parallel/distributed.py", line 384, in distributed_data_parallel_hook
    self._queue_reduction(bucket_idx)
  File "/home/zhangzy/.local/lib/python3.5/site-packages/torch/nn/parallel/distributed.py", line 413, in _queue_reduction
    self.device_ids)
TypeError: _queue_reduction(): incompatible function arguments. The following argument types are supported:
    1. (process_group: torch.distributed.ProcessGroup, grads_batch: List[List[at::Tensor]], devices: List[int]) -> Tuple[torch.distributed.Work, at::Tensor]
Invoked with: <torch.distributed.ProcessGroupNCCL object at 0x7fe99236d6c0>, [[tensor([[[[0., 0., 0.],
....
The rest of the output is the printed tensor values.
The same cross-entropy loss works fine for image classification tasks, but it fails when I compute it for pixel-level classification like this. What is wrong with my code, please?
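For comparison, here is a minimal sketch of what I mean by the two cases; the shapes are illustrative, not my real training code, and both run without error in a single process:

import torch
import torch.nn as nn

criteria = nn.CrossEntropyLoss()

# image classification: logits (N, C) with labels (N,) -- this case trains fine
out = torch.randn(2, 10, requires_grad=True)
lb = torch.randint(0, 10, [2])
criteria(out, lb).backward()  # no error

# pixel classification: logits (N, C, H, W) with labels (N, H, W) is also a
# valid input for CrossEntropyLoss, and backward() succeeds here too
out = torch.randn(2, 19, 768, 768, requires_grad=True)
lb = torch.randint(0, 19, [2, 768, 768])
criteria(out, lb).backward()  # only fails for me under DistributedDataParallel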