Hello,
I am trying to train a model that is based on the timm library, using two GPUs via nn.DataParallel.
The body of my main function looks like this:
model = DPTDepthModel(
path=None,#model_path,
scale=0.00006016,
shift=0.00579,
invert=True,
backbone=“vitb_rn50_384”,
non_negative=True,
enable_attention_hooks=False,
)
loss_function = MSE()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, )
if torch.cuda.device_count() > 1:
print("Let's use", torch.cuda.device_count(), "GPUs!")
torch.cuda.empty_cache()
model = nn.DataParallel(model, device_ids = [0, 1])
model.cuda()
print("Model and Loss Function are on multiple GPUs now")
for epoch in range(0, 5):
newmodel=train(model,train_loader,loss_function,optimizer,epoch)
torch.save(newmodel.state_dict(), "model"+ str(epoch) +".pt")
return
Running this produces the following error:
File “train.py”, line 482, in main2
newmodel=train(model,train_loader,loss_function,optimizer,epoch)
File “train.py”, line 381, in train
output=model(input.float().cuda())#to(torch.device(‘cuda:1’)))
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 1102, in _call_impl
return forward_call(*input, **kwargs)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py”, line 168, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py”, line 178, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py”, line 86, in parallel_apply
output.reraise()
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/_utils.py”, line 434, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py”, line 61, in _worker
output = module(*input, **kwargs)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 1102, in _call_impl
return forward_call(*input, **kwargs)
File “/home/talha/SemAttNet-Thesis-Code-main/dpt/models.py”, line 165, in forward
inv_depth = super().forward(x).squeeze(dim=1)
File “/home/talha/SemAttNet-Thesis-Code-main/dpt/models.py”, line 78, in forward
glob = self.pretrained.model.forward_flex(x)
File “/home/talha/SemAttNet-Thesis-Code-main/dpt/vit.py”, line 175, in forward_flex
x = self.patch_embed.backbone(x)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 1102, in _call_impl
return forward_call(*input, **kwargs)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/timm/models/resnetv2.py”, line 418, in forward
x = self.forward_features(x)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/timm/models/resnetv2.py”, line 412, in forward_features
x = self.stem(x)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 1102, in _call_impl
return forward_call(*input, **kwargs)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/modules/container.py”, line 141, in forward
input = module(input)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 1102, in _call_impl
return forward_call(*input, **kwargs)
File “/home/talha/anaconda3/envs/SemNet2/lib/python3.6/site-packages/timm/models/layers/std_conv.py”, line 72, in forward
x = F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)