I am trying to run a CNN model on an AWS EC2 instance, and it is giving me the error below. The same script runs fine on a different EC2 instance with a different GPU, but not on this one. I have not been able to solve it, even though I have restarted the kernel, checked the output-layer neuron count against the number of classes, and so on. When I train a pretrained model with the same dataloaders and the same training script, it works perfectly.
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_4008/1924527283.py in <cell line: 3>()
5 start_time = time.monotonic()
6
----> 7 train_loss, train_acc = train(model, loader_train, optimizer, criterion, device="cpu")
8 valid_loss, valid_acc = evaluate(model, loader_valid, criterion, device="cpu")
9
/tmp/ipykernel_4008/585199812.py in train(model, iterator, optimizer, criterion, device)
13
14 with autocast(enabled=use_amp):
---> 15 y_pred = model(x)
16
17 loss = criterion(y_pred, y)
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
/tmp/ipykernel_4008/2249971542.py in forward(self, input)
148
149 #MAIN Branch
--> 150 x = self.qcfem(input)
151 #print(x.shape)
152 residual = x
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
/tmp/ipykernel_4008/2249971542.py in forward(self, input)
44 def forward(self, input):
45
---> 46 w = self.relu(self.batch_norm(self.dilated_conv1(input)))
47
48 x = self.relu(self.batch_norm(self.dilated_conv2(input)))
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/conv.py in forward(self, input)
444
445 def forward(self, input: Tensor) -> Tensor:
--> 446 return self._conv_forward(input, self.weight, self.bias)
447
448 class Conv3d(_ConvNd):
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/conv.py in _conv_forward(self, input, weight, bias)
440 weight, bias, self.stride,
441 _pair(0), self.dilation, self.groups)
--> 442 return F.conv2d(input, weight, bias, self.stride,
443 self.padding, self.dilation, self.groups)
444
RuntimeError: CUDA error: device-side assert triggered