- Many thanks for your prompt reply!
However, after I modified the model-training code, the same error persists. The error message is shown below:
Traceback (most recent call last):
  File "make_train.py", line 44, in <module>
    output = net(pressure)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply
    raise output
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/data/lyf/pytorch/make_net.py", line 92, in forward
    x = F.elu(nn.BatchNorm2d(num_features=self.f_dim*8)(x))
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/batchnorm.py", line 83, in forward
    exponential_average_factor, self.eps)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py", line 1697, in batch_norm
    training, momentum, eps, torch.backends.cudnn.enabled
RuntimeError: Tensor for argument #2 'weight' is on CPU, but expected it to be on GPU (while checking arguments for cudnn_batch_norm)
- Here is the code for my network definition (make_net.py).
import torch
import torch.nn as nn
import torch.nn.functional as F
from make_ops import conv_out_size_same
from make_data import batch_size
s_h, s_w = 403, 640
# 403, 640
s_h2, s_w2 = conv_out_size_same(s_h, 2), conv_out_size_same(s_w, 2)
# 202, 320
s_h4, s_w4 = conv_out_size_same(s_h2, 2), conv_out_size_same(s_w2, 2)
# 101, 160
s_h8, s_w8 = conv_out_size_same(s_h4, 2), conv_out_size_same(s_w4, 2)
# 51, 80
s_h16, s_w16 = conv_out_size_same(s_h8, 2), conv_out_size_same(s_w8, 2)
# 26, 40
s_h32, s_w32 = conv_out_size_same(s_h16, 2), conv_out_size_same(s_w16, 2)
# 13, 20
s_h64, s_w64 = conv_out_size_same(s_h32, 2), conv_out_size_same(s_w32, 2)
# 7, 10
s_h128, s_w128 = conv_out_size_same(s_h64, 2), conv_out_size_same(s_w64, 2)
# 4, 5
s_h256, s_w256 = conv_out_size_same(s_h128, 2), conv_out_size_same(s_w128, 2)
# 2, 3
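# (For reference: conv_out_size_same is imported from make_ops above; I assume
# it is the usual DCGAN-style ceiling-division helper, i.e. something like
#     def conv_out_size_same(size, stride):
#         return int(math.ceil(float(size) / float(stride)))
# which is the rule the size comments above follow: 403 -> 202 -> 101 -> ...)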
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.CONV1_DEPTH = 2
        self.CONV2_DEPTH = 4
        self.CONV3_DEPTH = 8
        self.CONV4_DEPTH = 16
        self.CONV5_DEPTH = 32
        self.CONV6_DEPTH = 64
        self.CONV7_DEPTH = 128
        self.CONV8_DEPTH = 256
        self.f_dim = 32
        self.channel_dim = 1
        self.FC_NODE = 512
        self.IMG_HEIGHT = 403
        self.IMG_WIDTH = 640
        self.batch_size = batch_size
        self.fc1 = nn.Linear(in_features=10, out_features=self.f_dim*8)
        self.fc2 = nn.Linear(in_features=self.f_dim*8, out_features=self.f_dim*8*s_w256*s_h256)
        self.deconv1 = nn.ConvTranspose2d(in_channels=self.f_dim*8, out_channels=self.f_dim*4,
                                          kernel_size=2, stride=2)
        self.deconv2 = nn.ConvTranspose2d(in_channels=self.f_dim*4, out_channels=self.f_dim*2,
                                          kernel_size=2, stride=2)
        self.deconv3 = nn.ConvTranspose2d(in_channels=self.f_dim*2, out_channels=self.f_dim,
                                          kernel_size=2, stride=2)
        self.deconv4 = nn.ConvTranspose2d(in_channels=self.f_dim, out_channels=self.f_dim//2,
                                          kernel_size=2, stride=2)
        self.deconv5 = nn.ConvTranspose2d(in_channels=self.f_dim//2, out_channels=self.f_dim//4,
                                          kernel_size=2, stride=2)
        self.deconv6 = nn.ConvTranspose2d(in_channels=self.f_dim//4, out_channels=self.f_dim//8,
                                          kernel_size=2, stride=2)
        self.deconv7 = nn.ConvTranspose2d(in_channels=self.f_dim//8, out_channels=self.f_dim//16,
                                          kernel_size=2, stride=2)
        self.deconv8 = nn.ConvTranspose2d(in_channels=self.f_dim//16, out_channels=self.channel_dim,
                                          kernel_size=2, stride=2)
        self.conv1 = nn.Conv2d(in_channels=self.channel_dim, out_channels=self.CONV1_DEPTH,
                               kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=self.CONV1_DEPTH, out_channels=self.CONV2_DEPTH,
                               kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(in_channels=self.CONV2_DEPTH, out_channels=self.CONV3_DEPTH,
                               kernel_size=2, stride=2)
        self.conv4 = nn.Conv2d(in_channels=self.CONV3_DEPTH, out_channels=self.CONV4_DEPTH,
                               kernel_size=2, stride=2)
        self.conv5 = nn.Conv2d(in_channels=self.CONV4_DEPTH, out_channels=self.CONV5_DEPTH,
                               kernel_size=2, stride=2)
        self.conv6 = nn.Conv2d(in_channels=self.CONV5_DEPTH, out_channels=self.CONV6_DEPTH,
                               kernel_size=2, stride=2)
        self.conv7 = nn.Conv2d(in_channels=self.CONV6_DEPTH, out_channels=self.CONV7_DEPTH,
                               kernel_size=2, stride=2)
        self.conv8 = nn.Conv2d(in_channels=self.CONV7_DEPTH, out_channels=self.CONV8_DEPTH,
                               kernel_size=2, stride=2)
        self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=1)

    def forward(self, input_tensor):
        x = self.fc1(input_tensor)
        x = self.fc2(x)
        x = x.view(-1, self.f_dim*8, s_h256, s_w256)
        # NOTE: each nn.BatchNorm2d(...) below is constructed from scratch on
        # every forward pass, so its weight/bias are created on the CPU; this
        # next line (make_net.py line 92) is where the traceback fails.
        x = F.elu(nn.BatchNorm2d(num_features=self.f_dim*8)(x))
        x = F.elu(nn.BatchNorm2d(num_features=self.f_dim*4)(self.deconv1(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.f_dim*2)(self.deconv2(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.f_dim)(self.deconv3(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.f_dim//2)(self.deconv4(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.f_dim//4)(self.deconv5(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.f_dim//8)(self.deconv6(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.f_dim//16)(self.deconv7(x)))
        x = F.tanh(nn.BatchNorm2d(num_features=self.channel_dim)(self.deconv8(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.CONV1_DEPTH)(self.conv1(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.CONV2_DEPTH)(self.conv2(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.CONV3_DEPTH)(self.conv3(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.CONV4_DEPTH)(self.conv4(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.CONV5_DEPTH)(self.conv5(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.CONV6_DEPTH)(self.conv6(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.CONV7_DEPTH)(self.conv7(x)))
        x = F.elu(nn.BatchNorm2d(num_features=self.CONV8_DEPTH)(self.conv8(x)))
        x = self.avg_pool(x)
        x = x.view(-1, self.num_flat_features(x))
        # NOTE: these nn.Linear layers are likewise rebuilt (on the CPU, with
        # fresh random weights) on every call, so they can never be trained.
        x = F.elu(nn.Linear(in_features=x.size()[-1], out_features=self.FC_NODE)(x))
        x = nn.Linear(in_features=self.FC_NODE, out_features=self.IMG_HEIGHT*self.IMG_WIDTH)(x)
        x = x.view(-1, self.IMG_HEIGHT, self.IMG_WIDTH)
        return x

    def num_flat_features(self, x):
        # Product of all non-batch dimensions.
        size = x.size()[1:]
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
The error message seems to say that an argument check fails inside cudnn_batch_norm: the BatchNorm weight tensor is on the CPU, while a GPU tensor is expected.
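If I read the traceback correctly, the failure is not in cuDNN itself but in where the BatchNorm modules are created: nn.BatchNorm2d(...) is called inside forward() (make_net.py line 92), so a brand-new module, with weights on the CPU, is constructed on every forward pass. Neither net.to(device) nor nn.DataParallel ever sees these modules, because they are not registered as submodules in __init__; the two nn.Linear layers at the end of forward() have the same problem (and, since optim.Adam(net.parameters(), ...) cannot see their parameters either, they would never be trained even on the CPU). Below is a minimal sketch of the registration pattern I believe is needed; Block and channels are made-up names for illustration, not part of my code:

import torch
import torch.nn as nn
import torch.nn.functional as F

class Block(nn.Module):
    def __init__(self, channels):
        super(Block, self).__init__()
        self.deconv = nn.ConvTranspose2d(in_channels=channels, out_channels=channels,
                                         kernel_size=2, stride=2)
        # Registered once as a submodule: its weight/bias/running stats follow
        # net.to(device) and are replicated across GPUs by nn.DataParallel.
        self.bn = nn.BatchNorm2d(num_features=channels)

    def forward(self, x):
        # Wrong: x = F.elu(nn.BatchNorm2d(num_features=...)(x)) here would
        # build a fresh CPU-resident BatchNorm on every call, which is exactly
        # the mismatch reported by cudnn_batch_norm.
        return F.elu(self.bn(self.deconv(x)))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = Block(channels=8).to(device)
out = net(torch.randn(1, 8, 2, 3, device=device))  # no device mismatch

Applied to my Net, that would mean one registered nn.BatchNorm2d per deconv/conv stage (self.bn1 = nn.BatchNorm2d(num_features=self.f_dim*8), and so on), and moving the last two nn.Linear layers into __init__ as well.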
- The modified training code (make_train.py) is shown below.
import torch
import torch.nn as nn
import torch.optim as optim
from make_data import train_dataloader, test_dataloader
from make_net import Net
num_epochs = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_loader = train_dataloader
criterion = nn.MSELoss()
net = Net()
optimizer = optim.Adam(net.parameters(), lr=0.1, betas=(0.9, 0.99))
if torch.cuda.is_available():
    print("Let's use", torch.cuda.device_count(), "GPUs")
    net = nn.DataParallel(net)
net.to(device)

for epoch in range(num_epochs):
    running_loss = 0.0
    print("Epoch {}/{}".format(epoch, num_epochs-1))
    print("-" * 10)
    for i, sample in enumerate(train_loader, 0):
        image, pressure = sample['image'], sample['pressure']
        image = image.float().to(device)
        pressure = pressure.float().to(device)
        optimizer.zero_grad()
        output = net(pressure)  # forward pass; make_train.py line 44 in the traceback
        loss = criterion(output, image)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if (i+1) % 100 == 0:
            print("%d, %5d, loss: %.3f" % (epoch, i, running_loss/100))
            running_loss = 0.0
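As a quick sanity check (a sketch using only standard PyTorch calls, nothing specific to my code): every parameter that net.to(device) actually moved shows up under named_parameters() with a CUDA device, while modules built inside forward() never appear in this list at all, which is why their weights stay on the CPU:

for name, p in net.named_parameters():
    print(name, p.device)  # expect cuda:0 for every registered parameter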
- Thank you very much!!!