Hi. I would like to use DataParallel for DNN training in PyTorch, but I am running into an error.
Before using DataParallel, my code was as follows:
<Code 1>
for epoch in range(epochs):
    train_loss = 0.0
    val_loss = 0.0
    train_loader2 = MakeDataset(file_x_train, file_y_mask_train, tmpbatch_size, shuffle=False)
    test_loader2 = MakeDataset(file_x_test, file_y_mask_test, batch_size)

    # Training the model
    net2.train()
    counter = 0
    i = 0
    for data in train_loader2:
        inputs, labels = data
        # Accumulate nconv mini-batches into one large batch before each update
        if i == 0:
            conv_inputs = inputs
            conv_labels = labels
        else:
            conv_inputs = torch.cat([conv_inputs, inputs], dim=0)
            conv_labels = torch.cat([conv_labels, labels], dim=0)
        i += 1
        if i == nconv:
            if cuda:
                conv_inputs, conv_labels = conv_inputs.cuda(), conv_labels.cuda()
            net_optimizer.zero_grad()
            outputs = net2(conv_inputs)
            loss = criterion(outputs, conv_labels)
            loss.backward()
            net_optimizer.step()
            train_loss += loss.item() * conv_inputs.size(0)
            counter += 1
            print("\r{0}".format(counter), end="")
            i = 0

    # Evaluating the model
    net2.eval()
    counter = 0
    # Tell torch not to calculate gradients
    with torch.no_grad():
        for data in test_loader2:
            # Move to device
            inputs, labels = data
            if cuda:
                inputs, labels = inputs.cuda(), labels.cuda()
            # Forward pass
            output = net2(inputs)
            # Calculate loss
            valloss = criterion(output, labels)
            # Add the loss to the validation set's running loss
            val_loss += valloss.item() * inputs.size(0)

    # Get the average loss for the entire epoch
    train_loss = train_loss / train_len
    valid_loss = val_loss / test_len
    train_loss_list.append(train_loss)
    test_loss_list.append(valid_loss)
    # Print out the information
    print('[%d] Training Loss: %.6f, Validation Loss: %.6f' % (epoch + 1, train_loss, valid_loss))
    torch.save(net2.state_dict(), file_nnstate)
print('Finished Training')
The function " MakeDataset" is loading the training data from certain files(made by me). The DNN model is named “net2”.
To use DataParallel, I add the code below before <Code 1> (net here is the model before it is wrapped into net2).
<Code 2>
device_ids = range(torch.cuda.device_count())
print(device_ids)
net2 = torch.nn.DataParallel(net, device_ids=device_ids)
This follows the tutorial at https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
The corresponding output is:
range(0, 8)
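(In my actual script the wrapped model is also moved to the GPU afterwards, with something like net2.cuda(), as the tutorial does with model.to(device).)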
However, when I run <Code 2> before <Code 1>, <Code 1> fails with the error below:
<Error on Code 1>
ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/_utils.py", line 385, in reraise
    raise self.exc_type(msg)
ValueError: Caught ValueError in replica 7 on device 7.
Original Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "<ipython-input-16-0dd5899e55f0>", line 14, in forward
    x = self.fc(input)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py", line 81, in forward
    exponential_average_factor, self.eps)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/functional.py", line 1666, in batch_norm
    raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 220])
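If I read the last line correctly, replica 7 on device 7 received a batch containing only a single sample, and the BatchNorm layer inside my model (visible in the traceback) cannot be run in training mode on it. I can reproduce the same ValueError in isolation, assuming a BatchNorm1d over the 220 features from the error message:

import torch
import torch.nn as nn

bn = nn.BatchNorm1d(220)   # 220 features, matching torch.Size([1, 220])
bn.train()                 # training mode, as set by net2.train()
bn(torch.randn(1, 220))    # ValueError: Expected more than 1 value per channel ...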
When I instead write
device_ids = range(1)
in <Code 2>, <Code 1> runs without errors.
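My guess is that DataParallel splits each accumulated batch along dimension 0 across the 8 GPUs, and that one replica can then end up with a single sample, for example when the last mini-batch from the loader is smaller than tmpbatch_size. Would a guard like the following sketch be a reasonable fix, skipping any accumulated batch that cannot give every replica at least two samples? (n_gpu and the skip logic are my own idea, not from the tutorial.)

n_gpu = torch.cuda.device_count()

# Inside the training loop, before the update:
if i == nconv:
    batch = conv_inputs.size(0)
    if batch % n_gpu != 0 or batch // n_gpu < 2:
        i = 0        # drop this batch; BatchNorm needs >1 sample per replica
        continue
    ...

Or should I instead change the loading, e.g. pass drop_last=True to the DataLoader inside MakeDataset, so that every batch has the same size?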
What should I do to make <Code 1> run on all eight GPUs? Is <Code 2> not enough on its own?
Thank you very much.