Error in torch.nn.DataParallel

aturahc13 · December 16, 2019, 7:02am

Hi. I would like to use “DataParallel” for DNN training in PyTorch, but I get some errors.
Before using “DataParallel”, my code (<Code 1>) is as follows:

# Per-epoch train/validate loop for `net2`.
# Each epoch: rebuild both loaders, train on groups of `nconv` concatenated
# loader batches, evaluate on the test loader, record the averaged losses and
# save the model's state_dict to `file_nnstate`.
for epoch in range(epochs):
    # Running sums of (loss * number of samples) for this epoch.
    train_loss = 0.0
    val_loss = 0.0
    
    # Both loaders are re-created every epoch; the training loader is built
    # with tmpbatch_size and shuffle=False, the test loader with batch_size.
    train_loader2 = MakeDataset(file_x_train, file_y_mask_train, tmpbatch_size, shuffle=False)
    test_loader2 = MakeDataset(file_x_test, file_y_mask_test, batch_size)
    # Training the model
    net2.train()
    counter = 0  # optimizer steps taken so far in this epoch
    i = 0  # loader batches accumulated into the current group
    for data in train_loader2:
        inputs, labels = data
        
        # The first batch of a group starts the buffers; later batches are
        # appended along dim 0 until `nconv` batches have been collected.
        if i ==0:
            conv_inputs = inputs
            conv_labels = labels
            #print(i,conv_inputs.shape)
        else:
            conv_inputs = torch.cat([conv_inputs, inputs],dim=0)
            conv_labels = torch.cat([conv_labels, labels],dim=0)
            #print(i,conv_inputs.shape)
            
        i += 1
            
        # A full group is ready: run one optimization step on it.
        # NOTE: batches left over at the end of the loader (when its length is
        # not a multiple of nconv) never reach this branch; `i` is reset at the
        # start of the next epoch, so those samples are not trained on.
        if i == nconv:
            #print(i,conv_inputs.shape)
            if cuda:
                # NOTE(review): Variable looks like the legacy autograd wrapper;
                # likely unnecessary on current PyTorch -- confirm.
                conv_inputs, conv_labels = Variable(conv_inputs).cuda(), Variable(conv_labels).cuda()

            net_optimizer.zero_grad()
            # Calls forward() directly rather than net2(...).
            # NOTE(review): presumably this skips any registered module hooks -- confirm.
            outputs = net2.forward(conv_inputs)

            loss = criterion(outputs, conv_labels) # loss over the whole concatenated group

            loss.backward()
            net_optimizer.step()
            # Scale by the group's sample count; divided by train_len after the loop.
            # NOTE(review): assumes criterion returns a per-sample mean -- confirm.
            train_loss += loss.item()*conv_inputs.size(0)
            counter += 1
            # "\r" with end="" rewrites the same console line with the step count.
            print("\r{0}".format(counter), end="")
            
            # Start a new group on the next loader batch.
            i = 0
            
    # Evaluating the model
    net2.eval()
    
    
    counter = 0  # reset here; not read again in the visible code
    # Tell torch not to calculate gradients
    with torch.no_grad():
        for data in test_loader2:
            # Move to device
            inputs, labels = data
            if cuda:
                inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()
            
            # Forward pass
            output = net2.forward(inputs)
            
            # Calculate Loss
            valloss = criterion(output, labels) # loss for this test batch

            # Add loss to the validation set's running loss
            val_loss += valloss.item()*inputs.size(0)
                        
                
    # Get the average loss for the entire epoch
    # NOTE(review): train_len / test_len are presumably the sample counts of
    # the two datasets -- confirm against where they are defined.
    train_loss = train_loss/train_len
    valid_loss = val_loss/test_len
    train_loss_list.append(train_loss)
    test_loss_list.append(valid_loss)

   
    # print out the information
    print('[%d] Training Loss: %.6f, Validation Loss: %.6f'  % (epoch + 1, train_loss, valid_loss))
    # Written to the same path every epoch, so only the latest weights are kept.
    # NOTE(review): if net2 is an nn.DataParallel wrapper, the saved keys
    # presumably carry a "module." prefix -- confirm the loading side expects it.
    torch.save(net2.state_dict(), file_nnstate)
print('Finished Training')

The function “MakeDataset” loads the training data from files that I created myself. The DNN model is named “net2”.
To use “DataParallel”, I added the code below and ran it before <Code 1>.
<Code 2>

# Wrap `net` in nn.DataParallel over every CUDA device torch reports.
device_ids = range(torch.cuda.device_count())
print(device_ids)
# As explained later in this thread, nn.DataParallel chunks each input batch
# along dim 0 and sends one piece to each device, so the batch passed to net2
# must be large enough that every piece holds more than one sample; BatchNorm
# in training mode raises "Expected more than 1 value per channel" otherwise.
# NOTE(review): the traceback in this thread shows data_parallel.py's forward
# running inside replica 0, which suggests `net` may already be a DataParallel
# wrapper at this point (i.e. nested wrapping) -- confirm.
net2 = torch.nn.DataParallel(net, device_ids=device_ids)

This is from https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
The corresponding output is:

range(0, 8)

However, after running <Code 2>, I get an error in <Code 1>. The error is as follows:
<Error on Code 1>

ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/_utils.py", line 385, in reraise
    raise self.exc_type(msg)
ValueError: Caught ValueError in replica 7 on device 7.
Original Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "<ipython-input-16-0dd5899e55f0>", line 14, in forward
    x = self.fc(input)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py", line 81, in forward
    exponential_average_factor, self.eps)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/functional.py", line 1666, in batch_norm
    raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 220])

When I write

device_ids = range(1)

in <Code 2>, <Code 1> runs without errors.
What should I do to run <Code 1> on all GPUs? Is <Code 2> not enough?
Thank you very much.

ptrblck · December 16, 2019, 7:12am

The error points to a batchnorm layer, which wants to calculate the running statistics, but fails due to a single provided sample.
This is usually the case, if you feed a batch to the model containing one sample.
Since you are using nn.DataParallel this could happen, if the batch size equals the number of GPUs.
Also, the last batch might be smaller, which could trigger this effect. In that case, you can skip the last (smaller) batch using drop_last=True in your DataLoader.

aturahc13 · December 16, 2019, 7:32am

Dear ptrblck,
Thank you very much for your reply !
Following your reply, I set the batch size to 100 (larger than 8, the number of GPUs) and added drop_last=True to the DataLoader, but I still get the same error…
Is there anything else I should check?
Thank you very much.

ptrblck · December 16, 2019, 7:34am

I assume the code is working using a single GPU?
Could you print the shape of the input data inside forward for the sake of debugging?

aturahc13 · December 16, 2019, 7:39am

input data shape is torch.Size([10, 396]) torch.Size([10, 396]) torch.Size([10, 396]) torch.Size([10, 396]) torch.Size([10, 396]) torch.Size([10, 396]) torch.Size([10, 396]) torch.Size([10, 396]) torch.Size([10, 396]) torch.Size([10, 396])

I get this output with the following code:

#############################################(changed 2019/11/5)
# Debugging variant of the per-epoch train/validate loop: it prints the shape
# of every loader batch and calls `net` (not the `net2` wrapper) throughout.


for epoch in range(epochs):
    # Running sums of (loss * number of samples) for this epoch.
    train_loss = 0.0
    val_loss = 0.0
    
    # Both loaders are re-created every epoch; the training loader is built
    # with tmpbatch_size and shuffle=False, the test loader with batch_size.
    train_loader2 = MakeDataset(file_x_train, file_y_mask_train, tmpbatch_size, shuffle=False)
    test_loader2 = MakeDataset(file_x_test, file_y_mask_test, batch_size)
    # Training the model
    net.train()
    counter = 0  # optimizer steps taken so far in this epoch
    i = 0  # loader batches accumulated into the current group
    for data in train_loader2:
        inputs, labels = data
        
        ### This is new ###
        # Shape of a single loader batch, printed BEFORE batches are
        # concatenated -- not the shape that is passed to the model.
        print(inputs.shape)
        ### This is new ###
        
        # The first batch of a group starts the buffers; later batches are
        # appended along dim 0 until `nconv` batches have been collected.
        if i ==0:
            conv_inputs = inputs
            conv_labels = labels
            #print(i,conv_inputs.shape)
        else:
            conv_inputs = torch.cat([conv_inputs, inputs],dim=0)
            conv_labels = torch.cat([conv_labels, labels],dim=0)
            #print(i,conv_inputs.shape)
            
        i += 1
            
        # A full group is ready: run one optimization step on it.
        # NOTE: batches left over at the end of the loader (when its length is
        # not a multiple of nconv) never reach this branch; `i` is reset at the
        # start of the next epoch, so those samples are not trained on.
        if i == nconv:
            #print(i,conv_inputs.shape)
            if cuda:
                # NOTE(review): Variable looks like the legacy autograd wrapper;
                # likely unnecessary on current PyTorch -- confirm.
                conv_inputs, conv_labels = Variable(conv_inputs).cuda(), Variable(conv_labels).cuda()

            net_optimizer.zero_grad()
            # Calls forward() directly rather than net(...).
            # NOTE(review): presumably this skips any registered module hooks -- confirm.
            outputs = net.forward(conv_inputs)

            loss = criterion(outputs, conv_labels) # loss over the whole concatenated group

            loss.backward()
            net_optimizer.step()
            # Scale by the group's sample count; divided by train_len after the loop.
            # NOTE(review): assumes criterion returns a per-sample mean -- confirm.
            train_loss += loss.item()*conv_inputs.size(0)
            counter += 1
            # "\r" with end="" rewrites the same console line with the step count.
            print("\r{0}".format(counter), end="")
            
            # Start a new group on the next loader batch.
            i = 0
            
    # Evaluating the model
    net.eval()
    
    
    counter = 0  # reset here; not read again in the visible code
    # Tell torch not to calculate gradients
    with torch.no_grad():
        for data in test_loader2:
            # Move to device
            inputs, labels = data
            if cuda:
                inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()
            
            # Forward pass
            output = net.forward(inputs)
            
            # Calculate Loss
            valloss = criterion(output, labels) # loss for this test batch

            # Add loss to the validation set's running loss
            val_loss += valloss.item()*inputs.size(0)
                        
                
    # Get the average loss for the entire epoch
    # NOTE(review): train_len / test_len are presumably the sample counts of
    # the two datasets -- confirm against where they are defined.
    train_loss = train_loss/train_len
    valid_loss = val_loss/test_len
    train_loss_list.append(train_loss)
    test_loss_list.append(valid_loss)

   
    # print out the information
    print('[%d] Training Loss: %.6f, Validation Loss: %.6f'  % (epoch + 1, train_loss, valid_loss))
    # Written to the same path every epoch, so only the latest weights are kept.
    torch.save(net.state_dict(), file_nnstate)
print('Finished Training')

ptrblck · December 16, 2019, 7:40am

Is your code constantly printing out this shape? Even when the error is thrown?

aturahc13 · December 16, 2019, 7:44am

Yes, it is. The output above is what is printed when I use a single GPU.

aturahc13 · December 16, 2019, 7:45am

When I run

device_ids = range(torch.cuda.device_count())
print(device_ids)
net2 = torch.nn.DataParallel(net, device_ids=device_ids)

before this (its output is range(0, 8)), it prints

torch.Size([10, 396])
torch.Size([10, 396])
torch.Size([10, 396])
torch.Size([10, 396])
torch.Size([10, 396])
torch.Size([10, 396])
torch.Size([10, 396])
torch.Size([10, 396])
torch.Size([10, 396])
torch.Size([10, 396])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-77-fb8be4f5e216> in <module>()
     35 
     36             net_optimizer.zero_grad()
---> 37             outputs = net2.forward(conv_inputs)
     38 
     39             loss = criterion(outputs, conv_labels) # ###############

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
    150             return self.module(*inputs[0], **kwargs[0])
    151         replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 152         outputs = self.parallel_apply(replicas, inputs, kwargs)
    153         return self.gather(outputs, self.output_device)
    154 

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
    160 
    161     def parallel_apply(self, replicas, inputs, kwargs):
--> 162         return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
    163 
    164     def gather(self, outputs, output_device):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
     83         output = results[i]
     84         if isinstance(output, ExceptionWrapper):
---> 85             output.reraise()
     86         outputs.append(output)
     87     return outputs

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/_utils.py in reraise(self)
    383             # (https://bugs.python.org/issue2651), so we work around it.
    384             msg = KeyErrorMessage(msg)
--> 385         raise self.exc_type(msg)

ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/_utils.py", line 385, in reraise
    raise self.exc_type(msg)
ValueError: Caught ValueError in replica 6 on device 6.
Original Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "<ipython-input-16-0dd5899e55f0>", line 14, in forward
    x = self.fc(input)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py", line 81, in forward
    exponential_average_factor, self.eps)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/functional.py", line 1666, in batch_norm
    raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 220])

ptrblck · December 16, 2019, 8:07am

So are you multiplying the batch size by the number of GPUs (9)?
nn.DataParallel will chunk the batch in dim0 and send each piece to a GPU.
Since you get [10, 396] inside the forward method for a single GPU as well as for multiple GPUs using nn.DataParallel, your provided batch should have the shape [90, 396] before feeding it into the nn.DataParallel model.
Is my assumption correct?

aturahc13 · December 17, 2019, 1:04am

Dear ptrblck,
I am sorry for the late reply, even though you responded so quickly yesterday.
I had not multiplied the batch size (by 9), so I have just tried it now.

And it starts working !!!

The only remaining concern is whether it is training correctly, so after some epochs I will confirm that the performance is the same as that of the model I trained before.
Thank you very much again for your polite and easy-to-understand explanation.

PantherYan · April 6, 2020, 2:52pm

Yes, I also faced this problem.
Traceback (most recent call last): | 0/765 [00:00<?, ?it/s]

outputs = model(**inputs)

  File "/usr/local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/usr/local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/usr/local/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/usr/local/lib/python3.6/site-packages/torch/_utils.py", line 394, in reraise
    raise self.exc_type(msg)
ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/workspace/git/transformers/src/transformers/modeling_bert.py", line 1522, in forward
    loss = loss_fct(logits, labels)
  File "/usr/local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/torch/nn/modules/loss.py", line 916, in forward
    ignore_index=self.ignore_index, reduction=self.reduction)
  File "/usr/local/lib/python3.6/site-packages/apex/amp/wrap.py", line 28, in wrapper
    return orig_fn(*new_args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/torch/nn/functional.py", line 2021, in cross_entropy
    return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
  File "/usr/local/lib/python3.6/site-packages/apex/amp/wrap.py", line 28, in wrapper
    return orig_fn(*new_args, **kwargs)
  File "/usr/local/lib/python3.6/site-packages/torch/nn/functional.py", line 1836, in nll_loss
    .format(input.size(0), target.size(0)))
ValueError: Expected input batch_size (2) to match target batch_size (4).

I had modified the batch data but forgot to multiply the sample (batch) dimension by max(1, args.n_gpu).
@ptrblck So many thanks!