icelandszt
(icelandszt .)
April 23, 2020, 10:21am
1
Consider the following network snippet:
def __init__(self, model, n_class, dropout_rate,device):
super(NewModel, self).__init__()
self.bert = model
self.linear = nn.Linear(self.bert.config.hidden_size, 2)
self.linear_1 = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
self.dropout_rate = dropout_rate
self.dropout_1 = nn.Dropout(p = self.dropout_rate)
self.activation = nn.LeakyReLU()
self.bn = nn.BatchNorm1d(num_features = self.bert.config.hidden_size)
def forward(self, batch):
outputs = self.bert(
input_ids = batch[0].to(self.device),
attention_mask = batch[1].to(self.device),
token_type_ids = None,
position_ids = None,
head_mask = None,
inputs_embeds = None,
)
output = outputs[0]
pooled_output=output[:,0]
pooled_output = pooled_output.unsqueeze(0)
pooled_output_1 = self.dropout_1(self.bn(pooled_output))
logits = self.linear(F.leaky_relu(self.linear_1(pooled_output_1)))
My batch size is 16 and during training I get this error:
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 16])
I already set `drop_last=True` in the DataLoader, but the error persists.
Any help would be greatly appreciated.
albanD
(Alban D)
April 23, 2020, 2:08pm
2
Hi,
What is the stack trace that comes with the error?
I think the problem is that you are trying to apply batchnorm to a Tensor of size [1, 16], i.e. one with a batch size of 1 and 16 channels.
With only a single value per channel, the standard deviation computed by the batchnorm will lead to infinities when you divide by it.
icelandszt
(icelandszt .)
April 23, 2020, 2:59pm
3
Thank you for your time!
Here it is:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-363-0058cef7d67b> in <module>
9
10 import numpy as np
---> 11 train(model,train_dataloader,validation_dataloader)
12 print("")
13 print("Training complete!")
<ipython-input-348-f7eefdedff7c> in train(model, train_dataloader, validation_dataloader)
72 # The documentation for this `model` function is here:
73 # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
---> 74 outputs = model(batch)
75
76 # The call to `model` always returns a tuple, so we need to pull the
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
<ipython-input-362-052e4983c882> in forward(self, batch)
49 pooled_output = pooled_output.unsqueeze(0)
50
---> 51 pooled_output_1 = self.dropout_1(self.bn(pooled_output))
52 pooled_output_2 = self.dropout_2(self.bn2(pooled_output))
53
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py in forward(self, input)
105 input, self.running_mean, self.running_var, self.weight, self.bias,
106 self.training or not self.track_running_stats,
--> 107 exponential_average_factor, self.eps)
108
109
/opt/anaconda3/lib/python3.7/site-packages/torch/nn/functional.py in batch_norm(input, running_mean, running_var, weight, bias, training, momentum, eps)
1664 size_prods *= size[i + 2]
1665 if size_prods == 1:
-> 1666 raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
1667
1668 return torch.batch_norm(
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 16])
albanD
(Alban D)
April 23, 2020, 3:25pm
4
Yes that does point to the batchnorm. So the analysis above is the answer.
icelandszt
(icelandszt .)
April 23, 2020, 9:36pm
5
I realized that it was because of the BatchNorm… that’s why I tried `drop_last=True` in the DataLoader. But it didn’t work, so I was searching for a different solution.
albanD
(Alban D)
April 23, 2020, 10:29pm
6
Actually, looking at the code in the stack trace:
49 pooled_output = pooled_output.unsqueeze(0)
50
---> 51 pooled_output_1 = self.dropout_1(self.bn(pooled_output))
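It looks like the pooled output was a 1D Tensor, and the `unsqueeze(0)` added an extra leading dimension of size 1, so batchnorm sees a batch of 1 with 16 channels. In this case, is 16 supposed to be the batch size, with only one channel? If so, you should unsqueeze dimension 1 instead, i.e. `pooled_output.unsqueeze(1)`, which gives a Tensor of size [16, 1].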