I have been trying to pinpoint the source of NaN values in my model. Here is the relevant part of my architecture:
import torch
import torch.nn as nn


class LSTMBlock(nn.Module):
    def __init__(self, in_channels, out_channels, dropout=0, batchnorm=False,
                 bias=False, num_layers=1, bidirectional=True):
        super().__init__()
        self._lstm = nn.LSTM(
            input_size=in_channels,
            hidden_size=out_channels,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
            bidirectional=bidirectional,
            bias=bias,
        )
        self.n_dirs = 2 if bidirectional else 1
        self.fc_hid = nn.Linear(2 * out_channels, out_channels)
        self.fc_out = nn.Linear(2 * out_channels, out_channels)
        self.hidden_size = out_channels

        # initialize the different layers
        init_weights(self._lstm)
        init_weights(self.fc_hid)
        init_weights(self.fc_out)

    def forward(self, x):
        # x: (B, T, D); the length of each sequence is the position of its
        # last nonzero entry in feature channel 0 (zero padding at the tail)
        src_len = torch.LongTensor(
            [torch.max((x[i, :, 0] != 0).nonzero()).item() + 1 for i in range(x.shape[0])]
        )
        packed_x = nn.utils.rnn.pack_padded_sequence(x, src_len.cpu().numpy(), batch_first=True)
        packed_outputs, hidden_state = self._lstm(packed_x)

        # keep the final layer's hidden states, one per direction: (2, B, H)
        hidden = hidden_state[0]
        hidden = hidden[-self.n_dirs:, :, :]

        # pad the packed output sequence back to (B, T, 2*H)
        outputs, lengths = nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True)
        outputs = self.fc_out(outputs)  # (B, :src_len, H)
        hidden_state = torch.tanh(self.fc_hid(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        return outputs, hidden_state
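For context, the block is constructed and called roughly like this; the shapes below are illustrative placeholders, and init_weights is a small helper defined elsewhere in my code:

# Illustrative shapes only: B=40, T=50, D=16, H=32
block = LSTMBlock(in_channels=16, out_channels=32)
x = torch.randn(40, 50, 16)
x[:, 45:, :] = 0                # zero padding at the tail of each sequence
outputs, hidden = block(x)      # outputs: (40, 45, 32), hidden: (40, 32)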
The error message that I received is:
--> 438 encoded, _ = self._encoder(encoder_input)
439
440 # Aggregator: take the mean over all points
~/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1206 input = bw_hook.setup_input_hook(input)
1207
-> 1208 result = forward_call(*input, **kwargs)
1209 if _global_forward_hooks or self._forward_hooks:
1210 for hook in (*_global_forward_hooks.values(), *self._forward_hooks.values()):
/tmp/ipykernel_4888/1110288317.py in forward(self, x)
234 packed_x = nn.utils.rnn.pack_padded_sequence(x, src_len.cpu().numpy(), batch_first=True)
235
--> 236 packed_outputs, hidden_state = self._lstm(packed_x)
237
238 hidden = hidden_state[0]
~/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1209 if _global_forward_hooks or self._forward_hooks:
1210 for hook in (*_global_forward_hooks.values(), *self._forward_hooks.values()):
-> 1211 hook_result = hook(self, input, result)
1212 if hook_result is not None:
1213 result = hook_result
/tmp/ipykernel_4888/1110288317.py in nan_hook(self, inp, out)
59
60 for i, inp in enumerate(inputs):
---> 61 if inp is not None and contains_nan(inp):
62 raise RuntimeError(f'Found NaN input at index: {i} in layer: {layer}')
63
/tmp/ipykernel_4888/1110288317.py in <lambda>(x)
55 inputs = isinstance(inp, tuple) and inp or [inp]
56
---> 57 contains_nan = lambda x: torch.isnan(x).any()
58 layer = self.__class__.__name__
59
TypeError: isnan(): argument 'input' (position 1) must be Tensor, not PackedSequence
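For reference, this is the NaN-detection forward hook that appears in the traceback, reconstructed as best I can; the registration loop at the bottom is my paraphrase of how I attach it to every submodule of the model:

def nan_hook(self, inp, out):
    inputs = isinstance(inp, tuple) and inp or [inp]
    contains_nan = lambda x: torch.isnan(x).any()
    layer = self.__class__.__name__
    for i, inp in enumerate(inputs):
        if inp is not None and contains_nan(inp):
            raise RuntimeError(f'Found NaN input at index: {i} in layer: {layer}')

for submodule in model.modules():
    submodule.register_forward_hook(nan_hook)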
The code runs fine on the validation dataset with a batch size of one, but when I train the model with a batch size of 40, I get the error above. I would appreciate it if someone could suggest a way to resolve this.
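One direction I am considering, but have not verified: torch.isnan only accepts tensors, while the LSTM here receives a PackedSequence, so presumably the hook needs to unwrap a PackedSequence to its underlying .data tensor before the check, along these lines:

from torch.nn.utils.rnn import PackedSequence

def nan_hook(self, inp, out):
    inputs = inp if isinstance(inp, tuple) else (inp,)
    layer = self.__class__.__name__
    for i, item in enumerate(inputs):
        # a PackedSequence stores its flattened values in the .data tensor
        if isinstance(item, PackedSequence):
            item = item.data
        if isinstance(item, torch.Tensor) and torch.isnan(item).any():
            raise RuntimeError(f'Found NaN input at index: {i} in layer: {layer}')

Is unwrapping like this the right approach, or is there a cleaner way to NaN-check packed inputs?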