I followed the suggestions from this topic.
My trainer:
# Minimal CTC training loop: a single Conv1d layer maps 64-channel input
# frames (presumably MFCC features, judging by the variable names) to
# per-frame scores over the tokenizer vocabulary.
# `tokenizer`, `speech_recognition_dataloader` and `NAN` are defined elsewhere.
# With kernel_size=33, padding=33//2 (=16) and the default stride of 1, the
# output has the same number of time steps as the input.
model = torch.nn.Conv1d(64, tokenizer.vocab_size, kernel_size=33, padding=33//2)
optimizer = optim.AdamW(model.parameters(), 5e-4)
# zero_infinity=True zeroes infinite losses and their gradients; these mainly
# occur when an input is too short to be aligned to its target.
loss = nn.CTCLoss(zero_infinity=True)
# One-cycle schedule sized for exactly one pass over the dataloader
# (one scheduler.step() per batch).
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=5e-4,
                                          steps_per_epoch=int(len(speech_recognition_dataloader)),
                                          epochs=1,
                                          anneal_strategy='linear')
# Debugging aid: makes backward() raise as soon as a backward function
# returns NaN, and reports the forward call that created it.
torch.autograd.set_detect_anomaly(True)
for batch_idx, batch in enumerate(speech_recognition_dataloader):
    # NOTE(review): assumes mfccs is (batch, 64, time) and that mfccs_lengths
    # holds the unpadded frame count of each sample — confirm against the
    # dataloader's collate function.
    mfccs, mfccs_lengths, labels, label_lengths = batch
    optimizer.zero_grad()
    output = model(mfccs)
    # Move the last axis to the front: Conv1d's (batch, channels, time)
    # becomes (time, batch, channels), the layout nn.CTCLoss expects.
    output = output.permute((-1, 0, 1))
    print(output.size())
    # CTCLoss takes log-probabilities, normalized over the class axis.
    probs = nn.functional.log_softmax(output, dim=-1)
    _loss = loss(probs, labels, mfccs_lengths, label_lengths)
    _loss.backward()
    optimizer.step()
    scheduler.step()
    print(f"iter: {batch_idx}, input_data is nan: {NAN(mfccs)}, loss: {_loss}, output is nan: {NAN(output)}")
The output of my nn.Conv1d model (after the permute) has shape (sequence, batch, features=28),
where 28 is my vocab size.
I enabled torch.autograd.set_detect_anomaly(True),
and here is the output:
torch.Size([1573, 10, 28])
/home/arsham/.local/lib/python3.8/site-packages/torch/autograd/__init__.py:147: UserWarning: Error detected in LogSoftmaxBackward. Traceback of forward call that caused the error:
File "/home/arsham/.vscode/extensions/ms-toolsai.jupyter-2021.6.999662501/pythonFiles/vscode_datascience_helpers/kernel_prewarm_starter.py", line 31, in <module>
runpy.run_module(module, run_name="__main__", alter_sys=False)
File "/usr/lib/python3.8/runpy.py", line 210, in run_module
return _run_code(code, {}, init_globals, run_name, mod_spec)
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/home/arsham/.local/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
app.start()
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 668, in start
self.io_loop.start()
File "/home/arsham/.local/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
self._run_once()
File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
handle._run()
File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
self._context.run(self._callback, *self._args)
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 456, in dispatch_queue
await self.process_one()
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 445, in process_one
await dispatch(*args)
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 352, in dispatch_shell
await result
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 647, in execute_request
reply_content = await reply_content
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 335, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 532, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2898, in run_cell
result = self._run_cell(
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2944, in _run_cell
return runner(coro)
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
coro.send(None)
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3169, in run_cell_async
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3361, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "/tmp/ipykernel_13510/3759225913.py", line 19, in <module>
probs = nn.functional.log_softmax(output, dim=0)
File "/home/arsham/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 1768, in log_softmax
ret = input.log_softmax(dim)
(Triggered internally at /pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
Variable._execution_engine.run_backward(
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_13510/3759225913.py in <module>
19 probs = nn.functional.log_softmax(output, dim=0)
20 _loss = loss(probs, labels, mfccs_lengths, label_lengths)
---> 21 _loss.backward()
22 optimizer.step()
23 scheduler.step()
~/.local/lib/python3.8/site-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
253 create_graph=create_graph,
254 inputs=inputs)
--> 255 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
256
257 def register_hook(self, hook):
~/.local/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
145 retain_graph = create_graph
146
--> 147 Variable._execution_engine.run_backward(
148 tensors, grad_tensors_, retain_graph, create_graph, inputs,
149 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: Function 'LogSoftmaxBackward' returned nan values in its 0th output.
This error comes from this line: probs = nn.functional.log_softmax(output, dim=-1), where output.size() == (1573, 10, 28).
Then I removed the log_softmax part and calculated the loss directly from the model output, like so:
# Variant of the loop body without log_softmax: the permuted Conv1d scores
# are passed to the loss unchanged.
# NOTE(review): nn.CTCLoss is documented to expect log-probabilities (e.g.
# from log_softmax), so this variant is presumably only a diagnostic for
# locating where the NaN originates — confirm intent.
output = model(mfccs)
output = output.permute((-1, 0, 1))
probs = output
_loss = loss(probs, labels, mfccs_lengths, label_lengths)
and I get an anomaly from MkldnnConvolutionBackward:
/home/arsham/.local/lib/python3.8/site-packages/torch/autograd/__init__.py:147: UserWarning: Error detected in MkldnnConvolutionBackward. Traceback of forward call that caused the error:
File "/home/arsham/.vscode/extensions/ms-toolsai.jupyter-2021.6.999662501/pythonFiles/vscode_datascience_helpers/kernel_prewarm_starter.py", line 31, in <module>
runpy.run_module(module, run_name="__main__", alter_sys=False)
File "/usr/lib/python3.8/runpy.py", line 210, in run_module
return _run_code(code, {}, init_globals, run_name, mod_spec)
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/home/arsham/.local/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
app.start()
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 668, in start
self.io_loop.start()
File "/home/arsham/.local/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
self._run_once()
File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
handle._run()
File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
self._context.run(self._callback, *self._args)
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 456, in dispatch_queue
await self.process_one()
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 445, in process_one
await dispatch(*args)
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 352, in dispatch_shell
await result
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 647, in execute_request
reply_content = await reply_content
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 335, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/home/arsham/.local/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 532, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2898, in run_cell
result = self._run_cell(
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2944, in _run_cell
return runner(coro)
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
coro.send(None)
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3169, in run_cell_async
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3361, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/home/arsham/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "/tmp/ipykernel_13510/3867036866.py", line 15, in <module>
output = model(mfccs)
File "/home/arsham/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/arsham/.local/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 298, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/arsham/.local/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 294, in _conv_forward
return F.conv1d(input, weight, bias, self.stride,
(Triggered internally at /pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
Variable._execution_engine.run_backward(
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_13510/3867036866.py in <module>
17 probs = output
18 _loss = loss(probs, labels, mfccs_lengths, label_lengths)
---> 19 _loss.backward()
20 optimizer.step()
21 scheduler.step()
~/.local/lib/python3.8/site-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
253 create_graph=create_graph,
254 inputs=inputs)
--> 255 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
256
257 def register_hook(self, hook):
~/.local/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
145 retain_graph = create_graph
146
--> 147 Variable._execution_engine.run_backward(
148 tensors, grad_tensors_, retain_graph, create_graph, inputs,
149 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: Function 'MkldnnConvolutionBackward' returned nan values in its 1th output.