The following code raises an error on GPU but runs without problems on CPU. Also, when batch_size is reduced to 256, it runs fine on GPU too. So I wonder whether this is a bug in torch autograd or in CUDA. Could the PyTorch team take a look at this?
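One thing worth ruling out (this is only a guess on my part) is that cuDNN runs out of workspace memory at the larger batch size, since CUDNN_STATUS_EXECUTION_FAILED can also be raised in that situation. A hypothetical check, not part of the repro script below, that could be dropped in right before loss.backward():

import torch
# Hypothetical check (not in the repro): print memory stats right before
# loss.backward() to rule out a plain out-of-memory inside cuDNN.
print('allocated: {:.2f} GiB'.format(torch.cuda.memory_allocated(0) / 2**30))
print('reserved:  {:.2f} GiB'.format(torch.cuda.memory_reserved(0) / 2**30))
print('total:     {:.2f} GiB'.format(torch.cuda.get_device_properties(0).total_memory / 2**30))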
This is the error message when running on GPU with batch_size = 512:
NVIDIA-SMI 418.40.04 Driver Version: 418.40.04 CUDA Version: 11.1
PyTorch version: 1.9.0
[W python_anomaly_mode.cpp:104] Warning: Error detected in CudnnRnnBackward. Traceback of forward call that caused the error:
  File "rnn_error.py", line 41, in <module>
    logits = model(input)
  File "/miniconda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "rnn_error.py", line 20, in forward
    lstm_out, _ = self.lstm(x)
  File "/miniconda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/miniconda/lib/python3.7/site-packages/torch/nn/modules/rnn.py", line 680, in forward
    self.dropout, self.training, self.bidirectional, self.batch_first)
 (function _print_stack)

Traceback (most recent call last):
  File "rnn_error.py", line 49, in <module>
    loss.backward()
  File "/miniconda/lib/python3.7/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/miniconda/lib/python3.7/site-packages/torch/autograd/__init__.py", line 149, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
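If it helps with triage: a hypothetical isolation test (my assumption, not something the repro below does) would be to force synchronous kernel launches and fall back from cuDNN to PyTorch's native LSTM kernels. If backward then succeeds, the failure is in cuDNN's RNN path rather than in autograd itself.

import os
# Must be set before CUDA is initialized so the Python stack trace points
# at the actual failing kernel launch.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch
# Fall back to PyTorch's native LSTM implementation instead of cuDNN.
torch.backends.cudnn.enabled = False

Here is the full repro script (rnn_error.py):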
import torch
import torch.nn as nn
import torch.nn.functional as F

# Surface the forward-pass stack trace when backward raises an error.
torch.autograd.set_detect_anomaly(True)
class Net(nn.Module):
def __init__(self, input_dim=80, hidden_dim=512, n_layers=3, embedding_dim=256, target_dim=300000):
super(Net, self).__init__()
self.hidden_dim = hidden_dim
self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)
self.fc1 = nn.Linear(hidden_dim, embedding_dim)
self.dropout = nn.Dropout(p=0.5)
self.fc2 = nn.Linear(embedding_dim, target_dim)

    def forward(self, x):
"""Forward function."""
# hidden state set to zeros by default
lstm_out, _ = self.lstm(x)
        # take the output at the last time step as input to the linear layers
        last_out = lstm_out[:, -1]
emb = self.fc1(last_out)
emb_d = self.dropout(emb)
logits = self.fc2(emb_d)
return logits
device = 'cuda:0'
batch_size = 512

# Create tensors directly on the GPU; randn(..., requires_grad=True).to(device)
# would return a non-leaf copy whose gradient is not retained.
input = torch.randn(batch_size, 800, 80, device=device, requires_grad=True)
target = torch.randint(300000, (batch_size,), dtype=torch.int64, device=device)
model = Net().to(device)
print(model)
logits = model(input)
print('logits.shape = {}'.format(logits.shape))
print('target.shape = {}'.format(target.shape))
loss = F.cross_entropy(logits, target)
print('loss = {:.2f}'.format(loss.item()))
loss.backward()
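In case it is useful to anyone hitting the same thing: since batch_size = 256 runs fine here, a possible workaround sketch (my assumption, not a fix) is to split each 512-sample batch into 256-sample micro-batches and accumulate gradients. Hypothetical, reusing the names from the script above:

# Hypothetical workaround sketch: replace the single forward/backward above
# with two 256-sample micro-batches, accumulating gradients so the result
# matches one 512-sample batch.
micro_bs = 256
model.zero_grad()
for chunk_in, chunk_tgt in zip(input.split(micro_bs), target.split(micro_bs)):
    chunk_loss = F.cross_entropy(model(chunk_in), chunk_tgt)
    # cross_entropy averages within a chunk, so scale by the chunk/batch
    # ratio to match the mean loss over the full 512 samples.
    (chunk_loss * chunk_in.size(0) / batch_size).backward()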