I am building a Siamese GRU autoencoder and am encountering an error when trying to train. The error occurs frequently, with both synthetic and real data. I have searched for information on this issue and found suggestions to use cuDNN 7.6, but there is no Windows conda equivalent package. Any suggestions would be greatly appreciated — thanks.
The Error:
cuda
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-7-4751c89df448> in <module>
94
95
---> 96 score, y_pred1,y_pred2 = model(x1,x2) #train
97 optimizer.zero_grad()
98
~\AppData\Local\Continuum\anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
<ipython-input-7-4751c89df448> in forward(self, seq1, seq2)
51 h1 = X2.new_zeros(self.num_layers*2,N,self.embedding_dim)
52 h1.normal_()
---> 53 y2, _ = self.decoder(encoded2,h1)
54 y2 = y2[:seqLen2]
55
~\AppData\Local\Continuum\anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\torch\nn\modules\rnn.py in forward(self, input, hx)
714 if batch_sizes is None:
715 result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,
--> 716 self.dropout, self.training, self.bidirectional, self.batch_first)
717 else:
718 result = _VF.gru(input, batch_sizes, hx, self._flat_weights, self.bias,
RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
The code, with synthetic data.
import torch
import torch.nn as nn
import torch.optim as optim
import torchnet as tnt
class GRUModel(nn.Module):
    """Siamese bidirectional-GRU autoencoder.

    Two token sequences are embedded and encoded by a shared bidirectional
    GRU.  The cosine similarity of the encoders' final hidden states gives a
    pairwise similarity score, and a shared bidirectional GRU decoder tries
    to reconstruct each embedded sequence from its encoding.  The two L1
    reconstruction losses are returned alongside the score.

    Args:
        embedding_dim: size of the token embedding (also decoder hidden size).
        input_dim: vocabulary size for the embedding table.
        latent_dim: encoder hidden size (also decoder input size).
        num_layers: number of stacked GRU layers in encoder and decoder.
    """

    def __init__(self, embedding_dim, input_dim, latent_dim, num_layers):
        super(GRUModel, self).__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # Both RNNs are bidirectional, hence the `num_layers * 2` initial
        # hidden states and the doubled feature dimension on their outputs.
        self.encoder = nn.GRU(self.embedding_dim, self.latent_dim,
                              self.num_layers, bidirectional=True)
        self.decoder = nn.GRU(self.latent_dim, self.embedding_dim,
                              self.num_layers, bidirectional=True)

    def forward(self, seq1, seq2):
        """Return (cosine score, L1 recon loss for seq1, L1 recon loss for seq2).

        seq1, seq2: LongTensors of shape (batch, seq_len) holding token ids.
        """
        # Embed and move to (seq_len, batch, features) for the
        # (non-batch_first) GRUs.
        X1 = self.embedding(seq1).float().transpose(0, 1)
        X2 = self.embedding(seq2).float().transpose(0, 1)
        seqLen1, N, C = X1.shape
        seqLen2, N, C = X2.shape

        # Encode.  The initial hidden state is re-sampled from N(0, 1) on
        # every call — nonstandard (zeros are the usual choice); presumably a
        # deliberate regularizer, TODO confirm.
        h0 = X1.new_zeros(self.num_layers * 2, N, self.latent_dim)
        h0.normal_()
        _, last_hidden1 = self.encoder(X1, h0)
        # NOTE(review): last_hidden[-1] is only the final layer's *backward*
        # direction; if both directions were intended, they would need to be
        # combined explicitly — confirm this is the desired encoding.
        encoded1 = last_hidden1[-1].repeat((len(X1), 1, 1))
        h0 = X2.new_zeros(self.num_layers * 2, N, self.latent_dim)
        h0.normal_()
        _, last_hidden2 = self.encoder(X2, h0)
        encoded2 = last_hidden2[-1].repeat((len(X2), 1, 1))
        score = nn.CosineSimilarity()(last_hidden1[-1], last_hidden2[-1])

        # Decode (randomly initialized hidden state, as above).
        h1 = X1.new_zeros(self.num_layers * 2, N, self.embedding_dim)
        h1.normal_()
        y1, _ = self.decoder(encoded1, h1)
        y1 = y1[:seqLen1]
        h1 = X2.new_zeros(self.num_layers * 2, N, self.embedding_dim)
        h1.normal_()
        y2, _ = self.decoder(encoded2, h1)
        y2 = y2[:seqLen2]

        # Average the two directions of the bidirectional decoder output.
        # Bug fix: the GRU output layout is (seq, batch, num_directions *
        # hidden) with [forward ; backward] concatenated along the feature
        # axis, so the directions must be split as view(seq, N, 2, C) and
        # averaged over dim 2.  The original view(seq, N, C, 2).mean(-1)
        # averaged pairs of unrelated adjacent features instead.
        y1 = y1.view(seqLen1, N, 2, C).mean(2)
        y2 = y2.view(seqLen2, N, 2, C).mean(2)
        l_first = nn.functional.l1_loss(y1.squeeze(), X1.squeeze())
        l_second = nn.functional.l1_loss(y2.squeeze(), X2.squeeze())
        return score, l_first, l_second
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# NOTE(review): the reported CUDNN_STATUS_INTERNAL_ERROR comes out of the
# cuDNN GRU kernels; if upgrading cuDNN is impossible, the usual workaround
# is `torch.backends.cudnn.enabled = False` (slower, but bypasses cuDNN).

model = GRUModel(embedding_dim=50, input_dim=22, latent_dim=16, num_layers=2)
model.to(device)
loss_function = nn.MSELoss()  # unused below; kept in case other code relies on it
optimizer = optim.Adam(model.parameters())

meter = tnt.meter.AverageValueMeter()      # running training loss
meter_val = tnt.meter.AverageValueMeter()  # running validation loss
timemeter = tnt.meter.TimeMeter(0)         # wall-clock timer per epoch


def _random_pair():
    """Synthesize a pair of token sequences with random lengths in [100, 4000)."""
    n1 = int(torch.randint(100, 4000, (1,)))
    n2 = int(torch.randint(100, 4000, (1,)))
    a = torch.randint(low=0, high=22, size=(1, n1)).long().to(device)
    b = torch.randint(low=0, high=22, size=(1, n2)).long().to(device)
    return a, b


for epoch in range(50):
    timemeter.reset()

    # --- training ---
    model.train()
    for idx in range(5):  # stand-in for `for (x1, x2) in trainloader:`
        x1, x2 = _random_pair()
        optimizer.zero_grad()
        score, y_pred1, y_pred2 = model(x1, x2)  # train
        # Push similarity towards 1 and both reconstruction losses towards 0.
        loss = (1 - score) + y_pred1 + y_pred2
        loss.backward()
        optimizer.step()
        meter.add(loss.item())

    # --- validation ---
    model.eval()
    with torch.no_grad():  # fix: no autograd bookkeeping during evaluation
        for idx in range(5):  # stand-in for `for (x1, x2) in valloader:`
            x1, x2 = _random_pair()
            score, y_pred1, y_pred2 = model(x1, x2)  # eval
            loss = (1 - score) + y_pred1 + y_pred2
            meter_val.add(loss.item())

    print("EPOCH {}, TRAINING_LOSS {}, VALIDATION_LOSS {}, DURATION {}s".format(
        epoch, meter.value(), meter_val.value(), timemeter.value()))
    meter.reset()
    meter_val.reset()
My System:
On windows 10, using a conda install of pytorch:
pytorch 1.4.0 py3.7_cuda101_cudnn7_0 pytorch
cudatoolkit 10.1.243 h74a9793_0
nvidia-smi:
Sun Mar 29 11:14:53 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 442.50 Driver Version: 442.50 CUDA Version: 10.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Quadro P2000 WDDM | 00000000:01:00.0 Off | N/A |
| N/A 51C P8 N/A / N/A | 486MiB / 4096MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 35288 C ...ta\Local\Continuum\anaconda3\python.exe N/A |
+-----------------------------------------------------------------------------+