torch.backends.cudnn.CuDNNError: 8: CUDNN_STATUS_EXECUTION_FAILED with LSTM


import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence


class Net(nn.Module):

    def __init__(self, batch_sz):
        super(Net, self).__init__()

        self.conv1 = nn.Conv3d(1, 30, 3, padding=1)
        self.input_bn1 = nn.BatchNorm3d(30)

        #self.conv2 = nn.Conv3d(30, 30, 3, padding=1)

        self.pool1 = nn.MaxPool3d((2, 4, 4), stride=(2, 4, 4))
        self.input_bn2 = nn.BatchNorm3d(30)

        self.conv3 = nn.Conv3d(30, 40, 3, padding=1)
        self.input_bn3 = nn.BatchNorm3d(40)

        #self.conv4 = nn.Conv3d(50, 50, 3, padding=1)
        self.pool2 = nn.MaxPool3d((2, 2, 2), stride=(2, 2, 2))

        self.input_bn4 = nn.BatchNorm3d(40)

        self.conv5 = nn.Conv3d(40, 60, 3, padding=1)
        self.input_bn5 = nn.BatchNorm3d(60)

        #self.conv6 = nn.Conv3d(100, 100, 3, padding=1)
        #self.input_bn6 = nn.BatchNorm3d(100)

        # Three 2-layer bidirectional LSTMs, one per CNN feature scale
        self.lstm1 = nn.LSTM(30*17*17*4, 128, 2, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(40*8*8*2, 128, 2, batch_first=True, bidirectional=True)
        self.lstm3 = nn.LSTM(60*4*4*1, 128, 2, batch_first=True, bidirectional=True)

        self.linear1 = nn.Linear(128*6, 500)
        self.dense1_bn = nn.BatchNorm1d(500)
        self.linear2 = nn.Linear(500, 500)

        self.softmax = nn.LogSoftmax(1)

        # Pre-allocated GPU buffers for the flattened per-sample CNN features
        self.l1 = torch.FloatTensor(np.zeros((batch_sz, 7, 30*17*17*4))).cuda()
        self.l2 = torch.FloatTensor(np.zeros((batch_sz, 7, 40*8*8*2))).cuda()
        self.l3 = torch.FloatTensor(np.zeros((batch_sz, 7, 60*4*4*1))).cuda()

        # Zero the biases and Xavier-initialise the weights of each LSTM
        for lstm in (self.lstm1, self.lstm2, self.lstm3):
            for name, param in lstm.named_parameters():
                if 'bias' in name:
                    nn.init.constant(param, 0.0)
                elif 'weight' in name:
                    nn.init.xavier_normal(param)

    def forward(self, inputs, batch_sz):

        for i1 in range(0, batch_sz):

            cnn_feature_1 = self.input_bn1(self.pool1(F.relu(self.conv1(inputs[i1]))))

            #cnn_feature_2 = self.input_bn2(self.pool1(F.relu(self.conv2(cnn_feature_1))))

            # num_flat_features() is a helper defined elsewhere that returns the
            # product of all non-batch dimensions; note that the .data copy
            # detaches the CNN features from the graph before the LSTMs see them
            self.l1[i1] = cnn_feature_1.view(-1, num_flat_features(cnn_feature_1)).data

            cnn_feature_3 = self.input_bn3(self.pool2(F.relu(self.conv3(cnn_feature_1))))

            #cnn_feature_4 = self.input_bn4(self.pool2(F.relu(self.conv4(cnn_feature_3))))

            self.l2[i1] = cnn_feature_3.view(-1, num_flat_features(cnn_feature_3)).data

            cnn_feature_5 = self.input_bn5(self.pool2(F.relu(self.conv5(cnn_feature_3))))

            #cnn_feature_6 = self.input_bn6(self.pool2(F.relu(self.conv6(cnn_feature_5))))

            self.l3[i1] = cnn_feature_5.view(-1, num_flat_features(cnn_feature_5)).data

        # NB: the buffers are allocated with 7 time steps, but lengths of 8
        # are declared here
        packed_input_1 = pack_padded_sequence(Variable(self.l1), np.array([8]*batch_sz, dtype=int), batch_first=True)
        packed_input_2 = pack_padded_sequence(Variable(self.l2), np.array([8]*batch_sz, dtype=int), batch_first=True)
        packed_input_3 = pack_padded_sequence(Variable(self.l3), np.array([8]*batch_sz, dtype=int), batch_first=True)
        #packed_input_4 = pack_padded_sequence(Variable(self.l4), np.array([29]*batch_sz, dtype=int), batch_first=True)
        #packed_input_5 = pack_padded_sequence(Variable(self.l5), np.array([28]*batch_sz, dtype=int), batch_first=True)

        packed_output_a, (hta, cta) = self.lstm1(packed_input_1)
        packed_output_b, (htb, ctb) = self.lstm2(packed_input_2)
        packed_output_c, (htc, ctc) = self.lstm3(packed_input_3)
        #packed_output_d, (htd, ctd) = self.lstm4(packed_input_4)
        #packed_output_e, (hte, cte) = self.lstm5(packed_input_5)

        #print(hta)

        # Concatenate the top layer's forward/backward final hidden states
        f_feature = torch.cat((hta[3], htb[3], htc[3], hta[2], htb[2], htc[2]), 1)

        print(f_feature)

        predictions = self.softmax(self.linear2(self.dense1_bn(self.linear1(f_feature))))

        return predictions


######################################################################################################################################

net = Net(4)
net = net.cuda()
sumloss = 0
transformed_dataset_train = bbc_lrw_train_img()
transformed_dataset_test = bbc_lrw_test_img()
transformed_dataset_val = bbc_lrw_val_img()

trainloader = torch.utils.data.DataLoader(transformed_dataset_train, batch_size=4, shuffle=True, num_workers=4, drop_last=True)
print("DataLoader for train is ready")
valloader = torch.utils.data.DataLoader(transformed_dataset_val, batch_size=4, shuffle=True, num_workers=4, drop_last=True)
print("DataLoader for validation is ready")
testloader = torch.utils.data.DataLoader(transformed_dataset_test, batch_size=4, shuffle=True, num_workers=4, drop_last=True)
print("DataLoader for test is ready")


#criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()
optimizer = optim.Adam(net.parameters(), lr=0.00001)  #lr=0.00002
#optimizer = optim.SGD(net.parameters(), lr=0.00003, momentum=0.9, weight_decay=1e-5)
acc_sum = 0
k_z = 0
acc_list = []
epoch = 0
while True:
    epoch = epoch + 1
    for i_a, data_a in enumerate(trainloader, 0):
        #print(i_a)

        net.train(False)  # NB: this puts the BatchNorm layers in eval mode during training
        inputs, labels = data_a

        outputs = net(Variable(inputs.cuda()), 4)

        #print(outputs.data.max(1)[1])

        labels = Variable(labels.cuda())

        loss = criterion(outputs, labels)
        sumloss = sumloss + loss.data[0]
        #loss.backward()
        ##torch.nn.utils.clip_grad_norm(net.parameters(), 400)

        predict1 = outputs.data.max(1)[1]
        acc = predict1.eq(labels.data).cpu().sum()

        acc_sum += acc

        torch.nn.utils.clip_grad_norm(net.parameters(), 10)
        optimizer.step()
        optimizer.zero_grad()

I am using the LSTM in batch mode. I have a question: does the hidden state need to be detached here, and if so, how? Is there a solution to this torch.backends.cudnn.CuDNNError: 8: CUDNN_STATUS_EXECUTION_FAILED issue?
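For the detach question: in the code above the LSTMs are called without an explicit hidden state, so they start from fresh zero states on every forward pass and there is nothing to detach between batches. If you were carrying the (h, c) pair across batches, the usual 0.3.x-era pattern is the repackage_hidden helper from the official word_language_model example, which rewraps the tensors in new Variables so backward() cannot reach into the previous batch's graph. A minimal sketch:

from torch.autograd import Variable

def repackage_hidden(h):
    # Re-wrap each hidden state in a fresh Variable, cutting the link
    # to the graph that produced it in the previous batch.
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)

# Usage, assuming you keep hidden = (h, c) between calls:
# output, hidden = lstm(packed_input, repackage_hidden(hidden))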

Did it get resolved? I am getting the same issue on an RTX 2080S with PyTorch 0.3.1, CUDA 9.0, and cuDNN 7.
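Worth noting (an assumption about the cause, not a confirmed fix): CUDA 9.0 predates Turing support, which only arrived in CUDA 10.0, so a PyTorch 0.3.1 / CUDA 9.0 build running on an RTX 2080S can fail inside cuDNN kernels with exactly this CUDNN_STATUS_EXECUTION_FAILED error. A quick way to check whether cuDNN is the culprit is to disable it and rerun:

import torch

# If the error disappears with cuDNN disabled, the problem lies in the
# cuDNN/CUDA/GPU combination rather than in the model code itself.
torch.backends.cudnn.enabled = False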