Seq2seq parameters become nan

n_iters,print_every=3*75000,10
criterion = nn.NLLLoss()
loss=0
lost_list=[]
enc=encoder_rnn(eng.n_words,256).to(device)
dec=attn_dec_rnn(256,hin.n_words,max_len).to(device)
encoder_optimizer = optim.RMSprop(enc.parameters(), lr=0.01)
decoder_optimizer = optim.RMSprop(dec.parameters(), lr=0.01)

for iter in range(n_iters+1):
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
c=random.choice(numbers)
while(len(c[0])>10):
c=random.choice(numbers)
a,b=torch.tensor(c[0],device=device),torch.tensor(c[1],device=device)
enc_hid=enc.init_hidden()
encoder_outputs = torch.zeros(max_len, enc.hidden_size, device=device)
for i in range(len(a)):
out,enc_hid=enc(a[i],enc_hid)
encoder_outputs[i] = out[0, 0]
dec_hid=enc_hid.detach()
dec_hid=variable(dec_hid.data,requires_grad=True).to(device)
dec_inp=torch.tensor([[hin.word2index[""]]], device=device)
outs=[]
for j in range(len(b)):
out,dec_hid,attn_weights=dec(dec_inp,dec_hid,encoder_outputs)
dec_inp=b[j].detach()
if device==‘cuda’:
outs.append(torch.argmax(out).cpu().data.numpy().tolist())
g.append(torch.argmax(out).cpu().data.numpy().tolist())
else:
outs.append(torch.argmax(out).data.numpy().tolist())

    loss+=criterion(out,b[j].view(1))
if torch.isnan(loss):
    loss=criterion(variable(torch.randn(1,hin.n_words),requires_grad=True).to(device)
            ,variable(torch.tensor(25)).to(device).view(1))
    
l.append(len(a))
dec_inp.detach()
loss.backward()#it computes  parameter change for each trainable parameter 
for i in enc.parameters():
    i.grad.data=i.grad.data.clamp_(-5,5)
for i in dec.parameters():
    i.grad.data=i.grad.data.clamp_(-5,5)
encoder_optimizer.step()
decoder_optimizer.step()
loss=loss.detach()