High training and validation loss that is not changing

Hello, I am new to PyTorch and I am building a word predictor with an LSTM, but my training loss is high and not changing. I have already tried a lot of things: smaller and bigger training sets, more and fewer layers, and varying the rest of the hyperparameters.
Would someone be so kind as to tell me what I am doing wrong? Thank you very much.
This is the code I am using:

import torch
import torch.nn as nn
import string
import random
import sys
import unidecode
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import re
import math
import argparse
from argparse import Namespace
import datetime


print("pytorch version", torch.__version__)


def char_tensor(string, all_characters):
    tensor = torch.zeros(len(string)).long()
    for c in range(len(string)):
        tensor[c] = all_characters.index(string[c])
    return tensor

def clean_text(text):
    text = text.replace('\n',' ')
    text = re.sub('[^A-Za-z ]+', '', text)
    return text

def word_to_class(word, word_dict):
    word_class = word_dict[word]
    word_class_t = torch.zeros(1) + word_class
    return word_class_t

def one_hot_encode(y, output_size):
    one_hot = torch.zeros(output_size)
    one_hot[y] = 1
    return one_hot

def create_batches(X_train, y_train, choices, word_dict, output_size, all_characters, batch_size):
    x_batch = []
    y_batch = []
    for i in range(batch_size):
        idx = choices.pop()
        x_batch.append(char_tensor(X_train[idx], all_characters).long())
        num = word_to_class(y_train[idx], word_dict)
        #one_hot = one_hot_encode(int(num), output_size)
        y_batch.append(num)

    xt_batch = torch.stack(x_batch)
    yt_batch = torch.stack(y_batch)

    return xt_batch, yt_batch, choices


def eval_model(model, X_test, y_test, create_batches, word_dict,
        all_characters, output_size, device, batch_size=32):
    criterion = nn.CrossEntropyLoss()
    iterations = math.floor(len(X_test) / batch_size)
    test_data_size = iterations * batch_size
    choices = list(range(test_data_size))
    X_test = X_test[:int(iterations * batch_size)]
    y_test = y_test[:int(iterations * batch_size)]
    epoch_loss = 0
    for idx in range(iterations):
        text = "Iteration  {} / {} ".format(idx, iterations)
        print(text, end="\r")
        hidden = model.init_hidden(batch_size)
        x_batch, y_batch, choices = create_batches(X_test, y_test, choices, word_dict, output_size, all_characters, batch_size)
        x_batch = x_batch.float().to(device)
        y_batch = y_batch.type(torch.long)
        output, hidden = model(x_batch, hidden)
        output = output.float()
        y_batch = y_batch.long().to(device)
        loss = criterion(output.squeeze(0), y_batch.squeeze(1))
        epoch_loss += loss.item()

    print("validation loss {}".format(epoch_loss/iterations))


def save_checkpoint(state, is_best, filename='/output/checkpoint.pth.tar'):
    """Save checkpoint if a new best is achieved"""
    if is_best:
        print ("=> Saving a new best")
        torch.save(state, filename)  # save checkpoint
    else:
        print ("=> Validation Accuracy did not improve")


class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim1, hidden_dim2,
            n_layers, device, drop_prob=0.3):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.device = device

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim1, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc2 = nn.Linear(hidden_dim2, output_size)
        self.relu = nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        #lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        out = self.dropout(lstm_out)
        out = self.relu(self.fc(out[:, -1, :]))
        out = self.dropout(out)
        out = self.softmax(self.fc2(out))
        #out = out[:,-1]
        #out = self.softmax(out)

        #out = out.view(batch_size, -1)
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size,
            self.hidden_dim1).zero_().to(self.device), weight.new(self.n_layers,
                batch_size, self.hidden_dim1).zero_().to(self.device))
        return hidden


def main(args):
    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    seq_length = args.seq_length
    num_epochs = 500
    batch_size = args.batch_size
    hidden_size1 = args.hidden_size1
    hidden_size2 = args.hidden_size2
    num_layers = args.num_layers
    lr = args.lr
    # Get characters from string.printable
    all_characters = string.printable
    n_characters = len(all_characters)
    # Read large text file (Note can be any text file: not limited to just names)
    text = unidecode.unidecode(open('data/' + args.filename).read())
    text_clean = clean_text(text)
    text_clean = text_clean.split()
    print("length text ", len(text_clean))
    part = len(text_clean) * args.data_size
    text_clean = text_clean[:int(part)]
    seq_length = args.seq_length
    path = 'runs/LSTM_{}_lr-{}_batch_size-{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.lr, args.seq_length, args.batch_size)
    writer = SummaryWriter(path)
    y_data = []
    data = []
    part = ''
    for idx in range(0, len(text_clean) - 1):
        part += (text_clean[idx] + " ")
        new_length = len(part) + len(text_clean[idx+1])
        if new_length >= seq_length:
            if len(part) > seq_length:
                print("ERROR", len(part))
                print(part)
            part = part[:-1]
            part = part.rjust(seq_length)
            y_data.append(text_clean[idx+1])
            data.append(part)
            part = ''
            continue
    print("data size ", len(data))
    print("ydata", len(y_data))
    train_size = round(len(data) * 0.1)
    X_train = data[:train_size]
    y_train = y_data[:train_size]
    X_test = data[train_size:]
    y_test = y_data[train_size:]
    print("X_train", len(X_train))
    print("y_train", len(y_train))
    print("X_train", len(X_train[0]))
    print("y_train", y_train[0])
    unique_word = set(text_clean)
    output_size = len(set(text_clean))
    word_dict = {word: idx for idx, word in enumerate(unique_word)}
    iterations = math.floor(len(X_train) / batch_size)
    train_data_size = iterations * batch_size
    choices = list(range(train_data_size))

    #model = RNN(seq_length, hidden_size, num_layers, output_size).to(device)
    print("create model ..")
    model = SentimentNet(output_size, output_size, output_size, hidden_size1,
            hidden_size2, num_layers, device).to(device)
    print("... done  loding ")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    print("=> Starting training")
    print("batch_Size", batch_size)
    print("num_layers", num_layers)
    print("lr", lr)
    best_loss = float("inf")
    print("start training")
    print("label amount ", len(y_train))
    X_train = X_train[:int(iterations * batch_size)]
    y_train = y_train[:int(iterations * batch_size)]
    print("use {} to train ".format(device))
    for epoch in range(1, num_epochs + 1):
        epoch_loss = 0
        choices = list(range(train_data_size))
        # reset the index choices for this epoch
        for idx in range(iterations):
            text = "Iteration  {} / {} ".format(idx, iterations)
            print(text, end="\r")
            hidden = model.init_hidden(batch_size)
            x_batch, y_batch, choices = create_batches(X_train, y_train, choices, word_dict, output_size, all_characters, batch_size)
            x_batch = x_batch.float().to(device)
            #x_batch = x_batch.unsqueeze(0)
            #print(x_batch.type())
            y_batch = y_batch.type(torch.long)
            output, hidden = model(x_batch, hidden)
            output = output.float()
            y_batch = y_batch.long().to(device)
            #print(y_batch)
            loss = criterion(output.squeeze(0), y_batch.squeeze(1))
            #print("loss ", loss)
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
            epoch_loss += loss.item()
        epoch_loss = epoch_loss / iterations
        print("Epoch {} loss {}".format(epoch, epoch_loss))
        eval_model(model, X_test, y_test, create_batches, word_dict,
                all_characters, output_size, device)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='LSTM word prediction')
    parser.add_argument('--filename', default="book1_LittleWomen.txt", help='')
    parser.add_argument('--seq_length', default=20, type=int, help='')
    parser.add_argument('--batch_size', default=128, type=int, help='')
    parser.add_argument('--hidden_size1', default=512, type=int, help='')
    parser.add_argument('--hidden_size2', default=1024, type=int, help='')
    parser.add_argument('--num_layers', default=4, type=int, help='')
    parser.add_argument('--data_size', default=0.1, type=float, help='')
    parser.add_argument('--lr', default=0.0005, type=float, help='')
    args = parser.parse_args()
    main(args)

Is the loss not changing at all, or just too slowly?
You could check whether the computation graph was detached by printing all gradients after the backward call via:

for name, param in model.named_parameters():
    print(name, param.grad)

If some .grad attributes return a None value, you might have accidentally detached the graph somewhere.
On the other hand, if you see valid gradients for all parameters, I would recommend scaling down the use case a bit and trying to overfit a small dataset (e.g. just 10 samples).
Once this is working, you could try to scale it up again.
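
A minimal sketch of such an overfitting check, assuming x_small and y_small hold one fixed batch of ~10 samples prepared the same way as in your create_batches (the names are placeholders):

# Sanity check: train on the same tiny batch over and over; the loss should
# approach zero if gradients flow and the model can learn at all
model.train()
for step in range(300):
    hidden = model.init_hidden(x_small.size(0))
    output, hidden = model(x_small.float().to(device), hidden)
    loss = criterion(output, y_small.long().view(-1).to(device))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if step % 50 == 0:
        print(step, loss.item())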

PS: you can post code snippets by wrapping them into three backticks ```, which would make debugging easier. :wink:


import torch
import torch.nn as nn
import string
import random
import sys
import unidecode
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import re
import math
import argparse
from argparse import Namespace
import datetime


print("pytorch version", torch.__version__)


def char_tensor(string, all_characters):
    tensor = torch.zeros(len(string)).long()
    for c in range(len(string)):
        tensor[c] = all_characters.index(string[c])
    return tensor

def clean_text(text):
    text = text.replace('\n',' ')
    text = re.sub('[^A-Za-z ]+', '', text)
    return text

def word_to_class(word,  word_dict):
    word_class = word_dict[word]
    word_class_t = torch.zeros(1) + word_class
    return word_class_t

def one_hot_encode(y,output_size):
    one_hot =  torch.zeros(output_size)
    one_hot[y] = 1
    return one_hot

def create_batches(X_train, y_train, choices, word_dict, output_size, all_characters, batch_size):
    x_batch = []
    y_batch = []
    for i in range(batch_size):
        idx = choices.pop()
        x_batch.append(char_tensor((X_train[idx]), all_characters).long())
        num = word_to_class(y_train[idx], word_dict)
        #one_hot = one_hot_encode(int(num), output_size)
        y_batch.append(num)
    
    xt_batch = torch.stack(x_batch)
    yt_batch = torch.stack(y_batch)
                          
    return xt_batch, yt_batch, choices 


def eval_model(model,  X_test, y_test, create_batches, word_dict,
        all_characters, output_size, device, batch_size=32):
    criterion = nn.CrossEntropyLoss()
    iterations  = math.floor(len(X_test)/ batch_size)
    test_data_size = iterations * batch_size
    choices = list(range(test_data_size))
    X_test = X_test[:int(iterations * batch_size)]
    y_test = y_test[:int(iterations * batch_size)]
    epoch_loss = 0
    for idx in range(iterations):
        text = "Iteration  {} / {} ".format(idx, iterations)
        print(text, end="\r")
        hidden = model.init_hidden(batch_size)
        x_batch, y_batch,  choices = create_batches(X_test, y_test, choices, word_dict, output_size, all_characters, batch_size)
        x_batch = x_batch.float().to(device)
        y_batch = y_batch.type(torch.long)
        output, hidden = model(x_batch, hidden)
        output = output.float()
        y_batch = y_batch.long().to(device)
        loss = criterion(output.squeeze(0), y_batch.squeeze(1))
        epoch_loss += loss.item()

    print("validation loss {}".format(epoch_loss/iterations))





        
        
def save_checkpoint(state, is_best, filename='/output/checkpoint.pth.tar'):
    """Save checkpoint if a new best is achieved"""
    if is_best:
        print ("=> Saving a new best")
        torch.save(state, filename)  # save checkpoint
    else:
        print ("=> Validation Accuracy did not improve")




class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim1, hidden_dim2,
            n_layers, device, drop_prob=0.3):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.device = device
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim1, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc2 = nn.Linear(hidden_dim2, output_size)
        self.relu = nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        #lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.relu(self.fc(out[:, -1, :]))
        out = self.dropout(out)
        out = self.softmax(self.fc2(out))
        #out = out[:,-1]
        #out = self.softmax(out)
        
        #out = out.view(batch_size, -1)
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size,
            self.hidden_dim1).zero_().to(self.device), weight.new(self.n_layers,
                batch_size, self.hidden_dim1).zero_().to(self.device))
        return hidden


def main(args):
    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    seq_length = args.seq_length
    num_epochs = 500
    batch_size = args.batch_size
    hidden_size1 = args.hidden_size1
    hidden_size2 = args.hidden_size2
    num_layers = args.num_layers
    lr = args.lr
    # Get characters from string.printable
    all_characters = string.printable
    n_characters = len(all_characters)
    # Read large text file (Note can be any text file: not limited to just names)
    text = unidecode.unidecode(open('data/' + args.filename).read())
    text_clean = clean_text(text)
    text_clean = text_clean.split()
    print("length text ", len(text_clean))
    part = len(text_clean) * args.data_size
    text_clean = text_clean[:int(part)]
    seq_length = args.seq_length
    path = 'runs/LSTM_{}_lr-{}_batch_size-{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.lr, args.seq_length, args.batch_size)
    writer = SummaryWriter(path)
    y_data = []
    data = []
    part = ''
    for idx in range(0, len(text_clean) - 1):
        part += (text_clean[idx] + " ")
        new_length = len(part) + len(text_clean[idx+1])
        if new_length >= seq_length:
            if len(part) > seq_length:
                print("ERROR", len(part))
                print(part)
            part =  part[:-1]
            part = part.rjust(seq_length)
            y_data.append(text_clean[idx+1])
            data.append(part)
            part = ''
            continue
    print("data size ", len(data))
    print("ydata",len(y_data))
    train_size = round(len(data) * 0.1)
    X_train = data[:train_size]  
    y_train = y_data[:train_size]
    X_test = data[train_size:]
    y_test = y_data[train_size:]
    print("X_train", len(X_train))
    print("y_train", len(y_train))
    print("X_train", len(X_train[0]))
    print("y_train",  y_train[0])
    unique_word = set(text_clean)
    output_size = len(set(text_clean))
    word_dict = {word:idx for idx,word in enumerate(unique_word)}
    iterations  = math.floor(len(X_train)/ batch_size)
    train_data_size = iterations * batch_size
    choices = list(range(train_data_size))
    
    #model = RNN(seq_length, hidden_size, num_layers,output_size).to(device)
    print("create model ..")
    model = SentimentNet(output_size, output_size, output_size, hidden_size1,
            hidden_size2, num_layers, device).to(device)
    print("... done  loding ")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss() 
    print("=> Starting training")
    print("batch_Size", batch_size)
    print("num_layers", num_layers)
    print("lr", lr)
    best_loss = float("inf")
    print("start training")
    print("label amount ", len(y_train))
    X_train = X_train[:int(iterations * batch_size)]
    y_train = y_train[:int(iterations * batch_size)]
    print("use {} to train ".format(device))
    for epoch in range(1, num_epochs + 1):
        epoch_loss = 0
        choices = list(range(train_data_size))
        # reset the index choices for this epoch
        for idx in range(iterations):
            text = "Iteration  {} / {} ".format(idx, iterations)
            print(text, end="\r")
            hidden = model.init_hidden(batch_size)
            x_batch, y_batch,  choices = create_batches(X_train, y_train, choices, word_dict, output_size, all_characters, batch_size)
            x_batch = x_batch.float().to(device)
            #x_batch = x_batch.unsqueeze(0)
            #print(x_batch.type())
            y_batch = y_batch.type(torch.long)
            output, hidden = model(x_batch, hidden)
            output = output.float()
            y_batch = y_batch.long().to(device)
            #print(y_batch)
            loss = criterion(output.squeeze(0), y_batch.squeeze(1))
            #print("loss ", loss)
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
            for name, param in model.named_parameters():
                print(name, param.grad)
                break
            epoch_loss += loss.item()
        epoch_loss = epoch_loss / iterations
        print("Epoch {} loss {}".format(epoch, epoch_loss ))
        eval_model(model,  X_test, y_test, create_batches, word_dict,
                all_characters, output_size, device)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='LSTM word prediction')
    parser.add_argument('--filename', default="book1_LittleWomen.txt", help='')
    parser.add_argument('--seq_length', default=20, type=int, help='')
    parser.add_argument('--batch_size', default=128, type=int, help='')
    parser.add_argument('--hidden_size1', default=512, type=int, help='')
    parser.add_argument('--hidden_size2', default=1024, type=int, help='')
    parser.add_argument('--num_layers', default=4, type=int, help='')
    parser.add_argument('--data_size', default=0.1, type=float, help='')
    parser.add_argument('--lr', default=0.0005, type=float, help='')
    args = parser.parse_args()
    main(args)
And this is the gradient output I get:

embedding.weight tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
embedding.weight tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
Epoch 7 loss 8.744615505903194

No, they are not changing at all, and I don't detach anything.

Could you check if these .grad attributes are None before the first backward call and zeros (or any other values) afterwards?
If so, this wouldn't point towards a detached computation graph, but towards vanishing gradients.
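
To make this check easier to read, you could print summary statistics instead of the full tensors, e.g. (a small variation of the earlier snippet):

for name, param in model.named_parameters():
    if param.grad is None:
        print(name, "grad is None")  # would point towards a detached graph
    else:
        # very small values would point towards vanishing gradients
        print(name, param.grad.abs().max().item())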

Unfortunately not

embedding.weight tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
afterback prob 2 
embedding.weight tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


They are also zero.

I made the model smaller and now they are None.
The only time I detach anything is the hidden state; I understood that I have to do this.
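
For reference, the usual pattern for detaching a hidden state that is carried over between batches looks roughly like this (a sketch of the general idiom, not the exact code from above):

# stop backprop at the current batch so the graph from earlier batches is freed
hidden = tuple(h.detach() for h in hidden)
output, hidden = model(x_batch, hidden)
# with a detached (or freshly initialized) hidden state, a plain loss.backward()
# works and retain_graph=True should not be needed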

I don’t quite understand the outputs at the moment.

For the original model, did you see zero gradients in the embedding after creating the model and before the first backward or zero_grad call?
If so, then some other issue is in the code, since the .grad attributes are not filled with zeros by default, and some code is apparently manipulating them.

The smaller model has apparently introduced an issue which is now detaching the gradient for the embedding layer (or is at least exposing this issue now).
Could you post this small model, so that we could have a look?

Sorry for the confusion: I now have 2 LSTM layers, with fully connected sizes of 32 and 64. Here is the output:

pytorch version 1.5.0+cu101
length text  189014
data size  2037
ydata 2037
X_train 1222
y_train 1222
X_train 50
y_train Louisa
create model ..
... done  loding 
embedding.weight None
lstm.weight_ih_l0 None
lstm.weight_hh_l0 None
lstm.bias_ih_l0 None
lstm.bias_hh_l0 None
lstm.weight_ih_l1 None
lstm.weight_hh_l1 None
lstm.bias_ih_l1 None
lstm.bias_hh_l1 None
fc.weight None
fc.bias None
fc2.weight None
fc2.bias None
=> Starting training
batch_Size 128
num_layers 2
lr 0.05
start training
label amount  1222
use cuda to train 
embedding.weight tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
lstm.weight_ih_l0 tensor([[-1.1063e-08,  9.8287e-09,  1.0656e-08,  ...,  5.9566e-09,
          9.9389e-09,  9.8855e-09],
        [ 8.4692e-09,  1.5402e-08, -2.8731e-08,  ..., -3.0175e-09,
         -2.9208e-08, -6.2337e-09],
        [ 3.9109e-08, -2.3059e-09,  5.4245e-09,  ..., -2.0804e-08,
         -1.3307e-09, -3.0689e-08],
        ...,
        [-3.2323e-09, -1.7988e-10,  5.3804e-09,  ...,  1.4348e-09,
          2.5034e-09,  2.9880e-09],
        [ 6.4037e-10,  2.4291e-08, -1.3478e-08,  ..., -1.9483e-08,
         -1.2747e-09,  5.6516e-08],
        [-4.9771e-08,  2.1613e-08, -1.0253e-08,  ...,  3.1495e-08,
         -4.2576e-08, -9.7894e-09]], device='cuda:0')
lstm.weight_hh_l0 tensor([[-1.0335e-08,  5.0598e-09,  3.4083e-09,  ..., -7.7449e-09,
          4.3620e-11, -1.4557e-08],
        [ 2.5127e-08,  3.3538e-09, -7.4944e-10,  ..., -8.1999e-09,
         -8.3178e-09, -1.7731e-08],
        [ 1.1010e-08,  1.5204e-09, -5.1952e-09,  ...,  4.4810e-09,
          3.0697e-09,  3.3824e-09],
        ...,
        [-2.7111e-09, -5.2482e-09,  8.0058e-11,  ..., -2.7295e-09,
          7.4953e-10, -1.2066e-09],
        [-5.1258e-09,  1.1105e-08,  5.5830e-09,  ..., -8.5149e-09,
          4.2974e-10, -1.4851e-09],
        [-6.0431e-09,  5.8212e-09,  5.2287e-09,  ..., -1.0254e-08,
         -1.1058e-09,  1.2332e-08]], device='cuda:0')
lstm.bias_ih_l0 tensor([ 1.1024e-08, -3.6670e-08,  2.7407e-09,  8.6584e-09,  9.9973e-09,
         1.1498e-08, -3.3817e-08, -1.0450e-08,  7.9949e-09, -2.0574e-08,
         1.6153e-08,  7.2418e-09,  1.4902e-08, -1.0442e-08,  6.9512e-10,
         1.7358e-08, -8.8627e-09,  6.6883e-09, -7.1049e-09,  1.9581e-08,
         1.6576e-08,  1.0012e-08, -2.6061e-08, -1.1620e-08, -1.1734e-08,
        -1.0348e-08, -4.1551e-09, -1.2766e-08,  1.1955e-08, -2.0379e-08,
         5.7411e-09, -9.9559e-09, -1.8285e-08,  2.4916e-10,  2.8724e-09,
        -9.7809e-09,  3.3857e-08,  1.6930e-08, -1.3645e-08, -2.3350e-08,
         9.1927e-09, -1.1418e-08,  2.6563e-08,  1.4002e-09,  3.1444e-09,
        -7.2658e-09, -1.0615e-08,  1.2867e-08, -2.7184e-08,  2.5239e-08,
         2.1939e-08, -4.7100e-09,  3.7677e-08,  9.6631e-09, -3.0405e-09,
        -3.9288e-08, -1.2278e-09, -6.4334e-09,  2.5760e-08,  7.3272e-09,
        -6.3468e-08, -3.1217e-08,  4.4455e-09, -3.3671e-09,  1.6373e-07,
        -8.6436e-08,  4.1341e-08,  6.8275e-08, -7.1331e-08,  2.1625e-08,
         3.0736e-08, -3.1264e-08, -2.4704e-09,  6.3031e-09, -1.8657e-09,
         5.2730e-09,  5.2671e-08, -5.8692e-08, -3.5754e-08, -6.9110e-08,
        -6.6689e-10,  9.4464e-09, -1.8123e-09,  5.8840e-08, -3.4635e-08,
         1.2183e-08, -4.4816e-08, -4.8964e-09, -5.4653e-09,  6.7911e-09,
        -9.9127e-08,  1.2522e-09, -1.0675e-08, -3.7923e-08, -8.1623e-08,
         5.2246e-08, -2.0552e-08, -3.9296e-09, -9.5493e-09,  5.7565e-10,
         2.3854e-08,  9.2057e-10, -9.5234e-10, -1.4200e-08,  4.2268e-09,
         9.1718e-09,  4.2653e-09,  2.5779e-08,  2.7945e-08,  2.2597e-08,
         7.7742e-09,  8.1644e-09,  5.4698e-09,  9.5971e-09, -6.8111e-09,
         2.2269e-08,  2.2441e-08,  1.1569e-08, -3.3382e-09, -1.0234e-08,
        -2.0803e-09, -1.5768e-08, -1.1084e-08, -1.0867e-08, -1.4320e-08,
         2.3871e-09, -3.2945e-08, -3.6374e-08], device='cuda:0')
lstm.bias_hh_l0 tensor([ 1.1024e-08, -3.6670e-08,  2.7407e-09,  8.6584e-09,  9.9973e-09,
         1.1498e-08, -3.3817e-08, -1.0450e-08,  7.9949e-09, -2.0574e-08,
         1.6153e-08,  7.2418e-09,  1.4902e-08, -1.0442e-08,  6.9512e-10,
         1.7358e-08, -8.8627e-09,  6.6883e-09, -7.1049e-09,  1.9581e-08,
         1.6576e-08,  1.0012e-08, -2.6061e-08, -1.1620e-08, -1.1734e-08,
        -1.0348e-08, -4.1551e-09, -1.2766e-08,  1.1955e-08, -2.0379e-08,
         5.7411e-09, -9.9559e-09, -1.8285e-08,  2.4916e-10,  2.8724e-09,
        -9.7809e-09,  3.3857e-08,  1.6930e-08, -1.3645e-08, -2.3350e-08,
         9.1927e-09, -1.1418e-08,  2.6563e-08,  1.4002e-09,  3.1444e-09,
        -7.2658e-09, -1.0615e-08,  1.2867e-08, -2.7184e-08,  2.5239e-08,
         2.1939e-08, -4.7100e-09,  3.7677e-08,  9.6631e-09, -3.0405e-09,
        -3.9288e-08, -1.2278e-09, -6.4334e-09,  2.5760e-08,  7.3272e-09,
        -6.3468e-08, -3.1217e-08,  4.4455e-09, -3.3671e-09,  1.6373e-07,
        -8.6436e-08,  4.1341e-08,  6.8275e-08, -7.1331e-08,  2.1625e-08,
         3.0736e-08, -3.1264e-08, -2.4704e-09,  6.3031e-09, -1.8657e-09,
         5.2730e-09,  5.2671e-08, -5.8692e-08, -3.5754e-08, -6.9110e-08,
        -6.6689e-10,  9.4464e-09, -1.8123e-09,  5.8840e-08, -3.4635e-08,
         1.2183e-08, -4.4816e-08, -4.8964e-09, -5.4653e-09,  6.7911e-09,
        -9.9127e-08,  1.2522e-09, -1.0675e-08, -3.7923e-08, -8.1623e-08,
         5.2246e-08, -2.0552e-08, -3.9296e-09, -9.5493e-09,  5.7565e-10,
         2.3854e-08,  9.2057e-10, -9.5234e-10, -1.4200e-08,  4.2268e-09,
         9.1718e-09,  4.2653e-09,  2.5779e-08,  2.7945e-08,  2.2597e-08,
         7.7742e-09,  8.1644e-09,  5.4698e-09,  9.5971e-09, -6.8111e-09,
         2.2269e-08,  2.2441e-08,  1.1569e-08, -3.3382e-09, -1.0234e-08,
        -2.0803e-09, -1.5768e-08, -1.1084e-08, -1.0867e-08, -1.4320e-08,
         2.3871e-09, -3.2945e-08, -3.6374e-08], device='cuda:0')
lstm.weight_ih_l1 tensor([[ 8.4012e-08, -2.9539e-08,  1.3858e-08,  ...,  9.5577e-09,
         -5.4224e-09,  1.3203e-09],
        [ 2.3479e-08, -2.4442e-08,  3.5963e-09,  ..., -2.0605e-09,
         -1.1282e-09,  6.4519e-09],
        [-1.6921e-08, -9.0169e-09, -1.6799e-08,  ...,  1.3422e-08,
          2.1985e-08, -1.5083e-08],
        ...,
        [-4.6890e-08, -3.3445e-08,  2.3630e-08,  ...,  2.1088e-08,
         -7.8772e-09,  1.9002e-08],
        [-1.0548e-08,  6.9650e-09,  1.4421e-08,  ..., -1.4109e-08,
          1.7312e-08, -5.6161e-09],
        [ 3.2606e-08,  3.0479e-09,  2.3944e-08,  ...,  4.8170e-08,
          5.3714e-09,  2.9957e-08]], device='cuda:0')
lstm.weight_hh_l1 tensor([[ 6.3105e-09,  8.5650e-09,  1.2199e-08,  ..., -5.2815e-09,
         -1.3290e-08, -1.1771e-08],
        [ 2.1026e-09,  5.6092e-09,  2.3512e-09,  ...,  4.2428e-09,
         -1.4690e-09, -8.6503e-09],
        [ 3.8106e-09, -7.4459e-09, -8.0541e-10,  ...,  7.4869e-09,
         -2.0593e-10,  6.5506e-09],
        ...,
        [ 1.0081e-09,  2.1005e-09, -1.8269e-09,  ..., -5.9828e-09,
         -1.5774e-09,  1.4648e-09],
        [-2.2967e-09, -4.4685e-09,  2.1895e-09,  ...,  8.1985e-09,
          2.7976e-09,  2.2228e-09],
        [-1.8460e-09, -5.9974e-09,  5.0067e-09,  ...,  6.2485e-09,
         -4.9985e-10,  5.7330e-09]], device='cuda:0')
lstm.bias_ih_l1 tensor([-1.4627e-07, -3.2598e-08,  4.5248e-08, -3.6820e-08,  8.5256e-08,
        -1.6813e-09, -2.5214e-08, -2.6378e-09,  4.9213e-08, -6.8352e-09,
        -1.6807e-08,  3.5691e-08,  9.1554e-08,  2.8932e-08,  1.0407e-07,
         1.6364e-08,  2.7275e-08, -2.3216e-08,  5.4652e-08, -1.1749e-08,
         5.7970e-08,  5.0135e-10,  9.7462e-09,  5.9243e-08, -2.4544e-08,
        -1.9900e-07, -1.3418e-08,  9.4952e-08, -2.7586e-08, -1.0855e-08,
         6.7174e-08,  5.7864e-08, -6.9357e-09, -4.5838e-08,  9.7428e-09,
        -1.9542e-08,  6.9959e-08, -1.2232e-08,  1.8899e-08, -3.8950e-10,
         8.4376e-08, -8.8189e-09,  1.5604e-08,  5.3567e-08, -1.3605e-08,
        -4.3587e-08,  3.2783e-08,  4.5598e-08,  8.6922e-08, -1.5309e-08,
         1.4283e-07,  1.9255e-08,  9.3040e-09,  9.1163e-09,  2.2544e-08,
        -8.0286e-08, -2.4412e-09, -2.0616e-07,  2.4769e-08,  6.9618e-08,
        -4.0080e-08, -3.2525e-08,  2.9676e-08,  3.6615e-08,  8.4488e-07,
         8.8108e-07, -8.8140e-07, -3.3642e-07, -6.1423e-07,  1.8071e-07,
        -8.2801e-07,  1.4977e-06,  6.4847e-07, -9.8418e-08, -1.5375e-06,
         2.5985e-07, -4.9636e-07,  5.6687e-07, -8.1233e-07,  4.4857e-07,
        -1.1530e-06, -2.1248e-08, -1.7730e-06, -3.1444e-07,  1.9457e-07,
        -3.8730e-07,  1.2674e-06,  2.6293e-06,  9.9905e-07,  1.0608e-06,
         5.5551e-08,  3.9791e-07,  7.9500e-07,  7.8013e-08, -9.5171e-08,
         2.6153e-07, -1.6503e-07, -2.3647e-08,  4.5301e-08, -1.7203e-08,
         8.6487e-08,  1.6123e-09, -3.6743e-08, -1.6984e-08,  4.3068e-08,
        -3.6344e-08, -3.1881e-08,  3.2567e-08,  8.8137e-08,  2.7760e-08,
         8.9140e-08,  1.4355e-08,  3.1222e-08, -1.9337e-08,  4.1113e-08,
        -2.1121e-08,  6.1227e-08,  1.7561e-09,  1.4743e-08,  8.3583e-08,
        -9.0050e-09, -2.2631e-07, -2.6680e-08,  1.6824e-07, -3.2538e-08,
        -1.1349e-08,  6.9777e-08,  6.5302e-08], device='cuda:0')
lstm.bias_hh_l1 tensor([-1.4627e-07, -3.2598e-08,  4.5248e-08, -3.6820e-08,  8.5256e-08,
        -1.6813e-09, -2.5214e-08, -2.6378e-09,  4.9213e-08, -6.8352e-09,
        -1.6807e-08,  3.5691e-08,  9.1554e-08,  2.8932e-08,  1.0407e-07,
         1.6364e-08,  2.7275e-08, -2.3216e-08,  5.4652e-08, -1.1749e-08,
         5.7970e-08,  5.0135e-10,  9.7462e-09,  5.9243e-08, -2.4544e-08,
        -1.9900e-07, -1.3418e-08,  9.4952e-08, -2.7586e-08, -1.0855e-08,
         6.7174e-08,  5.7864e-08, -6.9357e-09, -4.5838e-08,  9.7428e-09,
        -1.9542e-08,  6.9959e-08, -1.2232e-08,  1.8899e-08, -3.8950e-10,
         8.4376e-08, -8.8189e-09,  1.5604e-08,  5.3567e-08, -1.3605e-08,
        -4.3587e-08,  3.2783e-08,  4.5598e-08,  8.6922e-08, -1.5309e-08,
         1.4283e-07,  1.9255e-08,  9.3040e-09,  9.1163e-09,  2.2544e-08,
        -8.0286e-08, -2.4412e-09, -2.0616e-07,  2.4769e-08,  6.9618e-08,
        -4.0080e-08, -3.2525e-08,  2.9676e-08,  3.6615e-08,  8.4488e-07,
         8.8108e-07, -8.8140e-07, -3.3642e-07, -6.1423e-07,  1.8071e-07,
        -8.2801e-07,  1.4977e-06,  6.4847e-07, -9.8418e-08, -1.5375e-06,
         2.5985e-07, -4.9636e-07,  5.6687e-07, -8.1233e-07,  4.4857e-07,
        -1.1530e-06, -2.1248e-08, -1.7730e-06, -3.1444e-07,  1.9457e-07,
        -3.8730e-07,  1.2674e-06,  2.6293e-06,  9.9905e-07,  1.0608e-06,
         5.5551e-08,  3.9791e-07,  7.9500e-07,  7.8013e-08, -9.5171e-08,
         2.6153e-07, -1.6503e-07, -2.3647e-08,  4.5301e-08, -1.7203e-08,
         8.6487e-08,  1.6123e-09, -3.6743e-08, -1.6984e-08,  4.3068e-08,
        -3.6344e-08, -3.1881e-08,  3.2567e-08,  8.8137e-08,  2.7760e-08,
         8.9140e-08,  1.4355e-08,  3.1222e-08, -1.9337e-08,  4.1113e-08,
        -2.1121e-08,  6.1227e-08,  1.7561e-09,  1.4743e-08,  8.3583e-08,
        -9.0050e-09, -2.2631e-07, -2.6680e-08,  1.6824e-07, -3.2538e-08,
        -1.1349e-08,  6.9777e-08,  6.5302e-08], device='cuda:0')
fc.weight tensor([[ 9.8894e-08, -2.6156e-08,  8.0726e-09,  ..., -7.6222e-08,
         -9.5513e-08,  3.1293e-08],
        [-3.4396e-07, -1.6109e-07, -2.4421e-07,  ...,  2.2020e-07,
          3.9574e-07, -8.7920e-08],
        [-1.2105e-07,  2.8051e-08,  3.6063e-08,  ...,  9.6170e-08,
         -2.5761e-08,  1.5995e-07],
        ...,
        [ 3.6290e-07,  6.9809e-08, -5.4139e-08,  ...,  6.5467e-08,
          2.9734e-07,  1.5860e-07],
        [ 2.2620e-07,  1.1168e-07, -8.2239e-08,  ..., -7.9498e-08,
         -5.7117e-07, -4.9839e-07],
        [ 3.6932e-07, -4.7490e-08, -1.2123e-07,  ..., -3.2805e-07,
          4.7888e-07,  6.1199e-07]], device='cuda:0')
fc.bias tensor([-5.6586e-07,  1.9536e-06,  3.9677e-07,  5.7258e-07,  2.7910e-06,
         1.6484e-06, -6.9489e-07, -1.2641e-06, -3.6925e-07, -7.8995e-07,
        -2.4324e-07,  2.9274e-06,  2.2647e-07, -1.2867e-06, -1.8834e-06,
        -2.2755e-06,  2.4804e-06, -1.5545e-06, -3.5014e-07,  1.5309e-06,
        -4.5696e-08,  1.9216e-06, -1.5605e-06,  0.0000e+00,  2.9187e-06,
         0.0000e+00, -1.7968e-07, -2.0765e-06,  3.3464e-06, -9.9088e-07,
         3.0034e-06,  1.3896e-07,  2.0264e-06,  2.3008e-06,  0.0000e+00,
         1.8977e-06,  1.5784e-06, -3.0460e-06,  2.1113e-06,  3.7848e-07,
         6.4100e-07, -4.6641e-06, -1.7383e-06,  2.2122e-06,  2.7763e-07,
         3.0854e-06, -2.3523e-06, -1.5652e-06,  8.1877e-07,  2.9956e-07,
        -1.6166e-06,  1.3329e-07,  0.0000e+00,  0.0000e+00, -2.0984e-06,
        -3.0431e-07,  1.3923e-06,  7.5651e-07,  2.3667e-06,  2.1777e-06,
         6.4042e-07,  1.8063e-06, -2.7013e-06,  3.2940e-06], device='cuda:0')
fc2.weight tensor([[2.8238e-10, 4.3198e-09, 5.1769e-10,  ..., 5.1011e-09, 3.0162e-09,
         9.3640e-09],
        [2.9068e-10, 4.5367e-09, 5.2633e-10,  ..., 5.1909e-09, 3.0547e-09,
         9.5743e-09],
        [2.8322e-10, 4.2048e-09, 5.0373e-10, 

It's only the embedding layer's gradients that are always zero; the fully connected gradients are changing.
I hope this helps you.

I think the computation graph is not broken; rather, the model architecture creates small gradients.
This dummy code produces valid gradients, which have a very small magnitude:

# all arguments (vocab_size, output_size, embedding_dim, hidden dims, n_layers) set to 10
model = SentimentNet(10, 10, 10, 10, 10, 10, 'cpu')
x = torch.randint(0, 10, (10, 10))  # dummy batch of 10 sequences of length 10
hidden = model.init_hidden(10)
out = model(x, hidden)
out[0].mean().backward()  # backprop through a scalar dummy loss
for name, param in model.named_parameters():
    print(name, param.grad)

print(model.embedding.weight.grad)

Also, note that nn.CrossEntropyLoss expects raw logits as the model output, not probabilities, so you should remove the softmax from your model (which might also increase the gradient magnitudes).
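
A sketch of that change in the forward method above (everything else unchanged):

def forward(self, x, hidden):
    x = x.long()
    embeds = self.embedding(x)
    lstm_out, hidden = self.lstm(embeds, hidden)
    out = self.dropout(lstm_out)
    out = self.relu(self.fc(out[:, -1, :]))
    out = self.dropout(out)
    # return the raw logits; nn.CrossEntropyLoss applies log_softmax internally
    out = self.fc2(out)
    return out, hidden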

Thank you, that works a bit now: the loss starts around 8.6 and goes down to at most 7.5.
I have already tried a lot of hyperparameters. Do you know what could work?

Unfortunately, I don't know what would work best, but I generally recommend scaling down the problem a bit and trying to overfit a small data sample (e.g. just 10 samples) until the model achieves (near) perfect accuracy. Once this is done, you should have a good starting point for the hyperparameters and can scale the use case up again.

OK, I will try this. Thank you!