Same output from regression network

Hello, this is my first time implementing a network from scratch and I am facing an issue:
my model produces the same prediction for every input.
My input has the following shape:
[batch_of_essays, max_number_of_sentences, max_number_of_words, word_embedding_dimension]

Each sentence is passed through a 1D CNN, then pooled, and the pooled sentence vectors go into an LSTM.
For each essay, I pool the LSTM outputs over its sentences to get the essay representation.
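To make the intended shape flow concrete, here is a minimal sketch of the word-level step with made-up sizes (2 essays, 3 sentences, 20 words, 50-dim embeddings; only the shapes matter):

import torch

x = torch.randn(2, 3, 20, 50)  # (batch, max_sent, max_word, embedding_dim), toy sizes

# one sentence, rearranged for Conv1d: (batch=1, channels=embedding_dim, length=max_word)
sent = x[0][0].permute(1, 0).unsqueeze(0)   # (1, 50, 20)

conv = torch.nn.Conv1d(50, 100, kernel_size=5)
out = conv(sent)                            # (1, 100, 16): length 20 - 5 + 1 = 16
pooled, _ = out.squeeze(0).max(dim=1)       # (100,): one vector per sentence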

This is my model code:


import torch
import torch.nn as nn


class STL(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, filters, max_sent, max_word):
        super(STL, self).__init__()
        self.filters = filters
        # word-level 1D convolution over each sentence's word embeddings
        self.word_level_cnn = nn.Conv1d(embedding_dim, filters, kernel_size=5)
        self.dropout = nn.Dropout(p=0.5)
        # sentence-level LSTM (num_layers=max_sent stacks max_sent LSTM layers)
        self.sentence_level_rnn = nn.LSTM(filters, hidden_dim, num_layers=max_sent)
        self.dense = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        batch_size, max_sent, max_word, embedding_dim = x.size()

        essays_rep = []
        for i in range(batch_size):
            sent_rep = []
            for j in range(max_sent):
                cnn_out = self.word_level_cnn(x[i][j].permute(1, 0).unsqueeze(0))  # conv input: (batch=1, embedding_dim, max_word)
                cnn_out = cnn_out.squeeze(0).transpose(0, 1)  # (max_word - kernel_size + 1, filters)
                # attention pooling is planned here (rough sketch after the model code); max pooling over words for now
                pooled_output, _ = torch.max(cnn_out, dim=0)
                sent_rep.append(pooled_output)
            essays_rep.append(torch.stack(sent_rep))

        x = torch.stack(essays_rep)  # (batch, max_sent, filters)

        x = self.dropout(x)

        # Sentence-level RNN layer
        x, _ = self.sentence_level_rnn(x)
        # print('after RNN size', x.size())

        # Sentence-level pooling (max over sentences; attention planned here too)
        pooled_output, _ = torch.max(x, dim=1)
        # print('after atten size', pooled_output.size())

        # Dense layer, then sigmoid to normalize the score into [0, 1]
        score = self.dense(pooled_output)
        # print('after dense', score)
        score = torch.sigmoid(score)

        return score
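For the attention pooling mentioned in the comments, my rough idea is something like this (untested sketch, not part of the model yet):

class AttentionPool(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.w = nn.Linear(dim, 1)

    def forward(self, x):  # x: (seq_len, dim)
        weights = torch.softmax(self.w(x), dim=0)  # (seq_len, 1), sums to 1 over the sequence
        return (weights * x).sum(dim=0)            # (dim,): weighted average of the sequence

And this is how I smoke-test the model on random data (the hyperparameter values are made up, just to check shapes):

model = STL(vocab_size=4000, embedding_dim=50, hidden_dim=100, filters=100, max_sent=3, max_word=20)
dummy = torch.randn(2, 3, 20, 50)  # (batch, max_sent, max_word, embedding_dim)
print(model(dummy).shape)          # expect torch.Size([2, 1]): one score per essay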

This is the training code:


# Loading the dataset
train_dataset = EssaysDataset(data_path=train_data_path, essay_set=essay_set)
dev_dataset = EssaysDataset(data_path=dev_data_path, essay_set=essay_set)
test_dataset = EssaysDataset(data_path=test_data_path, essay_set=essay_set)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch, shuffle=True)
dev_loader = DataLoader(dataset=dev_dataset, batch_size=batch, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch, shuffle=False)


# sanity check: peek at one batch
# for samples, labels in train_loader:
#     print(samples.shape, labels.shape)
#     break

max_sent = train_dataset.MAX_SENT
max_word = train_dataset.MAX_WORD

# the model 
model = STL(vocab_size, embedding_dim, hidden_dim, filters, max_sent, max_word)
model.to(device)

# loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

# training loop
n_total_steps = len(train_loader)
for epoch in range(epochs):
    model.train()
    for i, (essays, labels) in enumerate(train_loader):
        
        # print(essays[0])
        # forward 
        essays = essays.to(device)
        labels = labels.to(device)
        outputs = model(essays)
        outputs = outputs.squeeze()

        # min-max normalize the labels to [0, 1] using this batch's min and max
        labels = (labels - labels.min()) / (labels.max() - labels.min())

        loss = criterion(outputs, labels)
    
        # backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 10 == 0:
            print(f'epoch {epoch+1} / {epochs}, step {i+1}/{n_total_steps}, loss={loss.item():.4f}')

    # evaluate on the test set
    model.eval()
    with torch.no_grad():
        n_correct = 0
        n_samples = 0

        QWK = 0
        for essays, labels in test_loader:
            essays = essays.to(device)
            labels = labels.to(device)
            outputs = model(essays)
            # map normalized scores back to the prompt's score range, then round
            outputs = torch.round((outputs * (prompt_max - prompt_min)) + prompt_min)
            n_samples += len(labels)
            n_correct += (outputs.squeeze() == labels).sum().item()

        # QWK = quadratic_weighted_kappa(x, y, min_rating=prompt_min, max_rating=prompt_max)
        acc = 100.0 * n_correct / n_samples
        print(f'accuracy = {acc}, QWK = {QWK}')
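For the QWK metric that is still commented out: if you have scikit-learn available, I believe cohen_kappa_score with weights='quadratic' computes the same quadratic weighted kappa. Something like this should work, assuming all_labels and all_preds hold the integer labels and rounded predictions accumulated over the whole test set (those two names are mine, they do not exist in the code above):

from sklearn.metrics import cohen_kappa_score

# all_labels / all_preds: 1-D lists or arrays of integer scores for the full test set
QWK = cohen_kappa_score(all_labels, all_preds, weights='quadratic')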