Don't know what to do next

Hello,
I am building an encoder-decoder neural machine translation model with a GRU unit. I have been working on this for a good 3 months but can't seem to make it better. The loss stays around 11 and the accuracy is 17%; it was 75% before, but after I put encoder.train() and encoder.eval() in the loops it dropped.

encoder:
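I haven't pasted the encoder code itself; as a stand-in, something along these lines would match the interface the loops below expect (this is only a sketch, not my exact code: forward(X, lengths, device) returning the GRU outputs and the final hidden state of shape (1, batch, enc_units)):

import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, enc_units)

    def forward(self, x, lengths, device):
        # x arrives as (seq_len, batch) after sort_batch's transpose
        embedded = self.embedding(x)                                   # (seq_len, batch, embedding_dim)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu())
        output, hidden = self.gru(packed)                              # hidden: (1, batch, enc_units)
        output, _ = nn.utils.rnn.pad_packed_sequence(output)
        return output, hidden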

hyperparams:
batch_size = 16
embedding_dim = 256
enc_units = 256
epochs = 10
lr_rate = 0.05

optimizer = torch.optim.Adam(list(decoder.parameters()) + list(encoder.parameters()), lr_rate)
loss_fn = nn.CrossEntropyLoss()

decoder:
class Decoder(nn.Module):
    def __init__(self, embedding_dim, enc_units, vocab_size, vocab):
        super(Decoder, self).__init__()
        self.embedding_dim = embedding_dim  # TODO
        self.vocab_size = vocab_size
        self.enc_units = enc_units
        self.vocab = vocab
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.embedding.weight.data = torch.nn.functional.normalize(self.embedding.weight.data)
        self.gru = nn.GRU(self.embedding_dim, self.enc_units)  # gru has dropout in itself
        #self.dropout = nn.Dropout(0.2)
        self.linear_layer = nn.Linear(self.enc_units, len(self.vocab))
        self.linear_layer_inference = nn.Linear(self.enc_units, self.embedding_dim)
        self.softmax_layer = nn.Softmax(dim=2)
        self.activation_layer = nn.ReLU()

    def forward(self, y, initial_state, loss_fn, previous_output, max_generated_length, training=True):
        prediction = torch.randn(2, 2)
        count = 0
        vocab_range = torch.arange(0, len(self.vocab), device='cuda')
        loss = 0
        correct = 0
        false = 0
        embedding_table = self.embedding(vocab_range)

        if training:
            # teacher forcing: feed the ground-truth token at each time step
            for step_t in range(y.T.size()[0]):
                mask = torch.where(y.T[step_t] == 0, 0, 1)
                debug = y.T[step_t]
                output, initial_state = self.gru(torch.unsqueeze(embedding_table[y.T[step_t]], 0), initial_state)
                #output = self.dropout(output)
                linear_layer_output = self.linear_layer(output)
                linear_layer_output = self.activation_layer(linear_layer_output)
                mask = torch.unsqueeze(mask, 0).T
                prob = self.softmax_layer(linear_layer_output)
                prediction = torch.squeeze(torch.argmax(prob, dim=2))
                linear_layer_output = linear_layer_output * mask
                loss_step = loss_fn(torch.squeeze(linear_layer_output), y.T[step_t])
                loss += loss_step / y.T.size()[0]
                correct += (prediction == y.T[step_t]).type(torch.float).sum().item()
                false += (prediction != y.T[step_t]).type(torch.float).sum().item()

            return loss, correct, false

        else:
            # inference: feed the previous GRU output back in until every sequence has produced <end>
            end_number = self.vocab['<end>']
            for i in range(max_generated_length):
                previous_output = self.linear_layer_inference(previous_output)
                previous_output, initial_state = self.gru(previous_output, initial_state)
                logits = self.linear_layer(previous_output[-1:, :, :])
                logits = self.activation_layer(logits)
                prob = self.softmax_layer(logits)
                prediction = torch.squeeze(torch.argmax(prob, dim=2))
                count = count + (prediction == end_number).sum().item()
                if y.T.size()[0] < i+1:
                    false += 1
                else:
                    correct += (prediction == y.T[i]).type(torch.float).sum().item()
                    false += (prediction != y.T[i]).type(torch.float).sum().item()
                if count == prediction.size()[0]:
                    return correct, false
            return correct, false
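
This is roughly how I build the two models from the hyperparameters above (the Encoder here refers to the sketch at the top; vocab is my token-to-index mapping, and the exact names come from my own script):

device = 'cuda' if torch.cuda.is_available() else 'cpu'
encoder = Encoder(len(vocab), embedding_dim, enc_units).to(device)
decoder = Decoder(embedding_dim, enc_units, len(vocab), vocab).to(device)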

def sort_batch(X, y, lengths):
    lengths, indx = lengths.sort(dim=0, descending=True)
    X = X[indx]
    y = y[indx]
    return X.transpose(0, 1), y, lengths
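
To make the shapes explicit, here is what sort_batch does on a toy batch (the values are made up):

X = torch.tensor([[5, 6, 0], [1, 2, 3]])        # (batch=2, seq_len=3), 0 = padding
y = torch.tensor([[7, 8, 0], [4, 5, 6]])
lengths = torch.tensor([2, 3])

X_sorted, y_sorted, lengths_sorted = sort_batch(X, y, lengths)
print(X_sorted.shape)        # torch.Size([3, 2]) -> (seq_len, batch) for the GRU
print(lengths_sorted)        # tensor([3, 2]), sorted descending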

train and test loops:
def train_loop(dataloader, model, encoder, optimizer, loss_fn, device):
    size = len(dataloader.dataset)
    model.train()
    encoder.train()
    total_correct, total_false = 0, 0
    for batch, (X_, y, len_) in enumerate(iter(dataloader)):
        X, y, len__ = sort_batch(X_, y, len_)
        encoder_output, hidden_state = encoder.forward(X.to(device), len__, device)
        #print('encoding done')
        loss, correct, false = model.forward(y.to(device), hidden_state, loss_fn, encoder_output, max(len__))
        #print('decoding done')
        total_correct += correct
        total_false += false
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X_)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
    print(f"Train Error: \n"
          f"Accuracy: {total_correct/(total_false+total_correct)*100:>0.3f}% "
          f"Correct: {total_correct} out of {total_correct+total_false}")

def test_loop(dataloader, model, encoder, loss_fn, device):
    model.eval()
    encoder.eval()
    size = len(dataloader.dataset)
    total_correct, total_false, total_loss = 0, 0, 0
    with torch.no_grad():
        for batch, (X_, y, len_) in enumerate(dataloader):
            X, y, len__ = sort_batch(X_, y, len_)
            encoder_output, hidden_state = encoder.forward(X.to(device), len__, device)
            correct, false = model.forward(y.to(device), hidden_state, loss_fn, encoder_output, max(len__)*3, training=False)
            #total_loss += loss
            total_correct += correct
            total_false += false
            if batch % 100 == 0:
                current = (batch + 1) * len(X_)
                print(f"[{current:>5d}/{size:>5d}] "
                      f"Correct so far: {(100 * total_correct/(total_false+total_correct)):>0.1f} "
                      f"Correct: {total_correct}, false: {total_false} ")
    print(f"Test Error: \n Accuracy: {(100 * total_correct/(total_false+total_correct)):>0.1f}%, ")