Torch doesn't optimize the model

Hi,
I am training a Transformer encoder. Everything in my code looks fine to me, but the accuracy and loss don't change!

import copy
import time
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.utils import shuffle  # assumption: shuffle(x, y) returns both arrays shuffled in unison

def Tensor(x):
    # Convert x to a torch.Tensor unless it already is one.
    return x if isinstance(x, T.Tensor) else T.Tensor(x)

def create_padding_mask(x):
    x = Tensor(x)
    return (x == 0).float().unsqueeze(1).unsqueeze(1)

def accuracy(true, pred):
    # Percentage of positions where the predicted class matches the target class.
    acc = (true.argmax(-1) == pred.argmax(-1)).float().detach().cpu().numpy()
    return float(100 * acc.sum() / len(acc))


def batch2(x,y):
    x,y = shuffle(copy.copy(x), copy.copy(y))
    x,y = x[:5000], y[:5000]
    for i in range(0, len(x), batch_size):
        yield x[i:i+batch_size], y[i:i+batch_size]

def cross_entropy(true, pred):
    pred = T.clamp(pred, 1e-8, 1. - 1e-8)
    N = pred.shape[0]
    return -T.sum(true * T.log(pred+1e-9)) / N

def create_look_ahead_mask(x):
    x = Tensor(x)
    mask = (1-T.triu(T.ones(x.shape[-2], x.shape[-2])) == 1)
    return mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, batch_size):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.batch_size = batch_size
        self.depth = d_model // num_heads
        self.Q = nn.Linear(d_model, d_model)
        self.K = nn.Linear(d_model, d_model)
        self.V = nn.Linear(d_model, d_model)
        self.linear = nn.Linear(d_model, d_model)

    def SDPA(self, Q, K, V, M):
        mat = T.matmul(Q, K.transpose(-2, -1))
        scale = mat / np.sqrt(K.shape[-1])
        if M is not None:
            scale += M * -1e9
        softmax = F.softmax(scale, -1)
        return T.matmul(softmax, V)

    def split_heads(self, inputs, batch_size):
        return T.reshape(inputs, (batch_size, self.num_heads, -1, self.depth))

    def forward(self, x):
        query, key, value, mask = x
        query, key, value, mask = query.float(), key.float(), value.float(), mask.float()
        batch_size = query.shape[0]
        Q = self.split_heads(self.Q(query), batch_size)
        K = self.split_heads(self.K(key), batch_size)
        V = self.split_heads(self.V(value), batch_size)
        sdpa = self.SDPA(Q, K, V, mask)
        cat = sdpa.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.depth)
        return self.linear(cat)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        pe = T.zeros(max_len, d_model)
        position = T.arange(0, max_len).unsqueeze(1)
        div_term = T.exp(T.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe[:, 0::2] = T.sin(position * div_term)
        pe[:, 1::2] = T.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = Tensor(x).to(device)
        x = x + self.pe[:, :x.size(1)].to(device)
        return self.dropout(x)

class Encoder(nn.Module):
    def __init__(self, batch_size, d_model, num_heads, dropout, num_layers, maxlen, num_words):
        super(Encoder, self).__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.PE = PositionalEncoding(d_model, dropout, num_words)
        self.emb1 = nn.Embedding(num_words, d_model).to(device)
        self.emb2 = nn.Embedding(num_words, d_model).to(device)
        self.emb3 = nn.Embedding(num_words, d_model).to(device)
        self.ffn = nn.Linear(d_model, d_model).to(device)
        self.attention = MultiHeadAttention(d_model, num_heads, batch_size).to(device)
        self.output = nn.Linear(maxlen * d_model, num_words).to(device)
        self.optimizer = Adam(self.parameters())

    def forward(self, x):
        x = Tensor(x)
        x = x.long().to(device)
        pad_mask = create_padding_mask(x)
        emb1 = self.PE(self.emb1(x).to(device) * np.sqrt(self.d_model)).to(device)
        emb2 = self.PE(self.emb2(x).to(device) * np.sqrt(self.d_model)).to(device)
        emb3 = self.PE(self.emb3(x).to(device) * np.sqrt(self.d_model)).to(device)
        inp = emb1 + emb2 + emb3
        for i in range(self.num_layers):
            attention = self.attention([inp,inp,inp,pad_mask])
            layer_norm = F.layer_norm(inp + attention, attention.shape[1:], eps=1e-6)
            ffn = self.ffn(layer_norm)
            layer_norm = F.layer_norm(layer_norm + ffn, attention.shape[1:], eps=1e-6)
        output = layer_norm.flatten(-2)
        output = self.output(output)
        return F.softmax(output, -1)

batch_size = 32
d_model = 512
num_heads = 8
dropout = .1
num_layers = 12
mask_token = tok.word_index['<mask>'] # 59
epochs = 10000
device = 'cuda'
encoder = Encoder(batch_size, d_model, num_heads, dropout, num_layers, maxlen, num_words)
for epoch in range(epochs):
    losses = []
    accuracies = []
    t = time.time()
    for x,y in batch2(X,Y):
        prediction = encoder(x)
        true = Tensor(y).long().to(device)
        loss = nn.CrossEntropyLoss()(prediction, true)
        losses.append(loss.cpu().detach().numpy())
        accuracies.append(accuracy(true, prediction))
        encoder.optimizer.zero_grad()
        loss.backward()
        encoder.optimizer.step()
    print(f"Epoch: {epoch+1}/{epochs}\tLoss: {str(np.mean(losses))[:5]}\t Accuracy: {str(np.mean(accuracies))[:5]}\t{str(time.time() - t)[:5]}")

The results look like this:

Epoch Loss Accuracy Time Taken (s)
Epoch: 1/10000 Loss: 9.555 Accuracy: 0.0 47.77
Epoch: 2/10000 Loss: 9.559 Accuracy: 0.0 47.88
Epoch: 3/10000 Loss: 9.565 Accuracy: 0.0 47.85
Epoch: 4/10000 Loss: 9.540 Accuracy: 0.0 47.87
Epoch: 5/10000 Loss: 9.563 Accuracy: 0.0 47.86
Epoch: 6/10000 Loss: 9.552 Accuracy: 0.0 47.86
Epoch: 7/10000 Loss: 9.558 Accuracy: 0.0 47.86
Epoch: 8/10000 Loss: 9.561 Accuracy: 0.0 47.83
Epoch: 9/10000 Loss: 9.556 Accuracy: 0.636 47.84
Epoch: 10/10000 Loss: 9.543 Accuracy: 0.0 47.86
Epoch: 11/10000 Loss: 9.533 Accuracy: 0.0 47.87
Epoch: 12/10000 Loss: 9.542 Accuracy: 0.0 47.90
Epoch: 13/10000 Loss: 9.542 Accuracy: 0.0 47.90
Epoch: 14/10000 Loss: 9.542 Accuracy: 0.0 47.90
Epoch: 15/10000 Loss: 9.537 Accuracy: 0.0 47.91

In Keras I get at least a non-zero accuracy, but in PyTorch nothing changes.
Why is that?

I found the solution (a corrected sketch follows the list):

  1. The output should be log_softmax paired with the NLL loss function (applying softmax in forward and then nn.CrossEntropyLoss on top applies log-softmax twice, which squashes the outputs and leaves almost no gradient).
  2. Y should be a tensor of class indices with shape (batch_size,), not one-hot vectors.
  3. zero_grad() should be called after encoder(x), before loss.backward().
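
For reference, here is a minimal sketch of the corrected training step. It reuses the names from the code above (encoder, batch2, X, Y, device), assumes Y already holds integer class indices rather than one-hot rows, and assumes Encoder.forward now ends with return F.log_softmax(output, -1) instead of return F.softmax(output, -1):

criterion = nn.NLLLoss()  # expects log-probabilities and class-index targets

for x, y in batch2(X, Y):
    log_probs = encoder(x)                      # (batch_size, num_words) log-probabilities
    target = T.as_tensor(y).long().to(device)   # shape (batch_size,): class indices, not one-hot
    encoder.optimizer.zero_grad()               # clear old gradients after the forward pass
    loss = criterion(log_probs, target)
    loss.backward()
    encoder.optimizer.step()

Equivalently, forward can keep returning raw logits and the loop can use nn.CrossEntropyLoss, which applies log_softmax internally; the key point is that softmax must not be applied twice.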