Hi,
I am training an encoder for transformer
in my code, everything looks good but the accuracy and loss doesn’t change!
def Tensor(x):
return T.Tensor(x) if not isinstance(x, type(T.Tensor(1))) else x
def create_padding_mask(x):
x = Tensor(x)
return (x == 0).float().unsqueeze(1).unsqueeze(1)
def accuracy(true,pred):
acc = (true.argmax(-1) == pred.argmax(-1)).float().detach().numpy()
return float(100 * acc.sum() / len(acc))
def batch2(x,y):
x,y = shuffle(copy.copy(x), copy.copy(y))
x,y = x[:5000], y[:5000]
for i in range(0, len(x), batch_size):
yield x[i:i+batch_size], y[i:i+batch_size]
def cross_entropy(true, pred):
pred = T.clamp(pred, 1e-8, 1. - 1e-8)
N = pred.shape[0]
return -T.sum(true * T.log(pred+1e-9)) / N
def create_look_ahead_mask(x):
x = Tensor(x)
mask = (1-T.triu(T.ones(x.shape[-2], x.shape[-2])) == 1)
return mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, num_heads, batch_size):
super(MultiHeadAttention, self).__init__()
assert d_model % num_heads == 0
self.num_heads = num_heads
self.batch_size = batch_size
self.depth = d_model // num_heads
self.Q = nn.Linear(d_model, d_model)
self.K = nn.Linear(d_model, d_model)
self.V = nn.Linear(d_model, d_model)
self.linear = nn.Linear(d_model, d_model)
def SDPA(self, Q, K, V, M):
mat = T.matmul(Q, K.transpose(-2, -1))
scale = mat / np.sqrt(K.shape[-1])
if M is not None:
scale += M * -1e9
softmax = F.softmax(scale, -1)
return T.matmul(softmax, V)
def split_heads(self, inputs, batch_size):
return T.reshape(inputs, (batch_size, self.num_heads, -1, self.depth))
def forward(self, x):
query, key, value, mask = x;query, key, value, mask = query.float(), key.float(), value.float(), mask.float()
batch_size = query.shape[0]
Q = self.split_heads(self.Q(query), query.shape[0])
K = self.split_heads(self.K(query), query.shape[0])
V = self.split_heads(self.V(query), query.shape[0])
sdpa = self.SDPA(Q, K, V, mask)
cat = sdpa.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.depth)
return self.linear(cat)
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout, max_len):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(dropout)
pe = T.zeros(max_len, d_model)
position = T.arange(0, max_len).unsqueeze(1)
div_term = T.exp(T.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
pe[:, 0::2] = T.sin(position * div_term)
pe[:, 1::2] = T.cos(position * div_term)
pe = pe.unsqueeze(0)
self.register_buffer('pe', pe)
def forward(self, x):
x = Tensor(x).to(device)
x = x + T.Tensor(self.pe[:, :x.size(1)]).to(device)
return self.dropout(x)
class Encoder(nn.Module):
def __init__(self, batch_size, d_model, num_heads, dropout, num_layers, maxlen, num_words):
super(Encoder, self).__init__()
assert d_model % num_heads == 0
self.d_model = d_model
self.batch_size = batch_size
self.num_layers = num_layers
self.PE = PositionalEncoding(d_model, dropout, num_words)
self.emb1 = nn.Embedding(num_words, d_model).to(device)
self.emb2 = nn.Embedding(num_words, d_model).to(device)
self.emb3 = nn.Embedding(num_words, d_model).to(device)
self.ffn = nn.Linear(d_model, d_model).to(device)
self.attention = MultiHeadAttention(d_model, num_heads, batch_size).to(device)
self.output = nn.Linear(maxlen * d_model, num_words).to(device)
self.optimizer = Adam(self.parameters())
def forward(self, x):
x = Tensor(x)
x = x.long().to(device)
pad_mask = create_padding_mask(x)
emb1 = self.PE(self.emb1(x).to(device) * np.sqrt(self.d_model)).to(device)
emb2 = self.PE(self.emb2(x).to(device) * np.sqrt(self.d_model)).to(device)
emb3 = self.PE(self.emb3(x).to(device) * np.sqrt(self.d_model)).to(device)
inp = emb1 + emb2 + emb3
for i in range(self.num_layers):
attention = self.attention([inp,inp,inp,pad_mask])
layer_norm = F.layer_norm(inp + attention, attention.shape[1:], eps=1e-6)
ffn = self.ffn(layer_norm)
layer_norm = F.layer_norm(layer_norm + ffn, attention.shape[1:], eps=1e-6)
output = layer_norm.flatten(-2)
output = self.output(output)
return F.softmax(output, -1)
batch_size = 32
d_model = 512
num_heads = 8
dropout = .1
num_layers = 12
mask_token = tok.word_index['<mask>'] # 59
epochs = 10000
device = 'cuda'
encoder = Encoder(batch_size, d_model, num_heads, dropout, num_layers, maxlen, num_words)
for epoch in range(epochs):
losses = []
accuracies = []
t = time.time()
for x,y in batch2(X,Y):
prediction = encoder(x)
true = Tensor(y).long().to(device)
loss = nn.CrossEntropyLoss()(prediction, true)
losses.append(loss.cpu().detach().numpy())
accuracies.append(accuracy(true, prediction))
encoder.optimizer.zero_grad()
loss.backward()
encoder.optimizer.step()
print(f"Epoch: {epoch+1}/{epochs}\tLoss: {str(np.mean(losses))[:5]}\t Accuracy: {str(np.mean(accuracies))[:5]}\t{str(time.time() - t)[:5]}")
the results are this:
Epoch | Loss | Accuracy | Time Taken (s) |
---|---|---|---|
Epoch: 1/10000 | Loss: 9.555 | Accuracy: 0.0 | 47.77 |
Epoch: 2/10000 | Loss: 9.559 | Accuracy: 0.0 | 47.88 |
Epoch: 3/10000 | Loss: 9.565 | Accuracy: 0.0 | 47.85 |
Epoch: 4/10000 | Loss: 9.540 | Accuracy: 0.0 | 47.87 |
Epoch: 5/10000 | Loss: 9.563 | Accuracy: 0.0 | 47.86 |
Epoch: 6/10000 | Loss: 9.552 | Accuracy: 0.0 | 47.86 |
Epoch: 7/10000 | Loss: 9.558 | Accuracy: 0.0 | 47.86 |
Epoch: 8/10000 | Loss: 9.561 | Accuracy: 0.0 | 47.83 |
Epoch: 9/10000 | Loss: 9.556 | Accuracy: 0.636 | 47.84 |
Epoch: 10/10000 | Loss: 9.543 | Accuracy: 0.0 | 47.86 |
Epoch: 11/10000 | Loss: 9.533 | Accuracy: 0.0 | 47.87 |
Epoch: 12/10000 | Loss: 9.542 | Accuracy: 0.0 | 47.90 |
Epoch: 13/10000 | Loss: 9.542 | Accuracy: 0.0 | 47.90 |
Epoch: 14/10000 | Loss: 9.542 | Accuracy: 0.0 | 47.90 |
Epoch: 15/10000 | Loss: 9.537 | Accuracy: 0.0 | 47.91 |
in Keras, I get at least a non-zero accuracy but in PyTorch, nothing changes
why is it like that?