the following code is gpt generated and tweaked
i read about the architecture and saw from scratch implementation of it and i dont get what is wrong with the following code it doesnt learn doesnt exceed guessing accuracy but there is initial drop in loss in the first epoch
any help is much appreciated
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor, Normalize
from torchvision import transforms
from torchvision.datasets import FashionMNIST ,MNIST ,CIFAR100 ,CIFAR10
data_set = MNIST
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:x.size(0), :]
return self.dropout(x)
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, n_heads):
super(MultiHeadAttention, self).__init__()
self.n_heads = n_heads
self.d_model = d_model
self.d_k = d_model // n_heads
self.linear_q = nn.Linear(d_model, d_model)
self.linear_k = nn.Linear(d_model, d_model)
self.linear_v = nn.Linear(d_model, d_model)
self.linear_out = nn.Linear(d_model, d_model)
def forward(self, query, key, value, mask=None):
batch_size = query.size(0)
query = self.linear_q(query)
key = self.linear_k(key)
value = self.linear_v(value)
query = query.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
key = key.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
value = value.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float))
if mask is not None:
mask = mask.unsqueeze(1).unsqueeze(2)
scores = scores.masked_fill(mask == 0, -1e9)
attention = F.softmax(scores, dim=-1)
x = torch.matmul(attention, value)
x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_k)
x = self.linear_out(x)
return x
class FeedForward(nn.Module):
def __init__(self, d_model, d_ff, dropout=0.1):
super(FeedForward, self).__init__()
self.linear_1 = nn.Linear(d_model, d_ff)
self.linear_2 = nn.Linear(d_ff, d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.dropout(F.relu(self.linear_1(x)))
x = self.linear_2(x)
return x
class TransformerEncoderLayer(nn.Module):
def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
super(TransformerEncoderLayer, self).__init__()
self.self_attn = MultiHeadAttention(d_model, n_heads)
self.feed_forward = FeedForward(d_model, d_ff, dropout)
self.norm_1 = nn.LayerNorm(d_model)
self.norm_2 = nn.LayerNorm(d_model)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
def forward(self, x, mask=None):
x_norm = self.norm_1(x)
attn_out = self.self_attn(x_norm, x_norm, x_norm, mask)
x = x + self.dropout_1(attn_out)
x_norm = self.norm_2(x)
ff_out = self.feed_forward(x_norm)
x = x + self.dropout_2(ff_out)
return x
class ImageTransformer(nn.Module):
def __init__(self, input_dim, n_heads, d_ff, n_layers, d_model, dropout=0.1):
super(ImageTransformer, self).__init__()
self.input_dim = input_dim
self.d_model = d_model
self.embedding = nn.Linear(input_dim, d_model)
self.pe = PositionalEncoding(d_model ,max_len = 1000)
self.layers = nn.ModuleList([
TransformerEncoderLayer(d_model, n_heads, d_ff, dropout)
for _ in range(n_layers)
])
self.norm = nn.LayerNorm(d_model)
self.output = nn.Linear(d_model, 10)
def forward(self, x, mask=None):
x = self.embedding(x)
x = self.pe(x)
for layer in self.layers:
x = layer(x, mask)
x = self.norm(x)
x = torch.mean(x, dim=1)
x = self.output(x)
return x
def train(model, device, train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.cross_entropy(output, target)
loss.backward()
optimizer.step()
if batch_idx % 50 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.cross_entropy(output, target, reduction='sum').item()
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
accuracy = 100. * correct / len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
test_loss, correct, len(test_loader.dataset), accuracy))
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def main(model = None):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 256
epochs = 10
lr = 0.01
n_heads = 8
d_model = 256
d_ff = 512
n_layers = 4
train_dataset = data_set(root="./data", train=True, download=True, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,)),
transforms.Lambda(lambda x: x.view(-1)) # Flatten the image
]))
test_dataset = data_set(root="./data", train=False, download=True, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,)),
transforms.Lambda(lambda x: x.view(-1)) # Flatten the image
]))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
if not model :
model = ImageTransformer(28*28, n_heads, d_ff, n_layers, d_model).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
num_parameters = count_parameters(model)
print("Number of parameters in the model:", num_parameters)
for epoch in range(1, epochs + 1):
train(model, device, train_loader, optimizer, epoch)
test(model, device, test_loader)
return model
if __name__ == '__main__':
model = main()