My custom model, built around the transformer encoder layer of the Vision Transformer (code below), is not converging on a binary classification task, while the same model does converge on multi-class classification after 50 epochs with the SGD optimizer and a learning rate of 0.005. I tried both cross-entropy and binary cross-entropy loss functions, but neither worked. What might be the issues here?
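For context, here is a simplified sketch of how the two loss setups are wired (images and labels are placeholders for one batch from the dataloader; the full training loop is omitted):

import torch
from torch import nn

# Variant 1: two output logits + CrossEntropyLoss (integer labels 0 or 1)
model = ViT(num_classes=2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.005)
logits = model(images)               # shape: (batch_size, 2)
loss = criterion(logits, labels)     # labels: LongTensor of shape (batch_size,)

# Variant 2: one output logit + BCEWithLogitsLoss (float labels 0.0 or 1.0)
model = ViT(num_classes=1)
criterion = nn.BCEWithLogitsLoss()
logits = model(images).squeeze(1)    # shape: (batch_size,)
loss = criterion(logits, labels.float())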
The code for the model is as follows:
import torch
from torch import nn
# PatchEmbedding, MyDenseBlock and ImageClassificationBase are defined elsewhere in my code

class ViT(ImageClassificationBase):
    def __init__(self,
                 img_size=224,              # from Table 3
                 num_channels=3,
                 patch_size=16,
                 embedding_dim=768,         # from Table 1
                 dropout=0.1,
                 num_transformer_layers=6,  # from Table 1
                 num_heads=16,
                 num_classes=2):
        super().__init__()
        self.layers = num_transformer_layers

        # Assert image size is divisible by patch size
        assert img_size % patch_size == 0, "Image size must be divisible by patch size."
        assert embedding_dim % num_heads == 0, "Embedding dimension should be divisible by num_heads."

        # 1. Create patch embedding
        self.patch_embedding = PatchEmbedding(in_channels=num_channels,
                                              patch_size=patch_size,
                                              embedding_dim=embedding_dim)

        # 2. Create class token
        self.class_token = nn.Parameter(torch.randn(1, 1, embedding_dim),
                                        requires_grad=True)

        # 3. Create positional embedding
        num_patches = (img_size * img_size) // patch_size**2  # N = HW/P^2
        self.positional_embedding = nn.Parameter(torch.randn(1, num_patches + 1, embedding_dim))

        # 4. Embedding dropout
        self.embedding_dropout = nn.Dropout(p=dropout)

        # 5. Transformer encoder layer (a single MyDenseBlock instance,
        #    applied num_transformer_layers times in forward)
        self.transformer_encoder_layer = MyDenseBlock(embedding=embedding_dim,
                                                      n_heads=num_heads,
                                                      mlp=4 * embedding_dim)

        # 6. Create MLP head
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(normalized_shape=embedding_dim),
            nn.Linear(in_features=embedding_dim,
                      out_features=num_classes)
        )
    def forward(self, x):
        batch_size = x.shape[0]

        # Create the patch embedding
        x = self.patch_embedding(x)
        init_patch = x

        # First, expand the class token across the batch size
        class_token = self.class_token.expand(batch_size, -1, -1)

        # Prepend the class token to the patch embedding
        x = torch.cat((class_token, x), dim=1)

        # Add the positional embedding to patch embedding with class token
        x = self.positional_embedding + x

        # Dropout on patch + positional embedding
        x = self.embedding_dropout(x)

        # Pass embedding through Transformer Encoder stack
        # (the same encoder layer instance is applied at every iteration)
        for l in range(self.layers):
            encoder_feat = self.transformer_encoder_layer(x)
            # Add the initial patch embedding back to the patch tokens,
            # then re-attach the class token from the encoder output
            x = encoder_feat[:, 1:] + init_patch
            x = torch.cat((encoder_feat[:, 0].unsqueeze(1), x), dim=1)

        # Pass 0th index of x through MLP head
        x = self.mlp_head(x[:, 0])
        return x
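For a quick sanity check of the output shape (this assumes PatchEmbedding returns tensors of shape (batch, num_patches, embedding_dim) and MyDenseBlock preserves that shape; both modules are omitted above):

model = ViT(num_classes=2)
dummy = torch.randn(4, 3, 224, 224)   # arbitrary batch of 4 RGB images at 224x224
out = model(dummy)
print(out.shape)                      # torch.Size([4, 2]) -- raw logits, no softmax/sigmoid applied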