My Binary Classifier is not Learning

I have built this model from scratch and it doesn't learn: the training accuracy is always 0.5 or 0.4375, and the validation accuracy is stuck too. Any help would be appreciated.

This is my dataset loader:

    # Imports used throughout the snippets below.
    import numpy as np
    import torch
    import torch.nn as nn
    import transformers
    from sklearn.metrics import accuracy_score
    from tqdm import tqdm

    class Dataset(torch.utils.data.Dataset):
        def __init__(self, df, max_len=96):
            self.df = df
            self.max_len = max_len
            # The dataframe is labeled if it has two columns (text, label).
            self.labeled = (df.shape[1] == 2)
            self.tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

        def __getitem__(self, index):
            data = {}
            row = self.df.iloc[index]
            ids, masks, labels = self.get_input_data(row)
            data['ids'] = ids
            data['masks'] = masks
            if self.labeled:
                data['labels'] = torch.tensor(labels, dtype=torch.float64)
            return data

        def __len__(self):
            return len(self.df)

        def get_input_data(self, row):
            # Tokenize and pad to max_len with DistilBERT's pad token id (0).
            ids = self.tokenizer.encode(row[0], add_special_tokens=True)
            pad_len = self.max_len - len(ids)
            if pad_len > 0:
                ids += [0] * pad_len
            ids = torch.tensor(ids)
            # Attention mask: 1 for real tokens, 0 for padding (pad id is 0).
            masks = torch.where(ids != 0, torch.tensor(1), torch.tensor(0))
            return ids, masks, row[1]

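For reference, the loaders used in the training loop below would be built along these lines (a sketch; `train_df`, `val_df`, and the batch size are assumptions, not from the original post):

    # Hypothetical dataframes with a text column and a label column.
    train_loader = torch.utils.data.DataLoader(Dataset(train_df), batch_size=16, shuffle=True)
    val_loader = torch.utils.data.DataLoader(Dataset(val_df), batch_size=16)
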
This is my model:

    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.distilBert = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased')
            self.fc0 = nn.Linear(768, 256)
            self.fc1 = nn.Linear(256, 1)
            self.lr = nn.LeakyReLU()
            nn.init.normal_(self.fc0.weight, std=0.2)
            nn.init.normal_(self.fc0.bias, 0.1)
            nn.init.normal_(self.fc1.weight, std=0.2)
            nn.init.normal_(self.fc1.bias, 0.1)

        def forward(self, input_ids, attention_mask):
            # Use the hidden state of the first ([CLS]) token as the sentence embedding.
            src = self.distilBert(input_ids, attention_mask)
            src = src[0][:, 0, :]
            x = self.fc0(src)
            x = self.lr(x)
            x = self.fc1(x)
            # Raw logit; the training loop applies the sigmoid.
            return x
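
A quick shape check of the forward pass (a sketch; the batch of 2 all-zero id sequences just stands in for real tokenized input):

    # Dummy batch: 2 sequences of 96 token ids with an all-ones attention mask.
    ids = torch.zeros(2, 96, dtype=torch.long)
    masks = torch.ones(2, 96, dtype=torch.long)
    print(Model()(ids, masks).shape)  # torch.Size([2, 1]): one raw logit per sample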

And this is my training loop:

    criterion = nn.BCELoss()
    model = Model().to('cuda')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(epochs):
        epoch_loss = 0
        val_loss = 0
        model.train()
        for data in tqdm(train_loader):
            ids = data['ids'].cuda()
            masks = data['masks'].cuda()
            labels = data['labels'].cuda()
            optimizer.zero_grad()
            outputs = model(ids, masks)
            outputs = torch.sigmoid(outputs)
            loss = criterion(outputs.double(), labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            outputs = outputs.cpu().detach().numpy()
            labels = labels.cpu().detach().numpy()
            outputs = np.argmax(outputs, axis=1)
        print(f'Train Epoch {epoch} : Loss {epoch_loss/len(train_loader)}')
        print("Train Accuracy : ", accuracy_score(outputs, labels))

        model.eval()
        with torch.no_grad():
            for data in val_loader:
                ids = data['ids'].cuda()
                masks = data['masks'].cuda()
                labels = data['labels'].cuda()
                outputs = model(ids, masks)
                outputs = torch.sigmoid(outputs)
                loss = criterion(outputs.double(), labels)
                val_loss += loss.item()
                outputs = outputs.cpu().numpy()
                labels = labels.cpu().numpy()
                outputs = np.argmax(outputs, axis=1)
        print(f'Val Epoch {epoch} : Loss {val_loss/len(val_loader)}')
        print("Val Accuracy : ", accuracy_score(outputs, labels))

I would recommend scaling down the problem a bit and trying to overfit a small fraction of your dataset (e.g. just 10 samples) while playing around with the hyperparameters.
Once your model can learn this tiny set, you can scale it up again by using more data.
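
A minimal sketch of that sanity check, assuming the `Dataset` and `Model` classes above and a hypothetical `train_df` (the learning rate is just one value to sweep):

    # Try to memorize 10 samples; the loss should drop towards 0 if training works.
    tiny_loader = torch.utils.data.DataLoader(Dataset(train_df.head(10)), batch_size=10)
    model = Model().to('cuda')
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # vary this
    criterion = nn.BCELoss()

    model.train()
    for step in range(100):
        for data in tiny_loader:
            optimizer.zero_grad()
            probs = torch.sigmoid(model(data['ids'].cuda(), data['masks'].cuda())).squeeze(1)
            loss = criterion(probs.double(), data['labels'].cuda())
            loss.backward()
            optimizer.step()
        if step % 10 == 0:
            print(step, loss.item())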

I have used a small dataset of 200 samples and inspected the output of every layer. I found that the outputs of linear layers 1 and 2 keep increasing, up into the 10e8 range. What could cause this problem?

Your targets might be biased towards class 1, so that your model tries to maximize its output.
Could you check the class distribution of your dataset?
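
For a dataframe in the format the `Dataset` above expects (text in column 0, label in column 1), something like this shows the split (`train_df` is again hypothetical):

    # Fraction of samples per class.
    print(train_df.iloc[:, 1].value_counts(normalize=True))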

I am using the SST-2 dataset for sentiment classification, and its positive/negative ratio is almost balanced.

OK, I have found the solution to my problem: it is the optimizer. Since I use a DistilBERT layer at the start of the model, I have to use a very low learning rate, like 3e-5, as recommended in the paper.
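
For reference, that is a one-line change to the training setup above; the parameter-group variant below is my own addition (not from the thread), keeping a larger learning rate for the freshly initialized head:

    # Low fine-tuning lr for the whole model, as in the paper.
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

    # Variant: small lr for the pretrained DistilBERT, larger lr for the new head.
    optimizer = torch.optim.Adam([
        {'params': model.distilBert.parameters(), 'lr': 3e-5},
        {'params': list(model.fc0.parameters()) + list(model.fc1.parameters()), 'lr': 1e-3},
    ])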