LSTM binary text classification model always makes the same prediction

The reason I believe it is not learning is that it always predicts 0.

class LSTM(nn.Module):
    def __init__(self, num_emb, output_size, num_layers=1, hidden_size=128):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Create an embedding for each token
        self.embedding = nn.Embedding(num_emb, 500)
        
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, 
                            num_layers=num_layers, batch_first=True, dropout=0.5)
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq):
        input_embs = self.embedding(input_seq)
        self.memory_a = False
        
        
        if self.memory_a:
            output, (hidden_out, mem_out) = self.lstm(input_embs, (self.hidden, self.memory))
        else:
            hidden = torch.zeros(self.num_layers, 512, self.hidden_size, device=device)
            memory = torch.zeros(self.num_layers, 512, self.hidden_size, device=device)
            output, (hidden_out, mem_out) = self.lstm(input_embs, (hidden, memory))
            self.hidden = hidden_out
            self.memory = mem_out
            self.memory_a = True
                
        return self.fc_out(output)

def objective(trial):
    input_size = 500
    hidden_size = 500
    output_size = 1
    num_layers = 2
    model = LSTM(num_emb=32000, output_size=1, 
                           num_layers=2, hidden_size=input_size).to(device)

    # Hyperparameters to tune
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 1e-4, 1e-3, 1e-2])
    beta_1 = trial.suggest_categorical('beta_1', [0.85, 0.9, 0.95])
    beta_2 = trial.suggest_categorical('beta_2', [0.9, 0.95, 0.995, 0.996, 0.997, 0.998, 0.999])
    epsilon = trial.suggest_categorical('epsilon', [1e-8, 1e-7, 1e-6, 1e-5, 1e-4])
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon)
    loss_fn = nn.BCEWithLogitsLoss()
    
    
    
    train_acc = 0
    test_acc = 0
    
    bs = 512

    
    for epoch in range(10):

        model.train()
        training_loss_logger = []
        training_acc_logger = []
        steps = 0
        
        for batch_idx, batch in tqdm(enumerate(train_loader, 1), desc="Training", total=len(train_loader)):
            text, _, labels = batch
            text, labels = text.to(device), labels.to(device)
            
            pred = model(text)

            loss = loss_fn(pred[:, -1, :], labels.float().unsqueeze(1))

            optimizer.zero_grad()
            
            loss.backward()
            
            optimizer.step()

            training_loss_logger.append(loss.item())

            train_acc += (pred[:, -1, :].argmax(1) == labels).sum()
            steps += bs            
            
        train_acc = (train_acc/steps).item()
        training_acc_logger.append(train_acc)
        print(f"VAL LOSS: {np.mean((training_acc_logger))}")

        model.eval()
        test_loss_logger = []
        test_acc_logger = []
        steps = 0
        with torch.no_grad():
            for batch_idx, batch in tqdm(enumerate(val_loader, 1), desc="Testing", total=len(val_loader)):
                text, _, labels = batch
                text, labels = text.to(device), labels.to(device)
                bs = labels.shape[0]


                pred = model(text)
                

                loss = loss_fn(pred[:, -1, :], labels.float().unsqueeze(1))
                test_loss_logger.append(loss.item())

                test_acc += (pred[:, -1, :].argmax(1) == labels).sum()
                steps += bs

            test_acc = (test_acc/steps).item()
            test_acc_logger.append(test_acc)
            print(f"VAL LOSS: {np.mean((test_loss_logger))}")
            print(f"VAL ACC: {test_acc}")
            print(f"pred: {pred[:, -1, :].argmax(1)}")
            print(f"labels: {labels}")

    trial.set_user_attr("val_loss", np.mean((test_loss_logger)))
    trial.set_user_attr("accuracy", test_acc)
    trial.set_user_attr("model", model)
    trial.set_user_attr("hyperparameters", {'learning_rate': learning_rate, 'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon})
        
    print(f"Used Hyperparameters: 'learning_rate': {learning_rate}, 'beta_1': {beta_1}, 'beta_2': {beta_2}, 'epsilon': {epsilon}")
    
    return np.mean((test_loss_logger))

Are your input samples dependent on each other? Otherwise it is not clear why you re-use the hidden and cell state.

Also, output has 3 dimensions: (batch_size, seq_len, hidden_size). I’m not sure how nn.Linear behaves in this case.
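
A quick standalone shape check you could run (the sizes here are just placeholders) to see what comes out:

import torch
import torch.nn as nn

# Dummy LSTM output with made-up sizes: (batch_size=4, seq_len=10, hidden_size=128)
lstm_out = torch.randn(4, 10, 128)
fc = nn.Linear(128, 1)

# nn.Linear is applied over the last dimension, so the sequence
# dimension is kept: the result has shape (4, 10, 1)
print(fc(lstm_out).shape)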

Maybe you can try the simplified forward() method:

def forward(self, input_seq):
    input_embs = self.embedding(input_seq)
    output, _ = self.lstm(input_embs)
    return self.fc_out(output[:, -1, :])

Of course, then the line

loss = loss_fn(pred[:, -1, :], labels.float().unsqueeze(1))

needs to be changed to

loss = loss_fn(pred, labels.float().unsqueeze(1))

Thanks for your response. I tried the changes you suggested; however, it still makes the same predictions.

pred: tensor([[-0.5098], [-0.5098], [-0.5098], [-0.5098], [-0.5098], [-0.5098],

Also, I checked the weights, and the LSTM weights are 0:

embedding.weight tensor(0., device='cuda:0')
lstm.weight_ih_l0 tensor(0., device='cuda:0')
lstm.weight_hh_l0 tensor(0., device='cuda:0')
lstm.bias_ih_l0 tensor(0., device='cuda:0')
lstm.bias_hh_l0 tensor(0., device='cuda:0')
lstm.weight_ih_l1 tensor(0., device='cuda:0')
lstm.weight_hh_l1 tensor(0., device='cuda:0')
lstm.bias_ih_l1 tensor(0., device='cuda:0')
lstm.bias_hh_l1 tensor(0., device='cuda:0')
fc_out.weight tensor(33.2756, device='cuda:0')
fc_out.bias tensor(0.1410, device='cuda:0')

The model is returning un-normalized scores (logits) rather than probabilities. This can be handled using torch.sigmoid like:

logits = model(text)
preds = torch.round(torch.sigmoid(logits))
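
For the accuracy, you can then compare the rounded probabilities with the labels; a minimal sketch, assuming model returns one raw logit per sample (shape (batch_size, 1)) and labels is a 1-D tensor of 0s and 1s:

logits = model(text)                  # assumed shape: (batch_size, 1)
probs = torch.sigmoid(logits)         # probabilities in (0, 1)
preds = torch.round(probs)            # hard 0./1. predictions
correct = (preds.squeeze(1) == labels.float()).sum().item()
accuracy = correct / labels.shape[0]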

Thanks for your suggestion. I tried it; however, upon further inspection I realized that the loss remains nearly the same across 3 epochs, and it still makes the same prediction every time.

class LSTM(nn.Module):
    def __init__(self, num_emb, output_size, num_layers=1, hidden_size=128):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Create an embedding for each token
        self.embedding = nn.Embedding(num_emb, 500)
        
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, 
                            num_layers=num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, output_size)
        self.act = nn.Sigmoid()

    def forward(self, input_seq):
        input_embs = self.embedding(input_seq)
        output, _ = self.lstm(input_embs)
        return self.act(self.fc_out(output))[:, -1, :]

def objective(trial):
    input_size = 500
    hidden_size = 500
    output_size = 1
    num_layers = 2
    model = LSTM(num_emb=32000, output_size=1, 
                           num_layers=2, hidden_size=input_size).to(device)

    # Hyperparameters to tune
    learning_rate = trial.suggest_categorical('learning_rate', [1e-2, 1e-3, 1e-4, 1e-5])
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    #loss_fn = nn.BCEWithLogitsLoss()
    loss_fn = nn.BCELoss()
    
    
    
    
    train_acc = 0
    test_acc = 0
    
    bs = 512

    
    for epoch in range(10):
        print(f"TRIAL: {trial.number}, EPOCH: {epoch}, LR: {learning_rate}")

        model.train()
        training_loss_logger = []
        training_acc_logger = []
        steps = 0
        
        for batch_idx, batch in tqdm(enumerate(train_loader, 1), desc="Training", total=len(train_loader)):
            text, _, labels = batch
            text, labels = text.to(device), labels.to(device)
            
            
            pred = model(text)

            loss = loss_fn(pred, labels.float().unsqueeze(1))
            
            pred = torch.round(pred)


            optimizer.zero_grad()
            
            loss.backward()
            
            optimizer.step()

            training_loss_logger.append(loss.item())

            train_acc += (pred == labels.float().unsqueeze(1)).sum()
            steps += bs
    
        for name, param in model.named_parameters():
            print(name, param.grad.abs().sum())
        
            
        train_acc = (train_acc/steps).item()
        training_acc_logger.append(train_acc)

        print(f"TRAIN LOSS: {np.mean((training_loss_logger))}")

        model.eval()
        test_loss_logger = []
        test_acc_logger = []
        steps = 0
        with torch.no_grad():
            for batch_idx, batch in tqdm(enumerate(val_loader, 1), desc="Testing", total=len(val_loader)):
                text, _, labels = batch
                text, labels = text.to(device), labels.to(device)
                bs = labels.shape[0]


                pred = model(text)
                

                loss = loss_fn(pred, labels.float().unsqueeze(1))
                test_loss_logger.append(loss.item())
                
                pred = torch.round(pred)

                test_acc += (pred == labels.float().unsqueeze(1)).sum()
                steps += bs

            test_acc = (test_acc/steps).item()
            test_acc_logger.append(test_acc)
            print(f"VAL LOSS: {np.mean((test_loss_logger))}")
            print(f"VAL ACC: {test_acc}")
            print(f"pred: {pred[:5]}")
            print(f"labels: {labels[:5]}")
            

    trial.set_user_attr("val_loss", np.mean((test_loss_logger)))
    trial.set_user_attr("accuracy", test_acc)
    trial.set_user_attr("model", model)
    trial.set_user_attr("hyperparameters", {'learning_rate': learning_rate})
        
    print(f"Used Hyperparameters: 'learning_rate': {learning_rate}")
    
    return np.mean((test_loss_logger))

Output for loss:

TRAIN LOSS 1: 0.6720710087497279
VAL LOSS 1: 0.6692158620236284

TRAIN LOSS 2: 0.6716524698824252
VAL LOSS 2: 0.6692240430136859

TRAIN LOSS 3: 0.6716394789938657
VAL LOSS 3: 0.6691344168226597

One more thing you can try is applying nonlinear activations over the linear layers. Here is a discussion which might be helpful. :)

Link: Loss does not change and weights remain zero - #6 by ptrblck
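
Just a sketch of what that could look like in your classifier head; the intermediate size 64 and the name fc_hidden are arbitrary:

# In __init__:
self.fc_hidden = nn.Linear(hidden_size, 64)
self.fc_out = nn.Linear(64, output_size)

# In forward(), after the LSTM:
x = output[:, -1, :]                  # last time step: (batch_size, hidden_size)
x = torch.relu(self.fc_hidden(x))     # nonlinearity between the linear layers
return self.fc_out(x)                 # raw logits, to be used with BCEWithLogitsLoss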

The posted code uses BCEWithLogitsLoss, which combines the Sigmoid and the BCELoss. So you should not apply torch.sigmoid as well!
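
The two formulations are equivalent, which a quick standalone check with random data illustrates:

import torch
import torch.nn as nn

logits = torch.randn(8, 1)                      # random "model outputs"
targets = torch.randint(0, 2, (8, 1)).float()   # random 0/1 labels

loss_a = nn.BCEWithLogitsLoss()(logits, targets)
loss_b = nn.BCELoss()(torch.sigmoid(logits), targets)

# Both print (almost) the same value; BCEWithLogitsLoss is the numerically
# more stable variant, so it should be fed the raw logits directly.
print(loss_a.item(), loss_b.item())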


Wait a minute: what exactly is your task, and what do the labels look like?

Your model indicates a binary classification task, so labels should be a tensor of shape (batch_size) containing 0s and 1s. I’m a bit confused by the call to .float().

I am using sigmoid with BCELoss right now. The labels consist of 0s and 1s. It gives an error when I do not use .float(). Also, I figured out that the predictions become identical after the LSTM layer: the embedding returns unique values, but the LSTM returns the same value repeated.

How do you get those values? For example, fc_out.weight shouldn’t be a single value but a 2D tensor of shape (output_size, hidden_size).
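
For example, a check along these lines prints the parameter shapes alongside the summed gradient magnitudes, which makes it clearer what those single numbers are (a sketch, run after loss.backward()):

for name, param in model.named_parameters():
    grad_sum = param.grad.abs().sum().item() if param.grad is not None else None
    print(name, tuple(param.shape), grad_sum)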

The parameters are non-zero and look alright now. The problem seems to be with the last layer.

class LSTM(nn.Module):
    def __init__(self, num_emb, output_size, num_layers=1, hidden_size=128):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
     
        # Create an embedding for each token
        self.embedding = nn.Embedding(num_emb, 500)
        
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, 
                            num_layers=num_layers, batch_first=True, dropout=0.5)
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq):
        input_embs = self.embedding(input_seq)
        
        output, _ = self.lstm(input_embs)
        return nn.Sigmoid(self.fc_out(output[:, -1, :]))

I’m actually not sure if you can/should use nn.Sigmoid here. I know that the sigmoid has no trainable parameters, but it “feels” odd :). Could you try:

F.sigmoid(self.fc_out(output[:, -1, :]))

You might need to add import torch.nn.functional as F.

EDIT: I’ve just had a look at the source code. It probably doesn’t make a difference.
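
In any case, the module form has to be instantiated before it is called; a small sketch of the equivalent variants:

import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.randn(4, 1)       # e.g. the output of self.fc_out(output[:, -1, :])

a = torch.sigmoid(x)        # functional form
b = F.sigmoid(x)            # functional form via torch.nn.functional
c = nn.Sigmoid()(x)         # module form: instantiate first, then call it
# Note: nn.Sigmoid(x) (without the extra parentheses) would try to construct
# the module with a tensor argument instead of applying the activation.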

Thanks a lot :), I will try this as soon as possible and return with the results. However, will this also fix the problem with the LSTM layer? I doubt the problem is due to the sigmoid, since training with BCEWithLogitsLoss and no sigmoid resulted in the same behavior.

So right now the predictions are still not great, but there is at least a bit of difference between them.

Model:

class LSTM(nn.Module):
    def __init__(self, input_size, num_emb, output_size, num_layers=2, hidden_size=128):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Create an embedding for each token
        self.embedding = nn.Embedding(num_emb, 30)
                
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, 
                            num_layers=num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, output_size)
        self.act = nn.Sigmoid()

    def forward(self, input_seq, logP = False):
        input_embs = self.embedding(input_seq)
        
        output, _ = self.lstm(input_embs[:, -1, :])
        if logP:
            print(f"input_embs: {input_embs[:5]}")
            print(f"outputLn: {outputLn[:5]}")
            print(f"output lstm: {output[:5]}")
        output = self.fc_out(output)

        if logP:
            print(f"output Linear: {output[:5]}")
        
        output = self.act(output)
        if logP:
            print(f"output sigmoid: {output[:5]}")
            
        return output

Output:

pred: tensor([[0.4511],
        [0.4147],
        [0.3994],
        [0.3928],
        [0.3898]], device='cuda:0')
labels: tensor([1, 0, 0, 0, 0], device='cuda:0')

Are those the predictions after training (I assume so)?

During training, does the loss go down? Did you try different learning rates? Can you overfit on a very small training dataset, i.e., does the training loss go basically down to 0 if you use only a few training samples?
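
A possible way to set up that check, reusing the objects from your objective() (a sketch; the subset size, batch size, and epoch count are arbitrary):

from torch.utils.data import DataLoader, Subset

# Sanity check: overfit a handful of samples. If the model and the
# training loop are correct, the training loss should go down to ~0.
tiny_ds = Subset(train_loader.dataset, list(range(32)))
tiny_loader = DataLoader(tiny_ds, batch_size=8, shuffle=True)

for epoch in range(200):
    for text, _, labels in tiny_loader:
        text, labels = text.to(device), labels.to(device)
        pred = model(text)
        loss = loss_fn(pred, labels.float().unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(epoch, loss.item())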

Those are the predictions. I tried multiple learning rates, which did not help. I switched to BCEWithLogitsLoss, and the training loss went to 0 when I used a really small dataset; however, the accuracy is still not improving.

If the training loss goes to 0, then at least train_acc should go towards 1. Are you sure those calculations are all done correctly?
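
For reference, with BCEWithLogitsLoss and a single output logit the per-epoch accuracy could be accumulated like this (a sketch, assuming pred has shape (batch_size, 1), labels is a 1-D tensor of 0s and 1s, and correct and steps start at 0):

# pred: raw logits of shape (batch_size, 1); labels: 0s/1s of shape (batch_size)
pred_classes = (pred >= 0).float().squeeze(1)   # logit >= 0  <=>  probability >= 0.5
correct += (pred_classes == labels.float()).sum().item()
steps += labels.shape[0]
# ... and at the end of the epoch:
accuracy = correct / steps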

The loss seems to be decreasing, and it goes negative at some point. The predictions change over time, which is good. However, the accuracy remains at approximately 0.4.

class LSTM(nn.Module):
    def __init__(self, input_size, num_emb, output_size, num_layers=2, hidden_size=128):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Create an embedding for each token
        self.embedding = nn.Embedding(num_emb, 30)
                
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, 
                            num_layers=num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, logP = False):
        input_embs = self.embedding(input_seq)
        
        output, _ = self.lstm(input_embs[:, -1, :])
        if logP:
            print(f"input_embs: {input_embs[:5]}")
            print(f"outputLn: {outputLn[:5]}")
            print(f"output lstm: {output[:5]}")
        output = self.fc_out(output)

        if logP:
            print(f"output Linear: {output[:5]}")
        
        if logP:
            print(f"output sigmoid: {output[:5]}")
            
        return output


def objective(trial):
    input_size = 30
    hidden_size = 512
    output_size = 1
    num_layers = 7
    model = LSTM(input_size, num_emb=32000, output_size=1, 
                           num_layers=num_layers, hidden_size=input_size).to(device)

    # Hyperparameters to tune
    #{'learning_rate': 0.001, 'beta_1': 0.8, 'beta_2': 0.997, 'epsilon': 1e-07}. Best is trial 0 with value: 0.6835878929521284.
    """
    learning_rate = trial.suggest_categorical('learning_rate', [1e-2, 1e-3, 1e-4, 1e-5])
    beta_1 = trial.suggest_categorical('beta_1', [0.8, 0.85, 0.9, 0.95])
    beta_2 = trial.suggest_categorical('beta_2', [0.995, 0.996, 0.997, 0.998, 0.999])
    epsilon = trial.suggest_categorical('epsilon', [1e-8, 1e-7, 1e-6])
    """
    learning_rate = 0.001
    beta_1 = 0.8
    beta_2 = 0.997
    epsilon = 1e-07
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon)
    loss_fn = nn.BCEWithLogitsLoss()
    
    
    
    
    train_acc = 0
    test_acc = 0
    
    epochs = 30
    
    for epoch in range(epochs):
        print(f"TRIAL: {trial.number}, EPOCH: {epoch}, LR: {learning_rate}")

        model.train()
        training_loss_logger = []
        training_acc_logger = []
        steps = 0
        
        for batch_idx, batch in tqdm(enumerate(train_loader, 1), desc="Training", total=len(train_loader)):
            text, _, labels = batch
            labels -=1
            text, labels = text.to(device), labels.to(device)
            
            bs = len(labels)
            
            pred = model(text)

            loss = loss_fn(pred, labels.float().unsqueeze(1))
            
            pred = (pred >= 0).float()

            optimizer.zero_grad()
            
            loss.backward()
            
            optimizer.step()

            training_loss_logger.append(loss.item())
            
            train_acc += sum((sum(pred.float() == labels.float()))/ len(labels))
            steps += bs
            
            
            
        train_acc = (train_acc/steps).item()
        training_acc_logger.append(train_acc)

        print(f"TRAIN LOSS: {np.mean((training_loss_logger))}")
        print(f"TRAIN ACC: {train_acc}")

        model.eval()
        test_loss_logger = []
        test_acc_logger = []
        steps = 0
        with torch.no_grad():
            for batch_idx, batch in tqdm(enumerate(val_loader, 1), desc="Testing", total=len(val_loader)):
                text, _, labels = batch
                text, labels = text.to(device), labels.to(device)
                bs = labels.shape[0]

                pred = model(text)

                loss = loss_fn(pred, labels.float().unsqueeze(1))
                test_loss_logger.append(loss.item())
                pred = (pred >= 0).float()

                test_acc += sum(sum(pred.float() == labels.float())/ len(labels))
                steps += bs

            test_acc = (test_acc/steps).item()
            test_acc_logger.append(test_acc)
            print(f"VAL LOSS: {np.mean((test_loss_logger))}")
            print(f"VAL ACC: {test_acc}")
            print(f"pred: {pred[:5]}")
            print(f"labels: {labels[:5]}")
            #print(f"texts: {text[:5]}")
            

    trial.set_user_attr("val_loss", np.mean((test_loss_logger)))
    trial.set_user_attr("accuracy", test_acc)
    trial.set_user_attr("model", model)
    trial.set_user_attr("hyperparameters", {'learning_rate': learning_rate})
        
    print(f"Used Hyperparameters: 'learning_rate': {learning_rate}")
    
    return np.mean((test_loss_logger))