Optimizer.step() doesn't work

I tried to train model.
But, output did not change when validation data was entered into the model after training.
Then, I checked parameters of model using model.parameters(), But weights were same.
What is the problem?

I tried to check whether grad is 0 or not using this code.

loss.backward()

for param in net.parameters():
    print(param.grad.data.sum())

# start debugger
import pdb; pdb.set_trace()

optimizer.step()

But this error is happened.
AttributeError: ‘NoneType’ object has no attribute ‘data’

Also, I changed learning_rate 0.001 to 0.01, But nothing changed.

Batch sizes, etc. are set to small values for testing purposes.

How can I fix it ?

# from positional_encoding import positional_encoding
import torch
import torch.nn as nn
import time
import numpy as np
import random
import copy
import math
# from timm.scheduler import CosineLRScheduler
from transformers import get_cosine_schedule_with_warmup
from encoder_MRL import TransformerEncoder
from loss_MRL import MarginRankingLoss
from make_data import positional_encoding
from make_input import make_input_test

def train(model): 
    model.train()
    batch_size = 10
    train_loss_list = []
    #乱数
    l = random.sample(range(0,1000), k=1000)
    # l = random.sample(range(0,287113), k=287113)
    # batch_loss = []
    count = 0
    #ミニバッチ学習 (minibatch training)
    iterate = math.ceil(1000 / batch_size) #小数点以下繰り上げ
    for batch in range(iterate):
        if batch == iterate: #最後のバッチ (last batch)
            batch_size = 287113 % batch_size
        train_data_, train_answer_, SBse_pos_, ExSum_id_ = [], [], [], []
        # for i in l[count:count+batch_size]:
        for i in range(1):
            train_data = np.load(f'/home1/user/sata/train_data_MRL/{i}.npy')
            train_answer = np.load(f'./cnn_dm/train_answer_MRL/{i}.npy')
            SBse_pos = np.load(f'./cnn_dm/train_SBse_pos/{i}.npy')
            ExSum_id = np.load(f'./cnn_dm/train_ExSum_id_MRL/{i}.npy')
            
        # for i in range(10):
            # train_data_.append(train_data)
            # train_answer_.append(train_answer)
            # SBse_pos_.append(SBse_pos)
            # ExSum_id_.append(ExSum_id)

            train_answer = torch.tensor(train_answer, dtype=torch.float32)  
            # train_data_ = np.array(train_data_)
            # print(train_data_.shape)

            input = torch.tensor(train_data, dtype=torch.float32).to(device)
            cos_sim, output = model(input, SBse_pos) #output: encoderの出力
            output = output.to('cpu')
            # cos_sim = np.array(cos_sim)
            # print(cos_sim.shape, flush=True)
            # output.detach().numpy()
            # print(output.shape, flush=True)

        #cos類似度(cos_sim)を並び替え (sort cos_sim)
        cos_sim_sorted = []
        for j in ExSum_id:
            cos_sim_sorted.append(cos_sim[j])
        cos_sim_sorted = torch.tensor(cos_sim)

        #outputのソート, 抽出 (sort output)
        output_sorted = []
        for i in ExSum_id[:3]:
            output_sorted.append(output[i].tolist())
        output_sorted = torch.tensor(output_sorted)

        criterion = MarginRankingLoss(0.001) #引数はmarginの値
        loss = criterion.get_loss(cos_sim_sorted, train_answer, output_sorted)
        # loss.detach().numpy()/
        # print(loss.shape, flush=True)
        # batch_loss.append(loss) #temp
    
        # loss = sum(batch_loss) / len(batch_loss) #temp
        # loss.clone().detach().requires_grad_(True)
        # loss = torch.tensor(loss, requires_grad=True) ##temp
        loss.requires_grad_(True)

        #重みの更新 (update weights)
        optimizer.zero_grad()
        loss.backward()
        for n, param in model.named_parameters():
            if param.grad is not None:
                print(n, param.grad.data.sum())
        optimizer.step()

        train_loss_list.append(loss)
        batch_loss = []
        
        if (count) % 1000 == 0:
            print(f'Epoch {epoch+1} Deta {count} Loss {loss}', flush=True)

        count += batch_size

    train_loss_list = torch.tensor(train_loss_list).to('cpu').detach().numpy().copy()
    train_loss = np.mean(train_loss_list)

    return train_loss

def evaluate(model):
    model.eval()
    with torch.no_grad():
        valid_loss_list = []
        for i in range(100):
        # for i in range(13368):
            #データの読み込み (load data)
            valid_data = np.load(f'/home1/user/sata/validation_data_MRL/{i}.npy')
            valid_answer = np.load(f'./cnn_dm/validation_answer_MRL/{i}.npy')
            SBse_pos = np.load(f'./cnn_dm/validation_SBse_pos/{i}.npy')
            ExSum_id = np.load(f'./cnn_dm/validation_ExSum_id_MRL/{i}.npy')
            
            #torch.tensorに変換 np.array to torch.tensor
            # valid_data = torch.tensor(valid_data, dtype=torch.float32)
            valid_answer = torch.tensor(valid_answer, dtype=torch.float32)

            input = torch.tensor(valid_data, dtype=torch.float32).to(device)
            cos_sim, output = model(input, SBse_pos)
            output = output.to('cpu')
            #cos類似度(cos_sim)を並び替え (sort cos_sim)
            cos_sim_sorted = []
            for j in ExSum_id:
                cos_sim_sorted.append(cos_sim[j])
            cos_sim_sorted = torch.tensor(cos_sim)
        
            #outputのソート, 抽出 (sort output)
            output_sorted = []
            for i in ExSum_id[:3]:
                output_sorted.append(output[i].tolist())
            output_sorted = torch.tensor(output_sorted)

            criterion = MarginRankingLoss(0.001) #引数はmarginの値 (0.001 is margin)
            loss = criterion.get_loss(cos_sim_sorted, valid_answer, output_sorted)

            valid_loss_list.append(loss)
            if (i+1) % 100 == 0:
            # if (i+1) % 1000 == 0:
                print(f'Deta {i+1} Loss {loss}', flush=True)
        valid_loss_list = torch.tensor(valid_loss_list).to('cpu').detach().numpy().copy()
        valid_loss = np.mean(valid_loss_list)
    return valid_loss

if __name__ == '__main__':
    device = torch.device('cuda:0')

    num_layers = 6
    d_model = 768
    num_heads = 8
    dff = 3072
    # input_limit = 100
    dropout_rate = 0.1
    # batch_size = 10
    batch_size = 100
    lr = 0.001
    # lr = 0.01
    EPOCHS = 30

    print('loading models...', flush=True)
    model = TransformerEncoder(d_model, num_heads, dff, dropout_rate, num_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=EPOCHS)
    # scheduler = CosineLRScheduler(optimizer, t_initial=100, lr_min=1e-6, warmup_t=3, warmup_lr_init=1e-6, warmup_prefix=True)
    print('models are ready !\n', flush=True)

    checkpoint_path = './checkpoints/model.pt'

    for epoch in range(EPOCHS):
        start = time.time()

        print('training now...', flush=True)
        train_loss = train(model)
        print('evaluating now...', flush=True)
        valid_loss = evaluate(model)
        print('processing scheduler.step() ...', flush=True)
        scheduler.step()
        # scheduler.step(epoch)
        
        print(f'Epoch {epoch+1} train_Loss {train_loss}', flush=True)
        print(f'Epoch {epoch+1} valid_Loss {valid_loss}', flush=True)
        print(f'Time taken for 1 epoch: {time.time() - start} secs\n', flush=True)
        
        #チェックポイントの保存 (save checkpoints)
        if (epoch + 1) % 5 == 0:
            print('saving checkpoint...\n', flush=True)
            torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            # 'epoch': epoch,
            # 'loss': loss,
            # }, checkpoint_path)
            }, f'./checkpoints/model-{epoch+1}_MRL.pt')

    print('training is done', flush=True)

You are detaching the outputs of your model by rewrapping them into tensors. The backward() call will then raise a proper error which you masked by calling .requires_grad_(True) on the loss. Use torch.stack instead of recreating a new tensor and it should work.

1 Like

I understand.
Is there a problem with calculating the loss after sorting the output of the model?
Thank you for your help.

Edit
Or should they be sorted within the model?

Sorting is not the problem, detaching the computation graph by creating new tensors is.

1 Like

Thank you.
I modified the code like this.

"""encoder_MRL.py"""
import torch
import torch.nn as nn
import numpy as np

class TransformerEncoder(nn.Module):
    def __init__(self, d_model, num_heads, dff, dropout_rate, num_layers):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model, num_heads, dff, dropout_rate)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

    def forward(self, input, SBse_pos, ExSum_id): 
        output = self.transformer_encoder(input) #(len(article),100,768)

        indices = torch.arange(output.size(0))
        MTse = output[indices, indices]
 
        cos_sim = torch.cosine_similarity(MTse, SBse_pos, dim=1)
        cos_sim_sorted = torch.gather(cos_sim, 0, ExSum_id)

        return cos_sim_sorted, output
"""train.py"""
cos_sim, output = model(input, SBse_pos, ExSum_id)
criterion = MarginRankingLoss(0.001) 
loss = criterion.get_loss(cos_sim, train_answer, output)
loss.requires_grad_(True)
optimizer.zero_grad()
loss.backward()
optimizer.step()

But validation loss didn’t cahange. When w/o loss.requires_grad_(True), This error happened.

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

I think It seem to detaching the computation graph. Which code make this problem?
I don’t think I rewrapped them into tensors.
If you don’t mind, could you give me the code to fix it?

editted:
I was using “TotalLoss = torch.tensor(0, dtype=torch.float32).to(device)” to initialize TotalLoss to 0.
After changing code initializing TotalLoss, RuntimeError no longer occurs w/o loss.requires_grad_(True). But valid_loss doesn’t change.

"""loss_MRL.py"""
import torch 
import torch.nn as nn
device = torch.device('cuda:0')

class MarginRankingLoss():
    def __init__(self, margin):
        super(MarginRankingLoss, self).__init__()
        # self._init_paramm_map
        self.margin = margin
        self.loss_func = nn.MarginRankingLoss(margin)

    def get_loss(self, cos_sim, real, pred):
        # TotalLoss = torch.tensor(0, dtype=torch.float32).to(device)
        # initializing TotalLoss to 0
        ones = torch.ones(cos_sim.size()).to(device)
        loss_func = torch.nn.MarginRankingLoss(0.0)
        TotalLoss = loss_func(cos_sim, cos_sim, ones)

        #MRL
        n = cos_sim.size(0)
        for i in range(1,n):
            pos_score = torch.tensor(cos_sim[:-i], dtype=torch.float32).to(device)
            neg_score = torch.tensor(cos_sim[i:], dtype=torch.float32).to(device)
            ones = torch.ones(pos_score.size()).to(device)
            loss_func = torch.nn.MarginRankingLoss(self.margin * i)
            TotalLoss += loss_func(pos_score, neg_score, ones)
        
        return TotalLoss

I fixed it modifying code like this. valid_loss now changes as training progresses.

"""loss_MRL.py"""
pos_score = cos_sim[:-i]
neg_score = cos_sim[i:]