I tried to train model.
But, output did not change when validation data was entered into the model after training.
Then, I checked parameters of model using model.parameters(), But weights were same.
What is the problem?
I tried to check whether grad is 0 or not using this code.
loss.backward()
for param in net.parameters():
print(param.grad.data.sum())
# start debugger
import pdb; pdb.set_trace()
optimizer.step()
But this error is happened.
AttributeError: ‘NoneType’ object has no attribute ‘data’
Also, I changed learning_rate 0.001 to 0.01, But nothing changed.
Batch sizes, etc. are set to small values for testing purposes.
How can I fix it ?
# from positional_encoding import positional_encoding
import torch
import torch.nn as nn
import time
import numpy as np
import random
import copy
import math
# from timm.scheduler import CosineLRScheduler
from transformers import get_cosine_schedule_with_warmup
from encoder_MRL import TransformerEncoder
from loss_MRL import MarginRankingLoss
from make_data import positional_encoding
from make_input import make_input_test
def train(model):
model.train()
batch_size = 10
train_loss_list = []
#乱数
l = random.sample(range(0,1000), k=1000)
# l = random.sample(range(0,287113), k=287113)
# batch_loss = []
count = 0
#ミニバッチ学習 (minibatch training)
iterate = math.ceil(1000 / batch_size) #小数点以下繰り上げ
for batch in range(iterate):
if batch == iterate: #最後のバッチ (last batch)
batch_size = 287113 % batch_size
train_data_, train_answer_, SBse_pos_, ExSum_id_ = [], [], [], []
# for i in l[count:count+batch_size]:
for i in range(1):
train_data = np.load(f'/home1/user/sata/train_data_MRL/{i}.npy')
train_answer = np.load(f'./cnn_dm/train_answer_MRL/{i}.npy')
SBse_pos = np.load(f'./cnn_dm/train_SBse_pos/{i}.npy')
ExSum_id = np.load(f'./cnn_dm/train_ExSum_id_MRL/{i}.npy')
# for i in range(10):
# train_data_.append(train_data)
# train_answer_.append(train_answer)
# SBse_pos_.append(SBse_pos)
# ExSum_id_.append(ExSum_id)
train_answer = torch.tensor(train_answer, dtype=torch.float32)
# train_data_ = np.array(train_data_)
# print(train_data_.shape)
input = torch.tensor(train_data, dtype=torch.float32).to(device)
cos_sim, output = model(input, SBse_pos) #output: encoderの出力
output = output.to('cpu')
# cos_sim = np.array(cos_sim)
# print(cos_sim.shape, flush=True)
# output.detach().numpy()
# print(output.shape, flush=True)
#cos類似度(cos_sim)を並び替え (sort cos_sim)
cos_sim_sorted = []
for j in ExSum_id:
cos_sim_sorted.append(cos_sim[j])
cos_sim_sorted = torch.tensor(cos_sim)
#outputのソート, 抽出 (sort output)
output_sorted = []
for i in ExSum_id[:3]:
output_sorted.append(output[i].tolist())
output_sorted = torch.tensor(output_sorted)
criterion = MarginRankingLoss(0.001) #引数はmarginの値
loss = criterion.get_loss(cos_sim_sorted, train_answer, output_sorted)
# loss.detach().numpy()/
# print(loss.shape, flush=True)
# batch_loss.append(loss) #temp
# loss = sum(batch_loss) / len(batch_loss) #temp
# loss.clone().detach().requires_grad_(True)
# loss = torch.tensor(loss, requires_grad=True) ##temp
loss.requires_grad_(True)
#重みの更新 (update weights)
optimizer.zero_grad()
loss.backward()
for n, param in model.named_parameters():
if param.grad is not None:
print(n, param.grad.data.sum())
optimizer.step()
train_loss_list.append(loss)
batch_loss = []
if (count) % 1000 == 0:
print(f'Epoch {epoch+1} Deta {count} Loss {loss}', flush=True)
count += batch_size
train_loss_list = torch.tensor(train_loss_list).to('cpu').detach().numpy().copy()
train_loss = np.mean(train_loss_list)
return train_loss
def evaluate(model):
model.eval()
with torch.no_grad():
valid_loss_list = []
for i in range(100):
# for i in range(13368):
#データの読み込み (load data)
valid_data = np.load(f'/home1/user/sata/validation_data_MRL/{i}.npy')
valid_answer = np.load(f'./cnn_dm/validation_answer_MRL/{i}.npy')
SBse_pos = np.load(f'./cnn_dm/validation_SBse_pos/{i}.npy')
ExSum_id = np.load(f'./cnn_dm/validation_ExSum_id_MRL/{i}.npy')
#torch.tensorに変換 np.array to torch.tensor
# valid_data = torch.tensor(valid_data, dtype=torch.float32)
valid_answer = torch.tensor(valid_answer, dtype=torch.float32)
input = torch.tensor(valid_data, dtype=torch.float32).to(device)
cos_sim, output = model(input, SBse_pos)
output = output.to('cpu')
#cos類似度(cos_sim)を並び替え (sort cos_sim)
cos_sim_sorted = []
for j in ExSum_id:
cos_sim_sorted.append(cos_sim[j])
cos_sim_sorted = torch.tensor(cos_sim)
#outputのソート, 抽出 (sort output)
output_sorted = []
for i in ExSum_id[:3]:
output_sorted.append(output[i].tolist())
output_sorted = torch.tensor(output_sorted)
criterion = MarginRankingLoss(0.001) #引数はmarginの値 (0.001 is margin)
loss = criterion.get_loss(cos_sim_sorted, valid_answer, output_sorted)
valid_loss_list.append(loss)
if (i+1) % 100 == 0:
# if (i+1) % 1000 == 0:
print(f'Deta {i+1} Loss {loss}', flush=True)
valid_loss_list = torch.tensor(valid_loss_list).to('cpu').detach().numpy().copy()
valid_loss = np.mean(valid_loss_list)
return valid_loss
if __name__ == '__main__':
device = torch.device('cuda:0')
num_layers = 6
d_model = 768
num_heads = 8
dff = 3072
# input_limit = 100
dropout_rate = 0.1
# batch_size = 10
batch_size = 100
lr = 0.001
# lr = 0.01
EPOCHS = 30
print('loading models...', flush=True)
model = TransformerEncoder(d_model, num_heads, dff, dropout_rate, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=EPOCHS)
# scheduler = CosineLRScheduler(optimizer, t_initial=100, lr_min=1e-6, warmup_t=3, warmup_lr_init=1e-6, warmup_prefix=True)
print('models are ready !\n', flush=True)
checkpoint_path = './checkpoints/model.pt'
for epoch in range(EPOCHS):
start = time.time()
print('training now...', flush=True)
train_loss = train(model)
print('evaluating now...', flush=True)
valid_loss = evaluate(model)
print('processing scheduler.step() ...', flush=True)
scheduler.step()
# scheduler.step(epoch)
print(f'Epoch {epoch+1} train_Loss {train_loss}', flush=True)
print(f'Epoch {epoch+1} valid_Loss {valid_loss}', flush=True)
print(f'Time taken for 1 epoch: {time.time() - start} secs\n', flush=True)
#チェックポイントの保存 (save checkpoints)
if (epoch + 1) % 5 == 0:
print('saving checkpoint...\n', flush=True)
torch.save({
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
"scheduler": scheduler.state_dict(),
# 'epoch': epoch,
# 'loss': loss,
# }, checkpoint_path)
}, f'./checkpoints/model-{epoch+1}_MRL.pt')
print('training is done', flush=True)