I’m building a classifier for a QA bot and have a dataset of about 8k questions mapped to 149 different answers (classes).
I’ve run into a problem while training my model: the loss won’t go down the way I expected, so I’m hoping someone can help me.
Here is my method:
I use word2vec to get each word’s vector, then feed the sequence of word vectors through a GRU to get a sentence vector.
The word2vec model was trained on Wikipedia data and works well in another NLP project of mine.
The GRU code comes from a senior colleague, and I believe it works correctly as well.
# Part of the code for getting sentence vectors
import torch
import torch.nn as nn

input_size = 400
hidden_dim = 400
num_layers = 1
gru = nn.GRU(input_size, hidden_dim, num_layers, batch_first=True)
# Initial hidden state: (num_layers, batch, hidden_dim); 7187 is my training-set size
h0 = torch.rand(num_layers, 7187, hidden_dim)
# Input shape: [dataset_len, max_sentence_len, input_feature]
inputSet = torch.tensor(x_train, dtype=torch.float)
sentenceVecs, hidden = gru(inputSet, h0)
# Use the output at the last time step as the sentence vector
sentenceVecs = sentenceVecs[:, -1, :]
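For context, x_train is built by looking up each word’s vector and zero-padding every question to the same length. Roughly like this (a simplified sketch, not my exact code; it assumes gensim, and w2v, tokenized_questions, and max_sentence_len are placeholder names):

# Sketch of building the input tensor from word2vec (assumes gensim;
# w2v, tokenized_questions, max_sentence_len are placeholders)
import numpy as np
from gensim.models import KeyedVectors

w2v = KeyedVectors.load('wiki_w2v.kv')  # hypothetical path; 400-dim vectors

def sentence_to_matrix(tokens, max_len, dim=400):
    # One row per word; rows stay zero as padding past the sentence's end
    mat = np.zeros((max_len, dim), dtype=np.float32)
    for i, tok in enumerate(tokens[:max_len]):
        if tok in w2v:
            mat[i] = w2v[tok]
    return mat

x_train = np.stack([sentence_to_matrix(q, max_sentence_len)
                    for q in tokenized_questions])
# -> shape [dataset_len, max_sentence_len, 400], matching the GRU input above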
And here is my classifier model:
from argparse import Namespace

args = Namespace(
    dataset_file='dataset/waimai_10k_tw.pkl',
    model_save_path='torchmodel/pytorch_bce.model',
    # Training hyperparameters
    batch_size=100,
    learning_rate=0.002,
    min_learning_rate=0.002,
    num_epochs=200,
)
import torch.nn.functional as F

class JWP(nn.Module):
    def __init__(self, n_feature, n_hidden, n_hidden2, n_hidden3, n_output):
        super(JWP, self).__init__()
        self.hidden = nn.Linear(n_feature, n_hidden)
        self.hidden2 = nn.Linear(n_hidden, n_hidden2)
        self.hidden3 = nn.Linear(n_hidden2, n_hidden3)
        self.out = nn.Linear(n_hidden3, n_output)

    def forward(self, x, apply_softmax=False):
        x = F.relu(self.hidden(x).squeeze())
        x = F.relu(self.hidden2(x).squeeze())
        x = F.relu(self.hidden3(x).squeeze())
        if apply_softmax:
            # softmax needs an explicit dim; normalize over the class dimension
            x = torch.softmax(self.out(x), dim=-1)
        else:
            x = self.out(x)
        return x
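A quick standalone shape check with dummy data (not my real features) confirms the output sizes:

# Shape check with dummy data
net = JWP(400, 325, 275, 225, 149)
dummy = torch.rand(100, 400)          # a batch of 100 sentence vectors
logits = net(dummy)
print(logits.shape)                   # torch.Size([100, 149])
probs = net(dummy, apply_softmax=True)
print(probs.sum(dim=-1)[:3])          # each row sums to ~1.0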
And the training code:
lr = args.learning_rate
min_lr = args.min_learning_rate

def adjust_learning_rate(optimizer, epoch):
    # Decay the LR by 0.65x every 10 epochs, floored at min_lr
    global lr
    if epoch % 10 == 0 and epoch != 0:
        lr = lr * 0.65
        if lr < min_lr:
            lr = min_lr
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

if __name__ == "__main__":
    EPOCH = args.num_epochs
    net = JWP(400, 325, 275, 225, 149)
    # net = JWP(400, 250, 149)
    # net = JWP(400, 149)
    print(net)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss_func = torch.nn.CrossEntropyLoss()
    for t in range(EPOCH):
        adjust_learning_rate(optimizer, t)

        # Train phase
        net.train()
        TrainLoss = 0.0
        for step, (batchData, batchTarget) in enumerate(trainDataLoader):
            optimizer.zero_grad()
            out = net(batchData)
            loss = loss_func(out, batchTarget)
            # Accumulate the scalar only, so each batch's graph can be freed
            TrainLoss = TrainLoss + loss.item()
            loss.backward()
            optimizer.step()
        TrainLoss = TrainLoss / (step + 1)  # mean loss over the epoch

        # Result
        print(
            "epoch:", t + 1,
            "train_loss:", round(TrainLoss, 3),
            "LR:", lr,
        )
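One sanity check I still plan to run is overfitting a single batch: if the loss can’t fall well below the plateau even there, the problem is in the model or data rather than the amount of training (sketch below, reusing the same loader, model, and loss as above):

# Single-batch overfitting test
batchData, batchTarget = next(iter(trainDataLoader))
net = JWP(400, 325, 275, 225, 149)
optimizer = torch.optim.SGD(net.parameters(), lr=args.learning_rate)
loss_func = torch.nn.CrossEntropyLoss()
for i in range(500):
    optimizer.zero_grad()
    loss = loss_func(net(batchData), batchTarget)
    loss.backward()
    optimizer.step()
    if i % 100 == 0:
        print(i, loss.item())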
Is my model too simple, or am I using the wrong method?
The loss always gets stuck at around 4.6 and won’t go any lower:
epoch: 2898 train_loss: 4.643 LR: 0.002
epoch: 2899 train_loss: 4.643 LR: 0.002
epoch: 2900 train_loss: 4.643 LR: 0.002
epoch: 2901 train_loss: 4.643 LR: 0.002
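For reference, a model that outputs a uniform distribution over all 149 answers would score a cross-entropy of ln(149) ≈ 5.0, so a plateau at ~4.6 means the network is barely doing better than chance:

import math
print(math.log(149))  # ≈ 5.004: chance-level cross-entropy for 149 classes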