[Solved] Why are the results after training much worse than inside the training loop?

I am new to PyTorch and deep learning; this is my second demo.
I am trying to use an LSTM for variable-length sequence classification.
But the test results are bad. I have investigated for a long time and have not found why.

And I found a strange thing: after the epoch loop, I run on the training data again to see the scores after training, like the PyTorch tutorial does. But the result is very different from what I get inside the epoch loop:

[Epoch: 49/ 50] Training Loss: 0.027, Training Acc: 0.939
Training Acc: 0.196, Training correct num: 1955.000, Training total num: 10000.000

We can see that the training accuracy in the last epoch is 0.939, but running on the same training data right after the loop only gives 0.196.

This is model.py:

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import codecs
from torch.autograd import Variable
import time
import data

start_time = time.time()

torch.manual_seed(1)


EMBEDDING_DIM =128
HIDDEN_DIM = 64
BATCH_SIZE = 10
EPOCH_NUM = 50

class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tag_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, tag_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, BATCH_SIZE, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, BATCH_SIZE, self.hidden_dim)))

    def forward(self, sentence, lengths):
        embeds = self.word_embeddings(sentence)

        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), BATCH_SIZE, -1), self.hidden)

        tag_space = self.hidden2tag(lstm_out[-1])
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


train_file = "./data/train.txt"
test_file = "./data/dev.txt"
training_data = data.load_seqs_data(train_file)
test_data = data.load_seqs_data(test_file)
vocab_dic = data.load_vocab_data()
word_to_ix, tag_to_ix, ix_to_tag = data.load_ix_dics()

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)


training_dataloader = data.create_dataset(training_data, word_to_ix, tag_to_ix, BATCH_SIZE)
test_dataloader = data.create_dataset(test_data, word_to_ix, tag_to_ix, BATCH_SIZE)
train_loss_ = []
train_acc_ = []
for epoch in range(EPOCH_NUM):

    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    for inputs, labels, lens, raw_datas in training_dataloader:
        inputs, labels = Variable(inputs), Variable(labels)

        model.zero_grad()

        model.hidden = model.init_hidden()

        tag_score = model(inputs.t(), lens.numpy())

        loss = loss_function(tag_score, labels)

        loss.backward()
        optimizer.step()

        _, predicted = torch.max(tag_score.data, 1)
        total_acc += (predicted.numpy() == labels.data.numpy()).sum()
        total += len(labels)
        total_loss += loss.data[0]

    print("The total acc {0}, the total {1}".format(total_acc, total))
    train_loss_.append(total_loss / total)
    train_acc_.append(total_acc / total)

    print('[Epoch: %3d/%3d] Training Loss: %.3f, Training Acc: %.3f'
          % (epoch, EPOCH_NUM, train_loss_[epoch], train_acc_[epoch]))

# See what the scores are after training
total_acc = 0.0
total = 0.0
for seqs, labels, lens, raw_datas in training_dataloader:
    tag_scores = model(Variable(seqs).t(), lens.numpy())

    _, predicted = torch.max(tag_scores.data, 1)
    total_acc += (predicted.numpy() == labels.numpy()).sum()
    total += len(labels)
print('Training Acc: %.3f, Training correct num: %.3f, Training total num: %.3f' % (total_acc/total, total_acc, total))


resultfile = codecs.open("test.result",'w','utf-8')
    

total_acc = 0.0
total = 0.0
for test, labels, lens, raw_datas in test_dataloader:
    test = Variable(test)
    tag_scores = model(test.t(), lens.numpy())
    for score, label, raw_data in zip(tag_scores, labels, raw_datas):
        resultfile.write('{0} {1} {2}\n'.format(ix_to_tag[torch.max(score, 0)[1].data.numpy()[0]], ix_to_tag[label], raw_data))

    _, predicted = torch.max(tag_scores.data, 1)
    total_acc += (predicted.numpy() == labels.numpy()).sum()
    total += len(labels)

print('Testing Acc: %.3f, Testing correct num: %.3f, Testing total num: %.3f' % (total_acc/total, total_acc, total))


resultfile.close()

print("--- %s seconds ---" % (time.time() - start_time))

This is data.py:

import torch
import codecs
from torch.utils.data import DataLoader, Dataset


def load_seqs_data(filepath):
    file = codecs.open(filepath,'r','utf-8')
    lines = [line.strip() for line in file] 
    file.close()

    trainingSequences = []
    for line in lines:
        
        word = line.split()
        assert len(word) >= 2
        trainingSequences.append((word[0], word[1:]))

    return sorted(trainingSequences, key=lambda sequence: len(sequence[1]))


def load_vocab_data():
    file = codecs.open('./data/vocab.txt', 'r', 'utf-8')
    lines = [line.strip() for line in file] 
    file.close()

    word_dic = {}
    for line in lines:
        word = line.split()
        assert len(word) == 2
        word_dic[word[0]] = word[1]

    return word_dic

def load_ix_dics():
    vocab_dic = load_vocab_data()
     
    word_to_ix = {}
    word_to_ix["<<pad>>"] = 0
    for word, numbers in vocab_dic.items():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
    
    tag_to_ix = {}
    ix_to_tag = {}
    tag_dic = load_tag_dic()
    for tag, num in tag_dic.items():
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)
            ix_to_tag[len(tag_to_ix)-1] = tag
    
    return word_to_ix, tag_to_ix, ix_to_tag
 
def load_tag_dic():
    file = codecs.open('./data/id2tag.txt','r','utf-8')
    lines = [line.strip() for line in file] 
    file.close()

    tag_dic = {}
    for line in lines:
        tag = line.split()
        assert len(tag) == 2
        tag_dic[tag[0]] = tag[1]

    return tag_dic


def vectorize_data(data, to_ix):
    return [[to_ix[tok] if tok in to_ix else to_ix['UNK'] for tok in seq] for y, seq in data]

def collate_fn(batch):
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    seqs, label, lens, raw_data = zip(*batch)
    pad_seqs = []
    lens = []
    max_len = len(seqs[0])
    for i in range(len(seqs)):
        temp_seq = [0] * max_len
        temp_seq[:len(seqs[i])] = seqs[i]
        pad_seqs.append(temp_seq)
        lens.append(len(seqs[i]))

    pad_seqs_tensor = torch.LongTensor(pad_seqs)
    label_tensor = torch.LongTensor(label)
    lens_tensor = torch.LongTensor(lens)

    return pad_seqs_tensor, label_tensor, lens_tensor, raw_data


def create_dataset(data, word_to_ix, tag_to_ix, bs=4):
    vectorized_seqs = vectorize_data(data, word_to_ix)
    seq_lengths = torch.LongTensor([len(s) for s in vectorized_seqs])
    target_tensor = torch.LongTensor([tag_to_ix[y] for y, _ in data])
    raw_data = [x for _, x in data]
    return DataLoader(MyDataset(vectorized_seqs, target_tensor, seq_lengths, raw_data),
                      batch_size=bs,
                      shuffle=False,
                      collate_fn=collate_fn,
                      num_workers=0)


class MyDataset(Dataset):
    def __init__(self, sequences, labels, lens, raw_datas):
        self.seqs = sequences
        self.labels = labels
        self.lens = lens
        self.raw_datas = raw_datas

    def __getitem__(self, index):
        seq, target, len, raw_data = self.seqs[index], self.labels[index], self.lens[index], self.raw_datas[index] 
        return seq, target, len, ''.join(raw_data)

    def __len__(self):
        return len(self.seqs)


Many thanks for any comments!

Update: I found the root cause. I was not calling init_hidden before running each evaluation batch, so the hidden state left over from training leaked into the runs after training.
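
For anyone hitting the same issue, here is a minimal sketch of the corrected evaluation loop, reusing the names from my script above (the same one-line change is needed in the test loop as well):

total_acc = 0.0
total = 0.0
for seqs, labels, lens, raw_datas in training_dataloader:
    # reset the recurrent state before every forward pass, exactly as in the
    # training loop, so no stale hidden state carries over between batches
    model.hidden = model.init_hidden()
    tag_scores = model(Variable(seqs).t(), lens.numpy())

    _, predicted = torch.max(tag_scores.data, 1)
    total_acc += (predicted.numpy() == labels.numpy()).sum()
    total += len(labels)
print('Training Acc: %.3f, Training correct num: %.3f, Training total num: %.3f' % (total_acc / total, total_acc, total))

An alternative that makes this harder to forget is to build the initial (h, c) inside forward instead of storing it on the module, so every forward pass starts from a fresh state.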