Large performance gap between train mode and eval mode

Hi there!
I am working on a visual speech recognition project. The network is a 3D-Conv frontend + ResNet18 + 15 depth-wise 1D-Conv blocks, and the loss is CTC loss. I get relatively good performance under model.train(), but when I switch to model.eval() for the validation stage, the performance becomes very poor and basically stays unchanged. However, if I skip model.eval() and leave the model in train mode during validation, the performance differs little from training.
How can I solve this problem? If anybody wants more information about the project, just ask me.
Can anybody help me? Thanks very much!

@chen Are you saying you get better model accuracy when you use model.train() instead of model.eval() during validation?
model.eval() by itself shouldn’t cause a drop in accuracy: it only changes the behaviour of Dropout and BatchNorm layers.
It could also simply be that your model has not generalized to unseen data.
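To see the BatchNorm part in isolation, here is a toy example (standalone, not your model) of why the two modes can disagree: train mode normalizes with the current batch's statistics, while eval mode uses the accumulated running statistics.

import torch
import torch.nn as nn

bn = nn.BatchNorm1d(4)
x = torch.randn(8, 4) * 5 + 3              # batch statistics far from BN's initial running stats

bn.train()
out_train = bn(x)                          # normalized with this batch's mean/var

bn.eval()
out_eval = bn(x)                           # normalized with the running mean/var
print((out_train - out_eval).abs().max())  # clearly non-zero: the two modes disagree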
Can you post the code here?

Try reducing the Dropout probability, or even setting it to zero.
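If you want to test that quickly without rebuilding the model, a sketch like this (assuming plain nn.Dropout modules) zeroes the probability in place:

import torch.nn as nn

def zero_dropout(model):
    # p is read at call time, so setting it to 0 makes Dropout a no-op even in train mode
    for m in model.modules():
        if isinstance(m, nn.Dropout):
            m.p = 0.0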

Thank you! Which code do you mean: the whole project, just the model code, or something else?

The network, training, and evaluation code.

Thank you! I didn’t use Dropout in the model, but I did use many BatchNorm layers.

Model code:

import math
import torch.nn as nn

def conv3x3(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(2)
        # self.fc = nn.Linear(512 * block.expansion, num_classes)
        # self.bnfc = nn.BatchNorm1d(num_classes)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # x = self.fc(x)
        # x = self.bnfc(x)
        return x


class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # self.gru = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        # h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)
        out, _ = self.gru(x)

        out = self.fc(out)  # predictions based on every time step

        return out


class Lipreading(nn.Module):
    def __init__(self, mode, inputDim=512, hiddenDim=1024, nClasses=29, batchsize=36):
        super(Lipreading, self).__init__()
        self.mode = mode
        self.inputDim = inputDim
        self.hiddenDim = hiddenDim
        self.nClasses = nClasses
        self.batchsize = batchsize
        self.nLayers = 3
        self.relu = nn.ReLU(True)
        self.bn = nn.BatchNorm1d(3 * inputDim)
        # frontend3D
        self.frontend3D = nn.Sequential(
                nn.Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False),
                nn.BatchNorm3d(64),
                nn.ReLU(True),
                nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
                )  # width/4  height/4
        # resnet
        self.resnet18 = ResNet(BasicBlock, [2, 2, 2, 2], num_classes=self.inputDim)
        # backend_conv
        self.backend_conv_1 = nn.Conv1d(self.inputDim, 3 * self.inputDim, 1, 1, 0, bias=False)

        self.backend_conv_2 = nn.Sequential(
            nn.Conv1d(3 * self.inputDim, 3 * self.inputDim, 5, 1, 2, bias=False, groups=3 * self.inputDim),
            nn.Conv1d(3 * self.inputDim, 3 * self.inputDim, 1, 1, 0),
        )
        # backend_gru
        self.gru = GRU(self.inputDim, self.hiddenDim, self.nLayers, self.nClasses)
        self.fc = nn.Linear(3 * inputDim, nClasses)
        # initialize
        self._initialize_weights()

    def forward(self, x):
        x = self.frontend3D(x)
        x = x.transpose(1, 2)
        x = x.contiguous()
        bs = x.size(0)
        # fra_len = x.size(2)
        x = x.view(-1, 64, x.size(3), x.size(4))  # equivalent to torch.reshape() on a contiguous tensor
        x = self.resnet18(x)   # -1 * 512
        if self.mode == 'temporalConv':
            x = x.view(bs, -1, int(x.size(0) / bs))
            x = self.backend_conv_1(x)
            flag = 0
            while flag < 15:
                residual = x
                x = self.backend_conv_2(x)
                x += residual
                x = self.bn(x)
                x = self.relu(x)
                flag += 1
            x = x.transpose(0, 2)
            x = x.transpose(1, 2)
            x = self.fc(x)
        elif self.mode == 'backendGRU' or self.mode == 'finetuneGRU':
            x = x.view(int(x.size(0) / bs), bs, -1)
            x = self.gru(x)
        else:
            raise Exception('No model is selected')
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.kernel_size[2] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.Conv1d):
                n = m.kernel_size[0] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, (nn.BatchNorm3d, nn.BatchNorm2d, nn.BatchNorm1d)):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

Training and validation code:

import multiprocessing as mp
import os
import random
import time
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
from video_only.model import *
from video_only.dataset import *
from video_only.lr_scheduler import *
from video_only.cvtransforms import *
from video_only.Decoder import *

vocabularies = ['_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
                'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', ' ', '\'']


def data_loader(args):
    dsets = {x: MyDataset(x, args.dataset) for x in ['train', 'test']}
    dset_loaders = {x: torch.utils.data.DataLoader(dsets[x], batch_size=args.batchsize, shuffle=True,
                                                   num_workers=args.workers, collate_fn=ctc_collate,
                                                   pin_memory=True, drop_last=True) for x in ['train', 'test']}
    dset_sizes = {x: len(dsets[x]) for x in ['train', 'test']}
    print('\nStatistics: train: {}\ttest: {}'.format(dset_sizes['train'], dset_sizes['test']))
    return dset_loaders, dset_sizes

def reload_model(model, path=""):
    if not bool(path):
        print('-' * 20)
        print('train from scratch')
        print('-' * 20)
        return model
    else:
        model_dict = model.state_dict()
        pretrained_dict = torch.load(path)
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)
        print('-' * 20)
        print('*** model has been successfully loaded! ***')
        print('-' * 20)
        return model


if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    parser = argparse.ArgumentParser(description='Pytorch ResNet18+GRU/Conv LRS')

    parser.add_argument('--nClasses', default=29, type=int, help='the number of classes')
    parser.add_argument('--path', default='/media/cxd/software/BLweights/temporalConv/temporalConv_566.pt', help='path to model')
    parser.add_argument('--dataset', default='/home/cxd/Videos/trainval', help='path to dataset')
    parser.add_argument('--mode', default='temporalConv', help='temporalConv, backendGRU, finetuneGRU')
    parser.add_argument('--lr', default=0.00003, type=float, help='initial learning rate')
    parser.add_argument('--batchsize', default=8, type=int, help='mini-batch size (default: 8)')
    parser.add_argument('--workers', default=8, type=int, help='number of data loading workers (default: 8)')
    parser.add_argument('--epochs', default=10000, type=int, help='number of total epochs')
    parser.add_argument('--interval', default=10, type=int, help='display interval')
    parser.add_argument('--seed', default=1, type=int, help='random seed')
    parser.add_argument('--lmpath', default='/home/cxd/Videos/test_cor.binary', help='path to LM model')
    args = parser.parse_args()
    for arg in vars(args):
        print('args:{}={}'.format(arg, getattr(args, arg)))
    mp.set_start_method('forkserver', force=True)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    # print(device)
    save_path = '/media/cxd/software/BLweights/' + args.mode
    # print(save_path)
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    model = Lipreading(mode=args.mode, inputDim=512, hiddenDim=1024, nClasses=args.nClasses, batchsize=args.batchsize)
    summary(model, input_size=(1, 75, 48, 96), device='cpu')

    print('#Parameters:', sum(param.numel() for param in model.parameters()))

    model = model.to(device)

    decoder = Decoder(vocabularies, lm_path=args.lmpath)

    # reload model
    model = reload_model(model, args.path)
    # define loss function and optimizer
    criterion = nn.CTCLoss(zero_infinity=True).to(device)
    if args.mode == 'temporalConv' or args.mode == 'finetuneGRU':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.)
    elif args.mode == 'backendGRU':
        for param in model.parameters():
            param.requires_grad = False
        for param in model.gru.parameters():
            param.requires_grad = True
        optimizer = optim.Adam([
            {'params': model.gru.parameters(), 'lr': args.lr}
        ], lr=0., weight_decay=0.)
    else:
        raise Exception('No model is found!')

    dset_loaders, dset_sizes = data_loader(args)
    # scheduler = AdjustLR(optimizer, [args.lr], sleep_epochs=10, half=10, verbose=1)
    for epoch in range(566, args.epochs):

        f = open('/home/cxd/Videos/log.txt', 'a')
        f.write('epoch:{}'.format(epoch))
        f.write('\n')

        print('-' * 20)
        print('training......')
        print('-' * 20)
        # scheduler.step(epoch)

        model.train()
        print('-' * 20)
        print('Epoch {}/{}'.format(epoch, args.epochs - 1))
        print('Current Learning rate: {}'.format(showLR(optimizer)))
        print('-' * 20)

        running_loss, running_corrects, running_all, cer = 0., 0., 0., 0.

        for batch_idx, sample_batched in enumerate(dset_loaders['train']):
            optimizer.zero_grad()
            inputs, targets, lengths, y_lengths, idx = sample_batched

            # print(inputs.shape)
            # summary(model, input_size=(inputs.size(1), inputs.size(2), inputs.size(3), inputs.size(4)), device='cuda')
            # inputs = inputs.permute(0, 2, 1, 3, 4)
            inputs = inputs.float()     # 36*1*29*40*80
            inputs, targets = inputs.to(device), targets.to(device)
            # lengths, y_lengths = lengths.to(device), y_lengths.to(device)
            outputs = model(inputs)     # frame_len * batchsize * 29

            loss = criterion(F.log_softmax(outputs, dim=-1), targets, lengths, y_lengths)

            loss.backward()

            # nn.utils.clip_grad_value_(model.parameters(), 10)

            optimizer.step()

            # statistics
            decoded = decoder.decode_greedy(outputs, lengths)
            cursor, gt = 0, []
            for b in range(inputs.size(0)):
                y_str = ''.join([vocabularies[ch] for ch in targets[cursor: cursor + y_lengths[b]]])
                gt.append(y_str)
                cursor += y_lengths[b]
            CER = decoder.cer_batch(decoded, gt)
            cer += CER
            cer_mean = cer / (batch_idx + 1)

            running_loss += loss.data * inputs.size(0)
            running_all += len(inputs)
            if batch_idx == 0:
                since = time.time()
            if (batch_idx + 1) % (args.interval * 10) == 0:
                print('Train seq: {};\npred: {}\n'.format(gt, decoded))
            elif (batch_idx + 1) % args.interval == 0 or (batch_idx == len(dset_loaders['train']) - 1):
                print('Process: [{:5.0f}/{:5.0f} ({:.0f}%)]\tLoss: {:.4f}\tcer:{:.4f}\tCost time:{:5.0f}s\tEstimated time:{:5.0f}s\t'.format(
                    running_all,
                    len(dset_loaders['train'].dataset),
                    100. * batch_idx / (len(dset_loaders['train']) - 1),
                    running_loss / running_all,
                    cer_mean,
                    time.time() - since,
                    (time.time() - since) * (len(dset_loaders['train']) - 1) / batch_idx - (time.time() - since)))

        print('{} Epoch:\t{:2}\tLoss: {:.4f}\tcer:{:.4f}\t'.format(
            'pretrain',
            epoch,
            running_loss / len(dset_loaders['train'].dataset),
            cer_mean) + '\n')

        f.write('{} Epoch:\t{:2}\tLearningRate:{}\tLoss: {:.4f}\tcer:{:.4f}\t'.format(
            'train',
            epoch,
            showLR(optimizer),
            running_loss / len(dset_loaders['train'].dataset),
            cer_mean) + '\n')

        f.close()

        torch.save(model.state_dict(), save_path + '/' + args.mode + '_' + str(epoch + 1) + '.pt')

        print('-' * 20)
        print('testing......')
        print('-' * 20)
        # model.eval()
        # running_loss, running_corrects, running_all = 0., 0., 0.
        Tcer = 0.0
        for batch_idx, sample_batched in enumerate(dset_loaders['test']):
            inputs, targets, lengths, y_lengths, idx = sample_batched
            inputs = inputs.float()  # 36*1*29*40*80
            inputs, targets = inputs.to(device), targets.to(device)
            # lengths, y_lengths = lengths.to(device), y_lengths.to(device)
            outputs = model(inputs)

            decoded = decoder.decode_beam(outputs, lengths)
            cursor, gt = 0, []
            for b in range(inputs.size(0)):
                y_str = ''.join([vocabularies[ch] for ch in targets[cursor: cursor + y_lengths[b]]])
                gt.append(y_str)
                cursor += y_lengths[b]
            cer = decoder.cer_batch(decoded, gt)
            Tcer += cer
            Tcer_mean = Tcer / (batch_idx + 1)
            print('Test seq: {};\npred: {}\n'.format(gt, decoded))
            print('beam_cer:{:.4f}\n'.format(Tcer_mean))

        f = open('/home/cxd/Videos/log.txt', 'a')
        f.write("test epoch:\t{}\tTcer:\t{:.4f}".format(epoch, Tcer_mean) + '\n' + '\n')
        f.close()

Thank you very much! I found the reason. In the forward function of the network I use a while loop to apply the same depth-wise convolution block, including the same self.bn BatchNorm layer, 15 times. Reusing a layer like this is just weight sharing and is allowed in PyTorch, but a reused BatchNorm layer keeps only one set of running statistics, so it blends the activation statistics of all 15 depths; in eval mode those blended statistics match none of the individual depths, which is why the performance collapsed. Each repetition needs its own BatchNorm layer.
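For anyone who hits the same symptom, here is a minimal sketch of one way to restructure the backend so that each of the 15 repetitions owns its BatchNorm (DepthwiseBlock is an illustrative name, not from the original code):

import torch.nn as nn

class DepthwiseBlock(nn.Module):
    # One residual depth-wise 1D-conv block; its BatchNorm tracks the
    # running statistics of this depth only (illustrative restructuring).
    def __init__(self, channels):
        super(DepthwiseBlock, self).__init__()
        self.dw = nn.Conv1d(channels, channels, 5, 1, 2, bias=False, groups=channels)
        self.pw = nn.Conv1d(channels, channels, 1, 1, 0)
        self.bn = nn.BatchNorm1d(channels)
        self.relu = nn.ReLU(True)

    def forward(self, x):
        residual = x
        x = self.pw(self.dw(x))
        return self.relu(self.bn(x + residual))

# In Lipreading.__init__, replace the single reused backend_conv_2 / self.bn with:
#     self.backend = nn.ModuleList([DepthwiseBlock(3 * inputDim) for _ in range(15)])
# and in forward:
#     for block in self.backend:
#         x = block(x)

Sharing the convolution weights across depths would still be legal; it is only the single set of BatchNorm running statistics that cannot represent 15 different activation distributions at once.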