Hi there!
I am running a project of visual speech recognition task, the network structure is 3DConv+Resnet18+15*depth-wise 1DConv, the loss is CTC loss, and I can get a relatively good performance under model.train(). When I change the mode to model.eval() in val stage, the performance get very poor, and basically remain unchanged. But when I don’t use model.train() in val stage, the performance has little difference with that of training.
How to solve this problem? If anybody want more information about the project, just ask me.
Can anybody help me? Thanks very much!
@chen Are you saying you get better model accuracy when you use model.train() instead of model.eval() during validation?
model.eval() shouldn’t be the cause for difference in accuracy as it affects only behaviour of Dropout
and BatchNorm
.
It could be the case where your model has not generalized for unseen data.
Can you post the code here?
Try reducing the Dropout
value, or even set it to zero.
Thank you! The code you mean is the whole project or the model code or other else?
Network, training and evaluation code.
Thank you! I didn’t use Dropout
in the model, but I used many BN
layers.
model code:
import math
import torch.nn as nn
def conv3x3(in_planes, out_planes, stride=1):
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000):
self.inplanes = 64
super(ResNet, self).__init__()
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(2)
# self.fc = nn.Linear(512 * block.expansion, num_classes)
# self.bnfc = nn.BatchNorm1d(num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm1d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
# x = self.fc(x)
# x = self.bnfc(x)
return x
class GRU(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(GRU, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
# self.gru = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
self.fc = nn.Linear(hidden_size*2, num_classes)
def forward(self, x):
#h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)
out, _ = self.gru(x)
out = self.fc(out) # predictions based on every time step
return out
class Lipreading(nn.Module):
def __init__(self, mode, inputDim=512, hiddenDim=1024, nClasses=29, batchsize=36):
super(Lipreading, self).__init__()
self.mode = mode
self.inputDim = inputDim
self.hiddenDim = hiddenDim
self.nClasses = nClasses
self.batchsize = batchsize
self.nLayers = 3
self.relu = nn.ReLU(True)
self.bn = nn.BatchNorm1d(3*inputDim)
# frontend3D
self.frontend3D = nn.Sequential(
nn.Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False),
nn.BatchNorm3d(64),
nn.ReLU(True),
nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
) # width/4 height/4
# resnet
self.resnet18 = ResNet(BasicBlock, [2, 2, 2, 2], num_classes=self.inputDim)
# backend_conv
self.backend_conv_1 = nn.Conv1d(self.inputDim, 3*self.inputDim, 1, 1, 0, bias=False)
self.backend_conv_2 = nn.Sequential(
nn.Conv1d(3*self.inputDim, 3*self.inputDim, 5, 1, 2, bias=False, groups=3*self.inputDim),
nn.Conv1d(3*self.inputDim, 3*self.inputDim, 1, 1, 0),
)
# backend_gru
self.gru = GRU(self.inputDim, self.hiddenDim, self.nLayers, self.nClasses)
self.fc = nn.Linear(3*inputDim,nClasses)
# initialize
self._initialize_weights()
def forward(self, x):
x = self.frontend3D(x)
x = x.transpose(1, 2)
x = x.contiguous()
bs = x.size(0)
# fra_len = x.size(2)
x = x.view(-1, 64, x.size(3), x.size(4)) #torch.reshape() = torch.contiguous().view()
x = self.resnet18(x) # -1*512
if self.mode == 'temporalConv':
x = x.view(bs, -1, int(x.size(0)/bs))
x = self.backend_conv_1(x)
flag = 0
while(flag<15):
residual = x
x = self.backend_conv_2(x)
x += residual
x = self.bn(x)
x = self.relu(x)
flag += 1
x = x.transpose(0,2)
x = x.transpose(1,2)
x = self.fc(x)
elif self.mode == 'backendGRU' or self.mode == 'finetuneGRU':
x = x.view(int(x.size(0)/bs), bs, -1)
x = self.gru(x)
else:
raise Exception('No model is selected')
return x
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv3d):
n = m.kernel_size[0] * m.kernel_size[1] * m.kernel_size[2] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.Conv1d):
n = m.kernel_size[0] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm1d):
m.weight.data.fill_(1)
m.bias.data.zero_()
training and val code
import multiprocessing as mp
import random
import time
import argparse
import torch.optim as optim
from torchsummary import summary
from video_only.model import *
from video_only.dataset import *
from video_only.lr_scheduler import *
from video_only.cvtransforms import *
from video_only.Decoder import *
vocabularies = ['_','A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', ' ', '\'',
]
def data_loader(args):
dsets = {x: MyDataset(x, args.dataset) for x in ['train','test']}
dset_loaders = {x: torch.utils.data.DataLoader(dsets[x], batch_size=args.batchsize, shuffle=True, num_workers=args.workers,collate_fn=ctc_collate,pin_memory=True,drop_last=True) for x in ['train','test']}
dset_sizes = {x: len(dsets[x]) for x in ['train','test']}
print('\nStatistics: train: {}\ttest: {}'.format(dset_sizes['train'],dset_sizes['test']))
return dset_loaders, dset_sizes
def reload_model(model, path=""):
if not bool(path):
print('-'*20)
print('train from scratch')
print('-' * 20)
return model
else:
model_dict = model.state_dict()
pretrained_dict = torch.load(path)
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)
print('-' * 20)
print('*** model has been successfully loaded! ***')
print('-' * 20)
return model
if __name__ == '__main__':
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
parser = argparse.ArgumentParser(description='Pytorch ResNet18+GRU/Conv LRS')
parser.add_argument('--nClasses', default=29, type=int, help='the number of classes')
parser.add_argument('--path',default='/media/cxd/software/BLweights/temporalConv/temporalConv_566.pt',help='path to model')
parser.add_argument('--dataset', default='/home/cxd/Videos/trainval', help='path to dataset')
parser.add_argument('--mode', default='temporalConv', help='temporalConv, backendGRU, finetuneGRU')
parser.add_argument('--lr', default=0.00003, type=float, help='initial learning rate')
parser.add_argument('--batchsize', default=8, type=int, help='mini-batch size (default: 8)')
parser.add_argument('--workers', default=8, type=int, help='number of data loading workers (default: 16)')
parser.add_argument('--epochs', default=10000, type=int, help='number of total epochs')
parser.add_argument('--interval', default=10, type=int, help='display interval')
parser.add_argument('--seed', default=1, type=int, help='random seed')
parser.add_argument('--lmpath',default='/home/cxd/Videos/test_cor.binary',help='path to LM model')
args = parser.parse_args()
for arg in vars(args):
print('args:{}={}'.format(arg, getattr(args, arg)))
mp.set_start_method('forkserver',force=True)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# print(device)
save_path = '/media/cxd/software/BLweights/' + args.mode
# print(save_path)
if not os.path.isdir(save_path):
os.mkdir(save_path)
model = lipreading(mode=args.mode, inputDim=512, hiddenDim=1024, nClasses=args.nClasses, batchsize= args.batchsize)
summary(model,input_size=(1,75,48,96),device='cpu')
print('#Parameters:',sum(param.numel() for param in model.parameters()))
model = model.to(device)
decoder = Decoder(vocabularies,lm_path=args.lmpath)
# reload model
model = reload_model(model, args.path)
# define loss function and optimizer
criterion = nn.CTCLoss(zero_infinity=True).to(device)
if args.mode == 'temporalConv' or args.mode == 'finetuneGRU':
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.)
elif args.mode == 'backendGRU':
for param in model.parameters():
param.requires_grad = False
for param in model.gru.parameters():
param.requires_grad = True
optimizer = optim.Adam([
{'params': model.gru.parameters(), 'lr': args.lr}
], lr=0., weight_decay=0.)
else:
raise Exception('No model is found!')
dset_loaders, dset_sizes = data_loader(args)
# scheduler = AdjustLR(optimizer, [args.lr], sleep_epochs=10, half=10, verbose=1)
for epoch in range(566,args.epochs):
f = open('/home/cxd/Videos/log.txt','a')
f.write('epoch:{}'.format(epoch))
f.write('\n')
print('-' * 20)
print('training......')
print('-' * 20)
# scheduler.step(epoch)
model.train()
print('-' * 20)
print('Epoch {}/{}'.format(epoch, args.epochs - 1))
print('Current Learning rate: {}'.format(showLR(optimizer)))
print('-' * 20)
running_loss, running_corrects, running_all, cer = 0., 0., 0., 0.
for batch_idx, sample_batched in enumerate(dset_loaders['train']):
optimizer.zero_grad()
inputs,targets,lengths,y_lengths,idx = sample_batched
# print(inputs.shape)
# summary(model, input_size=(inputs.size(1),inputs.size(2),inputs.size(3),inputs.size(4)), device='cuda')
# inputs = inputs.permute(0,2,1,3,4)
inputs = inputs.float() # 36*1*29*40*80
inputs, targets = inputs.to(device) , targets.to(device)
# lengths, y_lengths = lengths.to(device), y_lengths.to(device)
outputs = model(inputs) # frame_len * batchsize * 29
loss = criterion(F.log_softmax(outputs,dim=-1),targets,lengths,y_lengths)
loss.backward()
# nn.utils.clip_grad_value_(model.parameters(), 10)
optimizer.step()
# stastics
decoded = decoder.decode_greedy(outputs,lengths)
cursor, gt = 0, []
for b in range(inputs.size(0)):
y_str = ''.join([vocabularies[ch] for ch in targets[cursor: cursor + y_lengths[b]]])
gt.append(y_str)
cursor += y_lengths[b]
CER = decoder.cer_batch(decoded,gt)
cer += CER
cer_mean = cer/(batch_idx+1)
running_loss += loss.data * inputs.size(0)
running_all += len(inputs)
if batch_idx == 0:
since = time.time()
if (batch_idx + 1) % (args.interval*10) == 0:
print('Test seq: {};\npred: {}\n'.format(gt, decoded))
elif (batch_idx+1) % args.interval == 0 or (batch_idx == len(dset_loaders['train'])-1):
# print('Test seq: {};\npred: {}\n'.format(gt,decoded))
print('Process: [{:5.0f}/{:5.0f} ({:.0f}%)]\tLoss: {:.4f}\tcer:{:.4f}\tCost time:{:5.0f}s\tEstimated time:{:5.0f}s\t'.format(
running_all,
len(dset_loaders['train'].dataset),
100. * batch_idx / (len(dset_loaders['train'])-1),
running_loss / running_all,
cer_mean,
time.time()-since,
(time.time()-since)*(len(dset_loaders['train'])-1) / batch_idx - (time.time()-since)))
print('{} Epoch:\t{:2}\tLoss: {:.4f}\tcer:{:.4f}\t'.format(
'pretrain',
epoch,
running_loss / len(dset_loaders['train'].dataset),
cer_mean)+'\n')
f.write('{} Epoch:\t{:2}\tLeaningRate:{}\tLoss: {:.4f}\tcer:{:.4f}\t'.format(
'train',
epoch,
showLR(optimizer),
running_loss / len(dset_loaders['train'].dataset),
cer_mean)+'\n')
f.close()
torch.save(model.state_dict(), save_path+'/'+args.mode+'_'+str(epoch+1)+'.pt')
print('-' * 20)
print('testing......')
print('-' * 20)
# model.eval()
# running_loss, running_corrects, running_all = 0., 0., 0.
Tcer = 0.0
for batch_idx, sample_batched in enumerate(dset_loaders['test']):
inputs, targets, lengths, y_lengths, idx = sample_batched
inputs = inputs.float() # 36*1*29*40*80
inputs, targets = inputs.to(device), targets.to(device)
# # lengths, y_lengths = lengths.to(device), y_lengths.to(device)
outputs = model(inputs)
#
decoded = decoder.decode_beam(outputs, lengths)
cursor, gt = 0, []
for b in range(inputs.size(0)):
y_str = ''.join([vocabularies[ch] for ch in targets[cursor: cursor + y_lengths[b]]])
gt.append(y_str)
cursor += y_lengths[b]
cer = decoder.cer_batch(decoded, gt)
Tcer += cer
Tcer_mean = Tcer / (batch_idx + 1)
print('Test seq: {};\npred: {}\n'.format(gt,decoded))
print('greedy_cer:{:.4f}\n'.format(Tcer_mean))
f = open('/home/cxd/Videos/log.txt', 'a')
f.write("test epoch:\t{}\tTcer:\t{:.4f}".format(epoch,Tcer_mean)+'\n'+'\n')
f.close()
Thank you very much! I found the reason. In my forward function
of the Network, I use the while
to repeat the depth-wise convolution layer
, while in pytorch, defined layers cannot be reused.