Long shot, but can anyone tell me what I am doing wrong? Thank you.
I am using a slightly modified version of the GitHub repo jeffreyyihuang/two-stream-action-recognition (a two-stream architecture implementing a classic action recognition method on the UCF101 dataset).
Everything worked fine (albeit with bad performance) with RandomCrop for training and CenterCrop for testing/validation. Now I am trying to use FiveCrop. I have adjusted the batch sizes accordingly, but I get an error I don't really understand, because my conv input should be (ncrops x batch_size, channels, height, width) = [5x6, 20, 224, 224].
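For reference, this is how I understand the shapes are supposed to flow; it is essentially the FiveCrop recipe from the torchvision docs. The Sequential model below is just a hypothetical 20-channel stand-in for checking shapes, not my actual resnet101:

import torch
import torch.nn as nn

# stand-in for my 20-channel-input network, only to verify the shape algebra
model = nn.Sequential(
    nn.Conv2d(20, 64, kernel_size=7, stride=2, padding=3),  # same in-channels as my conv1_custom
    nn.AdaptiveAvgPool2d(1),
)

bs, ncrops, c, h, w = 6, 5, 20, 224, 224
batch = torch.randn(bs, ncrops, c, h, w)        # what the DataLoader should yield with FiveCrop
fused = batch.view(-1, c, h, w)                 # (30, 20, 224, 224): a valid 4-D conv input
out = model(fused).view(bs * ncrops, -1)        # (30, 64)
out_avg = out.view(bs, ncrops, -1).mean(1)      # (6, 64): predictions averaged over the 5 crops
print(out_avg.size())                           # torch.Size([6, 64])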
I get this error:
Traceback (most recent call last):
  File "motion_cnn.py", line 282, in <module>
    main()
  File "motion_cnn.py", line 73, in main
    model.run()
  File "motion_cnn.py", line 123, in run
    self.train_1epoch()
  File "motion_cnn.py", line 172, in train_1epoch
    output = self.model(result)
  File "/media/d/DATA_2/two-stream-action-recognition-master/venv/local/lib/python2.7/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/media/d/DATA_2/two-stream-action-recognition-master/network.py", line 145, in forward
    x = self.conv1_custom(x)
  File "/media/d/DATA_2/two-stream-action-recognition-master/venv/local/lib/python2.7/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/media/d/DATA_2/two-stream-action-recognition-master/venv/local/lib/python2.7/site-packages/torch/nn/modules/conv.py", line 301, in forward
    self.padding, self.dilation, self.groups)
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 20, 7, 7], but got input of size [30, 9] instead
This is how my motion_cnn.py looks:
import numpy as np
import pickle
from PIL import Image
import time
import os
from tqdm import tqdm
import shutil
from random import randint
import argparse
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn
import torch
import torch.backends.cudnn as cudnn
import torch.cuda as cuda
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from utils import *
from network import *
import dataloader
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
cuda.set_device(0)
parser = argparse.ArgumentParser(description='UCF101 motion stream on resnet101')
parser.add_argument('--epochs', default=1000, type=int, metavar='N', help='number of total epochs')
parser.add_argument('--batch-size', default=6, type=int, metavar='N', help='mini-batch size (default: 4)')
parser.add_argument('--lr', default=1e-2, type=float, metavar='LR', help='initial learning rate')#best is 2
parser.add_argument('--evaluate', dest='evaluate', action='store_true', help='evaluate model on validation set')
parser.add_argument('--resume', default='80', type=str, metavar='PATH', help='path to latest checkpoint (default: none)')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N', help='manual epoch number (useful on restarts)')
def main():
    global arg
    arg = parser.parse_args()
    print arg

    # Prepare DataLoader
    data_loader = dataloader.Motion_DataLoader(
        BATCH_SIZE=arg.batch_size,
        num_workers=8,
        path='/media/d/DATA_2/two-stream-action-recognition-master/J/flow/',
        ucf_list='/media/d/DATA_2/two-stream-action-recognition-master/J_list/',
        ucf_split='01',
        in_channel=10,
    )
    train_loader, test_loader, test_video = data_loader.run()
    #print len(train_loader.dataset)

    # Model
    model = Motion_CNN(
        # Data Loader
        train_loader=train_loader,
        test_loader=test_loader,
        # Utility
        start_epoch=arg.start_epoch,
        resume=arg.resume,
        evaluate=arg.evaluate,
        # Hyper-parameter
        nb_epochs=arg.epochs,
        lr=arg.lr,
        batch_size=arg.batch_size,
        channel=10*2,
        test_video=test_video
    )
    # Training
    model.run()
class Motion_CNN():
    def __init__(self, nb_epochs, lr, batch_size, resume, start_epoch, evaluate, train_loader, test_loader, channel, test_video):
        self.nb_epochs = nb_epochs
        self.lr = lr
        self.batch_size = batch_size
        self.resume = resume
        self.start_epoch = start_epoch
        self.evaluate = evaluate
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.best_prec1 = 0
        self.channel = channel
        self.test_video = test_video

    def build_model(self):
        print ('==> Build model and setup loss and optimizer')
        # build model
        self.model = resnet101(pretrained=True, channel=self.channel).cuda()
        #print self.model
        # Loss function and optimizer
        self.optimizer = torch.optim.SGD(self.model.parameters(), self.lr, momentum=0.9, weight_decay=1e-2)
        # alternatives tried:
        #self.optimizer = torch.optim.Adam(self.model.parameters(), self.lr, eps=1e-2, amsgrad=True)
        #self.optimizer = torch.optim.Adagrad(self.model.parameters(), self.lr, weight_decay=1e-2)
        #self.optimizer = torch.optim.SGD(self.model.parameters(), self.lr, weight_decay=1e-2, nesterov=True)
        self.criterion = nn.CrossEntropyLoss().cuda()
        #self.scheduler = ReduceLROnPlateau(self.optimizer, 'min', patience=5, verbose=True)
        #self.scheduler = StepLR(self.optimizer, step_size=10, gamma=0.25)

    def resume_and_evaluate(self):
        if self.resume:
            if os.path.isfile(self.resume):
                print("==> loading checkpoint '{}'".format(self.resume))
                checkpoint = torch.load(self.resume)
                self.start_epoch = checkpoint['epoch']
                self.best_prec1 = checkpoint['best_prec1']
                self.model.load_state_dict(checkpoint['state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                print("==> loaded checkpoint '{}' (epoch {}) (best_prec1 {})"
                      .format(self.resume, checkpoint['epoch'], self.best_prec1))
            else:
                print("==> no checkpoint found at '{}'".format(self.resume))
        if self.evaluate:
            self.epoch = 0
            prec1, val_loss = self.validate_1epoch()
            return

    def run(self):
        self.build_model()
        self.resume_and_evaluate()
        cudnn.benchmark = True
        for self.epoch in range(self.start_epoch, self.nb_epochs):
            self.train_1epoch()
            prec1, val_loss = self.validate_1epoch()
            is_best = prec1 > self.best_prec1
            # lr_scheduler
            self.scheduler.step(val_loss)
            # save model
            if is_best:
                self.best_prec1 = prec1
                with open('record/motion/motion_video_preds.pickle', 'wb') as f:
                    pickle.dump(self.dic_video_level_preds, f)
                f.close()
            save_checkpoint({
                'epoch': self.epoch,
                'state_dict': self.model.state_dict(),
                'best_prec1': self.best_prec1,
                'optimizer': self.optimizer.state_dict()
            }, is_best, 'record/motion/checkpoint.pth.tar', 'record/motion/model_best.pth.tar')
    def train_1epoch(self):
        print('==> Epoch:[{0}/{1}][training stage]'.format(self.epoch, self.nb_epochs))
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        # switch to train mode
        self.model.train()
        end = time.time()
        # mini-batch training
        progress = tqdm(self.train_loader)
        for i, (data, label) in enumerate(progress):
            # measure data loading time
            data_time.update(time.time() - end)
            label = label.cuda(async=True)
            input_var = Variable(data).cuda()
            #print len(input_var)
            target_var = Variable(label).cuda()
            bs, ncrops, c, h, w = input_var.size()
            result = self.model(input_var.view(-1, c, h, w))  # fuse batch size and ncrops
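            # result: (bs*ncrops, num_classes) class scores from the network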
            # compute output
            output = self.model(result)
            loss = self.criterion(output, target_var)
            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, label, topk=(1, 5))
            losses.update(loss.data[0], data.size(0))
            top1.update(prec1[0], data.size(0))
            top5.update(prec5[0], data.size(0))
            # compute gradient and do SGD step
            self.optimizer.zero_grad()
            loss.backward()
            #torch.nn.utils.clip_grad_norm(self.model.parameters(), 0.25)
            self.optimizer.step()
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        info = {'Epoch': [self.epoch],
                'Batch Time': [round(batch_time.avg, 3)],
                'Data Time': [round(data_time.avg, 3)],
                'Loss': [round(losses.avg, 5)],
                'Prec@1': [round(top1.avg, 4)],
                'Prec@5': [round(top5.avg, 4)],
                'lr': self.optimizer.param_groups[0]['lr']
                }
        record_info(info, 'record/motion/opf_train.csv', 'train')
    def validate_1epoch(self):
        print('==> Epoch:[{0}/{1}][validation stage]'.format(self.epoch, self.nb_epochs))
        batch_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        # switch to evaluate mode
        self.model.eval()
        self.dic_video_level_preds = {}
        end = time.time()
        progress = tqdm(self.test_loader)
        for i, (keys, data, label) in enumerate(progress):
            #data = data.sub_(127.353346189).div_(14.971742063)
            label = label.cuda(async=True)
            data_var = Variable(data, volatile=True).cuda(async=True)
            label_var = Variable(label, volatile=True).cuda(async=True)
            bs, ncrops, c, h, w = data_var.size()
            result = self.model(data_var.view(-1, c, h, w))  # fuse batch size and ncrops
            result_avg = result.view(bs, ncrops, -1).mean(1)  # avg over crops
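            # result_avg: (bs, num_classes) scores after averaging over the five crops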
            # compute output
            output = self.model(result_avg)
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            # Calculate video level prediction
            preds = output.data.cpu().numpy()
            nb_data = preds.shape[0]
            for j in range(nb_data):
                videoName = keys[j].split('-', 1)[0]  # ApplyMakeup_g01_c01
                if videoName not in self.dic_video_level_preds.keys():
                    self.dic_video_level_preds[videoName] = preds[j, :]
                else:
                    self.dic_video_level_preds[videoName] += preds[j, :]

        # Frame to video level accuracy
        video_top1, video_top5, video_loss = self.frame2_video_level_accuracy()
        info = {'Epoch': [self.epoch],
                'Batch Time': [round(batch_time.avg, 3)],
                'Loss': [round(video_loss, 5)],
                'Prec@1': [round(video_top1, 3)],
                'Prec@5': [round(video_top5, 3)]
                }
        record_info(info, 'record/motion/opf_test.csv', 'test')
        return video_top1, video_loss
    def frame2_video_level_accuracy(self):
        correct = 0
        video_level_preds = np.zeros((len(self.dic_video_level_preds), 9))  #6
        video_level_labels = np.zeros(len(self.dic_video_level_preds))
        ii = 0
        for key in sorted(self.dic_video_level_preds.keys()):
            name = key.split('-', 1)[0]
            preds = self.dic_video_level_preds[name]
            label = int(self.test_video[name]) - 1
            video_level_preds[ii, :] = preds
            video_level_labels[ii] = label
            ii += 1
            if np.argmax(preds) == (label):
                correct += 1
        # top1 top5
        video_level_labels = torch.from_numpy(video_level_labels).long()
        video_level_preds = torch.from_numpy(video_level_preds).float()
        loss = self.criterion(Variable(video_level_preds).cuda(), Variable(video_level_labels).cuda())
        top1, top5 = accuracy(video_level_preds, video_level_labels, topk=(1, 5))
        top1 = float(top1.numpy())
        top5 = float(top5.numpy())
        return top1, top5, loss.data.cpu().numpy()


if __name__ == '__main__':
    main()
And this is how my dataloader.py looks:
import numpy as np
import pickle
from PIL import Image
import time
import shutil
import random
import argparse
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn
import torch
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.optim.lr_scheduler import ReduceLROnPlateau
from split_train_test_video import *
class motion_dataset(Dataset):
    def __init__(self, dic, in_channel, root_dir, mode, transform=None):
        # Generate a 16 Frame clip
        self.keys = dic.keys()
        self.values = dic.values()
        self.root_dir = root_dir
        self.transform = transform
        self.mode = mode
        self.in_channel = in_channel
        self.img_rows = 224
        self.img_cols = 224
        self.fiveCrops = True

    def stackopf(self, video_name, clip_idx, nb_clips=None):
        name = self.video
        u = self.root_dir + 'u/' + name
        v = self.root_dir + 'v/' + name
        if self.fiveCrops:
            self.ncrops = 5
        else:
            self.ncrops = 1
        flow = torch.FloatTensor(self.ncrops, 2*self.in_channel, self.img_rows, self.img_cols)
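        # flow: (ncrops, 2*in_channel, rows, cols) = (5, 20, 224, 224) with fiveCrops on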
        #i = int(self.clips_idx)
        i = int(clip_idx)
        for j in range(self.in_channel):
            idx = i + j
            if self.mode == 'train':
                if idx >= nb_clips+1:
                    idx = nb_clips+1
            idx = str(idx)
            frame_idx = 'frame' + idx.zfill(6)  # 6 zeros for frame name
            h_image = u + '/' + frame_idx + '.jpg'
            v_image = v + '/' + frame_idx + '.jpg'
            imgH = (Image.open(h_image))
            imgV = (Image.open(v_image))
            H = self.transform(imgH)
            V = self.transform(imgV)
            if self.fiveCrops:
                flow[:, 2*(j-1), :, :] = H.squeeze()
                flow[:, 2*(j-1)+1, :, :] = V.squeeze()
            else:
                flow[:, 2*(j-1), :, :] = H
                flow[:, 2*(j-1)+1, :, :] = V
            imgH.close()
            imgV.close()
        return flow.squeeze()

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        #print ('mode:', self.mode, 'calling Dataset:__getitem__ @ idx=%d' % idx)
        nb_clips = 0
        if self.mode == 'train':
            self.video, nb_clips = self.keys[idx].split('-')
            self.clips_idx = random.randint(1, int(nb_clips))
        elif self.mode == 'val':
            self.video, self.clips_idx = self.keys[idx].split('-')
        else:
            raise ValueError('There are only train and val mode')
        label = self.values[idx]
        label = int(label) - 1
        data = self.stackopf(self.video, self.clips_idx, int(nb_clips))
        #len(data)
        if self.mode == 'train':
            sample = (data, label)
        elif self.mode == 'val':
            sample = (self.video, data, label)
        else:
            raise ValueError('There are only train and val mode')
        return sample
class Motion_DataLoader():
    def __init__(self, BATCH_SIZE, num_workers, in_channel, path, ucf_list, ucf_split):
        self.BATCH_SIZE = BATCH_SIZE
        self.num_workers = num_workers
        self.frame_count = {}
        self.in_channel = in_channel
        self.data_path = path
        # split the training and testing videos
        splitter = UCF101_splitter(path=ucf_list, split=ucf_split)
        self.train_video, self.test_video = splitter.split_video()

    def load_frame_count(self):
        #print '==> Loading frame number of each video'
        with open('/media/d/DATA_2/two-stream-action-recognition-master/dataloader/dic/frame_count_j.pickle', 'rb') as file:
            dic_frame = pickle.load(file)
        file.close()
        for line in dic_frame:  # 'v_Lunges_g07_c01.avi'
            #videoname = line.split('_', 1)[1].split('.', 1)[0]  # Lunges_g07_c01
            #n, g = videoname.split('_', 1)
            #if n == 'HandStandPushups':
            #    videoname = 'HandstandPushups_' + g
            self.frame_count[line] = dic_frame[line]

    def run(self):
        self.load_frame_count()
        self.get_training_dic()
        self.val_sample19()
        train_loader = self.train()
        print len(train_loader.dataset)
        val_loader = self.val()
        return train_loader, val_loader, self.test_video

    def val_sample19(self):
        self.dic_test_idx = {}
        #print len(self.test_video)
        for video in self.test_video:  # Knot_Tying_D001_000041_000170 / ApplyEyeMakeup_g01_c01
            if self.frame_count[video] > 27 and self.frame_count[video] < 1200:  # CHANGE
                #n, g = video.split('_', 1)  # v_ApplyEyeMakeup_g01_c01.avi
                sampling_interval = int((self.frame_count[video]-10+1)/19)
                for index in range(19):
                    clip_idx = index*sampling_interval
                    key = video + '-' + str(clip_idx+1)
                    self.dic_test_idx[key] = self.test_video[video]

    def get_training_dic(self):
        self.dic_video_train = {}
        for video in self.train_video:
            if self.frame_count[video] > 27 and self.frame_count[video] < 1200:  # CHANGE!
                nb_clips = self.frame_count[video]-10+1
                key = video + '-' + str(nb_clips)
                self.dic_video_train[key] = self.train_video[video]

    def train(self):
        training_set = motion_dataset(dic=self.dic_video_train, in_channel=self.in_channel, root_dir=self.data_path,
                                      mode='train',
                                      transform=transforms.Compose([
                                          transforms.Resize([256, 256]),
                                          transforms.FiveCrop([224, 224]),
                                          transforms.Lambda(lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops]))
                                          #transforms.RandomCrop([224, 224]),
                                          #transforms.ToTensor(),
                                          #transforms.Normalize([0.5], [0.5])
                                      ]))
        print '==> Training data :', len(training_set), ' videos', training_set[1][0].size()
        train_loader = DataLoader(
            dataset=training_set,
            batch_size=self.BATCH_SIZE,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True
        )
        return train_loader

    def val(self):
        validation_set = motion_dataset(dic=self.dic_test_idx, in_channel=self.in_channel, root_dir=self.data_path,
                                        mode='val',
                                        transform=transforms.Compose([
                                            transforms.Resize([256, 256]),
                                            #transforms.CenterCrop([224, 224]),
                                            #transforms.ToTensor(),
                                            transforms.FiveCrop([224, 224]),
                                            transforms.Lambda(lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops]))
                                            #transforms.Normalize([0.5], [0.5])
                                        ]))
        print '==> Validation data :', len(validation_set), ' frames', validation_set[1][1].size()
        #print validation_set[1]
        val_loader = DataLoader(
            dataset=validation_set,
            batch_size=self.BATCH_SIZE,
            shuffle=True,
            num_workers=self.num_workers)
        return val_loader
if __name__ == '__main__':
    data_loader = Motion_DataLoader(BATCH_SIZE=1, num_workers=1, in_channel=10,
                                    path='/media/d/DATA_2/two-stream-action-recognition-master/J/flow/',
                                    ucf_list='/media/d/DATA_2/two-stream-action-recognition-master/J_list',
                                    ucf_split='01'
                                    )
    train_loader, val_loader, test_video = data_loader.run()
    #print train_loader, val_loader
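And just to rule out the transform itself, this standalone check with a dummy single-channel image (not one of my real flow frames) gives me the (ncrops, c, h, w) stack I expect:

import torch
from PIL import Image
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.Resize([256, 256]),
    transforms.FiveCrop([224, 224]),
    transforms.Lambda(lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops])),
])

img = Image.new('L', (320, 240))  # dummy grayscale frame standing in for a flow image
out = transform(img)
print(out.size())                 # torch.Size([5, 1, 224, 224])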