Currently I’m working on feature learning and transfer learning (in the context of audio spectrograms). For this purpose, as a first step, I trained a network consisting of 4 parallel AlexNet-like conv-nets with shared weights, up to and including the fc6 layer. In fc7, the outputs of the four fc6 branches are concatenated, and fc8 performs the classification into 24 classes.
In the transfer learning part, I want to use the model saved as a PTH file during pretraining and further train the fully connected layers for the final classification into 10 classes. I want to freeze the convolutional layers 1-5 and train only the fully connected layers, fc6-fc8. If I understood it correctly, this approach is referred to as using the conv-net as a fixed feature extractor. The size of fc6 in the pretraining network (384 inputs, 256 outputs) is different from the size of fc6 in the later transfer learning network (4224 inputs, 1024 outputs), so fc6 cannot be initialized from the pretrained weights.
I am sharing my code, which runs without any errors so far, but I am not sure whether I am doing it right. The results (accuracy) are lower than when training the network completely from scratch without any pretraining. I tried to search for similar transfer learning cases, but most of the examples are about already “professionally” pretrained networks (like VGG, ResNet, …). In my case, I do the pretraining myself and save the model as a PTH file. In the transfer learning step, I want to load the PTH file, freeze the conv layers, and train the FC layers for the final classification task.
My code is shown below. I would be very grateful if somebody could tell me whether I am doing this correctly, or correct me if I am making a mistake. The most important question is whether loading the pretrained model, freezing the conv layers, and training only the fully connected layers is done correctly (in Network.py below).
Code consisting of:
- NetworkOrig.py (network for pretraining)
- Network.py (network for final classification)
- Short code snippet for saving the model (within Train.py, see below)
- Train.py (for training in classification step)
(dataloader script not attached)
Network in pretraining (containing the 4 parallel convnets with shared weights):
# NetworkOrig.py
class Network(nn.Module):
    """Pretraining network: one shared conv/fc6 branch applied to four tiles.

    The same ``conv`` and ``fc6`` modules (hence the same weights) process each
    of the four tiles. The four fc6 outputs are concatenated, passed through
    ``fc7`` and classified by ``classifier`` (``fc8``).

    Args:
        classes: number of output classes of the final linear layer.
    """

    @staticmethod
    def _named_sequential(named_layers):
        """Build an ``nn.Sequential`` whose children keep the given names."""
        block = nn.Sequential()
        for layer_name, layer in named_layers:
            block.add_module(layer_name, layer)
        return block

    def __init__(self, classes=24):
        super(Network, self).__init__()

        # Layer names and creation order are kept stable: the names become
        # state_dict keys of saved checkpoints.
        self.conv = self._named_sequential([
            ('conv1_s1', nn.Conv2d(3, 32, kernel_size=17, stride=2, padding=0)),
            ('relu1_s1', nn.ReLU(inplace=True)),
            ('pool1_s1', nn.MaxPool2d(kernel_size=3, stride=2)),
            ('lrn1_s1', LRN(local_size=5, alpha=0.0001, beta=0.75)),
            ('conv2_s1', nn.Conv2d(32, 96, kernel_size=3, padding=2, groups=2)),
            ('relu2_s1', nn.ReLU(inplace=True)),
            ('pool2_s1', nn.MaxPool2d(kernel_size=3, stride=2)),
            ('lrn2_s1', LRN(local_size=5, alpha=0.0001, beta=0.75)),
            ('conv3_s1', nn.Conv2d(96, 128, kernel_size=3, padding=1)),
            ('relu3_s1', nn.ReLU(inplace=True)),
            ('conv4_s1', nn.Conv2d(128, 128, kernel_size=3, padding=1, groups=2)),
            ('relu4_s1', nn.ReLU(inplace=True)),
            ('conv5_s1', nn.Conv2d(128, 96, kernel_size=3, padding=1, groups=2)),
            ('relu5_s1', nn.ReLU(inplace=True)),
            ('pool5_s1', nn.MaxPool2d(kernel_size=3, stride=2)),
        ])

        # Per-tile fully connected layer; the flattened conv output of one
        # tile must therefore have exactly 384 features.
        self.fc6 = self._named_sequential([
            ('fc6_s1', nn.Linear(384, 256)),
            ('relu6_s1', nn.ReLU(inplace=True)),
            ('drop6_s1', nn.Dropout(p=0.4)),
        ])

        # Fuses the four concatenated 256-d tile descriptors.
        self.fc7 = self._named_sequential([
            ('fc7', nn.Linear(4 * 256, 256)),
            ('relu7', nn.ReLU(inplace=True)),
            ('drop7', nn.Dropout(p=0.4)),
        ])

        self.classifier = self._named_sequential([
            ('fc8', nn.Linear(256, classes)),
        ])

    def forward(self, x):
        """Classify a batch of tiled inputs.

        Args:
            x: 5-D tensor laid out as (batch, tile, channel, height, width);
                the first four tiles along dimension 1 are used.

        Returns:
            The output of the ``fc8`` layer, one row per batch element.
        """
        # Unpacking five values also rejects inputs that are not 5-D.
        batch, _tiles, _channels, _height, _width = x.size()
        tiles_first = x.transpose(0, 1)

        branch_outputs = []
        # Run the shared branch on each of the 4 image tiles.
        for tile_idx in range(4):
            feat = self.conv(tiles_first[tile_idx])
            feat = self.fc6(feat.view(batch, -1))
            branch_outputs.append(feat.view([batch, 1, -1]))

        stacked = cat(branch_outputs, 1)
        hidden = self.fc7(stacked.view(batch, -1))
        return self.classifier(hidden)
Transfer learning network for the final classification. It loads the PTH file, freezes the conv layers, and trains the fully connected layers. This is the part where I am not sure whether I am doing it the correct way:
# Network.py
import torch
import torch.nn as nn
from torch import cat
import torch.nn.init as init
from torch.autograd import Variable
from NetworkOrig import Network as NetworkOriginal
import sys
sys.path.append('Utils')
from Layers import LRN
class Network(nn.Module):
    """Transfer-learning classifier on top of the frozen pretrained conv stack.

    The convolutional stack of the pretraining network is loaded from a
    checkpoint and frozen (``requires_grad = False``). New ``fc6``, ``fc7`` and
    ``fc8`` layers are created with fresh weights and are the only trainable
    parts. The module is built on the CPU; callers move it to the GPU with
    ``.cuda()`` when needed.

    Args:
        classes: number of output classes of the final linear layer.
        pretrained_path: checkpoint written during pretraining; it must hold
            the pretraining weights under the ``'state_dict'`` key.
    """

    def __init__(self, classes=10, pretrained_path='jps_040_018760.pth'):
        super(Network, self).__init__()

        # Rebuild the 24-class pretraining architecture so the checkpoint keys
        # match, then load its weights. map_location='cpu' lets a checkpoint
        # saved from a GPU run be opened on a machine without CUDA.
        netOrig = NetworkOriginal(24)
        checkpointL = torch.load(pretrained_path, map_location='cpu')
        netOrig.load_state_dict(checkpointL['state_dict'])

        # The first child of the pretraining network is its conv stack. It is
        # wrapped in a Sequential (rather than assigned directly) so the
        # state_dict keys stay 'features.0.<layer>'.
        frozen = list(netOrig.children())[:1]
        print("###seq list:", *frozen)
        self.features = nn.Sequential(*frozen)
        for p in self.features.parameters():
            p.requires_grad = False

        # New, trainable head. fc6 expects 4224 flattened conv features.
        self.fc6 = nn.Sequential()
        self.fc6.add_module('fc6_s1', nn.Linear(4224, 1024))
        self.fc6.add_module('relu6_s1', nn.ReLU(inplace=True))
        self.fc6.add_module('drop6_s1', nn.Dropout(p=0.3))

        # 4*256 == 1024, i.e. exactly the fc6 output width.
        self.fc7 = nn.Sequential()
        self.fc7.add_module('fc7', nn.Linear(4 * 256, 256))
        self.fc7.add_module('relu7', nn.ReLU(inplace=True))
        self.fc7.add_module('drop7', nn.Dropout(p=0.3))

        self.classifier = nn.Sequential()
        self.classifier.add_module('fc8', nn.Linear(256, classes))

    def forward(self, x):
        """Classify a batch of inputs.

        Args:
            x: 4-D tensor laid out as (batch, channel, height, width). Its
                flattened conv output must have 4224 features per sample.

        Returns:
            The output of the ``fc8`` layer, one row per batch element.
        """
        # Unpacking four values also rejects inputs that are not 4-D.
        B, _C, _H, _W = x.size()
        z = self.features(x)
        z = self.fc6(z.view(B, -1))
        # fc6 already yields (B, 1024); no reshaping is needed before fc7.
        z = self.fc7(z)
        z = self.classifier(z)
        return z
Code snippet for saving model:
# Build the network with 24 output classes; move it to the GPU when one is requested.
net = Network(24)
if args.gpu is not None:
    net.cuda()
...
...
# Store model weights and optimizer state together in a single checkpoint file.
snapshot = {
    'state_dict': net.state_dict(),
    'optimizer': optimizer.state_dict(),
}
torch.save(snapshot, filename)
print('Saved: ' + args.checkpoint)
Training script for better understanding:
# Train.py
import ...
#from ImageDataLoader import DataLoader
from ImageLoaderM import DataLoader
sys.path.append('Dataset')
from Network import Network
from NetworkOrig import Network as NetworkOrig
from TrainingUtils import adjust_learning_rate, compute_accuracy
parser = ...
# tensorboard logs
...
def _load_model_weights(net, path):
    """Load weights stored at ``path`` into ``net``.

    Accepts both a checkpoint dict with a ``'state_dict'`` entry (the format
    written by this script) and a bare state dict. Keys that do not match are
    ignored (``strict=False``), as in the resume path of ``main``.
    """
    state = torch.load(path, map_location='cpu')
    if isinstance(state, dict) and 'state_dict' in state:
        state = state['state_dict']
    net.load_state_dict(state, strict=False)


def main():
    """Train the transfer-learning network, or only evaluate it.

    Reads its configuration from the module-level ``args``. If the checkpoint
    directory contains ``.pth`` files, training resumes from the
    lexicographically last one; otherwise weights are optionally initialised
    from ``args.model``. With ``args.evaluate`` set, a single validation pass
    is run and the function returns.
    """
    if args.gpu is not None:
        print(('Using GPU %d'%args.gpu))
        os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu)
    else:
        print('CPU mode')

    trainpath = args.data + '/jpg'
    train_data = DataLoader(trainpath, 'TransferTrain.txt', classes=args.classes)
    train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                               batch_size=args.batch,
                                               shuffle=True,
                                               num_workers=args.cores)

    valpath = args.data + '/jpg'
    val_data = DataLoader(valpath, 'TransferDev.txt', classes=args.classes)
    # No shuffling for validation: metrics are averaged per batch, so a fixed
    # order keeps them reproducible between evaluations.
    val_loader = torch.utils.data.DataLoader(dataset=val_data,
                                             batch_size=args.batch,
                                             shuffle=False,
                                             num_workers=args.cores)

    iter_per_epoch = train_data.N/args.batch

    # Network initialize
    net = Network(args.classes)
    if args.gpu is not None:
        net.cuda()

    ############## Load from checkpoint if exists, otherwise from model ###############
    resume_files = []
    if os.path.exists(args.checkpoint):
        resume_files = sorted(f for f in os.listdir(args.checkpoint) if 'pth' in f)

    if resume_files:
        ckp = resume_files[-1]
        state = torch.load(args.checkpoint+'/'+ckp, map_location='cpu')
        net.load_state_dict(state['state_dict'], strict=False)
        # Checkpoints are named 'jps_<epoch>_<steps>.pth'; the step counter is
        # the LAST underscore-separated field (the middle one is the epoch).
        args.iter_start = int(ckp.split(".")[0].split("_")[-1])
        print('Starting from: ',ckp)
    elif args.model is not None:
        # The Network class defines no ``load`` method, so the weights are
        # loaded through load_state_dict instead.
        _load_model_weights(net, args.model)

    criterion = nn.CrossEntropyLoss()
    # Hand the optimizer only the trainable parameters; the frozen conv
    # layers (requires_grad == False) must not be optimised.
    trainable_params = [p for p in net.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable_params, lr=args.lr, weight_decay=1e-5)

    ############## TESTING ###############
    if args.evaluate:
        test(net,criterion,val_loader,0)
        return

    ############## TRAINING ###############
    print(('Start training: lr %f, batch size %d, classes %d'%(args.lr,args.batch,args.classes)))
    print(('Checkpoint: '+args.checkpoint))

    # Train the Model
    best_acc = 0.0
    test_acc = 0.0
    batch_time, net_time = [], []
    steps = args.iter_start
    for epoch in range(int(args.iter_start/iter_per_epoch),args.epochs):
        # Evaluate every 2nd epoch during the first 10 epochs, then every 10th.
        if (epoch%2==0 and epoch>0 and epoch<10) or (epoch%10==0 and epoch>9):
            test_acc = test(net, criterion, val_loader, steps)
            if(test_acc > best_acc):
                # torch.save does not create missing directories.
                os.makedirs(args.checkpoint, exist_ok=True)
                filename = '%s/jps_%03i_%06d.pth'%(args.checkpoint,epoch,steps)
                torch.save(
                    {'state_dict': net.state_dict(),'optimizer': optimizer.state_dict()},filename)
                print('Saved: '+args.checkpoint)
                best_acc = test_acc
            else:
                print('No model saving (best Acc %.2f%%)' %(best_acc))

        adjust_learning_rate(optimizer, epoch, init_lr=args.lr, step=20, decay=0.1)

        end = time()
        for i, (images, labels) in enumerate(train_loader):
            # Keep only the last 100 timings for the running averages below.
            batch_time.append(time()-end)
            if len(batch_time)>100:
                del batch_time[0]

            if args.gpu is not None:
                images = images.cuda()
                labels = labels.cuda()

            # Forward/Backward/Optimize
            optimizer.zero_grad()
            t = time()
            outputs = net(images)
            net_time.append(time()-t)
            if len(net_time)>100:
                del net_time[0]

            prec1, prec3 = compute_accuracy(outputs.detach().cpu(), labels.cpu(), topk=(1, 3))
            acc = prec1.item()

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            loss = loss.item()

            if steps%20==0:
                print(('[%2d/%2d] %5d) [batch load % 2.3fsec, net %1.2fsec], Loss: % 1.3f, Accuracy % 2.2f%%' %(
                    epoch+1, args.epochs, steps,
                    np.mean(batch_time), np.mean(net_time),
                    loss,acc)))

            steps += 1
            end = time()
def test(net, criterion, val_loader, steps):
    """Evaluate ``net`` on ``val_loader`` and log mean loss and accuracy.

    Args:
        net: the model to evaluate; it is switched to eval mode for the pass
            and back to train mode afterwards, even if evaluation fails.
        criterion: loss function called as ``criterion(outputs, labels)``.
        val_loader: iterable yielding ``(images, labels)`` batches.
        steps: global step used as x-coordinate for the logged scalars.

    Returns:
        Mean of the per-batch top-1 accuracies.
    """
    print('Network evaluation:')
    accuracy = []
    loss = []
    net.eval()
    try:
        # No gradients are needed for evaluation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for images, labels in val_loader:
                if args.gpu is not None:
                    images = images.cuda()

                # Labels stay on the CPU, so bring the outputs back as well.
                outputs = net(images).cpu()

                prec1, _ = compute_accuracy(outputs, labels, topk=(1, 1))
                accuracy.append(prec1.item())
                # Store plain floats so np.mean works on numbers, not tensors.
                loss.append(criterion(outputs, labels).item())
    finally:
        net.train()

    mean_accuracy = np.mean(accuracy)
    mean_loss = np.mean(loss)

    writerEval.add_scalar('Loss', mean_loss, steps)
    writerEval.add_scalar('Accuracy', mean_accuracy, steps)
    print('TESTING: %d), Accuracy %.2f%%' %(steps,mean_accuracy))
    print('TESTING: %d), Loss %.2f' %(steps,mean_loss))
    return mean_accuracy
# Run training/evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()