import torch
import gc
gc.collect()
torch.cuda.empty_cache()
import os
import numpy as np
from PIL import Image
from torchvision import transforms,models, utils
from sklearn import preprocessing
import argparse
import random
from torch.utils.tensorboard import SummaryWriter
from collections import defaultdict,OrderedDict
from copy import deepcopy
from torch.optim import Adam as Adam
from torch.nn import CrossEntropyLoss as CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
import torch
import random
from torch.utils.data.distributed import DistributedSampler
from apex.parallel import DistributedDataParallel as DDP
from apex.parallel import convert_syncbn_model
from apex import amp
# ---------------------------------------------------------------------------
# Reproducibility: seed every RNG the pipeline touches.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
# FIX: this is a multi-GPU (DDP) script -- seed all devices, not just the
# current one (manual_seed_all is a superset of manual_seed).
torch.cuda.manual_seed_all(0)
# Force deterministic cuDNN kernels (may be slower, but repeatable).
torch.backends.cudnn.deterministic = True
# ---------------------------------------------------------------------------
# Command-line arguments.
# NOTE(review): the original text was mangled by smart-quote conversion
# ('--' became an en-dash, quotes became curly); restored to valid Python.
parser = argparse.ArgumentParser()
parser.add_argument('--sub_idx', default=3, type=int)
# BUG FIX: the default was the *string* '51'; argparse applies `type=` only
# to command-line strings, never to defaults, so it must already be an int
# (it is later passed to torch.nn.Linear as out_features).
parser.add_argument('--categories', default=51, type=int)
parser.add_argument('--writerpath', default='/home/nikiguo93/python_examples/DataLoader/places365/runs/', type=str)
parser.add_argument('--modelname', default='resnet50', type=str)
parser.add_argument('--arch', default='resnet', type=str)
parser.add_argument('--lr', default=5e-6, type=float)
parser.add_argument('--MaxEpoch', default=150, type=int)
# Kept as a string on purpose: the script compares args.mtl == 'True' below.
parser.add_argument('--mtl', default='True', type=str)
# torch.distributed.launch / torchrun export LOCAL_RANK per worker process.
parser.add_argument('--local_rank', default=os.getenv('LOCAL_RANK', 0), type=int)
parser.add_argument('--dirpath', default='/scratch/nikiguo93/results_relu_fc_bk3_bk4/', type=str)
args = parser.parse_args()
print(args.mtl)
# One TensorBoard run directory per mtl setting.
writer = SummaryWriter(args.writerpath + 'train_places365_mtl' + str(args.mtl))
# Earlier, more explicit distributed-setup variant kept for reference
# (disabled -- superseded by the three lines below).
'''
num_of_gpu = torch.cuda.device_count()
print(num_of_gpu)
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
os.environ['NCCL_DEBUG'] = 'INFO'
os.environ['NCCL_DEBUG_SUBSYS'] = 'ALL'
os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'
set up distributed device
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(rank % torch.cuda.device_count())
torch.distributed.init_process_group(backend='nccl' if torch.distributed.is_nccl_available() else 'gloo')
device = torch.device('cuda', local_rank)
print(f"[init] == local rank: {local_rank}, global rank: {rank} ==")
'''
# Bind this process to its local GPU and join the NCCL process group.
# env:// initialization reads MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE,
# which torchrun / torch.distributed.launch export.
args.gpu = args.local_rank
torch.cuda.set_device(args.gpu)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
class Places365Dataset(Dataset):
    """Places365 subset restricted to the categories in ``file_list``.

    Walks ``<image_path>/train`` and ``<image_path>/val``, keeps only the
    listed categories, then randomly keeps the images whose sorted index
    falls in a sampled 400-of-1000 (train) / 50-of-100 (val) bucket.
    ``train=True`` serves the training split, ``train=False`` the val split.

    Items are dicts: ``{'input': transformed image tensor, 'label': tensor}``.

    NOTE(review): sampling uses the module-level `random` state, so two
    instances constructed in sequence draw different buckets -- confirm
    this matches the intended train/val pairing.
    """

    def __init__(self, image_path, file_list, transform, train=True):
        self.train = train
        self.transform = transform
        # Collect every image path belonging to a wanted category.
        train_ImageNames = []
        for FileName in os.listdir(image_path + 'train'):
            if FileName in file_list:
                for ImgName in os.listdir(image_path + 'train/' + FileName):
                    train_ImageNames.append(os.path.join(image_path, 'train/', FileName, ImgName))
        train_ImageNames.sort()
        test_ImageNames = []
        for FileName in os.listdir(image_path + 'val'):
            if FileName in file_list:
                for ImgName in os.listdir(image_path + 'val/' + FileName):
                    test_ImageNames.append(os.path.join(image_path, 'val/', FileName, ImgName))
        test_ImageNames.sort()
        # The category name is the image's parent directory.  (Robustness
        # fix: the original hard-coded split('/')[5], which silently breaks
        # if image_path is moved to a different depth.)
        train_label = [os.path.basename(os.path.dirname(p)) for p in train_ImageNames]
        test_label = [os.path.basename(os.path.dirname(p)) for p in test_ImageNames]
        # Keep ~400 of every 1000 consecutive train images.
        b_train = random.sample(range(0, 999), 400)
        self.train_img_names = []
        self.train_lbl = []
        for i in range(len(train_ImageNames)):
            if i % 1000 in b_train:
                self.train_img_names.append(train_ImageNames[i])
                self.train_lbl.append(train_label[i])
        # Keep ~50 of every 100 consecutive val images.
        b_test = random.sample(range(0, 99), 50)
        self.test_img_names = []
        self.test_lbl = []
        for i in range(len(test_ImageNames)):
            if i % 100 in b_test:
                self.test_img_names.append(test_ImageNames[i])
                self.test_lbl.append(test_label[i])
        # BUG FIX: fit the encoder once on the train labels and reuse the
        # same mapping on the val labels.  The original called fit_transform
        # on both splits independently, which only agrees when both splits
        # happen to contain exactly the same category set.
        le = preprocessing.LabelEncoder()
        self.train_lbl = torch.as_tensor(le.fit_transform(self.train_lbl))
        self.test_lbl = torch.as_tensor(le.transform(self.test_lbl))

    def __len__(self):
        # Size of whichever split this instance serves.
        return len(self.train_img_names) if self.train else len(self.test_img_names)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        if self.train:
            names, labels = self.train_img_names, self.train_lbl
        else:
            names, labels = self.test_img_names, self.test_lbl
        # Decode as RGB so grayscale files still yield 3 channels.
        image = self.transform(Image.open(names[idx]).convert('RGB'))
        return {'input': image, 'label': labels[idx]}
# ---------------------------------------------------------------------------
# Dataset root and the 51 Places365 categories used in this experiment.
image_path = '/scratch/nikiguo93/places365/'
file_list = ['mountain_snowy', 'bakery-shop', 'lighthouse', 'beach', 'ball_pit', 'dining_room', 'highway', 'bridge', 'creek',
             'waterfall', 'forest-broadleaf', 'nursery', 'campsite', 'car_interior', 'toyshop', 'shoe_shop', 'bookstore', 'coast', 'ocean', 'corridor',
             'skyscraper', 'water_tower', 'closet', 'landing_deck', 'florist_shop-indoor', 'park', 'pasture', 'pantry', 'swimming_pool-outdoor',
             'supermarket', 'conference_room', 'corn_field', 'movie_theater-indoor', 'laundromat', 'staircase', 'street', 'home_theater', 'utility_room',
             'auditorium', 'river', 'castle', 'butchers_shop', 'windmill', 'desert-sand', 'tower', 'cockpit', 'office', 'kitchen', 'carrousel',
             'playground', 'wheat_field']
# Standard ImageNet normalization (matches the pretrained backbone).
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    normalize,
])
PLACES_Traindataset = Places365Dataset(image_path, file_list, transform, train=True)
PLACES_Testdataset = Places365Dataset(image_path, file_list, transform, train=False)
# Shard the data across ranks; the train sampler also shuffles per epoch.
train_sampler = torch.utils.data.distributed.DistributedSampler(PLACES_Traindataset, shuffle=True,)
val_sampler = torch.utils.data.distributed.DistributedSampler(PLACES_Testdataset)
TraindataLoader = DataLoader(PLACES_Traindataset, batch_size=64, num_workers=8, pin_memory=True, sampler=train_sampler)
TestdataLoader = DataLoader(PLACES_Testdataset, batch_size=64, num_workers=8, shuffle=False, sampler=val_sampler)
# NOTE(review): the loaders still hold references, so these `del`s only
# drop the module-level names; kept for parity with the original.
del PLACES_Traindataset
del PLACES_Testdataset
def load_model(model, modelpath):
    """Load the 'best_model' weights from a checkpoint file into `model`.

    Keys are matched by name and shape after stripping a leading
    'module.' (the DataParallel/DDP prefix); non-matching keys are
    dropped and the model keeps its current values for them.  On success
    the model is switched to eval mode and moved to the current GPU.
    If `modelpath` does not exist, the model is left untouched.
    """
    if os.path.isfile(modelpath):
        print("=> loading checkpoint '{}'".format(modelpath))
        # FIX: load onto CPU first instead of whatever device the
        # checkpoint was saved from -- under multi-process DDP every rank
        # would otherwise deserialize onto the saver's GPU.
        checkpoint = torch.load(modelpath, map_location='cpu')
        state_dict = checkpoint['best_model']
        model_dict = model.state_dict()
        new_state_dict = OrderedDict()
        matched_layers, discarded_layers = [], []
        for k, v in state_dict.items():
            if k.startswith('module.'):
                k = k[7:]  # discard the 'module.' prefix
            if k in model_dict and model_dict[k].size() == v.size():
                new_state_dict[k] = v
                matched_layers.append(k)
            else:
                discarded_layers.append(k)
        model_dict.update(new_state_dict)
        model.load_state_dict(model_dict)
        # Surface silently-dropped keys so shape mismatches are visible.
        if discarded_layers:
            print('=> discarded {} checkpoint key(s)'.format(len(discarded_layers)))
        model.eval().cuda()
class CustomResNet(torch.nn.Module):
    """Multi-task network: two truncated ResNet backbones feed a shared
    regression head, and the full backbone feeds a classification head.

    forward(x) returns (x_eeg, x_img):
      * x_eeg -- 1700-dim regression output from concatenated 256-dim
        embeddings of two backbone truncations,
      * x_img -- 1654-way classification logits.

    NOTE(review): the original text was garbled ('def init',
    'models.dictmodelname'); reconstructed as the conventional
    torchvision-by-name lookup `models.__dict__[modelname]` -- confirm
    against the pristine source.
    """

    def __init__(self, modelname):
        super(CustomResNet, self).__init__()
        # Look the torchvision constructor up by name (e.g. 'resnet50').
        resnet50 = models.__dict__[modelname](pretrained=True)
        # Progressive truncations of the backbone: drop the last 2..5
        # top-level children.  features_3/features_4 are built but unused
        # below -- kept for compatibility with checkpoints.
        self.features_1 = torch.nn.ModuleList(resnet50.children())[:-2]
        self.features_2 = torch.nn.ModuleList(resnet50.children())[:-3]
        self.features_3 = torch.nn.ModuleList(resnet50.children())[:-4]
        self.features_4 = torch.nn.ModuleList(resnet50.children())[:-5]
        # Embedding branch on the deepest truncation.
        self.part_1 = torch.nn.Sequential(
            *self.features_1,
            torch.nn.ReLU(),
            torch.nn.Flatten(),
            torch.nn.Linear(100352, 256)
        )
        # Embedding branch on the shallower truncation.
        self.part_2 = torch.nn.Sequential(
            *self.features_2,
            torch.nn.ReLU(),
            torch.nn.Flatten(),
            torch.nn.Linear(200704, 256)
        )
        # Regression head over the concatenated 512-dim embedding.
        self.regression = torch.nn.Sequential(
            torch.nn.ReLU(),
            torch.nn.Linear(512, 1700))
        # Classification head shares features_1 modules with part_1.
        self.classification = torch.nn.Sequential(
            *self.features_1,
            torch.nn.AdaptiveAvgPool2d((1, 1)),
            torch.nn.Flatten(),
            torch.nn.Linear(2048, 1654))

    def forward(self, x):
        x1 = self.part_1(x)
        x2 = self.part_2(x)
        # Concatenate the two embeddings along the feature dimension.
        x_eeg = torch.cat([x1, x2], axis=1)
        x_eeg = self.regression(x_eeg)
        x_img = self.classification(x)
        return x_eeg, x_img
# ---------------------------------------------------------------------------
# Build the model to fine-tune: either the classification branch of the
# multi-task network (args.mtl == 'True') or a plain fine-tuned ResNet-50.
if args.mtl == 'True':
    print(args.mtl)
    mtl = CustomResNet(args.modelname)
    modelpath = args.dirpath + 'real_zeeg_sub' + str(args.sub_idx) + '_resnet50.pth'
    load_model(model=mtl, modelpath=modelpath)
    # Keep only the classification head (backbone + pool + flatten + fc).
    model_ft = mtl.classification
    del mtl
    # Index 10 is the final Linear; replace it for args.categories classes.
    num_ftrs = model_ft[10].in_features
    model_ft[10] = torch.nn.Linear(num_ftrs, args.categories, bias=True)
else:
    model_ft = models.resnet50(pretrained=True)
    # First restore the 1654-way head so the fine-tuned checkpoint matches...
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = torch.nn.Linear(num_ftrs, 1654, bias=True)
    modelpath = args.dirpath + 'finetune_classification_resnet50_all.pth'
    load_model(model_ft, modelpath)
    # ...then swap in a fresh head for the target number of categories.
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = torch.nn.Linear(num_ftrs, args.categories, bias=True)
# ---------------------------------------------------------------------------
# Freeze all but the last top-level child of the network (the new head
# stays trainable).
count = 0
for child in model_ft.children():
    count += 1
    if count < 9:
        for param in child.parameters():
            param.requires_grad = False
model_ft.cuda()
# Parameters to be optimized (only the ones left trainable matter).
params_to_update = model_ft.parameters()
print("Params to learn:")
for name, param in model_ft.named_parameters():
    if param.requires_grad == True:
        print("\t", name)
# Define the optimizer (SGD alternative kept for reference):
# optimizer_ft = SGD(params_to_update, lr=0.01, momentum=0.09, weight_decay=0.0005)
optimizer_ft = Adam(params_to_update, lr=args.lr)
# Apex mixed precision (O2) + apex DDP with deferred all-reduce.
model_ft, optimizer_ft = amp.initialize(model_ft, optimizer_ft, opt_level='O2')
model_ft = DDP(model_ft, delay_allreduce=True)
def reduce_value(value, average=True):
    """Sum a tensor across all ranks; optionally divide by world size.

    Returns the input unchanged in single-process runs.  The all-reduce
    happens in place (under no_grad); the averaged result is returned.
    """
    world_size = torch.distributed.get_world_size()
    if world_size < 2:
        return value
    with torch.no_grad():
        torch.distributed.all_reduce(value)
        if average:
            # BUG FIX: use out-of-place division.  The original in-place
            # `value /= world_size` raises on integer tensors -- and the
            # correct-count tensor passed from train()/test() is integral.
            value = value / world_size
    return value
def train(TraindataLoader, optimizer, model, criterion, ep):
    """Run one training epoch; return (avg loss per sample, accuracy).

    Loss and correct counts are all-reduced (averaged) across ranks but
    divided by the LOCAL sample count; under DistributedSampler every
    rank sees an equal-sized shard, so the averages agree -- confirm if
    the sampler configuration changes.
    """
    model.train()
    train_loss = 0
    num_correct = 0.0
    batch_size = 0
    # Re-seed the distributed sampler so each epoch gets a new shuffle.
    TraindataLoader.sampler.set_epoch(ep)
    for i_batch, sample_batched in enumerate(TraindataLoader):
        # (Deprecated torch.autograd.Variable wrappers and per-batch debug
        # prints removed -- tensors are autograd-aware since torch 0.4.)
        local_X = sample_batched['input'].cuda()
        local_Y = sample_batched['label'].cuda()
        optimizer.zero_grad()
        yhat = model(local_X)
        loss = criterion(yhat, local_Y)
        # apex amp scales the loss for mixed-precision backward.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        # Average the (already computed) loss across ranks for logging.
        loss = reduce_value(loss, average=True)
        train_loss += loss.item() * yhat.size()[0]
        batch_size += yhat.size()[0]
        # Update model weights.
        optimizer.step()
        # Top-1 accuracy bookkeeping.
        indx_pred = torch.max(yhat, dim=1)[1]
        num_correct += torch.eq(indx_pred, local_Y).sum()
    num_correct = reduce_value(num_correct, average=True).item()
    acc = num_correct / batch_size
    avg_trainloss = train_loss / batch_size
    torch.cuda.synchronize()
    return avg_trainloss, acc
def test(TestdataLoader, model, criterion):
    """Evaluate on the validation loader; return (avg loss, accuracy).

    Mirrors train(): loss/correct counts are all-reduced across ranks and
    divided by the LOCAL sample count (equal shards assumed).
    """
    test_loss = 0
    num_correct = 0.0
    batch_size = 0
    # Switch to evaluate mode and disable autograd.
    model.eval()
    with torch.no_grad():
        for i_batch, sample_batched in enumerate(TestdataLoader):
            # (Deprecated torch.autograd.Variable wrappers removed.)
            local_X = sample_batched['input'].cuda()
            local_Y = sample_batched['label'].cuda()
            yhat = model(local_X)
            loss = criterion(yhat, local_Y)
            loss = reduce_value(loss, average=True)
            batch_size += yhat.size()[0]
            test_loss += loss.item() * yhat.size()[0]
            # Top-1 accuracy bookkeeping.
            indx_pred = torch.max(yhat, dim=1)[1]
            num_correct += torch.eq(indx_pred, local_Y).sum()
    num_correct = reduce_value(num_correct, average=True).item()
    acc = num_correct / batch_size
    avg_testloss = test_loss / batch_size
    torch.cuda.synchronize()
    return avg_testloss, acc
# ---------------------------------------------------------------------------
# Training-loop state: track the weights with the lowest validation loss.
MaxEpoch = args.MaxEpoch
min_loss = 50000
best_model = None
best_epoch = -1
train_batches = 64  # NOTE(review): unused below -- kept for compatibility
test_batches = 64   # NOTE(review): unused below -- kept for compatibility
# Define the optimization criterion.
criterion = CrossEntropyLoss().cuda()
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft, step_size=1000, gamma=0.1)
def adjust_learning_rate(optimizer, epoch, init_lr):
    """Set the LR of every param group to init_lr decayed 10x per 30 epochs."""
    lr = init_lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
# ---------------------------------------------------------------------------
# Main training loop: train, evaluate, log to TensorBoard, and remember
# the state dict with the lowest validation loss.
for epoch in range(MaxEpoch):
    [avg_trainloss, acc_train] = train(TraindataLoader, optimizer_ft, model_ft, criterion, epoch)
    writer.add_scalar("Loss/train", avg_trainloss, epoch + 1)
    writer.add_scalar("acc_train/train", acc_train, epoch + 1)
    [avg_testloss, acc_test] = test(TestdataLoader, model_ft, criterion)
    # scheduler.step()
    if avg_testloss <= min_loss:
        min_loss = avg_testloss
        best_model = deepcopy(model_ft.state_dict())
        best_epoch = epoch + 1
    writer.add_scalar("Loss/test", avg_testloss, epoch + 1)
    writer.add_scalar("acc_test/test", acc_test, epoch + 1)
    writer.flush()
print("Best Epoch is: ", best_epoch, " with test loss: ", min_loss)
model_state_dict = model_ft.state_dict()
results_dir = '/scratch/nikiguo93/places365/results/'
# Save the checkpoint from rank 0 only, to avoid concurrent writes.
if args.local_rank == 0:
    torch.save({
        'epoch': epoch,
        'model_state_dict': model_state_dict,
        'best_model': best_model,
        'optimizer_state_dict': optimizer_ft.state_dict(),
        'loss': avg_trainloss,
    }, results_dir + 'places365_mtl_' + str(args.mtl) + '.pth')
writer.close()