Hello all! I am trying to train a simple PyTorch model implemented from the Siamese CNN paper. Training itself starts fine, but the run terminates prematurely at the same iteration (254) every time I run it. There is no error message, and adding a line that prints the network output each iteration doesn't show anything visibly wrong either.
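The per-iteration check was nothing fancy, just a print right after the forward pass inside the loop, roughly like this (a fragment from inside the loop, not standalone code):

```python
# debug print added inside the training loop, right after the forward pass
output = net(img1, img2)
print(batch_id, output.shape, output[:2].detach().float().cpu())  # looks normal every iteration
```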
My training code is as follows:
```python
from tqdm import tqdm
import argparse
import torch
import torch.nn as nn
import torchvision
from torch.autograd import Variable
from torchvision import transforms
import numpy as np
import pickle
from torch.utils.data import DataLoader
from siameseNet import SiameseNet
import os
from data import *
import apex
from apex import amp


def main(args):
    TRAIN_DIR = args.input_dir
    ITERATIONS = 50000

    # Dataset preparation
    print("Preparing the dataset")
    data_transforms = transforms.Compose([
        transforms.RandomAffine(15),
        transforms.ToTensor(),
    ])
    train_ds = OmniglotTrain(TRAIN_DIR, transform=data_transforms)
    trainLoader = DataLoader(train_ds, batch_size=128, shuffle=False,
                             num_workers=4, pin_memory=True)

    # Defining some model components
    criterion = nn.BCEWithLogitsLoss()
    net = SiameseNet().cuda()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)

    # Initializing Apex (amp.initialize returns the patched model and optimizer)
    net, optimizer = amp.initialize(net, optimizer, opt_level="O1")

    net.train()
    optimizer.zero_grad()
    train_loss = []
    loss_val = 0

    # Training run
    print(f"Training for {ITERATIONS} iterations")
    for batch_id, (img1, img2, label) in tqdm(enumerate(trainLoader, 1)):
        if batch_id > ITERATIONS:
            break
        img1 = Variable(img1.cuda(non_blocking=True))
        img2 = Variable(img2.cuda(non_blocking=True))
        label = Variable(label.cuda(non_blocking=True))

        optimizer.zero_grad()
        output = net(img1, img2)
        loss = criterion(output, label)
        loss_val += loss.item()
        train_loss.append(loss_val)

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()

        if batch_id % 10000 == 0:
            print("Saving model weights")
            torch.save(net.state_dict(), args.save_path)


if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--input_dir', help="Data directory")
    ap.add_argument('-s', '--save_path', help="Path to save model weights", default=".")
    main(ap.parse_args())
```
My Dataset class is as follows:
```python
import os
import random

import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset


class OmniglotTrain(Dataset):
    def __init__(self, directory, transform=None):
        super(OmniglotTrain, self).__init__()
        self.data, self.num_classes = self.load(directory)
        self.transform = transform

    def load(self, directory):
        data = {}
        idx = 0
        for alphaPath in os.listdir(directory):
            for charPath in os.listdir(os.path.join(directory, alphaPath)):
                data[idx] = []
                for sample in os.listdir(os.path.join(directory, alphaPath, charPath)):
                    with open(os.path.join(directory, alphaPath, charPath, sample), 'rb') as image:
                        data[idx].append(Image.open(image).convert('L'))
                idx += 1
        return data, idx

    def __len__(self):
        return 32460

    def __getitem__(self, index):
        # Get a pair of images from the same class
        if index % 2 == 1:
            label = 1.0
            idx1 = random.randint(0, self.num_classes - 1)
            image1 = random.choice(self.data[idx1])
            image2 = random.choice(self.data[idx1])
        # Get a pair of images from different classes
        else:
            label = 0.0
            idx1 = random.randint(0, self.num_classes - 1)
            idx2 = random.randint(0, self.num_classes - 1)
            while idx1 == idx2:
                idx2 = random.randint(0, self.num_classes - 1)
            image1 = random.choice(self.data[idx1])
            image2 = random.choice(self.data[idx2])
        if self.transform:
            image1 = self.transform(image1)
            image2 = self.transform(image2)
        return image1, image2, torch.from_numpy(np.array([label], dtype=np.float32))
```
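In case it helps narrow things down, this is the kind of standalone snippet I can use to poke at just the data-loading side, separate from training (a sketch using the same path and settings as in train.py, not part of the script itself):

```python
# standalone sanity check of the dataset / loader (sketch, not part of train.py)
from torch.utils.data import DataLoader
from torchvision import transforms
from data import OmniglotTrain

ds = OmniglotTrain("/hdd/supervised-reptile/data/omniglot",
                   transform=transforms.ToTensor())
loader = DataLoader(ds, batch_size=128, shuffle=False, num_workers=4)

print(len(ds))      # __len__ above is hard-coded to 32460
print(len(loader))  # number of batches the loader yields in one pass

img1, img2, label = next(iter(loader))
print(img1.shape, img2.shape, label.shape, label[:4])
```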
Output in terminal:
```
root@0cda0d56c14f:/hdd/papers/siamese# python train.py -i "/hdd/supervised-reptile/data/omniglot"
Preparing the dataset
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods.
Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Training for 50000 iterations
254it [00:42, 6.99it/s]
root@0cda0d56c14f:/hdd/papers/siamese#
```
Note: I have also tried disabling AMP entirely, but the behaviour is the same. The run also stops at the same point whether I launch it (with AMP) inside Docker or from a conda environment on my local machine.
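For completeness, the AMP-disabled variant I tried is essentially the same script with the apex pieces stripped out; a condensed sketch of it looks like this:

```python
# condensed sketch of the AMP-disabled variant (same Dataset / model / paths as above)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from siameseNet import SiameseNet
from data import OmniglotTrain

train_ds = OmniglotTrain("/hdd/supervised-reptile/data/omniglot",
                         transform=transforms.ToTensor())
trainLoader = DataLoader(train_ds, batch_size=128, shuffle=False,
                         num_workers=4, pin_memory=True)

net = SiameseNet().cuda()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
net.train()

for batch_id, (img1, img2, label) in enumerate(trainLoader, 1):
    img1, img2, label = img1.cuda(), img2.cuda(), label.cuda()
    optimizer.zero_grad()
    loss = criterion(net(img1, img2), label)
    loss.backward()   # plain FP32 backward instead of amp.scale_loss
    optimizer.step()
# this variant also stops after 254 iterations, with no error
```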
Hope someone can help!