Training terminates halfway without any errors

Hello all! I am trying to train a simple PyTorch model implemented from the Siamese CNN paper. The training runs just fine and all, but the run seems to terminate prematurely at the same iteration (254) each time I run it. There is no error code, and adding in a line of code to print the output each iteration shows no visible bugs as well.

My training code is as follow:

from tqdm import tqdm
import argparse
import torch
import torch.nn as nn
import torchvision
from torch.autograd import Variable
from torchvision import transforms
import numpy as np
import pickle
from torch.utils.data import DataLoader
from siameseNet import SiameseNet
import os
from data import *
import apex
from apex import amp

def main(args):
    TRAIN_DIR = args.input_dir
    ITERATIONS = 50000

    # Dataset preparation
    print("Preparing the dataset")
    data_transforms = transforms.Compose([
        transforms.RandomAffine(15),
        transforms.ToTensor(),
    ])

    train_ds = OmniglotTrain(TRAIN_DIR, transform=data_transforms)
    trainLoader = DataLoader(train_ds, batch_size=128, shuffle=False, num_workers=4, pin_memory=True)

    # Defining some model components
    criterion = nn.BCEWithLogitsLoss()
    net = SiameseNet().cuda()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
    amp.initialize(net, optimizer, opt_level="O1")

    # Initializing Apex
    net.train()
    optimizer.zero_grad()

    train_loss = []
    loss_val = 0

    # Training Run
    print(f"Training for {ITERATIONS} iterations")
    for batch_id, (img1, img2, label) in tqdm(enumerate(trainLoader, 1)):
        if batch_id > ITERATIONS:
            break
        img1, img2, label = Variable(img1.cuda(non_blocking=True)), Variable(img2.cuda(non_blocking=True)), Variable(label.cuda(non_blocking=True))

        optimizer.zero_grad()
        output = net.forward(img1, img2)
        loss = criterion(output, label)

        loss_val += loss.item()
        train_loss.append(loss_val)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()

        if batch_id % 10000 == 0:
            print("Saving model weights")
            torch.save(net.state_dict(), args.save_path)

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument('-i', '--input_dir', help="Data directory")
    ap.add_argument('-s', '--save_path', help="Path to save model weights", default=".")
    main(ap.parse_args())

My Dataset class is as follows:

class OmniglotTrain(Dataset):
    def __init__(self, directory, transform=None):
        super(OmniglotTrain, self).__init__()
        self.data, self.num_classes = self.load(directory)
        self.transform = transform

    def load(self, directory):
        data = {}
        idx = 0
        for alphaPath in os.listdir(directory):
            for charPath in os.listdir(os.path.join(directory, alphaPath)):
                data[idx] = []
                for sample in os.listdir(os.path.join(directory, alphaPath, charPath)):
                    with open(os.path.join(directory, alphaPath, charPath, sample), 'rb') as image:
                        data[idx].append(Image.open(image).convert('L'))
                idx += 1

        return data, idx

    def __len__(self):
        return 32460

    def __getitem__(self, index):
        label = None
        img1 = None
        img2 = None
        # Get image from same class
        if index % 2 == 1:
            label = 1.0
            idx1 = random.randint(0, self.num_classes-1)
            image1 = random.choice(self.data[idx1])
            image2 = random.choice(self.data[idx1])
        # Get image from different class
        else:
            label = 0.0
            idx1 = random.randint(0, self.num_classes-1)
            idx2 = random.randint(0, self.num_classes-1)
            while idx1 == idx2:
                idx2 = random.randint(0, self.num_classes-1)
            image1 = random.choice(self.data[idx1])
            image2 = random.choice(self.data[idx2])

        if self.transform:
            image1 = self.transform(image1)
            image2 = self.transform(image2)

        return image1, image2, torch.from_numpy(np.array([label], dtype=np.float32))

Output in terminal:

root@0cda0d56c14f:/hdd/papers/siamese# python train.py -i "/hdd/supervised-reptile/data/omniglot"
Preparing the dataset
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Training for 50000 iterations
254it [00:42,  6.99it/s]
root@0cda0d56c14f:/hdd/papers/siamese#

**Note, I have also tried disabling AMP, but the output is the same. This particular code also gave the same output regardless of whether I ran it using AMP in Docker, or using a conda environment on my local machine.
Hope anyone can help!

Could you check the length of your DataLoader?
Maybe your training just finishes before reaching 10000 batches?

1 Like

Ahhh that explains it! My DataLoader has a length of 254. Which corresponds correctly to the __len__ value my Dataset has (32460 // 128 = 254). Thank you!