Training slows down over time with each iteration

I am trying to implement the Relational Networks paper, and the time taken per training iteration keeps increasing erratically, as follows:

I am running this across 4 Titan X GPUs on Linux 14.04. I looked up this thread and tried initializing the LSTM hidden state explicitly instead of leaving it at its default, but then I started getting illegal memory access errors.

I have also tried launching the code with CUDA_LAUNCH_BLOCKING=1 python trainer.py, but it took what seemed like forever to start training, so I abandoned that approach. The dataset is currently stored in .h5 files, and I read the batches from them.

Please find the code of my model below:

class RelationalNetwork(nn.Module):
    """Relational Network that answers a question about an image.

    The forward pass embeds the question tokens and runs them through an LSTM,
    runs the image through four stride-2 convolution + batch-norm layers,
    treats every spatial cell of the final feature map as an "object" tagged
    with a 2-d coordinate, feeds every ordered pair of objects (plus the
    question vector) through the ``g_theta`` MLP, sums the results over all
    pairs and maps the sum to log-probabilities over answers with ``f_phi``.

    All sizes are read from the module-level ``cfg.TRAIN`` settings.
    """

    def __init__(self):
        super(RelationalNetwork, self).__init__()

        # Define the parameters for the RN network
        self.conv_layer_channels = [24, 24, 24, 24]  # Can be substituted by some other file
        self.in_dim = cfg.TRAIN.IMG_DIM  # Working only on CLEVR
        self.g_theta_units = [256, 256, 256, 256]  # Can be substituted as well
        self.question_vector_size = cfg.TRAIN.QUESTION_VECTOR_SIZE
        self.embedding_dim = cfg.TRAIN.EMBEDDING_DIM
        self.vocab_size = cfg.TRAIN.VOCAB_SIZE
        self.answer_size = cfg.TRAIN.ANSWER_SIZE
        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.use_cuda = cfg.TRAIN.USE_CUDA  # Can be set using args and thus can be substituted
        self.rnn_type = cfg.TRAIN.RNN_TYPE
        self.n_layers = 1

        # Define the word embedding for the input questions
        self.question_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=0)

        # Define the lstm to process the questions
        self.lstm = nn.LSTM(self.embedding_dim, self.question_vector_size, num_layers=1)

        # Initialize the hidden state of the lstm
        # TODO: Check different initializations of the hidden state, currently let them default to zero
        # self.hidden = self.init_hidden()

        # Define the other layers of the relational network
        self.convolutional_layer()
        self.g_theta_layer()
        self.f_phi_layer()

    def init_hidden(self, x=None):
        """Return an ``(h, c)`` state pair for the LSTM.

        With ``x`` omitted, both states are zero tensors sized for a quarter of
        the configured batch (the author's 4-GPU split). Otherwise ``x`` is an
        existing ``(h, c)`` pair and its data is re-wrapped so the new state is
        detached from the previous graph.

        Returns ``None`` when ``rnn_type`` is not ``'lstm'``.

        NOTE(review): the zero states are created on the CPU and never moved to
        the GPU; confirm before wiring this method back into ``forward``.
        """
        if self.rnn_type == 'lstm':

            # As I am using 4 GPUs
            if x is None:
                # Floor division: tensor sizes must be ints (true division
                # yields a float on Python 3).
                per_gpu_batch = self.batch_size // 4
                return (Variable(torch.zeros(self.n_layers, per_gpu_batch, self.question_vector_size)),
                        Variable(torch.zeros(self.n_layers, per_gpu_batch, self.question_vector_size)))
            else:
                return (Variable(x[0].data), Variable(x[1].data))  # TODO: Problem might be here

    def convolutional_layer(self):
        """Create the four 3x3, stride-2, padding-1 conv layers and their batch norms."""
        self.conv1 = nn.Conv2d(self.in_dim, self.conv_layer_channels[0], 3, stride=2, padding=1)
        self.bn1 = nn.BatchNorm2d(self.conv_layer_channels[0])
        self.conv2 = nn.Conv2d(self.conv_layer_channels[0], self.conv_layer_channels[1], 3, stride=2, padding=1)
        self.bn2 = nn.BatchNorm2d(self.conv_layer_channels[1])
        self.conv3 = nn.Conv2d(self.conv_layer_channels[1], self.conv_layer_channels[2], 3, stride=2, padding=1)
        self.bn3 = nn.BatchNorm2d(self.conv_layer_channels[2])
        self.conv4 = nn.Conv2d(self.conv_layer_channels[2], self.conv_layer_channels[3], 3, stride=2, padding=1)
        self.bn4 = nn.BatchNorm2d(self.conv_layer_channels[3])

    def g_theta_layer(self):
        """Create the four ``g_theta`` linear layers and the object-coordinate tensor."""
        # Input width: two objects (channels + 2 coordinates each) plus the question vector.
        self.g_fc1 = nn.Linear((self.conv_layer_channels[3] + 2) * 2 + self.question_vector_size, 256)
        self.g_fc2 = nn.Linear(256, 256)
        self.g_fc3 = nn.Linear(256, 256)
        self.g_fc4 = nn.Linear(256, 256)

        # NOTE(review): coord_oi / coord_oj are allocated uninitialised and are
        # not read anywhere in this class; kept so existing attribute access
        # keeps working.
        self.coord_oi = torch.FloatTensor(self.batch_size, 2)
        self.coord_oj = torch.FloatTensor(self.batch_size, 2)
        if self.use_cuda:
            self.coord_oi = self.coord_oi.cuda()
            self.coord_oj = self.coord_oj.cuda()
        self.coord_oi = Variable(self.coord_oi)
        self.coord_oj = Variable(self.coord_oj)

        # For preparing the coord tensor, use the '1' dim as 64 because the size of the conv_feature_map
        # is [BS x 24 x 8 x 8] thus forming a 64 object feature map for each image of the mini-batch.
        # Floor division keeps the size an int on Python 3 as well.
        per_gpu_batch = self.batch_size // 4
        self.coord_tensor = torch.FloatTensor(per_gpu_batch, 64, 2)
        if self.use_cuda:
            self.coord_tensor = self.coord_tensor.cuda()
        self.coord_tensor = Variable(self.coord_tensor)
        np_coord_tensor = np.zeros((per_gpu_batch, 64, 2))
        for obj in range(64):
            # Every batch row receives the same coordinate pair for this object.
            np_coord_tensor[:, obj, :] = np.array(self.cvt_coord(obj))
        self.coord_tensor.data.copy_(torch.from_numpy(np_coord_tensor))

    # Changing this based on the size of the convolution feature map
    def cvt_coord(self, i):
        """Map a flat object index on the 8-wide grid to ``[row, col]`` coordinates.

        Row and column indices are shifted by 2 and halved. Floor division is
        used for the row so the result is identical on Python 2 and Python 3.
        """
        ret_list = [(i // 8 - 2) / 2.0, (i % 8 - 2) / 2.0]
        return ret_list

    def f_phi_layer(self):
        """Create the three ``f_phi`` linear layers; the last one outputs ``answer_size`` scores."""
        self.f_fc1 = nn.Linear(256, 256)
        self.f_fc2 = nn.Linear(256, 256)
        self.f_fc3 = nn.Linear(256, self.answer_size)

    def apply_convolution(self, x):
        """Run ``x`` through the four conv -> batch-norm -> ReLU stages."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        return x

    def apply_g_theta(self, conv_feature_map, question_vector):
        """Apply ``g_theta`` to every ordered object pair and sum over the pairs.

        Args:
            conv_feature_map: output of ``apply_convolution``; read as
                (batch, channels, d, d) with a square spatial grid.
            question_vector: one question encoding per batch row
                (presumably (batch, question_vector_size) -- TODO confirm).

        Returns:
            A (batch, 256) tensor: the ``g_theta`` outputs summed over all
            ``d*d * d*d`` object pairs.
        """
        x = conv_feature_map

        # The code below is adopted from:
        # https://github.com/kimhc6028/relational-networks
        # Instead of using for loops, accessing the objects for g_theta in a vectorized manner.
        # Read the sizes from the input itself rather than assuming
        # batch_size / 4, so any per-replica batch size works.
        mb = x.size(0)
        num_channels = x.size(1)
        d = x.size(2)
        num_objects = d * d

        # Create x_flat: one row of channel features per object
        x_flat = x.view(mb, num_channels, num_objects).permute(0, 2, 1)

        # add coordinates
        # All batch rows of coord_tensor are identical, so take the first row
        # and broadcast it to the actual batch size.
        # NOTE(review): coord_tensor is a plain attribute, so under
        # nn.DataParallel every replica presumably shares the single copy on
        # one device; copy it to the input's device before concatenating.
        coords = self.coord_tensor[:1]
        if x.is_cuda:
            coords = coords.cuda(x.get_device())
        coords = coords.expand(mb, num_objects, 2)
        x_flat = torch.cat([x_flat, coords], 2)

        # add questions everywhere
        question_vector = torch.unsqueeze(question_vector, 1)
        question_vector = question_vector.repeat(1, num_objects, 1)
        question_vector = torch.unsqueeze(question_vector, 2)

        # cast pairs against each other
        x_i = torch.unsqueeze(x_flat, 1)
        x_i = x_i.repeat(1, num_objects, 1, 1)
        x_j = torch.unsqueeze(x_flat, 2)
        x_j = torch.cat([x_j, question_vector], 3)
        x_j = x_j.repeat(1, 1, num_objects, 1)

        # concatenate everything to create x_full
        x_full = torch.cat([x_i, x_j], 3)

        # reshape for the network; the pair-feature width is taken from the
        # tensor instead of the hard-coded 26 + 26 + 128.
        x_ = x_full.view(mb * num_objects * num_objects, x_full.size(3))

        x_ = F.relu(self.g_fc1(x_))
        x_ = F.relu(self.g_fc2(x_))
        x_ = F.relu(self.g_fc3(x_))
        x_ = F.relu(self.g_fc4(x_))

        # reshape and sum for the f_phi network
        x_g = x_.view(mb, num_objects * num_objects, x_.size(1))
        # view() instead of squeeze() so a batch of one keeps its batch axis.
        x_g = x_g.sum(1).view(mb, -1)
        return x_g

    def apply_f_phi(self, x_g):
        """Map the summed relation vector to log-probabilities over answers."""
        x_f = F.relu(self.f_fc1(x_g))
        # Tie dropout to the module's mode: the functional form does not know
        # about model.train() / model.eval() unless it is told explicitly.
        x_f = F.dropout(F.relu(self.f_fc2(x_f)), training=self.training)
        x_f = self.f_fc3(x_f)
        # NOTE(review): no explicit dim is passed; newer PyTorch releases warn
        # about the implicit choice -- pass dim=1 once the minimum supported
        # version allows it.
        f_phi_out = F.log_softmax(x_f)
        return f_phi_out

    def forward(self, image, question_vector):
        """Return answer log-probabilities for a batch of images and questions.

        Args:
            image: image batch fed to the first convolution.
            question_vector: integer token ids; after embedding the first two
                axes are swapped, so this is presumably (batch, seq_len) --
                TODO confirm against the data loader.
        """
        question_vector = self.question_embeddings(question_vector)
        question_vector = question_vector.permute(1, 0, 2)

        # Pass the question vector through the lstm to get the final state vector out
        # Compact the LSTM weights once before use; a second call after the
        # forward pass does no useful work.
        self.lstm.flatten_parameters()
        out_question_vector, _ = self.lstm(question_vector)
        # Output at the last time step is used as the question encoding.
        out_question_vector = out_question_vector[-1]

        conv_feature_map = self.apply_convolution(image)
        g_theta_output = self.apply_g_theta(conv_feature_map=conv_feature_map, question_vector=out_question_vector)
        f_phi_out = self.apply_f_phi(g_theta_output)

        return f_phi_out

This is without re-initializing the hidden state. I tried that but received an illegal memory access error, which I think is caused by the else case in the init_hidden method.

My training file is as below:


# Build the network, wrap it for multi-GPU execution when more than one
# device is visible, then move it to the GPU if requested on the command line.
model = RelationalNetwork()

gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("Using ", gpu_count, " GPUs")
    model = nn.DataParallel(model)

model = model.cuda() if args.cuda else model

# Adam over all model parameters; the learning rate comes from the config.
# The model already emits log-probabilities, so no separate criterion object
# (e.g. nn.CrossEntropyLoss) is created here.
optimizer = optim.Adam(model.parameters(), lr=cfg.TRAIN.LEARNING_RATE)

# Define the train step
def train(updates_per_epoch):
    """Run one training epoch and return ``(mean_loss, mean_accuracy)``.

    Args:
        updates_per_epoch: number of mini-batches to draw from
            ``train_dataset`` and optimise on.

    Returns:
        A tuple of the loss and the accuracy (in percent), each averaged over
        the ``updates_per_epoch`` batches.
    """
    # val() puts the model in eval mode; switch back explicitly so BatchNorm
    # (and dropout) behave as training layers in every epoch, not just the
    # first one.
    model.train()

    train_loss = 0.0
    train_accuracy = 0.0
    for iteration in range(updates_per_epoch):
        pbar.update(iteration)
        images, questions, answers = train_dataset.next_batch(args.batch_size)

        # Convert the input images into tensor Variables
        # The permute moves the last axis to position 1 (presumably
        # channels-last -> channels-first; verify against the dataset).
        images = Variable(torch.from_numpy(images).permute(0, 3, 1, 2).float())

        # Process the questions and answers separately
        questions = Variable(torch.LongTensor(questions))
        answers = Variable(torch.LongTensor(answers)).view(args.batch_size)

        if args.cuda:
            images = images.cuda()
            questions = questions.cuda()
            answers = answers.cuda()

        model.zero_grad()
        answers_hat = model(images, questions)
        # The model returns log-probabilities, hence NLL loss.
        loss = F.nll_loss(answers_hat, answers)
        loss.backward()
        optimizer.step()

        # NOTE(review): indexing loss.data follows the Variable-era API; newer
        # PyTorch releases expect loss.item() here.
        train_loss += loss.data[0]
        # Index of the highest log-probability is the predicted answer.
        pred = answers_hat.data.max(1)[1]
        correct = pred.eq(answers.data).cpu().sum()
        accuracy = correct * 100. / len(answers)
        train_accuracy += accuracy
    train_accuracy = train_accuracy / updates_per_epoch
    train_loss = train_loss / updates_per_epoch
    return train_loss, train_accuracy

# Define the validation step, currently doing it for 10 random batches
def val(num_batches=10):
    """Return the mean accuracy (in percent) over ``num_batches`` validation batches.

    The model is evaluated in eval mode and its previous train/eval mode is
    restored before returning, so calling this between training epochs does
    not leave the model stuck in inference mode.

    Args:
        num_batches: number of batches to draw from ``val_dataset``
            (defaults to 10).

    Raises:
        ValueError: if ``num_batches`` is less than 1.
    """
    if num_batches < 1:
        raise ValueError("num_batches must be at least 1")

    was_training = model.training
    model.eval()
    val_accuracy = 0.0
    try:
        for _ in range(num_batches):
            val_images, val_questions, val_answers = val_dataset.next_batch(args.batch_size)

            # Convert the validation images into tensor Variables
            val_images = Variable(torch.from_numpy(val_images).permute(0, 3, 1, 2).float())

            # Process the questions and the answers
            val_questions = Variable(torch.LongTensor(val_questions))
            val_answers = Variable(torch.LongTensor(val_answers)).view(args.batch_size)

            if args.cuda:
                val_images = val_images.cuda()
                val_questions = val_questions.cuda()
                val_answers = val_answers.cuda()

            # NOTE(review): gradients are still tracked here; consider
            # disabling them for validation with whatever mechanism the
            # installed PyTorch version provides.
            val_answers_hat = model(val_images, val_questions)
            val_pred = val_answers_hat.data.max(1)[1]
            val_correct = val_pred.eq(val_answers.data).cpu().sum()
            accuracy = val_correct * 100.0 / len(val_answers)
            val_accuracy += accuracy
    finally:
        # Put the model back in whatever mode the caller had it in.
        model.train(was_training)
    val_accuracy = val_accuracy / float(num_batches)
    return val_accuracy


# Progress-bar bookkeeping: one epoch is as many updates as there are full
# batches in the training set.
number_examples = train_dataset._num_examples  # See this value and if this works.
updates_per_epoch = number_examples // cfg.TRAIN.BATCH_SIZE

# Load the answer -> index mapping and prepare the containers used when
# one-hot encoding answers.
with open('./data/clver_rn/answer_to_ix.json', 'r') as answer_file:
    answer_to_ix = json.load(answer_file)

one_hot_init_vector = [0] * len(answer_to_ix)
answer_to_one_hot = {}

# Main loop: one training pass and one validation pass per epoch, followed by
# a single summary line on stdout.
for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()

    # train() reports its progress through the module-level `pbar`.
    widgets = ['epoch #%d|' % epoch, Percentage(), Bar(), ETA()]
    pbar = ProgressBar(maxval=updates_per_epoch, widgets=widgets)
    pbar.start()

    epoch_loss, epoch_accuracy = train(updates_per_epoch)
    val_accuracy = val()

    # The reported time covers both the training and the validation pass.
    epoch_end_time = time.time()
    time_taken = epoch_end_time - epoch_start_time

    log_line_train = '%s: %s; %s: %s; ' % ("Training Loss", epoch_loss, "Training Accuracy", epoch_accuracy)
    log_line_val = '%s: %s ' % ("Validation Accuracy", val_accuracy)
    log_time_line = '%s: %s' % ("Time taken for the current epoch", time_taken)
    summary = "Epoch %d | " % (epoch) + log_line_train + log_line_val + log_time_line
    print(summary)
    sys.stdout.flush()

Any advice to solve this issue is appreciated.

I ran my code on another GPU server and each epoch now consistently takes around 680 seconds which is significantly less than the previous run. I guess there is something wrong with my environment.

Any suggestions on what might be the possible issues or how to make the above implementation faster are welcome.