I have been trying to implement the network described in "A simple neural network module for relational reasoning" for the CLEVR dataset, and I am facing the following issue:
My training accuracy keeps increasing over epochs, but my validation accuracy gets stuck at ~50% after the 10th epoch. I have tried removing the data augmentation, based on a suggestion by Kim Heecheol, whose implementation of the same network for the Sort-of-CLEVR dataset can be found here.
I am still a novice learning how to implement such networks, so please let me know if there are any issues on my end. I have tried to follow the paper as closely as possible.
My implementation can be found here. A snippet of my module is as follows:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
# cfg is my project's configuration object (see the repo for the actual import)
class RelationalNetwork(nn.Module):
def __init__(self):
super(RelationalNetwork, self).__init__()
# Define the parameters for the RN network
        self.conv_layer_channels = [24, 24, 24, 24]  # Could be read from a config file instead
        self.in_dim = cfg.TRAIN.IMG_DIM  # Number of input image channels; working only on CLEVR
        self.g_theta_units = [256, 256, 256, 256]  # Could be made configurable as well
self.question_vector_size = cfg.TRAIN.QUESTION_VECTOR_SIZE
self.embedding_dim = cfg.TRAIN.EMBEDDING_DIM
self.vocab_size = cfg.TRAIN.VOCAB_SIZE
self.answer_size = cfg.TRAIN.ANSWER_SIZE
self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.use_cuda = cfg.TRAIN.USE_CUDA  # Could be set through command-line args instead
self.rnn_type = cfg.TRAIN.RNN_TYPE
self.n_layers = 1
# Define the word embedding for the input questions
self.question_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=0)
# Define the lstm to process the questions
self.lstm = nn.LSTM(self.embedding_dim, self.question_vector_size, num_layers=1)
# Initialize the hidden state of the lstm
# TODO: Check different initializations of the hidden state, currently let them default to zero
# self.hidden = self.init_hidden()
# Define the other layers of the relational network
self.convolutional_layer()
self.g_theta_layer()
self.f_phi_layer()
    def init_hidden(self, x=None):
        if self.rnn_type == 'lstm':
            # Divide by 4 because I am training on 4 GPUs, so each replica sees a
            # quarter of the mini-batch (integer division so this works in Python 3)
            if x is None:
                return (Variable(torch.zeros(self.n_layers, self.batch_size // 4, self.question_vector_size)),
                        Variable(torch.zeros(self.n_layers, self.batch_size // 4, self.question_vector_size)))
            else:
                return (Variable(x[0].data), Variable(x[1].data))  # TODO: Problem might be here
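    # Note: forward() below calls self.lstm without an explicit hidden state, so
    # the LSTM defaults to a zero state and init_hidden() is currently unused.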
def convolutional_layer(self):
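        # Per the paper, images are processed by four 3x3 convolutional layers with
        # 24 kernels each, stride 2, each followed by ReLU and batch normalization.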
self.conv1 = nn.Conv2d(self.in_dim, self.conv_layer_channels[0], 3, stride=2, padding=1)
self.bn1 = nn.BatchNorm2d(self.conv_layer_channels[0])
self.conv2 = nn.Conv2d(self.conv_layer_channels[0], self.conv_layer_channels[1], 3, stride=2, padding=1)
self.bn2 = nn.BatchNorm2d(self.conv_layer_channels[1])
self.conv3 = nn.Conv2d(self.conv_layer_channels[1], self.conv_layer_channels[2], 3, stride=2, padding=1)
self.bn3 = nn.BatchNorm2d(self.conv_layer_channels[2])
self.conv4 = nn.Conv2d(self.conv_layer_channels[2], self.conv_layer_channels[3], 3, stride=2, padding=1)
self.bn4 = nn.BatchNorm2d(self.conv_layer_channels[3])
def g_theta_layer(self):
self.g_fc1 = nn.Linear((self.conv_layer_channels[3] + 2) * 2 + self.question_vector_size, 256)
self.g_fc2 = nn.Linear(256, 256)
self.g_fc3 = nn.Linear(256, 256)
self.g_fc4 = nn.Linear(256, 256)
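        # coord_oi / coord_oj below are carried over from the reference
        # implementation but are not used anywhere in this module at the moment.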
self.coord_oi = torch.FloatTensor(self.batch_size, 2)
self.coord_oj = torch.FloatTensor(self.batch_size, 2)
if self.use_cuda:
self.coord_oi = self.coord_oi.cuda()
self.coord_oj = self.coord_oj.cuda()
self.coord_oi = Variable(self.coord_oi)
self.coord_oj = Variable(self.coord_oj)
        # The second dim of the coord tensor is 64 because the conv feature map has
        # size [BS x 24 x 8 x 8], i.e. 64 object slots per image of the mini-batch.
        self.coord_tensor = torch.FloatTensor(self.batch_size // 4, 64, 2)
if self.use_cuda:
self.coord_tensor = self.coord_tensor.cuda()
self.coord_tensor = Variable(self.coord_tensor)
        np_coord_tensor = np.zeros((self.batch_size // 4, 64, 2))
for obj in range(64):
np_coord_tensor[:, obj, :] = np.array(self.cvt_coord(obj))
self.coord_tensor.data.copy_(torch.from_numpy(np_coord_tensor))
        # The coord tensor therefore has size (batch_size // 4) x 64 x 2,
        # sized according to the convolutional feature map above.
    def cvt_coord(self, i):
        # Map object index i in [0, 64) on the 8x8 feature map to an (x, y) pair
        ret_list = [(i // 8 - 2) / 2.0, (i % 8 - 2) / 2.0]
        return ret_list
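    # TODO: the (x - 2) / 2.0 normalization in cvt_coord comes from the 5x5
    # Sort-of-CLEVR grid; on this 8x8 map it gives coordinates in [-1.0, 2.5],
    # so something like (i // 8 - 3.5) / 3.5 would center them instead.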
def f_phi_layer(self):
self.f_fc1 = nn.Linear(256, 256)
self.f_fc2 = nn.Linear(256, 256)
self.f_fc3 = nn.Linear(256, self.answer_size)
def apply_convolution(self, x):
x = F.relu(self.bn1(self.conv1(x)))
x = F.relu(self.bn2(self.conv2(x)))
x = F.relu(self.bn3(self.conv3(x)))
x = F.relu(self.bn4(self.conv4(x)))
return x
def apply_g_theta(self, conv_feature_map, question_vector):
x = conv_feature_map
        # The code below is adapted from:
        # https://github.com/kimhc6028/relational-networks
        # Instead of using for loops, the objects for g_theta are accessed in a vectorized manner.
        mb = self.batch_size // 4  # per-GPU mini-batch size, as I am training on 4 GPUs
num_channels = self.conv_layer_channels[-1]
d = x.size()[2]
# Create x_flat
x_flat = x.view(mb, num_channels, d*d).permute(0, 2, 1)
# add coordinates
x_flat = torch.cat([x_flat, self.coord_tensor], 2)
# add questions everywhere
question_vector = torch.unsqueeze(question_vector, 1)
question_vector = question_vector.repeat(1, 64, 1)
question_vector = torch.unsqueeze(question_vector, 2)
# cast pairs against each other
x_i = torch.unsqueeze(x_flat, 1)
x_i = x_i.repeat(1, 64, 1, 1)
x_j = torch.unsqueeze(x_flat, 2)
x_j = torch.cat([x_j, question_vector], 3)
x_j = x_j.repeat(1, 1, 64, 1)
# concatenate everything to create x_full
x_full = torch.cat([x_i, x_j], 3)
# reshape for the network
x_ = x_full.view(mb*d*d*d*d, 26+26+128)
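        # 26+26+128: each object is 24 conv channels + 2 coordinates = 26 features,
        # both objects of a pair contribute 26, and the question vector (128-d in
        # my config) is appended to x_j; this must match the in_features of g_fc1.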
x_ = F.relu(self.g_fc1(x_))
x_ = F.relu(self.g_fc2(x_))
x_ = F.relu(self.g_fc3(x_))
x_ = F.relu(self.g_fc4(x_))
# reshape and sum for the f_phi network
x_g = x_.view(mb, d*d*d*d, 256)
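        # Summing over the d*d*d*d pair dimension below implements the sum over
        # all object pairs of g_theta(o_i, o_j, q) from the paper, which f_phi then processes.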
x_g = x_g.sum(1).squeeze()
return x_g
def apply_f_phi(self, x_g):
x_f = F.relu(self.f_fc1(x_g))
        # Pass self.training so dropout is active only during training; the
        # functional F.dropout otherwise defaults to training=True even in eval mode.
        x_f = F.dropout(F.relu(self.f_fc2(x_f)), training=self.training)
        x_f = self.f_fc3(x_f)
        f_phi_out = F.log_softmax(x_f, dim=1)
return f_phi_out
def forward(self, image, question_vector):
question_vector = self.question_embeddings(question_vector)
question_vector = question_vector.permute(1, 0, 2)
# Pass the question vector through the lstm to get the final state vector out
self.lstm.flatten_parameters()
out_question_vector, out_hidden = self.lstm(question_vector)
out_question_vector = out_question_vector[-1]
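        # Note: taking the output at the last time step assumes all questions in the
        # batch share the same length; with zero-padded questions the LSTM also runs
        # over the padding, so pack_padded_sequence may be needed here.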
conv_feature_map = self.apply_convolution(image)
g_theta_output = self.apply_g_theta(conv_feature_map=conv_feature_map, question_vector=out_question_vector)
f_phi_out = self.apply_f_phi(g_theta_output)
return f_phi_out
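
For reference, here is the kind of shape sanity check I run on the module. The config values are placeholders standing in for my actual cfg (3-channel 128x128 CLEVR images, which the four stride-2 convolutions reduce to the 8x8 feature map assumed above, and 128-d question vectors), so treat it as a sketch rather than my exact setup:

from types import SimpleNamespace
import torch
from torch.autograd import Variable

# Placeholder config mirroring the cfg.TRAIN.* fields the module reads
cfg = SimpleNamespace(TRAIN=SimpleNamespace(
    IMG_DIM=3, QUESTION_VECTOR_SIZE=128, EMBEDDING_DIM=32, VOCAB_SIZE=90,
    ANSWER_SIZE=28, BATCH_SIZE=64, USE_CUDA=False, RNN_TYPE='lstm'))

model = RelationalNetwork()
model.eval()  # disables dropout via F.dropout(..., training=self.training)

# One per-GPU slice of the mini-batch: BATCH_SIZE // 4 = 16 samples
image = Variable(torch.randn(16, 3, 128, 128))
question = Variable(torch.ones(16, 45).long())  # (batch, seq_len) token ids

out = model(image, question)
print(out.size())  # expect (16, ANSWER_SIZE) == (16, 28)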
Feel free to comment on this.