Difficulty Replicating Simple Binary Classification TensorFlow Results in PyTorch

Hello,

I am trying to train a model in PyTorch that I have already trained successfully in TensorFlow. In PyTorch, however, the model only achieves random accuracy (it is a binary classification task), while in TensorFlow it reaches 100% (the task is easy). I’ve been working on this for over a week now and cannot figure out what the problem is.

The task is simple: given an input sentence that describes a coordinate location and an X coordinate, does the coordinate match what the sentence describes? For example, if the sentence is “X coordinate is positive”, the model should predict positive if the input X coordinate is > 0, and negative otherwise. Likewise, if the sentence is “X coordinate is negative”, the model should predict positive if the input X coordinate is < 0, and negative otherwise.

To simulate this for debugging, I created a very simple setup with two sentences, tokenized as [1, 5, 6, 8] and [1, 5, 6, 9], and x coordinates that are either -0.5 or 0.5. If the sentence is [1, 5, 6, 8], then x should be 0.5 for a positive label; otherwise the label is negative. The reverse applies for [1, 5, 6, 9].
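
Put differently, the labeling rule the model has to learn boils down to this (just a restatement of the data-generation code further down, not part of the model):

def label(sentence, x):
    # [1, 5, 6, 8] encodes "X coordinate is positive",
    # [1, 5, 6, 9] encodes "X coordinate is negative"
    if sentence == [1, 5, 6, 8]:
        return 1.0 if x > 0 else 0.0
    else:
        return 1.0 if x < 0 else 0.0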

The code for this task is as follows. Any help would be greatly appreciated!

import torch
import torch.nn as nn
import numpy as np
import sklearn.metrics as skm
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

n_epochs = 10
input_dim = 4      # tokens per sentence (passed to the model but not used directly)
hidden_dim = 128
layer_dim = 2      # number of stacked LSTM layers
output_dim = 2
batch_size = 50

class FeatureDataSet(torch.utils.data.Dataset):

    def __init__(self, x_train, y_train, x_coordinates):
        self.x_train = torch.tensor(x_train, dtype=torch.long)
        self.y_train = torch.tensor(y_train, dtype=torch.long)  # class indices for CrossEntropyLoss
        self.x_coordinates = torch.tensor(x_coordinates, dtype=torch.float32)
    def __len__(self):
        return len(self.y_train)
    def __getitem__(self, idx):
        return self.x_train[idx], self.y_train[idx], self.x_coordinates[idx]


class RNN(nn.Module):

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, batch_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # note: no per-layer .cuda() calls here; the whole model is moved with .to(device) below
        # linear layer to encode the scalar coordinate
        self.encode_x = nn.Linear(1, hidden_dim)
        # vocabulary of 40 token ids, embedding dimension 100
        self._embeddings = nn.Embedding(40, 100)

        # hidden_dim is 128, layer_dim is 2 (stacked LSTM layers)
        self.lstm = nn.LSTM(100, hidden_dim, layer_dim, batch_first=True)
        # input: last LSTM output concatenated with the coordinate encoding
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        self.batch_size = batch_size

    def init_hidden(self, x):
        # create the initial hidden and cell states on the same device as the input
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device)
        return h0, c0

    def forward(self, x, x_coordinate):
        # initialize the hidden states
        h0, c0 = self.init_hidden(x)
        embeds = self._embeddings(x)
        out, (hn, cn) = self.lstm(embeds, (h0, c0))

        # encode the coordinate and concatenate it with the last LSTM output
        x_embed = F.relu(self.encode_x(x_coordinate))
        representation = torch.cat([out[:, -1, :], x_embed], dim=1)

        out = self.fc(representation)
        return out

    

# move the whole model to the device once, instead of scattering .cuda() calls
model = RNN(input_dim, hidden_dim, layer_dim, output_dim, batch_size).to(device)
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=0.001)


x_train = []
x_coordinates = []
y_train = []
for i in range(10000):
    # create the data: if the sentence is [1, 5, 6, 8] ("x is positive"), the label is
    # positive when x_coordinate == 0.5 and negative when x_coordinate == -0.5;
    # for [1, 5, 6, 9] ("x is negative") it is the other way around
    if np.random.randint(0, 2) == 0:
        if np.random.randint(0, 2) == 0:
            # "x is positive" sentence, x == 0.5 -> positive
            x_train.append([1, 5, 6, 8])
            x_coordinates.append([0.5])
            y_train.append(1.0)
        else:
            # "x is positive" sentence, x == -0.5 -> negative
            x_train.append([1, 5, 6, 8])
            x_coordinates.append([-0.5])
            y_train.append(0.0)
    else:
        if np.random.randint(0, 2) == 0:
            # "x is negative" sentence, x == -0.5 -> positive
            x_train.append([1, 5, 6, 9])
            x_coordinates.append([-0.5])
            y_train.append(1.0)
        else:
            # "x is negative" sentence, x == 0.5 -> negative
            x_train.append([1, 5, 6, 9])
            x_coordinates.append([0.5])
            y_train.append(0.0)

# print a sample of data 
print(x_train[:10])
print(y_train[:10])
print(x_coordinates[:10])

# create a dataloader
trainingDataset = FeatureDataSet(x_train=x_train, y_train=y_train, x_coordinates=x_coordinates)
train_loader = torch.utils.data.DataLoader(dataset=trainingDataset, batch_size=batch_size, shuffle=True)


# for each epoch
for epoch in range(1, n_epochs + 1):
    acc_all = []
    # each batch
    for i, (x_batch, y_batch, x_coord_batch) in enumerate(train_loader):
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        x_coord_batch = x_coord_batch.to(device)

        opt.zero_grad()
        # pass in the text (x_batch) and coordinate (x_coord_batch)
        out = model(x_batch, x_coordinate=x_coord_batch)
        loss = criterion(out, y_batch)
        loss.backward()
        opt.step()

        # argmax over the logits already gives the predicted class
        # (log_softmax is monotonic, so it is not needed before argmax)
        pred_labels = out.detach().argmax(dim=1).cpu()
        target_labels = y_batch.cpu()

        curr_acc = skm.accuracy_score(target_labels, pred_labels)
        acc_all.append(curr_acc)

    # this stays at about 50% for me, which is effectively random
    print(np.mean(acc_all))

I don’t understand why the performance is random!
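
For comparison, here is a rough Keras sketch of the architecture (reconstructed here to mirror the PyTorch model above, not copied from my original TensorFlow script, so the exact details are approximations):

import tensorflow as tf

# sketch of the TensorFlow side (layer sizes mirror the PyTorch model; details assumed)
sentence_in = tf.keras.Input(shape=(4,), dtype="int32")
coord_in = tf.keras.Input(shape=(1,))

emb = tf.keras.layers.Embedding(40, 100)(sentence_in)
lstm_out = tf.keras.layers.LSTM(128)(emb)  # single LSTM layer in this sketch
coord_enc = tf.keras.layers.Dense(128, activation="relu")(coord_in)

merged = tf.keras.layers.Concatenate()([lstm_out, coord_enc])
logits = tf.keras.layers.Dense(2)(merged)

tf_model = tf.keras.Model([sentence_in, coord_in], logits)
tf_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

Trained with tf_model.fit([np.array(x_train), np.array(x_coordinates)], np.array(y_train), ...), this kind of model gets to 100% accuracy on the task, while the PyTorch version above stays at ~50%.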