Why is my code still not deterministic?

Hi, I dealt with those issues before, and all my CNNs gave me deterministic results; now that I’m using RNNs I’m getting non-deterministic results again.
First, all the models are running on the same GPU.
This is my code (in short):

if __name__ == "__main__":
    args = parser.parse_args()
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    reviews = pd.read_csv("Reviews.csv")
    reviews['review_length'] = reviews['review'].apply(lambda x: len(x.split()))
    tok = spacy.load('en_core_web_sm')

   ....some data manipulation....
    X = list(reviews['encoded'])
    y = list(reviews['rating'])
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=args.seed)#X[:haluka],X[haluka:],y[:haluka],y[haluka:]#
    train_ds = ReviewsDataset(X_train, y_train)
    valid_ds = ReviewsDataset(X_valid, y_valid)

    train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
    val_dl = DataLoader(valid_ds, batch_size=args.batch_size)
    model_fixed = LSTM_fixed_len(vocab_size, 50, 50)

    train_model(model_fixed.cuda(), epochs=args.epochs, lr=0.01)

One of my thoughts was that “train_test_split” might be causing it, but even with a straightforward split I got non-deterministic results.

I’ll add the functions:

class ReviewsDataset(Dataset):
    """Map-style dataset pairing encoded reviews with their labels.

    Each entry of ``X`` is read as ``X[idx][0]`` (a NumPy array, converted to
    an int32 tensor) and ``X[idx][1]`` (returned unchanged). Each entry of
    ``Y`` is returned unchanged as the label.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # The label sequence defines how many samples there are.
        return len(self.y)

    def __getitem__(self, idx):
        sample = self.X[idx]
        token_ids = torch.from_numpy(sample[0].astype(np.int32))
        return token_ids, self.y[idx], sample[1]

def tokenize(text):
    """Split ``text`` into lower-cased token strings.

    Runs of non-ASCII characters are replaced by a space, then every
    punctuation character, digit, carriage return, tab and newline is
    replaced by a space. The cleaned, lower-cased string is handed to the
    module-level ``tok.tokenizer`` and the ``.text`` of each resulting token
    is collected.
    """
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", text)
    # remove punctuation and numbers
    strip_pattern = '[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]'
    cleaned = re.sub(strip_pattern, " ", ascii_only.lower())
    tokens = []
    for token in tok.tokenizer(cleaned):
        tokens.append(token.text)
    return tokens
def encode_sentence(text, vocab2index, N=70):
    """Encode ``text`` as a fixed-length array of vocabulary indices.

    The text is tokenized with ``tokenize``; each token is mapped through
    ``vocab2index``, falling back to the ``"UNK"`` entry for unknown words.
    The result is truncated to ``N`` entries or right-padded with zeros.

    Returns:
        A tuple ``(encoded, length)``: the integer array of size ``N`` and
        the number of leading positions that hold real token indices.
    """
    words = tokenize(text)
    padded = np.zeros(N, dtype=int)
    ids = []
    for word in words:
        ids.append(vocab2index.get(word, vocab2index["UNK"]))
    token_ids = np.array(ids)
    length = min(N, len(token_ids))
    # Copy only as many indices as fit; the remainder stays zero.
    padded[:length] = token_ids[:length]
    return padded, length

def train_model(model, epochs, lr=0.001):
    """Train ``model`` with Adam and track the best validation accuracy.

    Reads the module-level ``train_dl`` and ``val_dl`` loaders. Whenever an
    epoch's validation accuracy exceeds the module-level ``best_acc``, it
    updates ``best_acc``, ``best_histo`` and ``best_epoch``.

    Args:
        model: module called as ``model(x, l)`` that returns class logits.
        epochs: number of passes over ``train_dl``.
        lr: learning rate passed to Adam.
    """
    global best_acc, best_histo, best_epoch
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # Move batches to wherever the model lives instead of assuming CUDA.
    # Adam has already rejected an empty parameter list, so next() is safe.
    device = next(model.parameters()).device
    for i in range(epochs):
        # Validation may leave the model in eval mode; switch back each epoch.
        model.train()
        # Running totals for an epoch-average loss (kept for logging).
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long().to(device)
            y = y.long().to(device)
            l = l.to(device)
            y_pred = model(x, l)
            loss = F.cross_entropy(y_pred, y)
            # The actual optimisation step: without these three calls the
            # weights never change and the loop only measures the loss.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item() * y.shape[0]
            total += y.shape[0]
        # validation_metrics returns (loss, accuracy) and possibly a third
        # histogram value; accept both shapes instead of failing to unpack.
        val_loss, val_acc, *extra = validation_metrics(model, val_dl)
        histogram = extra[0] if extra else None
        if val_acc > best_acc:
            best_acc = val_acc
            best_histo = histogram
            best_epoch = i

def validation_metrics(model, valid_dl):
    """Compute the mean cross-entropy loss and accuracy of ``model``.

    The model is evaluated in eval mode with gradient tracking disabled, and
    its previous train/eval mode is restored before returning.

    Args:
        model: module called as ``model(x, l)`` that returns class logits.
        valid_dl: iterable of ``(x, y, l)`` batches of tensors.

    Returns:
        A tuple ``(mean_loss, accuracy)``: the sample-weighted mean loss as a
        float and the fraction of correct predictions as a 0-dim tensor.

    Raises:
        ZeroDivisionError: if ``valid_dl`` yields no samples.
    """
    # Evaluate on the device the model lives on; fall back to CUDA (the
    # previous hard-coded behaviour) for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")
    correct = 0
    total = 0
    sum_loss = 0.0
    was_training = model.training
    model.eval()
    try:
        # No autograd graph is needed for evaluation.
        with torch.no_grad():
            for x, y, l in valid_dl:
                x = x.long().to(device)
                y = y.long().to(device)
                l = l.to(device)
                y_hat = model(x, l)
                loss = F.cross_entropy(y_hat, y)
                pred = torch.max(y_hat, 1)[1]
                correct += (pred == y).float().sum()
                total += y.shape[0]
                sum_loss += loss.item() * y.shape[0]
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return sum_loss / total, correct / total

class LSTM_fixed_len(torch.nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier.

    The classifier is applied to the LSTM's final hidden state. Sequences are
    processed at their full padded length; the ``l`` argument of ``forward``
    is accepted but not used.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden-state size.
        num_classes: number of output logits (defaults to 5).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=5):
        # nn.Module must be initialised before submodules are assigned;
        # otherwise the first assignment below raises AttributeError.
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, l):
        """Return class logits for a batch of token-index sequences ``x``."""
        x = self.embeddings(x)
        _, (ht, _) = self.lstm(x)
        # ht[-1] is the final hidden state of the last LSTM layer.
        return self.linear(ht[-1])

I tried to make the code as short as possible, but I wanted to show most of it since I don’t know what can make things non-deterministic. I’m an NLP newbie, so maybe it is something in the tokenizing process, or something in the LSTM/embedding layers?

I’ll be happy for your help, if you got any clue.
two comments:

  1. Of course, the different results come from runs with the same seed.
  2. Just wanted to clarify that the code is not mine, and I’m not claiming it to be.

Could you just use your nn.LSTM module and check if the outputs are deterministic or not?
If not, which CUDA and cudnn version are you using?

Ok, I did:

class LSTM_fixed_len(torch.nn.Module):
    """Minimal wrapper around a single ``nn.LSTM(3, 3)`` layer.

    ``forward`` returns the final hidden state of the LSTM; its second
    argument ``l`` is accepted but not used.
    """

    def __init__(self):
        # nn.Module must be initialised before a submodule is assigned;
        # otherwise the assignment below raises AttributeError.
        super().__init__()
        self.lstm = nn.LSTM(3, 3, batch_first=True)

    def forward(self, x, l):
        _, (ht, _) = self.lstm(x)
        return ht[-1]

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Fix the seed so the random inputs and the LSTM's initial weights are the
# same on every run; otherwise outputs of separate runs cannot be compared.
torch.manual_seed(0)
# Run on the GPU when one is present: the cuDNN LSTM kernels are only used
# for CUDA tensors, so a CPU-only run says nothing about cuDNN determinism.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vec = torch.normal(torch.zeros(2, 2, 3), torch.ones(2, 2, 3)).to(device)
vec1 = torch.normal(torch.zeros(2, 2, 3), torch.ones(2, 2, 3)).to(device)

model = LSTM_fixed_len().to(device)
out = model(vec, vec1)

And got deterministic results.

What is our next step?