Non-deterministic evaluation results with different batch sizes

Hi, colleagues. We have a problem with determinism across batch sizes.
We are evaluating a GRU model, but with different batch sizes we get different results after concatenation. We set the flag CUBLAS_WORKSPACE_CONFIG=:4096:2, but it does not help.
The absolute difference between the two results is stable at 1.3411e-07, but for our task it is important to get identical results regardless of batch size.
Here is a link to the input tensor.
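
One detail worth stating explicitly: CUBLAS_WORKSPACE_CONFIG is read when the CUDA context is created, so it has to be in the environment before torch initializes CUDA, e.g. exported before Python starts or set at the very top of the script. A minimal sketch of that setup (the torch.use_deterministic_algorithms call is an extra safeguard mentioned for completeness, not something the repro below relies on):

import os

# Must be set before torch creates the CUDA context. We use ":4096:2";
# the PyTorch determinism notes recommend ":4096:8" or ":16:8".
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"

import torch
# Optional extra (not in our original script): raise an error whenever an
# op has no deterministic implementation instead of silently falling back.
torch.use_deterministic_algorithms(True)

The full repro script follows: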

import torch
from torch import nn

# Simple GRU model: a GRU encoder followed by a linear head
class GRUModel(nn.Module):
    def __init__(self, data_dim, hidden_size, num_layers, dropout_prob, output_size, device):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.gru = nn.GRU(input_size=data_dim, hidden_size=self.hidden_size, num_layers=self.num_layers,
                          dropout=dropout_prob, batch_first=True)
        self.linear = nn.Linear(self.hidden_size, self.output_size)
        self.device = device

    def forward(self, x, hidden):
        x, hidden = self.gru(x, hidden)
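        # Flatten (batch, seq, hidden) -> (batch, seq*hidden). Note: with a
        # Linear(hidden_size, output_size) head this only matches when seq_len == 1.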
        x = x.contiguous().view(x.size(0), -1)
        x = self.linear(x)
        return x, hidden

    def init_hidden_state(self, batch_size, dtype=torch.float32):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size, dtype=dtype, device=self.device)
        return hidden

# Seeding and cuDNN determinism settings
torch.set_num_threads(1)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device('cuda')
batch_x = torch.load("batch_x0", map_location=device)
batch_x = batch_x.float()
batch_x = batch_x.to(device)  # .to() returns a new tensor; the result must be assigned

model = GRUModel(data_dim=256, hidden_size=2, num_layers=1, dropout_prob=0, output_size=4, device=device)
model.to(device)
model.eval()

all_probs = []

for i, el in enumerate(batch_x):
    hidden_ = model.init_hidden_state(1)
    el = el.unsqueeze(0)
    batch_probs, _ = model(el, hidden_)
    all_probs.append(batch_probs)

all_probs = torch.stack(all_probs).squeeze(1)
batch_probs, hidden = model(batch_x, model.init_hidden_state(batch_x.size(0)))
print(torch.equal(all_probs, batch_probs))
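
For reference, the 1.3411e-07 figure above is the maximum element-wise difference between the two outputs, which is on the order of float32 machine epsilon (about 1.19e-07). A sketch of how the difference is measured:

# Maximum element-wise deviation between per-sample and batched outputs.
diff = (all_probs - batch_probs).abs().max()
print(diff.item())  # ~1.3411e-07 in our runs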

You could try disabling the cuBLAS workspace by setting both sizes to zero via CUBLAS_WORKSPACE_CONFIG=:0:0, but I don't think you can assume that the potentially different algorithms selected for different batch sizes will yield bitwise-identical results.
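
If bitwise equality across batch sizes turns out to be unattainable, the usual fallback is a tolerance-based comparison instead of torch.equal. A sketch (the tolerance values are illustrative, not prescriptive):

# Compare with an explicit absolute tolerance instead of requiring bit equality.
same = torch.allclose(all_probs, batch_probs, rtol=0.0, atol=1e-6)
print(same)  # True if the outputs agree to within 1e-6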
