Hi, colleagues. We have a problem with determinism across different batch sizes.
We are evaluating a GRU model, but running it with different batch sizes gives different results after the outputs are concatenated. We set the environment variable CUBLAS_WORKSPACE_CONFIG=:4096:2, but it does not help.
The absolute difference between the results is stable (1.3411e-07), but our task requires exactly equal results regardless of batch size.
Here is the link to the input tensor.
import torch
from torch import nn
#Simple GRUModel
class GRUModel(nn.Module):
    """GRU encoder followed by a linear projection.

    The GRU is created with ``batch_first=True``, so inputs are laid out as
    ``(batch, seq_len, data_dim)``.

    Note:
        ``forward`` flattens the GRU output to ``(batch, seq_len * hidden_size)``
        before feeding a linear layer whose input width is ``hidden_size``.
        The shapes therefore only line up when ``seq_len == 1``.
    """

    def __init__(self, data_dim, hidden_size, num_layers, dropout_prob, output_size, device):
        """Build the GRU and the linear head.

        Args:
            data_dim: Size of each input feature vector (GRU ``input_size``).
            hidden_size: Size of the GRU hidden state.
            num_layers: Number of stacked GRU layers.
            dropout_prob: Value forwarded to ``nn.GRU`` as ``dropout``.
            output_size: Number of output features of the linear head.
            device: Device on which ``init_hidden_state`` allocates tensors.
                It is only stored here; the module itself is not moved.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.gru = nn.GRU(
            input_size=data_dim,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            dropout=dropout_prob,
            batch_first=True,
        )
        self.linear = nn.Linear(self.hidden_size, self.output_size)
        self.device = device

    def forward(self, x, hidden):
        """Run the GRU, flatten its output per sample and project it.

        Args:
            x: Input tensor in batch-first layout.
            hidden: Initial hidden state passed straight to the GRU.

        Returns:
            A ``(projected, hidden)`` tuple: the linear head's output and the
            hidden state returned by the GRU.
        """
        x, hidden = self.gru(x, hidden)
        # Collapse everything after the batch axis into a single feature axis.
        x = x.contiguous().view(x.size(0), -1)
        x = self.linear(x)
        return x, hidden

    def init_hidden_state(self, batch_size, dtype=torch.float32):
        """Return a zero hidden state of shape (num_layers, batch_size, hidden_size).

        The tensor is allocated on ``self.device`` with the requested ``dtype``.
        """
        hidden = torch.zeros(
            self.num_layers,
            batch_size,
            self.hidden_size,
            dtype=dtype,
            device=self.device,
        )
        return hidden
# Reproducibility settings: single CPU thread, fixed seed, deterministic cuDNN.
# NOTE(review): CUBLAS_WORKSPACE_CONFIG is not set anywhere in this script;
# presumably it has to be exported in the environment before launch - confirm.
torch.set_num_threads(1)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device('cuda')

# map_location already places the tensor on `device`, so no extra `.to(device)`
# is needed (the original call discarded its result and had no effect).
batch_x = torch.load("batch_x0", map_location=device)
batch_x = batch_x.float()

model = GRUModel(data_dim=256, hidden_size=2, num_layers=1, dropout_prob=0, output_size=4, device=device)
model.to(device)
model.eval()

# Pure evaluation: no autograd graph is needed for the comparison.
with torch.no_grad():
    # Pass 1: feed every sample on its own (batch size 1) with a fresh zero state.
    per_sample_probs = []
    for el in batch_x:
        hidden_ = model.init_hidden_state(1)
        el = el.unsqueeze(0)
        batch_probs, _ = model(el, hidden_)
        per_sample_probs.append(batch_probs)
    all_probs = torch.stack(per_sample_probs).squeeze(1)

    # Pass 2: feed the whole tensor as one batch; the batch size is taken from
    # the data instead of being hard-coded.
    batch_probs, hidden = model(batch_x, model.init_hidden_state(batch_x.size(0)))

# Bitwise comparison of the two passes. Floating-point reductions may be
# evaluated in a different order for different batch sizes, so this can print
# False even with the deterministic flags above.
print(torch.equal(all_probs, batch_probs))