Below are some code snippets that use BatchNorm1d.
Even though I set the random seed and put the model in eval() mode (or ran it under `with torch.no_grad():`), the output varied on every run. But when I removed either the BatchNorm1d layers or the F.relu() activation function, the output stayed the same. This is very strange behavior; has anyone encountered it or already solved it?
def set_seed(random_seed):
    """Seed every random number generator the script relies on.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU generator,
    the current CUDA device and all CUDA devices), then switches cuDNN to
    deterministic mode with its benchmark auto-tuner turned off.

    Args:
        random_seed: Seed value handed unchanged to each generator.
    """
    # Pure-Python and NumPy generators.
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch generators: CPU first, then the CUDA ones.
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    # NOTE(review): these flags only configure cuDNN; other CUDA kernels
    # (e.g. scatter/atomic-add based ops) may still be non-deterministic --
    # confirm against the PyTorch reproducibility notes.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# Load the Cora Planetoid dataset with the NormalizeFeatures transform.
dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The dataset holds a single graph. Move it to the target device once here,
# so that later ``.to(device)`` calls on its tensors are no-ops rather than
# repeated host-to-device copies on every epoch.
data = dataset[0].to(device)
class GCN(torch.nn.Module):
    """Multi-layer GCN with BatchNorm + ReLU between the convolutions.

    Every hidden ``GCNConv`` is followed by ``BatchNorm1d`` and ``ReLU``.
    The final ``GCNConv`` returns raw, un-normalised class scores so they
    can be passed straight to ``CrossEntropyLoss`` or ``argmax``.

    Args:
        hidden_channels: Width of every hidden layer.
        num_layer: Total number of ``GCNConv`` layers; must be at least 2
            (one input layer and one output layer).
        in_channels: Size of the input node features. Defaults to
            ``dataset.num_features`` from the module-level ``dataset``.
        out_channels: Number of output classes. Defaults to the number of
            distinct labels in the module-level ``data.y``.

    Raises:
        ValueError: If ``num_layer`` is smaller than 2.
    """

    def __init__(self, hidden_channels, num_layer=10, in_channels=None, out_channels=None):
        super().__init__()
        if num_layer < 2:
            # With fewer than two layers the forward loop would never reach
            # the output convolution, silently producing wrong-sized output.
            raise ValueError(f"num_layer must be >= 2, got {num_layer}")
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = len(data.y.unique())

        self.num_layer = num_layer
        self.cached = True

        # Input layer, (num_layer - 2) hidden layers, then the output layer.
        self.conv_list = ModuleList([])
        self.conv_list.append(GCNConv(in_channels, hidden_channels, cached=self.cached, bias=False))
        for _ in range(self.num_layer - 2):
            self.conv_list.append(GCNConv(hidden_channels, hidden_channels, cached=self.cached, bias=False))
        self.conv_list.append(GCNConv(hidden_channels, out_channels, cached=self.cached, bias=False))

        # One BatchNorm per convolution except the last: the output layer
        # must emit unconstrained logits.
        self.bn_list = ModuleList([])
        for conv in self.conv_list[:-1]:
            self.bn_list.append(nn.BatchNorm1d(conv.out_channels, momentum=0.3))

    def forward(self, x, edge_index):
        """Return per-node class logits for the given graph.

        Args:
            x: Node feature matrix passed to the first ``GCNConv``.
            edge_index: Graph connectivity passed to every ``GCNConv``.

        Returns:
            The output of the last ``GCNConv`` (no BatchNorm / ReLU applied).
        """
        last = self.num_layer - 1
        for i, conv in enumerate(self.conv_list):
            # Dropout is applied only in front of the first and last layers.
            if i == 0 or i == last:
                x = F.dropout(x, 0.6, training=self.training)
            x = conv(x, edge_index)
            if i != last:
                x = self.bn_list[i](x)
                x = F.relu(x)
            # No BatchNorm/ReLU after the last layer: ReLU would clamp every
            # class score to >= 0 and discard the ordering of negative
            # scores before the loss / argmax sees them.
        return x
# Seed the generators before the model is built, so its initial parameters
# are drawn after seeding.
set_seed(10)

# Build the network and place its parameters on the selected device.
model = GCN(hidden_channels=16)
model = model.to(device)

# Adam with weight decay over all model parameters.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)
criterion = torch.nn.CrossEntropyLoss()
def train():
    """Run one full-batch optimisation step on the training nodes.

    Uses the module-level ``model``, ``optimizer``, ``criterion``, ``data``
    and ``device``.

    Returns:
        A 0-dim tensor holding the training loss, detached from the
        autograd graph.
    """
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(data.x.to(device), data.edge_index.to(device))  # Single forward pass.

    # ``out`` already lives on ``device``; only the mask and the labels may
    # need moving, so move each of them exactly once.
    train_mask = data.train_mask.to(device)
    target = data.y.to(device)[train_mask]

    # The loss is computed solely on the training nodes.
    loss = criterion(out[train_mask], target)
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.

    # Detach so the caller (which only logs the value) does not keep a
    # reference to the autograd graph.
    return loss.detach()
def test():
    """Evaluate the module-level ``model`` on the test nodes.

    Switches the model to eval mode, runs one forward pass with gradient
    tracking disabled and compares the arg-max prediction of every test
    node with its label.

    Returns:
        Test accuracy as a float: correct test predictions divided by the
        number of test nodes.
    """
    model.eval()
    with torch.no_grad():
        logits = model(data.x.to(device), data.edge_index.to(device))

    # Predicted class = index of the highest score for each node.
    predicted = logits.argmax(dim=1)

    mask = data.test_mask
    hits = predicted[mask].to(device) == data.y[mask].to(device)

    num_correct = int(hits.sum())
    num_test = int(mask.sum())
    return num_correct / num_test
# Optimise for 200 epochs, printing the training loss after each one.
for epoch in range(1, 200 + 1):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

# Evaluate once on the test nodes after training has finished.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')