No clue whether BatchNorm1d impacts reproducibility, or whether this is a bug

Below are some code snippets involving BatchNorm1d.
Even though I set the random seed and ran the model with eval() and under with torch.no_grad():, the output varied on every run. But once I removed the BatchNorm1d layers, or the activation function F.relu(), the output stayed the same. This is very strange behavior. Has anyone run into this or already solved it?

def set_seed(random_seed):
    # Seed every RNG the script touches.
    np.random.seed(random_seed)
    random.seed(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    # Ask cuDNN for deterministic kernels and disable autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data = dataset[0]
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels, num_layer=10):
        super().__init__()
        self.num_layer = num_layer

        self.conv_list = ModuleList([])
        self.bn_list = ModuleList([])

        self.cached = True

        # Input layer, (num_layer - 2) hidden layers, output layer.
        self.conv_list.append(GCNConv(dataset.num_features, hidden_channels, cached=self.cached, bias=False))
        for _ in range(self.num_layer - 2):
            self.conv_list.append(GCNConv(hidden_channels, hidden_channels, cached=self.cached, bias=False))
        self.conv_list.append(GCNConv(hidden_channels, len(data.y.unique()), cached=self.cached, bias=False))

        # One BatchNorm1d per conv layer.
        for i in range(self.num_layer):
            self.bn_list.append(nn.BatchNorm1d(self.conv_list[i].out_channels, momentum=0.3))

    def forward(self, x, edge_index):
        for i in range(self.num_layer):
            # Dropout only before the first and last layers.
            if i == 0 or i == self.num_layer - 1:
                x = F.dropout(x, 0.6, training=self.training)
            x = self.conv_list[i](x, edge_index)
            x = self.bn_list[i](x)
            x = F.relu(x)
        return x
set_seed(10)
model = GCN(hidden_channels=16).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(data.x.to(device), data.edge_index.to(device))  # Perform a single forward pass.
    loss = criterion(out[data.train_mask].to(device), data.y[data.train_mask].to(device))  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test():
    with torch.no_grad():
        model.eval()
        out = model(data.x.to(device), data.edge_index.to(device))
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[data.test_mask].to(device) == data.y[data.test_mask].to(device)  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc


for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')
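
For clarity, this is roughly how I check the variation: I save the eval-mode logits and compare them against the previous run of the script (prev_out.pt is just an arbitrary file name):

# Sketch: compare this run's eval-mode logits with the previous run's.
model.eval()
with torch.no_grad():
    out = model(data.x.to(device), data.edge_index.to(device)).cpu()

try:
    prev = torch.load('prev_out.pt')
    print('identical to previous run:', torch.equal(out, prev))
    print('max abs diff:', (out - prev).abs().max().item())
except FileNotFoundError:
    print('no previous run to compare against')
torch.save(out, 'prev_out.pt')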

Welcome to the forums!

How are you defining GCNConv? It is not clear from your code, and this class does not exist natively in PyTorch.

At any rate, I'm not seeing this issue with BatchNorm1d layers followed by ReLU; the output is unchanged. Here is a simple example:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Conv1d(3, 64, kernel_size=3, padding=1),
                      nn.BatchNorm1d(64), nn.ReLU(),
                      nn.Conv1d(64, 1, kernel_size=3),
                      nn.BatchNorm1d(1), nn.ReLU(),
                      nn.AdaptiveAvgPool1d(1))

temp_data = torch.rand((1, 3, 128))

model.eval()
with torch.no_grad():
    for _ in range(5):  # repeated passes; the printed output is identical each time
        print(model(temp_data))

If you mean you are getting a difference between having .eval() on and off, that is to be expected. See here and here.

Hi johnson, GCNConv is from PyTorch Geometric, a powerful library for graph neural networks. I believe the issue is not coming from there; this is the reference page: GCNConv

And this is the complete code. Would you mind running it multiple times on your side to check whether the output varies?

import torch
import torch.nn.functional as F
from torch.nn import ModuleList, ReLU, Dropout
from torch import nn
from torch_geometric.datasets import Planetoid, CitationFull, Coauthor, Amazon
from torch_geometric.transforms import NormalizeFeatures

from torch_geometric.nn import GCNConv, GATConv


import random
import pandas as pd
import numpy as np

def set_seed(random_seed):
    # Seed every RNG the script touches.
    np.random.seed(random_seed)
    random.seed(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    # Ask cuDNN for deterministic kernels and disable autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels, num_layer=10):
        super().__init__()
        self.num_layer = num_layer

        self.conv_list = ModuleList([])
        self.bn_list = ModuleList([])

        self.cached = True

        # Input layer, (num_layer - 2) hidden layers, output layer.
        self.conv_list.append(GCNConv(dataset.num_features, hidden_channels, cached=self.cached, bias=False))
        for _ in range(self.num_layer - 2):
            self.conv_list.append(GCNConv(hidden_channels, hidden_channels, cached=self.cached, bias=False))
        self.conv_list.append(GCNConv(hidden_channels, len(data.y.unique()), cached=self.cached, bias=False))

        # One BatchNorm1d per conv layer.
        for i in range(self.num_layer):
            self.bn_list.append(nn.BatchNorm1d(self.conv_list[i].out_channels, momentum=0.3))

    def forward(self, x, edge_index):
        for i in range(self.num_layer):
            # Dropout only before the first and last layers.
            if i == 0 or i == self.num_layer - 1:
                x = F.dropout(x, 0.6, training=self.training)
            x = self.conv_list[i](x, edge_index)
            x = self.bn_list[i](x)
            x = F.relu(x)
        return x

dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data = dataset[0]


set_seed(10)
model = GCN(hidden_channels=16).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(data.x.to(device), data.edge_index.to(device))  # Perform a single forward pass.
    loss = criterion(out[data.train_mask].to(device), data.y[data.train_mask].to(device))  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test():
    with torch.no_grad():
        model.eval()
        out = model(data.x.to(device), data.edge_index.to(device))
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[data.test_mask].to(device) == data.y[data.test_mask].to(device)  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc


for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

It is not necessary for me to install another library to demonstrate that .eval() and .train() change the outputs of batchnorm and dropout layers.

import torch
import torch.nn as nn

model = nn.Sequential(nn.BatchNorm1d(1), nn.AdaptiveAvgPool1d(1))

data = torch.tensor([[[0.1, 0.3, 0.2]]])

print(model(data))
model.eval()
print(model(data))

Output:

tensor([[[3.9736e-08]]], grad_fn=<SqueezeBackward1>)
tensor([[[0.1896]]], grad_fn=<SqueezeBackward1>)

That is exactly as expected.
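
The two numbers follow directly from how the layer works: in train mode it normalizes the batch with its own statistics (so the pooled mean collapses to ~0), while in eval mode it uses the running estimates accumulated during training. As a sketch, the eval-mode value can be reproduced by hand from the layer's buffers:

import torch
import torch.nn as nn

bn = nn.BatchNorm1d(1)                 # default momentum 0.1, eps 1e-5
x = torch.tensor([[[0.1, 0.3, 0.2]]])

bn(x)        # one train-mode pass; updates running_mean / running_var
bn.eval()

# Eval mode normalizes with the running statistics, not the batch's own.
mean = bn.running_mean.view(1, -1, 1)
var = bn.running_var.view(1, -1, 1)
manual = (x - mean) / torch.sqrt(var + bn.eps)
manual = manual * bn.weight.view(1, -1, 1) + bn.bias.view(1, -1, 1)

print(manual.mean(dim=-1))   # ~0.1896, matching the eval-mode value above
print(bn(x).mean(dim=-1))    # the layer itself agrees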

Hi, the issue is not about eval() being on or off.

I checked my code on the CPU and the output is unchanged every time, but the GPU behaves differently.

You can check this issue, but I am not sure whether the bug still exists: https://github.com/pytorch/pytorch/issues/53691
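
One more thing worth trying, in case anyone else hits this: PyTorch can be forced to use deterministic kernels, and it will then raise an error for any op that has no deterministic CUDA implementation (scatter-style aggregation, which GCN message passing relies on, is a common source of GPU-only variation). A sketch, not verified against this exact setup:

import os
# Some CUDA ops need this workspace config to run deterministically;
# set it before any CUDA work happens.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

import torch
# Error out on any op that cannot run deterministically on this device,
# instead of silently producing run-to-run differences.
torch.use_deterministic_algorithms(True)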