CNN with torch.nn.functional vs. with torch.nn

Can anyone point out why the results on the CIFAR10 dataset, running for only 1 epoch, are different for the same CNN architecture defined in the two ways below?

  1. First way, with torch.nn and Sequential:

class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, kernel_size_p, stride_p, drop_out):
        super(Net, self).__init__()
        self.Sequence = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
            torch.nn.BatchNorm2d(out_channels),
            torch.nn.Dropout2d(p=drop_out),
            torch.nn.ReLU(),  
            
            torch.nn.Conv2d(out_channels, out_channels*2, kernel_size, stride, padding),
            torch.nn.BatchNorm2d(out_channels*2),
            torch.nn.Dropout2d(p=drop_out),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size_p, stride_p),
            
            torch.nn.Conv2d(out_channels*2, out_channels*3, kernel_size, stride, padding),
            torch.nn.BatchNorm2d(out_channels*3),
            torch.nn.Dropout2d(p=drop_out),
            torch.nn.ReLU(),
            
            torch.nn.Conv2d(out_channels*3, out_channels*4, kernel_size, stride, padding),
            torch.nn.BatchNorm2d(out_channels*4),
            torch.nn.Dropout2d(p=drop_out),
            torch.nn.ReLU(),
            #torch.nn.MaxPool2d(kernel_size_p, stride_p),
            
            torch.nn.Conv2d(out_channels*4, 10, 4, 1, 0),
            torch.nn.Flatten(),
            )

    def forward(self, x):
        return self.Sequence(x)

With the above architecture I am getting the results below, which are acceptable.

Batch     Time     TestLoss     TestAcc     TrainLoss TrainAcc  AvgLoss       AvgAcc
0         0.215    2.303        9.89        2.382    8.0     0.238         0.8          
200       1.392    2.303        9.89        1.565    44.0    1.573         42.9         
400       2.769    1.372        49.35       1.392    51.0    1.408         47.0         
490       3.302    1.372        49.35       1.479    42.0    1.408         49.2

  2. Now, keeping the rest of the code the same, I define the architecture in the manner below. But the results are scary.
def running_mean(out_channels):    
    n_t = torch.randn(out_channels,32,32) #image size is 32*32
    norm_mean = torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    running_mean = norm_mean(n_t)
    return running_mean

def running_var(out_channels):    
    n_t = torch.randn(out_channels,32,32)
    norm_var = torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    running_var = norm_var(n_t)
    return running_var
    

class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, kernel_size_p, stride_p, drop_out):
        super(Net, self).__init__()
 
        self.conv_1_weight = torch.nn.Parameter(torch.randn(out_channels, in_channels, kernel_size, kernel_size))
        self.bias_1_weight = torch.nn.Parameter(torch.randn(out_channels))
 
        self.conv_2_weight = torch.nn.Parameter(torch.randn(out_channels*2, out_channels, kernel_size, kernel_size))
        self.bias_2_weight = torch.nn.Parameter(torch.randn(out_channels*2))
        
        self.conv_3_weight = torch.nn.Parameter(torch.randn(out_channels*3, out_channels*2, kernel_size, kernel_size))
        self.bias_3_weight = torch.nn.Parameter(torch.randn(out_channels*3))
 
        self.conv_4_weight = torch.nn.Parameter(torch.randn(out_channels*4, out_channels*3, kernel_size, kernel_size))
        self.bias_4_weight = torch.nn.Parameter(torch.randn(out_channels*4))
        
        self.conv_5_weight = torch.nn.Parameter(torch.randn(10, out_channels*4, 4, 4))
        self.bias_5_weight = torch.nn.Parameter(torch.randn(10))

 
    def forward(self, x):
        x = x.view(x.size(0), 3, 32, 32)
        out = torch.nn.functional.conv2d(x, self.conv_1_weight, self.bias_1_weight, stride, padding)
        out = torch.nn.functional.batch_norm(out, running_mean= torch.zeros(out_channels), running_var= torch.ones(out_channels))
        # out = torch.nn.functional.batch_norm(out, running_mean= running_mean(out_channels), running_var= running_var(out_channels))
        out = torch.nn.functional.dropout2d(out, p=drop_out)
        out = torch.nn.functional.relu(out, inplace=False)
        
        out = torch.nn.functional.conv2d(out, self.conv_2_weight, self.bias_2_weight, stride, padding)
        out = torch.nn.functional.batch_norm(out, running_mean= torch.zeros(out_channels*2), running_var= torch.ones(out_channels*2))
        # out = torch.nn.functional.batch_norm(out, running_mean= running_mean(out_channels*2), running_var= running_var(out_channels*2))
        out = torch.nn.functional.dropout2d(out, p=drop_out)
        out = torch.nn.functional.relu(out, inplace=False)
        out = torch.nn.functional.max_pool2d(out, kernel_size_p, stride_p)
        
        out = torch.nn.functional.conv2d(out, self.conv_3_weight, self.bias_3_weight, stride, padding)
        out = torch.nn.functional.batch_norm(out, running_mean= torch.zeros(out_channels*3), running_var= torch.ones(out_channels*3))
        # out = torch.nn.functional.batch_norm(out, running_mean= running_mean(out_channels*3), running_var= running_var(out_channels*3))
        out = torch.nn.functional.dropout2d(out, p=drop_out)
        out = torch.nn.functional.relu(out, inplace=False)
        
        out = torch.nn.functional.conv2d(out, self.conv_4_weight, self.bias_4_weight, stride, padding)
        out = torch.nn.functional.batch_norm(out, running_mean= torch.zeros(out_channels*4), running_var= torch.ones(out_channels*4))
        # out = torch.nn.functional.batch_norm(out, running_mean= running_mean(out_channels*4), running_var= running_var(out_channels*4))
        out = torch.nn.functional.dropout2d(out, p=drop_out)
        out = torch.nn.functional.relu(out, inplace=False)
        
        out = torch.nn.functional.conv2d(out, self.conv_5_weight, self.bias_5_weight, stride, padding)
        out = out.view(out.size(0), -1) 
        return out

Now the results are:

Batch     Time     TestLoss    TestAcc     TrainLoss     TrainAcc         
0         0.226    7996969.105  9.79        8810812.0      7.0               
200       1.407    7996969.105  9.79        606847.938     13.0            
400       2.807    285835.691   11.45       40160.824      9.0                
490       3.329    285835.691   11.45       18792.752      6.0

I think the difference in results is because of this:

In the case of torch.nn.BatchNorm2d, only one argument is needed, which is the number of out_channels (say 32), while torch.nn.functional.batch_norm(out, running_mean, running_var) takes three mandatory arguments.
For the first one I passed “out”, the output of the convolutional layer (but I’m not sure if that’s right). I don’t have any idea about the other two arguments, running_mean and running_var (I just gave them a try).
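(For reference, the functional signature from the docs is roughly:)

torch.nn.functional.batch_norm(input, running_mean, running_var,
                               weight=None, bias=None,
                               training=False, momentum=0.1, eps=1e-05)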

Some differences are:

  • your module approach uses affine batchnorm layers, which update their running stats and use the affine parameters (weight and bias), while your functional approach resets the running stats and doesn’t use the trainable affine parameters (see the sketch after this list).
  • you are not using the training argument for the batchnorm and dropout layers, so you won’t be able to switch these layers to their evaluation mode, which the first approach would get via model.eval().
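A rough contrast of the two calls (just a sketch; assumes x and out_channels are in scope):

bn = torch.nn.BatchNorm2d(out_channels)  # owns weight, bias and the running stats
y_module = bn(x)                         # training mode: updates running stats, applies the affine transform

# the functional call as written: fresh stats on every call, no affine parameters,
# and training defaults to False
y_func = torch.nn.functional.batch_norm(
    x, torch.zeros(out_channels), torch.ones(out_channels))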

Hi @ptrblck,
I included the training=True argument for batch_norm and dropout2d in the 2nd architecture as follows:

out = torch.nn.functional.batch_norm(out, running_mean=torch.zeros(out_channels), running_var=torch.ones(out_channels), training=True)
out = torch.nn.functional.dropout2d(out, p=drop_out, training=True)

and likewise for every layer.
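(Hard-coding training=True means these layers stay in training behaviour even during the model.eval() test loop; a sketch that follows the module’s mode instead:)

out = torch.nn.functional.batch_norm(out, running_mean, running_var, training=self.training)
out = torch.nn.functional.dropout2d(out, p=drop_out, training=self.training)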
I’m getting the following results after 10 epochs:

Batch     Time    TestLoss     TestAcc    TrainLoss TrainAcc          
0         0.192    50.07        9.69        51.799   9.0              
200       1.313    50.07        9.69        27.512   18.0             
400       2.617    25.648       21.43       20.614   20.0             
.
.
.
1400      8.938    10.299       25.65       12.213   17.0            
1600      10.241   8.948        25.78       7.674    30.0             
1800      11.537   7.973        26.16       7.099    27.0             
2000      12.835   7.16         26.24       6.351    22.0             

.
.
.   
4800      30.327   3.46         27.8        2.886    34.0            
4990      31.37    3.46         27.8        3.908    23.0    

Now, if I use nn.Sequential and the torch.nn modules (1st architecture), keeping everything else the same, the results after 10 epochs are:


Batch     Time    TestLoss     TestAcc    TrainLoss TrainAcc          
0         0.203    2.303        6.99        2.434    9.0          
200       1.338    2.303        6.99        1.586    41.0            
400       2.684    1.392        49.09       1.444    49.0            
.
.
.          
1600      10.542   0.876        69.24       0.996    64.0           
1800      11.857   0.833        70.8        1.116    55.0            
2000      13.177   0.791        71.96       1.249    60.0            
.
.
.    
4600      29.704   0.634        78.15       0.641    77.0             
4800      31.024   0.625        78.25       0.702    79.0           
4990      32.095   0.625        78.25       0.906    63.0   

Comparing the two runs, two things stand out as contrary:

  1. Final test accuracy: 27.8% vs. 78.25%
  2. Initial test loss: 50.07 vs. 2.303

Here are more details, in case you could take a look:

#variables

in_channels=3
out_channels= 32       
kernel_size = 5
stride = 1
kernel_size_p = 2
stride_p = 2
optimizer = torch.optim.Adam(Classifier.parameters(), lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma= 0.9)
criterion = torch.nn.CrossEntropyLoss()
for epoch in range(num_epochs):
    
    for _, (images, labels) in enumerate(train_loader):
        if Batches % 250 == 0:
            Classifier.eval()
            with torch.no_grad():
                correct=0
                total=0
                LastTestLoss = 0
                LastTestAcc = 0
                for _, (testimages, testlabels) in enumerate(test_loader):
                    output = Classifier(testimages)
                    LastTestLoss += criterion(output, testlabels).item()
                    predictions = torch.argmax(output,1)
                    total += testlabels.shape[0]
                    correct += (predictions == testlabels).sum().float().item()
                LastTestAcc = correct*100.0/total
                LastTestLoss /= len(test_loader)
            Classifier.train()                
        outputs = Classifier(images)               
        loss = criterion(outputs, labels) 
              
        loss.backward()
        optimizer.step()                                
        optimizer.zero_grad()                      
        
        # Tracking Accuracy
        prediction = torch.argmax(outputs,1)
        correct = (prediction == labels).sum().float().item()
        acc = 100.0*(correct/labels.shape[0])
    
       ...
        Batches+=1
    scheduler.step()

The first issue is not addressed, i.e. you are still resetting the running stats and are not using trainable parameters for the functional batchnorm approach.
You would have to store the running_mean and running_var as buffers (and use them in the functional call) instead of recreating them, and do the same for the weight and bias parameters.
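Something along these lines for the first layer (just a sketch; the names are made up):

class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size):
        super(Net, self).__init__()
        self.conv_1_weight = torch.nn.Parameter(
            torch.randn(out_channels, in_channels, kernel_size, kernel_size))
        self.bias_1_weight = torch.nn.Parameter(torch.randn(out_channels))
        # trainable affine parameters for the batchnorm
        self.bn_1_weight = torch.nn.Parameter(torch.ones(out_channels))
        self.bn_1_bias = torch.nn.Parameter(torch.zeros(out_channels))
        # running stats registered as buffers, so they persist between iterations
        # (and move to the device and land in the state_dict with the module)
        self.register_buffer('bn_1_mean', torch.zeros(out_channels))
        self.register_buffer('bn_1_var', torch.ones(out_channels))

    def forward(self, x):
        out = torch.nn.functional.conv2d(x, self.conv_1_weight, self.bias_1_weight)
        out = torch.nn.functional.batch_norm(
            out, self.bn_1_mean, self.bn_1_var,
            self.bn_1_weight, self.bn_1_bias, training=self.training)
        return out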

Thank you for the reply @ptrblck.
I am a newbie, I followed your suggestion and these links: link1 and link2

Can you please look at below code and suggest changes?

import math

class Conv(torch.nn.Module):

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, bias):
        
        super(Conv, self).__init__()
        
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.weight = torch.nn.Parameter(torch.Tensor(out_channels, in_channels, kernel_size, kernel_size))
        if bias:
            self.bias = torch.nn.Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.kaiming_uniform_(self.weight, a= math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            torch.nn.init.uniform_(self.bias, -bound, bound)

    def extra_repr(self):
        args = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
             ', stride={stride}')
        if self.padding != 0:
            args += ', padding={padding}'
        return args.format(**self.__dict__)

    def __setstate__(self, state):
        super(Conv, self).__setstate__(state)
        if not hasattr(self, 'padding_mode'):
            self.padding_mode = 'zeros'
            
            
class Conv2d(Conv):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, bias = True):
        super(Conv2d, self).__init__(in_channels, out_channels, kernel_size, stride, padding, bias)

    def conv_forward(self, input, weight):
        return torch.nn.functional.conv2d(input, weight, self.bias, self.stride, self.padding)

    def forward(self, input):
        return self.conv_forward(input, self.weight)
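Quick standalone check that the conv module behaves as expected (illustrative values):

conv = Conv2d(3, 8, kernel_size=3, stride=1, padding=1)
print(conv(torch.randn(2, 3, 32, 32)).shape)  # expected: torch.Size([2, 8, 32, 32])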


class B_Norm(torch.nn.Module):
    
    def __init__(self, num_channels, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True):
        super(B_Norm, self).__init__()
        self.num_channels = num_channels
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        
        if self.affine:
            self.weight = torch.nn.Parameter(torch.Tensor(num_channels))
            self.bias = torch.nn.Parameter(torch.Tensor(num_channels))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
            
        if self.track_running_stats:
            self.register_buffer('running_mean', torch.zeros(num_channels))
            self.register_buffer('running_var', torch.ones(num_channels))
            self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))
        else:
            self.register_parameter('running_mean', None)
            self.register_parameter('running_var', None)
            self.register_parameter('num_batches_tracked', None)
        self.reset_parameters()

    def reset_running_stats(self):
        if self.track_running_stats:
            self.running_mean.zero_()
            self.running_var.fill_(1)
            self.num_batches_tracked.zero_()

    def reset_parameters(self):
        self.reset_running_stats()
        if self.affine:
            self.weight.data.uniform_()
            self.bias.data.zero_()

    def forward(self, input):
       
        exp_avg_factor = 0.0

        if self.training and self.track_running_stats:
            self.num_batches_tracked += 1
            if self.momentum is None:  #cumulative moving average
                exp_avg_factor = 1.0 / self.num_batches_tracked.item()
            else:  # exponential moving average
                exp_avg_factor = self.momentum

        return torch.nn.functional.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias,
            self.training or not self.track_running_stats, exp_avg_factor, self.eps)

class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, kernel_size_p, stride_p, drop_out):
        super(Net,self).__init__()
 
        # self.conv_1_weight = torch.randn(out_channels, in_channels, kernel_size, kernel_size)
        # self.bias_1_weight = torch.randn(out_channels)
 
        # self.conv_2_weight = torch.randn(out_channels*2, out_channels, kernel_size, kernel_size)
        # self.bias_2_weight = torch.randn(out_channels*2)
        
        # self.conv_3_weight = torch.randn(out_channels*3, out_channels*2, kernel_size, kernel_size)
        # self.bias_3_weight = torch.randn(out_channels*3)
 
        # self.conv_4_weight = torch.randn(out_channels*4, out_channels*3, kernel_size, kernel_size)
        # self.bias_4_weight = torch.randn(out_channels*4)
        
        # self.conv_5_weight = torch.randn(10, out_channels*4, 4, 4)
        # self.bias_5_weight = torch.randn(10)
 
    def forward(self, x):
        x = x.view(x.size(0), 3, 32, 32)
        out = Conv2d(x, in_channels, out_channels, kernel_size, stride, padding, bias = True)
        out = B_Norm(out, out_channels, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True)
        out = torch.nn.functional.dropout2d(out, p=drop_out ,training=True)
        out = torch.nn.functional.relu(out, inplace=False)
        
        out = Conv2d(out, out_channels, out_channels*2, kernel_size, stride, padding, bias = True)
        out = B_Norm(out,out_channels*2, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True)
        out = torch.nn.functional.dropout2d(out, p=drop_out, training=True)
        out = torch.nn.functional.relu(out, inplace=False)
        out = torch.nn.functional.max_pool2d(out, kernel_size_p, stride_p)
        
        out = Conv2d(out, out_channels*2, out_channels*3, kernel_size, stride, padding, bias = True)
        out = B_Norm(out, out_channels*3, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True)
        out = torch.nn.functional.dropout2d(out, p=drop_out, training=True)
        out = torch.nn.functional.relu(out, inplace=False)
        
        out = Conv2d(out, out_channels*3, out_channels*4, kernel_size, stride, padding, bias = True)
        out = B_Norm(out, out_channels*4, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True)
        out = torch.nn.functional.dropout2d(out, p=drop_out, training=True)
        out = torch.nn.functional.relu(out, inplace=False)
        
        out = Conv2d(out, out_channels*4, 10, 4, stride, padding, bias= True)
        out = out.view(out.size(0), -1) 
        return out
        
      

But, I’m getting this error: ValueError: optimizer got an empty parameter list

The conv and batchnorm modules look alright.
In your current forward pass you are reinitializing these modules in each iteration, so you should create these modules in Net.__init__.
However, I’m currently not sure why you are reimplementing the conv and batchnorm modules instead of directly using the PyTorch implementations.
Your current approach uses the functional API in the conv and batchnorm layers, which is also the case for e.g. nn.Conv2d.

If you want to use only the functional API, you should create the parameters and buffers e.g. in Net.__init__ and use them in forward via e.g. F.conv2d.
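E.g., a sketch of the first fix, creating your custom modules once in __init__ and calling them in forward (shortened to one conv/norm block; names are illustrative):

class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, drop_out):
        super(Net, self).__init__()
        # create the submodules once, so their parameters get registered
        self.conv1 = Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        self.bn1 = B_Norm(out_channels)
        self.drop_out = drop_out

    def forward(self, x):
        out = self.conv1(x)   # call the stored module, don't re-create it
        out = self.bn1(out)
        out = torch.nn.functional.dropout2d(out, p=self.drop_out, training=self.training)
        return torch.nn.functional.relu(out)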

I’m getting confused. After your first comment, I came across the above implementation, which I now reckon is nothing but nn.Conv2d and nn.BatchNorm2d reimplemented.

For the first point you mentioned, should I inherit Conv2d and B_Norm, like super(Net, self).__init__(Conv2d, B_Norm)?
If so, it throws an error.

But yes, I want to use only the functional API. I tried that: I created the parameters (self.conv_1_weight, …), but I really don’t get how to create buffers and how to store the running stats for F.batch_norm.

Can you please show, how it can be done?

Many thanks.

You can initialize the buffers as tensors and reuse them.
Note that you have to take care of storing and using the parameters as well as buffers, if you don’t want to use nn.Modules, which would store them internally.
Here is a small example:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Create parameters and buffers
weight = nn.Parameter(torch.ones(3))
bias = nn.Parameter(torch.zeros(3))
running_mean = torch.zeros(3)
running_var = torch.ones(3)

# Pass parameters to optimizer
optimizer = torch.optim.SGD([weight, bias], lr=1.)

# Execute forward and backward
input = torch.randn(2, 3, 24, 24)
out = F.batch_norm(input, running_mean, running_var, weight, bias, training=True)
out.mean().backward()
optimizer.step()

# buffers were updated
print(running_mean)
print(running_var)
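At test time you would then pass training=False, so the stored running stats are used instead of the batch statistics (extending the sketch above):

out = F.batch_norm(input, running_mean, running_var, weight, bias, training=False)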

Thank you once again.
So, I have to create weight, bias, running_mean and running_var for each layer in __init__, right?

In your code, I am guessing no storing is done; can you please tell me where exactly and how to store them?

You can store these parameters and buffers in the __init__ method of an nn.Module, but then again you wouldn’t use the functional API but modules.
If you want to store these tensors in a module, why wouldn’t you use the modules (nn.Conv2d, nn.BatchNorm2d) directly?