Access to grad_output (the input to the backward)

Thank you for the clarification.
I am still having a hard time replicating PyTorch’s loss.backward() from scratch.
I computed dLoss/dy for a CrossEntropy loss and fed it to my backward network, then compared both the loss and the gradients computed by my network against those of the same network trained with loss.backward(). The gradients do not match the ones produced by loss.backward(), and I am puzzled as to why. I would appreciate it if you could point me to where my logic is wrong. Here is the code (followed by a small standalone grad_output sanity check):

import torch.nn as nn
import torch.optim as optim
import torchvision

import numpy as np
import scipy.stats as ss
import scipy
import torch
from torchvision import datasets, transforms

imagesetdir = './' #path_prefix+'/Data/'

use_cuda = True
batch_size = 64
kwargs = {'num_workers': 0, 'pin_memory': True, 'drop_last':True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(imagesetdir, train=True, download=True,
                    transform=transforms.Compose([
                        transforms.Resize(32),
                        transforms.ToTensor(),
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST(imagesetdir, train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=False, **kwargs)



class Forward(nn.Module):
    def __init__(self):
        super(Forward, self).__init__()
        self.fc_0 = nn.Linear(1024, 40, bias=False)
        self.fc_1 = nn.Linear(40, 10, bias=False)

    def forward(self, x):
        x0 = self.fc_0(x)
        x1 = self.fc_1(x0)

        return x1, [x0, x1]  # final output and the per-layer activations

class Backward(nn.Module):
    def __init__(self):
        super(Backward, self).__init__()
        self.fc_1 = nn.Linear(10, 40, bias=False)
        self.fc_0 = nn.Linear(40, 1024, bias=False)

    def forward(self, x):
        x1 = self.fc_1(x)
        x0 = self.fc_0(x1)

        return [x0, x1]  # backward-propagated signals

def transpose_weights(state_dict):
    # return a copy of the state dict with each weight matrix transposed
    state_dict_new = {}
    for k, item in state_dict.items():
        state_dict_new[k] = item.t()
    return state_dict_new


modelF = Forward().cuda() # main model
modelB = Backward().cuda() # backward network to compute gradients for modelF

modelC = Forward().cuda() # Forward Control model to compare to BP

optimizerC = optim.Adam(modelC.parameters(), lr=0.0001)
optimizer = optim.Adam(modelF.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

# -------Implementing BP without using loss.backward() ----------

n_classes = 10
onehot = torch.zeros(train_loader.batch_size, n_classes).cuda()  # reusable one-hot buffer (batch size is fixed because drop_last=True)
Softmax = nn.Softmax(dim=1)
for epoch in range(20):

    loss_running = 0
    lossC_running = 0
    for i, (inputs, target) in enumerate(train_loader):

        inputs = inputs.view(train_loader.batch_size, -1).cuda()
        target = target.cuda()
        onehot.zero_()
        onehot.scatter_(1, target.view(train_loader.batch_size,-1), 1)
        
        # ------------- BP Control ------------------------------------------
        outputsC, _ = modelC(inputs)
        lossC = criterion(outputsC, target)
        paramsC = [p for p in modelC.parameters()]
        optimizerC.zero_grad()
        lossC.backward()
        optimizerC.step()

        lossC_running += lossC.item()

        # -------------Implementing BP bypassing loss.backward()-------------
        modelB.load_state_dict(transpose_weights(modelF.state_dict()))  # mirror the transposed forward weights into the backward network

        outputs, activationsF = modelF(inputs)
        probs = Softmax(outputs.detach())
        # per-sample gradient of softmax cross-entropy w.r.t. the logits: p_i - y_i (p_i: softmax of the output, y_i: one-hot label)
        activationsB = modelB(probs - onehot)

        loss = criterion(outputs, target)
        optimizer.zero_grad()

        ParamsF = [p for p in modelF.parameters() if p.requires_grad]

        # write the manually computed gradients into the parameter .grad fields
        for ia, a in enumerate(activationsB):
            pF = ParamsF[ia] # parameters of the forward model
            h = activationsF[ia] # forward activations

            # computing the gradients by multiplying forward activations and backward activations
            pF.grad = torch.matmul(h.t().detach(), a.clone().detach())
                
        optimizer.step()
        loss_running += loss.item()
    
    print(epoch, loss_running/(i+1), lossC_running/(i+1))
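As a side note on accessing grad_output: below is a minimal, standalone sketch (CPU, random data, names chosen only for illustration) of one way to compare the analytic softmax-cross-entropy gradient against the grad_output that autograd actually produces, using Tensor.register_hook.

import torch
import torch.nn as nn

torch.manual_seed(0)
batch_size, n_classes = 4, 10
logits = torch.randn(batch_size, n_classes, requires_grad=True)
target = torch.randint(0, n_classes, (batch_size,))

grad_store = {}
# the hook receives dLoss/dlogits, i.e. the gradient flowing back into the logits
logits.register_hook(lambda g: grad_store.update(grad_out=g))

loss = nn.CrossEntropyLoss()(logits, target)
loss.backward()

onehot = torch.zeros(batch_size, n_classes).scatter_(1, target.view(-1, 1), 1)
# CrossEntropyLoss uses reduction='mean' by default, hence the division by batch_size
analytic = (torch.softmax(logits.detach(), dim=1) - onehot) / batch_size
print(torch.allclose(grad_store['grad_out'], analytic, atol=1e-6))  # expected: True

If the two tensors match, the p - y expression (scaled for the mean reduction) agrees with what loss.backward() actually propagates into the output layer.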