Tensor size error with gradients stored at each training iteration

Hi everyone,
I’m working on a project where I need to write an optimization algorithm for an MNIST program, and the update of the algorithm requires the gradient of the loss-max function at a previous iteration in addition to that at the current iteration. I tried storing the gradient at each iteration in a dictionary so that they can be added to the gradient at the subsequent iteration for the update. However, I keep getting the error message “RuntimeError: The size of tensor a (5) must match the size of tensor b (10) at non-singleton dimension 3,” and I don’t know what I’m doing wrong here. The algorithm is in my training function below:

def train(args, model, device, train_loader, optimizer, epoch, lr, iter):
    model.train()
    v0dict = {}
    grad0dict = {}
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()

        L = 2
        eta = 1/(2*L)
        S = pow(args.batch_size, 0.5)
        print(iter)
        for p in model.parameters():
            if iter == 1:
                grad0 = p.grad
                v0 = grad0/S
                v0dict[iter] = v0
                grad0dict[iter] = grad0
                p.data.add_(-eta*v0)
                print('grad size: {}\n'.format(p.grad.size()))
                print('data size: {}\n'.format(p.data.size()))
            else:
                v0 = v0dict[iter-1]
                grad0 = grad0dict[iter-1]
                print('grad size: {}\n'.format(p.grad.size()))
                print('data size: {}\n'.format(p.data.size()))
                v = v0 + (p.grad - grad0)/S
                v0dict[iter] = v
                grad0dict[iter] = grad0
                p.data.add_(-eta*v)

         optimizer.zero_grad()        
         iter += 1

Could someone show me where I’ve gone wrong? Any help would be greatly appreciated.