Hi everyone,

I’m working on a project where I need to write an optimization algorithm for an MNIST program, and the update of the algorithm requires the gradient of the loss-max function at a previous iteration in addition to that at the current iteration. I tried storing the gradient at each iteration in a dictionary so that they can be added to the gradient at the subsequent iteration for the update. However, I keep getting the error message “RuntimeError: The size of tensor a (5) must match the size of tensor b (10) at non-singleton dimension 3,” and I don’t know what I’m doing wrong here. The algorithm is in my training function below:

```
def train(args, model, device, train_loader, optimizer, epoch, lr, iter):
model.train()
v0dict = {}
grad0dict = {}
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
L = 2
eta = 1/(2*L)
S = pow(args.batch_size, 0.5)
print(iter)
for p in model.parameters():
if iter == 1:
grad0 = p.grad
v0 = grad0/S
v0dict[iter] = v0
grad0dict[iter] = grad0
p.data.add_(-eta*v0)
print('grad size: {}\n'.format(p.grad.size()))
print('data size: {}\n'.format(p.data.size()))
else:
v0 = v0dict[iter-1]
grad0 = grad0dict[iter-1]
print('grad size: {}\n'.format(p.grad.size()))
print('data size: {}\n'.format(p.data.size()))
v = v0 + (p.grad - grad0)/S
v0dict[iter] = v
grad0dict[iter] = grad0
p.data.add_(-eta*v)
optimizer.zero_grad()
iter += 1
```

Could someone show me where I’ve gone wrong? Any help would be greatly appreciated.