I was trying to add a Hebbian learning term (under which the weight update is proportional to the product of the pre- and post-synaptic activations) to the inner optimization. To that end, I modified the code as follows:

```
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import _stateless
class MyDataset(Dataset):
    """Synthetic classification dataset of random features and labels.

    Each of the ``N`` samples is a 10-dimensional feature vector drawn
    uniformly from [0, 1) together with an integer class label in {0, 1, 2}.
    All data is generated once, at construction time.
    """

    def __init__(self, N):
        """Generate ``N`` random (features, label) pairs."""
        self.N = N
        self.x = torch.rand(self.N, 10)
        self.y = torch.randint(0, 3, (self.N,))

    def __len__(self):
        """Return the number of samples."""
        return self.N

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]
class MyModel(nn.Module):
    """Two-layer perceptron (10 -> 10 -> 3) carrying two extra scalar parameters.

    ``alpha`` and ``beta`` are registered as parameters of the module but are
    not used in ``forward``; they are read from the model by the training
    code that performs the parameter update.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 10)
        self.fc2 = nn.Linear(10, 3)
        self.relu = nn.ReLU()
        # Scalar coefficients, randomly initialised; not part of the forward pass.
        self.alpha = nn.Parameter(torch.randn(1))
        self.beta = nn.Parameter(torch.randn(1))

    def forward(self, x):
        """Run the network and expose the per-layer activations.

        Returns:
            A pair ``((x, hidden), logits)``: ``x`` is the input itself,
            ``hidden`` is the ReLU output of the first layer, and ``logits``
            is the un-normalised output of the second layer.
        """
        y = self.relu(self.fc1(x))
        return (x, y), self.fc2(y)
def Optim(params, alpha, beta, y, logits):
    """Apply one gradient + Hebbian update to the layer parameters in ``params``.

    For every entry whose name ends in ``.weight`` or ``.bias`` the update is

        update = -alpha * p.grad + beta * hebbian_term

    where the Hebbian term is the product of post- and pre-synaptic
    activations for a weight, and the post-synaptic activation for a bias,
    both summed over the batch dimension. All other entries (for example
    ``alpha`` and ``beta`` themselves) are left untouched.

    Args:
        params: Mapping from parameter name to tensor. Each layer is expected
            to contribute its ``weight`` followed by its ``bias``, and
            ``p.grad`` must already be populated for those tensors. The
            mapping is modified in place.
        alpha: Coefficient of the gradient term.
        beta: Coefficient of the Hebbian term.
        y: Iterable of activations, starting with the network input and
            followed by each hidden layer's output.
        logits: Output of the last layer; its softmax over ``dim=1`` is used
            as the final activation.

    Returns:
        None. Updated tensors are written back into ``params``, and the
        applied update is also stored on the old tensor as ``p.update``.
    """
    # -- add network output to activations
    activations = [*y, torch.softmax(logits, dim=1)]

    layer = 0
    for name, p in params.items():
        # Match on the suffix so that layer names of any length are recognised.
        if name.endswith('.weight'):
            # (out, B) @ (B, in): outer product of post and pre activations,
            # summed over the batch.
            hebbian = torch.matmul(activations[layer + 1].T, activations[layer])
        elif name.endswith('.bias'):
            # Sum over the batch so the result always has the bias' shape
            # (same as squeezing the batch dimension when the batch size is 1).
            hebbian = activations[layer + 1].sum(dim=0)
        else:
            continue

        p.update = -alpha * p.grad + beta * hebbian
        params[name] = p + p.update  # replaces an existing key only, safe while iterating

        if name.endswith('.bias'):
            # The bias is the last parameter of a layer: move on to the next one.
            layer += 1
# -- experiment setup
epochs = 20
N = 10
dataset = DataLoader(dataset=MyDataset(N), batch_size=1)
model = MyModel()
loss_func = nn.CrossEntropyLoss()
# Only `alpha` is handed to Adam. The optimizer gets its own name so that the
# imported `optim` module is not rebound to an optimizer instance.
meta_optimizer = optim.Adam([model.alpha], lr=1e-3)
# Debugging aid: makes autograd report the forward operation behind a failing
# backward call.
torch.autograd.set_detect_anomaly(True)

# NOTE(review): `_stateless` is a private torch module; newer PyTorch releases
# appear to expose the same call publicly -- confirm against the installed version.
for i in range(epochs):
    model.train()
    train_loss = 0
    # Every epoch starts from the model's current parameters.
    params = dict(model.named_parameters())

    # -- inner loop: one differentiable update per batch
    for batch_idx, (x, y) in enumerate(dataset):
        # Work on clones so the model's own parameter tensors are never replaced.
        params = {k: v.clone() for k, v in params.items()}
        activations, logits = _stateless.functional_call(model, params, x)  # predict
        loss_inner = loss_func(logits, y)  # loss
        # create_graph=True keeps the gradients differentiable, so the meta
        # loss below can be back-propagated through the updates.
        loss_inner.backward(create_graph=True, inputs=params.values())  # compute grad
        train_loss += loss_inner.item()  # store loss
        Optim(params, model.alpha, model.beta, activations, logits)

    # Each accumulated loss is a per-batch value, so average over the batches.
    print('Train Epoch: {}\tLoss: {:.6f}'.format(i, train_loss / len(dataset)))

    # -- meta step: evaluate the adapted parameters on the last batch seen
    activations, logits = _stateless.functional_call(model, params, x)  # predict
    loss_meta = loss_func(logits, y)
    # backward() accumulates into .grad; clear the gradients left over from the
    # previous epoch so that Adam only sees the current meta-gradient.
    model.zero_grad()
    loss_meta.backward()
    meta_optimizer.step()
```

This gives me the following error:

```
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1]] is at version 11; expected version 10 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
```

When I change the line

```
Optim(params, model.alpha, model.beta, activations, logits)
```

to

```
Optim(params, model.alpha, model.beta.clone(), activations, logits)
```

the problem vanishes. Since `model.alpha` does not need cloning, I have two questions:

- Is using `model.beta.clone()` the correct solution?
- Why, unlike for `model.beta`, is cloning not needed when passing `model.alpha` to `Optim`?