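I couldn’t reproduce the issue on my setup. Note that you cannot use `ll` to compare the weights after each iteration: the optimizer updates `self.w.weight` in place, so the underlying storage does not change and every element appended to the list actually points to the same memory. To keep a real snapshot per iteration you would have to append a copy instead, e.g. `self.w.weight.detach().clone()`. The changes to the weights also seem to be very small, depending on the learning rate.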

Code I used:

```
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
# Module-level log: SiameseNetwork.forward appends `self.w.weight.data` to it
# on every call, and the script prints the storage pointer of each entry.
ll = []
class SiameseNetwork(nn.Module):
    """Two-input network that runs both inputs through one shared backbone.

    The backbone is a torchvision DenseNet-161 whose classifier is replaced
    by Dropout -> Linear(2208, 256) -> ReLU -> Dropout. On top of it sit
    three linear layers: ``fc1`` (256 -> 23), ``fc2`` (46 -> 10) and
    ``w`` (10 -> 23).
    """

    def __init__(self, device):
        """Build the backbone and the heads.

        Args:
            device: accepted for interface compatibility; it is not read
                anywhere in this class.
        """
        super().__init__()
        self.model = torchvision.models.densenet161(pretrained=True)
        # This 23-way head is overwritten by the Sequential below; presumably
        # it was only needed to load the checkpoint in the disabled lines.
        self.model.classifier = nn.Linear(2208, 23)
        #self.name = './ModelsCh/Densent161_focal_best.pt'
        #self.model.load_state_dict(torch.load(self.name))
        self.model.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(2208, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
        )
        self.fc1 = nn.Linear(256, 23)
        self.fc2 = nn.Linear(46, 10)
        self.ptsigmoid = nn.Sigmoid()
        self.drop = nn.Dropout(0.5)
        self.w = nn.Linear(10, 23)

    def forward_once(self, x):
        """Run a single input through the backbone and ``fc1``.

        Returns:
            A pair ``(output, aux)``. In training mode ``output`` is the
            ``fc1`` result and ``aux`` is the constant ``1``. In eval mode
            ``aux`` is ``fc2`` applied to the ``fc1`` result concatenated
            with itself, and ``output`` is ``w(aux)`` added to the ``fc1``
            result.
        """
        features = self.fc1(F.relu(self.model(x)))
        if self.training:
            return features, 1

        doubled = torch.cat((features, features), dim=1)
        aux = self.fc2(F.relu(doubled))
        return self.w(aux) + features, aux

    def forward(self, input1, input2):
        """Process both inputs and add a shared correction to each branch.

        Side effects: appends ``self.w.weight.data`` to the module-level
        list ``ll`` and prints the sum of ``self.w.weight``.

        Returns:
            ``(output1, output2, feaco1)`` where ``feaco1`` is the ``fc2``
            result computed from the concatenated branch outputs, and each
            ``outputN`` is the branch output plus ``w(feaco1)``.
        """
        # The second value returned by forward_once is not used here.
        branch1, _ = self.forward_once(input1)
        branch2, _ = self.forward_once(input2)

        joined = torch.cat((branch1, branch2), dim=1)
        feaco1 = self.fc2(F.relu(self.drop(joined)))
        shared = self.w(feaco1)

        ll.append(self.w.weight.data)
        print(self.w.weight.sum())

        return shared + branch1, shared + branch2, feaco1
# Repro driver: ten SGD steps on one fixed random input pair, printing the
# loss each step and, at the end, the storage pointer of every tensor that
# the forward pass collected in `ll`.
net = SiameseNetwork('cuda').cuda()

in1 = torch.randn(1, 3, 224, 224, device='cuda')
in2 = torch.randn(1, 3, 224, 224, device='cuda')
target = torch.tensor([3], dtype=torch.long, device='cuda')

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    net.parameters(),
    lr=1e-1,
    momentum=0.9,
    weight_decay=1e-4,
)

for step in range(10):
    # Only the second branch output is fed to the loss.
    _, out, _ = net(in1, in2)
    loss = criterion(out, target)
    optimizer.zero_grad()
    loss.backward()
    print(loss.item())
    optimizer.step()

print([entry.storage().data_ptr() for entry in ll])
```

Output:

```
tensor(1.6240, device='cuda:0', grad_fn=<SumBackward0>)
3.283719062805176
tensor(1.6240, device='cuda:0', grad_fn=<SumBackward0>)
1.6470584869384766
tensor(1.6240, device='cuda:0', grad_fn=<SumBackward0>)
0.0
tensor(1.6239, device='cuda:0', grad_fn=<SumBackward0>)
0.0
tensor(1.6239, device='cuda:0', grad_fn=<SumBackward0>)
0.0
tensor(1.6238, device='cuda:0', grad_fn=<SumBackward0>)
0.0
tensor(1.6237, device='cuda:0', grad_fn=<SumBackward0>)
0.0
tensor(1.6236, device='cuda:0', grad_fn=<SumBackward0>)
0.0
tensor(1.6235, device='cuda:0', grad_fn=<SumBackward0>)
0.0
tensor(1.6234, device='cuda:0', grad_fn=<SumBackward0>)
0.0
[139980355652608, 139980355652608, 139980355652608, 139980355652608, 139980355652608, 139980355652608, 139980355652608, 139980355652608, 139980355652608, 139980355652608]
```