BUG: shared weights behave differently on CPU and CUDA

I am trying to share weights between two layers d1 and d2 (with a transpose), and I get strange behavior.
The weights are tied in the block marked in the code below; what happens depends on where that block is placed (POSITION 1 or POSITION 2).

  • block at POSITION 1 (before .to(device)):
    • using device('cpu'): the weights are shared
    • using device('cuda'): the weights are not shared; the assertion fails after the first optimizer.step() (the storage-pointer check below hints at why)
  • block at POSITION 2 (after .to(device)):
    • using device('cpu'): the weights are shared
    • using device('cuda'): the weights are shared

=> However, I would like to keep the block at POSITION 1, and in a clean way (a possible approach is sketched after the repro).
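
For what it's worth, my guess at why the device matters (this storage-pointer check is my own diagnostic, not part of the failing repro): .to('cuda') copies each parameter to the GPU separately, so two parameters that shared CPU storage end up as independent CUDA tensors. A standalone check, with fresh layer names e1/e2 to keep it separate from the repro below:

import torch
from torch.nn import Linear, Parameter

e1 = Linear(5, 2, bias=True)
e2 = Linear(2, 5, bias=True)
e2.weight = Parameter(e1.weight.transpose(0, 1))  # shares storage with e1.weight

# before moving: both parameters point at the same underlying storage
print(e1.weight.data_ptr() == e2.weight.data_ptr())  # True

e1.to('cuda')
e2.to('cuda')

# after moving: each parameter got its own CUDA copy,
# so (as far as I can tell) the storages are no longer shared
print(e1.weight.data_ptr() == e2.weight.data_ptr())  # expected: False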

import random
import torch
from torch.nn import Linear, Sequential, Sigmoid, Parameter

device = torch.device('cuda')  # switch between 'cuda' and 'cpu' to compare

d1 = Linear(5, 2, bias=True)
d2 = Linear(2, 5, bias=True)

# POSITION 1
# start of the weight-tying block: d2.weight reuses d1.weight, transposed
d1_w = d1.weight
d2.weight = Parameter(d1_w.transpose(0, 1))
d2_w = d2.weight
# end of the weight-tying block

D = Sequential(
    Sequential(d1, Sigmoid()),
    Sequential(d2, Sigmoid()),
).to(device)

# POSITION 2
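# (This is the POSITION 2 variant referenced above: the same weight-tying
#  block, but placed here, i.e. after .to(device). With the block here
#  instead of at POSITION 1, the weights stay shared on CPU and on CUDA.)
# d1_w = d1.weight
# d2.weight = Parameter(d1_w.transpose(0, 1))
# d2_w = d2.weight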



# check that the weights are shared at the start
assert (d1_w == d2_w.transpose(0, 1)).all()
print(d1_w)
print(d2_w.transpose(0, 1))

optimizer = torch.optim.Adam(D.parameters(), 0.001)
loss_func = torch.nn.MSELoss()

for _ in range(40):
    # just a random training step
    optimizer.zero_grad()

    input = torch.Tensor([random.randint(1, 40) for _ in range(10)]).reshape([2, 5]).to(device)
    target = torch.Tensor([random.randint(1, 40) for _ in range(10)]).reshape([2, 5]).to(device)
    hat_target = D(input)

    loss = loss_func(hat_target, target)
    loss.backward()
    optimizer.step()

    # check that the weights are still equal
    d1_w = D[0][0].weight
    d2_w = D[-1][0].weight
    print("====")
    print(d1_w)
    print(d2_w.transpose(0,1))
    assert (d1_w == d2_w.transpose(0, 1)).all()
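
For completeness, one possible "clean" direction (just a sketch from my side, not something the report above establishes; the TiedDecoder module, its name, and its bias handling are my own assumptions) is to never create a second Parameter at all and instead reuse d1.weight, transposed, inside the decoder's forward pass. With a single Parameter there is nothing for .to(device) to duplicate, so the tie should hold on CPU and CUDA alike:

import torch
import torch.nn.functional as F
from torch.nn import Linear, Module, Parameter, Sequential, Sigmoid

class TiedDecoder(Module):
    # hypothetical helper: applies the encoder's weight transposed,
    # so encoder and decoder literally use the same Parameter
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder  # registered as a submodule, moves with .to(device)
        self.bias = Parameter(torch.zeros(encoder.in_features))

    def forward(self, x):
        # F.linear expects a weight of shape (out_features, in_features);
        # encoder.weight.t() has shape (5, 2) here, mapping 2 -> 5
        return F.linear(x, self.encoder.weight.t(), self.bias)

d1 = Linear(5, 2, bias=True)
D = Sequential(
    Sequential(d1, Sigmoid()),
    Sequential(TiedDecoder(d1), Sigmoid()),
).to(torch.device('cuda'))

optimizer = torch.optim.Adam(D.parameters(), 0.001)

Since d1 is reachable both as D[0][0] and through the decoder, D.parameters() should still yield d1.weight only once, so the optimizer updates a single shared tensor.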