I am trying to share weights between two layers d1 and d2 (with a transpose), and I get strange behavior. The sharing is set up in the block marked "start block" / "end block" in the code below, and the result depends on where that block sits:
- block at position 1 (before .to(device)):
  - using device('cpu'): the weights are shared
  - using device('cuda'): the weights are not shared; the assert fails after the first optimizer.step()
- block at position 2 (after .to(device)):
  - using device('cpu'): the weights are shared
  - using device('cuda'): the weights are shared

=> I would like to keep the block at position 1, but in a clean way. My guess at the cause is sketched right below.
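My assumption (not verified here; the check below is my own sketch, not part of the original repro) is that Module.to(device) moves every Parameter to a fresh CUDA tensor one by one, so two parameters that merely alias the same CPU storage end up as two independent GPU tensors. Comparing data_ptr() before and after the move makes this visible:

import torch
from torch.nn import Linear, Parameter

d1 = Linear(5, 2, True)
d2 = Linear(2, 5, True)
d2.weight = Parameter(d1.weight.transpose(0, 1))  # a view: same storage, transposed strides

print(d1.weight.data_ptr() == d2.weight.data_ptr())  # True: one storage, two views
d1.to('cuda'); d2.to('cuda')  # each Parameter is copied to the GPU separately
print(d1.weight.data_ptr() == d2.weight.data_ptr())  # False: the tie is broken

Full repro: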
import random

import torch
from torch.nn import Linear, Parameter, Sequential, Sigmoid

device = torch.device('cuda')  # 'cuda' or 'cpu'

d1 = Linear(5, 2, True)
d2 = Linear(2, 5, True)

# POSITION 1
# start block: tie d2.weight to a transposed view of d1.weight
d1_w = d1.weight
d2.weight = Parameter(d1_w.transpose(0, 1))
d2_w = d2.weight
# end block

D = Sequential(
    Sequential(d1, Sigmoid()),
    Sequential(d2, Sigmoid()),
).to(device)

# POSITION 2: moving the block here, after .to(device), keeps the weights shared

# check that the weights are shared at the start
assert (d1_w == d2_w.transpose(0, 1)).all()
print(d1_w)
print(d2_w.transpose(0, 1))

optimizer = torch.optim.Adam(D.parameters(), 0.001)
loss_func = torch.nn.MSELoss()

for _ in range(40):
    # just a random training step
    optimizer.zero_grad()
    input = torch.Tensor([random.randint(1, 40) for _ in range(10)]).reshape([2, 5]).to(device)
    target = torch.Tensor([random.randint(1, 40) for _ in range(10)]).reshape([2, 5]).to(device)
    hat_target = D(input)
    loss = loss_func(hat_target, target)
    loss.backward()
    optimizer.step()

    # check that the weights are still equal
    d1_w = D[0][0].weight
    d2_w = D[-1][0].weight
    print("====")
    print(d1_w)
    print(d2_w.transpose(0, 1))
    assert (d1_w == d2_w.transpose(0, 1)).all()
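A possible clean alternative (my sketch; TiedTranspose is a name I made up, not an existing PyTorch module): keep a single Parameter and apply the transpose at call time with torch.nn.functional.linear, so there is no second weight tensor for .to(device) to un-share:

import torch
import torch.nn.functional as F
from torch.nn import Linear, Module, Parameter

class TiedTranspose(Module):
    """Behaves like Linear(tied.out_features, tied.in_features), but reuses
    tied.weight transposed; only the bias is a parameter of its own."""

    def __init__(self, tied: Linear):
        super().__init__()
        self.tied = tied  # a reference to the module, not a copy of the weight
        self.bias = Parameter(torch.zeros(tied.in_features))

    def forward(self, x):
        # Transpose on the fly: there is literally one weight tensor, so
        # device moves and optimizer steps cannot desynchronize the layers.
        return F.linear(x, self.tied.weight.t(), self.bias)

Replacing d2 with TiedTranspose(d1) would keep the layers tied on both devices by construction, since only one weight tensor ever exists; Module.parameters() deduplicates the shared weight, so Adam updates it only once per step.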