Share a layer between two different models

Hi,

Is the following the right way to share a layer between two different networks, or is it better to have a separate module for the shared layer?

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class A(nn.Module):
    """Linear projection followed by a single-layer, unidirectional GRU.

    The GRU is exposed as ``rec_layer`` so the same module object can be
    handed to another network and shared with it.
    """

    def __init__(self):
        super(A, self).__init__()
        hsize_fc = 5
        self.hsiz_rec = 7
        self.fc = nn.Linear(3, hsize_fc)
        self.rec_layer = nn.GRU(hsize_fc,
                                self.hsiz_rec,
                                bidirectional=False,
                                batch_first=True,
                                num_layers=1)

    def forward(self, x):
        """Return the final GRU hidden state for every sample in ``x``.

        Args:
            x: Tensor whose first dimension is the batch and whose last
                dimension has 3 features (the input size of ``self.fc``).

        Returns:
            Tensor of shape ``(batch, self.hsiz_rec)``.
        """
        bsize = x.shape[0]
        x = self.fc(x)
        # Treat every sample as a sequence of length 1 (batch_first layout).
        xs = x.view(bsize, 1, -1)
        # Build the initial hidden state with the dtype/device of the GRU
        # input; a bare torch.zeros(...) is always CPU/float32 and breaks as
        # soon as the module is moved with .to(device) or .double().
        ih = torch.zeros(1, bsize, self.hsiz_rec,
                         dtype=xs.dtype, device=xs.device)
        _, h = self.rec_layer(xs, ih)

        # h is (num_layers=1, batch, hidden); drop the layer dimension.
        return h.squeeze(0)

class B(nn.Module):
    """Network with its own linear layer and a recurrent layer owned elsewhere.

    Args:
        shared_re: Recurrent module to reuse (in the accompanying script this
            is ``net_A.rec_layer``). It is stored by reference, so its
            parameters are the very same tensors used by the other network.
            It must accept inputs with 5 features and use a hidden size of 7
            to match the sizes hard-coded here.
    """

    def __init__(self, shared_re):
        super(B, self).__init__()
        hsize_fc = 5
        self.hsiz_rec = 7
        self.fc = nn.Linear(3, hsize_fc)
        self.rec_layer = shared_re

    def forward(self, x):
        """Return the final hidden state of the shared layer for ``x``.

        Args:
            x: Tensor whose first dimension is the batch and whose last
                dimension has 3 features (the input size of ``self.fc``).

        Returns:
            Tensor of shape ``(batch, self.hsiz_rec)``.
        """
        bsize = x.shape[0]
        x = self.fc(x)
        # Treat every sample as a sequence of length 1 (batch_first layout).
        xs = x.view(bsize, 1, -1)
        # Build the initial hidden state with the dtype/device of the GRU
        # input; a bare torch.zeros(...) is always CPU/float32 and breaks as
        # soon as the module is moved with .to(device) or .double().
        ih = torch.zeros(1, bsize, self.hsiz_rec,
                         dtype=xs.dtype, device=xs.device)

        #### shared layer
        _, h = self.rec_layer(xs, ih)
        ####

        # Drop the leading layer dimension of the hidden state.
        return h.squeeze(0)
       
net_A = A()
# net_B receives the very same GRU module object, so both networks (and both
# optimizers) hold the same recurrent parameters.
net_B = B(shared_re = net_A.rec_layer)
optim_A = optim.Adam(net_A.parameters())
optim_B = optim.Adam(net_B.parameters())
dummy = torch.randn(5,7)

x_A = torch.rand(5,3)
x_B = torch.rand(5,3)

###### A
y_A_hat = net_A(x_A)
loss_A = F.mse_loss(y_A_hat, dummy)

optim_A.zero_grad()
loss_A.backward()
optim_A.step()

##### B
# Run net_B's forward pass only AFTER optim_A.step(): the step modifies the
# shared GRU weights in place, and a graph built before that update would be
# rejected by autograd's in-place check when loss_B.backward() runs.
y_B_hat = net_B(x_B)
loss_B = F.mse_loss(y_B_hat, dummy)

# zero_grad() also clears the gradients that loss_A.backward() left on the
# shared layer, so this update uses loss_B's gradients only.
optim_B.zero_grad()
loss_B.backward()
optim_B.step()

In addition, if the shared layer is implemented as above, it will be updated whenever the optimizer of either A or B runs, is that correct? (i.e., running either of the following will update the shared layer's weights.)

# Same two updates as before, in the opposite order: B first, then A.
# Each group clears the accumulated gradients, backpropagates its own loss
# and then applies that optimizer's update.
optim_B.zero_grad()
loss_B.backward()
optim_B.step()

###### A
optim_A.zero_grad()
loss_A.backward()
optim_A.step()

Thanks

1 Like

Yes, this should work.
Here is a small (simplified) example:


class A(nn.Module):
    """Minimal model consisting of a single 3 -> 1 linear layer (``fc``)."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(3, 1)

    def forward(self, x):
        """Apply ``fc`` to ``x`` and return the result."""
        return self.fc(x)

class B(nn.Module):
    """Model whose only layer is a module supplied by the caller.

    Args:
        shared_re: Layer to reuse; it is stored by reference as
            ``shared_fc``, not copied.
    """

    def __init__(self, shared_re):
        super().__init__()
        self.shared_fc = shared_re

    def forward(self, x):
        """Run ``x`` through the shared layer and return its output."""
        return self.shared_fc(x)
       
# Build both models around one Linear layer: net_B stores net_A.fc itself,
# and each model gets its own Adam optimizer.
net_A = A()
net_B = B(shared_re = net_A.fc)
optim_A = optim.Adam(net_A.parameters())
optim_B = optim.Adam(net_B.parameters())
target = torch.randn(1,1)

# Forward passes and losses for both models are computed up front.
x_A = torch.rand(1, 3)
y_A_hat = net_A(x_A)
loss_A = F.mse_loss(y_A_hat, target)

x_B = torch.rand(1, 3)
y_B_hat = net_B(x_B)
loss_B = F.mse_loss(y_B_hat, target)

###### A
optim_A.zero_grad()
loss_A.backward()

# Lines starting with ">" are the pasted output of the preceding print.
# Both attributes refer to the same layer, so the same gradient is shown.
print(net_A.fc.weight.grad)
> tensor([[0.5219, 0.6192, 0.8145]])
print(net_B.shared_fc.weight.grad)
> tensor([[0.5219, 0.6192, 0.8145]])

optim_A.step()

##### B
# NOTE(review): loss_B's graph was built before optim_A.step() changed the
# shared weights; recent PyTorch versions may reject this pattern when the
# old weights are needed for backward -- confirm on the version in use.
optim_B.zero_grad()
loss_B.backward()

# After backpropagating loss_B the shared layer holds a different gradient,
# again identical when read through either model.
print(net_A.fc.weight.grad)
> tensor([[0.6641, 0.0777, 0.2146]])
print(net_B.shared_fc.weight.grad)
> tensor([[0.6641, 0.0777, 0.2146]])

optim_B.step()
7 Likes

Will having two optimizers like in this example work the same as having one optimizer with net_A.parameters() + net_B.parameters()? Or will this cause a better result for net_B because it’s updated last with net_B’s loss?