I have the following very simple ResNet that I currently use:
import torch
import torch.nn as nn

class ResNetSimple(nn.Module):
    def __init__(self, block, layers, channels):
        super().__init__()
        assert len(layers) == 3
        layers_ = []
        chan = channels
        for sublayers in layers:
            # widen by one channel, then stack `sublayers` width-preserving blocks
            layers_.append(block(chan, chan + 1))
            chan += 1
            for layer in range(sublayers):
                layers_.append(block(chan, chan))
        self.layers = nn.Sequential(*layers_)
        self.channels_out = chan

    def forward(self, x):
        x = self.layers(x)
        return x


class ResnetBlock(nn.Module):
    def __init__(self, channels_in, channels_out, dt=0.1):
        super().__init__()
        self.dt = dt
        self.nl = nn.Tanh()
        self.linear = nn.Linear(channels_in, channels_out)
        self.channels_in = channels_in
        self.channels_out = channels_out
        if channels_in != channels_out:
            # fixed (non-trainable) projection that pads/truncates the skip connection
            self.scaling = torch.zeros((channels_in, channels_out))
            for i in range(min(channels_out, channels_in)):
                self.scaling[i, i] = 1

    def forward(self, x):
        a = self.linear(x)
        a = self.nl(a)
        if self.channels_in == self.channels_out:
            res = x + self.dt * a
        else:
            res = x @ self.scaling + self.dt * a
        return res
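For context, I build and call it roughly like this (the block counts and channel width here are just placeholders):

descriptor = ResNetSimple(ResnetBlock, layers=[2, 2, 2], channels=4)
x = torch.randn(8, 4)   # batch of 8 samples, 4 input features
out = descriptor(x)     # shape (8, 7): width grows by one per stage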
Afterwards I call:
descriptor = nn.DataParallel(descriptor).cuda()
where descriptor is the network defined above.
Now the problem comes when I actually run data through this network: since self.scaling is a plain tensor attribute rather than a registered parameter or buffer, it does not get transferred to the GPU by the call above. How do I ensure that this tensor gets moved to the GPU along with the rest of the model?
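From what I can tell, the usual fix is to register the tensor as a buffer instead of assigning it as a plain attribute, since buffers are moved by .cuda()/.to() and replicated to each device by nn.DataParallel. A sketch of the modified block (only the if branch changes):

import torch
import torch.nn as nn

class ResnetBlock(nn.Module):
    def __init__(self, channels_in, channels_out, dt=0.1):
        super().__init__()
        self.dt = dt
        self.nl = nn.Tanh()
        self.linear = nn.Linear(channels_in, channels_out)
        self.channels_in = channels_in
        self.channels_out = channels_out
        if channels_in != channels_out:
            scaling = torch.zeros((channels_in, channels_out))
            for i in range(min(channels_out, channels_in)):
                scaling[i, i] = 1
            # A buffer is part of the module's state: .cuda()/.to() move it,
            # it is saved in the state_dict, and DataParallel replicates it
            # to each device, but it receives no gradients.
            self.register_buffer('scaling', scaling)

    def forward(self, x):
        a = self.nl(self.linear(x))
        if self.channels_in == self.channels_out:
            return x + self.dt * a
        return x @ self.scaling + self.dt * a

Is register_buffer the right mechanism here, or is there a preferred idiom for a fixed, non-trainable projection like this?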