I have the following very simple ResNet that I currently use:
import torch
import torch.nn as nn

class ResNetSimple(nn.Module):
    def __init__(self, block, layers, channels):
        super().__init__()
        assert len(layers) == 3
        layers_ = []
        chan = channels
        for sublayers in layers:
            # each stage: one block that widens by a channel,
            # then `sublayers` constant-width blocks
            layers_.append(block(chan, chan + 1))
            chan += 1
            for layer in range(sublayers):
                layers_.append(block(chan, chan))
        self.layers = nn.Sequential(*layers_)
        self.channels_out = chan

    def forward(self, x):
        x = self.layers(x)
        return x
class ResnetBlock(nn.Module):
    def __init__(self, channels_in, channels_out, dt=0.1):
        super().__init__()
        self.dt = dt
        self.nl = nn.Tanh()
        self.linear = nn.Linear(channels_in, channels_out)
        self.channels_in = channels_in
        self.channels_out = channels_out
        if channels_in != channels_out:
            # fixed, non-trainable projection that pads/truncates channels;
            # note this is a plain tensor attribute, not registered
            # as a parameter or buffer of the module
            self.scaling = torch.zeros((channels_in, channels_out))
            for i in range(min(channels_out, channels_in)):
                self.scaling[i, i] = 1

    def forward(self, x):
        a = self.linear(x)
        a = self.nl(a)
        if self.channels_in == self.channels_out:
            res = x + self.dt * a
        else:
            res = x @ self.scaling + self.dt * a
        return res
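For concreteness, this is roughly how I build and run the network on the CPU, where everything works fine (the layer counts and widths here are placeholders, not my real configuration):

# placeholder sizes, just to illustrate the call signature
descriptor = ResNetSimple(ResnetBlock, layers=[2, 2, 2], channels=8)
x = torch.randn(4, 8)   # batch of 4 samples with 8 features
out = descriptor(x)     # shape (4, 11): each of the 3 stages adds one channel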
Afterwards I call:
descriptor = nn.DataParallel(descriptor).cuda()
where descriptor is the network defined above.
The problem appears when I actually run data through the network: self.scaling does not get moved to the GPU by the call above (it is a plain tensor attribute, not a parameter or buffer), so the x @ self.scaling multiplication fails with a CPU/GPU device mismatch. How do I make sure this tensor gets transferred along with the rest of the module?
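One direction I have been considering (I am not sure it is the intended mechanism) is to register the tensor as a buffer, on the assumption that buffers are moved by .cuda()/.to() and replicated by DataParallel the same way parameters are. A minimal sketch of the modified block:

import torch
import torch.nn as nn

class ResnetBlock(nn.Module):
    def __init__(self, channels_in, channels_out, dt=0.1):
        super().__init__()
        self.dt = dt
        self.nl = nn.Tanh()
        self.linear = nn.Linear(channels_in, channels_out)
        self.channels_in = channels_in
        self.channels_out = channels_out
        if channels_in != channels_out:
            scaling = torch.zeros((channels_in, channels_out))
            for i in range(min(channels_out, channels_in)):
                scaling[i, i] = 1
            # registering the tensor as a buffer makes it part of the
            # module's state, so (as I understand it) .cuda()/.to() and
            # DataParallel replication move it along with the parameters
            self.register_buffer("scaling", scaling)

    def forward(self, x):
        a = self.linear(x)
        a = self.nl(a)
        if self.channels_in == self.channels_out:
            res = x + self.dt * a
        else:
            res = x @ self.scaling + self.dt * a
        return res

Would register_buffer be the right approach here, or should I instead make it an nn.Parameter with requires_grad=False?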