Using DataParallel and CUDA with non-trainable variables?

I have the following very simple ResNet that I currently use:

import torch
import torch.nn as nn

class ResNetSimple(nn.Module):
    def __init__(self, block, layers, channels):
        super().__init__()
        assert len(layers) == 3
        layers_ = []
        chan = channels
        for sublayers in layers:
            # one widening block, followed by `sublayers` constant-width blocks
            layers_.append(block(chan, chan + 1))
            chan += 1
            for layer in range(sublayers):
                layers_.append(block(chan, chan))
        self.layers = nn.Sequential(*layers_)
        self.channels_out = chan

    def forward(self, x):
        x = self.layers(x)
        return x



class ResnetBlock(nn.Module):
    def __init__(self, channels_in, channels_out, dt=0.1):
        super().__init__()
        self.dt = dt
        self.nl = nn.Tanh()
        self.linear = nn.Linear(channels_in, channels_out)
        self.channels_in = channels_in
        self.channels_out = channels_out
        if channels_in != channels_out:
            # plain tensor used to project the skip connection when the width changes
            self.scaling = torch.zeros((channels_in, channels_out))
            for i in range(min(channels_out, channels_in)):
                self.scaling[i, i] = 1

    def forward(self, x):
        a = self.linear(x)
        a = self.nl(a)
        if self.channels_in == self.channels_out:
            res = x + self.dt * a
        else:
            res = x @ self.scaling + self.dt * a
        return res

Afterwards I call:

descriptor = nn.DataParallel(descriptor).cuda()

where descriptor is the network defined above.

Now the problem comes when I actually run data through this network: self.scaling does not get transferred to the GPU by the call above. How do I ensure that this tensor gets moved to the correct device as well?
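
For illustration, a quick check along these lines shows the mismatch (assuming the first block of the Sequential is one with mismatched channel counts, as constructed above):

print(next(descriptor.parameters()).device)        # cuda:0
print(descriptor.module.layers[0].scaling.device)  # cpu -- never moved by .cuda()

and the forward pass then fails with a device-mismatch error at x @ self.scaling.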

Since scaling doesn’t require gradients, you should register this tensor as a buffer via

self.register_buffer('scaling', torch.zeros(...))

This will make sure the tensor gets pushed to the specified device(s) together with the module's parameters.
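
For reference, here is a minimal sketch of the block's __init__ with the buffer registered (the forward pass stays unchanged, since the buffer is still accessible as self.scaling):

class ResnetBlock(nn.Module):
    def __init__(self, channels_in, channels_out, dt=0.1):
        super().__init__()
        self.dt = dt
        self.nl = nn.Tanh()
        self.linear = nn.Linear(channels_in, channels_out)
        self.channels_in = channels_in
        self.channels_out = channels_out
        if channels_in != channels_out:
            # registered as a buffer: moved by .cuda()/.to() and replicated by
            # DataParallel, but not returned by .parameters()
            scaling = torch.zeros((channels_in, channels_out))
            for i in range(min(channels_out, channels_in)):
                scaling[i, i] = 1
            self.register_buffer('scaling', scaling)

Buffers are also saved in the module's state_dict by default, so the scaling matrix is checkpointed alongside the trainable parameters.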


That worked perfectly, thank you!