RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_addmm)

import torch
import torch.nn as nn
import pytorch_lightning as pl


class NICE(pl.LightningModule):

    def __init__(self, in_features=784, hidden_features=1000, num_coupling=4):
        super().__init__()
        self.save_hyperparameters()
        self.layers = nn.ModuleList()
        self.num_coupling = num_coupling
        # One MLP per additive coupling layer; each maps one half of the features
        # to an additive shift for the other half.
        for _ in range(self.num_coupling):
            self.layers.append(nn.Sequential(
                nn.Linear(in_features // 2, hidden_features), nn.ReLU(),
                nn.Linear(hidden_features, hidden_features), nn.ReLU(),
                nn.Linear(hidden_features, hidden_features), nn.ReLU(),
                nn.Linear(hidden_features, hidden_features), nn.ReLU(),
                nn.Linear(hidden_features, hidden_features), nn.ReLU(),
                nn.Linear(hidden_features, in_features // 2),))
        self.scale = nn.Parameter(torch.zeros(in_features))
        # Logistic distribution for the prior (logit of a uniform)
        base = torch.distributions.uniform.Uniform(torch.tensor(0.0).to('cuda'), torch.tensor(1.0).to('cuda'))
        transforms = [torch.distributions.transforms.SigmoidTransform().inv,
                      torch.distributions.transforms.AffineTransform(torch.tensor(0.0).to('cuda'),
                                                                     torch.tensor(1.0).to('cuda'))]
        self.prior = torch.distributions.TransformedDistribution(base, transforms)

    def forward(self, x):
        # x: (batch_size, 784); returns the log-likelihood of each sample
        x = x.view(x.size(0), -1)
        z, log_det = self.forward_(x)
        log_prob = self.prior.log_prob(z).sum(dim=1) + log_det
        return log_prob

    def forward_(self, x):
        # x: (batch_size, 784)
        z = x
        log_det = 0
        s = torch.exp(self.scale)  # positive scale
        for i in range(self.num_coupling):
            # Check the number of layers in self.layers
            # print(len(self.layers)) # 4
            # Alternate which half is passed through unchanged in each coupling layer
            z1 = z.chunk(2, dim=1)[0] if (i % 2 == 0) else z.chunk(2, dim=1)[1]
            z2 = z.chunk(2, dim=1)[1] if (i % 2 == 0) else z.chunk(2, dim=1)[0]
            z2 = z2 + self.layers[i](z1)
            z = torch.cat([z1, z2], dim=1) if (i % 2 == 0) else torch.cat([z2, z1], dim=1)
        # Final diagonal scaling layer; its log-determinant is the sum of the log-scales
        z = z * s
        log_det += torch.log(s).sum()
        return z, log_det

    def inverse(self, z):
        # z: (batch_size, 784)
        z = z.view(z.size(0), -1)
        x = self.inverse_(z)
        x = x.view(x.size(0), 1, 28, 28)
        return x

    def inverse_(self, z):
        # z: (batch_size, 784)
        x = z
        s = torch.exp(self.scale).to('cuda')
        x = x / s
        # print(x.is_cuda) # True
        for i in range(self.num_coupling - 1, -1, -1):
            # Assumes that num_coupling is even
            x1 = x.chunk(2, dim=1)[0] if (i % 2 == 1) else x.chunk(2, dim=1)[1]
            x2 = x.chunk(2, dim=1)[1] if (i % 2 == 1) else x.chunk(2, dim=1)[0]
            x1 = x1 - self.layers[i](x2)
            x = torch.cat([x1, x2], dim=1) if (i % 2 == 1) else torch.cat([x2, x1], dim=1)
        return x

    def sample(self, img_shape):
        z = self.prior.sample(img_shape).to('cuda')
        x = self.inverse(z)
        return x
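
The training hooks are not shown in the post; presumably the model is trained by maximum likelihood on the log-probability returned by forward(). A minimal sketch of what such methods could look like inside the class (the hook bodies and the learning rate are assumptions, not the author's code):

    def training_step(self, batch, batch_idx):
        # Hypothetical hook (not shown in the original post): minimize the negative log-likelihood
        x, _ = batch
        loss = -self(x).mean()
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, _ = batch
        loss = -self(x).mean()
        self.log("val_loss", loss)  # monitored by the ModelCheckpoint callback used later in the thread

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)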

========================
It works for training, but an error occurs when sampling:

import matplotlib.pyplot as plt

samples = model.sample(img_shape=[16, 1, 28, 28])
samples = samples.view(16, 28, 28).cpu().numpy()
plt.figure(figsize=(10, 10))
for i in range(16):
    plt.subplot(4, 4, i + 1)
    plt.imshow(samples[i], cmap="gray")
    plt.axis("off")
plt.show()

Check whether self.layers was moved to the GPU and whether x2 was moved as well, since one of these two objects is still stored on the CPU and causes the device mismatch.
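
For example, a quick check like the following, placed just before the failing self.layers[i](x2) call in inverse_, would show which of the two is still on the CPU (purely illustrative, not part of the original reply):

print(next(self.layers[i].parameters()).device)  # device of the i-th coupling network
print(x2.device)                                 # device of the activation fed into it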

Thank you for the prompt reply. I checked that x1 and x2 are on the GPU and moved self.layers to the GPU using the following code, but I still get the same error.

def __init__(self, in_features=784, hidden_features=1000, num_coupling=4):
    super().__init__()
    self.save_hyperparameters()
    self.layers = nn.ModuleList()
    self.num_coupling = num_coupling
    for _ in range(self.num_coupling):
        self.layers.append(nn.Sequential(
            nn.Linear(in_features // 2, hidden_features), nn.ReLU(),
            nn.Linear(hidden_features, hidden_features), nn.ReLU(),
            nn.Linear(hidden_features, hidden_features), nn.ReLU(),
            nn.Linear(hidden_features, hidden_features), nn.ReLU(),
            nn.Linear(hidden_features, hidden_features), nn.ReLU(),
            nn.Linear(hidden_features, in_features // 2),))
    self.layers.to('cuda')
    self.scale = nn.Parameter(torch.zeros(in_features)).to('cuda')
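
As an aside, the last line above assigns the result of .to('cuda'), which is a plain tensor copy rather than the nn.Parameter itself, so scale is no longer registered with the module: it will not show up in model.parameters() and will not follow later model.to(...) calls. A minimal illustration (assuming a CUDA device is available):

import torch
import torch.nn as nn

class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        # .to() returns an unregistered copy; the Parameter itself is discarded
        self.scale = nn.Parameter(torch.zeros(4)).to('cuda')

print(list(Demo().parameters()))  # [] -- 'scale' was never registered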

BTW, I don't understand why this code works in training but not in sampling. I thought that self.layers in the model was moved to the GPU when I initialized the model with

model = NICE().to('cuda')

After training, shouldn't the model still be on the GPU?

Yes, you are right. Once the model is moved to the GPU, its submodules and parameters will stay there unless you reinitialize it. Since your training seems to work while the inference fails, could you check why the model changed? Are you recreating it somewhere?
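
For reference, moving the top-level module moves every registered submodule and parameter with it, so the extra per-layer .to('cuda') calls should not be needed. A minimal sketch (assuming a CUDA device is available):

import torch
import torch.nn as nn

m = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
m.to('cuda')  # recursively moves all registered submodules and parameters
print(all(p.is_cuda for p in m.parameters()))  # True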

Not that I know of. Sampling works now after adding self.layers.to('cuda') in the inverse_ function, as you advised.

def inverse(self, z):
    # z: (batch_size, 784)
    z = z.view(z.size(0), -1)
    x = self.inverse_(z)
    x = x.view(x.size(0), 1, 28, 28)
    return x

def inverse_(self, z):
    # z: (batch_size, 784)
    x = z
    s = torch.exp(self.scale).to('cuda')
    x = x / s
    self.layers.to('cuda')  # <-- added line
    for i in range(self.num_coupling -1, -1, -1):
        # Assumed that the num_coupling is even
        x1 = x.chunk(2, dim=1)[0] if (i % 2 == 1) else x.chunk(2, dim=1)[1]
        x2 = x.chunk(2, dim=1)[1] if (i % 2 == 1) else x.chunk(2, dim=1)[0]
        x1 = x1 - self.layers[i](x2)
        x = torch.cat([x1, x2], dim=1) if (i % 2 == 1) else torch.cat([x2, x1], dim=1)
    return x

def sample(self, img_shape):
    z = self.prior.sample(img_shape).to('cuda')
    x = self.inverse(z)
    return x

These three methods were not used in the training step, since only the forward pass was used. I guess the sampling call might recreate the model.

samples = model.sample(img_shape=[16,1,28,28])

But I still don't get the full picture.

I also cannot explain the issue you are seeing, since your code works fine for me:

model = NICE().to('cuda')

x = torch.randn(10, 784).to('cuda')
out = model(x)
print(out.device)
# cuda:0

samples = model.sample(img_shape=[16,1,28,28])
print(samples.device)
# cuda:0

Note that I had to remove pl.LightningModule as the base class and use plain nn.Module instead. Do you see the error with my code snippet, and if so, could you also replace the LightningModule with nn.Module?

I think the culprit is pl.Trainer, which automatically assigns a device for training. I ran your code before and after training via trainer.fit and found the following:

Before training, the model is on the GPU:

x = torch.randn(10, 784).to('cuda')
out = model(x)
print(out.device) # cuda:0

# Init data
dm = MNISTDataModule(batch_size=64)

# Init trainer
trainer = pl.Trainer(default_root_dir=os.path.join(CHECKPOINT_PATH, "NICE"),
                     accelerator="gpu",
                     devices=1,
                     max_epochs=1,
                     gradient_clip_val=1.0,
                     callbacks=[ModelCheckpoint(save_weights_only=True, mode="min", monitor="val_loss"),
                                TQDMProgressBar(refresh_rate=50)])

# Train
trainer.fit(model, dm)

After training, the model is on the CPU:

x = torch.randn(10, 784)
out = model(x)
print(out.device) # cpu

pl.Trainer moves the model (and its tensors) to the GPU during training and back to the CPU after training, unless their location is explicitly specified.
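
One straightforward workaround (a sketch, not taken from the thread) is therefore to move the model back to the GPU after training, before calling sample():

trainer.fit(model, dm)

# The Trainer may have returned the model to the CPU, so move it back before sampling
model = model.to('cuda')
model.eval()
with torch.no_grad():
    samples = model.sample(img_shape=[16, 1, 28, 28])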

Thank you so much for the help!

That's interesting. Do you know why that's the case? It seems a bit unexpected (at least to me).

I have no idea^^. I think it is related to the fact that PyTorch Lightning takes care of device assignment by itself. No further clues.

  1. There are no .cuda() or .to(device) calls required. Lightning does these for you.

# don't do in Lightning
x = torch.Tensor(2, 3)
x = x.cuda()
x = x.to(device)

# do this instead
x = x  # leave it alone!

# or to init a new tensor
new_x = torch.Tensor(2, 3)
new_x = new_x.to(x)
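
Applied to the model in this thread, that advice amounts to dropping the hard-coded .to('cuda') calls and deriving the device from the module itself, for example (a sketch, not code from the thread; self.device is the attribute LightningModule maintains):

def sample(self, img_shape):
    # self.device follows wherever Lightning (or a manual .to()) has moved the module
    z = self.prior.sample(img_shape).to(self.device)
    return self.inverse(z)

Similarly, the .to('cuda') on torch.exp(self.scale) in inverse_ can simply be dropped, since self.scale (as defined in the first version of __init__) is a registered parameter and already lives wherever the module does.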
