IndexError: Caught IndexError in replica 0 on device 0. and IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

STR720 · April 12, 2023, 8:01am

I using this GAN generator

import torch
import torch.nn as nn

class  ResNet18_CIFAR10_Deconv_GAN(nn.Module):
    def __init__(self, latent_dim=512,fw_layers=1,num_classes=3):
        super( ResNet18_CIFAR10_Deconv_GAN, self).__init__()
        
        self.latent_dim = latent_dim
        self.num_classes = num_classes
        
        # Define the layers
        self.fc = nn.Linear(self.latent_dim, 1024 * 62 * 62)  # output shape: [batch_size, 1024*62*62]
        self.conv1 = nn.Sequential(
            nn.ConvTranspose2d(1024, 512, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True)  # output shape: [batch_size, 512, 124, 124]
        )
        self.conv2 = nn.Sequential(
            nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True)  # output shape: [batch_size, 256, 248, 248]
        )
        self.conv3 = nn.Sequential(
            nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(True)  # output shape: [batch_size, 128, 496, 496]
        )
        self.conv4 = nn.Sequential(
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(True)  # output shape: [batch_size, 64, 992, 992]
        )
        self.conv5 = nn.Sequential(
            nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(True)  # output shape: [batch_size, 32, 1984, 1984]
        )
        self.conv6 = nn.Sequential(
            nn.ConvTranspose2d(32, 3, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(3),
            nn.ReLU(True)  # output shape: [batch_size, 3, 1994, 1994]
            #nn.Tanh() 
        )
        self.conv7 = nn.Sequential(
            nn.ConvTranspose2d(3, 3, kernel_size=11, stride=1, padding=0),
            nn.Tanh()  # output shape: [batch_size, 3, 1994, 1994]
        )
        
        
    def forward(self, z,y_hat):
        # Reshape the input noise vector to a 4D tensor
        x = z
        ys = F.one_hot(y_hat, num_classes=self.num_classes)
        x = torch.cat((x, ys), dim=1)
        x = self.fc(z)
        x = x.view(-1, 1024, 62, 62)  # reshape to [batch_size, 1024, 62, 62]
        
        # Upscale the image with transposed convolutions
        x = self.conv1(x)  # output shape: [batch_size, 512, 124, 124]
        x = self.conv2(x)  # output shape: [batch_size, 256, 248, 248]
        x = self.conv3(x)  # output shape: [batch_size, 128, 496, 496]
        x = self.conv4(x)  # output shape: [batch_size, 64, 992, 992]
        x = self.conv5(x)  # output shape: [batch_size, 32, 1984, 1984]
        x = self.conv6(x)  # output shape: [batch_size, 3, 1994, 1994]
        x = self.conv7(x)

        return x

To use in

def learn_prototype(model, generator, optimizer_g, n_epochs, trainset, batch_size, min_val, max_val,
                    device, fw_layers=1, save_dir=None):
    model.eval()
  
    generator.train()

    gan_dict = {"min": min_val, "max": max_val}

    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

    for e in range(n_epochs):
        running_loss_gen = 0
        # set requires grad False
        set_requires_grad(model, False)
        #set_requires_grad(classifier, False)
        for i, (images, gt_images) in enumerate(trainloader):
            images = images.to(device)
            gt_images = gt_images.to(device)

            # train generator
            optimizer_g.zero_grad()

            zs_real, ys_real = get_classifier_features(model, images)
            print(zs_real.shape)
            print(ys_real.shape)

            #for l_idx in range(fw_layers):
            zs_real[0] = (zs_real[0] - min_val) / (max_val - min_val)
            
            zs_real = zs_real.to(device)
            ys_real = ys_real.to(device)
            x_hat = generator(zs_real, ys_real)

            x_hat = x_hat + images
            g_loss = torch.mean(ase_loss_weighted(x_hat, gt_images, device))

            running_loss_gen += g_loss.item()

            g_loss.backward()
            optimizer_g.step()

        running_loss_gen = running_loss_gen / len(trainset)

        print('Epoch {} -- Reconstruction Loss: {:.2f}'.format(e, running_loss_gen))

        if (e + 1) % 10 == 0:
            mapping = {"generator": copy.deepcopy(generator.state_dict())}
            gan_dict[e] = mapping

    torch.save(gan_dict, save_dir)
    return generator

using a dataparallel like

gen = nn.DataParallel(gen,device_ids = [0,1,3])

and calling the function as

learn_prototype(model = trained_model,
                 generator = gen,
                 optimizer_g = gen_optimizer,
                 n_epochs = 150,
                 trainset = dataset,
                 batch_size = 2, 
                 min_val = norm_min,
                 max_val = norm_max,
                device= device, 
                 fw_layers=1, 
                 save_dir="Models/generator_model/")

But I am getting this strange error. Can Someone help. I am not sure where am I doing wrong.

ptrblck · April 12, 2023, 8:15am

Based on the error message I would guess x is a 1D tensor and thus the torch.cat operation fails as seen here:

y_hat = torch.randint(0, 10, (10,))
x = torch.randn(10)

ys = F.one_hot(y_hat, num_classes=10)
x = torch.cat((x, ys), dim=1)
# IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

STR720 · April 12, 2023, 8:36am

Thank you @ptrblck for your response. I made a mistake I got that corrected (I will edit the post code) but now I am facing a new issue with Dataparallel

Any leads regarding this

ptrblck · April 12, 2023, 8:38am

It seems you are running into a device mismatch in one of the models, so would need to check if you are moving some parameters manually to the wrong device.

I would also recommend posting code snippets directly by wrapping them into three backticks ```, as screenshots make debugging unnecessary hard.

STR720 · April 12, 2023, 8:53am

@ptrblck Sorry for the inconvenience

Here is my full code

def get_features(model, inputs):
    features = []
    def hook(module, input, output):
        features.append(output.cpu())
    handle = model.module.avgpool.register_forward_hook(hook)
    model(inputs)
    handle.remove()
    return torch.cat(features, dim=0)

to get the features

def get_classifier_features(model, images):
    model.eval()
    with torch.no_grad():
        zs = get_features(model, images)
        zs = zs.view(zs.shape[0], -1)
        y_hat = model(images)
        y_hat = torch.argmax(y_hat, dim=1)
        
    return zs, y_hat

Then I am sending the model to device and Dataparallel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = trained_model.to(device)
trained_model = nn.DataParallel(trained_model,device_ids = [0,1,3])

Code to get Min and Max

def get_norm_values(model, dataloader, device):
    min_val = 0
    max_val = 0
    model.eval()
    min_vals = np.zeros(1)
    max_vals = np.zeros(1)

    for images, _,_ in tqdm_notebook(dataloader):
        images = images.to(device)

        zs, ys = get_classifier_features(model, images)

        min_vals[0], max_vals[0] = zs[0].cpu().numpy().min(), zs[0].cpu().numpy().max()

        curr_min = min_vals.min()
        curr_max = max_vals.max()
        min_val = min(min_val, curr_min)
        max_val = max(max_val, curr_max)

    return min_val, max_val

Then I am sending the above generator to Dataparallel

gen = ResNet18_CIFAR10_Deconv_GAN(latent_dim=512, fw_layers=1, num_classes=3)
gen = gen.to(device)
gen = nn.DataParallel(gen,device_ids = [0,1,3])
gen_optimizer = optim.Adam(gen.parameters(), lr=0.0002, betas=(0.5,0.999))

and This

def learn_prototype(model, generator, optimizer_g, n_epochs, trainset, batch_size, min_val, max_val,
                    device, fw_layers=1, save_dir=None):
    model.eval()
  
    generator.train()

    gan_dict = {"min": min_val, "max": max_val}

    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

    for e in range(n_epochs):
        running_loss_gen = 0
        # set requires grad False
        set_requires_grad(model, False)
        #set_requires_grad(classifier, False)
        for i, (images, gt_images) in enumerate(trainloader):
            images = images.to(device)
            gt_images = gt_images.to(device)

            # train generator
            optimizer_g.zero_grad()

            zs_real, ys_real = get_classifier_features(model, images)
            print(zs_real.shape)
            print(ys_real.shape)

            #for l_idx in range(fw_layers):
            zs_real = (zs_real - min_val) / (max_val - min_val)
            
            zs_real = zs_real.to(device)
            ys_real = ys_real.to(device)
            x_hat = generator(zs_real, ys_real)

            x_hat = x_hat + images
            g_loss = torch.mean(ase_loss_weighted(x_hat, gt_images, device))

            running_loss_gen += g_loss.item()

            g_loss.backward()
            optimizer_g.step()

        running_loss_gen = running_loss_gen / len(trainset)

        print('Epoch {} -- Reconstruction Loss: {:.2f}'.format(e, running_loss_gen))

        if (e + 1) % 10 == 0:
            mapping = {"generator": copy.deepcopy(generator.state_dict())}
            gan_dict[e] = mapping

    torch.save(gan_dict, save_dir)
    return generator

calling the above function as

 learn_prototype(model = trained_model,
                 generator = gen,
                 optimizer_g = gen_optimizer,
                 n_epochs = 150,
                 trainset = dataset,
                 batch_size = 2, 
                 min_val = norm_min,
                 max_val = norm_max,
                device= device, 
                 fw_layers=1, 
                 save_dir="Models/generator_model/")

Which is give that Dataparallel error as

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Input In [52], in <cell line: 1>()
----> 1 learn_prototype(model = trained_model,
      2                 generator = gen,
      3                 optimizer_g = gen_optimizer,
      4                 n_epochs = 150,
      5                 trainset = dataset,
      6                 batch_size = 2, 
      7                 min_val = norm_min,
      8                 max_val = norm_max,
      9                device= device, 
     10                 fw_layers=1, 
     11                 save_dir="Models/generator_model/")

Input In [47], in learn_prototype(model, generator, optimizer_g, n_epochs, trainset, batch_size, min_val, max_val, device, fw_layers, save_dir)
     30 zs_real = zs_real.to(device)
     31 ys_real = ys_real.to(device)
---> 32 x_hat = generator(zs_real, ys_real)
     34 x_hat = x_hat + images
     35 g_loss = torch.mean(ase_loss_weighted(x_hat, gt_images, device))

File /isi/w/lb27/softwares/miniconda3/envs/sai_msc/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File /isi/w/lb27/softwares/miniconda3/envs/sai_msc/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:168, in DataParallel.forward(self, *inputs, **kwargs)
    166     return self.module(*inputs[0], **kwargs[0])
    167 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 168 outputs = self.parallel_apply(replicas, inputs, kwargs)
    169 return self.gather(outputs, self.output_device)

File /isi/w/lb27/softwares/miniconda3/envs/sai_msc/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:178, in DataParallel.parallel_apply(self, replicas, inputs, kwargs)
    177 def parallel_apply(self, replicas, inputs, kwargs):
--> 178     return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])

File /isi/w/lb27/softwares/miniconda3/envs/sai_msc/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py:86, in parallel_apply(modules, inputs, kwargs_tup, devices)
     84     output = results[i]
     85     if isinstance(output, ExceptionWrapper):
---> 86         output.reraise()
     87     outputs.append(output)
     88 return outputs

File /isi/w/lb27/softwares/miniconda3/envs/sai_msc/lib/python3.10/site-packages/torch/_utils.py:461, in ExceptionWrapper.reraise(self)
    457 except TypeError:
    458     # If the exception takes multiple arguments, don't try to
    459     # instantiate since we don't know how to
    460     raise RuntimeError(msg) from None
--> 461 raise exception

RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/isi/w/lb27/softwares/miniconda3/envs/sai_msc/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/isi/w/lb27/softwares/miniconda3/envs/sai_msc/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/isi/w/lb27/softwares/miniconda3/envs/sai_msc/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 154, in forward
    raise RuntimeError("module must have its parameters and buffers "
RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cuda:1

ptrblck · April 12, 2023, 5:16pm

I don’t see any obvious issues in your code, but also note that nn.DataParallel is in maintenance mode and I don’t know if you might be running into some internal failures now.
We generally recommnd using DistributedDataPatallel as it’s the faster and supported util.