CUDA error THCTensorScatterGather.cu:70 when calling backward() on the gradient penalty

I am using the latest PyTorch version (built from source, CUDA 8, cuDNN 6) for improved WGANs (WGAN-GP).
Everything works fine at first, but after some number of iterations I get this error:

RuntimeErrorTraceback (most recent call last)
<ipython-input-10-4c802ee3c846> in <module>()
     29 
     30         fake_batch = Variable(gen.forward(noise).data,requires_grad=True)
---> 31         loss_d = disc.train_batch(X_batch,fake_batch,lambd=lambd)
     32 
     33         del X_batch,noise,fake_batch

/home/hartmank/braindecode/GAN_env/GAN/train_modules.pyc in train_batch(self, batch_real, batch_fake, lambd)
    250                 # gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * lambd
    251                 loss_penalty = self.calc_gradient_penalty(batch_real, batch_fake,lambd)
--> 252                 loss_penalty.backward()
    253 
    254                 #loss = loss_fake - loss_real + loss_penalty

/home/hartmank/anaconda2/envs/pytorch_GAN_CUDNN6/lib/python2.7/site-packages/torch/autograd/variable.pyc in backward(self, gradient, retain_graph, create_graph, retain_variables)
    155                 Variable.
    156         """
--> 157         torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
    158 
    159     def register_hook(self, hook):

/home/hartmank/anaconda2/envs/pytorch_GAN_CUDNN6/lib/python2.7/site-packages/torch/autograd/__init__.pyc in backward(variables, grad_variables, retain_graph, create_graph, retain_variables)
     96 
     97     Variable._execution_engine.run_backward(
---> 98         variables, grad_variables, retain_graph)
     99 
    100 

/home/hartmank/anaconda2/envs/pytorch_GAN_CUDNN6/lib/python2.7/site-packages/torch/autograd/function.pyc in apply(self, *args)
     89 
     90     def apply(self, *args):
---> 91         return self._forward_cls.backward(self, *args)
     92 
     93 

/home/hartmank/anaconda2/envs/pytorch_GAN_CUDNN6/lib/python2.7/site-packages/torch/nn/_functions/thnn/pooling.pyc in backward(ctx, ggI, _ggIndices)
    139         # ggO is equivalent to the 1d case, but the indices are given wrt the last two dimensions combined
    140         indices_view = indices.view(indices.size()[:-2] + (-1,))
--> 141         ggO = ggI.contiguous().view(ggI.size()[:-2] + (-1,)).gather(dim=2, index=indices_view).view_as(indices)
    142         return gI, None, ggO, None, None, None, None, None, None
    143 

/home/hartmank/anaconda2/envs/pytorch_GAN_CUDNN6/lib/python2.7/site-packages/torch/autograd/variable.pyc in gather(self, dim, index)
    683 
    684     def gather(self, dim, index):
--> 685         return Gather.apply(self, dim, index)
    686 
    687     def scatter(self, dim, index, source):

/home/hartmank/anaconda2/envs/pytorch_GAN_CUDNN6/lib/python2.7/site-packages/torch/autograd/_functions/tensor.pyc in forward(ctx, input, dim, index)
    558         ctx.save_for_backward(index)
    559         ctx.dim = dim
--> 560         return input.gather(dim, index)
    561 
    562     @staticmethod

RuntimeError: cuda runtime error (59) : device-side assert triggered at /home/hartmank/pytorch_install/torch/lib/THC/generic/THCTensorScatterGather.cu:70

The higher the learning rate, the fewer epochs it takes for the error to appear.
Could the problem stem from losses or gradients becoming too large? (A quick way to check this is sketched after the code below.)

Functions of interest:

    def train_batch(self, batch_real, batch_fake, lambd=10):
        if not self.did_init_train:
            self.train_init()

        self.optimizer.zero_grad()
        for p in self.parameters():
            p.requires_grad = True

        fx_real = self.forward(batch_real)
        loss_real = -fx_real.mean()
        loss_real.backward()
        loss_r = -loss_real.data[0]
        del fx_real

        fx_fake = self.forward(batch_fake)
        loss_fake = fx_fake.mean()
        loss_fake.backward()
        loss_f = loss_fake.data[0]
        del fx_fake

        # https://github.com/caogang/wgan-gp/blob/master/gan_toy.py
        loss_penalty = self.calc_gradient_penalty(batch_real, batch_fake, lambd)
        loss_penalty.backward()
        
        #loss = loss_fake - loss_real + loss_penalty

        # Backprop gradient
        penalty = loss_penalty.data[0]
        #del loss_real,loss_fake,loss_penalty
        #loss.backward()

        # Update parameters
        self.optimizer.step()

        return (loss_f, loss_r, penalty)  # (fake loss, real loss, gradient penalty)


    def calc_gradient_penalty(self, real_data, fake_data, lambd):
        # Random interpolation coefficient, one per sample, broadcast over the
        # remaining dimensions.
        alpha = torch.rand(real_data.size(0), *((len(real_data.size()) - 1) * [1]))
        alpha = alpha.expand(real_data.size())
        alpha = alpha.cuda()

        # Random points on the lines between real and fake samples.
        interpolates = alpha * real_data.data + ((1 - alpha) * fake_data.data)
        interpolates = Variable(interpolates, requires_grad=True)

        disc_interpolates = self(interpolates)

        gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                                  grad_outputs=torch.ones(disc_interpolates.size()).cuda(),
                                  create_graph=True, retain_graph=True, only_inputs=True)[0]


        del alpha, interpolates, disc_interpolates

        gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * lambd       

        return gradient_penalty
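
To check whether the gradients are actually exploding before the crash, one option (just a sketch, not part of the original code) is to log the parameter-gradient norms each iteration, e.g. inside train_batch right before self.optimizer.step():

    # Hypothetical check (not in the original code): print the largest
    # parameter-gradient norm so the iterations just before the crash
    # can be inspected.
    grad_norms = [float(p.grad.data.norm(2))
                  for p in self.parameters() if p.grad is not None]
    print('max grad norm: %.3e' % max(grad_norms))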

Hi @Kay, did you manage to solve the problem? I’m creating my own loss function and I get the same error when I call loss.backward(). Was it related to high values in losses or gradients?

Thanks!

A device-side assert means that you are doing out-of-bounds indexing, either in a standard [...] indexing operation, or in scatter/gather calls or masked_* / index_* calls.
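
For illustration (a minimal sketch, unrelated to the code above), an out-of-range index passed to gather reproduces exactly this kind of failure: on CPU it raises a readable error, while on CUDA it trips the device-side assert, which often only surfaces at a later synchronizing call such as backward(). Re-running with the environment variable CUDA_LAUNCH_BLOCKING=1 (or on CPU) usually points at the offending op directly.

    import torch

    src = torch.rand(2, 4)
    # 5 is out of bounds for dimension 1 (size 4); this is the kind of index
    # that triggers the assert in THCTensorScatterGather.cu on the GPU.
    idx = torch.LongTensor([[0, 5], [1, 2]])
    out = src.gather(1, idx)  # error on CPU, device-side assert on CUDA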

That’s what I had read in other discussions, but the strange thing is that changing a line in my loss function from:
torch.nn.functional.pairwise_distance(...)
to:
1 - torch.nn.functional.cosine_similarity(...)
avoids the device-side assert (although it makes the loss go to -inf after a few iterations). The rest of the code (where indexing problems might appear) is exactly the same.
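
For reference, the two variants being compared look roughly like this (a sketch with made-up inputs; x1 and x2 stand for whatever embeddings the loss function receives):

    import torch
    import torch.nn.functional as F

    x1 = torch.rand(8, 16)
    x2 = torch.rand(8, 16)

    # Euclidean distance per row (the exact output shape can differ between
    # PyTorch versions: (N,) or (N, 1)).
    d_euclid = F.pairwise_distance(x1, x2)

    # Cosine "distance" per row; values lie in [0, 2].
    d_cosine = 1 - F.cosine_similarity(x1, x2)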