PyTorch doesn't register model parameters to the CUDA backend

I am trying to generate text with a sequence-to-sequence model in PyTorch, trained on the Apocalypse Now movie script. My code follows this one almost verbatim.

The following is the model, a character level LSTM.

    class CharLSTM(nn.Module):

        def __init__(self, chars, hidden_size, n_layers=2, drop_out=0.5, lr=0.001):
            super(CharLSTM, self).__init__()

            # set all the hyperparameters of the network
            self.hidden_size = hidden_size
            self.n_layers = n_layers
            self.drop_out = drop_out
            self.lr = lr

            # set the vocabulary and get indices for the characters
            self.chars = chars
            self.int2char = dict(enumerate(self.chars))
            self.char2int = {w: i for i, w in self.int2char.items()}

            # define the lstm network; this outputs the next char plus the hidden and cell states
            self.lstm = nn.LSTM(input_size=len(self.chars), hidden_size=hidden_size,
                                num_layers=n_layers, dropout=drop_out, batch_first=True)

            # add dropout
            self.drop_out = nn.Dropout(drop_out)

            # linear layer mapping the hidden state back to the vocabulary
            self.fc = nn.Linear(hidden_size, len(chars))

            self.init_weights()

        def forward(self, x, h_0):
            """Compute the LSTM output and hidden state, stack the outputs and pass them to the linear layer."""

            x, (h, c) = self.lstm(x, h_0)

            x = self.drop_out(x)
            x = x.view(x.size()[0] * x.size()[1], self.hidden_size)

            x = self.fc(x)

            return x, (h, c)

        def predict(self, char, h=None, cuda=False, top_k=None):
            """Given a character, predict the next character in the sequence."""

            if cuda:
                self.cuda()
            else:
                self.cpu()

            # initialize hidden state
            if h is None:
                h = self.init_hidden(1)

            # get the integer index of the character
            ch = np.array([[self.char2int[char]]])

            # one-hot encode
            one_hot = one_hot_encode(ch, len(self.chars))

            # convert to tensor
            one_hot_torch = torch.from_numpy(one_hot)

            if cuda:
                one_hot_torch = one_hot_torch.cuda()

            # create a tuple of the hidden state, which is what the LSTM expects
            h = tuple([each.data for each in h])
            out, h = self.forward(one_hot_torch, h)

            # probability distribution over all the characters
            probs = F.softmax(out, dim=1).data

            # move probs back to the cpu if the model was set to gpu
            if cuda:
                probs = probs.cpu()

            # if top_k wasn't passed, take the distribution over the whole vocabulary
            if top_k is None:
                top_ch = np.arange(len(self.chars))
            else:
                probs, top_ch = probs.topk(top_k)
                top_ch = top_ch.numpy().squeeze()

            # reduce dims of size 1
            probs = probs.numpy().squeeze()

            # sample the next character from top_ch, weighting each candidate
            # by its (normalized) probability
            char = np.random.choice(top_ch, p=probs / probs.sum())

            return self.int2char[char], h

        def init_weights(self):

            initrange = 0.1

            self.fc.bias.data.fill_(0)
            self.fc.weight.data.uniform_(-1, 1)

        def init_hidden(self, n_seqs):

            weight = next(self.parameters()).data
            return (weight.new(self.n_layers, n_seqs, self.hidden_size).zero_(),
                    weight.new(self.n_layers, n_seqs, self.hidden_size).zero_())

Instantiating an object of this class:

    model = CharLSTM(chars, hidden_size=512, n_layers=2)
    model

CharLSTM(
(lstm): LSTM(58, 512, num_layers=2, batch_first=True, dropout=0.5)
(drop_out): Dropout(p=0.5)
(fc): Linear(in_features=512, out_features=58, bias=True)
)
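
For completeness, the helper functions one_hot_encode and get_batches used below come from the tutorial and aren't shown in my model code; they look roughly like this (paraphrased from memory, so details may differ slightly):

    def one_hot_encode(arr, n_labels):
        # build a flat one-hot matrix, then reshape back to (batch, steps, n_labels)
        one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
        one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
        return one_hot.reshape((*arr.shape, n_labels))

    def get_batches(arr, n_seqs, n_steps):
        # yield (x, y) batches where y is x shifted one character to the left
        batch_size = n_seqs * n_steps
        n_batches = len(arr) // batch_size
        arr = arr[:n_batches * batch_size].reshape((n_seqs, -1))
        for n in range(0, arr.shape[1], n_steps):
            x = arr[:, n:n + n_steps]
            y = np.zeros_like(x)
            try:
                y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n + n_steps]
            except IndexError:
                y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
            yield x, y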

The following is my train function, to which I pass the model along with the inputs and targets for the training and validation datasets.

    def train(model, data, epochs=10, n_seqs=10, n_steps=40, lr=0.001, clip=5, val_frac=0.1, cuda=False,
              print_every=10):
        """ model: the model to be trained
            data: the data on which we train
            epochs: number of epochs to train for
            n_seqs: number of sequences in our batch
            n_steps: time steps for each sequence
            lr: learning rate
            clip: value used to clip the network gradients to prevent exploding gradients
            val_frac: the fraction of data used for validation
            cuda: whether to train on the GPU
            print_every: the number of steps after which we print out model statistics
        """

        # change model to train mode
        model.train()

        # define optimizer and loss function
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        # train and validation split
        val_idx = int(len(data) * (1 - val_frac))
        data, val_data = data[:val_idx], data[val_idx:]

        if cuda:
            model.cuda()

        counter = 0
        n_chars = len(model.chars)

        # loop over epochs
        for epoch in range(epochs):

            # initialize hidden state of the model
            h = model.init_hidden(n_seqs)

            # loop over batches
            for x, y in get_batches(data, n_seqs, n_steps):

                counter += 1

                # one hot encode
                x = one_hot_encode(x, n_chars)

                # convert to tensors
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                # move inputs and targets to cuda
                inputs, targets = inputs.cuda(), targets.cuda()

                # new hidden state is created to prevent backpropagating through the
                # entire history
                h = tuple([each.data for each in h])

                # zero out gradients to prevent accumulation
                model.zero_grad()

                # get output and hidden state
                out, h = model.forward(inputs, h)
                loss = criterion(out, targets.view(n_seqs*n_steps).type(torch.cuda.LongTensor))

                # backpropagate the loss
                loss.backward()

                # use gradient clipping to prevent exploding gradients
                nn.utils.clip_grad_norm_(model.parameters(), clip)

                # take a step on the loss surface
                optimizer.step()

                if counter % print_every == 0:

                    # initialize hidden state for validation
                    val_hidden = model.init_hidden(n_seqs)
                    val_losses = []

                    for x, y in get_batches(val_data, n_seqs, n_steps):

                        x = one_hot_encode(x, n_chars)

                        x, y = torch.from_numpy(x), torch.from_numpy(y)

                        val_hidden = tuple([each.data for each in val_hidden])

                        inputs, targets = x, y

                        if cuda:
                            inputs, targets = inputs.cuda(), targets.cuda()

                        out, val_hidden = model.forward(inputs, val_hidden)

                        val_loss = criterion(out, targets.view(n_seqs*n_steps).type(torch.LongTensor))

                        val_losses.append(val_loss.item())

                        print('Epoch: {}'.format(epoch + 1),
                              'Steps: {}'.format(counter),
                              'train loss {:.4f}'.format(loss.item()),
                              'val loss {:.4f}'.format(np.mean(val_losses)))

I run this with a batch size (n_seqs) of 128 and a sequence length (n_steps, i.e., the number of time steps) of 100:

    # define batch_size and sequence length
    n_seqs, n_steps = 128, 100

    train(model, encoded, epochs=25, n_seqs=n_seqs, n_steps=n_steps, lr=0.001, cuda=True, print_every=10)

This throws the following error:


RuntimeError Traceback (most recent call last)
in ()
1 n_seqs, n_steps = 128, 100
----> 2 train(model, encoded, epochs=25, n_seqs=n_seqs, n_steps=n_steps, lr=0.001, cuda=True, print_every=10)

4 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
1869 .format(input.size(0), target.size(0)))
1870 if dim == 2:
-> 1871 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
1872 elif dim == 4:
1873 ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)

RuntimeError: Expected object of backend CUDA but got backend CPU for argument #2 'target'

A quick check of whether the model parameters were registered on the CUDA backend returns False.

    next(model.parameters()).is_cuda
    False
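
For reference, the same thing can be checked for all parameters at once:

    all(p.is_cuda for p in model.parameters())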

I am working in the Colab environment.

    torch.cuda.get_device_name()
    'Tesla T4'

This is perplexing, since I did move my model and data to CUDA. I don't see where I am going wrong. Please help, and thank you so much for your time.

It looks like the error message is thrown in the validation loop.
While you are using:

loss = criterion(out, targets.view(n_seqs*n_steps).type(torch.cuda.LongTensor))

in your training loop, you are not creating a CUDA tensor in the validation loop:

val_loss = criterion(out, targets.view(n_seqs*n_steps).type(torch.LongTensor))
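
Mirroring the training loop there, e.g. something along these lines, should get rid of the error:

if cuda:
    val_loss = criterion(out, targets.view(n_seqs*n_steps).type(torch.cuda.LongTensor))
else:
    val_loss = criterion(out, targets.view(n_seqs*n_steps).type(torch.LongTensor))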

Thanks a bunch. I made a ton of other such silly mistakes throughout my code, but now the whole thing works well. Thanks again

I’m glad the code is working now.
To avoid such issues in the future, I would use the .to() operation with a specified device, so that changing this single variable will switch between CPU and GPU:

device = 'cuda:0'
# device = 'cpu'  # set for CPU run

targets = targets.to(device)
x = torch.randn(1, device=device)
model.to(device)
...
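
For example, a tiny self-contained version of this pattern (with a dummy model, just to illustrate the idea):

import torch
import torch.nn as nn

device = 'cuda:0'
# device = 'cpu'  # set for CPU run

model = nn.Linear(10, 2).to(device)              # parameters now live on `device`
inputs = torch.randn(4, 10, device=device)       # data is created directly on `device`
targets = torch.randint(0, 2, (4,), device=device)

criterion = nn.CrossEntropyLoss()
loss = criterion(model(inputs), targets)         # no CPU/CUDA backend mismatch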

Thanks a lot, I shall practise this going forward. If I were to do this, switching the model back to CPU for prediction would just entail calling model.cpu() right?

This would be one approach, but I would recommend setting device='cpu' and using the same code if possible.
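
E.g. selecting the device based on availability lets the exact same script run on the GPU in Colab and on a local CPU:

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)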

Thank you, will surely do.