Weird error transferring my model to CUDA

Moosquibe · April 27, 2018, 3:36pm

Hi,

I am trying to train the following model on GPU:

import torch.nn as nn
class ConvLSTM(nn.Module):
    """Convolutional LSTM model to classify raw waveform files.

    The architecture is:

    Input -> Conv -> ReLU -> Pooling -> LSTM -> Linear -> Softmax

    Properties:
      CONVOLUTIONAL LAYER:
        conv_kernel_size: The size of the filters for the conv. layer
        conv_stride: Stride for the conv. layer
        num_features: Number of output channels for conv. layer
      MAX POOLING LAYER:
        pooling_kernel: Size of the max-pooling window; pooling is skipped
          entirely when this is <= 1
      LSTM:
        hidden_size: The dimension of the hidden state
        num_layers: Number of stacked layers inside the lstm
        dropout: Dropout probability applied between stacked lstm layers
          (forced to 0 when num_layers == 1, where it has no effect)
      CLASSIFIER:
        num_of_classes: Number of classes to classify into
      ALL LAYERS:
        bias: Have/not have bias terms in the conv, lstm and linear layers
    """

    def __init__(self, conv_kernel_size, conv_stride, num_features,
                 pooling_kernel, hidden_size, num_layers=1,
                 num_of_classes=2, bias=True, dropout=0.5):
        super(ConvLSTM, self).__init__()
        self.conv_kernel_size = conv_kernel_size
        self.conv_stride = conv_stride
        self.num_features = num_features
        self.pooling_kernel = pooling_kernel
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_of_classes = num_of_classes
        self.bias = bias

        # The raw waveform is treated as a single-channel 1-D signal.
        self.conv = nn.Conv1d(
            in_channels=1,
            out_channels=num_features,
            kernel_size=conv_kernel_size,
            stride=conv_stride,
            bias=bias,
        )

        self.relu = nn.ReLU()

        # A window of size 1 would be a no-op, so the layer is only created
        # (and only applied in forward) for larger windows.
        if pooling_kernel > 1:
            self.pooling = nn.MaxPool1d(kernel_size=pooling_kernel)

        # nn.LSTM only applies dropout *between* stacked layers, so a non-zero
        # value is meaningless for a single layer (newer PyTorch versions
        # warn about it). Pass 0 in that case.
        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            bias=bias,
        )

        self.linear = nn.Linear(
            in_features=hidden_size,
            out_features=num_of_classes,
            bias=bias,
        )

        # Normalise over the class dimension of the (batch, classes) logits.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, wav_minibatch):
        """Forward pass of the convolutional LSTM audio classifier.

        INPUT:
            wav_minibatch: Minibatch of raw waveforms with shape
                (batch, 1, time); the single channel is required by the
                conv. layer (in_channels = 1).

        OUTPUT:
            probs: Probability scores over the classes with shape
                (batch, num_of_classes).
        """
        features = self.relu(self.conv(wav_minibatch))
        if self.pooling_kernel > 1:
            features = self.pooling(features)

        # Conv output is (batch, channels, time); the lstm (batch_first is
        # left at its default) expects (time, batch, channels).
        sequence = features.permute(2, 0, 1)

        # h_n has shape (num_layers, batch, hidden_size). Classify from the
        # final hidden state of the *top* layer; index 0 would be the bottom
        # layer, which is only correct when num_layers == 1.
        _, (h_n, _) = self.lstm(sequence)
        logits = self.linear(h_n[-1])
        return self.softmax(logits)

However, when I try to move the model onto the GPU (CUDA):

# Build the classifier: 32 conv filters of width 5 (stride 1), max-pooling
# over windows of 2, then a single-layer LSTM with a 1024-dim hidden state
# feeding a 2-class output.
classifier = ConvLSTM(
    conv_kernel_size=5, conv_stride=1, num_features=32,
    pooling_kernel=2,
    hidden_size=1024, num_layers=1,
    num_of_classes=2,
    bias=True,
)

# Move every parameter and buffer to the GPU when one is usable; otherwise
# the model stays on the CPU.
if torch.cuda.is_available():
    classifier = classifier.cuda()

I get the following error without any further error message:

AssertionError                            Traceback (most recent call last)
<ipython-input-23-9dc2b59bee55> in <module>()
      3 if torch.cuda.is_available():
----> 4     classifier = classifier.cuda()

/home/zsolt/anaconda3/lib/python3.5/site-packages/torch/nn/modules/module.py in cuda(self, device)
    214             Module: self
    215         """
--> 216         return self._apply(lambda t: t.cuda(device))
    217 
    218     def cpu(self):

/home/zsolt/anaconda3/lib/python3.5/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    144     def _apply(self, fn):
    145         for module in self.children():
--> 146             module._apply(fn)
    147 
    148         for param in self._parameters.values():

/home/zsolt/anaconda3/lib/python3.5/site-packages/torch/nn/modules/rnn.py in _apply(self, fn)
    121     def _apply(self, fn):
    122         ret = super(RNNBase, self)._apply(fn)
--> 123         self.flatten_parameters()
    124         return ret
    125 

/home/zsolt/anaconda3/lib/python3.5/site-packages/torch/nn/modules/rnn.py in flatten_parameters(self)
    109             # Slice off views into weight_buf
    110             all_weights = [[p.data for p in l] for l in self.all_weights]
--> 111             params = rnn.get_parameters(fn, handle, fn.weight_buf)
    112 
    113             # Copy weights and update their storage

/home/zsolt/anaconda3/lib/python3.5/site-packages/torch/backends/cudnn/rnn.py in get_parameters(fn, handle, weight_buf)
    163                 # might as well merge the CUDNN ones into a single tensor as well
    164                 if linear_id == 0 or linear_id == num_linear_layers / 2:
--> 165                     assert filter_dim_a.prod() == filter_dim_a[0]
    166                     size = (filter_dim_a[0] * num_linear_layers // 2, filter_dim_a[2])
    167                     param = fn.weight_buf.new().set_(

AssertionError:

Any suggestions?

Atcold · April 27, 2018, 4:53pm

The assertion line tells you that filter_dim_a.prod() and filter_dim_a[0] differ.
For example, you could add a print and check what these two items are equal to. To pass the assertion, they should be equal to each other. Then, try to figure out why they are not. (Using a debugger helps you a lot with the back-tracking of errors, letting you climb the error stack trace.)

Moosquibe · April 27, 2018, 7:32pm

Thanks! It turns out there was an incompatibility between the cuDNN and PyTorch versions; upgrading to CUDA 9 solved the issue.