RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)` during forward pass

So as part of an educational exercise I decided to implement an LSTM from scratch (not using the built-in `torch.nn` RNN modules). When I try to run it on my GPU, I get the following error:

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling cublasCreate(handle)

The error seems to occur on the first forward pass through my LSTM, at the input gate (a linear layer followed by a sigmoid).

The following is my code:

import torch
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence


class lstm(nn.Module):
    def __init__(self, input_size, hidden_dim, output_dim=1) -> None:
        super().__init__()
        self.input_dim = input_size
        self.hidden_dim  = hidden_dim
        
        self.forget_gate = nn.Sequential(
            nn.Linear(input_size+hidden_dim, hidden_dim),
            nn.Sigmoid()
        )
        self.input_gate = nn.Sequential(
            nn.Linear(input_size+hidden_dim, hidden_dim),
            nn.Sigmoid()
        )
        self.input_node = nn.Sequential(
            nn.Linear(input_size+hidden_dim, hidden_dim),
            nn.Tanh()
        )
        self.output_gate = nn.Sequential(
            nn.Linear(input_size+hidden_dim, hidden_dim),
            nn.Sigmoid()
        )
        self.tanh = nn.Tanh()

        # this output layer can be fancier if needed by the use case
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, h_in=None, c_in=None):
        # NOTE: the loop below assumes x is a PackedSequence, since it relies on
        # the unpacked input, batch_sizes and max_batch_size from this branch.
        if isinstance(x, PackedSequence):
            input, batch_sizes, sorted_indices, unsorted_indices = x
            max_batch_size = batch_sizes[0]
            if h_in is None: 
                h_in = self.init_h(max_batch_size, x)
                c_in = self.init_h(max_batch_size, x)
        
        # Walk the packed data one time step at a time; batch_sizes[t] is the
        # number of sequences still active at step t.
        data_offset = 0
        outputs = []
        for batch_size in batch_sizes:
            current_input = input[data_offset:data_offset + batch_size]
            data_offset += batch_size
            current_input = current_input.unsqueeze(0)
            if batch_size < max_batch_size: 
                h_in[:,batch_size:,:] = 0
                c_in[:,batch_size:,:] = 0
                pad_size = max_batch_size - batch_size
                # Create a tensor of zeros with the padding size
                padding = torch.zeros(1, pad_size, self.hidden_dim, device=current_input.device)
                # Concatenate the padding to the current input
                current_input = torch.cat([current_input, padding], dim=1)
            
            # all gates operate on [x_t, h_{t-1}] concatenated along the feature dim
            combined = torch.cat([current_input, h_in], dim=2)
            
            i_gate_output = self.input_gate(combined) # error occurs here
            i_node_output = self.input_node(combined)
            o_gate_output = self.output_gate(combined)
            f_gate_output = self.forget_gate(combined)

            c_out = (f_gate_output * c_in) + (i_node_output * i_gate_output)

            h_out = self.tanh(c_out) * o_gate_output
            out = self.output_layer(h_out)
            h_in = h_out
            c_in = c_out
            outputs.append(out)

       
        if isinstance(x, PackedSequence):
            # PackedSequence expects a single data tensor, not a Python list,
            # so trim the padded rows from each step and concatenate them.
            packed_data = torch.cat([o.squeeze(0)[:bs] for o, bs in zip(outputs, batch_sizes)], dim=0)
            output_packed = PackedSequence(packed_data, batch_sizes, sorted_indices, unsorted_indices)
            return output_packed, h_out, c_out
        
        return out, h_out, c_out
    
    
    def init_h(self, batch_size, x):
        # alternatives include, but are not limited to, Xavier/Kaiming initialization
        return torch.zeros(1, batch_size, self.hidden_dim, dtype=x.data.dtype, device=x.data.device)
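
In case it helps narrow things down, the line the error points at is just a Linear followed by a Sigmoid, so I'd expect a stripped-down snippet like this to hit the same cuBLAS path (the sizes below are arbitrary placeholders, not my real dimensions):

import torch
import torch.nn as nn

# Isolates the failing step: a single gate (Linear + Sigmoid) on CUDA.
# input_size, hidden_dim and the batch size are arbitrary placeholders.
input_size, hidden_dim, batch = 8, 16, 4

gate = nn.Sequential(nn.Linear(input_size + hidden_dim, hidden_dim), nn.Sigmoid()).cuda()
combined = torch.randn(1, batch, input_size + hidden_dim, device="cuda")
out = gate(combined)  # the Linear matmul here is what triggers cublasCreate(handle)
print(out.shape)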

Any idea what could be going on here? Thanks!

You might be running out of memory before the cuBLAS handle is created. Which PyTorch version and GPU are you using?
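
One quick way to check, roughly along these lines (just a sketch; the sizes and printout are arbitrary), is to print the environment and free GPU memory, then force cuBLAS to initialize with a tiny matmul outside your model:

import torch

# Report the environment and available GPU memory, then force cuBLAS
# initialization with a small matmul. If this already fails, the problem is
# the driver/runtime or memory, not the LSTM code. Sizes are arbitrary.
print("PyTorch:", torch.__version__)
print("CUDA (build):", torch.version.cuda)
print("GPU:", torch.cuda.get_device_name(0))

free, total = torch.cuda.mem_get_info()  # bytes
print(f"Free / total GPU memory: {free / 1e9:.2f} / {total / 1e9:.2f} GB")

a = torch.randn(64, 64, device="cuda")
b = torch.randn(64, 64, device="cuda")
c = a @ b  # the first GPU matmul is what calls cublasCreate(handle)
torch.cuda.synchronize()
print("cuBLAS matmul OK:", tuple(c.shape))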

Seemed like an odd bug… I rebooted my device a couple of times and it no longer seems to be a problem. I’m on an A2000 laptop GPU with CUDA 12.0.