LibTorch doesn't mirror PyTorch behavior in a Linear layer due to cuBLAS

The most minimal Python reproduction I could manage follows:

import torch
import torch.nn as nn
import torch.nn.functional as F

class AC(nn.Module):
    """Minimal actor-critic module with a fixed diagonal Gaussian policy.

    The actor maps a state to the mean of a Gaussian over actions; the
    per-dimension variance is the fixed constant ``action_std_init ** 2``.
    The critic maps a state to a scalar value estimate.
    """

    def __init__(self, state_dim, action_dim, action_std_init, device):
        """
        Args:
            state_dim: Size of the state vector (last input dimension).
            action_dim: Size of the action vector.
            action_std_init: Fixed standard deviation of the policy.
            device: torch.device the module and its tensors live on.
        """
        super().__init__()

        self.device = device

        # Actor network: state -> action mean (mu).
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, action_dim),
        )

        # Critic network: state -> scalar state value.
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

        # Fixed per-dimension action variance. Registered as a buffer so it
        # follows the module through .to()/.cuda() and is saved in state_dict,
        # instead of being a loose tensor that model.to(device) would not move.
        # Created directly on the target device (avoids a CPU alloc + copy).
        self.register_buffer(
            "action_var",
            torch.full(
                (action_dim,), action_std_init * action_std_init, device=device
            ),
        )

        self.to(device)

    def forward(self, state):
        # Use evaluate() instead; plain forward is intentionally unsupported.
        raise NotImplementedError

    def evaluate(self, state, action):
        """Return (log-prob of `action`, state values, policy entropy).

        Assumes `state` is (T, N, state_dim) and `action` is
        (T, N, action_dim); log-probabilities are summed over dim 2
        (the action dimension).
        """
        mu = self.actor(state)
        std = torch.sqrt(self.action_var.expand_as(mu))
        # Diagonal Gaussian log-density per dimension:
        #   -(a - mu)^2 / (2 var) - log(std) - log(sqrt(2 pi))
        action_logprobs = (
            -torch.pow(action - mu, 2) / (2 * self.action_var.expand_as(action))
            - std.log()
            - torch.log(torch.sqrt(torch.tensor(2 * torch.pi)))
        )
        action_logprobs = action_logprobs.sum(2)

        # Entropy of a diagonal Gaussian: 0.5 + 0.5 * log(2 pi var), summed
        # over action dimensions (constant, since the variance is fixed).
        dist_entropy = 0.5 + 0.5 * torch.log(2 * torch.pi * self.action_var)
        dist_entropy = dist_entropy.sum()

        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy

# Main code
if __name__ == "__main__":
    # This repro needs a GPU; bail out early otherwise.
    if not torch.cuda.is_available():
        print("CUDA is not available! Exiting...")
        exit()
    device = torch.device("cuda")

    # Build a minimal AC model.
    state_dim, action_dim, action_std_init = 4, 2, 0.1
    model = AC(state_dim, action_dim, action_std_init, device)
    num_envs, num_samples = 4, 40

    # Dummy batched inputs of known shape: (samples, envs, features).
    states = torch.rand((num_samples, num_envs, state_dim)).to(device)
    actions = torch.rand((num_samples, num_envs, action_dim)).to(device)

    # Exercise the evaluate path and dump the results.
    try:
        logp, values, entropy = model.evaluate(states, actions)

        print("Action logprobs:", logp)
        print("State values:", values)
        print("Dist entropy:", entropy)
    except Exception as err:
        print("Error during evaluation:", str(err))

It runs and seems to work as I would like it to.
I mirror it in libtorch as follows:

#include <torch/torch.h>
#include <torch/script.h>
#include <cuda_runtime.h>
#include <iostream>

// Minimal actor-critic module with a fixed diagonal Gaussian policy.
// The actor maps a state to the action mean; the per-dimension variance is
// the fixed constant action_std_init^2. The critic maps a state to a scalar.
struct AC : torch::nn::Module {
    torch::nn::Sequential actor, critic;
    torch::Tensor action_var;
    torch::Device device;

    // Constructor.
    //
    // NOTE: members are initialized in *declaration* order (actor, critic,
    // action_var, device), regardless of the order written in the
    // mem-initializer list, so the list below is written in declaration
    // order to avoid -Wreorder surprises. Inside the initializers, `device`
    // names the constructor parameter (it shadows the member), which is why
    // initializing action_var before the device member is well-defined.
    AC(long int state_dim, long int action_dim, double action_std_init, torch::Device device)
        : actor(register_module("actor", torch::nn::Sequential(
                                             torch::nn::Linear(state_dim, 16),
                                             torch::nn::ReLU(),
                                             torch::nn::Linear(16, 32),
                                             torch::nn::ReLU(),
                                             torch::nn::Linear(32, action_dim)))),
          critic(register_module("critic", torch::nn::Sequential(
                                               torch::nn::Linear(state_dim, 16),
                                               torch::nn::ReLU(),
                                               torch::nn::Linear(16, 32),
                                               torch::nn::ReLU(),
                                               torch::nn::Linear(32, 1)))),
          // Fixed per-dimension action variance, created directly on the target device.
          action_var(torch::full({action_dim}, action_std_init * action_std_init,
                                 torch::TensorOptions().device(device))),
          device(device) {
        this->to(device);
    }

    // Returns {log-prob of `action` under N(mu, action_var), state values, entropy}.
    // Assumes state is (T, N, state_dim) and action is (T, N, action_dim);
    // log-probabilities are summed over dim 2 (the action dimension).
    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> evaluate(const torch::Tensor& state, const torch::Tensor& action) {
        torch::Tensor mu = actor->forward(state);
        torch::Tensor std = torch::sqrt(action_var.expand_as(mu));
        // Diagonal Gaussian log-density per dimension:
        //   -(a - mu)^2 / (2 var) - log(std) - log(sqrt(2 pi))
        torch::Tensor action_logprobs =
            -torch::pow(action - mu, 2) / (2 * action_var.expand_as(action))
            - std.log()
            - torch::log(torch::sqrt(torch::tensor(2 * M_PI, torch::TensorOptions().device(device))));
        action_logprobs = action_logprobs.sum(2);

        // Entropy of a diagonal Gaussian: 0.5 + 0.5 * log(2 pi var), summed.
        torch::Tensor dist_entropy = 0.5 + 0.5 * torch::log(2 * M_PI * action_var);
        dist_entropy = dist_entropy.sum();

        torch::Tensor state_values = critic->forward(state);

        return std::make_tuple(action_logprobs, state_values, dist_entropy);
    }
};

int main() {
    // Check if CUDA is available
    if (!torch::cuda::is_available()) {
        std::cerr << "CUDA is not available! Exiting..." << std::endl;
        return -1;
    }

    // Set CUDA device
    int device_id = 0;
    cudaSetDevice(device_id);
    torch::Device device(torch::kCUDA, device_id);

    // Create a minimal AC3 model
    long int state_dim = 4;
    long int action_dim = 2;
    double action_std_init = 0.1;
    AC model(state_dim, action_dim, action_std_init, device);
    model.to(device);  

    long int num_envs = 4;
    long int num_samples = 40;

    // Create dummy input tensors with known shapes
    torch::Tensor state2 = torch::rand({num_samples, num_envs, state_dim}).to(device);  // Batch of states
    torch::Tensor action2 = torch::rand({num_samples, num_envs, action_dim}).to(device);  // Corresponding actions

    // Test the evaluate method
    try {
        auto [action_logprobs2, state_values2, dist_entropy2] = model.evaluate(state2, action2);

        std::cout << "Action logprobs: " << action_logprobs2 << std::endl;
        std::cout << "State values: " << state_values2 << std::endl;
        std::cout << "Dist entropy: " << dist_entropy2 << std::endl;
    } catch (const c10::Error& e) {
        std::cerr << "Error during evaluation: " << e.what() << std::endl;
        return -1;
    }

    return 0;
}

I get the following error at the first line of the evaluate function (high in the callstack is the linear layer):

Error during evaluation: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
Exception raised from gemm<float> at ../aten/src/ATen/cuda/CUDABlas.cpp:427 (most recent call first):

However, when running the non-minimal example, tensor operations worked fine, and even using the same network with different input dimensions in a different function (shape: 4,4) worked.

Linking to cuBLAS doesn't seem to be the problem; I tested pure cuBLAS calls in my build system and they worked fine.

Any idea on why I am seeing this discrepancy in libtorch and pytorch and how to fix it?

pytorch version: 2.1.0
libtorch version: 2.1.0
os: ubuntu 22.04 lts
nvidia-driver: 550.54.15
device: H100 80GB PCIE
cuda version: 12.1

Check via LD_DEBUG=libs which cuBLAS libs are loaded and make sure they are the same.

Found out I was linking wrong. Edited my build system and it worked.