RuntimeError during compilation on different computer

I have a module that is running fine on one computer, but on another computer it throws this error

  File "/home/guillefix/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
RuntimeError: __nv_nvrtc_builtin_header.h(78048): error: function "operator delete(void *, size_t)" has already been defined

__nv_nvrtc_builtin_header.h(78049): error: function "operator delete[](void *, size_t)" has already been defined

2 errors detected in the compilation of "default_program".

nvrtc compilation failed: 

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)


template<typename T>
__device__ T maximum(T a, T b) {
  return isnan(a) ? a : (a > b ? a : b);
}

template<typename T>
__device__ T minimum(T a, T b) {
  return isnan(a) ? a : (a < b ? a : b);
}

extern "C" __global__
void fused_tanh_mul_add__14203776399538843293(float* tv_, float* tb0_2, float* tv__, float* tv___, float* tv____, float* aten_log, float* aten_cat) {
{
if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<8ll ? 1 : 0) {
    aten_cat[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = (((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<4ll ? 1 : 0) ? (__ldg(tv__ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x))) / (1.f / (1.f + (expf(0.f - ((__ldg(tv___ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x))) * (tanhf(__ldg(tv____ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)))) + 2.f)))) + 0.1192029193043709f) - (__ldg(tb0_2 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x))) : __ldg(tv_ + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) - 4ll));
  }if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<4ll ? 1 : 0) {
    float v = __ldg(tv___ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
    float v_1 = __ldg(tv____ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
    aten_log[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = logf(1.f / (1.f + (expf(0.f - (v * (tanhf(v_1)) + 2.f)))) + 0.1192029193043709f);
  }}
}

The computer on which it works has CUDA 11.4 and torch 1.10.0a0+git36449ea, and the computer on which it doesn't work has CUDA 11.5 and torch 1.10.1+cu102 (though I also tried with 1.10.0+cu102). So it seems like quite a similar setup. Any idea why it's failing?

It worked after updating the NVIDIA CUDA Toolkit to version 11.6!