RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

I cannot reproduce the issue on a T4 with torch==1.13.0+cu117:

# Environment check: report the GPU model and the installed PyTorch build.
# The comment under each expression is the value it returned in this session.
import torch
torch.cuda.get_device_name(0)  # name of CUDA device 0
# 'Tesla T4'
torch.__version__  # installed PyTorch version string
# '1.13.0+cu117'

# Minimal reproduction attempt: build an affine sampling grid on the GPU.
import torch.nn.functional as F
import torch
# Random affine parameters of shape (1, 2, 3), created with torch.rand and
# then moved to the GPU via .to('cuda').
a = torch.rand((1, 2, 3)).to('cuda')
# Random 4-D tensor of shape (1, 3, 24, 94); only its size is used below.
b = torch.rand((1, 3, 24, 94)).to('cuda')
# Generate the sampling grid for an output of b's size. Per the affine_grid
# documentation the result has shape (N, H, W, 2), i.e. (1, 24, 94, 2) here.
# NOTE(review): presumably this is the call that raised the cuBLAS error in
# the original report -- confirm against the reporter's traceback.
grid = F.affine_grid(a, b.size())

print(grid)
tensor([[[[0.4507, 0.2959],
          [0.4582, 0.3064],
          [0.4656, 0.3169],
          ...,
          [1.1288, 1.2493],
          [1.1363, 1.2597],
          [1.1437, 1.2702]],

         [[0.4635, 0.3081],
          [0.4710, 0.3186],
          [0.4784, 0.3291],
          ...,
          [1.1417, 1.2615],
          [1.1491, 1.2719],
          [1.1566, 1.2824]],

         [[0.4763, 0.3203],
          [0.4838, 0.3308],
          [0.4913, 0.3413],
          ...,
          [1.1545, 1.2736],
          [1.1619, 1.2841],
          [1.1694, 1.2946]],
...