I have developed a batch linear regression algorithm in c++ which reads

```
#include <torch/extension.h>
/// param x: (batch, sample, feature)
/// param y: (batch, sample)
/// return (batch, feature)
at::Tensor lr_batch_fit(const at::Tensor &x, const at::Tensor &y){
const int64_t batch = x.size(0), sample = x.size(1), features = x.size(2);
const at::Tensor & xt = at::transpose(x, 1, 2);
const at::Tensor & s = at::bmm(xt, x);
const at::Tensor & yp = y.view({batch, sample, 1});
const at::Tensor & pre_beta = at::bmm(xt, yp);
at::Tensor s_lu, pivots, infos;
std::tie(s_lu, pivots, infos) = at::_lu_with_info(s, true, true);
/*
const auto data = at::_lu_with_info(s, true, true);
const at::Tensor & s_lu = std::get<0>(data);
const at::Tensor & pivots = std::get<1>(data);
const at::Tensor & infos = std::get<2>(data);
*/
at::Tensor beta = at::zeros({batch, features, 1}, x.options());
if(0 == infos.nonzero().size(0)){
at::lu_solve_out(beta, pre_beta, s_lu, pivots);
}
return beta.squeeze(2);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("lr_batch_fit_c", &lr_batch_fit);
}
```

The corrresponding python code is

```
import torch
def lr_batch_fit_py(x, y):
"""
:param x: (batch, sample, features)
:param y: (batch, sample,)
:return: beta (batch, features)
"""
batch, sample, features = x.shape
xt = torch.transpose(x, 1, 2)
s = torch.bmm(xt, x)
y = y.view(batch, sample, 1)
pre_beta = torch.bmm(xt, y)
s_lu, pivots, infos = torch.lu(s, pivot=True, get_infos=True)
beta = torch.zeros(batch, features, 1, dtype=x.dtype, device=x.device)
if 0 == infos.nonzero().size(0):
torch.lu_solve(pre_beta, s_lu, pivots, out=beta)
return beta.squeeze(2)
```

and here is the testing code

```
import torch
import time
from lr_batch_c import lr_batch_fit_c
from lr_batch import lr_batch_fit_py
def test_lr_batch():
def check_equal(x, y, eps=1e-8):
assert torch.sum(torch.abs(x - y)) <= eps
tt1, tt2 = 0, 0
x = torch.randn(4, 48000, 160, dtype=torch.float64).cuda()
beta = torch.randn(4, 160, 1, dtype=torch.float64).cuda()
y = torch.bmm(x, beta).squeeze(2)
beta = beta.squeeze(2)
for i in range(100):
t1 = time.time()
beta2 = lr_batch_fit_c(x, y)
t2 = time.time()
beta3 = lr_batch_fit_py(x, y)
t3 = time.time()
tt1 += (t2 - t1)
tt2 += (t3 - t2)
check_equal(beta, beta2)
check_equal(beta, beta3)
print("test_lr_batch c_time %.2fs py_time %.2fs" % (tt1, tt2))
```

After running `test_lr_batch`

several times with cuda, I found that the c++ version is slower than the py version. In my naive thinking, since c++ version will save some time at the boundary between py and c++, it should be faster. Could anyone help me figure it out?

(Note with cpu, the c++ version is a little bit faster than py verion, as anticipated.)

Thanks.