I’m trying to rewrite the following sparse multiplication operation as a C++ extension.
def sparse_mul(weights, bias, in_values, active_out_indices):
    # For the i-th input in the batch, produce outputs only for the nodes
    # listed in active_out_indices[i].
    # active_out_indices.size(1) is expected to be around 10% of weights.size(0).
    active_weights = weights[active_out_indices]  # (batch, k, in_dim), materialized copy
    active_bias = bias[active_out_indices]        # (batch, k), materialized copy
    out_values = torch.bmm(active_weights, in_values.unsqueeze(-1)).squeeze(-1) + active_bias
    return out_values
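For concreteness, here is roughly how I call it. The sizes are made up for illustration; the data_ptr comparison at the end is only there to confirm that the advanced indexing materializes a copy rather than a view:

import torch

batch, in_dim, out_dim, k = 32, 512, 1024, 100  # illustrative sizes only
weights = torch.randn(out_dim, in_dim)
bias = torch.randn(out_dim)
in_values = torch.randn(batch, in_dim)
active_out_indices = torch.randint(0, out_dim, (batch, k))

out = sparse_mul(weights, bias, in_values, active_out_indices)  # shape (batch, k)
print(weights[active_out_indices].data_ptr() == weights.data_ptr())  # False: it is a copy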
I want to rewrite this because the Python implementation performs unnecessary data copies due to the advanced indexing, as the data_ptr check above confirms. The following is the C++ code I have so far. It runs almost 50x slower (on my 4-core CPU) than both the Python implementation and a regular dense multiplication. Can someone tell me what I’m doing wrong? Can I not expect a speedup here? I’m new to the PyTorch C++ API and am trying to follow the solution given in https://discuss.pytorch.org/t/using-at-parallel-for-in-a-custom-operator/82747/5
// sparse_mul.cpp
#include <torch/extension.h>
#include <vector>

// Defining _OPENMP before this include is taken from the linked forum post.
#define _OPENMP
#include <ATen/ParallelOpenMP.h>

torch::Tensor mul(
    torch::Tensor in_values,
    torch::Tensor active_out_indices,
    torch::Tensor weights,
    torch::Tensor bias) {
  auto out_values = torch::empty(active_out_indices.sizes());
  auto active_out_indices_a = active_out_indices.accessor<long, 2>();
  int64_t batch_size = active_out_indices_a.size(0);
  int64_t active_out_dim = active_out_indices_a.size(1);
  // Parallelize over the batch dimension; each worker handles [start, end).
  at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
    for (int64_t i = start; i < end; i++) {
      torch::set_num_threads(1);  // keep the inner ops single-threaded
      auto in_values_i = in_values[i];
      // One dot product per active output node.
      for (int64_t j = 0; j < active_out_dim; j++) {
        out_values[i][j] =
            torch::dot(in_values_i, weights[active_out_indices_a[i][j]]) +
            bias[active_out_indices_a[i][j]];
      }
    }
  });
  return out_values;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("mul", &mul, "sparse multiplication");
}
# setup.py
from setuptools import setup
from torch.utils import cpp_extension

cpp_module = cpp_extension.CppExtension(
    'sparse_mul_cpp',
    sources=['sparse_mul.cpp'],
    extra_compile_args=['-fopenmp'],
    extra_link_args=['-lgomp'],
)

setup(
    name='sparse_mul_cpp',
    ext_modules=[cpp_module],
    cmdclass={'build_ext': cpp_extension.BuildExtension},
)
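After building with python setup.py install, I call the extension from Python roughly like this (same illustrative sizes as above; note the argument order follows the C++ signature, not the Python function’s):

import torch
import sparse_mul_cpp

batch, in_dim, out_dim, k = 32, 512, 1024, 100  # illustrative sizes only
weights = torch.randn(out_dim, in_dim)
bias = torch.randn(out_dim)
in_values = torch.randn(batch, in_dim)
active_out_indices = torch.randint(0, out_dim, (batch, k))

out = sparse_mul_cpp.mul(in_values, active_out_indices, weights, bias)  # shape (batch, k)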
I also get “a leaf Variable that requires grad is being used in an in-place operation” error when I try to use larger sizes for the weight matrix. I’m using PyTorch version 1.6.0.
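Presumably this is related to the parameters requiring gradients, since the error message implies some leaf tensor has requires_grad=True. A sketch of the kind of setup that fails, with made-up sizes (the exact threshold is from memory):

import torch
import sparse_mul_cpp

# made-up sizes; the error only shows up once the weight matrix gets large
weights = torch.randn(100000, 512, requires_grad=True)
bias = torch.randn(100000, requires_grad=True)
in_values = torch.randn(32, 512)
active_out_indices = torch.randint(0, 100000, (32, 100))

out = sparse_mul_cpp.mul(in_values, active_out_indices, weights, bias)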
Thank you for your time