Hello all, I want to build a custom CUDA kernel for calculating multiple IoUs and summing them, to be called from a Python program later. I need to speed this up and thought of using the GPU, as this seems easily parallelizable.
The function calcsum looks something like this -
void calcsum(TensorAccessor<float, 3> a, TensorAccessor<float, 3> b)
{
float net_sum;
for (int i=0;i<a.size(1);++i)
{
for (int j=0;j<b.size(1);++j)
{
auto xi = at::max(a[0][i][0], b[0][j][0]);
auto yi = at::max(a[0][i][1], b[0][j][1]);
auto wi = torch::clamp(torch::min(a[0][i][2], b[0][j][3])-xi, 0);
auto hi = torch::clamp(torch::min(a[0][i][2], b[0][j][3])-yi, 0));
auto ai = wi*hi;
auto au = ((a[0][i][2]-a[0][i][0])*(a[0][i][3])-a[0][i][1]) + ((b[0][j][2]-b[0][j][0])*(b[0][j][3])-b[0][j][1])-ai;
auto iou =ai/au;
if (iou>0.4)
net_sum+=iou;
}
}
}
int main() {
torch::Tensor tensor = torch::rand({1, 50000, 4});
torch::Tensor tensor1 = torch::rand({1, 52, 4});
auto accessor_a = tensor.accessor<float, 3>();
auto accessor_b = tensor1.accessor<float, 3>();
calcsum(accessor_a, accessor_b);
}
I have just started reading about how to parallelize loops using cuda, so I am pretty new to the field.
Could someone please guide me on how to handle the nesting of the loops and how to go ahead with parallelizing them in CUDA?
TIA