I am trying to optimize code that calls the `radius` function from pytorch_cluster:
from typing import Optional

import torch
import torch._dynamo as dynamo
from torch_cluster import radius
def myradius(x: torch.Tensor, y: torch.Tensor, r: float,
             batch_x: Optional[torch.Tensor] = None,
             batch_y: Optional[torch.Tensor] = None,
             max_num_neighbors: int = 32,
             num_workers: int = 1) -> torch.Tensor:
    """Forward every argument unchanged to ``torch_cluster.radius``.

    This wrapper exists only to give ``dynamo.explain`` a plain Python
    callable to trace; it adds no logic of its own.

    Args:
        x: First point tensor, passed through as-is.
        y: Second point tensor, passed through as-is.
        r: Radius value, passed through as-is.
        batch_x: Optional batch vector for ``x``.
        batch_y: Optional batch vector for ``y``.
        max_num_neighbors: Passed through as-is.
        num_workers: Passed through as-is.

    Returns:
        Whatever ``radius`` returns for these arguments.
    """
    # Keyword forwarding keeps the call correct even if the wrapped
    # function's optional parameters are ever reordered.
    return radius(x, y, r,
                  batch_x=batch_x,
                  batch_y=batch_y,
                  max_num_neighbors=max_num_neighbors,
                  num_workers=num_workers)
# Minimal reproduction: one single-element 1-D tensor per point set, both
# placed on the first CUDA device.
device = torch.device('cuda:0')
x2 = torch.tensor([0.0], device=device)
y2 = torch.tensor([1.0], device=device)
# Only x, y and r (= 2) are supplied; batch_x / batch_y keep their None
# defaults for this call.
dynamo.explain(myradius, x2, y2, 2)
Executing this code results in the following error:
Traceback (most recent call last):
File "/home/raul/mambaforge/envs/torch2-test/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 1194, in run_node
return node.target(*args, **kwargs)
File "/home/raul/mambaforge/envs/torch2-test/lib/python3.10/site-packages/torch/_ops.py", line 499, in __call__
return self._op(*args, **kwargs or {})
RuntimeError: The tensor has a non-zero number of elements, but its data is not allocated yet. Caffe2 uses a lazy allocation, so you will need to call mutable_data() or raw_
mutable_data() to actually allocate memory.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/raul/mambaforge/envs/torch2-test/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 1152, in get_fake_value
return wrap_fake_exception(
File "/home/raul/mambaforge/envs/torch2-test/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 808, in wrap_fake_exception
return fn()
File "/home/raul/mambaforge/envs/torch2-test/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 1153, in <lambda>
lambda: run_node(tx.output, node, args, kwargs, nnmodule)
File "/home/raul/mambaforge/envs/torch2-test/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 1206, in run_node
raise RuntimeError(
RuntimeError: Failed running call_function torch_cluster.radius(*(FakeTensor(FakeTensor(..., device='meta', size=(1, 1)), cuda:0), FakeTensor(FakeTensor(..., device='meta',
size=(1, 1)), cuda:0), None, None, 2, 32, 1), **{}):
The tensor has a non-zero number of elements, but its data is not allocated yet. Caffe2 uses a lazy allocation, so you will need to call mutable_data() or raw_mutable_data()
to actually allocate memory.
(scroll up for backtrace)
I do not understand this error at all. Some pointers on how to approach this error would be much appreciated.
On the other hand, the definition of the radius function in pytorch_cluster is not that complicated; most of the complexity is hidden on the C++ side:
def radius(x: torch.Tensor, y: torch.Tensor, r: float,
           batch_x: Optional[torch.Tensor] = None,
           batch_y: Optional[torch.Tensor] = None, max_num_neighbors: int = 32,
           num_workers: int = 1) -> torch.Tensor:
    """Python front end for the ``torch_cluster.radius`` C++ operator.

    Normalizes the point tensors, turns the optional batch vectors into
    pointer tensors, and dispatches to ``torch.ops.torch_cluster.radius``.

    Args:
        x: Point tensor. A 1-D tensor is viewed as shape ``(N, 1)``.
        y: Point tensor. A 1-D tensor is viewed as shape ``(M, 1)``.
        r: Radius value, forwarded unchanged to the operator.
        batch_x: Optional batch vector with one entry per row of ``x``.
            It is used as the ``boundaries`` argument of
            ``torch.bucketize``, which expects a sorted sequence.
        batch_y: Optional batch vector with one entry per row of ``y``;
            same expectations as ``batch_x``.
        max_num_neighbors: Forwarded unchanged to the operator.
        num_workers: Forwarded unchanged to the operator.

    Returns:
        The tensor produced by the operator. When ``x`` or ``y`` has no
        elements, an empty ``(2, 0)`` ``torch.long`` tensor on ``x.device``
        is returned without calling the operator.

    Raises:
        AssertionError: If a batch vector's length does not match its point
            tensor, or if more than one batch is detected while one of the
            batch vectors is missing.
    """
    # Empty input: there is nothing to match, and ``batch.max()`` below
    # would raise on an empty tensor.
    # NOTE(review): assumes the operator's result is a 2 x num_pairs long
    # index tensor -- confirm against the torch_cluster C++ op.
    if x.numel() == 0 or y.numel() == 0:
        return torch.empty(2, 0, dtype=torch.long, device=x.device)

    # The operator is always handed 2-D, contiguous tensors.
    x = x.view(-1, 1) if x.dim() == 1 else x
    y = y.view(-1, 1) if y.dim() == 1 else y
    x, y = x.contiguous(), y.contiguous()

    # Number of batches = largest batch index seen in either vector, plus 1.
    batch_size = 1
    if batch_x is not None:
        assert x.size(0) == batch_x.numel()
        batch_size = int(batch_x.max()) + 1
    if batch_y is not None:
        assert y.size(0) == batch_y.numel()
        batch_size = max(batch_size, int(batch_y.max()) + 1)

    # Pointer tensors are only built when there is more than one batch;
    # otherwise the operator receives None for both.
    ptr_x: Optional[torch.Tensor] = None
    ptr_y: Optional[torch.Tensor] = None
    if batch_size > 1:
        assert batch_x is not None
        assert batch_y is not None
        # bucketize(arange, batch) yields, for each batch index, the offset
        # at which that batch starts in the (sorted) batch vector.
        arange = torch.arange(batch_size + 1, device=x.device)
        ptr_x = torch.bucketize(arange, batch_x)
        ptr_y = torch.bucketize(arange, batch_y)

    return torch.ops.torch_cluster.radius(x, y, ptr_x, ptr_y, r,
                                          max_num_neighbors, num_workers)
Suspecting that ptr_x/ptr_y being None was somehow causing the error, I tried calling the function in a way that should construct them:
# Second attempt: 2-D float points plus explicit batch vectors, on the same
# `device` as the first snippet.
x = torch.tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]], device=device).float()
# Every batch index is 0, i.e. all points belong to a single batch.
batch_x = torch.tensor([0, 0, 0, 0], device=device)
y = torch.tensor([[-1, 0], [1, 0]], device=device).float()
batch_y = torch.tensor([0, 0], device=device)
# NOTE(review): with all-zero batch vectors the quoted radius() computes
# batch_size == 1 and skips its `batch_size > 1` branch, so ptr_x / ptr_y
# appear to stay None in this call as well -- confirm.
dynamo.explain(myradius, x, y, 1.5, batch_x, batch_y)
This code succeeds. But I am clueless as to why -.-