I am meeting EXTREME fragmentation in my training, and I think it is due to the PointNet++ sample and group process here, because it was originally followed by torch.cuda.empty_cache()
calls, but this dramatically slows down training and thus I removed them. I wonder why this happens – number of points and batch size are always kept the same in my training process, and the allocator should work well.
The code is modified from Pointnet_Pointnet2_pytorch/models at master · yanx27/Pointnet_Pointnet2_pytorch · GitHub
def square_distance(src, dst):
"""
Calculate Euclid distance between each two points.
src^T * dst = xn * xm + yn * ym + zn * zm;
sum(src^2, dim=-1) = xn*xn + yn*yn + zn*zn;
sum(dst^2, dim=-1) = xm*xm + ym*ym + zm*zm;
dist = (xn - xm)^2 + (yn - ym)^2 + (zn - zm)^2
= sum(src**2,dim=-1)+sum(dst**2,dim=-1)-2*src^T*dst
Input:
src: source points, [B, N, C]
dst: target points, [B, M, C]
Output:
dist: per-point square distance, [B, N, M]
"""
B, N, _ = src.shape
_, M, _ = dst.shape
dist = -2 * torch.matmul(src, dst.permute(0, 2, 1))
dist += torch.sum(src ** 2, -1).view(B, N, 1)
dist += torch.sum(dst ** 2, -1).view(B, 1, M)
return dist
def index_points(points, idx):
"""
Input:
points: input points data, [B, N, C]
idx: sample index data, [B, S]
Return:
new_points:, indexed points data, [B, S, C]
"""
device = points.device
B = points.shape[0]
view_shape = list(idx.shape)
view_shape[1:] = [1] * (len(view_shape) - 1)
repeat_shape = list(idx.shape)
repeat_shape[0] = 1
batch_indices = torch.arange(B, dtype=torch.long).to(device).view(view_shape).repeat(repeat_shape)
new_points = points[batch_indices, idx, :]
return new_points
def farthest_point_sample(xyz, npoint):
"""
Input:
xyz: pointcloud data, [B, N, 3]
npoint: number of samples
Return:
centroids: sampled pointcloud index, [B, npoint]
"""
return dgl.geometry.farthest_point_sampler(xyz, npoint)
def query_ball_point(radius, nsample, xyz, new_xyz):
"""
Input:
radius: local region radius
nsample: max sample number in local region
xyz: all points, [B, N, 3]
new_xyz: query points, [B, S, 3]
Return:
group_idx: grouped points index, [B, S, nsample]
"""
device = xyz.device
B, N, C = xyz.shape
_, S, _ = new_xyz.shape
group_idx = torch.arange(N, dtype=torch.long).to(device).view(1, 1, N).repeat([B, S, 1])
sqrdists = square_distance(new_xyz, xyz)
group_idx[sqrdists > radius ** 2] = N
group_idx = group_idx.sort(dim=-1)[0][:, :, :nsample]
group_first = group_idx[..., :1].repeat([1, 1, nsample])
mask = group_idx == N
group_idx[mask] = group_first[mask]
return group_idx
def sample_and_group(npoint, radius, nsample, xyz, points, returnfps=False):
"""
Input:
npoint:
radius:
nsample:
xyz: input points position data, [B, N, 3]
points: input points data, [B, N, D]
Return:
new_xyz: sampled points position data, [B, npoint, nsample, 3]
new_points: sampled points data, [B, npoint, nsample, 3+D]
"""
B, N, C = xyz.shape
S = npoint
fps_idx = farthest_point_sample(xyz, npoint) # [B, npoint, C]
# torch.cuda.empty_cache()
new_xyz = index_points(xyz, fps_idx)
# torch.cuda.empty_cache()
idx = query_ball_point(radius, nsample, xyz, new_xyz)
# torch.cuda.empty_cache()
grouped_xyz = index_points(xyz, idx) # [B, npoint, nsample, C]
# torch.cuda.empty_cache()
grouped_xyz_norm = grouped_xyz - new_xyz.view(B, S, 1, C)
# torch.cuda.empty_cache()
if points is not None:
grouped_points = index_points(points, idx)
new_points = torch.cat([grouped_xyz_norm, grouped_points], dim=-1) # [B, npoint, nsample, C+D]
else:
new_points = grouped_xyz_norm
if returnfps:
return new_xyz, new_points, grouped_xyz, fps_idx
else:
return new_xyz, new_points
By EXTREME fragmentation I mean:
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File ".../src/models/pointnet_util.py", line 199, in forward
new_xyz, new_points = sample_and_group(self.npoint, self.radius, self.nsample, xyz, points)
File ".../src/models/pointnet_util.py", line 130, in sample_and_group
idx = query_ball_point(radius, nsample, xyz, new_xyz)
File ".../src/models/pointnet_util.py", line 105, in query_ball_point
group_idx = group_idx.sort(dim=-1)[0][:, :, :nsample]
RuntimeError: CUDA out of memory. Tried to allocate 7.63 GiB (GPU 0; 79.35 GiB total capacity; 27.47 GiB already allocated; 5.05 GiB free; 72.49 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
It is like crazy – 27GB allocated but 72GB reserved.
Any input is welcome, and thanks in advance!