Hi, all. I’m struggling with a NaN issue. Any thoughts would be appreciated.
When I move the point cloud data to the GPU by calling .cuda(), it very rarely becomes NaN. The code I used for debugging is below.
import torch
import h5py
from torch.utils.data import DataLoader, Dataset
class PCDataset(Dataset):
    """In-memory dataset read from the ``"data"`` entry of an HDF5 file.

    The entry is sliced to its first ``num_points`` elements along axis 1 and
    loaded eagerly at construction time, so no file handle is kept afterwards.
    """

    def __init__(self, path="shapenet.h5", num_points=2048):
        """Load the data into memory.

        Args:
            path: HDF5 file containing a ``"data"`` dataset.
            num_points: Number of leading elements along axis 1 to keep.
        """
        # Slicing an h5py dataset reads the selection into a NumPy array, so
        # the file can be closed immediately instead of leaking the handle.
        with h5py.File(path, "r") as h5_file:
            self.pc_data = h5_file["data"][:, :num_points]

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        return self.pc_data[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.pc_data)
def main():
    """Sweep the dataset until a batch becomes non-finite after ``.cuda()``.

    Every batch is checked for NaN/Inf on the CPU, copied to the GPU, and
    checked again. For the first batch whose GPU copy is non-finite, the
    offending GPU values are printed together with the values at the same
    positions in the CPU batch and in a freshly re-loaded copy of that batch;
    the function then returns.
    """
    # Local import: only needed to re-fetch the failing batch.
    from itertools import islice

    dataset = PCDataset()
    data_loader = DataLoader(
        dataset,
        batch_size=64,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=True,
    )

    # The failure is rare, so keep iterating over epochs until it shows up.
    while True:
        for batch_idx, pc in enumerate(data_loader):
            cpu_bad = ~torch.isfinite(pc)
            if torch.any(cpu_bad):
                print("cpu contains NaN")
                nan_index = torch.nonzero(cpu_bad, as_tuple=True)
                print("cpu:", pc[nan_index])

            pc_cuda = pc.cuda()
            cuda_bad = ~torch.isfinite(pc_cuda)
            if not torch.any(cuda_bad):
                continue

            print("cuda contains NaN")
            nan_index = torch.nonzero(cuda_bad, as_tuple=True)
            print("cuda:", pc_cuda[nan_index])

            # nan_index was computed on the GPU. Move the indices to the CPU
            # before indexing CPU tensors so the lookup does not depend on
            # cross-device indexing being accepted.
            nan_index_cpu = tuple(idx.cpu() for idx in nan_index)
            print("cpu:", pc[nan_index_cpu])

            # shuffle=False keeps the batch order fixed, so the batch at the
            # same position of a new pass holds the same samples. islice
            # stops after that batch instead of materializing a whole epoch.
            pc_from_dl = next(islice(data_loader, batch_idx, None))
            print("from dl cpu:", pc_from_dl[nan_index_cpu])
            print("from dl cuda:", pc_from_dl[nan_index_cpu].cuda())
            return
# Run the reproduction loop only when this file is executed as a script.
if __name__ == "__main__":
    main()
Output:
cuda contains NaN
cuda: tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
device='cuda:0')
cpu: tensor([ 0.2349, -0.2397, -0.1102, 0.0039, 0.1992, 0.3378, 0.0317, -0.3546,
0.2329, -0.1955, -0.3463, 0.2202, 0.2179, 0.0039, 0.0975, 0.2006,
0.0394, 0.0548, -0.2069, 0.1202, -0.2385])
from dl cpu: tensor([ 0.2349, -0.2397, -0.1102, 0.0039, 0.1992, 0.3378, 0.0317, -0.3546,
0.2329, -0.1955, -0.3463, 0.2202, 0.2179, 0.0039, 0.0975, 0.2006,
0.0394, 0.0548, -0.2069, 0.1202, -0.2385])
from dl cuda: tensor([ 0.2349, -0.2397, -0.1102, 0.0039, 0.1992, 0.3378, 0.0317, -0.3546,
0.2329, -0.1955, -0.3463, 0.2202, 0.2179, 0.0039, 0.0975, 0.2006,
0.0394, 0.0548, -0.2069, 0.1202, -0.2385], device='cuda:0')
It’s really strange: the data on the CPU has no problem, but the data right after .cuda() occasionally becomes NaN, and even the same batch reloaded from the data loader is fine. My environment is as follows.
–Environment–
python 3.8
torch 1.7.1+cu110
NVIDIA-SMI: 460.91.03
CUDA Version: 11.2
GPU: RTX 3090