The input data sometimes becomes NaN after .cuda()

63days · August 6, 2021, 7:49am

Hi, all. I’m struggling with a NaN issue. Any thoughts would be appreciated.
When I move the point cloud data to the GPU by calling .cuda(), it very occasionally becomes NaN. The code I used for debugging is below.

import torch
import h5py
from torch.utils.data import DataLoader, Dataset

class PCDataset(Dataset):
    """Point-cloud dataset loaded fully into memory from an HDF5 file.

    The ``"data"`` dataset of the file is read once at construction time and
    truncated to the first ``num_points`` entries along its second axis.
    Items are then served straight from the in-memory array.

    Args:
        path: Path of the HDF5 file to read. Defaults to ``"shapenet.h5"``.
        num_points: Number of entries kept along the second axis of
            ``"data"``. Defaults to ``2048``.
    """

    def __init__(self, path="shapenet.h5", num_points=2048):
        # Slicing an h5py dataset copies the selection into a NumPy array, so
        # the file can be closed as soon as the slice has been taken. The
        # context manager guarantees the handle is released even on error.
        with h5py.File(path, "r") as h5_file:
            self.pc_data = h5_file["data"][:, :num_points]

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        return self.pc_data[index]

    def __len__(self):
        """Return the number of samples (length of the first axis)."""
        return len(self.pc_data)

def main():
    """Iterate over the dataset until a batch is non-finite after ``.cuda()``.

    Every batch is checked for non-finite values on the CPU and again after
    being copied to the GPU. On the first batch whose GPU copy contains
    non-finite values, the offending entries are printed from the GPU copy,
    the original CPU batch, and a freshly reloaded copy of the same batch
    (on CPU and after ``.cuda()``); the function then returns. If no such
    batch ever appears, the loop runs forever.
    """
    # Local import keeps this function self-contained.
    from itertools import islice

    dataset = PCDataset()
    data_loader = DataLoader(
        dataset,
        batch_size=64,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=True,
    )

    while True:
        for batch_idx, pc in enumerate(data_loader):
            # Compute each mask once and reuse it for the test and the indices.
            cpu_bad = ~torch.isfinite(pc)
            if cpu_bad.any():
                print("cpu contains NaN")
                nan_index = torch.nonzero(cpu_bad, as_tuple=True)
                print("cpu:", pc[nan_index])

            pc_cuda = pc.cuda()
            cuda_bad = ~torch.isfinite(pc_cuda)
            if not cuda_bad.any():
                continue

            print("cuda contains NaN")
            nan_index = torch.nonzero(cuda_bad, as_tuple=True)
            print("cuda:", pc_cuda[nan_index])

            # The indices above live on the GPU. Move them to the CPU before
            # indexing CPU tensors: newer PyTorch releases are expected to
            # reject CUDA index tensors applied to a CPU tensor.
            nan_index = tuple(ix.cpu() for ix in nan_index)
            print("cpu:", pc[nan_index])

            # Reload only up to the offending batch instead of materialising
            # every batch of the loader in a list. shuffle=False makes the
            # batch order reproducible, so this is the same batch.
            pc_from_dl = next(islice(data_loader, batch_idx, None))
            print("from dl cpu:", pc_from_dl[nan_index])
            print("from dl cuda:", pc_from_dl[nan_index].cuda())
            return

# Run the reproduction loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Output:
cuda contains NaN
cuda: tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
device='cuda:0')
cpu: tensor([ 0.2349, -0.2397, -0.1102, 0.0039, 0.1992, 0.3378, 0.0317, -0.3546,
0.2329, -0.1955, -0.3463, 0.2202, 0.2179, 0.0039, 0.0975, 0.2006,
0.0394, 0.0548, -0.2069, 0.1202, -0.2385])
from dl cpu: tensor([ 0.2349, -0.2397, -0.1102, 0.0039, 0.1992, 0.3378, 0.0317, -0.3546,
0.2329, -0.1955, -0.3463, 0.2202, 0.2179, 0.0039, 0.0975, 0.2006,
0.0394, 0.0548, -0.2069, 0.1202, -0.2385])
from dl cuda: tensor([ 0.2349, -0.2397, -0.1102, 0.0039, 0.1992, 0.3378, 0.0317, -0.3546,
0.2329, -0.1955, -0.3463, 0.2202, 0.2179, 0.0039, 0.0975, 0.2006,
0.0394, 0.0548, -0.2069, 0.1202, -0.2385], device='cuda:0')

It’s really strange: the data on the CPU has no problem, but the same data occasionally becomes NaN right after .cuda(), and even the batch reloaded from the dataloader is fine, both on the CPU and after .cuda(). My environment is as follows.

–Environment–
python 3.8
torch 1.7.1+cu110
NVIDIA-SMI: 460.91.03
CUDA Version: 11.2
GPU: RTX 3090

ptrblck · August 11, 2021, 4:39am

Could you update to the latest stable PyTorch release (or the nightly) and rerun your script, please?