I am having trouble getting .to(device) to work asynchronously. The training loop in the first code snippet below takes about 3x longer than the one in the second snippet. The first snippet uses pin_memory=True, non_blocking=True, and num_workers=12; the second moves the tensors to the GPU inside __getitem__ and uses num_workers=0. The images being loaded have shape [1, 512, 512], and the target is a single float32.
Is there something I need to set in the CUDA drivers?
GPU: V100
PyTorch: 1.1.0
Python: 3.7.4
CUDA (nvcc): release 9.2, V9.2.148
conda: 4.6.14
Ubuntu 16.04.5
# This is very slow.
import torch
from torch.utils.data import Dataset, DataLoader

device = "cuda"

class MyDataset(Dataset):
    def __getitem__(self, idx):
        image = self.get_image_tensor(idx)   # CPU tensor, shape [1, 512, 512]
        target = self.get_target(idx)        # single float32 value
        return {"images": image, "targets": target}

train_dataset = MyDataset()
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=12,
    pin_memory=True)

def train():
    for batch in train_loader:
        images = batch["images"].to(device, non_blocking=True)
        targets = batch["targets"].to(device, non_blocking=True)
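One thing worth checking in the slow version is whether the collated batches actually come back in pinned memory, since non_blocking=True only overlaps the host-to-device copy when the source tensor is pinned; otherwise it falls back to a synchronous copy. Below is a quick sanity check (just a sketch; check_pinning is a throwaway helper name, and it uses the loader defined above):

import torch

def check_pinning(loader, device="cuda"):
    batch = next(iter(loader))
    # With pin_memory=True the collated CPU tensors should report True here;
    # only then can .to(device, non_blocking=True) overlap with other work.
    print("images pinned:", batch["images"].is_pinned())
    print("targets pinned:", batch["targets"].is_pinned())
    images = batch["images"].to(device, non_blocking=True)
    # The copy is queued on the current CUDA stream; synchronize so it has
    # definitely finished before inspecting the result.
    torch.cuda.synchronize()
    print("copied batch shape:", images.shape)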
# This is faster, but still slower than it should be.
device = "cuda"

class MyDataset(Dataset):
    def __getitem__(self, idx):
        image = self.get_image_tensor(idx).to(device)
        target = self.get_target(idx).to(device)
        return {"images": image, "targets": target}

train_dataset = MyDataset()
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=0,
    pin_memory=False)

def train():
    for batch in train_loader:
        images = batch["images"]
        targets = batch["targets"]
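For reference, this is roughly the kind of loop I'm using to compare the two setups (a minimal sketch; time_epoch is just a helper name and the actual forward/backward pass is elided). The synchronize calls are there so that queued GPU work is actually counted in the measurement:

import time
import torch

def time_epoch(loader, device="cuda"):
    torch.cuda.synchronize()          # make sure no earlier GPU work leaks into the timing
    start = time.perf_counter()
    for batch in loader:
        # .to() on a tensor that is already on the GPU is a no-op,
        # so the same loop works for both snippets above.
        images = batch["images"].to(device, non_blocking=True)
        targets = batch["targets"].to(device, non_blocking=True)
        # ... forward / backward would go here ...
    torch.cuda.synchronize()          # wait for all queued GPU work before stopping the clock
    return time.perf_counter() - start

print("epoch time:", time_epoch(train_loader))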