I’m trying to use pin_memory together with non_blocking transfers to speed up moving data to the GPU, but I’m running into a problem where the data does not seem to be transferred to the GPU at all.
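For reference, the pattern I’m aiming for is the usual pinned-host-memory transfer from the PyTorch docs (a minimal sketch with placeholder names, not my actual code; assumes a CUDA device is available):

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))
loader = DataLoader(dataset, batch_size=16, pin_memory=True)  # pins each batch in host memory
net = nn.Linear(8, 2).to("cuda")
for x, y in loader:
    x = x.to("cuda", non_blocking=True)  # asynchronous H2D copy from pinned memory
    y = y.to("cuda", non_blocking=True)
    out = net(x)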
The error is:
RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

As I understand it, this means the model’s weights are on the GPU while the input batch is still a plain CPU tensor.
Any help figuring out why and where the problem occurs would be appreciated. The code snippets below give the details of my setup.
My dataset is a custom PulsarData class, to which I added a pin_memory method as follows:
def pin_memory(self):
    self.ft_data = self.ft_data.pin_memory()
    self.dt_data = self.dt_data.pin_memory()
    self.labels = self.labels.pin_memory()
    return self
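To check that pinning takes effect at all, the tensors can be inspected directly with Tensor.is_pinned() (a quick sanity check, separate from the training code):

ds = PulsarData(files=train_data_files).pin_memory()
print(ds.ft_data.is_pinned())  # expect True if pinning succeeded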
In addition, the class has the following __getitem__ method:
def __getitem__(self, index: int) -> tuple:
    ft_data = np.empty((*self.ft_dim, self.n_channels))
    dt_data = np.empty((*self.dt_dim, self.n_channels))
    # Do some processing before passing observation to model
    ft_data = s.detrend(np.nan_to_num(np.array(self.ft_data[index], dtype=np.float32).T))
    ft_data /= np.std(ft_data)
    ft_data -= np.median(ft_data)
    dt_data = np.nan_to_num(np.array(self.dt_data[index], dtype=np.float32))
    dt_data /= np.std(dt_data)
    dt_data -= np.median(dt_data)
    ft_data = np.reshape(ft_data, (self.n_channels, *self.ft_dim))
    dt_data = np.reshape(dt_data, (self.n_channels, *self.dt_dim))
    # Return data as PyTorch tensors
    return torch.from_numpy(ft_data), torch.from_numpy(dt_data), torch.tensor(self.labels[index])
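Note that __getitem__ builds fresh NumPy arrays and returns new CPU tensors for every sample. A quick check on one sample confirms this (sanity check only; same files as in the training script below):

sample_ds = PulsarData(files=train_data_files)
ft, dt, lbl = sample_ds[0]
print(ft.dtype, ft.device)  # expect torch.float32 and cpu at this point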
The dataloaders in my training code are set up as follows:
train_data = PulsarData(files=train_data_files)
train_data, validate_data = random_split(train_data, [0.85, 0.15])
tr_dataloader = DataLoader(train_data, batch_size=args.batch_size, pin_memory=True, shuffle=True)
v_dataloader = DataLoader(validate_data, batch_size=args.batch_size, pin_memory=True, shuffle=False)
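For reference, DEVICE is a module-level constant pointing at the GPU, along these lines (the exact line in my code may differ, but it resolves to cuda):

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")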
I pass the dataloader to another function that performs a single training pass (NOTE: I use the same DEVICE value to load the model onto the GPU):
def train_loop(dataloader: DataLoader,
               model: nn.Module,
               data: str,
               loss_fn: _Loss,
               optimizer: Optimizer,
               batch_size: int,
               ) -> None:
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    model.train()
    start_time = time.time()
    for batch_idx, (freq_data, dm_data, labels) in enumerate(dataloader):
        batch_data = None
        # Load labels to device
        labels = labels.to(DEVICE, non_blocking=True)
        # Add some noise to freq data to help avoid overtraining
        if data == "freq":
            noise = torch.randn_like(freq_data) * 0.1
            batch_data = freq_data + noise
        elif data == "dm":
            batch_data = dm_data
        else:
            print(f"Invalid data type provided: {data}", flush=True)
            sys.exit(1)
        batch_data.to(DEVICE, non_blocking=True)
        predicted = model(batch_data)
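For completeness, the model itself is moved to the GPU once before training starts (a sketch; PulsarModel is a placeholder for my actual model class):

model = PulsarModel()     # placeholder name for my network
model = model.to(DEVICE)  # same DEVICE constant used in train_loop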