Slow .backward() with DataLoader

The use of DataLoader significantly slows down backward pass as compared to simple data usage. See example below:

import torch
import time

# Custom dataset

class Dataset(torch.utils.data.Dataset):
    def __init__(self, input_x, input_y, input_t, labels):
        self.input_x = input_x
        self.input_y = input_y
        self.input_t = input_t
        self.labels = labels
        self.input_x.requires_grad = True
        self.input_y.requires_grad = True
        self.input_t.requires_grad = True
        self.labels.requires_grad = True
    
    def __len__(self):
        return self.labels.shape[0]
    
    def __getitem__(self, idx):
        return self.input_x[idx, :], self.input_y[idx, :], \
            self.input_t[idx, :], self.labels[idx, :]

# Neural Network (NN)
net = torch.nn.Sequential(
    torch.nn.Linear(3, 50),
    torch.nn.Tanh(),
    torch.nn.Linear(50, 50),
    torch.nn.Tanh(),
    torch.nn.Linear(50, 1)
    )

# Number of samples
num_samples = 1200

# ### Without Dataloader ###

# Generate (x, y, t) input data and output data
x, y, t = torch.rand(num_samples, 1, requires_grad=True), torch.rand(num_samples, 1, requires_grad=True), torch.rand(num_samples, 1, requires_grad=True)
output_data = torch.sin(torch.pi * x) + torch.cos(torch.pi * y) + t

# Optimiser
optimiser = torch.optim.Adam(net.parameters(), lr=1e-3)
optimiser.zero_grad()

# Compute loss u_t - (u_xx + u_yy) , u = NN prediction
output_pred = net(torch.cat((x, y, t), dim=1))
u_t = torch.autograd.grad(output_pred.sum(), t, create_graph=True)
u_x = torch.autograd.grad(output_pred.sum(), x, create_graph=True)
u_y = torch.autograd.grad(output_pred.sum(), y, create_graph=True)
u_xx = torch.autograd.grad(u_x[0].sum(), x, create_graph=True)
u_yy = torch.autograd.grad(u_y[0].sum(), y, create_graph=True)
loss_pde = torch.mean((u_t[0] - (u_xx[0] + u_yy[0]))**2)
start_time = time.process_time()
loss_pde.backward()
end_time = time.process_time()
print(f"PDE loss backward time without Dataloader: {end_time - start_time}")
optimiser.step()

# Compute MSE loss
output_pred = net(torch.cat((x, y, t), dim=1))
loss_mse = torch.mean((output_data - output_pred)**2)
start_time = time.process_time()
loss_mse.backward()
end_time = time.process_time()
print(f"MSE loss backward time without Dataloader: {end_time - start_time}")
optimiser.step()

# ### With Dataloader ###
# Generate (x, y, t) input data and output data
x, y, t = torch.rand(num_samples, 1), torch.rand(num_samples, 1), torch.rand(num_samples, 1)
output_data = torch.sin(torch.pi * x) + torch.cos(torch.pi * y) + t

# Create custom dataset and dataloder with only one batch
dataset = Dataset(x, y, t, output_data)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=x.shape[0], shuffle=False)

for batch, (x_data, y_data, t_data, output_data) in enumerate(dataloader):
    optimiser.zero_grad()
    
    # Compute loss u_t - (u_xx + u_yy) , u = NN prediction
    output_pred = net(torch.cat((x_data, y_data, t_data), dim=1))
    u_t = torch.autograd.grad(output_pred.sum(), t, create_graph=True)
    u_x = torch.autograd.grad(output_pred.sum(), x, create_graph=True)
    u_y = torch.autograd.grad(output_pred.sum(), y, create_graph=True)
    u_xx = torch.autograd.grad(u_x[0].sum(), x, create_graph=True)
    u_yy = torch.autograd.grad(u_y[0].sum(), y, create_graph=True)
    loss_pde = torch.mean((u_t[0] - (u_xx[0] + u_yy[0]))**2)
    start_time = time.process_time()
    loss_pde.backward()
    end_time = time.process_time()
    print(f"PDE loss backward time with Dataloader: {end_time - start_time}")
    optimiser.step()

    # Compute MSE loss
    output_pred = net(torch.cat((x_data, y_data, t_data), dim=1))
    loss_mse = torch.mean((output_data - output_pred)**2)
    start_time = time.process_time()
    loss_mse.backward()
    end_time = time.process_time()
    print(f"MSE loss backward time with Dataloader: {end_time - start_time}")
    optimiser.step()

The time taken for the backward pass without DataLoader is 1 order of magnitude lower than using DataLoader. Is there any reason for the same? Is it recommended to use DataLoader? If so, why?
Thanks in advance.

It seems you running everything on the CPU so the DataLoader will compete for resources with your actual training. You are also using the main process to load and process the data so it will be executed sequentially. Try to use multiple workers via num_workers>=1 and use your GPU for the actual model training allowing the CPU to prefetch next batches.