Multi-GPU DataParallel on ResNet50 hangs at the forward pass

I am trying to use multiple GPUs while training ResNet50 on the CIFAR-10 dataset. The code runs fine on one GPU, but it stops working as soon as I add net = torch.nn.DataParallel(net). No error is raised. While debugging I found that evaluating the model on an input (loss = loss_fn(net(x), y)) is the point where the program hangs. Any suggestions? Many thanks in advance!

import torch
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.RandomResizedCrop(224),   # upscale 32x32 CIFAR-10 images to ResNet's 224x224 input
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # ImageNet mean/std
])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=10,
                                                  shuffle=True, num_workers=0)

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    net = torchvision.models.resnet50(num_classes=10)  # CIFAR-10 has 10 classes, not the default 1000
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        net = torch.nn.DataParallel(net)
    net.to(device)
    loss_fn = torch.nn.CrossEntropyLoss(reduction="mean")

    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

    # Train ResNet50 model
    net.train()
    for epoch in range(1, 1 + 1):  # only one epoch for debugging
        train_loss = 0.0
        for x, y in trainloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = loss_fn(net(x), y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        print(f"epoch {epoch}: mean train loss {train_loss / len(trainloader):.4f}")
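
In case it helps to narrow things down, below is a stripped-down sketch of just the part that hangs: the same DataParallel wrapping and a single forward pass, but with a random batch in place of the CIFAR-10 loader (the 10x3x224x224 shape is assumed from the batch size and transform above), so the DataLoader, loss reduction and optimizer are out of the picture.

import torch
import torchvision

# Minimal sketch of the failing call: wrap ResNet50 in DataParallel and run one
# forward pass on random data. Assumes at least two visible CUDA GPUs and the
# 224x224 input size produced by the transform above.
device = torch.device("cuda")

net = torchvision.models.resnet50(num_classes=10)
net = torch.nn.DataParallel(net)
net.to(device)
net.eval()

x = torch.randn(10, 3, 224, 224, device=device)  # fake batch, batch_size=10 as above
y = torch.randint(0, 10, (10,), device=device)   # fake CIFAR-10 labels

with torch.no_grad():
    out = net(x)  # in the full script, this forward pass is where it hangs
    loss = torch.nn.functional.cross_entropy(out, y)

print(out.shape, loss.item())

If this minimal version also hangs, then I would assume the problem is in the multi-GPU forward itself rather than anywhere in my training loop.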