I am trying to use multiple GPUs while training ResNet50 on the CIFAR-10 dataset. The code runs fine on a single GPU, but it stops making progress when I integrate net = torch.nn.DataParallel(net). No error is raised. While debugging, I found that evaluating the model on an input (loss = loss_fn(net(x), y)) seems to be the point where the program hangs. Any suggestions? Many thanks in advance!
import torch
import torchvision
import torchvision.transforms as transforms
# Upscale CIFAR-10 images to 224x224 so they match ResNet50's usual input size.
transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=10,
                                          shuffle=True, num_workers=0)

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = torchvision.models.resnet50()
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        net = torch.nn.DataParallel(net)  # replicate the model across all visible GPUs
    net.to(device)

    loss_fn = torch.nn.CrossEntropyLoss(reduction="mean")
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

    # Train ResNet50 model
    net.train()
    for epoch in range(1, 1 + 1):  # only one epoch for debugging
        train_loss = 0.0
        for x, y in trainloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = loss_fn(net(x), y)  # <-- this is where the program hangs
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
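
To help narrow this down, here is the minimal smoke test I could run next (just a sketch: it assumes at least two visible GPUs and pushes an arbitrary dummy batch through the model instead of using CIFAR-10). If this also hangs in the forward pass, the problem would seem to be in DataParallel's scatter/gather across the GPUs rather than in my dataset or training loop:

import torch
import torchvision

if __name__ == "__main__":
    device = torch.device("cuda")
    net = torchvision.models.resnet50()
    net = torch.nn.DataParallel(net)   # assumes >= 2 visible GPUs
    net.to(device)

    # Dummy batch instead of real data; the batch size and shape are arbitrary,
    # just large enough to be split across the GPUs.
    x = torch.randn(8, 3, 224, 224, device=device)
    out = net(x)                       # scatters x, runs the replicas, gathers outputs
    print("forward ok:", out.shape)    # if this never prints, the forward pass hangs

    out.sum().backward()               # exercises the gradient reduction as well
    print("backward ok")

If the smoke test hangs too, my next step would be to check GPU peer-to-peer access (e.g. with torch.cuda.can_device_access_peer(0, 1)), since I understand that broken P2P communication between cards can cause exactly this kind of silent hang.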