Split Single GPU

The error seems to be related to an issue with multiprocessing and CUDA.
Have a look at the docs on Sharing CUDA tensors.

You have to use the “spawn” or “forkserver” start method.
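
For example, both ways of selecting the start method look like this (the script below uses the get_context variant):

import torch.multiprocessing as mp

# Option 1: set the start method globally (once, in the main module)
# mp.set_start_method('spawn')

# Option 2: request a 'spawn' context explicitly without touching the
# global default, as the script below does
ctx = mp.get_context('spawn')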

Also, the time measurement in your first script is a bit off, because you have to call torch.cuda.synchronize() before taking the end time.
CUDA calls are asynchronous, so the end time might be recorded before the CUDA operation has actually finished.
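
A minimal timing pattern would look like this (model and data stand in for your actual workload):

import time
import torch

torch.cuda.synchronize()  # finish pending kernels before starting the clock
start = time.time()

output = model(data)  # launches asynchronous CUDA kernels

torch.cuda.synchronize()  # wait for the kernels to finish
end = time.time()
print('elapsed: {:.3f}s'.format(end - start))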

Here is a small script for your second use case, which might serve as a starting point (I'm not sure if you need modelX.share_memory()):

import torch
import torch.nn as nn
import torch.optim as optim

import torch.multiprocessing as _mp
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader


# Globals
mp = _mp.get_context('spawn')
use_cuda = True


class Flatten(nn.Module):
    """Flattens all dimensions except the batch dimension."""
    def __init__(self):
        super(Flatten, self).__init__()

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return x


def get_model():
    # Small CNN for the 3x224x224 FakeData images; three 2x2 pooling stages
    # reduce the spatial size to 28x28 before the classifier.
    model = nn.Sequential(
            nn.Conv2d(3, 6, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(6, 16, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 1, 3, 1, 1),
            nn.MaxPool2d(2),
            Flatten(),
            nn.Linear(28*28, 10),
            nn.LogSoftmax(dim=1)
    )
    
    return model


def train(model, data_loader, optimizer, criterion):
    # Each process runs one full pass over data_loader on its own model.
    for data, labels in data_loader:
        labels = labels.long()
        if use_cuda:
            data, labels = data.to('cuda'), labels.to('cuda')
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()


if __name__ == '__main__':
    num_processes = 2
    model1 = get_model()
    model2 = get_model()
    if use_cuda:
        model1 = model1.to('cuda')
        model2 = model2.to('cuda')
    
    dataset = datasets.FakeData(transform=transforms.ToTensor())
    data_loader = DataLoader(dataset, batch_size=2,
                             num_workers=0,
                             pin_memory=False)
    
    criterion = nn.NLLLoss()
    optimizer1 = optim.SGD(model1.parameters(), lr=1e-3)
    optimizer2 = optim.SGD(model2.parameters(), lr=1e-3)
    
    #model1.share_memory()
    #model2.share_memory()
    processes = []
    p1 = mp.Process(target=train, args=(model1, data_loader, optimizer1, criterion))
    p1.start()
    processes.append(p1)
    p2 = mp.Process(target=train, args=(model2, data_loader, optimizer2, criterion))
    p2.start()
    processes.append(p2)

    for p in processes:
        p.join()
    
    print('Done')

However, I’m still not sure if you’ll see any performance advantage.
It would be nice if you could time your script and report the results for both the sequential and the multiprocessing approach.
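
A rough sketch of the comparison could look like this (reusing the names from the script above):

import time

# Sequential baseline: both trainings run one after the other in this process.
torch.cuda.synchronize()
t0 = time.time()
train(model1, data_loader, optimizer1, criterion)
train(model2, data_loader, optimizer2, criterion)
torch.cuda.synchronize()
print('sequential: {:.3f}s'.format(time.time() - t0))

# Multiprocessing version: time the start/join block from the script above.
t0 = time.time()
p1 = mp.Process(target=train, args=(model1, data_loader, optimizer1, criterion))
p2 = mp.Process(target=train, args=(model2, data_loader, optimizer2, criterion))
p1.start(); p2.start()
p1.join(); p2.join()
print('multiprocessing: {:.3f}s'.format(time.time() - t0))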
