DataParallel only uses 1 GPU

I’ve created a model with a few convolutional layers followed by fully connected linear layers. It trains just fine on 1 GPU, but when I try using 2 GPUs, nvidia-smi shows that only one of them has any memory usage.

How I run my program:

CUDA_VISIBLE_DEVICES=0,2 python main.py

(torch.cuda.device_count() shows 2)
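
To double-check what PyTorch sees inside the process (CUDA_VISIBLE_DEVICES=0,2 re-indexes the two visible GPUs as 0 and 1), I run this quick sanity check:

import torch

print(torch.cuda.device_count())  # prints 2
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))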

My model:

import torch, sys, random
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import constants

class WebcamLocation(nn.Module):
    def __init__(self, input_shape=(3, 32, 128, 128)):
        super(WebcamLocation, self).__init__()

        self.kernel_sizes = [(1, 5, 5),
                             (4, 2, 2), # Look at 4 frames at once.
                             (1, 2, 2),
                             (1, 2, 2)] # one entry per layer; each tuple is (frames, height, width)
        self.output_channels = [16, 32, 48, 16] # each element corresponds to a layer
        self.paddings = [(0, 2, 2), (0, 1, 1), (0, 1, 1), (0, 0, 0)] 
        self.strides = [(1, 2, 2), (2, 1, 1), (1, 1, 1), (1, 1, 1)]
        #self.poolings = [(1, 2, 2), (1, 2, 2), None, (1, 2, 2)]
        #self.conv_relus = [True, True, False, True]       

        self.conv1 = nn.Conv3d(constants.NUM_CHANNELS, self.output_channels[0],
                               kernel_size=self.kernel_sizes[0],
                               stride=self.strides[0],
                               padding=self.paddings[0])
        self.conv2 = nn.Conv3d(self.output_channels[0], self.output_channels[1],
                               kernel_size=self.kernel_sizes[1],
                               stride=self.strides[1], # Skip every other frame.
                               padding=self.paddings[1])
        self.conv3 = nn.Conv3d(self.output_channels[1], self.output_channels[2],
                               kernel_size=self.kernel_sizes[2],
                               stride=self.strides[2],
                               padding=self.paddings[2])
        self.conv4 = nn.Conv3d(self.output_channels[2], self.output_channels[3],
                               kernel_size=self.kernel_sizes[3],
                               stride=self.strides[3],
                               padding=self.paddings[3])

        # Compute output size of convolutions to get input to fc layers.
        self.first_fc_layer_size = self.get_conv_output(input_shape)

        linear_sizes = [2000, 200]
        self.fc1 = nn.Linear(self.first_fc_layer_size, linear_sizes[0])
        self.fc2 = nn.Linear(linear_sizes[0], linear_sizes[1])
        self.fc3 = nn.Linear(linear_sizes[1], 1)

        self.network = torch.nn.DataParallel(
            nn.Sequential(self.conv1, self.conv2, self.conv3, self.conv4,
                          self.fc1, self.fc2, self.fc3))

    # Used to get output size of convolutions.
    def get_conv_output(self, shape):
        batch_size = 1 # Not important.
        input = Variable(torch.rand(batch_size, *shape), requires_grad=False)

        output_feat = self.forward_features(input)
        flattened_size = self.num_flat_features(output_feat)

        return flattened_size

    def forward_features(self, x):
        x = F.max_pool3d(F.relu(self.conv1(x)), (1, 2, 2))
        x = F.max_pool3d(F.relu(self.conv2(x)), (1, 2, 2))
        x = F.relu(self.conv3(x))
        x = F.max_pool3d(F.relu(self.conv4(x)), (1, 2, 2))

        return x

    def forward(self, x):
        # Convolutional layers.
        x = self.forward_features(x)

        # Flatten into vector.
        x = x.view(-1, self.first_fc_layer_size)

        # Fully connected layers.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In my main, I set

model = WebcamLocation()
model.cuda()

My training loop (train_epoch is called once per epoch by an outer loop over epochs):

def train_epoch(epoch, model, data_loader, optimizer):
    model.train()

    for batch_idx, (data, target) in enumerate(data_loader):
        data, target = Variable(data), Variable(target)

        target = target.float()

        if torch.cuda.is_available():
            data = data.cuda()
            target = target.cuda()

        optimizer.zero_grad()
        output = model(data)
        loss = train_loss_fn(output, target)
        loss.backward()
        optimizer.step()
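
For completeness, the outer loop is essentially this (the loss function, optimizer, and loader shown here are placeholders; the real ones are defined in my main):

train_loss_fn = nn.MSELoss()  # placeholder: my actual loss function lives in main
optimizer = torch.optim.Adam(model.parameters())  # placeholder optimizer
num_epochs = 10  # placeholder

for epoch in range(num_epochs):
    train_epoch(epoch, model, train_loader, optimizer)  # train_loader: placeholder name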

I’ve also tried wrapping DataParallel around the whole model instead of constructing it inside the WebcamLocation class (roughly as sketched below), and I’ve tried passing the device_ids=[0, 1] argument to DataParallel… but nothing works!
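
This is approximately what that attempt looked like (reconstructed from memory, so the details may be slightly off):

model = WebcamLocation()
model = torch.nn.DataParallel(model, device_ids=[0, 1])
model.cuda()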
Why doesn’t the 2nd GPU get used?