DataParallel uses only one device

Hello PyTorchers!

I am new to PyTorch, so this could just be a silly mistake on my part.
Recently I have been trying to build a language model in PyTorch (please see the code below). I am using DataParallel, and when I call the model I can see that it does split the input, but for some reason only one device ends up doing the work: nvidia-smi shows that only one GPU is actually busy. Why is that?

import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from pandas import DataFrame as df
import re

DIR = './'

S       = '_абвгдеёжзийклмнопрстуфхцчшщьыъэюя '
char2id = dict([(c, i) for i, c in enumerate(S)])
id2char = dict([(i, c) for i, c in enumerate(S)])

# Hyper Parameters
NUMDEVICE     = 3

hidden_size   = 512
batch_size    = NUMDEVICE * 32
linesize      = 128
num_epochs    = 5
num_samples   = 1000  # number of words to be sampled
learning_rate = 0.002

# RNN Based Language Model
class RNNLM(nn.Module):
    def __init__(self, charnum, hidden_size):
        super(RNNLM, self).__init__()
        self.embed  = nn.Embedding(charnum, charnum)
        self.lstm1  = nn.LSTM(charnum, hidden_size, 1, batch_first=True)
        self.linear = nn.Linear(hidden_size, charnum)
        self.init_weights()

    def init_weights(self):
        self.linear.bias.data.fill_(0)
        self.linear.weight.data.uniform_(-0.1, 0.1)

    def forward(self, x, h, t):
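        # t (the targets) is passed through forward untouched so that nn.DataParallel
        # scatters and gathers it together with the outputs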
        x      = self.embed(x)
        out, h = self.lstm1(x, h)
        out = out.contiguous().view(out.size(0) * out.size(1), out.size(2))
        out = self.linear(out)
        return out, h, t

class LangDataset(Dataset):
    def __init__(self, csv_file, char2id):
        data = df.from_csv(csv_file, sep='\t')['FullTranscriptionText'].values
        self.text = []
        for i in range(len(data)):
            line = ''
            for word in re.findall('[' + S[1:-1] + ']+', data[i].lower()):
                if len(line) + 1 + len(word) >= linesize:
                    self.text.append(line)
                    line = ''
                if len(word) + 1 > linesize:
                    continue
                line += word + ' '
            if len(line) > 0:
                self.text.append(line[:-1])

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        data = np.zeros(linesize)
        for k, c in enumerate(self.text[idx]):
            data[k] = char2id[c]
        return (data[:-1], data[1:])

dataset = LangDataset(DIR + 'audio_manifest.tsv', char2id)

model = RNNLM(len(S), hidden_size)
model = nn.DataParallel(model)
model = model.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Truncated Backpropagation
def detach(states):
    return [state.detach() for state in states]

# Training
for epoch in range(num_epochs):
    # Initial hidden and memory states, shaped (num_layers, batch_size / NUMDEVICE, hidden_size)
    states = (Variable(torch.zeros(1, int(batch_size / NUMDEVICE), hidden_size)).cuda(),
              Variable(torch.zeros(1, int(batch_size / NUMDEVICE), hidden_size)).cuda())

    for i, batch in enumerate(DataLoader(dataset=dataset, batch_size=batch_size, num_workers=4, drop_last=True)):
        # Get batch inputs and targets
        inputs  = Variable(batch[0]).cuda()
        targets = Variable(batch[1]).cuda()

        # Forward + Backward + Optimize
        model.zero_grad()
        states = detach(states)

        outputs, states, targets = model(inputs.long(), states, targets)

        # Cast values
        targets = targets.view(targets.size(0) * targets.size(1))
        targets = targets.long()

        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()

        if i % 100 == 0:
            print('Epoch [%d/%d], Step[%d/%d], Loss: %.3f, Perplexity: %5.2f' %
                  (epoch + 1, num_epochs, i, len(dataset) / batch_size, loss.data[0], np.exp(loss.data[0])))

This is because you are giving it a batch size of 1. DataParallel splits over the batch dimension, and hence with a batch size of 1 it can't use GPUs 2 and 3 as you asked.
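To make the split visible, here is a minimal sketch (assuming three visible GPUs; the Probe module is purely illustrative and not part of the code above). nn.DataParallel chunks every positional tensor argument along dim 0, so a tensor whose first dimension is 1, like the (1, batch, hidden) LSTM states above, can only produce a single chunk, which appears to be what keeps the whole call on a single replica here:

import torch
import torch.nn as nn
from torch.autograd import Variable

class Probe(nn.Module):
    # toy module, only used to show what each DataParallel replica receives
    def forward(self, x, h):
        # report which GPU this replica runs on and how large its slices are
        print('gpu %d: x chunk %d, h chunk %d' %
              (torch.cuda.current_device(), x.size(0), h.size(0)))
        return x

probe = nn.DataParallel(Probe()).cuda()

x = Variable(torch.zeros(96, 10)).cuda()      # dim 0 is 96 -> can be split into 3 chunks of 32
h = Variable(torch.zeros(1, 32, 512)).cuda()  # dim 0 is 1  -> only a single chunk is possible
out = probe(x, h)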

In my case the batch size is 3 * 32.