Hello pytorches!
I am new to PyTorch, so this issue could be just a stupid mistake.
Recently I have been trying to build a language model in PyTorch (please see the code below). I am using DataParallel, but for some reason it uses only one device, even though in practice I can see that when I call the model it actually splits the input. nvidia-smi shows that only one device is actually in use. Why is that?
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from pandas import DataFrame as df
import numpy
import re
# Directory prefix for the data file loaded further down.
DIR = './'

# Alphabet of the model: '_' sits at index 0, followed by the lowercase
# Cyrillic letters, with the space character last.
S = '_абвгдеёжзийклмнопрстуфхцчшщьыъэюя '

# Two-way lookup between a character and its position in S.
id2char = dict(enumerate(S))
char2id = {char: index for index, char in enumerate(S)}

# Hyper Parameters
NUMDEVICE = 3                   # presumably the number of GPUs -- verify
hidden_size = 512
batch_size = 32 * NUMDEVICE     # 32 samples for each device
linesize = 128                  # length of one padded text line
num_epochs = 5
num_samples = 1000              # number of words to be sampled
learning_rate = 0.002
# RNN Based Language Model
class RNNLM(nn.Module):
    """Character-level language model: embedding -> one-layer LSTM -> linear.

    Args:
        charnum: size of the alphabet; also used as the embedding width and
            as the number of output classes.
        hidden_size: number of features in the LSTM hidden state.
    """

    def __init__(self, charnum, hidden_size):
        super(RNNLM, self).__init__()
        # The embedding is as wide as the alphabet itself.
        self.embed = nn.Embedding(charnum, charnum)
        # batch_first=True: the LSTM input and output are (batch, seq, feature).
        self.lstm1 = nn.LSTM(charnum, hidden_size, 1, batch_first=True)
        self.linear = nn.Linear(hidden_size, charnum)
        self.init_weights()

    def init_weights(self):
        """Zero the output bias and draw the output weights from U(-0.1, 0.1)."""
        self.linear.bias.data.fill_(0)
        self.linear.weight.data.uniform_(-0.1, 0.1)

    def forward(self, x, h=None, t=None):
        """Score the next character at every position of every sequence.

        Args:
            x: integer character codes, (batch, seq).
            h: initial ``(hidden, cell)`` LSTM state, or ``None`` to let the
                LSTM start from zeros.
            t: opaque pass-through value, returned unchanged. The training
                loop sends its targets through here, presumably so that
                ``DataParallel`` splits them together with ``x`` -- verify.

        Returns:
            ``(out, h, t)`` where ``out`` holds the logits flattened to
            (batch * seq, charnum), ``h`` is the final LSTM state and ``t``
            is the argument of the same name.
        """
        x = self.embed(x)
        out, h = self.lstm1(x, h)
        # Merge the batch and time axes so every position becomes one row
        # for the linear layer (and for the loss computed by the caller).
        out = out.contiguous().view(out.size(0) * out.size(1), out.size(2))
        out = self.linear(out)
        return out, h, t
class LangDataset(Dataset):
    """Fixed-width text lines cut from the transcription column of a TSV file.

    Every value of the ``FullTranscriptionText`` column is lower-cased, split
    into words made of the letters ``S[1:-1]`` and re-wrapped into lines that
    fit into ``linesize`` characters. ``S`` and ``linesize`` are module-level
    settings read when the dataset is constructed.

    Args:
        csv_file: path of the tab-separated file to read.
        char2id: mapping from a character to its integer code; used to
            encode the lines in ``__getitem__``.
    """

    def __init__(self, csv_file, char2id):
        # Local import: the module itself only imports DataFrame from pandas.
        from pandas import read_csv

        # Keep the mapping that was passed in instead of silently relying on
        # the global of the same name.
        self.char2id = char2id
        # Remember the width used for wrapping so encoding pads to the same size.
        self.linesize = linesize

        # index_col=0 mirrors the default of DataFrame.from_csv, which is no
        # longer available in current pandas releases.
        frame = read_csv(csv_file, sep='\t', index_col=0)
        word_pattern = re.compile('[' + S[1:-1] + ']+')

        self.text = []
        for raw in frame['FullTranscriptionText'].values:
            words = word_pattern.findall(raw.lower())
            self.text.extend(self._wrap_words(words, self.linesize))

    @staticmethod
    def _wrap_words(words, limit):
        """Greedily pack ``words`` into space-separated lines shorter than ``limit``.

        A word that cannot fit into a line on its own ends the current line
        and is dropped. Empty lines are never produced and no line carries a
        trailing space.
        """
        lines = []
        line = ''
        for word in words:
            if len(line) + 1 + len(word) >= limit:
                if line:
                    lines.append(line[:-1])  # strip the trailing separator
                line = ''
            if len(word) + 1 > limit:
                continue
            line += word + ' '
        if line:
            lines.append(line[:-1])
        return lines

    def __len__(self):
        """Return the number of wrapped lines."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return line ``idx`` as an ``(inputs, targets)`` pair of code arrays.

        The line is encoded with ``char2id`` into a zero-filled array of
        ``linesize`` entries; ``inputs`` drops the last entry and ``targets``
        drops the first, so ``targets[k]`` is the code following ``inputs[k]``.
        Both are float arrays (NumPy's default dtype).
        """
        data = numpy.zeros(self.linesize)
        for k, c in enumerate(self.text[idx]):
            data[k] = self.char2id[c]
        return (data[:-1], data[1:])
# Corpus: wrapped text lines read from the manifest next to the script.
dataset = LangDataset(csv_file=DIR + 'audio_manifest.tsv', char2id=char2id)

# Network: one output class per character of S, wrapped for multi-GPU use
# and moved to CUDA.
model = nn.DataParallel(RNNLM(len(S), hidden_size)).cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
# Truncated Backpropagation
def detach(states):
    """Return the given states cut off from their history.

    Calls ``detach()`` on every element, so a backward pass started from a
    later step stops at these states instead of reaching into earlier ones.

    Args:
        states: iterable of objects exposing ``detach()`` (the training loop
            passes the LSTM state pair).

    Returns:
        A new list holding one detached element per input state, in order.
    """
    return [state.detach() for state in states]
# Training
# NOTE(review): nn.DataParallel chunks every tensor argument along dim 0.
# `inputs` and `targets` carry the batch on dim 0, but each LSTM state built
# below is (1, batch, hidden): its dim 0 has size 1, so it can yield only one
# chunk, and the scattered argument list then appears to be cut down to a
# single replica -- which would explain why only one GPU shows activity.
# Spreading the work would need the states passed batch-first
# (batch, 1, hidden) and transposed back inside RNNLM.forward; that is a
# coordinated change to the model and this loop -- TODO confirm.
steps_per_epoch = len(dataset) // batch_size  # drop_last=True skips the partial batch
per_device_batch = batch_size // NUMDEVICE
for epoch in range(num_epochs):
    # Initial hidden and memory states
    states = (Variable(torch.zeros(1, per_device_batch, hidden_size)).cuda(),
              Variable(torch.zeros(1, per_device_batch, hidden_size)).cuda())
    loader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=4, drop_last=True)
    for i, batch in enumerate(loader):
        # Get batch inputs and targets
        inputs = Variable(batch[0]).cuda()
        targets = Variable(batch[1]).cuda()
        # Forward + Backward + Optimize
        model.zero_grad()
        # Cut the graph so backpropagation does not reach into earlier batches.
        states = detach(states)
        # The targets travel through the model call and come back alongside
        # the outputs.
        outputs, states, targets = model(inputs.long(), states, targets)
        # Cast values: flatten to one class index per output row.
        targets = targets.view(targets.size(0) * targets.size(1))
        targets = targets.long()
        loss = criterion(outputs, targets)
        loss.backward()
        # NOTE(review): newer PyTorch releases document the in-place spelling
        # clip_grad_norm_ for this helper; the original name is kept here so
        # the script still runs on the release it was written for.
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()
        if i % 100 == 0:
            # Works for both a one-element loss and a 0-dim loss, whereas
            # loss.data[0] raises on the latter.
            loss_value = float(loss.data.view(-1)[0])
            print('Epoch [%d/%d], Step[%d/%d], Loss: %.3f, Perplexity: %5.2f' %
                  (epoch + 1, num_epochs, i, steps_per_epoch, loss_value, np.exp(loss_value)))