I wrote a simple demo for short text classification but the speed is unexpectedly slow. When I tried to find out where the bottleneck is, it turns out to be intractable.

At first, the bottleneck is this line:

running_loss += loss.data[0]

However, after commenting out the above line, it slows down at these lines in get_batch() function:

`data = data.cuda() target = target.cuda()`

Is there any problem in the code? I ran this script on GPU (Titan X) with cuda 8.0, python 2.7, ubuntu 16 and pytorch was installed by pip. The data was randomly generated.

The code is attached below:

`import numpy as np import time import torch import torch.nn as nn from torch.autograd import Variable import torch.optim as optim import torch.nn.functional as F`

`class Net(nn.Module): def __init__(self, vocab_size, emb_dim): """ :param vocab_size: an int value, representing the total number of vocabs in the pre-defined lookup table :param emb_dim: an int value, representing the dimension of each word vector """ super(Net, self).__init__() self.lookup_table = nn.Embedding(vocab_size, emb_dim) self.init_embedding() self.encoder = nn.Conv2d(in_channels=1, out_channels=200, kernel_size=(3, 300)) self.hid_1 = nn.Linear(200, 200) self.hid_2 = nn.Linear(200, 10)`

`def forward(self, x, training=False): """ :param x: input x is in size of [N, C, H, W] N: batch size C: number of channel, in text case, this is 1 H: height, in text case, this is the length of the text W: width, in text case, this is the dimension of the embedding :param training: boolean value, whether the forward is for training purpose :return: a tensor [N, L], where L is the number of classes """ x = self.lookup_table(x) x = x.unsqueeze(1) enc = F.relu(self.encoder(x)) enc = F.max_pool2d(enc, kernel_size=(48, 1)) enc = enc.squeeze() enc = F.dropout(F.relu(self.hid_1(enc)), training=training) enc = F.relu(self.hid_2(enc)) pred_prob = F.softmax(enc) return pred_prob`

`def init_embedding(self): initrange = 0.1 self.lookup_table.weight.data.uniform_(-initrange, initrange)`

`def get_batch(source, batch_size, i): data = source[0][batch_size * i: batch_size * (i + 1)] target = source[1][batch_size * i: batch_size * (i + 1)] if torch.cuda.is_available(): print "moving data..." st = time.time() data = data.cuda() target = target.cuda() dt = time.time() - st print "moving data time: {}".format(dt) return data, target`

`print "Setting seed..." seed = 1234 torch.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed)`

`train_batch_size = 100 test_batch_size = 100`

`rng = np.random.RandomState(seed)`

`num_train_instances = 20000 num_test_instances = 2000 max_text_len = 50 vocab_size = 62639 num_classes = 10 emb_dim = 300`

`print "Generating random data..." train_set_numpy = rng.randint(0, vocab_size, (num_train_instances, max_text_len)), rng.randint(0, num_classes, (num_train_instances,)) test_set_numpy = rng.randint(0, vocab_size, (num_test_instances, max_text_len)), rng.randint(0, num_classes, (num_test_instances,)) print "Converting numpy data into Tensor..." train_set = torch.from_numpy(train_set_numpy[0]), torch.from_numpy(train_set_numpy[1]) test_set = torch.from_numpy(test_set_numpy[0]), torch.from_numpy(test_set_numpy[1])`

`n_train_batch = train_set[0].size()[0] / train_batch_size + 1 n_test_batch = test_set[0].size()[0] / test_batch_size + 1`

`model = Net(vocab_size=vocab_size, emb_dim=emb_dim)`

`if torch.cuda.is_available(): print "move model to GPU" model.cuda() print "move done" print model`

`criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters())`

`for epoch in xrange(10): running_loss = 0. for i in xrange(n_train_batch): start_time = time.time() print "batch: %d" % i text, labels = get_batch(train_set, train_batch_size, i) text, labels = Variable(text), Variable(labels) print "zero optimizer" optimizer.zero_grad() print "compute forward" st = time.time() outputs = model(text, training=True) dt = time.time() - st print "compute forward time: {}".format(dt) print "compute loss" st = time.time() loss = criterion(outputs, labels) dt = time.time() - st print "compute loss time: {}".format(dt) print "compute backword" st = time.time() loss.backward() dt = time.time() - st print "compute backword time: {}".format(dt) print "update gradient" st = time.time() optimizer.step() dt = time.time() - st print "update gradient time: {}".format(dt) print "accumulate loss" st = time.time() running_loss += loss.data[0] dt = time.time() - st print "accumulate loss time: {}".format(dt) duration = time.time() - start_time`

`if i % 1 == 0: print "training speed: {}/sec".format(train_batch_size / duration) running_loss = 0.`

`if i % 4 == 3: start_time = time.time() correct = 0. total = 0. for j in xrange(n_test_batch): text, labels = get_batch(test_set, test_batch_size, j) text, labels = Variable(text), Variable(labels) outputs = model(text) _, predicted = torch.max(outputs.data, dim=1) total += labels.size()[0] correct += (predicted == labels.data).sum() duration = time.time() - start_time print "acc: {}".format(100 * correct / total) print "testing speed: {}/sec".format(total / duration) print "loop done"`