Hi, I am trying out PyTorch with a basic continuous bag-of-words (CBOW) word2vec implementation. The PyTorch implementation seems to be very slow, requiring multiple hours to train, whereas a similar implementation in TensorFlow trains within 15-20 minutes on the text8.zip dataset available from http://mattmahoney.net/dc
I tried to debug the implementation and found that most of the time is being spent in two places: the model's forward pass and the call to the backward method, each taking roughly 100 ms per batch.
I have not been able to figure out how to speed it up.
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Looks up an embedding for every context word, averages the embeddings
    over the context axis, and projects the average through a linear layer
    to per-word log-probabilities over the whole vocabulary.

    Args:
        vocabulary_size: number of rows in the embedding table and number of
            output classes of the linear layer.
        embedding_dimension: width of each embedding vector.
    """

    def __init__(self, vocabulary_size, embedding_dimension):
        super(CBOW, self).__init__()
        self.vocabulary_size = vocabulary_size
        self.embedding_dimension = embedding_dimension
        # sparse=True asks nn.Embedding to produce a sparse gradient for its
        # weight matrix (see the nn.Embedding documentation).
        self.embeddings = nn.Embedding(
            self.vocabulary_size, self.embedding_dimension, sparse=True
        )
        # NOTE: this dense layer scores every vocabulary entry for every
        # example, so its cost grows with vocabulary_size.
        self.linear = nn.Linear(embedding_dimension, vocabulary_size)
        self.init_embeddings()

    def init_embeddings(self):
        """Re-initialise the embedding table uniformly in +/- 0.5 / embedding_dimension."""
        initrange = 0.5 / self.embedding_dimension
        self.embeddings.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each example.

        Args:
            inputs: integer word indices; the mean over dim=1 below implies a
                (batch, context_size) layout -- TODO confirm against the
                batch generator.

        Returns:
            Tensor of log-probabilities, normalised along dim=1.
        """
        embedding = self.embeddings(inputs)
        avg_embedding = torch.mean(embedding, dim=1)
        out = self.linear(avg_embedding)
        # Name the class axis explicitly: calling log_softmax without `dim`
        # relies on a deprecated implicit choice and warns on every call.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
class Word2Vec:
    """Trainer that fits a CBOW model on batches supplied by ``InputData``."""

    def __init__(self):
        logger.info('CBOW Training ....')
        self.batch_size = 128
        self.embedding_dimension = 128
        self.skip_window = 1
        self.input_data = InputData()
        self.cbow = CBOW(vocabulary_size=VOCABULARY_SIZE, embedding_dimension=self.embedding_dimension)

    def train(self, epochs=100001, log_every=2000):
        """Run the SGD training loop, one batch per iteration.

        Args:
            epochs: number of batches (optimizer steps) to run. Defaults to
                the previously hard-coded 100001.
            log_every: print the loss every ``log_every`` iterations
                (including iteration 0). A falsy value disables printing.
        """
        loss_function = nn.NLLLoss()
        optimizer = optim.SGD(self.cbow.parameters(), lr=0.01, momentum=0.5)
        data_index = 0
        for epoch in range(epochs):
            # The generator returns the next read position, which is fed back
            # in on the following iteration.
            batch_data, batch_labels, data_index = self.input_data.generate_batch_cbow(
                data_index, self.batch_size, self.skip_window
            )
            x_values = autograd.Variable(batch_data)
            # Only column 0 of the labels is used as the NLLLoss target.
            y_labels = autograd.Variable(batch_labels[:, 0])

            predicted = self.cbow(x_values)
            loss = loss_function(predicted, y_labels)

            # Clear gradients left over from the previous step before
            # accumulating new ones.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if log_every and epoch % log_every == 0:
                print('[%d/%d] Loss: %.3f' % (epoch + 1, epochs, loss.data.mean()))