Word2Vec Implementation and Execution Issue

Hi, I am trying out PyTorch with a basic continuous-bag-of-words (CBOW) word2vec implementation. The PyTorch implementation seems to be very slow, requiring multiple hours of training. A similar implementation in TensorFlow trains within 15-20 minutes on the text8.zip dataset available from http://mattmahoney.net/dc

I tried to debug the implementation and found that most of the time (~100 ms each) is being spent in the forward call of the model and in the call to the backward method.

I am not able to figure out how to speed it up.
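In case it helps with debugging, a per-operator breakdown of where the time goes can be obtained with the autograd profiler. This is just a minimal sketch, assuming model stands for the CBOW instance and loss_function, x_values, y_labels are the same variables as in the training loop below:

import torch

# Profile one forward + backward step and print operator-level CPU times
with torch.autograd.profiler.profile() as prof:
    loss = loss_function(model(x_values), y_labels)
    loss.backward()
print(prof)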

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import autograd

# InputData, VOCABULARY_SIZE and logger are defined elsewhere in my script.


class CBOW(nn.Module):
    def __init__(self, vocabulary_size, embedding_dimension):
        super(CBOW, self).__init__()
        self.vocabulary_size = vocabulary_size
        self.embedding_dimension = embedding_dimension
        self.embeddings = nn.Embedding(self.vocabulary_size, self.embedding_dimension, sparse=True)
        self.linear = nn.Linear(embedding_dimension, vocabulary_size)
        self.init_embeddings()

    def init_embeddings(self):
        initrange = 0.5 / self.embedding_dimension
        self.embeddings.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs):
        # inputs: LongTensor of context-word indices, one row of context ids per example
        embedding = self.embeddings(inputs)
        avg_embedding = torch.mean(embedding, dim=1)  # average the context embeddings
        out = self.linear(avg_embedding)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


class Word2Vec:
    def __init__(self):
        logger.info('CBOW Training ....')
        self.batch_size = 128
        self.embedding_dimension = 128
        self.skip_window = 1

        self.input_data = InputData()
        self.cbow = CBOW(vocabulary_size=VOCABULARY_SIZE, embedding_dimension=self.embedding_dimension)

    def train(self):
        loss_function = nn.NLLLoss()
        optimizer = optim.SGD(self.cbow.parameters(), lr=0.01, momentum=0.5)
        epochs = 100001
        data_index = 0
        for epoch in range(epochs):
            batch_data, batch_labels, data_index = self.input_data.generate_batch_cbow(data_index, self.batch_size, self.skip_window)
            x_values = autograd.Variable(batch_data)
            y_labels = autograd.Variable(batch_labels[:,0])
            # start_model = time.time()
            predicted = self.cbow(x_values)
            # end_model = time.time()
            # logger.info('Elapsed Time %s' % (end_model - start_model))
            loss = loss_function(predicted, y_labels)
            optimizer.zero_grad()
            # start_backward = time.time()
            loss.backward()
            # end_backward = time.time()
            # logger.info('Elapsed Time %s' % (end_backward - start_backward))
            optimizer.step()
            if epoch % 2000 == 0:
                print('[%d/%d] Loss: %.3f' % (epoch + 1, epochs, loss.data.mean()))

If anyone has insights into this, it would be really helpful :slight_smile:

Matt, can you run the code with:

OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 python foo.py

Does that help?
If your machine has a lot of cores (20 cores, for example), OpenMP's threading overhead can hurt more than it helps for small per-batch workloads like this.
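If that helps, the same limit can also be set from inside the script instead of via environment variables. A minimal sketch:

import torch

torch.set_num_threads(1)        # cap PyTorch's intra-op (OpenMP) thread pool at one thread
print(torch.get_num_threads())  # should now report 1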