I’m trying to write a variation of word2vec in PyTorch with multi-GPU support, but I have been unsuccessful in parallelizing it across multiple GPUs. The code works on a single GPU, and torch.cuda.device_count() returns 2, but the second GPU shows 0 memory usage.
import torch
import torch.nn as nn
import torch.optim as optim

class skipgram_discriminator(nn.Module):
    def __init__(self, vocabulary_size=150000, embedding_size=300, learning_rate=1e-4, batch_size=512):
        """
        Initialize a skipgram discriminator.
        - vocabulary_size is the number of unique words in the corpus used to
          build the embedding matrices
        - embedding_size is the dimensionality of the word vectors
        - learning_rate is the learning rate for the discriminator optimizer
        """
        super(skipgram_discriminator, self).__init__()
        self.embedding_size = embedding_size
        self.batch_size = batch_size
        # Input (center-word) embeddings
        self.dis_embeddings = nn.Embedding(vocabulary_size, self.embedding_size, sparse=False)
        # Output (context-word) embeddings and per-word bias
        self.D_W2 = nn.Embedding(vocabulary_size, self.embedding_size, sparse=False)
        self.D_b2 = nn.Embedding(vocabulary_size, 1)
        self.dis_embeddings.weight.data.uniform_(-0.5 / self.embedding_size, 0.5 / self.embedding_size)
        self.D_W2.weight.data.uniform_(-0.5 / self.embedding_size, 0.5 / self.embedding_size)
        self.D_b2.weight.data.zero_()
        self.use_cuda = torch.cuda.is_available()
        self.criterion = nn.BCEWithLogitsLoss()
        if self.use_cuda:
            self.disc_params = nn.ModuleList([self.dis_embeddings, self.D_W2, self.D_b2])
            for i, l in enumerate(self.disc_params):
                self.add_module(str(i), l)
            self.disc_params = self.disc_params.cuda()
            # This is where I wrap the parameters in DataParallel
            self.disc_params = torch.nn.DataParallel(self.disc_params,
                                                     device_ids=range(torch.cuda.device_count()))
            self.optimizer = optim.Adam(self.disc_params.parameters(), lr=learning_rate)

    def forward(self, inputs, labels):
        embedded_inputs = self.dis_embeddings(inputs)
        embedded_labels = self.D_W2(labels)
        embedded_bias = self.D_b2(labels)
        # Dot product of center and context embeddings plus the per-word bias;
        # keepdim=True keeps the score shaped (batch_size, 1) to match the bias
        pos_score = torch.sum(embedded_inputs * embedded_labels, 1, keepdim=True) + embedded_bias
        return pos_score
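For reference, this is the pattern I believe DataParallel is normally used with: wrapping the whole module and letting it split the input batch along dimension 0, rather than wrapping a ModuleList of parameters inside __init__. I’m not sure how to reconcile it with the optimizer and DataParallel setup that currently live inside my constructor, so treat this as a rough sketch; the index ranges and batch of random indices below are just placeholders.

# Rough sketch, not my actual training code (uses the torch / nn / optim imports above).
# Assumes the DataParallel/optimizer setup has been removed from __init__.
model = skipgram_discriminator()
if torch.cuda.device_count() > 1:
    # Replicates the module on each GPU, splits the batch along dim 0,
    # and gathers the outputs back on the default device.
    model = nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
model = model.cuda()

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.BCEWithLogitsLoss()

# Placeholder batch of word indices just to illustrate the call
inputs = torch.randint(0, 150000, (512,)).cuda()
labels = torch.randint(0, 150000, (512,)).cuda()
targets = torch.ones(512, 1).cuda()

scores = model(inputs, labels)   # with 2 GPUs, each should receive half of the 512 examples
loss = criterion(scores, targets)
optimizer.zero_grad()
loss.backward()
optimizer.step()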