If I use `batch_size=1`, my Skip-gram implementation raises the following error (with PyTorch 0.1.12 the code works fine, but 0.3.0 raises RuntimeErrors):
- CPU version:

```
Traceback (most recent call last):
  File "/home/zarzen/Dev/sgns/train.py", line 32, in <module>
    train()
  File "/home/zarzen/Dev/sgns/train.py", line 25, in train
    loss.backward()
  File "/home/zarzen/.pyenv/versions/anaconda3-4.3.0/lib/python3.6/site-packages/torch/autograd/variable.py", line 167, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
  File "/home/zarzen/.pyenv/versions/anaconda3-4.3.0/lib/python3.6/site-packages/torch/autograd/__init__.py", line 99, in backward
    variables, grad_variables, retain_graph)
RuntimeError: invalid argument 1: expected 3D tensor, got 4D at /opt/conda/conda-bld/pytorch_1513368888240/work/torch/lib/TH/generic/THTensorMath.c:1630
```
- GPU version (same traceback position as the CPU version):

```
RuntimeError: invalid argument 6: expected 3D tensor at /Users/zarzen/Dev/pytorch/torch/lib/THC/generic/THCTensorMathBlas.cu:442
```
I have uploaded the code to GitHub: https://github.com/zarzen/sgns. To reproduce the bug, set `batch_size=1` in `train.py`. To run the code, run `preprocess.py` first and then `train.py`.
If I set `batch_size` greater than 1, the implementation works fine. I know `batch_size==1` is not reasonable, but if `data_length % batch_size == 1`, the last batch produced by the `DataLoader` will have size 1, which triggers the same problem. An easy workaround is to check whether that situation can occur and adjust `batch_size` accordingly (a sketch of such a workaround follows below). I would just like to know whether my code logic contains a more fundamental issue.
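For example, a minimal workaround sketch using the `DataLoader`'s `drop_last` flag; `dataset` here is just a placeholder for whatever `Dataset` `preprocess.py` actually builds:

```python
from torch.utils.data import DataLoader

# drop_last=True discards the incomplete final batch, so a batch of size 1
# never reaches SkipGram.forward even when data_length % batch_size == 1.
loader = DataLoader(dataset, batch_size=16, shuffle=True, drop_last=True)
```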
Here is the model snippet (the input data of the skip-gram model is a list of pairs `[(w_t1, w_c1), (w_t2, w_c2), …]`):
```python
import numpy as np
import torch as t
import torch.nn as nn
from torch import FloatTensor, LongTensor
from torch.autograd import Variable


class EmbeddingNN(nn.Module):
    """Single-hidden-layer embedding model."""

    def __init__(self, voc_size, emb_size=300, init_with=None):
        super(EmbeddingNN, self).__init__()
        padding_idx = 0
        self.voc_size = voc_size
        self.emb_size = emb_size
        self.iembeddings = nn.Embedding(self.voc_size, self.emb_size)
        self.oembeddings = nn.Embedding(self.voc_size, self.emb_size)
        # pylint: disable=no-member
        if init_with is not None:
            assert init_with.shape == (voc_size, emb_size)
            self.iembeddings.weight = nn.Parameter(FloatTensor(init_with))
        else:
            self.iembeddings.weight = nn.Parameter(FloatTensor(voc_size, emb_size).uniform_(-1, 1))
        self.oembeddings.weight = nn.Parameter(FloatTensor(voc_size, emb_size).uniform_(-1, 1))
        # pylint: enable=no-member
        self.iembeddings.weight.requires_grad = True
        self.oembeddings.weight.requires_grad = True

    def forward(self, data):
        """Default forward: return the input (target-word) vectors."""
        return self.forward_i(data)

    def forward_i(self, data):
        """Get input vectors."""
        idxs = Variable(LongTensor(data))
        idxs = idxs.cuda() if self.iembeddings.weight.is_cuda else idxs
        return self.iembeddings(idxs)

    def forward_o(self, data):
        """Get output vectors."""
        idxs = Variable(LongTensor(data))
        idxs = idxs.cuda() if self.oembeddings.weight.is_cuda else idxs
        return self.oembeddings(idxs)

    def get_emb_dim(self):
        return self.emb_size


class SkipGram(nn.Module):
    """Skip-gram with negative sampling on top of EmbeddingNN."""

    def __init__(self, emb_nn, n_negs=64, weights=None):
        super(SkipGram, self).__init__()
        self.emb_model = emb_nn
        self.voc_size = emb_nn.get_emb_dim()
        self.n_negs = n_negs
        self.neg_sample_weights = None
        if weights is not None:
            wf = np.power(weights, 0.75)  # pylint: disable=no-member
            wf = wf / wf.sum()
            self.neg_sample_weights = FloatTensor(wf)

    def forward(self, data):
        """data is a list of (target, context) pairs."""
        batch_size = len(data[0])
        iwords = data[0]
        owords = data[1]
        if self.neg_sample_weights is not None:
            # pylint: disable=no-member
            nwords = t.multinomial(self.neg_sample_weights,
                                   batch_size * self.n_negs,
                                   replacement=True).view(batch_size, -1)
        else:
            nwords = FloatTensor(batch_size, self.n_negs).uniform_(0, self.voc_size - 1).long()
        ivectors = self.emb_model.forward_i(iwords).unsqueeze(2)
        ovectors = self.emb_model.forward_o(owords).unsqueeze(1)
        nvectors = self.emb_model.forward_o(nwords).neg()  # important
        # pylint: disable=no-member
        oloss = t.bmm(ovectors, ivectors).squeeze().sigmoid().log()
        nloss = t.bmm(nvectors, ivectors).squeeze().sigmoid().log().view(-1, 1, self.n_negs).sum(2).mean(1)
        return -(oloss + nloss).mean()
```
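For context, the model is driven roughly like this; this is only a simplified sketch (the real training loop lives in `train.py`, and the vocabulary size, embedding size, and index values below are made up):

```python
from torch import LongTensor

emb_nn = EmbeddingNN(voc_size=1000, emb_size=100)
sgns = SkipGram(emb_nn, n_negs=64)

# one collated batch of (target, context) index pairs; here batch_size == 1
batch = [LongTensor([3]), LongTensor([7])]
loss = sgns(batch)
loss.backward()  # with batch_size == 1 this is the call that raises the RuntimeError above
```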
Environment info:
- pytorch: 0.3.0
- python: 3.6.0 (anaconda3-4.3.0)
Thanks!