Hello. Within the forward method of my neural network, I want to pass a tensor of size (batch_size, seq_len) to an nn.Embedding layer, which should output shape (batch_size, seq_len, embed_dim). However, my embedding layer returns (batch_size, embed_dim), which is unexpected given my understanding of the documentation.
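For reference, here is a minimal, self-contained sketch of the shape behavior I expect from nn.Embedding (toy sizes, not my real vocabulary):

import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=1000, embedding_dim=100)
tokens = torch.randint(0, 1000, (32, 50))  # (batch_size, seq_len) of token indices
print(emb(tokens).shape)  # torch.Size([32, 50, 100]) -> (batch_size, seq_len, embed_dim)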
I’ve used torchtext’s Field, TabularDataset, and Iterator classes to feed label-encoded text data into my neural network:
TEXT = Field(sequential=True, tokenize=tokenizer, lower=True, batch_first=True)
LABEL = Field(sequential=False, batch_first=True)
train_td, val_td, test_td = TabularDataset.splits(
    path='saved-data/csv/', train=f'{SAMPLE_SIZE}-token-train.csv',
    validation=f'{SAMPLE_SIZE}-token-val.csv', test=f'{SAMPLE_SIZE}-token-test.csv',
    format='csv', skip_header=True, fields=[('text', TEXT), ('label', LABEL)])
TEXT.build_vocab(train_td, min_freq=5, vectors='glove.6B.100d')
vocab = TEXT.vocab
train_iter, val_iter, test_iter = Iterator.splits(
    (train_td, val_td, test_td),
    sort_key=lambda x: len(x.text),
    shuffle=True,
    sort_within_batch=False,
    batch_sizes=(BATCH_SIZE, VAL_BATCH_SIZE, VAL_BATCH_SIZE),
    device=DEVICE)
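For what it’s worth, I believe the batches coming out of the Iterator have the shape I expect going into the model (since the TEXT field has batch_first=True); that can be inspected like this:

batch = next(iter(train_iter))
print(batch.text.shape, batch.text.dtype)  # expecting (BATCH_SIZE, seq_len), torch.int64
print(batch.label.shape)                   # expecting (BATCH_SIZE,)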
The relevant part of my training loop:
for data in train_iter:
    inputs, labels = data.text.to(DEVICE), data.label.to(DEVICE)
    model.zero_grad()
    output, hidden = model(inputs)
    loss = criterion(output, labels)
    if USE_AMP:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optimizer.step()
My model:
class GruTextClassifier(nn.Module):
    def __init__(self, vocab, hidden_size, num_layers, output_size, drop_prob, is_bidir):
        super(GruTextClassifier, self).__init__()
        self.vocab_size = len(vocab)
        self.embed_size = vocab.vectors.shape[1]
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.drop_prob = drop_prob
        self.is_bidir = is_bidir
        self.embed = nn.EmbeddingBag(self.vocab_size, self.embed_size, sparse=True)
        self.gru = nn.GRU(self.embed_size, hidden_size, num_layers, batch_first=True,
                          dropout=drop_prob, bidirectional=is_bidir)
        self.dropout = nn.Dropout(drop_prob)
        if is_bidir:
            self.fc = nn.Linear(hidden_size * 2, output_size)
        else:
            self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.init_weights(vocab)

    def init_weights(self, vocab):
        initrange = 0.5
        self.embed.weight.data.copy_(vocab.vectors)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        if self.is_bidir:
            hidden = weight.new(self.num_layers * 2, batch_size, self.hidden_size).zero_().to(DEVICE)
        else:
            hidden = weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(DEVICE)
        return hidden

    def forward(self, x):
        batch_size, seq_len = x.shape
        if seq_len > 200:
            x = x[:, 0:200]
            seq_len = x.shape[1]
        hidden = self.init_hidden(batch_size)
        x_embed = self.embed(x)  # This is where the unexpected behavior occurs
        out, hidden = self.gru(x_embed, hidden)
        out = out.transpose(0, 1)
        out = out[-1, :, :]
        out = self.dropout(out)
        out = self.fc(out)
        out = self.softmax(out)
        return out, hidden
I have previously trained this model successfully using a one-hot encoded tensor as input to the GRU layer; that notebook is uploaded here: https://github.com/travis-harper/authorship-attribution/blob/master/gutenberg-10-author-gru-v4-training.ipynb
Any idea why the embedding layer is not producing the shape I expect?