nn.Embedding Overwriting seq_len Dimension with H Dimension

Hello. In the forward method of my neural network, I pass a tensor of shape (batch_size, seq_len) to an nn.Embedding layer, which should produce output of shape (batch_size, seq_len, embed_dim). However, the embedding layer is returning (batch_size, embed_dim), which is unexpected given my reading of the documentation.

I’ve used torchtext’s Field, TabularDataset, and Iterator classes to feed label-encoded text data into my neural network:

TEXT = Field(sequential=True, tokenize=tokenizer, lower=True, batch_first=True)
LABEL = Field(sequential=False, batch_first=True)

train_td, val_td, test_td = TabularDataset.splits(
    path='saved-data/csv/', train=f'{SAMPLE_SIZE}-token-train.csv',
    validation=f'{SAMPLE_SIZE}-token-val.csv', test=f'{SAMPLE_SIZE}-token-test.csv',
    format='csv', skip_header=True, fields=[('text', TEXT), ('label', LABEL)])

TEXT.build_vocab(train_td, min_freq=5, vectors='glove.6B.100d')
vocab = TEXT.vocab

train_iter, val_iter, test_iter = Iterator.splits(
    (train_td, val_td, test_td),
    sort_key=lambda x: len(x.text),
    shuffle=True,
    sort_within_batch=False,
    batch_sizes=(BATCH_SIZE, VAL_BATCH_SIZE, VAL_BATCH_SIZE),
    device=DEVICE)
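
For reference, a quick sanity check of what the iterator and vocab provide (rough sketch; the shapes in the comments are what I expect given batch_first=True and glove.6B.100d, not guaranteed values):

# Inspect one batch and the pretrained vectors the model will consume
batch = next(iter(train_iter))
print(batch.text.shape)                 # expected: (BATCH_SIZE, seq_len)
print(batch.label.shape)                # expected: (BATCH_SIZE,)
print(len(vocab), vocab.vectors.shape)  # expected: vocab_size, (vocab_size, 100)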

The relevant part of my training loop:

for data in train_iter:
    inputs, labels = data.text.to(DEVICE), data.label.to(DEVICE)
    model.zero_grad()
    output, hidden = model(inputs)
    loss = criterion(output, labels)

    if USE_AMP:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()

    optimizer.step()
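
(The amp branch above assumes apex's amp.initialize has already been run on the model and optimizer, roughly like this:)

model, optimizer = amp.initialize(model, optimizer, opt_level='O1')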

My model:

class GruTextClassifier(nn.Module):
    def __init__(self, vocab, hidden_size, num_layers, output_size, drop_prob, is_bidir):
        super(GruTextClassifier, self).__init__()
        
        self.vocab_size = len(vocab)
        self.embed_size = vocab.vectors.shape[1]
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.drop_prob = drop_prob
        self.is_bidir = is_bidir
        
        self.embed = nn.EmbeddingBag(self.vocab_size, self.embed_size, sparse=True)
        self.gru = nn.GRU(self.embed_size, hidden_size, num_layers, batch_first=True,
                          dropout=drop_prob, bidirectional=is_bidir)
        self.dropout = nn.Dropout(drop_prob)
        if is_bidir:
            self.fc = nn.Linear(hidden_size * 2, output_size)
        else:
            self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
        self.init_weights(vocab)

    def init_weights(self, vocab):
        initrange = 0.5
        self.embed.weight.data.copy_(vocab.vectors)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        if self.is_bidir:
            hidden = weight.new(self.num_layers * 2, batch_size, self.hidden_size).zero_().to(DEVICE)
        else:
            hidden = weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(DEVICE)
        return hidden
    
    def forward(self, x):
        batch_size, seq_len = x.shape
        if seq_len > 200:
            x = x[:, 0:200]
            seq_len = x.shape[1]
        hidden = self.init_hidden(batch_size)
        
        x_embed = self.embed(x)  # This is where the unexpected behavior occurs
        out, hidden = self.gru(x_embed, hidden)
        out = out.transpose(0,1)
        out = out[-1, :, :]
        out = self.dropout(out)
        
        out = self.fc(out)
        out = self.softmax(out)
        
        return out, hidden
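
For what it's worth, printing shapes right after the embedding call (debugging sketch; 32 and 200 are just example batch and sequence sizes) shows the seq_len dimension disappearing:

# Hypothetical shape check placed inside forward(), right after self.embed(x)
print(x.shape)        # torch.Size([32, 200])
print(x_embed.shape)  # torch.Size([32, 100]), where I expected torch.Size([32, 200, 100])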

I have previously trained this model successfully using a one-hot encoded tensor as input to the GRU layer; that notebook is uploaded here: https://github.com/travis-harper/authorship-attribution/blob/master/gutenberg-10-author-gru-v4-training.ipynb

Any idea why the embeddings are not working?

It seems you are using nn.EmbeddingBag, which outputs [batch_size, embedding_dim], not nn.Embedding, which works as expected:

import torch
import torch.nn as nn

N, seq = 2, 5
emb = nn.Embedding(num_embeddings=10, embedding_dim=100)
x = torch.randint(0, 10, (N, seq))
output = emb(x)
print(output.shape)
> torch.Size([2, 5, 100])
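
For contrast, nn.EmbeddingBag pools over the sequence dimension (by default with mode='mean'), so the same 2D input loses its seq_len axis:

emb_bag = nn.EmbeddingBag(num_embeddings=10, embedding_dim=100)  # default mode='mean'
output = emb_bag(x)  # each row of x is treated as one "bag" and pooled to a single vector
print(output.shape)
> torch.Size([2, 100])

Swapping the layer in your __init__ should be enough of a fix (sketch; init_weights can stay as is, since nn.Embedding also exposes a (vocab_size, embed_size) weight):

self.embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=True)

One caveat: sparse=True produces sparse gradients with nn.Embedding as well, and only a few optimizers (e.g. optim.SGD, optim.SparseAdam) support them, so you may want to drop it depending on your optimizer.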