Error in simple text classification model

Hi,

I'm stuck while trying to implement a very basic model: https://gist.github.com/av-maslov/2ba811543558267dfc3cf909f95d1138

The error occurs on the last line, when calculating the loss.

I have been trying to follow https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html, but with my own artificially generated text data.

I think the error is in how I generate batches (the dimensionality of the batches), but maybe it is somewhere else.
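
For reference, torch.nn.CrossEntropyLoss expects logits of shape [batch, num_classes] and an integer class-index target of shape [batch]; a minimal shape check (illustrative values only):

import torch

criterion = torch.nn.CrossEntropyLoss()
logits = torch.randn(4, 3)           # [batch=4, num_classes=3]
target = torch.tensor([0, 2, 1, 1])  # [batch=4], one class index per sample
print(criterion(logits, target))     # works; other shapes raise an error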

First, I generate random sentences:

import torch
import torch.nn as nn
import numpy as np
from torchtext.data import Dataset, Example, Field, Iterator


N = 100
words = ['world', 'hello', 'country', 'moon', 'planet', 'earth']
random_sent_lengths = np.random.randint(1, 10, N)

def generate_art_data():
    """ Generate random sentences from random words along with random target.
    """
    for i in range(N):
        rand_sent = np.random.choice(words, random_sent_lengths[i])
        rand_y = np.random.randint(0, 2, 3)  # length-3 array of 0/1 values (num of classes = 3)
        yield (" ".join(rand_sent), rand_y)
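
Each yielded pair is a sentence string plus a length-3 array of 0/1 values; a quick peek (the data is random, so the printed values below are illustrative):

sample_text, sample_y = next(generate_art_data())
print(sample_text)  # e.g. 'moon hello world'
print(sample_y)     # e.g. array([0, 1, 1]) -- shape (3,)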

Then I create a Dataset:

def create_datasets():
    """ Create a torchtext.data.Dataset from the generated random data.
    """
    data = generate_art_data()

    TEXT = Field(sequential=True, tokenize=lambda x: x.split(), use_vocab=True, lower=True, fix_length=1000)
    LABEL = Field(sequential=False, use_vocab=False)

    trn_fields = [('text', TEXT), ('target', LABEL)]
    examples = [Example.fromlist(list(x), fields=trn_fields) for x in data]
    dt_train = Dataset(examples, fields=trn_fields)
    TEXT.build_vocab(dt_train)  # build the vocab from the Dataset; the raw generator is already consumed
    trn, vld = dt_train.split(split_ratio=0.7)
    return trn, vld, TEXT

Then I create iterators:

def create_iterators(batch_sizes=(4, 4)):
    """ Create Iterator instances from the generated torchtext Datasets.
    """
    trn, vld, T = create_datasets()

    train_iter = Iterator(trn, batch_size=batch_sizes[0], sort_key=lambda x: len(x.text))
    val_iter = Iterator(vld, batch_size=batch_sizes[1], sort_key=lambda x: len(x.text))

    return train_iter, val_iter, T

BATCH_SIZES = (4, 4)

train_iter, val_iter, T = create_iterators(BATCH_SIZES)

After that, I define the model:

EMBED_DIM      = 4
NUM_OF_CLASSES = 3
VOCAB_SIZE     = len(T.vocab)


class SuperSimpleModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text):
        embedded = self.embedding(text)  # [seq_len, batch] -> [seq_len, batch, embed_dim]
        return self.fc(embedded)         # -> [seq_len, batch, num_class]


model = SuperSimpleModel(VOCAB_SIZE, EMBED_DIM, NUM_OF_CLASSES)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
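
A quick shape check of the model on a dummy batch (hypothetical input, mimicking batch.text with fix_length=1000):

x = torch.randint(0, VOCAB_SIZE, (1000, 4))  # [seq_len, batch], like batch.text
print(model(x).size())                       # torch.Size([1000, 4, 3])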

Then I try to run it (at least one forward step):

for i, batch in enumerate(train_iter):
    if i >= 5:
        break
    output = model(batch.text)
    print(output.size(), batch.target.size())
    loss = criterion(output, batch.target)  # the error is raised here

And it produces an error. Where am I wrong?

Can you copy/paste the error messages?

In general, if your data is stored in a list, you should be able to use torch.utils.data.DataLoader, as in the sentiment analysis example above. You can follow the example here to build the vocab and datasets.
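
A minimal sketch of that approach, assuming the data is a list of (sentence, label) pairs with one integer class label per sentence (the toy corpus and the stoi mapping below are illustrative, not from your post):

from collections import Counter

import torch
from torch.utils.data import DataLoader

# Toy corpus: a list of (sentence, label) pairs with integer class labels.
data = [("hello world", 0), ("moon planet earth", 2), ("country moon", 1)]

# Build a word -> index mapping; index 0 is reserved for padding.
counter = Counter(tok for sent, _ in data for tok in sent.split())
stoi = {word: i + 1 for i, (word, _) in enumerate(counter.most_common())}

def collate(batch):
    """Turn a list of (sentence, label) pairs into padded index tensors."""
    texts = [torch.tensor([stoi[tok] for tok in sent.split()]) for sent, _ in batch]
    labels = torch.tensor([label for _, label in batch])
    texts = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True)  # [batch, max_len]
    return texts, labels

loader = DataLoader(data, batch_size=4, shuffle=True, collate_fn=collate)
for texts, labels in loader:
    print(texts.size(), labels.size())  # e.g. torch.Size([3, 3]) and torch.Size([3])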