Reformulating the question …
Actually, this is not a model-related question.
I don't understand how batches are formed, i.e. the mechanics of how words are encoded into tensors.
Here is the code creating batches from artificially generated sentences.
import torch
import torch.nn as nn
import numpy as np
from torchtext.data import Dataset, Example, Field
from torchtext.data import Iterator, BucketIterator
# Number of random sentences to generate.
N = 100

# Vocabulary the random sentences are drawn from.
words = [
    "world",
    "hello",
    "country",
    "moon",
    "planet",
    "earth",
]

# One length per sentence, drawn once at import time: integers in [1, 10).
random_sent_lengths = np.random.randint(low=1, high=10, size=N)
def generate_art_data():
    """Yield ``N`` random ``(sentence, target)`` pairs.

    Each sentence is a space-joined string of words sampled (with
    replacement) from ``words``; its length is the matching entry of
    ``random_sent_lengths``.  Each target is a NumPy array holding three
    independent random integers, each 0 or 1.
    """
    for idx in range(N):
        n_tokens = random_sent_lengths[idx]
        tokens = np.random.choice(words, size=n_tokens)
        # Three 0/1 values per example (a length-3 binary target vector).
        target = np.random.randint(low=0, high=2, size=3)
        yield " ".join(tokens), target
def create_datasets(fix_length=None, batch_first=False):
    """Create train/validation ``torchtext.data.Dataset`` splits from random data.

    Args:
        fix_length: Forwarded to the text ``Field``.  Per the torchtext
            documentation, when set every example is padded to exactly
            this many tokens, so all text batches share the same sequence
            dimension.  ``None`` (the default) keeps the previous
            behaviour of flexible, per-batch sequence lengths.
        batch_first: Forwarded to the text ``Field``.  Per the torchtext
            documentation, ``True`` produces text tensors with the batch
            dimension first.  Defaults to ``False`` (previous behaviour).

    Returns:
        A tuple ``(trn, vld, TEXT)``: the 70% / 30% split of the generated
        dataset and the text ``Field`` whose vocabulary has been built.
    """
    TEXT = Field(
        sequential=True,
        tokenize=str.split,
        use_vocab=True,
        lower=True,
        fix_length=fix_length,
        batch_first=batch_first,
    )
    # Targets are already numeric arrays, so no vocabulary is needed.
    LABEL = Field(sequential=False, use_vocab=False)
    trn_fields = [('text', TEXT), ('category', LABEL)]

    examples = [
        Example.fromlist(list(pair), fields=trn_fields)
        for pair in generate_art_data()
    ]
    dt_train = Dataset(examples, fields=trn_fields)

    # Build the vocabulary from the tokenised dataset.  The generator
    # returned by generate_art_data() is exhausted once the examples have
    # been materialised, so handing it to build_vocab() would count no
    # tokens and every word would later be encoded as the unknown token.
    TEXT.build_vocab(dt_train)

    trn, vld = dt_train.split(split_ratio=0.7)
    return (trn, vld, TEXT)
def create_iterators(num_of_batches=4):
    """Build training and validation ``Iterator`` objects over the random data.

    Args:
        num_of_batches: Forwarded as ``batch_size`` to both iterators, i.e.
            it is the number of examples per batch (despite its name).

    Returns:
        A tuple ``(train_iter, val_iter, T)`` where ``T`` is the third
        value returned by ``create_datasets()``.
    """
    def text_length(example):
        # Sort key shared by both iterators: number of tokens in the text.
        return len(example.text)

    train_ds, valid_ds, text_field = create_datasets()

    # Show one training example so its token count can be compared with
    # the batch shapes printed by the caller.
    first_example = train_ds[0]
    print()
    print("Generated string: ", first_example.text)
    print("Length = ", len(first_example.text))

    train_iter, val_iter = (
        Iterator(split, batch_size=num_of_batches, sort_key=text_length)
        for split in (train_ds, valid_ds)
    )
    return train_iter, val_iter, text_field
train_iter, val_iter, T = create_iterators(5)

# Only the first batch is needed to inspect the tensor shapes, so stop
# right after it instead of walking the whole training set (which would
# never finish on torchtext versions where training iterators repeat).
for batch in train_iter:
    print()
    # NOTE(review): with the default Field settings torchtext lays the text
    # tensor out as (sequence length, batch size) -- confirm against the
    # torchtext version in use.
    print(batch.text.size(), batch.category.size())
    break
Output:
Generated string: ['planet', 'country', 'country', 'world']
Length = 4
torch.Size([9, 5]) torch.Size([5, 3])
This means that for a 4-word sentence it generated a tensor of size 9 by 5.
If I run it again:
Generated string: ['country', 'world']
Length = 2
torch.Size([4, 5]) torch.Size([5, 3])
What I do not understand is why the target tensor has the expected size (its first dimension is 5, the batch size), while dimension 0 of the text tensor varies and is not even related to the number of words in the sentence.
Can you please advise me how to generate batches with the same dimensions for any random sentence? The number of sentences (sentence tensors) in a batch should also be the same as the number of target tensors in the batch, which is 5 in this case.