RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

import sqlite3
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Stage 1: Data preparation

Create the SQLite database that will hold the data

conn = sqlite3.connect('translations1.db')
c = conn.cursor()

Create a table for the data

c.execute('''CREATE TABLE IF NOT EXISTS translations
             (source_text TEXT, target_text TEXT)''')

Add example data to the table

examples = [('Hello world', 'Привіт, світе'),
            ('How are you?', 'Як справи?'),
            ('I love programming', 'Я люблю програмувати'),
            ('Good morning', 'Доброго ранку'),
            ('Thank you', 'Дякую'),
            ('Where is the nearest restaurant?', 'Де найближчий ресторан?'),
            ('What time is it?', 'Котра година?'),
            ('I need help', 'Мені потрібна допомога'),
            ('The weather is nice today', 'Сьогодні гарна погода')]

c.executemany(“INSERT INTO translations (source_text, target_text) VALUES (?, ?)”, examples)

Commit the changes to the database

conn.commit()

Stage 2: Tokenization and numeric representation of the text

Tokenize the text

nltk.download('punkt')

def tokenize(text):
    return word_tokenize(text.lower())

Build vocabularies for the Ukrainian and English languages

def build_vocab(texts):
    word_counts = Counter()
    for text in texts:
        tokens = tokenize(text)
        word_counts.update(tokens)
    vocab = {word: idx + 2 for idx, (word, _) in enumerate(word_counts.most_common())}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

Fetch all texts from the database

c.execute("SELECT source_text FROM translations")
source_texts = [row[0] for row in c.fetchall()]
c.execute("SELECT target_text FROM translations")
target_texts = [row[0] for row in c.fetchall()]

Build the vocabularies for Ukrainian and English

source_vocab = build_vocab(source_texts)
target_vocab = build_vocab(target_texts)

Reverse vocabularies to map numeric ids back to words

reverse_source_vocab = {idx: word for word, idx in source_vocab.items()}
reverse_target_vocab = {idx: word for word, idx in target_vocab.items()}

Stage 3: Creating the DataLoader

Split the data into training, validation, and test sets

source_train, source_temp, target_train, target_temp = train_test_split(source_texts, target_texts, test_size=0.2, random_state=42)
source_val, source_test, target_val, target_test = train_test_split(source_temp, target_temp, test_size=0.5, random_state=42)

Dataset class for the training data

class TranslationDataset(data.Dataset):
    def __init__(self, source_texts, target_texts, source_vocab, target_vocab):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        source_text = self.source_texts[idx]
        target_text = self.target_texts[idx]
        source_tokens = [self.source_vocab.get(token, self.source_vocab['<UNK>']) for token in tokenize(source_text)]
        target_tokens = [self.target_vocab.get(token, self.target_vocab['<UNK>']) for token in tokenize(target_text)]
        return source_tokens, target_tokens

Parameters for the DataLoader

batch_size = 64
pad_idx = source_vocab['<PAD>']
print(data)

Function for padding the sequences in a batch

def collate_fn(data):
    source_seqs, target_seqs = zip(*data)
    source_padded = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(seq) for seq in source_seqs], padding_value=pad_idx, batch_first=True)
    target_padded = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(seq) for seq in target_seqs], padding_value=pad_idx, batch_first=True)
    return source_padded, target_padded

Create the DataLoader

train_dataset = TranslationDataset(source_train, target_train, source_vocab, target_vocab)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)

Stage 4: The Seq2Seq model

Seq2Seq model class

class Seq2Seq(nn.Module):
    def __init__(self, source_vocab_size, target_vocab_size, embed_size, hidden_size):
        super(Seq2Seq, self).__init__()
        self.source_embedding = nn.Embedding(source_vocab_size, embed_size)
        self.encoder = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.target_embedding = nn.Embedding(target_vocab_size, embed_size)
        self.decoder = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, target_vocab_size)

    def forward(self, source_seqs, target_seqs, source_lengths, target_lengths):
        source_embedded = self.source_embedding(source_seqs)
        packed_source = pack_padded_sequence(source_embedded, source_lengths, batch_first=True, enforce_sorted=False)
        encoder_outputs, (encoder_hidden, _) = self.encoder(packed_source)

        target_embedded = self.target_embedding(target_seqs)
        packed_target = pack_padded_sequence(target_embedded, target_lengths, batch_first=True, enforce_sorted=False)
        decoder_outputs, _ = self.decoder(packed_target, (encoder_hidden, _))
        unpacked_decoder_outputs, _ = pad_packed_sequence(decoder_outputs, batch_first=True)

        output_logits = self.output_layer(unpacked_decoder_outputs)
        return output_logits

Model parameters

embed_size = 256
hidden_size = 512

Create the model

model = Seq2Seq(len(source_vocab), len(target_vocab), embed_size, hidden_size)

Stage 5: Training the model

Loss function and optimizer

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters())

Training function

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for source_seqs, target_seqs in iterator:
        optimizer.zero_grad()
        source_lengths = torch.LongTensor([len(seq) for seq in source_seqs]).cpu()
        target_lengths = torch.LongTensor([len(seq) for seq in target_seqs]).cpu()
        source_seqs = source_seqs.to(device)
        target_seqs = target_seqs.to(device)

        # Move the creation of source_embedded to the CUDA device
        source_seqs = source_seqs.long()
        target_seqs = target_seqs.long()
        source_embedded = model.source_embedding(source_seqs).to(device)

        output_logits = model(source_embedded, target_seqs, source_lengths, target_lengths)
        output_logits_dim = output_logits.shape[-1]
        output_logits = output_logits[:, 1:].reshape(-1, output_logits_dim)
        target_seqs = target_seqs[:, 1:].reshape(-1)
        loss = criterion(output_logits, target_seqs)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


Train the model

N_EPOCHS = 10
CLIP = 1

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')

Stage 6: Evaluating and testing the model

Function to translate sentences

def translate_sentence(sentence, model, source_vocab, target_vocab, max_len=50):
    model.eval()
    tokens = tokenize(sentence)
    source_tokens = [source_vocab.get(token, source_vocab['<UNK>']) for token in tokens]
    source_tensor = torch.LongTensor(source_tokens).unsqueeze(0).to(device)  # Convert to torch.LongTensor and unsqueeze
    target_tensor = torch.LongTensor([[target_vocab['<SOS>']]]).to(device)  # Convert to torch.LongTensor

    with torch.no_grad():
        encoder_outputs, (encoder_hidden, _) = model.encoder(source_tensor, [len(source_tokens)])

    target_token = target_vocab['<SOS>']
    target_tokens = []
    for _ in range(max_len):
        target_tensor = torch.LongTensor([[target_token]]).to(device)
        with torch.no_grad():
            decoder_outputs, (encoder_hidden, _) = model.decoder(target_tensor, (encoder_hidden, _))

        output_logits = model.output_layer(decoder_outputs)
        _, top_token = output_logits.squeeze(0).topk(1)
        target_token = top_token.item()
        if target_token == target_vocab['<EOS>']:
            break
        target_tokens.append(target_token)

    translation = [reverse_target_vocab[token] for token in target_tokens]
    return ' '.join(translation)

Translate a sentence from English to Ukrainian

source_sentence = "Hello world"
translated_sentence = translate_sentence(source_sentence, model, source_vocab, target_vocab)
print(f'Source: {source_sentence}')
print(f'Translated: {translated_sentence}')

Stage 7: Saving the model and results

Save the model

torch.save(model.state_dict(), 'seq2seq_model.pth')

Evaluate the model on the test set

test_dataset = TranslationDataset(source_test, target_test, source_vocab, target_vocab)
test_loader = data.DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn)

def evaluate_bleu(model, test_loader, source_vocab, target_vocab):
    model.eval()
    bleu_scores = []
    for source_seqs, target_seqs, source_lengths, target_lengths in test_loader:
        source_sentence = ' '.join([reverse_source_vocab[token.item()] for token in source_seqs[0]])
        target_sentence = ' '.join([reverse_target_vocab[token.item()] for token in target_seqs[0]])
        translated_sentence = translate_sentence(source_sentence, model, source_vocab, target_vocab)
        bleu = sentence_bleu([target_sentence.split()], translated_sentence.split())
        bleu_scores.append(bleu)
    avg_bleu = np.mean(bleu_scores)
    return avg_bleu

avg_bleu_score = evaluate_bleu(model, test_loader, source_vocab, target_vocab)
print(f'Average BLEU Score on Test Data: {avg_bleu_score:.4f}')
When I run it, it throws this error:

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sasha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
<module 'torch.utils.data' from 'C:\Users\Sasha\Projeck\myvenv\lib\site-packages\torch\utils\data\__init__.py'>

RuntimeError Traceback (most recent call last)
Cell In[18], line 195
192 CLIP = 1
194 for epoch in range(N_EPOCHS):
→ 195 train_loss = train(model, train_loader, optimizer, criterion, CLIP)
196 print(f'Epoch: {epoch+1:02}')
197 print(f'\tTrain Loss: {train_loss:.3f}')

Cell In[18], line 169, in train(model, iterator, optimizer, criterion, clip)
166 target_seqs = target_seqs.long()
167 source_embedded = model.source_embedding(source_seqs).to(device)
→ 169 output_logits = model(source_embedded, target_seqs, source_lengths, target_lengths)
170 output_logits_dim = output_logits.shape[-1]
171 output_logits = output_logits[:, 1:].reshape(-1, output_logits_dim)

File ~\Projeck\myvenv\lib\site-packages\torch\nn\modules\module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)

File ~\Projeck\myvenv\lib\site-packages\torch\nn\modules\module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don’t have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

Cell In[18], line 126, in Seq2Seq.forward(self, source_seqs, target_seqs, source_lengths, target_lengths)
125 def forward(self, source_seqs, target_seqs, source_lengths, target_lengths):
→ 126 source_embedded = self.source_embedding(source_seqs)
127 packed_source = pack_padded_sequence(source_embedded, source_lengths, batch_first=True, enforce_sorted=False)
128 encoder_outputs, (encoder_hidden, _) = self.encoder(packed_source)

File ~\Projeck\myvenv\lib\site-packages\torch\nn\modules\module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)

File ~\Projeck\myvenv\lib\site-packages\torch\nn\modules\module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don’t have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File ~\Projeck\myvenv\lib\site-packages\torch\nn\modules\sparse.py:162, in Embedding.forward(self, input)
161 def forward(self, input: Tensor) → Tensor:
→ 162 return F.embedding(
163 input, self.weight, self.padding_idx, self.max_norm,
164 self.norm_type, self.scale_grad_by_freq, self.sparse)

File ~\Projeck\myvenv\lib\site-packages\torch\nn\functional.py:2233, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2227 # Note [embedding_renorm set_grad_enabled]
2228 # XXX: equivalent to
2229 # with torch.no_grad():
2230 # torch.embedding_renorm_
2231 # remove once script supports set_grad_enabled
2232 no_grad_embedding_renorm(weight, input, max_norm, norm_type)
→ 2233 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

nn.Embedding expects its input tensor to contain token indices as long or int values, but it is receiving floating-point values. In your train function you call model.source_embedding(source_seqs) yourself and then pass the resulting float tensor source_embedded into model(...); Seq2Seq.forward then calls self.source_embedding on that tensor a second time, and that call fails because the input is no longer an integer index tensor. Pass the index tensor to the model and let forward do the embedding, and it should work.
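
A minimal sketch of the corrected training step (assuming you keep Seq2Seq.forward as posted, so the model does the embedding itself):

# Inside train(): hand the padded index tensors straight to the model.
# Seq2Seq.forward calls self.source_embedding itself, so the manual
# model.source_embedding(source_seqs) line can simply be dropped.
source_seqs = source_seqs.to(device).long()   # LongTensor of token indices
target_seqs = target_seqs.to(device).long()

output_logits = model(source_seqs, target_seqs, source_lengths, target_lengths)

With that change the embedding layer receives integer indices, as it expects, and the RuntimeError goes away.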