RuntimeError in a simple QA model using a Transformer

I am new to the Transformer architecture and PyTorch. I am trying to build a model that generates answers to questions. For simplicity, I am using three sample question-answer pairs in a pandas DataFrame. Training completes without errors, but calling generate_response afterwards raises the error shown below. I have tried changing many parts of the code, but with my limited knowledge I have been stuck on this error for the last two days. The code and the error are provided below. Please help me fix it.
Code:

!pip install -U torchtext==0.6
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset
import random
import numpy as np
import pandas as pd
data = {
    'Question': ['What is the capital of France?', 'Who wrote Hamlet?', 'What is the meaning of life?'],
    'Answer': ['Paris', 'William Shakespeare', '42']
}
df = pd.DataFrame(data)
df['Question'] = df['Question'].apply(lambda x: x.lower())
import spacy
nlp = spacy.load('en_core_web_sm')
def spacy_tokenize(text):
    return [tok.text for tok in nlp(text)]

question_field = Field(tokenize=spacy_tokenize, init_token='<sos>', eos_token='<eos>', lower=True, include_lengths=False, use_vocab=True)
answer_field = Field(tokenize=spacy_tokenize, init_token='<sos>', eos_token='<eos>', lower=True, include_lengths=False, use_vocab=True)
fields = [('Question', question_field), ('Answer', answer_field)]
examples = [torchtext.data.Example.fromlist([df['Question'][i], df['Answer'][i]], fields) for i in range(df.shape[0])]
question_field.build_vocab(df['Question'], min_freq=2)
answer_field.build_vocab(df['Answer'], min_freq=2)
dataset = torchtext.data.Dataset(examples, fields)
train_data, valid_data = dataset.split(split_ratio=0.8)
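# Sanity check (I am not sure this part is right): build_vocab is called on the
# raw DataFrame columns instead of the tokenized dataset/examples, and with
# min_freq=2 over only three short samples I suspect the vocabularies end up
# containing little beyond the special tokens.
print(len(question_field.vocab), len(answer_field.vocab))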
class TransformerModel(nn.Module):
    def __init__(self, input_dim, output_dim, n_layers, n_heads, hidden_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(hidden_dim, n_heads, hidden_dim, dropout), n_layers)
        self.decoder = nn.TransformerDecoder(nn.TransformerDecoderLayer(hidden_dim, n_heads, hidden_dim, dropout), n_layers)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, trg):
        src = self.embedding(src)
        trg = self.embedding(trg)

        src = self.dropout(src)
        trg = self.dropout(trg)

        encoder_output = self.encoder(src)
        decoder_output = self.decoder(trg, encoder_output)

        output = self.fc_out(decoder_output)

        return output
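# Note on tensor shapes (my understanding of nn.Transformer's defaults): with
# batch_first left at False, TransformerEncoder/TransformerDecoder expect
# sequence-first inputs, i.e. src embeddings of shape [src_len, batch, hidden_dim]
# and trg embeddings of shape [trg_len, batch, hidden_dim], so the token tensors
# passed to forward() should be [seq_len, batch].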
input_dim = len(question_field.vocab)
output_dim = len(answer_field.vocab)
n_layers = 6
n_heads = 4
hidden_dim = 512  
dropout = 0.1
model = TransformerModel(input_dim, output_dim, n_layers, n_heads, hidden_dim, dropout)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # without this, the model stays on the CPU while the iterators put batches on `device`
BATCH_SIZE = 64
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.Question),
    device=device  
)
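# Quick peek at what the iterator yields; as far as I understand, Field defaults
# to batch_first=False, so these tensors should be [seq_len, batch_size]:
for batch in train_iterator:
    print(batch.Question.shape, batch.Answer.shape)
    break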

def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0

    for batch in iterator:
        src = batch.Question
        trg = batch.Answer

        optimizer.zero_grad()

        output = model(src, trg)
        output_dim = output.shape[-1]

        output = output.view(-1, output_dim)

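        # I am unsure about the next two lines: trg already holds vocab
        # indices, and stoi maps token *strings* to indices, so (being a
        # defaultdict) this lookup probably maps everything to <unk> = 0.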
        trg = trg.transpose(0, 1)  
        trg = torch.tensor([answer_field.vocab.stoi[token.item()] for token in trg.reshape(-1)], dtype=torch.long, device=device)

        loss = criterion(output, trg)
        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}')

# Generate responses
def generate_response(model, question_field, answer_field, question):
    model.eval()

    tokenized_question = question_field.tokenize(question)
    tokenized_question = [question_field.init_token] + tokenized_question + [question_field.eos_token]
    numerical_question = [question_field.vocab.stoi[token] for token in tokenized_question]

    src_tensor = torch.LongTensor(numerical_question).unsqueeze(0).to(device)
    trg_tensor = torch.zeros(100).long().unsqueeze(0).to(device)
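    # src_tensor is [1, src_len] and trg_tensor is [1, 100] here; if the model
    # really expects sequence-first [seq_len, batch] inputs, these look
    # batch-first to me, but rearranging the dimensions is where I got stuck.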

    with torch.no_grad():
        output = model(src_tensor, trg_tensor)

    output = output.squeeze(0)
    generated_answer = [answer_field.vocab.itos[idx] for idx in output.argmax(1).cpu().numpy()]
    generated_answer = generated_answer[1:]  # Remove <sos>

    if '<eos>' in generated_answer:
        generated_answer = generated_answer[:generated_answer.index('<eos>')]

    return ' '.join(generated_answer)

# Example usage
question = "What is the capital of France?"
response = generate_response(model, question_field, answer_field, question)
print("Response:", response) 

Error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-129-bb346f3fdb29> in <cell line: 26>()
     24 # Example usage
     25 question = "What is the capital of France?"
---> 26 response = generate_response(model, question_field, answer_field, question)
     27 print("Response:", response)

14 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
   5380     q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
   5381     if static_k is None:
-> 5382         k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
   5383     else:
   5384         # TODO finish disentangling control flow so we don't do in-projections when statics are passed

RuntimeError: shape '[1, 400, 128]' is invalid for input of size 4608
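
If it helps, the numbers in the error seem consistent with a mismatch between the batch dimensions of the source and target (a rough calculation on my part, assuming n_heads=4 and hidden_dim=512, so I may be misreading the internals):

head_dim = 512 / 4 = 128
bsz * num_heads = 400, so bsz = 100 (the 100 positions of trg_tensor, read as the batch dimension)
the key tensor comes from the encoder memory: 1 * 9 * 512 = 4608 elements (src_tensor read as seq_len=1, batch=9)
1 * 400 * 128 = 51200 != 4608, hence the RuntimeError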