RuntimeError: index 28 is out of bounds for dimension 0 with size 28

Here is my model.py code:

from typing import List
import torch
from torch import nn
import torch.nn.functional as F
import torchvision
from transformers import PreTrainedTokenizer, PreTrainedModel, AutoTokenizer, AutoModel, AutoConfig
from dataset import DataPoint, Data
import constants
from torchcrf import CRF
from torch_geometric.nn import GCNConv

# constants for model
CLS_POS = 0
SUBTOKEN_PREFIX = '##'
IMAGE_SIZE = 224
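# a 224x224 image through a standard ResNet (total stride 32) gives a 7x7 feature map, i.e. 49 visual tokens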
VISUAL_LENGTH = (IMAGE_SIZE // 32) ** 2


def use_cache(module: nn.Module, data_points: List[DataPoint]):
    for parameter in module.parameters():
        if parameter.requires_grad:
            return False
    for data_point in data_points:
        if data_point.feat is None:
            return False
    return True


def resnet_encode(model, x):
    x = model.conv1(x)
    x = model.bn1(x)
    x = model.relu(x)
    x = model.maxpool(x)

    x = model.layer1(x)
    x = model.layer2(x)
    x = model.layer3(x)
    x = model.layer4(x)

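    # flatten the spatial map into a token sequence: [B, C, 7, 7] -> [B, 49, C] for a 224x224 input (C = 2048 for resnet152)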
    x = x.view(x.size()[0], x.size()[1], -1)
    x = x.transpose(1, 2)

    return x


class MultiModelModel(nn.Module):
    def __init__(
            self,
            device: torch.device,
            tokenizer: PreTrainedTokenizer,
            encoder_t: PreTrainedModel,
            hid_dim_t: int,
            encoder_v: nn.Module = None,
            hid_dim_v: int = None,
    ):
        super().__init__()
        self.device = device
        self.tokenizer = tokenizer
        self.encoder_t = encoder_t
        self.hid_dim_t = hid_dim_t
        self.encoder_v = encoder_v
        self.hid_dim_v = hid_dim_v
        self.token_embedding = None
        self.proj = nn.Linear(hid_dim_v, hid_dim_t)
        self.aux_head = nn.Linear(hid_dim_t, 2)
        hid_dim_rnn = 256
        num_layers = 2
        num_directions = 2
        self.gcn_layers = nn.ModuleList([
            GCNConv(self.hid_dim_t, self.hid_dim_t),
            GCNConv(self.hid_dim_t, self.hid_dim_t),
            GCNConv(self.hid_dim_t, self.hid_dim_t),
            GCNConv(self.hid_dim_t, self.hid_dim_t)
        ])
        self.rnn = nn.LSTM(self.hid_dim_t, hid_dim_rnn, num_layers, batch_first=True, bidirectional=True)
        self.head = nn.Linear(hid_dim_rnn * num_directions, constants.LABEL_SET_SIZE)
        self.crf = CRF(constants.LABEL_SET_SIZE, batch_first=True)
        self.to(device)

    @classmethod
    def from_pretrained(cls, cuda, t_encoder, v_encoder):
        device = torch.device(f'cuda:{cuda}')
        models_path = '../resources/models'

        encoder_t_path = f'{models_path}/transformers/{t_encoder}'
        tokenizer = AutoTokenizer.from_pretrained(encoder_t_path)
        encoder_t = AutoModel.from_pretrained(encoder_t_path)
        config = AutoConfig.from_pretrained(encoder_t_path)
        hid_dim_t = config.hidden_size

        encoder_v = getattr(torchvision.models, v_encoder)()
        encoder_v.load_state_dict(torch.load(f'{models_path}/cnn/{v_encoder}.pth'))
        hid_dim_v = encoder_v.fc.in_features

        return cls(
            device=device,
            tokenizer=tokenizer,
            encoder_t=encoder_t,
            hid_dim_t=hid_dim_t,
            encoder_v=encoder_v,
            hid_dim_v=hid_dim_v,
        )

    def _bert_forward_with_image(self, inputs, datas, gate_signal=None):
        images = [data.image for data, _, _ in datas]
        textual_embeds = self.encoder_t.embeddings.word_embeddings(inputs.input_ids)
        visual_embeds = torch.stack([image.data for image in images]).to(self.device)
        if not use_cache(self.encoder_v, images):
            visual_embeds = resnet_encode(self.encoder_v, visual_embeds)
        visual_embeds = self.proj(visual_embeds)
        if gate_signal is not None:
            visual_embeds *= gate_signal
        inputs_embeds = torch.concat((textual_embeds, visual_embeds), dim=1)

        batch_size = visual_embeds.size()[0]
        visual_length = visual_embeds.size()[1]

        attention_mask = inputs.attention_mask
        visual_mask = torch.ones((batch_size, visual_length), dtype=attention_mask.dtype, device=self.device)
        attention_mask = torch.cat((attention_mask, visual_mask), dim=1)

        token_type_ids = inputs.token_type_ids
        visual_type_ids = torch.ones((batch_size, visual_length), dtype=token_type_ids.dtype, device=self.device)
        token_type_ids = torch.cat((token_type_ids, visual_type_ids), dim=1)

        return self.encoder_t(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )

    def ner_encode(self, datas: List[Data], gate_signal=None):
        sentence_batch = [data.sentence for data, _, _ in datas]
        tokens_batch = [[token.text for token in sentence] for sentence in sentence_batch]
        inputs = self.tokenizer(tokens_batch, is_split_into_words=True, padding=True, return_tensors='pt',
                                return_special_tokens_mask=True, return_offsets_mapping=True).to(self.device)

        outputs = self._bert_forward_with_image(inputs, datas, gate_signal)
        feat_batch = outputs.last_hidden_state[:, :-VISUAL_LENGTH]

        ids_batch = inputs.input_ids
        offset_batch = inputs.offset_mapping
        mask_batch = inputs.special_tokens_mask.bool().bitwise_not()
        for sentence, ids, offset, mask, feat in zip(sentence_batch, ids_batch, offset_batch, mask_batch, feat_batch):
            ids = ids[mask]
            offset = offset[mask]
            feat = feat[mask]
            subtokens = self.tokenizer.convert_ids_to_tokens(ids)
            length = len(subtokens)

            token_list = []
            feat_list = []
            i = 0
            while i < length:
                j = i + 1

                while j < length and (offset[j][0] != 0 or subtokens[j].startswith(SUBTOKEN_PREFIX)):
                    j += 1
                token_list.append(''.join(subtokens[i:j]))
                feat_list.append(torch.mean(feat[i:j], dim=0))
                i = j
            assert len(sentence) == len(token_list)
            for token, token_feat in zip(sentence, feat_list):
                token.feat = token_feat


    def ner_forward(self, datas: List[Data]):
        tokens_batch = [[token.text for token in data.sentence] for data, _, _ in datas]
        inputs = self.tokenizer(tokens_batch, is_split_into_words=True, padding=True, return_tensors='pt')
        inputs = inputs.to(self.device)
        outputs = self._bert_forward_with_image(inputs, datas)
        feats = outputs.last_hidden_state[:, CLS_POS]
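        # softmax column 1 of the auxiliary head acts as a gate that scales the visual embeddings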
        logits = self.aux_head(feats)
        gate_signal = F.softmax(logits, dim=1)[:, 1].view(len(datas), 1, 1)
        _, edges, labels = datas[0]
        edge_index = torch.cat(edges, dim=1)
        self.ner_encode(datas, gate_signal)

        sentences = [data.sentence for data, _, _ in datas]
        batch_size = len(sentences)
        lengths = [len(sentence) for sentence in sentences]
        max_length = max(lengths)

        feat_list = []
        zero_tensor = torch.zeros(max_length * self.hid_dim_t, device=self.device)
        for sentence in sentences:
            feat_list += [token.feat for token in sentence]
            num_padding = max_length - len(sentence)
            if num_padding > 0:
                padding = zero_tensor[:self.hid_dim_t * num_padding]
                feat_list.append(padding)

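        # padded per-token features for the whole batch: [batch_size, max_length, hid_dim_t]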
        feats = torch.cat(feat_list).view(batch_size, max_length, self.hid_dim_t)
        print(feats.shape)

        for gcn_layer in self.gcn_layers:
            feats = gcn_layer(feats, edge_index)
        feats = nn.utils.rnn.pack_padded_sequence(feats, lengths, batch_first=True, enforce_sorted=False)
        feats, _ = self.rnn(feats)
        feats, _ = nn.utils.rnn.pad_packed_sequence(feats, batch_first=True)

        logits_batch = self.head(feats)

        labels_batch = torch.zeros(batch_size, max_length, dtype=torch.long, device=self.device)
        for i, sentence in enumerate(sentences):
            labels = torch.tensor([token.label for token in sentence], dtype=torch.long, device=self.device)
            labels_batch[i, :lengths[i]] = labels

        mask = torch.zeros(batch_size, max_length, dtype=torch.bool, device=self.device)
        for i in range(batch_size):
            mask[i, :lengths[i]] = 1
        loss = -self.crf(logits_batch, labels_batch, mask)
        pred_ids = self.crf.decode(logits_batch, mask)
        pred = [[constants.ID_TO_LABEL[i] for i in ids] for ids in pred_ids]

        return loss, pred

And here is dataset.py:

import torch
from torch.utils.data import Dataset
from typing import List, Optional
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
import spacy


class DataPoint:
    def __init__(self):
        self.feat: Optional[torch.Tensor] = None
        self.label: Optional[int] = None


class Token(DataPoint):
    def __init__(self, text, label):
        super().__init__()
        self.text: str = text
        self.label = label


class Sentence(DataPoint):
    def __init__(self, tokens: List[Token] = None, text: str = None):
        super().__init__()
        self.tokens: List[Token] = tokens
        self.text = text

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, index: int):
        return self.tokens[index]

    def __iter__(self):
        return iter(self.tokens)

    def __str__(self):
        return self.text if self.text else ' '.join([token.text for token in self.tokens])


class ImageData(DataPoint):
    def __init__(self, file_name: str):
        super().__init__()
        #print(file_name)
        self.file_name: str = file_name
        self.data: Optional[torch.Tensor] = None


class Data(DataPoint):
    def __init__(self, sentence, image, label=-1):
        super().__init__()
        self.sentence: Sentence = sentence
        self.image: ImageData = image
        self.label = label


class SpacyParser:

    def __init__(self, data: List[Data], transform):
        self.datas = data
        self._invalid_words = [' ']
        self.parser = spacy.load("en_core_web_sm")

    def execute(self):
        edges_b = []
        labels_b = []
        for data in tqdm(self.datas):
            parsed = self.parser(str(data.sentence))
            edges = []
            i = 0
            items_dict = dict()
            for item in parsed:
                if item.orth_ in self._invalid_words:
                    continue
                items_dict[item.idx] = i
                i += 1

            for item in parsed:
                if item.orth_ in self._invalid_words:
                    continue
                index = items_dict[item.idx]
                for child_index in [items_dict[l.idx] for l in item.children
                                    if not l.orth_ in self._invalid_words]:
                    edges.append((index, child_index))
            edges = torch.tensor(edges, dtype=torch.long)
            edges = edges.t().contiguous().long()
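            # per-sentence edge list in COO format: shape [2, num_edges], node indices restart at 0 for every sentence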
            labels = [token.label for token in data.sentence]
            edges_b.append(edges)
            labels_b.append(torch.tensor(labels, dtype=torch.long))
        return edges_b, labels_b


class CustomDataset(Dataset):
    def __init__(self, datas: List[Data], path_to_images: str, load_image: bool = True):
        self.datas: List[Data] = datas
        self.path_to_images = path_to_images
        self.load_image = load_image
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        self.edges, self.labels = SpacyParser(self.datas, self.transform).execute()

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, index: int):

        data = self.datas[index]

        if self.load_image:
            image = data.image

            if image.data is not None or image.feat is not None:
                return data
            # print(image.file_name)
            path_to_image = self.path_to_images + "/" + image.file_name
            image.data = Image.open(path_to_image).convert('RGB')
            image.data = self.transform(image.data)
        return data, self.edges, self.labels


class Corpus:
    def __init__(self, train=None, dev=None, test=None):
        self.train: CustomDataset = train
        self.dev: CustomDataset = dev
        self.test: CustomDataset = test

When I run it with the following train.py:

import os
import pickle
import torch
from torch.utils.data import DataLoader
import loader
from model import MultiModelModel
from utils import seed_worker, seed_everything, train, evaluate
if __name__ == '__main__':
    num_workers = 8
    encoder_t = 'bert-base-uncased'
    encoder_v = 'resnet152'
    dataset = 'twitter2015'
    lr = 1e-5
    num_epochs = 1
    optim = 'Adam'
    bs = 16

    seed_everything(0)
    generator = torch.Generator()
    generator.manual_seed(0)

    if num_workers > 0:
        torch.multiprocessing.set_sharing_strategy('file_system')
        os.environ['TOKENIZERS_PARALLELISM'] = 'true'
    ner_corpus = loader.load_ner_corpus(f'resources/datasets/{dataset}', load_image=(encoder_v != ''))
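    # collate_fn=list keeps each batch as a plain Python list of the dataset's __getitem__ outputs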
    ner_train_loader = DataLoader(ner_corpus.train, batch_size=bs, collate_fn=list, num_workers=num_workers,
                                  shuffle=True, worker_init_fn=seed_worker, generator=generator)
    ner_dev_loader = DataLoader(ner_corpus.dev, batch_size=bs, collate_fn=list, num_workers=num_workers)
    ner_test_loader = DataLoader(ner_corpus.test, batch_size=bs, collate_fn=list, num_workers=num_workers)
    model = MultiModelModel.from_pretrained(0, encoder_t, encoder_v)  # CUDA device index, Transformer encoder, vision encoder
    params = [
        {'params': model.encoder_t.parameters(), 'lr': lr},
        {'params': model.head.parameters(), 'lr': lr * 100},
        {'params': model.encoder_v.parameters(), 'lr': lr},
        {'params': model.proj.parameters(), 'lr': lr * 100},
        {'params': model.rnn.parameters(), 'lr': lr * 100},
        {'params': model.crf.parameters(), 'lr': lr * 100},
        {'params': model.aux_head.parameters(), 'lr': lr * 100}
    ]

    optimizer = getattr(torch.optim, optim)(params)

    dev_f1s, test_f1s = [], []
    ner_losses, itr_losses = [], []
    best_dev_f1, best_test_report = 0, None
    # training loop
    for epoch in range(1, num_epochs + 1):

        ner_loss = train(ner_train_loader, model, optimizer, task='ner')
        ner_losses.append(ner_loss)

        dev_f1, dev_report = evaluate(model, ner_dev_loader)
        dev_f1s.append(dev_f1)
        test_f1, test_report = evaluate(model, ner_test_loader)
        test_f1s.append(test_f1)
        if dev_f1 > best_dev_f1:
            best_dev_f1 = dev_f1
            best_test_report = test_report
    print()

    print(best_test_report)

    file_name = f'trained/{encoder_t}-BiLSTM-{encoder_v}.pkl'
    pickle.dump(model, open(file_name, 'wb'))

I get the following error:

Traceback (most recent call last):
  File "D:\projects\GNNNER\BiLSTM-Resnet152-Bert NER\train.py", line 59, in <module>
    ner_loss = train(ner_train_loader, model, optimizer, task='ner')
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\projects\GNNNER\BiLSTM-Resnet152-Bert NER\utils.py", line 32, in train
    loss, _ = getattr(model, f'{task}_forward')(batch)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\projects\GNNNER\BiLSTM-Resnet152-Bert NER\model.py", line 203, in ner_forward
    feats = gcn_layer(feats, edge_index)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\projects\GNNNER\venv\Lib\site-packages\torch\nn\modules\module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\projects\GNNNER\venv\Lib\site-packages\torch\nn\modules\module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\projects\GNNNER\venv\Lib\site-packages\torch_geometric\nn\conv\gcn_conv.py", line 241, in forward
    edge_index, edge_weight = gcn_norm(  # yapf: disable
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\projects\GNNNER\venv\Lib\site-packages\torch_geometric\nn\conv\gcn_conv.py", line 108, in gcn_norm
    deg = scatter(edge_weight, idx, dim=0, dim_size=num_nodes, reduce='sum')
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\projects\GNNNER\venv\Lib\site-packages\torch_geometric\utils\_scatter.py", line 75, in scatter
    return src.new_zeros(size).scatter_add_(dim, index, src)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: index 28 is out of bounds for dimension 0 with size 28
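
As far as I understand, GCNConv expects a node feature matrix of shape [num_nodes, channels] and an edge_index of shape [2, num_edges] whose entries are all smaller than num_nodes. This small standalone snippet (made-up sizes, not my real data) raises the same error as soon as a single edge refers to node 28 while there are only 28 nodes:

import torch
from torch_geometric.nn import GCNConv

conv = GCNConv(768, 768)
x = torch.randn(28, 768)                 # 28 nodes with 768 features each
edge_index = torch.tensor([[0, 1, 27],
                           [1, 28, 0]])  # 28 is not a valid node index
out = conv(x, edge_index)                # raises: RuntimeError: index 28 is out of bounds for dimension 0 with size 28

So I suspect that the edge_index I build in ner_forward does not match the padded feature tensor I pass to the GCN layers, but I can't see where the mismatch comes from. What am I doing wrong?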