Error when training on Windows: ATen\native\cuda\Indexing.cu:1290: block: [54,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed

This is my code:

class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super(GCN, self).__init__()

        self.convs = torch.nn.ModuleList()
        self.convs.append(GCNConv(in_channels, hidden_channels, cached=True))
        for _ in range(num_layers - 2):
            self.convs.append(
                GCNConv(hidden_channels, hidden_channels, cached=True))
        self.convs.append(GCNConv(hidden_channels, out_channels, cached=True))
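        # NOTE: cached=True makes GCNConv cache the normalized edge_index from
        # the first forward pass and reuse it afterwards; the PyG docs say this
        # should only be enabled in transductive settings with a fixed graph.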

        self.crf = CRF(constants.LABEL_SET_SIZE, batch_first=True)
        self.dropout = dropout
        self.encoder_v = getattr(torchvision.models, "resnet152")()
        self.encoder_v.load_state_dict(torch.load(f'./resources/models/cnn/resnet152.pth'))
        self.hid_dim_v = self.encoder_v.fc.in_features
        self.tokenizer = AutoTokenizer.from_pretrained("./resources/models/transformers/bert-base-uncased")
        self.encoder_t = AutoModel.from_pretrained("./resources/models/transformers/bert-base-uncased")
        config = AutoConfig.from_pretrained("./resources/models/transformers/bert-base-uncased")
        self.hid_dim_t = config.hidden_size
        self.device = torch.device(f'cuda:0')

        self.aux_head = nn.Linear(self.hid_dim_t, 2)
        hid_dim_rnn = 256
        num_layers = 2
        num_directions = 2
        self.head = nn.Linear(hid_dim_rnn * num_directions, constants.LABEL_SET_SIZE)

        self.proj = nn.Linear(self.hid_dim_v, self.hid_dim_t)
        self.bilstm = nn.LSTM(self.hid_dim_t, hid_dim_rnn, num_layers, batch_first=True, bidirectional=True)
        self.to(self.device)
    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()
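        # NOTE: self.bns is never defined in __init__, so this loop would
        # raise an AttributeError if reset_parameters() were called.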
        for bn in self.bns:
            bn.reset_parameters()
    def ner_encode(self, datas: List[Data], gate_signal=None):
        sentence_batch = [data.sentence for data in datas]
        tokens_batch = [[token.text for token in sentence] for sentence in sentence_batch]
        inputs = self.tokenizer(tokens_batch, is_split_into_words=True, padding=True, return_tensors='pt',
                                return_special_tokens_mask=True, return_offsets_mapping=True).to(self.device)

        outputs = self.gnn_forward(inputs, datas, gate_signal)
        feat_batch = outputs[:, :-VISUAL_LENGTH]

        ids_batch = inputs.input_ids
        offset_batch = inputs.offset_mapping
        mask_batch = inputs.special_tokens_mask.bool().bitwise_not()
        for sentence, ids, offset, mask, feat in zip(sentence_batch, ids_batch, offset_batch, mask_batch, feat_batch):
            ids = ids[mask]
            offset = offset[mask]
            feat = feat[mask]
            subtokens = self.tokenizer.convert_ids_to_tokens(ids)
            length = len(subtokens)

            token_list = []
            feat_list = []
            i = 0
            while i < length:
                j = i + 1

                while j < length and (offset[j][0] != 0 or subtokens[j].startswith(SUBTOKEN_PREFIX)):
                    j += 1
                token_list.append(''.join(subtokens[i:j]))
                feat_list.append(torch.mean(feat[i:j], dim=0))
                i = j
            assert len(sentence) == len(token_list)

            for token, token_feat in zip(sentence, feat_list):
                token.feat = token_feat

    def gnn_forward(self, inputs, datas: List[Data], gate_signal=None):
        images = [data.image for data in datas]
        images_data = torch.stack([image.data for image in images]).to(self.device)
        # Assuming datas is your List[Data]
        tokens = [[token for token in data.sentence] for data in datas]
        lengths = [len(token) for token in tokens]
        max_len = max(len(token) for token in tokens)  # Find the length of the longest sequence
        edge_index = []
        for token in tokens:
            for i in range(max_len - 1):
                if i < len(token):
                    edge_index.append([i, i + 1])  # Add an edge for this index pair
                else:
                    edge_index.append([0, 0])  # Add a dummy edge for padding

        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous().to(self.device)
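        # NOTE: every sentence reuses the local node ids 0..max_len-1; the ids
        # are not offset per graph, so the largest index is max_len - 1.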
        print(edge_index.shape)
        print((edge_index >= 48).any())
        word_embeddings = self.encoder_t.embeddings.word_embeddings(inputs['input_ids']).to(self.device)
        # Create PyTorch tensors
        x = word_embeddings
        print(x.shape)
        visual_embeds = resnet_encode(self.encoder_v, images_data)
        visual_embeds = self.proj(visual_embeds)
        if gate_signal is not None:
            visual_embeds *= gate_signal
        x = torch.concat((x, visual_embeds), dim=1)
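        # NOTE: x is 3-D here ([batch, seq_len + visual_len, hidden]), whereas
        # PyG's GCNConv documents node features as a 2-D [num_nodes, in_channels] matrix.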
        for i, conv in enumerate(self.convs[:-1]):
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.convs[-1](x, edge_index)

        return x

    def forward(self, datas: List[Data]):
        sentence_batch = [data.sentence for data in datas]
        tokens_batch = [[token.text for token in sentence] for sentence in sentence_batch]
        inputs = self.tokenizer(tokens_batch, is_split_into_words=True, padding=True, return_tensors='pt',
                                return_special_tokens_mask=True, return_offsets_mapping=True).to(self.device)
        inputs = inputs.to(self.device)
        outputs = self.gnn_forward(inputs, datas)
        feats = outputs[:, CLS_POS]
        logits = self.aux_head(feats)
        gate_signal = F.softmax(logits, dim=1)[:, 1].view(len(datas), 1, 1)

        self.ner_encode(datas, gate_signal)

        sentences = [data.sentence for data in datas]
        batch_size = len(sentences)
        lengths = [len(sentence) for sentence in sentences]
        max_length = max(lengths)

        feat_list = []
        zero_tensor = torch.zeros(max_length * self.hid_dim_t, device=self.device)
        for sentence in sentences:
            feat_list += [token.feat for token in sentence]
            num_padding = max_length - len(sentence)
            if num_padding > 0:
                padding = zero_tensor[:self.hid_dim_t * num_padding]
                feat_list.append(padding)
        feats = torch.cat(feat_list).view(batch_size, max_length, self.hid_dim_t)

        feats = nn.utils.rnn.pack_padded_sequence(feats, lengths, batch_first=True, enforce_sorted=False)
        feats, _ = self.bilstm(feats)
        feats, _ = nn.utils.rnn.pad_packed_sequence(feats, batch_first=True)

        logits_batch = self.head(feats)

        labels_batch = torch.zeros(batch_size, max_length, dtype=torch.long, device=self.device)
        for i, sentence in enumerate(sentences):
            labels = torch.tensor([token.label for token in sentence], dtype=torch.long, device=self.device)
            labels_batch[i, :lengths[i]] = labels

        mask = torch.zeros(batch_size, max_length, dtype=torch.bool, device=self.device)
        for i in range(batch_size):
            mask[i, :lengths[i]] = 1
        loss = -self.crf(logits_batch, labels_batch, mask)
        pred_ids = self.crf.decode(logits_batch, mask)
        pred = [[constants.ID_TO_LABEL[i] for i in ids] for ids in pred_ids]

        return loss, pred

With the model constructed as GCN(768, 256, 768, 3, 0.5), the debug prints in gnn_forward output:
torch.Size([2, 432])
tensor(False, device='cuda:0')
torch.Size([16, 48, 768])
and the error is:
C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\native\cuda\Indexing.cu:1290: block: [54,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
(the same assertion repeats for threads [1,0,0] through [8,0,0])

Traceback (most recent call last):
  File "Z:\CWI\venv\lib\site-packages\torch_geometric\nn\aggr\base.py", line 128, in __call__
    return super().__call__(x, index=index, ptr=ptr, dim_size=dim_size,
  File "Z:\CWI\venv\lib\site-packages\torch\nn\modules\module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "Z:\CWI\venv\lib\site-packages\torch\nn\modules\module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "Z:\CWI\venv\lib\site-packages\torch_geometric\nn\aggr\basic.py", line 22, in forward
    return self.reduce(x, index, ptr, dim_size, dim, reduce='sum')
  File "Z:\CWI\venv\lib\site-packages\torch_geometric\nn\aggr\base.py", line 182, in reduce
    return scatter(x, index, dim, dim_size, reduce)
  File "Z:\CWI\venv\lib\site-packages\torch_geometric\utils\_scatter.py", line 75, in scatter
    return src.new_zeros(size).scatter_add_(dim, index, src)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "Z:\CWI\GCN NER\train.py", line 63, in <module>
    ner_loss = train(ner_train_loader, model, optimizer)
  File "Z:\CWI\GCN NER\utils.py", line 32, in train
    loss, _ = getattr(model, f'forward')(batch)
  File "Z:\CWI\GCN NER\model.py", line 192, in forward
    outputs = self.gnn_forward(inputs, datas)
  File "Z:\CWI\GCN NER\model.py", line 147, in gnn_forward
    x = conv(x, edge_index)
  File "Z:\CWI\venv\lib\site-packages\torch\nn\modules\module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "Z:\CWI\venv\lib\site-packages\torch\nn\modules\module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "Z:\CWI\venv\lib\site-packages\torch_geometric\nn\conv\gcn_conv.py", line 263, in forward
    out = self.propagate(edge_index, x=x, edge_weight=edge_weight)
  File "C:\Users\Lenovo\.cache\pyg\message_passing\torch_geometric.nn.conv.gcn_conv_GCNConv_propagate.py", line 230, in propagate
    out = self.aggregate(
  File "Z:\CWI\venv\lib\site-packages\torch_geometric\nn\conv\message_passing.py", line 612, in aggregate
    return self.aggr_module(inputs, index, ptr=ptr, dim_size=dim_size,
  File "Z:\CWI\venv\lib\site-packages\torch_geometric\experimental.py", line 117, in wrapper
    return func(*args, **kwargs)
  File "Z:\CWI\venv\lib\site-packages\torch_geometric\nn\aggr\base.py", line 132, in __call__
    if index.numel() > 0 and dim_size <= int(index.max()):
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
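
For reference, the CUDA_LAUNCH_BLOCKING hint from the error message has to take effect before CUDA is initialized; setting it at the very top of train.py, before importing torch, is the safe option (a minimal sketch):

import os

# Make CUDA kernels launch synchronously so the Python stack trace points
# at the operation that actually failed instead of a later API call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # must come after the environment variable is set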

The scatter operation fails with invalid indices, as shown in the stack trace. Check the input indices and make sure they are in the valid range.

What input indices? The tokenized input array fed into the GNN?

The edge_index values are in the valid range: they are all below 48, as confirmed by the `(edge_index >= 48).any()` check printing False.
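
A stricter check than comparing against the sentence length is to compare edge_index with the size of the dimension PyG actually scatters into, right after the visual embeddings are concatenated (a minimal sketch using the variable names from gnn_forward above):

# Place directly after: x = torch.concat((x, visual_embeds), dim=1)
num_nodes = x.size(-2)  # GCNConv aggregates along node_dim=-2 by default
assert int(edge_index.min()) >= 0, "negative edge index"
assert int(edge_index.max()) < num_nodes, (
    f"max edge index {int(edge_index.max())} out of range for {num_nodes} nodes"
)

Running a single batch on the CPU usually turns the device-side assert into a plain IndexError that names the exact offending index, which is another quick way to see which tensor the invalid index goes into.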