How to solve ValueError: expected sequence of length 15 at dim 1 (got 18) error in python

I am training a simple custom NER model using Hugging face model. My inputs are of different lengths which I solve by truncation and padding.

I am training this on 2 GPU's.

I get the below errors as outputs are of different lengths:

ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/", line 61, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/tmp/ipykernel_1511906/", line 253, in forward
    return [, torch.tensor(prediction).to(device)]
ValueError: expected sequence of length 15 at dim 1 (got 18)

Here is the complete code:

import os
import warnings
import compress_json
from collections import Counter
import tqdm
import random
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"]= "true"
from torchcrf import CRF
from transformers import BertTokenizerFast as BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW, BertModel, BertConfig
import torch.nn as nn
import torch.nn.functional as F
log_soft = F.log_softmax
from transformers import (Trainer,TrainingArguments)
from import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

{'text': "My name is Jon. I live in Germany.",
'spans': [{'start': 12, 'end': 14, 'label': 'name', 'ngram': 'Jon'},
          {'start': 27, 'end': 33, 'label': 'country', 'ngram': 'Germany'}

{'text': "My name is Jony. I live in Russia. I am good and back from school.",
'spans': [{'start': 12, 'end': 15, 'label': 'name', 'ngram': 'Jony'},
          {'start': 28, 'end': 33, 'label': 'country', 'ngram': 'Russia'}
{'text': "My name is Tony. I live in Poland.",
'spans': [{'start': 12, 'end': 15, 'label': 'name', 'ngram': 'Tony'},
          {'start': 28, 'end': 33, 'label': 'country', 'ngram': 'Poland'}
{'text': "My name is Yun. I live in Holland. I am not.",
'spans': [{'start': 12, 'end': 14, 'label': 'name', 'ngram': 'Yun'},
          {'start': 27, 'end': 33, 'label': 'country', 'ngram': 'Holland'}

model_checkpoint = "spanbert-base"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)

def isin(a, b):
    return a[1] > b[0] and a[0] < b[1]

def tokenize_and_align_labels(examples, label2id, max_length=512):
    return tokenized_inputs

train_set = [
        [{'start': y["start"], 'end': y["end"], 'tag': y["label"], 'text': y["ngram"]} for y in x['spans']]
    ] for x in train_data

## get label list
ori_label_list = []
for line in train_set:
    ori_label_list += [entity['tag'] for entity in line[1]]

ori_label_list = sorted(list(set(ori_label_list)))

label_list = []
for prefix in 'BI':
    label_list += [prefix + '-' + x for x in ori_label_list]
label_list += ['O']
label_list = sorted(list(set(label_list)))
label2id = {n:i for i,n in enumerate(label_list)}
id2label= {i:n for i,n in enumerate(label_list)}

train_examples ={'texts':[x[0] for x in train_set],'tag_names':[x[1] for x in train_set]}
train_data = tokenize_and_align_labels(train_examples,label2id)


class MyDataset(
    def __init__(self, examples):
        self.encodings = examples
        # print(self.encodings)
        # print()
        self.labels = examples['labels']

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # print(item)
        # item["labels"] = torch.tensor([self.labels[idx]])
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):

        return len(self.labels)


bert_model = BertForTokenClassification.from_pretrained(

class BERT_CRF(nn.Module):

    def __init__(self, bert_model, num_labels):
        super(BERT_CRF, self).__init__()
        self.bert = bert_model
        self.config = self.bert.config
        self.dropout = nn.Dropout(0.25)
        self.classifier = nn.Linear(768, num_labels)

        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        print("the types in forward",type(input_ids), type(attention_mask), type(labels),type(token_type_ids))
        outputs = self.bert(input_ids, attention_mask=attention_mask) #output_hidden_states=True,return_dict=False 

        sequence_output = torch.stack((outputs[1][-1], outputs[1][-2], outputs[1][-3], outputs[1][-4])).mean(dim=0)

        sequence_output = self.dropout(sequence_output)

        emission = self.classifier(sequence_output)  # [32,256,17]
        labels = labels.reshape(attention_mask.size()[0], attention_mask.size()[1])

        if labels is not None:
            loss = -self.crf(log_soft(emission, 2), labels, mask=attention_mask.type(torch.uint8), reduction='mean')

            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))

            return [, torch.tensor(prediction2).to(device)]

            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            return prediction

model = BERT_CRF(bert_model, num_labels=len(label2id))

args = TrainingArguments(
    # evaluation_strategy="epoch",


trainer = Trainer(


The error is most likely raised in the torch.tensor(prediciton) call, which might use a list of numpy arrays with a different shape as seen here:

prediction = [np.random.randn(15), np.random.randn(18)]
# ValueError: expected sequence of length 15 at dim 1 (got 18)

Check if that’s the case and make sure each array has the same length if you want to create a single tensor from them.

Hi @ptrblck thank you for your response. Yes, I noticed that the torch.tensor(prediciton) is of different lengths. The outputs = self.bert(input_ids, attention_mask=attention_mask) gives me output of different lengths, though I have used padding and truncation in tokenizer

Please suggest me how to make consistent length output.?

I don’t know if your code removes the padding internally somewhere but you could try to add it back assuming you need to create a single prediction tensor.
If not, you might be able to process each prediction separately.