BERT with torchtext | TypeError: '<' not supported between instances of 'Example' and 'Example'

I’m trying to train BERT with torchtext, but I get the following error:

Traceback (most recent call last):
  File "C:/Users/BHAAK/Desktop/ML_PATH/dirty-hands/DL file/bert pro.py", line 215, in <module>
    big_training_loop(555, optimizer, model, criterion, train_iterator, valid_iterator, accuracy_score=accuracy_score)
  File "C:/Users/BHAAK/Desktop/ML_PATH/dirty-hands/DL file/bert pro.py", line 121, in big_training_loop
    for (text, labels), (val_text, val_labels) in zip(train_loader, val_loader):
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\iterator.py", line 141, in __iter__
    self.init_epoch()
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\iterator.py", line 117, in init_epoch
    self.create_batches()
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\iterator.py", line 245, in create_batches
    self.batches = batch(self.data(), self.batch_size,
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\iterator.py", line 102, in data
    xs = sorted(self.dataset, key=self.sort_key)
TypeError: '<' not supported between instances of 'Example' and 'Example'

Process finished with exit code 1

It’s probably from this section of my code:

TEXT = data.Field(batch_first=True, use_vocab=False, tokenize=tokenize_and_cut, # include_lengths=True,
                  preprocessing=tokenizer.convert_tokens_to_ids, init_token=init_token_idx, eos_token=eos_token_idx,
                  pad_token=pad_token_idx, unk_token=unk_token_idx)

LABEL = data.LabelField(dtype=torch.float)

fields = [('text', TEXT), ('label', LABEL)]
train_data, test_data = data.TabularDataset('IMDB Dataset.csv', format='csv', skip_header=True, fields=fields).split()
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

BATCH_SIZE = 255

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# I think it's from BucketIterator?
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)
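
For reference, the commented-out version at the bottom of the script passes a sort_key, roughly like this (I removed it from the run above):

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda x: len(x.text),  # how the iterator should sort/bucket examples
    sort_within_batch=False)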

Any idea about the solution?

Here is the full script:

import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score, accuracy_score
from transformers import BertTokenizer, BertModel
import random, torch
import numpy as np
from torchtext import data, datasets

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']  # 512 for bert-base-uncased


def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length - 2]  # leave room for the [CLS] and [SEP] tokens
    return tokens


TEXT = data.Field(batch_first=True, use_vocab=False, tokenize=tokenize_and_cut, # include_lengths=True,
                  preprocessing=tokenizer.convert_tokens_to_ids, init_token=init_token_idx, eos_token=eos_token_idx,
                  pad_token=pad_token_idx, unk_token=unk_token_idx)

LABEL = data.LabelField(dtype=torch.float)

fields = [('text', TEXT), ('label', LABEL)]
train_data, test_data = data.TabularDataset('IMDB Dataset.csv', format='csv', skip_header=True, fields=fields).split()
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

BATCH_SIZE = 255

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)
# ', sort_key= lambda x: len(x.text), sort_within_batch=False'
bert = BertModel.from_pretrained('bert-base-uncased')

class BERTGRUSentiment(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']  # 768 for bert-base-uncased
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                          batch_first=True, dropout=0 if n_layers < 2 else dropout)

        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        with torch.no_grad():
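            # bert(text)[0] is the last hidden state, shape [batch size, seq len, hidden size]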
            embedded = self.bert(text)[0]
        # embedded = nn.utils.rnn.pack_padded_sequence(embedded)
        _, hidden = self.rnn(embedded)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        output = self.out(hidden)
        return output


HIDDEN_DIM = 26
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = BERTGRUSentiment(bert, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)



# freeze the BERT parameters so only the GRU and the linear output layer are trained
for name, param in model.named_parameters():
    if name.startswith('bert'):
        param.requires_grad = False


optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model
criterion = criterion.to(device)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.1)

model.to(device)

def big_training_loop(epochs, optim, clf, lossf, train_loader, val_loader, accuracy_score):
    # variables for early stopping
    n_epochs_stop = 10
    epochs_no_improve = 0
    min_val_loss = float('inf')

    # running epoch
    for epoch in range(1, epochs + 1):

        # variables for performance monitoring
        loss_train_list = []
        loss_val_list = []
        correct_train_list_fscroe = []
        correct_train_list = []
        correct_val_list_fscroe = []
        correct_val_list = []

        # training on batches
        for (text, labels), (val_text, val_labels) in zip(train_loader, val_loader):
            # preparing data
            labels = labels

            # - training section - #
            clf.train()
            output = clf(text).squeeze()
            print(output.shape, labels.shape)
            # compute loss function
            loss = lossf(output, labels)
            # append loss output to loss_train_list for monitoring
            loss_train_list.append(loss.item())

            # convert tensors to numpy array
            labels = labels.detach().numpy()
            output = torch.round(output).detach().numpy()
            # calculate accuracy score
            correct_train_list_fscroe.append(f1_score(output, labels, average='macro'))
            correct_train_list.append(accuracy_score(output, labels))
            # compute back propagation
            loss.backward()
            # update weights
            optim.step()
            # clean gradients to not accumulate
            optim.zero_grad()

            # - evaluation section - #
            clf.eval()
            with torch.no_grad():
                # preparing data
                #val_text, val_length = val_text
                val_labels = val_labels
                # - training section - #

                output = clf(val_text).squeeze()

                # compute loss function
                val_loss = lossf(output, val_labels).to(device)
                # append loss output to loss_val_list for monitoring
                loss_val_list.append(val_loss.item())

                # convert tensors to numpy array
                val_labels = val_labels.detach().numpy()
                val_output = torch.round(output).detach().numpy()
                # calculate accuracy score
                correct_val_list.append(accuracy_score(val_output, val_labels))
                correct_val_list_fscroe.append(f1_score(val_output, val_labels, average='macro'))

        # change lr if the loss didn't decrease

        loss = torch.mean(torch.FloatTensor(loss_train_list))
        val_loss = torch.mean(torch.FloatTensor(loss_val_list))
        acc = torch.mean(torch.FloatTensor(correct_train_list)) * 100
        fscore = torch.mean(torch.FloatTensor(correct_train_list_fscroe)) * 100
        val_acc = torch.mean(torch.FloatTensor(correct_val_list)) * 100
        val_fscore = torch.mean(torch.FloatTensor(correct_val_list_fscroe)) * 100

        scheduler.step(val_loss)

        # save best model if the loss is the best
        if val_loss < min_val_loss:
            # variables for best performance
            best_epoch = epoch
            best_loss = loss
            best_val_loss = val_loss
            best_acc = acc
            best_fscore = fscore
            best_val_acc = val_acc
            best_val_fscore = val_fscore
            # save best model
            epochs_no_improve = 0
            min_val_loss = best_val_loss
            # print the current epoch as the best epoch

            print(
                f'BEST EPOCH: Epoch({epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')

        else:
            # print the current epoch as normal epoch
            print(
                f'Epoch({epoch}) -> Train: (Accuracy: {acc:.1f}, f-score: {fscore:.1f}, Loss: {loss:.4f}) | Val: (Accuracy: {val_acc:.1f}, f-score: {val_fscore:.1f}, Loss: {val_loss:.4f})')
            # if epochs_no_improve reaches n_epochs_stop, training will stop
            epochs_no_improve += 1

        # early stop the training
        if epoch > 5 and epochs_no_improve == n_epochs_stop:
            torch.save(clf, f'clf_val_loss_{best_val_loss:.4f}_f-score_{best_val_fscore:.1f}_val_acc_{best_val_acc:.1f}.pt')
            print('Early stopping!')
            print()
            print(
                f'BEST EPOCH: Epoch({best_epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')
            break

print('training:')
big_training_loop(555, optimizer, model, criterion, train_iterator, valid_iterator, accuracy_score=accuracy_score)

'''import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score, accuracy_score
from transformers import BertTokenizer, BertModel
import random, torch
import numpy as np
from torchtext import data, datasets

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']


def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length - 2]
    return tokens


TEXT = data.Field(batch_first=True, use_vocab=False, tokenize=tokenize_and_cut, # include_lengths=True,
                  preprocessing=tokenizer.convert_tokens_to_ids, init_token=init_token_idx, eos_token=eos_token_idx,
                  pad_token=pad_token_idx, unk_token=unk_token_idx)

LABEL = data.LabelField(dtype=torch.float)

fields = [('text', TEXT), ('label', LABEL)]
train_data, test_data = data.TabularDataset('IMDB Dataset.csv', format='csv', skip_header=True, fields=fields).split()
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

BATCH_SIZE = 28

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=device, sort_key= lambda x: len(x.text), sort_within_batch=False)

bert = BertModel.from_pretrained('bert-base-uncased')

class BERTGRUSentiment(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                          batch_first=True, dropout=0 if n_layers < 2 else dropout)

        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        with torch.no_grad():
            embedded = self.bert(text)[0]
        # embedded = nn.utils.rnn.pack_padded_sequence(embedded)
        _, hidden = self.rnn(embedded)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        output = self.out(hidden)
        return output


HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = BERTGRUSentiment(bert, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)



for name, param in model.named_parameters():
    if name.startswith('bert'):
        param.requires_grad = False


optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.1)


def big_training_loop(epochs, optim, clf, lossf, train_loader, val_loader, accuracy_score):
    # variables for early stopping
    n_epochs_stop = 10
    epochs_no_improve = 0
    min_val_loss = float('inf')

    # running epoch
    for epoch in range(1, epochs + 1):

        # variables for performance monitoring
        loss_train_list = []
        loss_val_list = []
        correct_train_list_fscroe = []
        correct_train_list = []
        correct_val_list_fscroe = []
        correct_val_list = []

        # training on batches
        for (text, labels), (val_text, val_labels) in zip(train_loader, val_loader):
            # preparing data
            labels = labels

            # - training section - #
            clf.train()
            output = clf(text).squeeze()
            # compute loss function
            loss = lossf(output, labels)
            # append loss output to loss_train_list for mnitoring
            loss_train_list.append(loss.item())

            # convert tensors to numpy array
            labels = labels.detach().numpy()
            output = torch.round(output).detach().numpy()
            # calculate accuracy score
            correct_train_list_fscroe.append(f1_score(output, labels, average='macro'))
            correct_train_list.append(accuracy_score(output, labels))
            # compute back propagation
            loss.backward()
            # update weights
            optim.step()
            # clean gradients to not accumulate
            optim.zero_grad()

            # - evaluation section - #
            clf.eval()
            with torch.no_grad():
                # preparing data
                #val_text, val_length = val_text
                val_labels = val_labels
                # - training section - #
                clf.to(device)

                output = clf(val_text).squeeze()

                # compute loss function
                val_loss = lossf(output, val_labels)
                # append loss output to loss_val_list for mnitoring
                loss_val_list.append(val_loss.item())

                # convert tensors to numpy array
                val_labels = val_labels.detach().numpy()
                val_output = torch.round(output).detach().numpy()
                # calculate accuracy score
                correct_val_list.append(accuracy_score(val_output, val_labels))
                correct_val_list_fscroe.append(f1_score(val_output, val_labels, average='macro'))

        # change lr if the loss didn't decrease

            loss = torch.mean(torch.FloatTensor(loss_train_list))
            val_loss = torch.mean(torch.FloatTensor(loss_val_list))
            acc = torch.mean(torch.FloatTensor(correct_train_list)) * 100
            fscore = torch.mean(torch.FloatTensor(correct_train_list_fscroe)) * 100
            val_acc = torch.mean(torch.FloatTensor(correct_val_list)) * 100
            val_fscore = torch.mean(torch.FloatTensor(correct_val_list_fscroe)) * 100

            scheduler.step(val_loss)

            # save best model if the loss is the best
            if torch.mean(torch.FloatTensor(loss_val_list)) < min_val_loss:

                # variables for best performance
                best_epoch = epoch
                best_loss = loss
                best_val_loss = val_loss
                best_acc = acc
                best_fscore = fscore
                best_val_acc = val_acc
                best_val_fscore = val_fscore

                # save best model
                epochs_no_improve = 0
                min_val_loss = best_val_loss
                # print the current epoch as the best epoch

                print(
                    f'BEST EPOCH: Epoch({epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')

            else:
                # print the current epoch as normal epoch
                print(
                    f'Epoch({epoch}) -> Train: (Accuracy: {acc:.1f}, f-score: {fscore:.1f}, Loss: {loss:.4f}) | Val: (Accuracy: {val_acc:.1f}, f-score: {val_fscore:.1f},Loss: {val_loss:.4f})')
                # if epochs_no_improve reached n_epochs_stop the training will stop
                epochs_no_improve += 1

            # early stop the training
            if epoch > 5 and epochs_no_improve == n_epochs_stop:
                torch.save(clf, f'clf_val_loss_{best_val_loss:.4f}_f-score_{best_val_fscore:.1f}_val_acc_{best_val_acc:.1f}.pt')

                print('Early stopping!')
                print()
                print(
                    f'BEST EPOCH: Epoch({best_epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')
                break

print('training:')
big_training_loop(1000, optimizer, model, criterion, train_iterator, valid_iterator, accuracy_score=accuracy_score)
'''

This could be a problem. Instead, try this:

        for batch in train_loader:
            feature, target = batch.text, batch.label
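
Something like this inside your loop, as a rough sketch (using the field names from your fields list and the same variable names as your training section):

        for batch in train_loader:
            text, labels = batch.text, batch.label
            output = clf(text).squeeze()
            loss = lossf(output, labels)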

It works, thanks, but I’m wondering why it worked.
I mean, why didn’t my loop work while this does? Can you explain?

And if I want the loop to fit on the train_set and test on the val_set, can you tell me how?

Because your solution removes the val_set.