TypeError: forward() got an unexpected keyword argument 'return_dict' BERT CLASSIFICATION HUGGINGFACE with ray tuning

I’m stuck with this model; every day new errors show up in my code! Anyway, I’m trying to implement a BERT classifier to discriminate between 2 classes of sequences (binary classification), with Ax hyperparameter tuning.
Below is all my code, preceded by a sample of my dataset (I have 3 CSV files: train, test and val). Thank you very much!

df_train=pd.read_csv('CLASSIFIER_train',sep=',',header=None)
df_train
                                               0	    1
	M A T T D R P T P D G T D A I D L T T R V R R...	1
	M K K L F Q T E P L L E L F N C N E L R I I G...	0
	M L V A A A V C P H P P L L I P E L A A G A A...	1
	M I V A W G N S G S G L L I L I L S L A V S A...	0
	M V E E G R R L A A L H P N I V V K L P T T E...	1
	M G S K V S K N A L V F N V L Q A L R E G L T...	1
	M P S K E T S P A E R M A R D E Y Y M R L A M...	1
	M V K E Y A L E W I D G Y R E R L V K V S D A...	1
	M G T A A S Q D R A A M A E A A Q R V G D S F...	0
class SequenceDataset(Dataset):
  """Map-style dataset that tokenizes one sequence per item.

  Args:
    sequences: indexable collection of sequence strings.
    targets: indexable collection of integer class labels, aligned with
      ``sequences``.
    tokenizer: object exposing a HuggingFace-style ``encode_plus``.
    max_len: every encoded sequence is padded/truncated to this length.
  """

  def __init__(self, sequences, targets, tokenizer, max_len):
    self.sequences = sequences
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.sequences)

  def __getitem__(self, item):
    """Return the raw text, token ids, attention mask and label of ``item``."""
    sequence = str(self.sequences[item])

    encoding = self.tokenizer.encode_plus(
      sequence,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
      padding='max_length',
      # Without truncation, sequences longer than max_len yield tensors of
      # different lengths that the default DataLoader collation cannot stack.
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'sequences_text': sequence,
      # encode_plus returns (1, max_len) tensors; flatten to (max_len,).
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(self.targets[item], dtype=torch.long),
    }

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True, num_workers=2):
  """Wrap a two-column frame (0: sequence, 1: label) in a DataLoader.

  Args:
    df: DataFrame whose column 0 holds the sequences and column 1 the labels.
    tokenizer: tokenizer forwarded to SequenceDataset.
    max_len: padded/truncated length forwarded to SequenceDataset.
    batch_size: number of samples per batch.
    shuffle: reshuffle every epoch; pass False for validation/test loaders.
    num_workers: number of loader worker processes.

  Returns:
    A DataLoader yielding the dictionaries produced by SequenceDataset.
  """
  ds = SequenceDataset(
    sequences=df[0].to_numpy(),
    targets=df[1].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len,
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=shuffle,
  )

def net_train(net, train_data_loader, parameters, dtype, device):
  """Train ``net`` on ``train_data_loader`` with SGD + NLLLoss and return it.

  ``parameters`` is read with ``.get``: "lr" (default 0.001), "momentum"
  (0.9), "step_size" (30), "gamma" (1.0) and "num_epochs" (3).
  """
  net.to(dtype=dtype, device=device)

  # Define loss and optimizer
  #criterion = nn.CrossEntropyLoss()
  criterion = nn.NLLLoss()
  optimizer = optim.SGD(net.parameters(), # or any optimizer you prefer 
                        lr=parameters.get("lr", 0.001), # 0.001 is used if no lr is specified
                        momentum=parameters.get("momentum", 0.9)
  )

  scheduler = optim.lr_scheduler.StepLR(
      optimizer,
      step_size=int(parameters.get("step_size", 30)),
      gamma=parameters.get("gamma", 1.0),  # default is no learning rate decay
  )

  num_epochs = parameters.get("num_epochs", 3) # Play around with epoch number

  # Train Network
  for _ in range(num_epochs):
      # Your dataloader returns a dictionary
      # so access it as such
      for batch in train_data_loader:
          # move the batch tensors to the target device
          labels = batch['targets'].to(device=device)
          attention_mask = batch['attention_mask'].to(device=device)
          input_ids = batch['input_ids'].to(device=device)
          #labels = labels.long()
          # zero the parameter gradients
          optimizer.zero_grad()

          # forward + backward + optimize
          # NOTE(review): this is the line from the traceback. BERT_Arch.forward
          # in this post accepts only (input_ids, attention_mask) and returns a
          # single tensor, so passing return_dict and unpacking two values
          # cannot work with that model.
          outputs,x= net(input_ids, attention_mask,return_dict=True)
          #outputs,x= net(input_ids,atten_mask)


          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()
          # the scheduler advances once per batch, so step_size counts
          # optimizer steps rather than epochs
          scheduler.step()
  return net
  
  
class BERT_Arch(nn.Module):
    """Pretrained encoder followed by a small classification head.

    Args:
        bert: encoder that, called with ``return_dict=False``, returns a pair
            whose second element is the pooled output ``[batch, hidden_size]``.
        num_classes: number of output classes (2 for binary classification).
        hidden_size: width of the encoder's pooled output.

    The forward pass returns log-probabilities of shape
    ``[batch, num_classes]``, the input format expected by ``nn.NLLLoss``.
    """

    def __init__(self, bert, num_classes=2, hidden_size=1024):
        super(BERT_Arch, self).__init__()

        self.bert = bert

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, 512)

        # dense layer 2 (output layer): one unit per class. A single unit
        # followed by LogSoftmax(dim=1) is constantly 0 and cannot represent
        # two classes.
        self.fc2 = nn.Linear(512, num_classes)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return class log-probabilities for a batch of token ids."""
        # only the pooled output is used by the head
        _, cls_hs = self.bert(input_ids, attention_mask, return_dict=False)

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # convert logits to log-probabilities
        return self.softmax(x)

from transformers import AutoModel
# Load the pretrained encoder named by PRE_TRAINED_MODEL_NAME; init_net wraps
# this module-level instance in BERT_Arch.
bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

from transformers.models.bert.modeling_bert import BertForSequenceClassification
def init_net(parameterization):
    """Build an untrained BERT_Arch on ``device`` with a frozen encoder.

    Args:
        parameterization: trial parameters from the tuner (currently unused;
            kept so the depth of unfreezing can become a hyperparameter).

    Returns:
        The model with only the classification head left trainable.
    """
    model = BERT_Arch(bert)

    # push the model to GPU
    model = model.to(device)

    # Freeze the feature extractor only. Freezing model.parameters() would
    # also freeze the freshly initialised head, leaving nothing to train.
    for param in model.bert.parameters():
        param.requires_grad = False

    return model  # return untrained model

def train_evaluate(parameterization):
    """Run one tuning trial and return the score reported by ``evaluate``.

    A new training loader is built on every call so that the batch size
    ("batchsize", default 32) can be tuned; a fresh network is then trained
    and scored on ``test_data_loader``.
    """
    batch_size = parameterization.get("batchsize", 32)
    loader = create_data_loader(df_train, tokenizer, MAX_LEN, batch_size=batch_size)

    # Build a fresh network and train it with this trial's parameters.
    fitted = net_train(
        net=init_net(parameterization),
        train_data_loader=loader,
        parameters=parameterization,
        dtype=dtype,
        device=device,
    )

    # The returned value is what the tuner optimizes.
    return evaluate(net=fitted, data_loader=test_data_loader, dtype=dtype, device=device)

# Globals read by train_evaluate.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype = torch.float

# Tune learning rate (log scale), batch size and momentum; the objective is
# the value returned by train_evaluate.
best_parameters, values, experiment, model = optimize(
    parameters=[
        dict(name="lr", type="range", bounds=[1e-6, 0.4], log_scale=True),
        dict(name="batchsize", type="range", bounds=[16, 128]),
        dict(name="momentum", type="range", bounds=[0.0, 1.0]),
        # Not tuned yet: "max_epoch" in [1, 30], "stepsize" in [20, 40].
    ],
    evaluation_function=train_evaluate,
    objective_name='accuracy',
)

# Report the best configuration followed by its estimated mean and covariance.
print(best_parameters)
means, covariances = values
print(means, covariances, sep='\n')

File "<ipython-input-61-aa60b2f44317>", line 35, in net_train
    outputs,x= net(input_ids, attention_mask,return_dict=True)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
TypeError: forward() got an unexpected keyword argument 'return_dict'

I’m not sure how the error message fits the posted code, but based on the failure it seems that net doesn’t recognize the return_dict argument, while the BertModel should accept it.
Could you verify that net is indeed self.bert or generally a BertModel object?

I checked and decided to modify the way the model is implemented.

this is the updated code:

def net_train(net, train_data_loader, parameters, dtype, device):
  """Train ``net`` on ``train_data_loader`` with SGD + NLLLoss and return it.

  ``parameters`` is read with ``.get``: "lr" (default 0.001), "momentum"
  (0.9), "step_size" (30), "gamma" (1.0) and "num_epochs" (3).
  """
  net.to(dtype=dtype, device=device)

  # Define loss and optimizer
  #criterion = nn.CrossEntropyLoss()
  criterion = nn.NLLLoss()
  optimizer = optim.SGD(net.parameters(), # or any optimizer you prefer 
                        lr=parameters.get("lr", 0.001), # 0.001 is used if no lr is specified
                        momentum=parameters.get("momentum", 0.9)
  )

  scheduler = optim.lr_scheduler.StepLR(
      optimizer,
      step_size=int(parameters.get("step_size", 30)),
      gamma=parameters.get("gamma", 1.0),  # default is no learning rate decay
  )

  num_epochs = parameters.get("num_epochs", 3) # Play around with epoch number

  # Train Network
  for _ in range(num_epochs):
      # Your dataloader returns a dictionary
      # so access it as such
      for batch in train_data_loader:
          # move the batch tensors to the target device
          labels = batch['targets'].to(device=device)
          attention_mask = batch['attention_mask'].to(device=device)
          input_ids = batch['input_ids'].to(device=device)
          #labels = labels.long()
          # zero the parameter gradients
          optimizer.zero_grad()

          # forward + backward + optimize
          # NOTE(review): `net` is the BertModel returned by init_net. The
          # error quoted after this code (target size [115, 1024] expected)
          # suggests `outputs` is still the 3-D per-token encoder output
          # rather than the output of model.fc -- confirm.
          outputs,x= net(input_ids, attention_mask,return_dict=False)
          #outputs,x= net(input_ids,atten_mask)


          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()
          # the scheduler advances once per batch, so step_size counts
          # optimizer steps rather than epochs
          scheduler.step()
  return net
  
#from transformers.models.bert.modeling_bert import BertForSequenceClassification,AutoModel
def init_net(parameterization):
    """Load the pretrained BertModel, freeze it and attach a head as ``model.fc``.

    ``parameterization`` is not used here.

    NOTE(review): nothing in this post calls ``model.fc``; assigning it as an
    attribute presumably does not make BertModel's forward pass use it --
    confirm. The 3-D outputs reported in the thread are consistent with the
    head never running.
    """

    model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,return_dict=True) # pretrained BERT encoder

    # The depth of unfreezing is also a hyperparameter
    for param in model.parameters():
        param.requires_grad = False # Freeze feature extractor
        
    Hs = 512 # Hidden layer size; you can optimize this as well
                                  
    # The head is attached after the freeze loop, so its parameters keep
    # requires_grad=True. Its last layer has a single unit, and LogSoftmax over
    # a dimension of size 1 is always 0.
    model.fc = nn.Sequential(nn.Linear(1024, 512), # attach trainable classifier
                                 nn.ReLU(),
                                 nn.Dropout(0.2),
                                 nn.Linear(Hs, 1),
                                 nn.LogSoftmax(dim=1))
    return model # return untrained model

def train_evaluate(parameterization):
    """Evaluation function for the tuner: train once, return the test score.

    The training loader is rebuilt per trial so "batchsize" (default 32) can
    be part of the search space.
    """
    loader = create_data_loader(
        df_train,
        tokenizer,
        MAX_LEN,
        batch_size=parameterization.get("batchsize", 32),
    )

    # Fresh network for every trial.
    candidate = init_net(parameterization)
    fitted = net_train(
        net=candidate,
        train_data_loader=loader,
        parameters=parameterization,
        dtype=dtype,
        device=device,
    )

    # Score of the network as trained in this run.
    score = evaluate(net=fitted, data_loader=test_data_loader, dtype=dtype, device=device)
    return score

# Globals read by train_evaluate.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype = torch.float

# Search over learning rate (log scale), batch size and momentum.
best_parameters, values, experiment, model = optimize(
    parameters=[
        dict(name="lr", type="range", bounds=[1e-6, 0.4], log_scale=True),
        dict(name="batchsize", type="range", bounds=[16, 128]),
        dict(name="momentum", type="range", bounds=[0.0, 1.0]),
        # Not tuned yet: "max_epoch" in [1, 30], "stepsize" in [20, 40].
    ],
    evaluation_function=train_evaluate,
    objective_name='accuracy',
)

# Best configuration, then its estimated mean and covariance.
print(best_parameters)
means, covariances = values
print(means, covariances, sep='\n')

And now the error is the following one:

    return torch._C._nn.nll_loss_nd(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: Cholesky errors typically occur when the same or very similar arms are suggested repeatedly. This can mean the model has already converged and you should avoid running further trials. It will also help to convert integer or categorical parameters to float ranges where reasonable.
Original error: : RuntimeError: Expected target size [115, 1024], got [115]

Thank you !

nn.NLLLoss expects a model output in the shape [batch_size, nb_classes] containing log probabilities and a target in the shape [batch_size] as a LongTensor containing class indices in the range [0, nb_classes-1].
Based on the error message, I guess your target shape is wrong. If you are using a one-hot encoded target, use target = target.argmax(dim=1) to create the class indices.

1 Like

Thanks, but unfortunately I’m stuck with those errors again. I understand that the problem may be the 0/1 class labels, but I have tried almost everything.
This is my code updated:

class CategoricalCrossEntropyLoss(nn.Module):
    """NLLLoss applied to ``log(y_hat)`` and ``argmax(y, dim=1)``.

    As written this presumes ``y_hat`` holds probabilities (the log is taken
    here) and ``y`` has a class dimension at index 1, e.g. one-hot targets.
    """

    def __init__(self):
        super(CategoricalCrossEntropyLoss, self).__init__()

    def forward(self, y_hat, y):
        # NOTE(review): argmax over dim=1 needs a target with at least two
        # dimensions; a 1-D batch of class indices gives the "Dimension out of
        # range" IndexError reported below. If the model already ends in
        # LogSoftmax, taking the log again is wrong as well.
        return nn.NLLLoss()(torch.log(y_hat), torch.argmax(y, dim=1))

def net_train(net, train_data_loader, parameters, dtype, device):
  """Train ``net`` with SGD and CategoricalCrossEntropyLoss and return it.

  ``parameters`` is read with ``.get``: "lr" (default 0.001), "momentum"
  (0.9), "step_size" (30), "gamma" (1.0) and "num_epochs" (3).
  """
  net.to(dtype=dtype, device=device)

  # Define loss and optimizer
  #criterion = nn.CrossEntropyLoss()
  #criterion = nn.BCEWithLogitsLoss()
  #criterion = nn.NLLLoss()
  criterion = CategoricalCrossEntropyLoss()
  optimizer = optim.SGD(net.parameters(), # or any optimizer you prefer 
                        lr=parameters.get("lr", 0.001), # 0.001 is used if no lr is specified
                        momentum=parameters.get("momentum", 0.9)
  )

  scheduler = optim.lr_scheduler.StepLR(
      optimizer,
      step_size=int(parameters.get("step_size", 30)),
      gamma=parameters.get("gamma", 1.0),  # default is no learning rate decay
  )

  num_epochs = parameters.get("num_epochs", 3) # Play around with epoch number

  # Train Network
  for _ in range(num_epochs):
      # Your dataloader returns a dictionary
      # so access it as such
      for batch in train_data_loader:
        # pull the tensors out of the batch dictionary
        input_ids = batch['input_ids']
        attention_mask =batch['attention_mask'] #.type(torch.LongTensor)
        labels = batch['targets'] #.type(torch.LongTensor)
        labels = labels.to(device=device)
        #labels = labels.argmax(dim=-1)
        #labels = labels.view(-1,32)
          
        # move the remaining inputs to the target device
        attention_mask = attention_mask.to(device=device)
        #input_ids=torch.tensor(input_ids) #,dtype=float)
        input_ids = input_ids.to(device=device)
        #labels = labels.type(torch.FloatTensor)
        #.reshape((labels.shape[0], 1))

          
        #labels = labels.long()
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # NOTE(review): the shape recorded below is 3-D, i.e. `outputs` is not
        # a [batch, nb_classes] tensor when it reaches the criterion.
        outputs,x= net(input_ids, attention_mask,return_dict=False)
        #outputs,x= net(input_ids,atten_mask)
        print(outputs)
        print(outputs.shape) #torch.Size([32, 450, 1024])

          
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # the scheduler advances once per batch, so step_size counts
        # optimizer steps rather than epochs
        scheduler.step()
  return net

#from transformers.models.bert.modeling_bert import BertForSequenceClassification,AutoModel
def init_net(parameterization):
    """Load the pretrained BertModel, freeze it and attach a head as ``model.fc``.

    ``parameterization`` is not used here.

    NOTE(review): nothing in this post calls ``model.fc``; assigning it as an
    attribute presumably does not make BertModel's forward pass use it --
    confirm against the 3-D output shape printed in net_train.
    """

    model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,return_dict=True) # pretrained BERT encoder

    # The depth of unfreezing is also a hyperparameter
    for param in model.parameters():
        param.requires_grad = False # Freeze feature extractor
        
    Hs = 512 # Hidden layer size; not referenced below (the sizes are hard-coded)
                                  
    # Attached after the freeze loop, so the head's parameters stay trainable.
    # The last layer has a single unit, and LogSoftmax over a dimension of
    # size 1 is always 0.
    model.fc = nn.Sequential(nn.Linear(1024, 512), # attach trainable classifier
                                 nn.ReLU(),
                                 nn.Dropout(0.2),
                                 nn.Linear(512, 1),
                                 nn.LogSoftmax(dim=1))
                                 #nn.Sigmoid())
                                 #nn.Sigmoid()
                                
    return model 
 
def train_evaluate(parameterization):
    """Train a fresh network for one trial and return its test score.

    "batchsize" is looked up in ``parameterization`` (default 32) so the
    training loader is rebuilt on every call.
    """
    batch_size = parameterization.get("batchsize", 32)
    loader = create_data_loader(df_train, tokenizer, MAX_LEN, batch_size=batch_size)

    # Build, then train, the network for this trial.
    fitted = net_train(
        net=init_net(parameterization),
        train_data_loader=loader,
        parameters=parameterization,
        dtype=dtype,
        device=device,
    )

    # Value optimized by the tuner.
    return evaluate(net=fitted, data_loader=test_data_loader, dtype=dtype, device=device)

# Globals read by train_evaluate.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype = torch.float

# Only learning rate (log scale) and momentum are searched in this version.
best_parameters, values, experiment, model = optimize(
    parameters=[
        dict(name="lr", type="range", bounds=[1e-6, 0.4], log_scale=True),
        dict(name="momentum", type="range", bounds=[0.0, 1.0]),
        # Not tuned: "batchsize" in [16, 128], "max_epoch" in [1, 30],
        # "stepsize" in [20, 40].
    ],
    evaluation_function=train_evaluate,
    objective_name='accuracy',
)

# Best configuration, then its estimated mean and covariance.
print(best_parameters)
means, covariances = values
print(means, covariances, sep='\n')
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

I tried using dim=-1, but then the error says that the target batch size is 0 while it must match the input batch size (32).

Can you help me? thanks !!

What are the shapes of the model output and target before passing them to CategoricalCrossEntropyLoss?

Thanks for your time.
If I run this code those are my input example (no float editing etc):

# Debug pass over the training loader: show labels and token ids per batch.
for batch in train_data_loader:
  # Move the three tensors of the batch to the target device.
  labels, input_ids, attention_mask = (
      batch[key].to(device=device)
      for key in ('targets', 'input_ids', 'attention_mask')
  )

  # One value per line: label shape, labels, token ids, token-id shape.
  print(labels.shape, labels, input_ids, input_ids.shape, sep='\n')
  

torch.Size([16])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       device='cuda:0', dtype=torch.float64)
tensor([[ 2, 21,  8,  ...,  0,  0,  0],
        [ 2, 21, 21,  ..., 20, 11,  3],
        [ 2, 21,  8,  ...,  0,  0,  0],
        ...,
        [ 2, 21, 10,  ...,  0,  0,  0],
        [ 2, 21, 21,  ...,  0,  0,  0],
        [ 2, 21, 17,  ..., 20,  5,  3]], device='cuda:0')
torch.Size([16, 450])

I tried both casting the labels to float and squeezing them, but without success; I think I did not do it properly.
Thanks

Your labels tensor is already in the right shape so you don’t need to use argmax on it again.
This post describes the expected shapes.

Unrelated to your issue, but you are using nn.LogSoftmax and are then again applying torch.log on the model output:

return nn.NLLLoss()(torch.log(y_hat), torch.argmax(y, dim=1))

which is wrong as it would create NaNs.

1 Like

Thank you for the explanation. I made a mess trying to solve it. Anyway, I have corrected my code, but now I get this error again:
RuntimeError: Expected target size [2, 1024], got [2]


#!pip install -U adapter-transformers
#!pip install ax-platform

########################################
########## WORKING ENV #################
########################################

#import tensorflow as tf
#print(tf.test.gpu_device_name())


#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())

#gpu_info = !nvidia-smi
#gpu_info = '\n'.join(gpu_info)
#if gpu_info.find('failed') >= 0:
  #print('Not connected to a GPU')
#else:
  #print(gpu_info)
  
from psutil import virtual_memory

# Total physical memory, in decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# 20 GB is the cut-off used here for a "high-RAM" runtime.
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

import torch

# Report whether CUDA is usable from this runtime.
print(torch.torch.cuda.is_available())


#########################################
########## IMPORT LIBRARIES #############
#########################################

#@title Setup & Config
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

#%matplotlib inline
#%config InlineBackend.figure_format='retina'

# Global seaborn theme for every figure in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Custom colour cycle; it replaces the 'muted' palette set just above.
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default figure size passed to matplotlib.
rcParams['figure.figsize'] = 12, 8

# Seed NumPy and PyTorch with the same value.
RANDOM_SEED = 21091996
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression: only has a visible effect as the last line of a notebook cell.
device

########################################
########### DATA #######################
########################################


# The three splits are header-less, comma-separated files; the loaders read
# column 0 as the sequence and column 1 as the label.
df_test, df_train, df_val = (
    pd.read_csv(path, sep=',', header=None)
    for path in ('CLASSIFIER_test', 'CLASSIFIER_train', 'CLASSIFIER_val')
)

# Show each frame for a quick visual check.
print(df_test, df_train, df_val, sep='\n')

#######################################
########## MODEL #######################
########################################

# Checkpoint identifier used for both the tokenizer and the model.
PRE_TRAINED_MODEL_NAME = "Rostlab/prot_bert_bfd"

# do_lower_case=False keeps the upper-case tokens of the input sequences as-is.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=False)

# Length every encoded sequence is padded or truncated to.
MAX_LEN = 512

class SequenceDataset(Dataset):
  """Map-style dataset that tokenizes one sequence per item.

  Args:
    df: source frame; only its length is used (by ``__len__``).
    sequences: indexable collection of sequence strings.
    targets: indexable collection of class labels aligned with ``sequences``.
    tokenizer: object exposing a HuggingFace-style ``encode_plus``.
    max_len: every encoded sequence is padded/truncated to this length.
  """

  def __init__(self, df, sequences, targets, tokenizer, max_len):
    self.df = df
    self.sequences = sequences
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.df)

  def __getitem__(self, item):
    """Return the raw text, token ids, attention mask and label of ``item``."""
    sequence = str(self.sequences[item])

    encoding = self.tokenizer.encode_plus(
      sequence,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
      truncation=True,
    )

    return {
      'sequences_text': sequence,
      # encode_plus returns (1, max_len) tensors; flatten to (max_len,).
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      # Class indices must be int64 for nn.NLLLoss / nn.CrossEntropyLoss;
      # `dtype=float` produced float64 targets.
      'targets': torch.tensor(self.targets[item], dtype=torch.long),
    }
    

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True, num_workers=1):
  """Wrap a two-column frame (0: sequence, 1: label) in a DataLoader.

  Args:
    df: DataFrame whose column 0 holds the sequences and column 1 the labels.
    tokenizer: tokenizer forwarded to SequenceDataset.
    max_len: padded/truncated length forwarded to SequenceDataset.
    batch_size: number of samples per batch.
    shuffle: reshuffle every epoch; pass False for validation/test loaders.
    num_workers: number of loader worker processes.

  Returns:
    A DataLoader yielding the dictionaries produced by SequenceDataset.
  """
  ds = SequenceDataset(
    df,
    sequences=df[0].to_numpy(),
    targets=df[1].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len,
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=shuffle,
  )

# Batch size shared by the three module-level loaders.
BATCH_SIZE = 8

# One loader per split, built in train / val / test order.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for frame in (df_train, df_val, df_test)
)


#############################################
########### TRAINING ########################
##############################################

def net_train(net, train_data_loader, parameters, dtype, device):
  """Train ``net`` on ``train_data_loader`` with SGD + NLLLoss and return it.

  ``parameters`` is read with ``.get``: "lr" (default 3e-5), "momentum"
  (0.9), "step_size" (20), "gamma" (1.0) and "num_epochs" (3).
  """
  net.to(dtype=dtype, device=device)

  # Define loss and optimizer
  #criterion = nn.CrossEntropyLoss()
  #criterion = nn.BCEWithLogitsLoss()
  criterion = nn.NLLLoss()
  #criterion=nn.BCELoss()
  #criterion = CategoricalCrossEntropyLoss()
  optimizer = optim.SGD(net.parameters(), # or any optimizer you prefer 
                        lr=parameters.get("lr",  3e-5), # 3e-5 is used if no lr is specified
                        momentum=parameters.get("momentum", 0.9)
  )

  scheduler = optim.lr_scheduler.StepLR(
      optimizer,
      step_size=int(parameters.get("step_size", 20)),
      gamma=parameters.get("gamma", 1.0),  # default is no learning rate decay
  )

  num_epochs = parameters.get("num_epochs", 3) # Play around with epoch number

  # Train Network
  for _ in range(num_epochs):
      # Your dataloader returns a dictionary
      # so access it as such
      for batch in train_data_loader:
        # pull the tensors out of the batch dictionary
        input_ids = batch['input_ids']

        attention_mask =batch['attention_mask'] #.type(torch.LongTensor)
        labels = batch['targets'] #.type(torch.LongTensor)
        labels = labels.to(device=device)

          
        # move the remaining inputs to the target device
        attention_mask = attention_mask.to(device=device)
        input_ids = input_ids.to(device)
        print(input_ids)
          

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # NOTE(review): the prints reported in the thread show `outputs` with
        # shape [2, 512, 1024] and float64 labels of shape [2]; NLLLoss is
        # documented in the thread as needing [batch, nb_classes] output and
        # LongTensor targets.
        outputs,x= net(input_ids, attention_mask,return_dict=False)
 
        print(outputs)
        print(outputs.shape)
        print(labels)
        print(labels.shape)
          
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # the scheduler advances once per batch, so step_size counts
        # optimizer steps rather than epochs
        scheduler.step()
  return net
  

#from transformers.models.bert.modeling_bert import BertForSequenceClassification,AutoModel
def init_net(parameterization):
    """Load the pretrained BertModel and attach a 2-class head as ``model.fc``.

    ``parameterization`` is not used here.

    NOTE(review): nothing in this post calls ``model.fc``; the
    [2, 512, 1024] output printed from net_train is consistent with the head
    never being applied -- confirm how the head is meant to be invoked.
    """

    model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,return_dict=True) # pretrained BERT encoder

    # The depth of unfreezing is also a hyperparameter
    for param in model.parameters():
        param.requires_grad = True # nothing is frozen: every encoder weight stays trainable
        
    Hs = 512 # Hidden layer size; not referenced below
                                  
    # Single linear layer to 2 classes. The ReLU and Dropout sit between the
    # class scores and LogSoftmax -- NOTE(review): probably unintended.
    model.fc = nn.Sequential(nn.Linear(1024, 2), # attach trainable classifier
                                 nn.ReLU(),
                                 nn.Dropout(0.2),
                                 #nn.Linear(512, 2),
                                 nn.LogSoftmax(dim=1))
                                 #nn.Sigmoid())
                                 #nn.Sigmoid()
                                
    return model # return untrained model
    
    
def train_evaluate(parameterization):
    """Train a fresh network for one trial and return its test score.

    The training loader is rebuilt per call so that "batchsize" (default 2)
    can be tuned.
    """
    batch_size = parameterization.get("batchsize", 2)
    loader = create_data_loader(df_train, tokenizer, MAX_LEN, batch_size=batch_size)

    # Build, then train, the network for this trial.
    fitted = net_train(
        net=init_net(parameterization),
        train_data_loader=loader,
        parameters=parameterization,
        dtype=dtype,
        device=device,
    )

    # Value optimized by the tuner.
    return evaluate(net=fitted, data_loader=test_data_loader, dtype=dtype, device=device)

###################################################################
############# BAYESIAN OPTM #######################################
###################################################################

import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render
from ax.utils.tutorials.cnn_utils import train, evaluate

# Globals read by train_evaluate.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

dtype = torch.float

# Search space for the Ax managed loop. Names must match the keys that
# train_evaluate / net_train look up with `.get`.
best_parameters, values, experiment, model = optimize(
    parameters=[
        #{"name": "lr", "type": "range", "bounds": [1e-6, 1e-5], "log_scale": True},
        {"name": "batchsize", "type": "range", "bounds": [2, 8]},
        #{"name": "momentum", "type": "range", "bounds": [0.0, 1.0]},
        # net_train reads "num_epochs"; under the former name "max_epoch" the
        # tuned value was never used and every trial ran the default 3 epochs.
        {"name": "num_epochs", "type": "range", "bounds": [1, 6]},
        #{"name": "step_size", "type": "range", "bounds": [20, 40]},
    ],

    evaluation_function=train_evaluate,
    objective_name='accuracy',
)

# Best configuration, then its estimated mean and covariance.
print(best_parameters)
means, covariances = values
print(means)
print(covariances)

These are the print statements in my training loop:

print(input_ids)
print(inputs_ids.shape)
print(outputs)
print(outputs.shape)
print(labels)
print(labels.shape)


tensor([[ 2, 21, 17,  ...,  9,  5,  3],
        [ 2, 21, 17,  ...,  0,  0,  0]], device='cuda:0')
torch.Size([2, 512])

tensor([[[ 0.0949, -0.0691,  0.0722,  ..., -0.0580, -0.0939, -0.0966],
         [-0.1368,  0.1321, -0.0213,  ...,  0.0096, -0.0003, -0.0079],
         [ 0.0838, -0.1333,  0.0386,  ...,  0.0169, -0.0764,  0.0893],
         ...,
         [-0.1042, -0.0693, -0.0452,  ...,  0.0023,  0.0337, -0.0197],
         [-0.1451, -0.0231,  0.0353,  ..., -0.1217,  0.1428, -0.0060],
         [ 0.1038, -0.0717,  0.0692,  ..., -0.0499, -0.0918, -0.0983]],

        [[ 0.1139,  0.0058,  0.0930,  ...,  0.0737, -0.0484, -0.0482],
         [-0.1256,  0.1795, -0.0092,  ...,  0.0905,  0.0385, -0.0034],
         [-0.0516, -0.0208, -0.0808,  ..., -0.0421, -0.0105, -0.0259],
         ...,
         [-0.0868,  0.0125,  0.0060,  ...,  0.0013,  0.0366, -0.0371],
         [-0.0730, -0.0596,  0.0412,  ..., -0.1052,  0.0139,  0.0302],
         [-0.0007, -0.0228,  0.1051,  ...,  0.0565, -0.0587, -0.0262]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
torch.Size([2, 512, 1024])

tensor([0., 0.], device='cuda:0', dtype=torch.float64)
torch.Size([2])

Thank you very much !

I think you are hitting this issue again.

Based on your last statement in the linked topic, I guess your output has the shape [batch size=2, seq_len=512, nb_classes=1024] while the target only contains the class indices for [batch_size=2].
This doesn’t work, since:

  • the target should contain values for all samples in the batch dimension as well as all samples in the temporal dimension (seq_len),
  • Once this is fixed, you would have to .permute(0, 2, 1) the model output as the class dimension is supposed to be in dim1.

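If you don’t have a target value for each temporal step in the seq_len dimension, you would have to reduce this dimension in your model somehow. For example, feed a per-sequence representation of shape [batch_size, 1024] (such as the pooled output, the second value returned by the model) through your classifier head, so that the tensor passed to nn.NLLLoss has the shape [batch_size, nb_classes].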

1 Like