Batch normalization with model.eval() is behaving like mode.train() and produces an error while infering on single input

Here is the model I use:

class MixedDropout(nn.Module):

def __init__(self, p):
    super().__init__()
    self.dense_dropout = nn.Dropout(p)

def forward(self, input):
    return self.dense_dropout(input)

class MixedLinear(nn.Module):

def __init__(self, in_features, out_features, bias=False):
    super().__init__()
    self.in_features = in_features
    self.out_features = out_features
    self.weight = nn.Parameter(torch.Tensor(in_features, out_features))
    if bias:
        self.bias = nn.Parameter(torch.Tensor(out_features))
    else:
        self.register_parameter('bias', None)
    self.reset_parameters()

def reset_parameters(self):
    nn.init.kaiming_uniform_(self.weight, mode = 'fan_out', a = math.sqrt(5))
    if self.bias is not None:
        _, fan_out = nn.init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_out)
        nn.init.uniform_(self.bias, -bound, bound)

def forward(self, input):
    if self.bias is not None:
        res = torch.addmm(self.bias, input, self.weight)
    else:
        res = input.matmul(self.weight)
    return res

class Dynamic_Neural_Network(nn.Module):

def __init__(self, hyper_parameters_dict):
    super().__init__()
    global device
    self.input_size = hyper_parameters_dict['input_size']
    self.output_size = hyper_parameters_dict['output_size']
    self.list_hidden_layers_sizes = hyper_parameters_dict['list_hidden_layers_sizes']
    self.dropout_proba = hyper_parameters_dict['dropout_proba']
    self.last_layer_activation_func = hyper_parameters_dict['last_layer_activation_func']
    self.batch_norm_flg = hyper_parameters_dict['batch_norm_flg']
    
    if self.batch_norm_flg == True:
        bn_ls = []
        bn_ls.append(torch.nn.BatchNorm1d(self.input_size))
        for hidden_lyr_sz in self.list_hidden_layers_sizes:
            bn_ls.append(torch.nn.BatchNorm1d(hidden_lyr_sz))
        bn_ls.append(torch.nn.BatchNorm1d(self.output_size))
        self.batch_norm_layers = bn_ls
    
    fcs = [MixedLinear(self.input_size, self.list_hidden_layers_sizes[0])]
    for hidden_layer_idx in range(len(self.list_hidden_layers_sizes) - 1):
        fcs.append(MixedLinear(self.list_hidden_layers_sizes[hidden_layer_idx], 
                               self.list_hidden_layers_sizes[hidden_layer_idx + 1]))
    fcs.append(MixedLinear(self.list_hidden_layers_sizes[-1], self.output_size))
    
    self.fcs = nn.ModuleList(fcs)
    self.drop = MixedDropout(self.dropout_proba)
    
def forward(self, X):
    if self.batch_norm_flg == False:
        embedding = self.fcs[0](X)
        for layer_id, fc in enumerate(self.fcs[1:]):
            embedding = fc(self.drop(F.leaky_relu(embedding, negative_slope = 0.1, inplace = False)))
    else:
        bn = self.batch_norm_layers[0].to(device)
        embedding = self.fcs[0](bn(X))
        for layer_id, fc in enumerate(self.fcs[1:]):
            bn = self.batch_norm_layers[layer_id + 1].to(device)
            embedding = fc(self.drop(F.leaky_relu(bn(embedding), negative_slope = 0.1, inplace = False)))
        
    if self.last_layer_activation_func == 'softmax':
        embedding = F.softmax(embedding, dim=1)
    elif self.last_layer_activation_func == 'log_softmax':
        embedding = F.log_softmax(embedding, dim=1)
    elif self.last_layer_activation_func == 'relu':
        embedding = F.relu(embedding)
    elif self.last_layer_activation_func == 'leaky_relu':
        embedding = F.leaky_relu(embedding, negative_slope = 0.1, inplace = False)
    elif self.last_layer_activation_func == 'tanh':
        embedding = F.tanh(embedding)
    elif self.last_layer_activation_func == 'sigmoid':
        embedding = F.sigmoid(embedding)
    else:
        print('The activation function type you inserted is not correct,\
              please insert either softmax, log_softmax, relu, leaky_relu, tanh, or sigmoid activation function')
        
    return embedding

class DIET_Intent_Classifier(nn.Module):

def __init__(self, nn_hyperparameters_dict):
    super().__init__()
    self.feature_extractor_net = Dynamic_Neural_Network(nn_hyperparameters_dict).to(device)
    
def forward(self, batch_queries, intents_desc_embed):
    # pass the queries of the current batch to the feature extractor neural net to get h(query)
    h_queries = self.feature_extractor_net(batch_queries)
    
    # pass all the intents description to the feature extractor neural net to get h(intent)
    h_intents = self.feature_extractor_net(intents_desc_embed)
    
    # calculate the dot product between each h(query) and each h(intent)
    queries_intents_similarities = torch.matmul(h_queries, h_intents.T)
    
    return queries_intents_similarities

class NLU_Intent_Classifier_Training():

def __init__(self, config_obj, queries_train, intents_train, queries_test, intents_test, intents_desc_embed):

    global device
    device = config_obj.device

    # NLU intent classifier model training hyper-parameters
    self.features_extracted_size = config_obj.features_extracted_size
    self.lr = config_obj.lr
    self.weight_decay = config_obj.weight_decay
    self.max_epochs = config_obj.max_epochs
    self.batch_size = config_obj.batch_size
    self.eval_step = config_obj.eval_step
    self.early_stop = config_obj.early_stop
    self.patience = config_obj.patience
    self.batch_mult_val = config_obj.batch_mult_val
    self.list_hidden_layers_sizes = config_obj.list_hidden_layers_sizes
    self.dropout_proba = config_obj.dropout_proba
    self.last_layer_activation_func = config_obj.last_layer_activation_func
    self.loss_func = config_obj.loss_func
    self.batch_norm_flg = config_obj.batch_norm_flg
    
    # define our tensors for training and validation
    self.queries_train = queries_train 
    self.intents_train = intents_train
    self.queries_test = queries_test 
    self.intents_test = intents_test
    self.intents_desc_embed = intents_desc_embed

def train_intent_classifier_model(self):
    global device
    
    def calculate_similarity_loss(batch_queries_intents_similarities, yb):
        # get the similarity of the queries with the positive intents (ground-truth label)
        sim_pos = batch_queries_intents_similarities[torch.arange(batch_queries_intents_similarities.shape[0]).squeeze(), yb.squeeze()]

        # apply masking (-inf) before the exponent to set the similarities of positive intents as 0 
        sim_neg = batch_queries_intents_similarities
        sim_neg[torch.arange(sim_neg.shape[0]).squeeze(), yb.squeeze()] = -float("Inf")

        # loss = - avg_batch( S+ve - log(exp[S+ve] + sum_over_negative_intents( exp[S-ve] ) ) )
        inner_term = torch.exp(sim_pos) + torch.sum(torch.exp(sim_neg), dim = 1)
        batch_loss_tensor = sim_pos - torch.log(inner_term)
        loss = - torch.mean(batch_loss_tensor)

        return loss

    def run_batch(model, xbs, yb, train, optimizer = None, loss = 'cross_entropy'):
        global device
        if train:
            model.train()
        else:
            model.eval()

        with torch.set_grad_enabled(train):
            batch_queries_intents_similarities = model(*xbs)  ## shape = batch_sz * num_of_intents
            batch_predicted_intents = torch.argmax(batch_queries_intents_similarities, dim = 1)  ## shape = batch_sz * 1
            ncorrect = torch.sum(batch_predicted_intents == yb.squeeze())

            if loss == 'similarity_loss':
                batch_loss = calculate_similarity_loss(batch_queries_intents_similarities, yb)
            else:
                batch_proba = torch.softmax(batch_queries_intents_similarities, dim = 1)
                loss_func = nn.CrossEntropyLoss().to(device)
                if batch_proba.shape[0] > 1:
                    batch_loss = loss_func(batch_proba, yb.squeeze())
                else:
                    batch_loss = loss_func(batch_proba.squeeze(), yb.squeeze())

            if train:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step() 

            return batch_loss, ncorrect, batch_predicted_intents
        
        
    ## train function
    def train(model, train_set_obj, val_set_obj, queries_test, lr, weight_decay, max_epochs, 
              batch_size, eval_step, early_stop, patience, batch_mult_val, loss_func):

        """ Instantiating dataloader objects for the training and validation objects """
        train_loader = torch.utils.data.DataLoader(dataset=train_set_obj, 
                                               sampler=torch.utils.data.BatchSampler(
                                               torch.utils.data.SequentialSampler(train_set_obj),
                                               batch_size=batch_size, drop_last=False
                                            ),
                                            batch_size=None, shuffle=False)

        """ Initializing the variables for comparing the batch loss with the best loss in each evaluation step """
        best_loss = np.inf
        best_test_acc = 0


        loss_hist = {'train': [], 'val': []}
        acc_hist = {'train': [], 'val': []}

        """ Defining our optimizer """
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

        for epoch in range(max_epochs):
            step = 0
            for xbs, yb in train_loader:
                loss_batch, ncorr_batch, top1 = run_batch(model, xbs, yb, True, optimizer, loss_func) 

                step += 1
                """We will not perform evaluation unless we passed a number of batches specified in the eval_step parameter"""
                if step % eval_step == 0:
                    val_indices = torch.arange(queries_test.shape[0])
                    val_xbs, val_yb = val_set_obj.__getitem__(val_indices)
                    val_loss, val_ncorr, val_top1 = run_batch(model, val_xbs, val_yb, False, optimizer)

                    """ Calculating the current training batch loss and batch accuracy """
                    train_loss = loss_batch
                    train_acc = ncorr_batch / yb.shape[0]

                    val_acc = val_ncorr/len(val_indices)

                    loss_hist['train'].append(train_loss)
                    acc_hist['train'].append(train_acc)

                    loss_hist['val'].append(val_loss)
                    acc_hist['val'].append(val_acc)

                    if val_acc > best_test_acc:
                        best_test_acc = val_acc
                        best_epoch = epoch
                        best_state = {
                                        key: value.cpu() for key, value
                                        in model.state_dict().items()
                                     }
                    if early_stop and epoch >= (best_epoch + patience):
                        model.load_state_dict(best_state)
                        return model, epoch + 1, loss_hist, acc_hist

                    print(f"Epoch {epoch}, step {step}: train_loss: {train_loss:.7f}, train_acc: {train_acc:.7f}, val_loss: {val_loss:.7f}, val_acc: {val_acc:.7f}")
        model.load_state_dict(best_state)
        return model, epoch + 1, loss_hist, acc_hist    
    
    # set your training hyper-parameters
    input_dimension_size = self.queries_train.shape[1]
    nn_hyperparameters_dict = {'input_size' : input_dimension_size,
                               'output_size' : self.features_extracted_size,
                               'list_hidden_layers_sizes' : self.list_hidden_layers_sizes,
                               'dropout_proba' : self.dropout_proba,
                               'last_layer_activation_func' : self.last_layer_activation_func,
                               'batch_norm_flg' : self.batch_norm_flg}
    
    train_set_obj = Intent_Capture_Dataset(self.queries_train, self.intents_train, self.intents_desc_embed)
    val_set_obj = Intent_Capture_Dataset(self.queries_test, self.intents_test, self.intents_desc_embed)
    intent_classifier_obj = DIET_Intent_Classifier(nn_hyperparameters_dict).to(device)

    model, epochs, loss_hist, acc_hist = train(intent_classifier_obj, train_set_obj, val_set_obj, self.queries_test, self.lr, self.weight_decay,
                                               self.max_epochs, self.batch_size, self.eval_step, self.early_stop, self.patience, 
                                               self.batch_mult_val, self.loss_func)
    
    return model, epochs, loss_hist, acc_hist

I have trained the model using this configuration:

class Config():

def __init__(self):
    # set the endpoints names here
    self.embedding_model_endpoint_name = 'hf-textembedding-all-minilm-l6-v2-2024-05-17-05-07-36-657'
    
    # define here some environment variables (not dependent on each run)
    self.aws_region = boto3.Session().region_name
    self.sagemaker = boto3.client('sagemaker')
    self.runtime = boto3.client('sagemaker-runtime', region_name = self.aws_region)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # NLU intent classifier model hyper-parameters
    self.features_extracted_size = 32
    self.lr = 0.001
    self.weight_decay = 1e-07
    self.max_epochs = 350
    self.batch_size = 61
    self.eval_step = 1
    self.early_stop = True # True or False
    self.patience = 100
    self.batch_mult_val = 1
    self.list_hidden_layers_sizes = [512, 256, 128, 64]
    self.dropout_proba = 0.25
    self.last_layer_activation_func = 'tanh' # tanh, sigmoid, softmax, relu, leaky_relu
    self.loss_func =  'similarity_loss'  # similarity_loss or cross_entropy
    self.batch_norm_flg = True
    
    # define our data paths
    self.annotations_path = './examples.txt'
    self.intents_description_path = './intents_description.txt'

from intent_classifier_training import NLU_Intent_Classifier_Training
classifier_obj = NLU_Intent_Classifier_Training(config_obj, queries_train, intents_train, queries_test, intents_test, intents_desc_embed)
model, epochs, loss_hist, acc_hist = classifier_obj.train_intent_classifier_model()

and saved it in the evaluation mode:

model.eval()
torch.save(model, ‘./intent_classifier.pth’)

however, while inferring on a single example as follows:

model = torch.load(‘./intent_classifier.pth’, map_location=config_obj.device)
model.eval()
out = model(queries_test[0].unsqueeze(0), intents_desc_embed)

It produces this error:

ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 384])

Note that: the same line of code works properly when the input is more than 1 example:

I don’t understand why this issue occurs?? I have set the model to the evaluation mode and saved it in the evaluation mode; Is there something wrong in my implementation?
I’m using torch 2.0.0 on aws sagemaker instance CPU optimized with python 3.10

Could you format your code and post a minimal and executable code snippet reproducing the error, please?

I’ve edited my post and formatted the code in a readable format, could you please check?
Here is the shape of the data that i used:
image