Problems calculating precision and recall

I need to calculate precision and recall to evaluate my model's performance, so I am using the code below, which performs inference, annotates the images with the predicted class, and calculates the precision and recall.
This is the script I am using:


import torch
import numpy as np
import cv2
import os
import torch.nn.functional as F
import torchvision.transforms as transforms
import glob
import argparse
import pathlib

from model import build_model
from class_names import class_names as CLASS_NAMES

import pandas as pd
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix

# Argument parser
parser = argparse.ArgumentParser()
parser.add_argument('-w', '--weights', default='../action_recognition/outputs/best_model.pth', 
                    help='path to the model weights')
parser.add_argument('-c', '--csv_file', default='/action_recognition/input/testing.csv', 
                    help='path to the CSV file containing ground truth labels')
args = parser.parse_args()

# Constants and configurations
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
IMAGE_RESIZE = 224

# Define transforms
def get_test_transform(image_size):
    return transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

# Function to denormalize and annotate images
def annotate_image(image, output_class):
    image = image.squeeze(0).permute((1, 2, 0)).cpu().numpy()  # CHW tensor -> HWC array for cv2
    # Undo the normalization applied by the test transform and rescale to 0-255
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    image = np.clip((image * std + mean) * 255.0, 0, 255).astype(np.uint8)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # Convert RGB to BGR for cv2
    class_name = CLASS_NAMES[int(output_class)]
    cv2.putText(image, class_name, (5, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, lineType=cv2.LINE_AA)
    return image

# Function to perform inference
def inference(model, image, device):
    model.eval()
    with torch.no_grad():
        image = image.to(device)
        outputs = model(image)
        predictions = F.softmax(outputs, dim=1).cpu().numpy()
        output_class = np.argmax(predictions)
    return output_class

# Function to load ground truth labels
def load_ground_truth(csv_path):
    df = pd.read_csv(csv_path)
    df['label'] = df['label'].map(lambda x: CLASS_NAMES.index(x))
    return df['label'].values

# Function to calculate precision and recall
def calculate_metrics(true_labels, pred_labels):
    precision = precision_score(true_labels, pred_labels, average='weighted')
    recall = recall_score(true_labels, pred_labels, average='weighted')
    return precision, recall

if __name__ == '__main__':
    # Load model
    weights_path = pathlib.Path(args.weights)
    checkpoint = torch.load(weights_path, map_location=DEVICE)
    model = build_model(fine_tune=False, num_classes=len(CLASS_NAMES)).to(DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])

    # Load images and ground truth labels
    all_image_paths = glob.glob('action_recognition/input/test/*')   
    ground_truth_labels = load_ground_truth(args.csv_file)
    y_true = ground_truth_labels

    y_pred = []

    infer_result_path = os.path.join(
        '../action_recognition', 'outputs', 'inference_results', 'image_outputs'
    )
    os.makedirs(infer_result_path, exist_ok=True)

    # Perform inference and save annotated images
    transform = get_test_transform(IMAGE_RESIZE)
    for i, image_path in enumerate(all_image_paths):
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_tensor = transform(image)
        image_tensor = torch.unsqueeze(image_tensor, 0)
        
        # Inference
        predicted_class = inference(model, image_tensor, DEVICE)
        y_pred.append(predicted_class)

        # Annotate and save result image
        annotated_image = annotate_image(image_tensor, predicted_class)
        image_name = os.path.basename(image_path)
        cv2.imwrite(os.path.join(infer_result_path, image_name), annotated_image)

    if len(y_true) != len(y_pred):
        raise ValueError(f'Number of samples in y_true ({len(y_true)}) and y_pred ({len(y_pred)}) do not match.')
    print("Ground Truth Labels:", y_true)
    print("******************************")
    print("Predicted Labels:", y_pred)
    # Calculate precision and recall
    y_pred = np.array(y_pred)
    precision, recall = calculate_metrics(y_true, y_pred)
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}')
    print("Classification Report:\n", classification_report(y_true, y_pred, target_names=CLASS_NAMES))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


I have two directories, training_set and testing_set, and two CSV files, train.csv and test.csv. Both files are structured as follows:
filename,label
0_0_image.jpg,CallCellphone
0_101_image.jpg,CallCellphone
0_104_image.jpg,CallCellphone
0_110_image.jpg,CallCellphone
0_117_image.jpg,CallCellphone
0_125_image.jpg,CallCellphone
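
For reference, each row pairs an image filename with a class name. Here is a minimal sketch of how that maps to integer class indices, mirroring load_ground_truth above (the CSV path is just the script's default and may differ on my machine):

import pandas as pd

from class_names import class_names as CLASS_NAMES

# Each label string (e.g. 'CallCellphone') is replaced by its index in CLASS_NAMES,
# and every row also gives the image filename that the label belongs to.
df = pd.read_csv('/action_recognition/input/testing.csv')  # script default, adjust if needed
label_by_filename = dict(zip(df['filename'], df['label'].map(CLASS_NAMES.index)))
print(list(label_by_filename.items())[:3])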

The precision and recall results were very poor: Precision: 0.0637, Recall: 0.0638.
This doesn't make sense to me, because the training results were very good, reaching 99% training accuracy and 96% validation accuracy.
I think I made a mistake somewhere in the script, but I can't figure it out.
Can someone help me?

Did you check the class frequencies and whether the dataset is imbalanced? A high accuracy could indicate that your model is only predicting the majority class and is therefore not learning anything.
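
For example, a quick sketch like the following (using the CSV path from your script's defaults) would show the label distribution of the test set:

import pandas as pd

# Count how many test samples belong to each class
df = pd.read_csv('/action_recognition/input/testing.csv')  # script default, adjust if needed
print(df['label'].value_counts())                  # absolute counts per class
print(df['label'].value_counts(normalize=True))    # relative frequencies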

Yes, and the data is not imbalanced, but I think the problem is in the way I am using the ground truth and predicted labels.
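
For example, y_true follows the row order of the CSV while y_pred follows whatever order glob() returns, so one thing worth verifying is that the two orders actually line up. A minimal sanity check, using the same paths as in the script above:

import glob
import os

import pandas as pd

# y_true comes from the CSV row order, y_pred is built in glob() order,
# so the two lists only line up if these orders agree.
df = pd.read_csv('/action_recognition/input/testing.csv')  # script default
csv_names = df['filename'].tolist()
glob_names = [os.path.basename(p) for p in glob.glob('action_recognition/input/test/*')]

print(csv_names[:5])
print(glob_names[:5])
print(csv_names == glob_names)  # False would mean labels and predictions are paired in different orders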