Video Classification CNN - F1 score gets stuck while training the model

I’m trying to replicate the paper Large-scale Video Classification with Convolutional Neural Networks.
I’ve already downloaded the frames, divided homogeneously into bags of shots. The train and test DataFrames that hold all the information have the following structure:

index  0                                1  2  3  4  etc
0      train/-b5vDKaAd9o/bag_of_shots0  0  0  0  0  1
1      train/-b5vDKaAd9o/bag_of_shots1  0  0  0  0  1
2      train/-b5vDKaAd9o/bag_of_shots2  0  0  0  0  1
3      train/5smf0wPlLk4/bag_of_shots0  1  0  1  0  1
etc    -                                -  -  -  -  -

The index is a running counter, the first column is the path to the bag-of-shots folder, and the remaining columns are the one-hot encoding of all labels.
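For clarity, this is how one row decomposes (a minimal sketch, assuming the layout above):

# minimal sketch (assumes the layout above): path in column 0, one-hot labels in the remaining columns
images_path = train_df.iloc[0, 0]                          # e.g. 'train/-b5vDKaAd9o/bag_of_shots0'
y_labels = train_df.iloc[0, 1:].to_numpy().astype(float)   # one-hot label vector for that row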

Now, my custom Dataset class is:

class VideoDataset(Dataset):
  def __init__(self, df, transform = None, t = 'single'):
    self.df = df
    self.transform = transform
    self.t = t
  
  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    if self.transform is None: self.transform = transforms.ToTensor()
    images_path = self.df.iloc[index, 0]
    
    if self.t == 'single':
      images = self.transform(Image.open(f'{dataset_path}/{images_path}/shot{shots//2}.png'))
    if self.t == 'early':
      images = np.array([self.transform(Image.open(f'{dataset_path}/{images_path}/shot{idx}.png')) for idx in range(shots//3,shots//3*2)])
    if self.t == 'late':
      images = np.array([self.transform(Image.open(f'{dataset_path}/{images_path}/shot0.png')), self.transform(Image.open(f'{dataset_path}/{images_path}/shot{shots-1}.png'))])
    if self.t == 'slow':
      if shots%10 == 0:
        images = np.array([self.transform(Image.open(f'{dataset_path}/{images_path}/shot{idx}.png')) for idx in range((shots/2) - 5, (shots/2) + 5)])
      else:
        images = np.array([self.transform(Image.open(f'{dataset_path}/{images_path}/shot{idx}.png')) for idx in range((shots%10) - (shots%10)//2, shots-(shots%10)//2)])

    y_labels = torch.from_numpy(self.df.iloc[0, 1:].to_numpy().astype(float))

    if self.t != 'single': images = torch.from_numpy(images)

    return images, y_labels, images_path
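As a quick sanity check of what a single sample looks like (a minimal sketch for the 'single' mode; it assumes dataset_path, shots and train_transform are already defined as in the notebook and that train_transform ends with ToTensor):

# minimal sketch: inspect one sample from the 'single' dataset
sample_ds = VideoDataset(df=train_df, transform=train_transform, t='single')
images, y_labels, images_path = sample_ds[0]
print(images.shape, images.dtype)      # expected: a [3, H, W] float image tensor
print(y_labels.shape, y_labels.dtype)  # the one-hot label vector (float64)
print(images_path)                     # the bag-of-shots folder of that sample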

Then, for the train/validation split, I have:

def split_train(train_data, perc_val_size):
  train_size = len(train_data)
  val_size = int((train_size * perc_val_size) // 100)
  train_size -= val_size

  return random_split(train_data, [int(train_size), int(val_size)]) #train_data, val_data 

train_data_single, val_data_single = split_train(VideoDataset(df=train_df, transform=train_transform, t='single'), 20)
test_data_single = VideoDataset(df=test_df, transform=test_transform, t='single')

The DataLoaders:

BATCH_SIZE = 8
NUM_WORKERS = os.cpu_count()
def generate_dataloaders(train_data, val_data, test_data, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS):

  train_dl = DataLoader(dataset = train_data,
                        batch_size = batch_size,
                        num_workers = num_workers,
                        shuffle = True)

  val_dl = DataLoader(dataset = val_data,
                      batch_size = batch_size,
                      num_workers = num_workers,
                      shuffle = True)

  test_dl = DataLoader(dataset = test_data,
                       batch_size = batch_size,
                       num_workers = num_workers,
                       shuffle = False) # no need to shuffle the test data

  return train_dl, val_dl, test_dl

The custom AlexNet CNN:

class AlexNet(nn.Module):
    def __init__(self, in_channels, stream_type=None, t_frames=[1,1,1]):
        super().__init__()

        self.stream_type = stream_type
        self.fovea = transforms.Compose([transforms.CenterCrop((89, 89))])
        self.context = transforms.Compose([transforms.Resize((89, 89))])
        self.transform = transforms.Compose([transforms.Resize((170, 170))])
        self.t_frames = t_frames

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels*self.t_frames[0], 96, kernel_size=11, stride=3, padding=2), 
            nn.ReLU(inplace=False),
            nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.5, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2)
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(96*self.t_frames[1], 256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=False),
            nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.5, k=2),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )
        
        self.conv3 = nn.Sequential(
            nn.Conv2d(256*self.t_frames[2], 384, kernel_size=3, stride=1, padding=1), 
            nn.ReLU(inplace=False),
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),            
            nn.ReLU(inplace=False),
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=False)
        )

        self.MaxPool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.init_bias()  # initialize bias

    def init_bias(self):
        for block in [self.conv1, self.conv2, self.conv3]:
            for layer in block:
              if isinstance(layer, nn.Conv2d):
                  nn.init.normal_(layer.weight, mean=0, std=0.01)
                  nn.init.constant_(layer.bias, 0)
        # in the original AlexNet paper the bias is initialized to 1 for the 2nd, 4th, and 5th conv layers
        nn.init.constant_(self.conv2[0].bias, 1)
        nn.init.constant_(self.conv3[2].bias, 1)
        nn.init.constant_(self.conv3[4].bias, 1)


    def forward(self, x):
        if self.stream_type is not None:
          x = self.fovea(x) if self.stream_type == 'fovea' else self.context(x)
          return self.conv3(self.conv2(self.conv1(x)))
        else: 
          x = self.transform(x)
          return self.MaxPool(self.conv3(self.conv2(self.conv1(x))))

The initial model is the one that takes as input the middle frame of every bag of shots:

class NoMultiresCNN(nn.Module):
    def __init__(self, CNN, num_classes):
        super(NoMultiresCNN, self).__init__()
        self.CNN = CNN

        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5, inplace=False),
            nn.Linear(in_features=(256 * 7 * 7), out_features=4096),
            nn.ReLU(inplace=False),
            nn.Dropout(p=0.5, inplace=False),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(inplace=False),
            nn.Linear(in_features=4096, out_features=num_classes),
        )

        
    def forward(self, x):
        x = self.CNN(x)
        x = x.reshape(x.shape[0], -1)
        return self.classifier(x)
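To double-check that in_features=256 * 7 * 7 actually matches what the backbone outputs (a minimal sketch with a dummy input; forward() resizes to 170x170 when stream_type is None, so the exact input size does not matter):

# minimal sketch: verify the flattened feature size the classifier expects
backbone = AlexNet(in_channels=3)
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 200, 200))  # forward() resizes to 170x170 internally
print(feats.shape)  # expected: torch.Size([1, 256, 7, 7]) -> 256 * 7 * 7 once flattened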

The class implementing the train and evaluation steps:

class CNN_Architecture():

  def __init__(self, model: torch.nn.Module, train_dataloader: torch.utils.data.DataLoader, 
    val_dataloader: torch.utils.data.DataLoader, optimizer: torch.optim.Optimizer,
    loss_fn: torch.nn.Module, accuracy_fn, scheduler: torch.optim.Optimizer, device: torch.device, save_check = False):

    self.model = model.to(device)
    self.optimizer = optimizer
    self.train_dataloader = train_dataloader
    self.loss_fn = loss_fn
    self.val_dataloader = val_dataloader
    self.accuracy_fn = accuracy_fn
    self.scheduler = scheduler
    self.device = device
    self.save_check = save_check

  def __save_checkpoint(self, train_loss, train_f1, epoch):
    data_path = Path('data/')
    filename = f'{self.model.typ}_checkpoint.pth.tar'
    print('=> Saving Checkpoint')
    checkpoint = {'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'train_loss': train_loss, 'train_f1': train_f1, 'epoch': epoch}
    torch.save(checkpoint, filename)
    print(' DONE\n')

  def __load_checkpoint(self,checkpoint):
    self.model.load_state_dict(checkpoint['state_dict'])
    self.optimizer.load_state_dict(checkpoint['optimizer'])
  

  def evaluate(self, val_dataloader: torch.utils.data.DataLoader, epoch = 1, epochs = 1):
    val_loss, val_f1 = 0, 0
    self.model.eval()

    pbar = tqdm(enumerate(val_dataloader), total = len(val_dataloader), leave=False) #, desc='EVALUATION'

    with torch.inference_mode():
      for batch_idx, (images, labels, _) in pbar: # there is a _ to ignore the paths
        images, labels = images.to(self.device), labels.to(self.device)

        outputs = self.model(images)

        #loss = self.loss_fn(outputs, labels).mean()
        loss = self.loss_fn(outputs, labels)
        f1 = self.accuracy_fn(outputs, labels)

        #val_loss +=  loss.data.mean()
        val_loss += loss
        val_f1 += f1
        
        pbar.set_description(f'{self.model.__class__.__name__} EVALUATION Epoch [{epoch + 1} / {epochs}]')
        pbar.set_postfix(loss = loss.item(), f1 = f1.item())
        
      val_loss /= len(val_dataloader) # average loss over all batches
      val_f1 /= len(val_dataloader) # average f1 over all batches

    model_name = self.model.__class__.__name__
    if self.model.__class__.__name__ == 'NoMultiresCNN': model_name = f'{model_name} - Stream Type: {self.model.CNN.stream_type}'

    return { 'model_name': model_name, # only works when model was created with a class
             'model_loss': val_loss.item(),
             'model_f1': val_f1.item() }



  def fit(self, epochs: int):
    results = { 'train_loss': [], 'train_f1': [], 'val_loss': [], 'val_f1': [] }
    best_train_loss, best_train_f1 = float('inf'), float('-inf')

    for epoch in range(epochs):
      train_loss, train_f1 = 0, 0

      # Training phase
      self.model.train()

      pbar = tqdm(enumerate(self.train_dataloader), total = len(self.train_dataloader), leave=False) #, desc='TRAIN'
      
      for batch_idx, (images, labels, _) in pbar: # there is a _ to ignore the paths
        # zero_grad -> backward -> step

        self.optimizer.zero_grad()
        images, labels = images.to(self.device), labels.to(self.device)
        
        outputs = self.model(images)

        #loss = self.loss_fn(outputs, labels).mean()
        loss = self.loss_fn(outputs, labels)
        loss.backward()
        self.optimizer.step()

        train_loss += loss.item()
        f1 = self.accuracy_fn(outputs, labels).item()

        train_f1 += f1

        model_name = self.model.__class__.__name__
        if self.model.__class__.__name__ == 'NoMultiresCNN': model_name = f'{model_name} - Stream Type: {self.model.CNN.stream_type}'

        pbar.set_description(f'{model_name} TRAIN Epoch [{epoch + 1} / {epochs}]')
        pbar.set_postfix(loss = loss.item(), f1 = f1)


      train_loss /= len(self.train_dataloader)
      train_f1 /= len(self.train_dataloader)


      self.scheduler.step(train_loss)

      if(self.save_check):
        if(train_loss < best_train_loss and train_f1 > best_train_f1):
          self.__save_checkpoint(train_loss, train_f1, epoch + 1)
          best_train_loss, best_train_f1 = train_loss, train_f1

      # Validation phase
      model_name, val_loss, val_f1 = (self.evaluate(self.val_dataloader, epoch, epochs)).values()


      results['train_loss'].append(train_loss)
      results['train_f1'].append(train_f1)
      results['val_loss'].append(val_loss)
      results['val_f1'].append(val_f1)

      print('Epoch [{}], train_loss: {:.4f}, train_f1: {:.4f}, val_loss: {:.4f}, val_f1: {:.4f} \n'.format(
            epoch + 1, train_loss, train_f1, val_loss, val_f1))

    return {'model_name': model_name, 'results': results}

The accuracy measure that I have chosen:

def accuracy(outputs, labels):
    metric = MultilabelF1Score(num_labels=len(LABELS)).to(device)
    return metric(outputs, labels)
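For reference, my understanding is that MultilabelF1Score passes the raw logits through a sigmoid and then binarizes them (threshold=0.5 is the torchmetrics default), so what it effectively compares looks roughly like this (a minimal sketch, not the exact torchmetrics implementation):

# minimal sketch of what the metric effectively compares (threshold=0.5 is the torchmetrics default)
outputs = torch.randn(BATCH_SIZE, len(LABELS))  # raw logits, as returned by the model
probs = torch.sigmoid(outputs)                  # logits outside [0, 1] are passed through a sigmoid
preds = (probs > 0.5).int()                     # hard 0/1 prediction per label
# F1 is then computed between preds and the 0/1 targets; if no probability ever crosses 0.5, F1 stays near 0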

Function to start the whole computation:

def train_evaluate(model, epochs=NUM_EPOCHS):
  # Train model
  start_time = timer()
  history = model.fit(epochs)
  end_time = timer()

  print(f'Total training time: {end_time-start_time:.3f} seconds')

  # Compare the results between train and validation set
  plot_loss_curves(history)

  # Evaluate model
  # Test the model in the Test Dataloader
  start_time = timer()
  result = model.evaluate(test_dl_single)
  end_time = timer()

  print(f'Total evaluation time: {end_time-start_time:.3f} seconds\n')
  print(f"TEST Results for {result['model_name']} -> loss: {result['model_loss']} f1-accuracy: {result['model_f1']}")

And finally:

NUM_EPOCHS = 20
n_class = len(LABELS)
singleframe_model = NoMultiresCNN(AlexNet(in_channels=3), num_classes=n_class)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=singleframe_model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=3, verbose=True)
CNN = CNN_Architecture(model = singleframe_model, 
             train_dataloader = train_dl_single,
             val_dataloader = val_dl_single,
             optimizer = optimizer,
             loss_fn = loss_fn,
             accuracy_fn = accuracy,
             scheduler = scheduler,
             device = device)

train_evaluate(CNN)

Before stopping the training process I got these results, and as you can see the performance is not improving:

Epoch [1], train_loss: 0.0097, train_f1: 0.0021, val_loss: 0.0000, val_f1: 0.0021 

Epoch [2], train_loss: 0.0000, train_f1: 0.0021, val_loss: 0.0000, val_f1: 0.0021 

Epoch [3], train_loss: 0.0000, train_f1: 0.0021, val_loss: 0.0000, val_f1: 0.0021 

Epoch [4], train_loss: 0.0000, train_f1: 0.0021, val_loss: 0.0000, val_f1: 0.0021 

NoMultiresCNN - Stream Type: None TRAIN Epoch [5 / 20]: 16% | 179/1085 [00:17<01:19, 11.43it/s, f1=0.00205, loss=1.29e-9]

Moreover, I’m using nn.BCEWithLogitsLoss() as the loss function and MultilabelF1Score as the accuracy metric.
Even if I change the model, the F1 score remains stuck while the loss continues to decrease.

So it seems like there is some sort of bug in the training process, even though it looks correct to me, which is why I’m asking for help here.

For the complete project implementation you can check the following Google Colab notebook.