CTC Loss Is Negative

R_M · April 12, 2024, 7:13pm

I’m making a speech recognition model and can’t figure out why my ctc loss is negative for each processed batch.

I saw some other posts and I made sure that the CTC loss epsilon or blank is not in my dictionary of characters. Would really appreciate help on this.

main.py

from train import * 
from transcription import * 
import torchaudio
from torch.utils.data import DataLoader

def main():
    hparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 5, 
        "hidden_size": 512, 
        "n_class": 29, #including epsilon (NOTE we allow the model to predict epsilon. This is necessary for CTC)
        #we pass in prev LSTM output so very possible to adjust! 
        "n_feats": 128, 
        "stride": 2,
        "dropout": 0.1, 
        "learning_rate": 5e-4,
        "batch_size": 20, 
        "epochs": 10
    } 
    # print('here HELLLLLOOOOOOOO')

    train_dataset = torchaudio.datasets.LIBRISPEECH('./', url='train-clean-100', download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH('./', url="test-clean", download=True)
    trainloader = DataLoader(dataset=train_dataset, batch_size=hparams["batch_size"], shuffle=True,
                             collate_fn=lambda x: data_preprocessing(x, data_type="train"))
    testloader = DataLoader(dataset=test_dataset,
                                batch_size=hparams['batch_size'],
                                shuffle=False,
                                collate_fn=lambda x: data_preprocessing(x, 'valid'))
    
    # print('here HELLLLLOOOOOOOO')
    train(hparams=hparams, trainloader=trainloader, testloader=testloader)

# print(__name__)

if (__name__ == '__main__'):
    
    main()

train.py

import torch
from torch import nn  
from transcription import * 
from torch.nn import functional as F 
from scorer import *


class IterMeter(object):
    def __init__(self) -> None:
        self.val = 0 
    def increment(self):
        self.val += 1 
    def get(self):
        return self.val 

def load_model(hparams):
    model =  SpeechRecognitionModel(hparams['lstm_input_dim'], hidden_size=hparams['hidden_size'], 
                                   n_cnn_layers=hparams['n_cnn_layers'], n_rnn_layers=hparams['n_rnn_layers'],
                                   n_class=hparams['n_class'])
    model.load_state_dict(torch.load('weights/weights.pth'))
    return model 

def test_final(hparams, trainloader, testloader):
    model = load_model(hparams=hparams)
    loss_crit = nn.CTCLoss(blank=28)
    test_cer = [] 
    test_wer = [] 
    for batch_ix, batch in enumerate(testloader):
        spectograms, labels, input_lengths, label_lengths = batch
        output = model(spectograms) #N here = length of the spectogram 
        output = F.softmax(output, dim=2) #softmax of N, T, n_channels 
        output = output.transpose(0,1)
        loss = loss_crit(output, labels, input_lengths, label_lengths)
        agg_loss = agg_loss + loss.item() / len(testloader)
        decoded_output, target_output = greedyDecoder(output.transpose(0,1), labels, label_lengths)
        for j in range(len(decoded_output)):
            test_cer.append(cer(target_output[j], decoded_output[j]))
            test_wer.append(wer(target_output[j], decoded_output[j]))
    
    avg_cer = sum(test_cer)/len(test_cer)
    avg_wer = sum(test_wer)/len(test_wer)
    print(f'word error rate: {avg_wer} vs. average character error rate: {avg_cer}\n')
        

def test_incremental(model, testloader, epoch):
    with torch.no_grad():
        loss_crit = nn.CTCLoss(blank=28)
        test_cer = [] 
        test_wer = [] 
        for batch_ix, batch in enumerate(testloader):
            spectograms, labels, input_lengths, label_lengths = batch
            output = model(spectograms) #N here = length of the spectogram 
            output = F.softmax(output, dim=2) #softmax of N, T, n_channels 
            output = output.transpose(0,1)
            loss = loss_crit(output, labels, input_lengths, label_lengths)
            agg_loss = agg_loss + loss.item() / len(testloader)
            decoded_output, target_output = greedyDecoder(output.transpose(0,1), labels, label_lengths)
            for j in range(len(decoded_output)):
                test_cer.append(cer(target_output[j], decoded_output[j]))
                test_wer.append(wer(target_output[j], decoded_output[j]))
        
        avg_cer = sum(test_cer)/len(test_cer)
        avg_wer = sum(test_wer)/len(test_wer)
        print(f'FOR EPOCH: {epoch}, word error rate: {avg_wer} vs. average character error rate: {avg_cer}\n')


    # output model
    # greedyDecoder()
    return 

def train(hparams, trainloader, testloader):
    print('here at train')
    model = SpeechRecognitionModel(hparams['hidden_size'], hidden_size=hparams['hidden_size'], 
                                   n_cnn_layers=hparams['n_cnn_layers'], n_rnn_layers=hparams['n_rnn_layers'],
                                   n_class=hparams['n_class'])
    optimizer = torch.optim.AdamW(model.parameters(), hparams['learning_rate']) #to ensure that adam has a uniform like effect even with regularisation. 
    #i/e decouples the regularisation term from the gradient updates only subtracting at the end. 
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'],
                                                    steps_per_epoch= len(trainloader), epochs = hparams['epochs'],
                                                    anneal_strategy='linear')
    criterion = nn.CTCLoss(blank=28) #ie we have 28 characters in our transcription set A-Z, ' ' and . therefore indexed 0->27. We want another one.
    # print('here at for loop being')
    for epoch in range(hparams['epochs']): 
        epoch_loss = 0
        for batch_idx, batch in enumerate(trainloader):
            # print('btach', batch)
            spectograms, labels, input_lengths, label_lengths = batch
            y_hat = model(spectograms)
            output = F.softmax(y_hat, dim=2) #this is batch, time, n_classes output 
            output = output.transpose(0,1) #Wants it in T,N,C
            loss = criterion(output, labels, input_lengths, label_lengths ) #target exp N,S (where S = max(timestep))
            #we need to know the sequence length of each input in the batch for masking (i.e for both label and spetogram).
            # The mask tells us which parts of the sequence were padded and which aren't i.e what our loss function should ignore. 
            #and therefore the ones we actually want to learn on. Padding on both sides for bidrectional context i.e future and past. 
            #padding important for parralelisation whre I think architecture needs to be the same? i.e same # of lstm cells? 
    #also, this is based on the softmax output from the LSTM layer, and therefore, we have the label length as this is what we need to predict, i.e even if blank / same 
    #but the label length is standard. Hence why input is // 2 because of prior convolution in the beginning, not the N residual conv nets.. 
            print(f"Batch: {batch_idx}, epoch: {epoch}", loss.item(), '\n')
            epoch_loss = epoch_loss + loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            if batch_idx % 100 == 0 or batch_idx == len(trainloader.dataset):
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(spectograms), len(trainloader.dataset),
                    100. * batch_idx / len(trainloader), loss.item()))
    
        with torch.no_grad():
            agg_loss = loss / len(spectograms)
            print(f"TRAIN loss for epoch {epoch}, batch: {batch_idx} : {agg_loss} \n")
            test_incremental(model, testloader, epoch)


    with torch.no_grad():
         torch.save(model.state_dict(), 'weights/weights.pth')

transcription.py

#Can convert from raw audio to mel spectogram (i.e frequency vs time and color coded)

import torchaudio 
from characters import *
from torch import nn
from torch.nn import functional as F
import torch

train_dataset = torchaudio.datasets.LIBRISPEECH('./', url='train-clean-100', download=True)
test_dataset = torchaudio.datasets.LIBRISPEECH('./', url="test-clean", download=True)
#waveform, sample_rate, and transcript + additional metadata speaker, chapter, utteranceid 

#Data augmentation -> changing pitch, speed, reverb / vibrato?, noise 
#As an alternative, can also cut out consecutive time and frequency chunks (found to result in a model having stronger 
#generalisation capabolities i.e generalise to test data -> SPEC AUGMENT) //particularly for speech recognition. 


class TextTransform:
    def __init__(self) -> None:
        self.charToIndex = {} 
        self.indexToChar = {} 
        self.characters = char_map_str()
        lines = self.characters.char_map_str.strip().split('\n')
        for line in lines: 
            # print(line.split())
            ch, index = line.split() 
            self.charToIndex[ch] = int(index)
            self.indexToChar[int(index)] = ch
        self.indexToChar[1] = " "
    
    def text_to_int(self, text):
        int_sequence = [] 
        for char in text: 
            if char == ' ':
                char = self.charToIndex['<SPACE>']
            else: 
                char = self.charToIndex[char]
            int_sequence.append(char)
        return int_sequence 
    
    def int_to_text(self, int_sequence):
        char = ''
        for val in int_sequence:
            if (self.indexToChar[val] == '<SPACE>'):
                char += ' '
            else:
                char += self.indexToChar[val]
        return char


train_audio_transforms = nn.Sequential( #like a pipe operator. 
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128), #frequencies (y -axis) for a given time window (x-axis)
    # amplitude is the color in decibles. Mel is a math transofrmation to the frquency such that equidistant frequencies
    # also sound equidistant to the human ear. This is the n_mels i.e how many mel filterbanks to use (filterbanks = a range of freuqencies).
    #Not equidistantly spaced frequency buckets like normal spectorgrams but rather more logarithmic like.  
    #Mel spectogram is used to provide sound information similar to what humans would perceive. 
    # Sample rate is used to compute max frequency using nyquist theorem (i.e sample rate / 2). Then seaparate to the bins/ filters. 
    #Often times convolution + Mel Spectogram seen as a better alternative to audio signal + RNN 
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    #f consecutive mel frequency channels are masked [f0, f0 + f] 
    # f is first chosen from a uniform distribution from 0 to the frequency mask parameter F, and f0 is chosen from 0, ν − f
    #masked horizontally. kinda like the width. 
    torchaudio.transforms.TimeMasking(time_mask_param=100)
    # https://chat.openai.com/c/e0574e99-e988-42f9-be07-c94a1cb0072f (size is n_mels, time)
)

valid_audio_transforms = torchaudio.transforms.MelSpectrogram() 
#default params all g --> can pass torchaudio.load('.wav') where waveform output will be tensor array that when normalised can be passed
text_transform = TextTransform()


def data_preprocessing(data, data_type="train"): #within the batch, we take all the data within it, and then we process it as follows. 
    #i.e we create arrays so they're easy to unpack
    spectograms = []
    labels = []
    input_lengths= [] 
    label_lengths = [] 

    for (waveform, _, transcript, _, _, _) in data:
        if data_type == 'train':
            spec = train_audio_transforms(waveform).squeeze(0).transpose(0,1)
        else:
            spec = valid_audio_transforms(waveform).squeeze(0).transpose(0,1) #switch to time, n_mels (we do this for pad sequence later which
            #ensures that all sepctograms are of the same time dimension (i.e the longest time dimension input in the data)
        spectograms.append(spec)
        label = torch.tensor(text_transform.text_to_int(transcript.lower()))
        labels.append(label) #list of torches of int attays 
        input_lengths.append(spec.shape[0]//2) #time / 2 WHY not just shape[0]???
        label_lengths.append(len(label))


#spectogram helps to quantise!!! i.e not continuos frequencies data. aggregate amplitude on the equidistant frequency buckets or filters here. 
        #also aligned with human representation? 
    spectograms = nn.utils.rnn.pad_sequence(spectograms, batch_first=True).unsqueeze(1).transpose(2, 3) 
    #Shape: Batch size, 1, n_mels, time   ((it's 2d) for each frequency index and time we have amplitude values, 1 for channel makes it easier for pytorch to interpret)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True) #Batch size (total count), Max length, 1 ??/ may not have the 1. 
    # print(labels.shape)
    return spectograms, labels, input_lengths, label_lengths


class CNNLayerNorm(nn.Module):
    def __init__(self, n_feats) -> None:
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats) #n_feats refers to the dimension of features. It will search for last dim andnormalise over that dimension, i.e features, where it's expected that the last dimension will have size of n_feat. 

    def forward(self, x):
        #x is (batch, channels, features, time)
        x = x.transpose(2,3).contiguous() #contiguous reformates memory storage and converting to have features at the last dim 
        x = self.layer_norm(x) 
        x = x.transpose(2,3).contiguous() 
        return x 
        #makes sense, i.e care moreso relative to each other in a timestemp vs raw amplitude. 

#maybe convolutions are like learning speech chunk representations and harmonies. The greater reference window can 
        #result in more accurate transcriptions because it has some context on future. Also just learns what 
        #sounds correspond to which spellings 

class ResidualCNN(nn.Module):
    def __init__(self, in_channels, out_channels, kernel, stride, n_feats, dropout) -> None:
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, padding=(kernel // 2), stride=stride) #will always be same size input and output
        self.cnn2 = nn.Conv2d(in_channels, out_channels, kernel, padding=(kernel // 2), stride=stride)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
    def forward(self, x):
        residual = x #(batch, channel, feature, time) Actually, feature here very much so just like frequency
        # print('x is none? 1?', x.shape)
        x = self.layernorm1(x) #we use a cnn first prior to reisdual - for each timestep normalise over frequency amplitude vals.
        # print('x is none? 1?', x.shape)
        x = F.relu(x)
        x = self.dropout1(x)
        x = self.cnn1(x) 
        x = self.layernorm2(x) #we use a cnn first prior to reisdual 
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.cnn2(x) 
        return x + residual

class BidrectionalLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout, batchFirst) -> None:
        super(BidrectionalLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                             num_layers=num_layers, bidirectional=True, dropout=dropout, batch_first=batchFirst)
        self.layer_norm = nn.LayerNorm(input_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        x = self.layer_norm(x)
        x = F.relu(x)
        #not sure why no dropout here 
        x, _ =  self.lstm(x) #x is output from last of stacked LSTM num_layers where output is hidden state (not softmax yet)
        x = self.dropout(x) 
        return x 

class SpeechRecognitionModel(nn.Module):
    def __init__(self, lstm_input_dim, hidden_size, n_cnn_layers, n_rnn_layers,  n_class, resChannels=32, kernel=3, stride=2, n_feats=128, num_layers=1, dropout=0.1) -> None:
        super().__init__()
        n_feats = n_feats // 2 #I think this is because due to padding of 1, stride of 2, and f = 3, then (n - 3 + 2) / 2 ~ n/2  
        #cnn first detects edges i.e harmonies, then can even learn things like rhythm and structure of audio. 
        self.cnn = nn.Conv2d(1, 32, 3, stride, padding = 3//2) 
        self.residualConvs = nn.ModuleList()
        for i in range(n_cnn_layers):
            self.residualConvs.append(ResidualCNN(resChannels, resChannels, kernel, 1, n_feats, dropout))
        self.linear = nn.Linear(32*n_feats, lstm_input_dim) #this represents leftover frequencies and the 32 channels, that's the data per timestep 
        self.lstms = nn.ModuleList() 
        for i in range(n_rnn_layers):
            self.lstms.append(BidrectionalLSTM(input_size=lstm_input_dim if i == 0 else hidden_size * 2, 
                                               hidden_size=hidden_size, num_layers=1, dropout=dropout, batchFirst= True)) #why i==0 here 
        self.classifier = nn.Sequential(
            nn.Linear(2*hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, n_class)
        )
    
    def forward(self, x): 
        x = self.cnn(x)
        #maybe unlikely given sampling rate for orignal cnn reduce time dimension to make a difference? Size becomes around n/2 
#and adds context, but it really doesn't matter much because sampling rate, i.e samples per second is 16000??
        #yup I think this is correct, see https://kouohhashi.medium.com/dissecting-deepspeech-pytorch-part-2-c029042b30b0
        #note conv is not casual but forward looking here. 
        for resConv in self.residualConvs:
            x = resConv(x)
        #need to prepare to flatten i.e linear will affect the last layer only but need to make it of the form 
        #(N, time, channels*n_freq)
        # x right now is (Batch, channels, frequency, time)
        # x = x.transpose(2,3)
        # x = x.view(x.shape[0], x.shape[2], x.shape[1]*x.shape[3])
        x = x.view(x.shape[0], x.shape[1]*x.shape[2], x.shape[3]) #like a flatten
        x = x.transpose(1,2) #i.e now we have channels * frquency as last one for linear layer to process
        x = self.linear(x)
        for lstm in self.lstms:
            x = lstm(x)
        return self.classifier(x)
        

#how come we don't add input length here?
def greedyDecoder(output, labels, label_lengths, blank_labels=28, collapse_repeated=True): #i.e max char prob from softmax for each timestep (doesn't account for marginalisation). 
#The concept of text generation with LSTMS and greedy not being accurate cause next output conditional on previous 
#doesn't apply here I feel because the input audio is predetermined. I.e not generative here! 
    arg_maxes = torch.argmax(output, dim=2) #batch, time, n_classes  
    decodes = [] #now argmaxes is batch, time 
    targets = [] 
    for i, prob_matrix in enumerate(arg_maxes): #i.e we are iterating through the batch which is really just one sample transcipt
        target = labels[i][:label_lengths[i]].tolist() #latter slicing because of the padding 0s to end due to max label_length 
        targets.append(text_transform.indexToChar(target))
        decode = [] 
        for j, timestep_output in enumerate(prob_matrix):
            if (timestep_output != blank_labels): #i.e if epsilon then we don't want to append BUT in subsequent will ensure we don't collapse 
                if (collapse_repeated and j > 0 and timestep_output == prob_matrix[j-1]):
                    continue 
                decode.append(timestep_output.item())
        decodes.append(text_transform.indexToChar(decode))
    return decodes, targets
        

    



#SGD takes minibatches the same as ADAM. The difference is that ADAM adjusts learning rates for parameters separately while SGD does them together.
# That allows ADAM to converge fast since one learning rate is unlikely to be best for all parameters in a model;
    
# The One Cycle Learning Rate Scheduler was first introduced in the paper Super-Convergence:
# Very Fast Training of Neural Networks Using Large Learning Rates. 
# This paper shows that you can train neural networks an order of magnitude faster, 
# while keeping their generalizable abilities, using a simple trick. You start with a low learning rate, 
# which warms up to a large maximum learning rate, then decays linearly to the same point of where you originally started.
    #Perhaps also regularisation benefits because learning rate at max is >> than min learning rate so severely penalises params? ```

R_M · April 12, 2024, 8:27pm

So I was able to fix this to positive loss which is decreasing BUT had to do F.log_softmax vs F.softmax.

Any idea as to why this is the case? Is there anything I did incorrectly?