Language Identification for Audio

Hello, I am a beginner in the field of artificial intelligence.
For my first project I wanted to create an AI that recognizes the spoken language in an audio clip.
I used librosa to extract MFCCs from the audio files and sklearn's train_test_split to split my two lists into training and test data.
I get an accuracy of around 30% after ~30 epochs, and I would like to raise it from 30% to 70% by any means. I have around 500 audio files and I want to keep that number.

If someone could help me :slight_smile:

import csv
import random
import numpy as np
import librosa
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

def feature_extractor(audio_file_dir):
    # load the audio file at 16 kHz
    x, freq = librosa.load(audio_file_dir, sr=16000)
    # keep only the first 5 seconds (sequence truncation)
    x_5sec = x[:5 * 16000]
    # extract 20 MFCCs and return them as the file's feature
    mfccs_5sec = librosa.feature.mfcc(y=x_5sec, sr=freq, n_mfcc=20)
    return mfccs_5sec
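
# Note: the model below assumes every clip is at least 5 seconds long; at sr=16000
# with librosa's default hop length of 512 the MFCC matrix is 20 x 157 frames.
# If some clips were shorter, a small (hypothetical) helper like this one could
# zero-pad or crop each matrix to a fixed length before it goes into x_data:
def pad_or_crop(mfccs, target_frames=157):
    # zero-pad (or crop) along the time axis so every sample has shape (20, target_frames)
    if mfccs.shape[1] < target_frames:
        return np.pad(mfccs, ((0, 0), (0, target_frames - mfccs.shape[1])))
    return mfccs[:, :target_frames]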
    
       
# create a dictionary for the labels
# lang_dic = {'EN': [0,0,0,1], 'FR': [0,0,1,0], 'AR': [0,1,0,0], 'JP': [1,0,0,0]}
lang_dic = {'EN': 0, 'FR': 1, 'AR': 2, 'JP': 3}
# set data_dir to the directory of your data files
data_dir = "LID_data/"


# read the info file to get the list of audio files and their labels
file_list = []
label_list = []

with open(data_dir + "Info.txt", 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # the first column contains the file name
        file_list.append(row[0])
        # the last column contains the label (language)
        label_list.append(row[-1])
# create a list of extracted features (MFCC), one per file
x_data = []


for audio_file in file_list:
    file_feature = feature_extractor(data_dir + audio_file)
    # add the extracted feature to the dataset
    x_data.append(file_feature)

# create a list of labels for the files
y_data = []
for lang_label in label_list:
    # convert each label to a value in {0,1,2,3} as the class label
    y_data.append(lang_dic[lang_label])
# shuffle the two lists together (train_test_split below also shuffles)
temp_list = list(zip(x_data, y_data))
random.shuffle(temp_list)
x_data, y_data = zip(*temp_list)

X_train, X_test, Y_train, Y_test = train_test_split(x_data, y_data, test_size=0.33, random_state=42)
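# (A hypothetical tweak I have not tested: with only ~500 files over 4 languages,
# stratifying the split should keep the label proportions equal in train and test:)
# X_train, X_test, Y_train, Y_test = train_test_split(
#     x_data, y_data, test_size=0.33, random_state=42, stratify=y_data)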

# transform to torch tensors (features as float, labels as long for CrossEntropyLoss)
tensor_x_data = torch.tensor(np.array(X_test), dtype=torch.float32)
tensor_y_data = torch.tensor(Y_test, dtype=torch.long)
tensor_x_data_train = torch.tensor(np.array(X_train), dtype=torch.float32)
tensor_y_data_train = torch.tensor(Y_train, dtype=torch.long)
# create the datasets
dataset = TensorDataset(tensor_x_data, tensor_y_data)
dataset_train = TensorDataset(tensor_x_data_train, tensor_y_data_train)
# the batch size can be changed to a larger value when you have more data
batch_size = 1
# create the dataloaders
dataloader = DataLoader(dataset, batch_size=batch_size)
dataloader_train = DataLoader(dataset_train, batch_size=batch_size)
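# (Another untested idea: with ~500 files, a larger batch and per-epoch shuffling
# for the training loader might behave better, e.g.
# dataloader_train = DataLoader(dataset_train, batch_size=16, shuffle=True))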
# get cpu or gpu device for training
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))
# Define the model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            # the input size is the number of features (20 MFCCs) times
            # the sequence length (157 frames)
            nn.Linear(20 * 157, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 4),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device)
# to train the model, we need a loss function and an optimizer
loss_fn = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
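# (Just a guess on my part: lr=0.00001 may be too small to move much in 30 epochs;
# Adam's default rate could be worth trying instead, e.g.
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3))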

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        # compute the prediction error
        pred = model(X)
        loss = loss_fn(pred, y)
        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
            
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    
    
number_of_epochs = 30
for t in range(number_of_epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(dataloader_train, model, loss_fn, optimizer)
    test(dataloader, model, loss_fn)
print("Done!")