Sure! The line is not running because of the error…
Here is the train.py code:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import dataset
import model
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from dataset import MaqamDataset
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
def pad_to_max_length(self, max_length):
for i in range(len(self)):
padded_data = F.pad(self.data[i][0], (0, max_length - len(self.data[i][0])), 'constant', 0)
padded_data = padded_data.unsqueeze(0) if len(padded_data.shape) == 1 else padded_data
padded_data = padded_data.unsqueeze(1)
padded_data = padded_data.repeat(1, 32, 1, 1)
self.data[i] = (padded_data, self.data[i][1])
def MFCC_plot(mfcc):
plt.figure(figsize=(10, 4))
mfcc = mfcc.detach().numpy()
mfcc = mfcc.mean(axis=2).T
librosa.display.specshow(mfcc, x_axis='time')
plt.colorbar()
plt.title('MFCC')
plt.tight_layout()
plt.show()
#clean GPU torch cache
torch.cuda.empty_cache()
# Define hyperparameters
batch_size = 2 # should be 64 according to page 7
learning_rate = 0.0001 #page 7 0.0001
num_epochs = 1 #should be 35 according to page 7
# Load the dataset
train_dataset = dataset.MaqamDataset(mode='train')
# Find the maximum length of the input tensors
max_length = 0
for i in range(len(train_dataset)):
inputs, labels, mfcc = train_dataset[i]
if inputs.shape[0] > max_length:
max_length = inputs.shape[0]
# Pad all input tensors to the maximum length
# MaqamDataset.pad_to_max_length(1440000)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Define the model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.MaqamCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
print(torch.cuda.is_available())
# Train the model
print("Starting training!")
for epoch in range(num_epochs):
# print("in epoch number ", epoch)
running_loss = 0.0
for i, data in enumerate(train_loader, 0):
# print("in process number ", i)
inputs, labels, mfcc = data
# MFCC_plot(mfcc)
labels = labels.to(device)
# print("inputs.shape = ", inputs.shape)
inputs = inputs.unsqueeze(1).unsqueeze(3).cuda()
optimizer.zero_grad()
outputs = model(inputs)
# print("Outputs shape = ", outputs.shape)
batch_size1 = outputs.size(0)
padding_size = max_length - outputs.size(1)
padding = torch.zeros(batch_size1, padding_size).to(device)
padded_outputs = torch.cat((outputs, padding), dim=1)
loss = criterion(padded_outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
print('Epoch %d, loss: %.3f' % (epoch + 1, running_loss / len(train_loader)))
# Save the model
torch.save(model.state_dict(), 'maqam_cnn2.pth')
And here is the model.py code:
import torch.nn as nn
import torch
import numpy as np
class MaqamCNN(nn.Module):
def __init__(self):
super(MaqamCNN, self).__init__()
self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
self.relu1 = nn.ReLU()
self.pool1 = nn.MaxPool1d(kernel_size=3)
self.dropout1 = nn.Dropout(p=0.1)
self.conv2 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3, padding=0)
self.relu2 = nn.ReLU()
self.pool2 = nn.MaxPool1d(kernel_size=3)
self.dropout2 = nn.Dropout(p=0.2)
self.fc1 = nn.Linear(5119968, 512)
self.dropout3 = nn.Dropout(p=0.2)
self.fc2 = nn.Linear(265, 64)
self.dropout4 = nn.Dropout(p=0.2)
self.fc3 = nn.Linear(64, 100)
self.dropout5 = nn.Dropout(p=0.2)
def forward(self, x):
x = torch.squeeze(x, 3)
# x = np.transpose(x, (0, 2, 1))
# print("0 - x.shape = ", x.shape)
x = self.conv1(x)
x = self.relu1(x)
# print("1 - x.shape = ", x.shape)
x = self.pool1(x)
# print("2 - x.shape = ", x.shape)
x = self.dropout1(x)
# print("3 - x.shape = ", x.shape)
x = self.conv2(x)
# print("4 - x.shape = ", x.shape)
x = self.relu2(x)
x = self.pool2(x)
x = self.dropout2(x)
# x = x.view(-1, 30*192*192)
x = x.view(x.size(0), -1)
# print("5 - x.shape = ", x.shape)
x = self.fc1(x)
x = self.dropout3(x)
x = self.fc2(x)
x = self.dropout4(x)
x = self.fc3(x)
x = self.dropout5(x)
return x
And here is the dataset.py code:
import os
import torchaudio
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import librosa
import numpy as np
class MaqamDataset(Dataset):
def __init__(self, mode='train', transform=None):
self.mode = mode
self.transform = transform
self.data_dir = r"C:\Users\USER\Documents\GitHub\dataset_cutten30"
self.maqams = ['Ajam', 'Bayat', 'Hijaz', 'Kurd', 'Nahawand', 'Rast', 'Saba', 'Seka']
self.audio_list = self._load_audio_list()
self.data = [self.__getitem__(i) for i in range(len(self))]
self.pad_to_max_length(1440000)
def _load_audio_list(self):
audio_list = []
for i, maqam in enumerate(self.maqams):
label_dir = os.path.join(self.data_dir, maqam)
audio_list += [(os.path.join(label_dir, audio_name), i) for audio_name in os.listdir(label_dir) if audio_name.endswith('.wav')]
return audio_list
def __len__(self):
return len(self.audio_list)
def __getitem__(self, idx):
audio_path, label_idx = self.audio_list[idx]
waveform, sample_rate = torchaudio.load(audio_path)
waveform = waveform[0] # only keep the first channel
if self.transform:
waveform = self.transform(waveform)
mfcc = self.compute_mfcc(waveform)
return waveform, label_idx, mfcc
def pad_to_max_length(self, max_length):
for i in range(len(self)):
padded_data = F.pad(self.data[i][0], (0, max_length - len(self.data[i][0])), 'constant', 0)
self.data[i] = (padded_data, self.data[i][1])
def compute_mfcc(self, waveform):
# Compute the MFCC of the waveform
n_fft = 2048
hop_length = 512
n_mels = 128
sr = 48000
waveform = waveform.numpy() # Convert PyTorch tensor to NumPy array
mfcc = librosa.feature.mfcc(y=waveform, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, n_mfcc=20)
mfcc = np.transpose(mfcc)
mfcc = mfcc.astype(np.float32) # Ensure data type is compatible with np.issubdtype()
return mfcc