Hello,
My custom dataset's `__getitem__` method is as follows:
def __getitem__(self, idx):
    """Load one audio clip and return its features together with its text.

    Args:
        idx: Row position in ``self.data``. Column 0 of that row is appended
            to ``self.root_dir`` to form the audio path; column 2 holds the
            utterance.

    Returns:
        tuple: ``(signal, utterance)``. ``signal`` is the mel spectrogram
        when ``self.f_type == 'spec'`` and a 13-coefficient MFCC matrix
        otherwise; ``utterance`` is the value read from column 2.
    """
    print("Get Data Item")

    audio_path = self.root_dir + self.data.iloc[idx, 0]

    # The audio is loaded at a fixed 8 kHz target rate.
    # NOTE(review): resampling happens on every access, which is a likely
    # contributor to slow loading -- confirm by profiling.
    target_rate = 8000
    samples, rate = librosa.load(audio_path, sr=target_rate)

    # Random noise injection (inject_noise) is currently disabled.

    transcript = self.data.iloc[idx, 2]

    # Feature type is selected by self.f_type: 'spec' -> mel spectrogram,
    # anything else -> MFCCs. No scaling is applied to the features.
    if self.f_type != 'spec':
        features = librosa.feature.mfcc(y=samples, sr=rate, n_mfcc=13)
    else:
        features = librosa.feature.melspectrogram(y=samples, sr=rate)

    print("Return Data Item")
    return features, transcript
My collate function (for transformation and padding):
# The following function is used as the collate_fn of the PyTorch DataLoader
def data_processing(audio_data):
    """Collate ``(features, transcript)`` samples into padded batch tensors.

    Intended for use as the ``collate_fn`` of a PyTorch ``DataLoader``.

    Args:
        audio_data: Iterable of ``(spec, label)`` pairs. ``spec`` is a 2-D
            array laid out as (features, time) -- e.g. (128, 407) -- and
            ``label`` is the transcript text handed to
            ``TextProcessor.text2int``.

    Returns:
        tuple: ``(spec_pad, label_pad, input_lengths, label_lengths)``
            - ``spec_pad``: tensor of shape (batch, 1, features, max_time),
              zero-padded along time.
            - ``label_pad``: tensor of shape (batch, max_label_len),
              zero-padded.
            - ``input_lengths``: list with each sample's time-frame count
              halved.
            - ``label_lengths``: list with each encoded label's length.
    """
    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    print("data processing")

    # Build the text encoder once per batch rather than once per sample.
    text_processor = textprocessor.TextProcessor()

    for spec, label in audio_data:
        # pad_sequence pads along dim 0, so each sample must be
        # (time, features); the incoming array is (features, time).
        frames = torch.Tensor(spec.transpose())
        spectrograms.append(frames)

        encoded = torch.Tensor(text_processor.text2int(text=label))
        labels.append(encoded)

        # Length must come from the TIME axis. The untransposed array's
        # shape[0] is the feature count, which is identical for every sample.
        # NOTE(review): the // 2 presumably mirrors a stride-2 layer in the
        # model -- confirm against the network definition.
        input_lengths.append(frames.shape[0] // 2)
        label_lengths.append(len(encoded))

    print("Start padding")
    # (batch, time, features) -> (batch, 1, time, features)
    # -> (batch, channel=1, features, time)
    spec_pad = (
        torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
        .unsqueeze(1)
        .transpose(2, 3)
    )
    label_pad = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)
    print("Finish padding")

    return spec_pad, label_pad, input_lengths, label_lengths
My dataloaders:
# Pass data_processing directly as collate_fn. Wrapping it in a lambda adds
# nothing, and a lambda cannot be pickled when DataLoader workers are started
# with the "spawn" method.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=data_processing,
    **kwargs
)
valid_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
    collate_fn=data_processing,
    **kwargs
)
# NOTE(review): test_loader is built from train_dataset -- confirm whether a
# separate test dataset was intended here.
test_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
    collate_fn=data_processing,
    **kwargs
)
I am not sure whether there is a flaw in my code. Do I need to use a packed sequence? I just think that the data loading is taking too long.
My training set contains 8812 audio files. With a batch size of 32 and 4 workers on a GPU, I measured that it takes about 20 seconds to load the data, pass it through the model, and get the loss. Is that a reasonable time?
Please, I really need help, I am feeling helpless, not sure what I am doing wrong…
Thank you so much.