My custom Dataloader

Mohamed_Nabih · January 28, 2020, 7:18pm

import os
import warnings
import torchaudio
from torch.utils.data import Dataset
from torchaudio.datasets.utils import (download_url, extract_archive, unicode_csv_reader, walk_files)
# Address the dataset is associated with.
# NOTE(review): this looks like a dataset web page rather than a direct
# archive link -- confirm it can actually be fetched programmatically.
URL = "https://www.kaggle.com/mfekadu/darpa-timit-acousticphonetic-continuous-speech"

# Name of the directory (relative to the dataset root) that holds the corpus.
FOLDER_IN_ARCHIVE = "timitcorpus"
def load_timit_item(fileid, path, ext_audio):
    """Load one audio file and the integer labels encoded in its file id.

    Args:
        fileid: File name without its extension. It is split on ``_`` and
            every resulting field is converted with ``int``.
        path: Directory that contains the audio file.
        ext_audio: Audio file extension, including the leading dot.

    Returns:
        A ``(waveform, sample_rate, labels)`` tuple, where ``waveform`` and
        ``sample_rate`` come from ``torchaudio.load`` and ``labels`` is a
        list of ints.

    Raises:
        ValueError: If any ``_``-separated field of ``fileid`` is not an
            integer literal.
    """
    # Read labels
    # NOTE(review): this label scheme assumes purely numeric file names such
    # as "0_1_0_1"; presumably TIMIT file ids are not of that form, in which
    # case this line raises ValueError -- confirm the intended label source.
    labels = [int(c) for c in fileid.split("_")]

    # Read wav
    file_audio = os.path.join(path, fileid + ext_audio)
    waveform, sample_rate = torchaudio.load(file_audio)

    return waveform, sample_rate, labels
class timit(Dataset):
    """Dataset over the ``.wav`` files found in ``<root>/<folder_in_archive>``.

    Each item is a ``(waveform, sample_rate, labels)`` tuple produced by
    ``load_timit_item``, with the optional transforms applied.

    Args:
        root: Directory that contains ``folder_in_archive``.
        url: Accepted for API compatibility; not used by this class.
        folder_in_archive: Sub-directory of ``root`` holding the audio files.
        download: Accepted for API compatibility; no download is performed,
            so the data must already be present on disk.
        transform: Optional callable applied to each waveform.
        target_transform: Optional callable applied to each label list.

    Raises:
        RuntimeError: If ``<root>/<folder_in_archive>`` is not a directory.
    """

    _ext_audio = ".wav"

    def __init__(self, root, url=URL, folder_in_archive=FOLDER_IN_ARCHIVE,
                 download=False, transform=None, target_transform=None):
        # A warning is emitted whenever either transform is supplied.
        if transform is not None or target_transform is not None:
            warnings.warn("Show warning", DeprecationWarning)

        self.transform = transform
        self.target_transform = target_transform

        self._path = os.path.join(root, folder_in_archive)

        # NOTE(review): the message suggests download=True, but nothing in
        # this class acts on that flag -- the corpus has to be placed under
        # ``self._path`` manually.
        if not os.path.isdir(self._path):
            raise RuntimeError("Dataset not found. Please use download=True, to download it.")

        # File ids are stored without directory prefix and without extension;
        # load_timit_item re-joins them with ``self._path`` and ``_ext_audio``.
        walker = walk_files(self._path, suffix=self._ext_audio, prefix=False,
                            remove_suffix=True)
        self._walker = list(walker)

    def __getitem__(self, n):
        """Return the ``n``-th ``(waveform, sample_rate, labels)`` tuple."""
        fileid = self._walker[n]
        item = load_timit_item(fileid, self._path, self._ext_audio)
        waveform, sample_rate, labels = item
        if self.transform is not None:
            waveform = self.transform(waveform)
        if self.target_transform is not None:
            labels = self.target_transform(labels)
        return waveform, sample_rate, labels

    def __len__(self):
        """Return the number of audio files discovered at construction."""
        return len(self._walker)

When I run this command:
timit_data = torchaudio.datasets.timit('.', download=True)
it gives me the error "torchaudio.datasets has no attribute timit". I am following the YESNO dataloader.

ptrblck · January 29, 2020, 1:51am

torchaudio.datasets doesn’t seem to have the “timit” dataset as seen here.
Your code is currently a bit hard to read and you can format it by wrapping it into three backticks ```.

Are you defining this dataset somehow?
E.g. load_timit_item seems to be undefined as well.

Amin_Jun · November 13, 2020, 5:27am

@Mohamed_Nabih I suggest you use something like this:

import os

import torchaudio
from torch.utils.data import Dataset
from torchaudio.datasets.utils import walk_files


def main():
    """Build the dataset rooted at ``../TIMIT/`` and load its first item."""
    # The loaded item is discarded; this only exercises the loading path.
    _first_item = Timit('../TIMIT/')[0]


def _read_segments(path: str):
    """Parse an alignment file into a list of ``(begin, end, label)`` tuples.

    Each non-blank line must hold exactly three whitespace-separated fields:
    two integers followed by a label.
    """
    segments = []
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            begin, end, label = fields
            segments.append((int(begin), int(end), label))
    return segments


def load_timit(file: str):
    """Load one utterance with its transcript, word and phoneme alignments.

    The extension of ``file`` is dropped and the sibling files ``.TXT``,
    ``.WRD``, ``.PHN`` and ``.WAV`` sharing that base path are read.

    Args:
        file: Path to any of the utterance's files (typically the ``.WAV``).

    Returns:
        A ``(data_path, wav, transcript, words, phonemes)`` tuple:
        ``data_path`` is ``file`` without extension, ``wav`` is the waveform
        returned by ``torchaudio.load``, ``transcript`` is the text after the
        first two fields of the ``.TXT`` line, and ``words`` / ``phonemes``
        are lists of ``(begin, end, label)`` tuples with integer offsets.

    Raises:
        ValueError: If the ``.TXT`` file is empty or its first line has fewer
            than three fields, or if an alignment line is malformed.
        OSError: If one of the sibling files cannot be opened.
    """
    data_path = os.path.splitext(file)[0]

    with open(data_path + '.TXT', 'r') as txt_file:
        # readline() returns '' on an empty file instead of raising
        # StopIteration, which must not leak out of a dataset's __getitem__.
        header = txt_file.readline().strip().split(None, 2)
    if len(header) != 3:
        raise ValueError(
            f"Malformed transcript file: {data_path + '.TXT'!r} "
            "(expected '<begin> <end> <text>')"
        )
    transcript = header[2]

    words = _read_segments(data_path + '.WRD')
    phonemes = _read_segments(data_path + '.PHN')

    # The sample rate is read but, as before, not part of the returned tuple.
    wav, _sample_rate = torchaudio.load(data_path + '.WAV')
    return data_path, wav, transcript, words, phonemes


class Timit(Dataset):
    """Dataset over every ``.WAV`` file found beneath a directory tree.

    Items are whatever ``load_timit`` returns for the corresponding file.

    Args:
        root: Directory to scan recursively. A leading ``~`` is expanded.
    """

    def __init__(self, root: str):
        super().__init__()
        self.root = root
        # Full paths of all matching files, in a deterministic order.
        self.walker = self._find_files(root, '.WAV')

    @staticmethod
    def _find_files(root: str, suffix: str):
        """Return the sorted paths under ``root`` whose names end in ``suffix``.

        Implemented with ``os.walk`` so the class does not rely on
        torchaudio's ``walk_files`` helper.
        """
        found = []
        for dirpath, dirnames, filenames in os.walk(os.path.expanduser(root)):
            # Sorting in place fixes the order in which os.walk descends.
            dirnames.sort()
            found.extend(
                os.path.join(dirpath, name)
                for name in sorted(filenames)
                if name.endswith(suffix)
            )
        return found

    def __getitem__(self, item):
        """Load and return the utterance at index ``item``."""
        return load_timit(self.walker[item])

    def __len__(self):
        """Return the number of ``.WAV`` files discovered at construction."""
        return len(self.walker)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

It simply loads all files (regardless of the train/test split). You could do the train/test split by simply changing root='../TIMIT/data/TEST', or something similar for train.