Audio frame-wise training

I’m trying to build a PyTorch Dataset for MIR-1K and train a DNN on it frame-wise.

Each item is a mel spectrogram of shape (40, 150) with a label vector of shape (150,); e.g. frame index 0 has a (40,) feature vector with MIDI label 68.
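For example, indexing a single frame out of such a pair looks like this (a toy sketch where random numbers stand in for a real item):

import torch

# toy stand-in for one item: 40 mel bins x 150 frames, one pitch label per frame
mel = torch.randn(40, 150)
pitch = torch.full((150,), 68.0)

frame_feature = mel[:, 0]   # (40,) feature vector for frame index 0
frame_label = pitch[0]      # its label, here 68
print(frame_feature.shape, frame_label)   # torch.Size([40]) tensor(68.)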

The mel spectrogram and label above are obtained from the Dataset below:

import os
import random

import librosa
import numpy as np
import torch
from torch.utils.data import Dataset


class MIR1K(Dataset):
  """
  Per the official MIR-1K documentation:
  frame size of 40 ms -> 640 sample points
  hop size of 20 ms -> 320 sample points
  each hop has one pitch label

  Returns:
      (mel, pitch) for one segment
  """

  def __init__(self, transform=None, segment_len=150):
  # def __init__(self, val_ratio:float):
    self.audio_dir = DBPATH + MIRAUD + '/Wavfile/'
    self.label_dir = DBPATH + MIRAUD + '/PitchLabel'
    self.total_song = os.listdir(self.audio_dir)
    self._transform = transform
    self.o_sr = 0
    self.segment_len = segment_len
    # self.val_ratio = val_ratio

  def __len__(self):
    'return the number of songs'
    return len(self.total_song)

  def __getitem__(self, index):
    songname = self._get_song(index)
    print(songname)
    audio_path = self._get_audio(songname)
    pitch = self._get_label(songname)

    audio, self.o_sr = librosa.load(audio_path, sr=None, mono=True)
    if self._transform is None:
      return torch.FloatTensor(audio),  torch.FloatTensor(pitch)
    else:
      audio = self._transform(audio, self.o_sr)

    # Align the number of mel frames with the number of pitch labels
    shape_diff = audio.shape[1] - pitch.shape[0]
    if shape_diff == 1:
      audio = audio[:, 1:]
    elif shape_diff == 2:
      audio = audio[:, 1:-1]
    else:
      # anything other than a 0-2 frame mismatch indicates a framing problem
      assert shape_diff == 0, f'unexpected frame/label mismatch: {shape_diff}'


    # Segment the mel spectrogram into "segment_len" frames
    if (audio.shape[1] > self.segment_len):
      start = random.randint(0, audio.shape[1] - self.segment_len)
      mel = torch.FloatTensor(audio[:,start:start+self.segment_len])
      pitch = torch.FloatTensor(pitch[start:start+self.segment_len])
    else:
      mel = torch.FloatTensor(audio)
      pitch = torch.FloatTensor(pitch)

    return mel, pitch
    # return audio, pitch

  def _get_song(self, index):
    # use the directory listing cached in __init__ so indexing stays consistent
    return self.total_song[index].split('.')[0]

  def _get_audio(self, song):
    if ".wav" in song:
      return os.path.join(self.audio_dir, song)
    return os.path.join(self.audio_dir, song + '.wav')

  def _get_label(self, song):
    pitch_label = np.loadtxt(os.path.join(self.label_dir, song + '.pv'))
    # convert MIDI note m to frequency: f = 440 * 2**((m - 69) / 12), e.g. m = 69 -> 440 Hz
    # note: frames labelled 0 in the .pv file (unvoiced) map to ~8.2 Hz here and probably need masking
    pitch = 2 ** ((pitch_label - 69) / 12) * 440
    return pitch
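
For context, the transform I pass in is roughly the sketch below (mel_transform is just a placeholder name; it assumes 16 kHz audio and uses librosa's melspectrogram with a 640-sample window, 320-sample hop and 40 mel bands to match the framing described in the docstring):

import librosa

def mel_transform(audio, sr):
  # 40-band log-mel spectrogram: 40 ms window (640 samples at 16 kHz), 20 ms hop (320 samples)
  mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=640,
                                       win_length=640, hop_length=320, n_mels=40)
  return librosa.power_to_db(mel)   # shape (40, n_frames)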

Is there any tutorial on using this data to train frame-wise?
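
What I have in mind is roughly the per-frame loop below (only a sketch: the model, learning rate and mel_transform are placeholders, it assumes every clip yields a full 150-frame segment so items can be batched, and unvoiced frames would probably still need masking before the loss). Is this the usual approach, or is there a better frame-wise recipe?

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# placeholder per-frame regressor: 40 mel bins in, one pitch value (Hz) out
model = nn.Sequential(
  nn.Linear(40, 256), nn.ReLU(),
  nn.Linear(256, 256), nn.ReLU(),
  nn.Linear(256, 1),
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

loader = DataLoader(MIR1K(transform=mel_transform), batch_size=4, shuffle=True)

for mel, pitch in loader:
  # mel: (B, 40, 150), pitch: (B, 150) -> one training example per frame
  x = mel.permute(0, 2, 1).reshape(-1, 40)   # (B*150, 40)
  y = pitch.reshape(-1, 1)                   # (B*150, 1)
  loss = criterion(model(x), y)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()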