Can't quite figure out how to shape data for LSTM models

I am trying to implement a music-generation LSTM, but I can't figure out how to properly shape my data. In the current configuration, when I try to train my model, it instantly crashes my Colab notebook. It doesn't look like a RAM shortage, because Colab doesn't report one.

Some code:

WAV file dataset: takes an audio file and samples a window of seq_length consecutive samples from it, chopping the last sample off as the label and returning everything else as a time sequence of a single feature

class WavFileDataset(Dataset):
    """Sliding-window dataset over the samples of a single audio file.

    Item ``idx`` is the window ``samples[idx:idx + seq_length]`` divided by
    ``2**15``. The first ``seq_length - 1`` values of the window form the
    feature sequence and the last value is the label.

    The file is decoded once in ``__init__`` and the mono samples are kept in
    memory, so ``__getitem__`` is a cheap slice instead of a full re-decode
    of the audio file for every single item.
    """

    def __init__(self, file_path, seq_length):
        """Decode ``file_path`` to mono and remember the window length.

        Args:
            file_path: Path of the audio file, handed to
                ``pydub.AudioSegment.from_file``.
            seq_length: Number of consecutive samples per window
                (``seq_length - 1`` feature steps plus one label).
        """
        audio = pydub.AudioSegment.from_file(file_path).set_channels(1)  # stereo -> mono
        self.samples = np.asarray(audio.get_array_of_samples())
        self.seq_length = seq_length
        self.file_path = file_path
        # A file shorter than one window yields an empty dataset rather than
        # a negative length (which would make len() raise).
        self.length = max(0, len(self.samples) - seq_length)

    def __len__(self):
        """Return the number of windows this dataset serves."""
        return self.length

    def __getitem__(self, idx):
        """Return ``(feature, label)`` for the window starting at ``idx``.

        Returns:
            feature: float32 array of shape ``(1, seq_length - 1)``.
            label: float32 array of shape ``(1, 1)``.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``.
        """
        if not 0 <= idx < self.length:
            raise IndexError(
                "index {} out of range for {} windows".format(idx, self.length)
            )
        window = self.samples[idx:idx + self.seq_length]
        # Scale to roughly -1..1; the divisor assumes 16-bit samples.
        window = (window / (1 << 15)).astype('float32')
        feature = window[:-1].reshape(1, -1)
        label = window[-1:].reshape(1, -1)
        return feature, label

LSTM Model class:

class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear read-out of the last time step.

    Input is sequence-first because the LSTM is built with
    ``batch_first=False``: ``(seq_len, batch, input_size)``.
    """

    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1, batch_size_=128):
        """Build the layers and a zero-initialised recurrent state.

        Args:
            input_size: Number of features per time step.
            hidden_layer_size: Width of the LSTM hidden and cell states.
            output_size: Number of values predicted per sequence.
            batch_size_: Batch size used to size the initial ``hidden_cell``.
        """
        # Must run before any submodule is assigned, otherwise nn.Module
        # refuses the assignment.
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size, hidden_layer_size, batch_first=False)

        self.linear = nn.Linear(hidden_layer_size, output_size)

        # (hidden state, cell state), each (num_layers=1, batch, hidden).
        self.hidden_cell = (torch.zeros(1, batch_size_, self.hidden_layer_size),
                            torch.zeros(1, batch_size_, self.hidden_layer_size))

    def forward(self, input_seq):
        """Predict from the last time step of ``input_seq``.

        Args:
            input_seq: Tensor of shape ``(seq_len, batch, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # hidden_cell is a plain attribute, so Module.cuda()/.to() do not move
        # it; bring it to the input's device before handing it to the LSTM.
        state = tuple(s.to(input_seq.device) for s in self.hidden_cell)
        lstm_out, self.hidden_cell = self.lstm(input_seq, state)
        # lstm_out is (seq_len, batch, hidden). Take the last step and keep
        # the batch axis intact instead of flattening it into the features.
        return self.linear(lstm_out[-1])

Initialization code:

# Script-local import so this snippet brings its own dependencies into scope.
from torch.utils.data import ConcatDataset, DataLoader

data_dir = r"./datasets/rammwav" #Dataset dir: a folder full of wav files
files = [os.path.join(data_dir,f) for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))] #Get all files
files = [files[0]] #temporary sampling from only one file, for speeding up dataset creation
datasets = []
files_count = len(files)
seq_length = 100
batch_size = 2
i = 1
print("Processing datasets...")
for file in files: #Create WavFileDataset list to feed into ConcatDataset
    # The list has to be filled here; without this the ConcatDataset below
    # would have nothing to concatenate.
    datasets.append(WavFileDataset(file, seq_length))
    print_inline("Processed {}/{}".format(i,files_count)) #print_inline - a helper function: prints on the same line using sys.stdout.write("\r" + input_string)
    i += 1
print("\n Datasets processed")
#Create model-related stuff
model = LSTM(input_size = 1, hidden_layer_size = 100, output_size = 1, batch_size_ = batch_size).cuda()
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#Create dataset and dataloader
dataset = ConcatDataset(datasets)
# drop_last keeps every batch exactly batch_size wide, which matches the
# fixed-size hidden state the model is created with.
# NOTE(review): the original DataLoader arguments were lost; shuffle=True is
# an assumption - confirm.
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
#Some shape checking for testing purposes
test_batch = next(iter(data_loader))

Training loop:

epochs = 10
batches = len(data_loader)
losses = []  # per-batch loss values, in training order
# Put every batch on whatever device the model's parameters live on.
device = next(model.parameters()).device
print("Starting training...")
for epoch in range(epochs): #iterate over epochs
    batch = 1
    for seq, labels in data_loader: #iterate over batches in data loader
        starttime = time.time()
        # (batch, 1, steps) -> (steps, batch, 1): sequence-first layout
        seq = seq.permute(2,0,1).to(device)
        labels = labels.to(device)
        # Reset the recurrent state on the same device as the input, sized to
        # the batch actually received.
        model.hidden_cell = (torch.zeros(1, seq.size(1), model.hidden_layer_size, device=device),
                             torch.zeros(1, seq.size(1), model.hidden_layer_size, device=device))
        optimizer.zero_grad() #clear gradients left over from the previous batch
        y_pred = model(seq) #make prediction
        # Give the labels the prediction's shape so MSELoss compares element
        # by element instead of broadcasting.
        single_loss = loss_function(y_pred, labels.reshape(y_pred.shape))
        #Backprop and weight update
        single_loss.backward()
        optimizer.step()
        losses.append(single_loss.item())
        print_inline("Batch {}/{} Time/batch: {:.2f}, Loss: {:.4f}".format(batch,batches,time.time()-starttime, single_loss.item()))
        batch += 1

    # Report on the epoch counter, and only if at least one batch ran.
    if epoch%5 == 1 and losses:
        print("epoch: {}/{} loss: {:.4f}".format(epoch,epochs,losses[-1]))

I have no idea what is wrong or why everything crashes. Please help.

Could you try to isolate the error by running smaller code snippets?
That would make debugging easier. :slight_smile:

I’ve isolated the error to the training loop. I will run some tests and also just rewrite everything, since my code right now is a Frankenstein of code snippets from different tutorials.