I’m a college student trying to implement Wavenet using PyTorch, this is my first time writing custom modules for a model in PyTorch and I’m having a problem with my model in that it won’t train. Essentially the project is this: I have a set of wav files that I am reading in, processing and quantizing as in the Wavenet paper, and am arranging into series of 1024 data points (the model takes a series of 1024 amplitudes from the wav as input and should output a tensor of 256 values describing the probability that the next item in the series is one of those 256 values).
I’m currently trying to train the model on a single music file, hoping to get it to overfit so that I can be sure that it actually learns from the data. Here lies the problem: the loss won’t decrease as I train. I’ve tried making the model smaller, changing the loss function, changing what kind of layer the output layer is, and making the learning rate larger but nothing seems to work.
I suspect that the problem is somewhere in the model itself, the way that it is constructed may be wrong. It’s possible that I could have linked the custom modules together with the model in a way that interferes with back propagation, at least that is my best guess. My code for the model and my training code is below. I would really appreciate some help!
#model https://github.com/Dankrushen/Wavenet-PyTorch/blob/master/wavenet/models.py
#https://github.com/ryujaehun/wavenet/blob/master/wavenet/networks.py
#https://medium.com/@satyam.kumar.iiitv/understanding-wavenet-architecture-361cc4c2d623
#https://discuss.pytorch.org/t/causal-convolution/3456/4
import torch
import torch.optim as optim
from torch import nn
from functools import reduce
#causal convolution (citation above)
class CausalConv1d(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, dilation=1, **kwargs):
super(CausalConv1d, self).__init__()
self.pad = (kernel_size - 1) * dilation
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.dilation = dilation
self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding=self.pad, dilation=dilation, **kwargs)
def forward(self, x):
return self.conv(x)
class ResidualBlock(nn.Module):
def __init__(self, input_channels, output_channels, kernel_size, skip_size, skip_channels, dilation=1):
super(ResidualBlock, self).__init__()
self.dilation = dilation
self.skip_size = skip_size
self.conv_s = CausalConv1d(input_channels, output_channels, kernel_size, dilation)#dim
self.sig = nn.Sigmoid()
self.conv_t = CausalConv1d(input_channels, output_channels, kernel_size, dilation)#dim
self.tanh = nn.Tanh()
self.conv_1 = nn.Conv1d(output_channels, output_channels, 1)#dim -> k = 1
self.skip_conv = nn.Conv1d(output_channels, skip_channels, 1)
def forward(self, x):
o = self.sig(self.conv_s(x)) * self.tanh(self.conv_t(x))
skip = self.skip_conv(o)
skip = skip[:,:,-self.skip_size:] #dim control for adding skips
residual = self.conv_1(o)
return residual, skip
class WaveNet(nn.Module):#SET SKIP SIZE default
def __init__(self, skip_size=256, num_blocks=2, num_layers=10, num_hidden=128, kernel_size=2):
super(WaveNet, self).__init__()
self.layer1 = CausalConv1d(1, num_hidden, kernel_size)#dim
self.res_stack = nn.ModuleList()
for b in range(num_blocks):
for i in range(num_layers):
self.res_stack.append(ResidualBlock(num_hidden, num_hidden, kernel_size, skip_size=skip_size, skip_channels=1, dilation=2**i))#dim
#self.hidden = nn.ModuleList(self.hidden)
self.relu1 = nn.ReLU()
self.conv1 = nn.Conv1d(1,1,1)#dim
self.relu2 = nn.ReLU()
self.conv2 = nn.Conv1d(1,1,1)#dim
self.output = nn.Softmax()
def forward(self, x):
skip_vals = []
#initial causal conv
o = self.layer1(x)
#run res blocks
for i, layer in enumerate(self.res_stack):
o, s = layer(o)
skip_vals.append(s)
#sum skip values and pass to last portion of network
o = reduce((lambda a,b: a+b), skip_vals)
o = self.relu1(o)
o = self.conv1(o)
o = self.relu2(o)
o = self.conv2(o)
return self.output(o)
#overfit model to test if it will train
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
net = WaveNet(num_layers=1)
#send to gpu
net.to(device)
criterion = nn.CrossEntropyLoss() #preproc data, stream, train, remember to reformat y label as onehot vector for classification
optimizer = optim.Adam(net.parameters(),lr=0.001)
num_epochs = 20
losses = []
_, inp = wav_to_data(data_path+'/'+wav_files[0])
data = encode(inp)
batch_size = 32
i = 0
for epoch in range(num_epochs):
i += 1
for s in range(0, len(data) - 1024, batch_size):
(x, y) = create_singular_input_stream(data, s, batch_size)
optimizer.zero_grad()
output = net(torch.reshape(torch.FloatTensor(x).to(device), (batch_size,1,1024)))
print(output.shape)
print(torch.Tensor([y]).shape)
#find loss between distributions of amplitudes
#https://discuss.pytorch.org/t/indexerror-target-1-is-out-of-bounds-nlloss/68656
loss = criterion(torch.squeeze(output), torch.squeeze(torch.Tensor(y)).type(torch.LongTensor).to(device))
loss.backward()
optimizer.step()
losses.append(loss.item())
print('Epoch {}/{}, Loss: {:.6f}'.format(i, num_epochs, loss.item()))
A helpful note:
create_singular_input_stream returns x (1,1024 tensor with series) and y (next series value [0,255])
Again, any help is appreciated - I’d love to know what I’m doing wrong.