Currently, we are a group doing a project about implementing WaveNet in a Tacotron2 → WaveNet → ASR (Given by firm) for midterm project. We are all novices to PyTorch, but recommended to try this library for constructing our WaveNet. We have a problem with the padding and the F.cross_entropy problem for a given .wav-file.
The main issue is when we compute the loss function. Our output (from WaveNet) is a tensor of shape:
output = tensor([1, 256, 225332]) # [batch_size, sample_size, audio_length] input = tensor([1, 256, 225360])
There is a problem here, and from what I can see and talk to my supervisor about it is padding the input of the WaveNet. (Cross_entropy wants (N, C) as input and (N) as target, from what I gather, and the dimensions are wrong)
He said “use ‘same’ padding”, but that is currently only operable in TF/Keras as far as I know. We’ve tried to read across multiple posts, but since we’re novices, we can’t seem to figure it out. Any help is appreciated.
This is our WaveNet, which probably has some issues (particularly padding and perhaps causal convolution seems iffy?).
""" Wavenet model """ from torch import nn import torch #TODO: Add local and global conditioning def initialize(m): """ Initialize CNN with Xavier_uniform weight and 0 bias. """ if isinstance(m, torch.nn.Conv1d): nn.init.xavier_uniform_(m.weight) nn.init.constant_(m.bias, 0.0) class CausalConv1d(torch.nn.Module): """ Causal Convolution for WaveNet - Jakob """ def __init__(self, in_channels, out_channels, kernel_size, dilation = 1, bias = True): super(CausalConv1d, self).__init__() # padding=1 for same size(length) between input and output for causal convolution self.dilation = dilation self.kernel_size = kernel_size self.in_channels = in_channels self.out_channels = out_channels self.padding = padding = (kernel_size-1) * dilation # kernelsize = 2, -1 * dilation = 1, = 1. - Jakob. self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding, dilation=dilation, bias=bias) # Fixed for WaveNet but not sure def forward(self, x): output = self.conv(x) if self.padding != 0: output = output[:, :, :-self.padding] return output class Wavenet(nn.Module): def __init__(self, layers=3, blocks=2, dilation_channels=32, residual_block_channels=512, skip_connection_channels=512, output_channels=256, output_size=32, kernel_size=3 ): super(Wavenet, self).__init__() self.layers = layers self.blocks = blocks self.dilation_channels = dilation_channels self.residual_block_channels = residual_block_channels self.skip_connection_channels = skip_connection_channels self.output_channels = output_channels self.kernel_size = kernel_size self.output_size = output_size # initialize dilation variables receptive_field = 1 init_dilation = 1 # List of layers and connections self.dilations =  self.residual_convs = nn.ModuleList() self.filter_conv_layers = nn.ModuleList() self.gate_conv_layers = nn.ModuleList() self.skip_convs = nn.ModuleList() # First convolutional layer self.first_conv = CausalConv1d(in_channels=self.output_channels, out_channels=residual_block_channels, kernel_size = 2) # Building the Modulelists for the residual blocks for b in range(blocks): additional_scope = kernel_size - 1 new_dilation = 1 for i in range(layers): # dilations of this layer self.dilations.append((new_dilation, init_dilation)) # dilated convolutions self.filter_conv_layers.append(nn.Conv1d(in_channels=residual_block_channels, out_channels=dilation_channels, kernel_size=kernel_size, dilation=new_dilation)) self.gate_conv_layers.append(nn.Conv1d(in_channels=residual_block_channels, out_channels=dilation_channels, kernel_size=kernel_size, dilation=new_dilation)) # 1x1 convolution for residual connection self.residual_convs.append(nn.Conv1d(in_channels=dilation_channels, out_channels=residual_block_channels, kernel_size=1)) # 1x1 convolution for skip connection self.skip_convs.append(nn.Conv1d(in_channels=dilation_channels, out_channels=skip_connection_channels, kernel_size=1)) # Update receptive field and dilation receptive_field += additional_scope additional_scope *= 2 init_dilation = new_dilation new_dilation *= 2 # Last two convolutional layers self.last_conv_1 = nn.Conv1d(in_channels=skip_connection_channels, out_channels=skip_connection_channels, kernel_size=1) self.last_conv_2 = nn.Conv1d(in_channels=skip_connection_channels, out_channels=output_channels, kernel_size=1) #Calculate model receptive field and the required input size for the given output size self.receptive_field = receptive_field self.input_size = receptive_field + output_size - 1 def forward(self, input): # Feed first convolutional layer with input x = self.first_conv(input) # Initialize skip connection skip = 0 # Residual block for i in range(self.blocks * self.layers): (dilation, init_dilation) = self.dilations[i] # Residual connection bypassing dilated convolution block residual = x # input to dilated convolution block filter = self.filter_conv_layers[i](x) filter = torch.tanh(filter) gate = self.gate_conv_layers[i](x) gate = torch.sigmoid(gate) x = filter * gate # Feed into 1x1 convolution for skip connection s = self.skip_convs[i](x) #Adding skip & Match size with decreasing dimensionality of x if skip is not 0: skip = skip[:, :, -s.size(2):] skip = s + skip # Sum all skip connections # Feed into 1x1 convolution for residual connection x = self.residual_convs[i](x) #Adding Residual & Match size with decreasing dimensionality of x x = x + residual[:, :, dilation * (self.kernel_size - 1):] # print(x.shape) x = torch.relu(skip) #Last conv layers x = torch.relu(self.last_conv_1(x)) x = self.last_conv_2(x) soft = torch.nn.Softmax(dim=1) x = soft(x) return x
The training file:
model = Wavenet(layers=3,blocks=2,output_size=32).to(device) model.apply(initialize) # xavier_uniform_ : Does this work? model.train() optimizer = optim.Adam(model.parameters(), lr=0.0003) for i, batch in tqdm(enumerate(train_loader)): mu_enc_my_x = encode_mu_law(x=batch, mu=256) input_tensor = one_hot_encoding(mu_enc_my_x) input_tensor = input_tensor.to(device) output = model(input_tensor) # TODO: Inspect input/output formats, maybe something wrong.... loss = F.cross_entropy(output.T.reshape(-1, 256), input_tensor[:,:,model.input_size - model.output_size:].long().to(device)) # subtract receptive field instead of pad it, workaround for quick debugging of loss-issue. print("\nLoss:", loss.item()) optimizer.zero_grad() loss.backward() optimizer.step() if i % 1000 == 0: print("\nSaving model") torch.save(model.state_dict(), "wavenet.pt")