I’m trying to reconstruct speech signals that are 3,000 samples long using an autoencoder, and I have 90,000 such signals to train on. Here is a summary of the autoencoder architecture:
Input shape: (8, 1, 3000)
Normalized shape: (8, 1, 3000)
Normalized mean: 6.45319602199379e-08
Normalized variance: 0.9997082948684692
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv1d-1 [-1, 16, 2998] 64
PTanh-2 [-1, 16, 2998] 0
Conv1d-3 [-1, 16, 1499] 528
BatchNorm1d-4 [-1, 16, 1499] 32
Conv1d-5 [-1, 32, 1495] 2,592
PTanh-6 [-1, 32, 1495] 0
Conv1d-7 [-1, 32, 747] 2,080
BatchNorm1d-8 [-1, 32, 747] 64
Conv1d-9 [-1, 64, 741] 14,400
PTanh-10 [-1, 64, 741] 0
Conv1d-11 [-1, 64, 370] 8,256
BatchNorm1d-12 [-1, 64, 370] 128
Upsample-13 [-1, 64, 740] 0
Conv1d-14 [-1, 32, 734] 14,368
PTanh-15 [-1, 32, 734] 0
Upsample-16 [-1, 32, 1468] 0
Conv1d-17 [-1, 16, 1464] 2,576
PTanh-18 [-1, 16, 1464] 0
Upsample-19 [-1, 16, 3006] 0
PTanh-20 [-1, 16, 3006] 0
Conv1d-21 [-1, 1, 3000] 113
================================================================
Total params: 45,201
Trainable params: 45,201
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 5.47
Params size (MB): 0.17
Estimated Total Size (MB): 5.65
----------------------------------------------------------------
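For reference, a summary like the one above can be produced with the torchsummary package; a minimal sketch, assuming the Autoencoder class defined below with batch norm enabled (BatchNorm1d rows appear in the summary):

from torchsummary import summary

net = Autoencoder(batch_norm=True)
summary(net, input_size=(1, 3000))  # torchsummary prepends the batch dimension itself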
And here is the autoencoder architecture itself:
import torch

class PTanh(torch.nn.Module):
    # PTanh(x) = a * tanh(b * x), with one learnable (a, b) pair per channel
    def __init__(self, in_channels):
        super().__init__()
        # needed for the __repr__ method
        self.in_channels = in_channels
        # initialize a
        a = torch.full((in_channels, 1), 1.7159)
        self.a = torch.nn.Parameter(a, requires_grad=True)
        # initialize b
        b = torch.full((in_channels, 1), 2/3)
        self.b = torch.nn.Parameter(b, requires_grad=True)

    def __repr__(self):
        return 'PTanh(' + str(self.in_channels) + ')'

    def forward(self, x):
        # a * tanh(b * x), matching the definition above
        return self.a * torch.tanh(self.b * x)
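The (in_channels, 1) shape of a and b gives each channel its own learnable pair, which broadcasts over inputs of shape (batch, channels, length):

act = PTanh(16)
x = torch.randn(8, 16, 100)
print(act(x).shape)  # torch.Size([8, 16, 100]); each of the 16 channels uses its own (a, b)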
class Autoencoder(torch.nn.Module):
    def __init__(self, batch_norm=False):
        super().__init__()
        # encoder --------------------------------------------------------
        encoder = []
        # first layer
        layer = [torch.nn.Conv1d(in_channels=1,
                                 out_channels=16,
                                 kernel_size=3,
                                 stride=1,
                                 bias=True),
                 PTanh(16),
                 torch.nn.Conv1d(in_channels=16,
                                 out_channels=16,
                                 kernel_size=2,
                                 stride=2,
                                 bias=True)]
        encoder.extend(layer)
        if batch_norm:
            encoder.append(torch.nn.BatchNorm1d(num_features=16,
                                                eps=1e-08,
                                                momentum=0.1,
                                                affine=True,
                                                track_running_stats=True))
        # second layer
        layer = [torch.nn.Conv1d(in_channels=16,
                                 out_channels=32,
                                 kernel_size=5,
                                 stride=1,
                                 bias=True),
                 PTanh(32),
                 torch.nn.Conv1d(in_channels=32,
                                 out_channels=32,
                                 kernel_size=2,
                                 stride=2,
                                 bias=True)]
        encoder.extend(layer)
        if batch_norm:
            encoder.append(torch.nn.BatchNorm1d(num_features=32,
                                                eps=1e-08,
                                                momentum=0.1,
                                                affine=True,
                                                track_running_stats=True))
        # third layer
        layer = [torch.nn.Conv1d(in_channels=32,
                                 out_channels=64,
                                 kernel_size=7,
                                 stride=1,
                                 bias=True),
                 PTanh(64),
                 torch.nn.Conv1d(in_channels=64,
                                 out_channels=64,
                                 kernel_size=2,
                                 stride=2,
                                 bias=True)]
        encoder.extend(layer)
        if batch_norm:
            encoder.append(torch.nn.BatchNorm1d(num_features=64,
                                                eps=1e-08,
                                                momentum=0.1,
                                                affine=True,
                                                track_running_stats=True))
        self.encoder = torch.nn.Sequential(*encoder)
        # decoder --------------------------------------------------------
        decoder = []
        # fourth layer
        layer = [torch.nn.Upsample(scale_factor=2,
                                   mode='nearest'),
                 torch.nn.Conv1d(in_channels=64,
                                 out_channels=32,
                                 kernel_size=7,
                                 stride=1,
                                 bias=True),
                 PTanh(32)]
        decoder.extend(layer)
        if batch_norm:
            decoder.append(torch.nn.BatchNorm1d(num_features=32,
                                                eps=1e-08,
                                                momentum=0.1,
                                                affine=True,
                                                track_running_stats=True))
        # fifth layer
        layer = [torch.nn.Upsample(scale_factor=2,
                                   mode='nearest'),
                 torch.nn.Conv1d(in_channels=32,
                                 out_channels=16,
                                 kernel_size=5,
                                 stride=1,
                                 bias=True),
                 PTanh(16)]
        decoder.extend(layer)
        if batch_norm:
            decoder.append(torch.nn.BatchNorm1d(num_features=16,
                                                eps=1e-08,
                                                momentum=0.1,
                                                affine=True,
                                                track_running_stats=True))
        # sixth layer
        layer = [torch.nn.Upsample(size=3006,
                                   mode='nearest'),
                 PTanh(16),
                 torch.nn.Conv1d(in_channels=16,
                                 out_channels=1,
                                 kernel_size=7,
                                 stride=1,
                                 bias=True)]
        decoder.extend(layer)
        self.decoder = torch.nn.Sequential(*decoder)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
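For completeness, the model is instantiated as net (the name the optimizer below expects), and a dummy forward pass confirms that the output length matches the 3,000-sample input:

net = Autoencoder(batch_norm=True)

# sanity check with one batch of 8 signals, 1 channel, 3000 samples each
x = torch.randn(8, 1, 3000)
print(net(x).shape)  # torch.Size([8, 1, 3000])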
I have trained this autoencoder for 30 epochs with a batch size of 8, the SmoothL1Loss loss function, and the Adam optimizer, as follows:
loss_func = torch.nn.SmoothL1Loss(reduction='mean', beta=1.0)
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.0003)
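The surrounding training loop is the standard pattern; a minimal sketch, assuming a DataLoader named train_loader that yields batches of shape (8, 1, 3000):

for epoch in range(30):
    for x in train_loader:
        optimizer.zero_grad()
        x_hat = net(x)              # reconstruction
        loss = loss_func(x_hat, x)  # SmoothL1 between reconstruction and input
        loss.backward()
        optimizer.step()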
However, when I test the autoencoder on new examples, it essentially reconstructs the mean of the signal, i.e. a low-pass-filtered version of the input. For example:
[Figure: original speech signal (top) and reconstructed signal (bottom).]
Any suggestions on how I can solve this problem?
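One way to quantify the low-pass effect is to compare magnitude spectra of an original and its reconstruction; a minimal sketch, assuming x is a single held-out example of shape (1, 1, 3000):

with torch.no_grad():
    x_hat = net(x)

# one-sided magnitude spectra
spec_in = torch.fft.rfft(x.squeeze()).abs()
spec_out = torch.fft.rfft(x_hat.squeeze()).abs()

# fraction of spectral energy above a quarter of the band; a much smaller
# value for the reconstruction indicates the high frequencies are missing
cut = spec_in.numel() // 4
print((spec_in[cut:].sum() / spec_in.sum()).item())
print((spec_out[cut:].sum() / spec_out.sum()).item())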