How can I ensure that an autoencoder does not learn the mean?

I’m trying to reconstruct speech signals that are 3,000 samples long using an autoencoder, and I have 90,000 examples of these signals to train on. Here is a summary of the autoencoder architecture:

Input shape: (8, 1, 3000)
Normalized shape: (8, 1, 3000)
Normalized mean: 6.45319602199379e-08
Normalized variance: 0.9997082948684692
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv1d-1             [-1, 16, 2998]              64
             PTanh-2             [-1, 16, 2998]               0
            Conv1d-3             [-1, 16, 1499]             528
       BatchNorm1d-4             [-1, 16, 1499]              32
            Conv1d-5             [-1, 32, 1495]           2,592
             PTanh-6             [-1, 32, 1495]               0
            Conv1d-7              [-1, 32, 747]           2,080
       BatchNorm1d-8              [-1, 32, 747]              64
            Conv1d-9              [-1, 64, 741]          14,400
            PTanh-10              [-1, 64, 741]               0
           Conv1d-11              [-1, 64, 370]           8,256
      BatchNorm1d-12              [-1, 64, 370]             128
         Upsample-13              [-1, 64, 740]               0
           Conv1d-14              [-1, 32, 734]          14,368
            PTanh-15              [-1, 32, 734]               0
         Upsample-16             [-1, 32, 1468]               0
           Conv1d-17             [-1, 16, 1464]           2,576
            PTanh-18             [-1, 16, 1464]               0
         Upsample-19             [-1, 16, 3006]               0
            PTanh-20             [-1, 16, 3006]               0
           Conv1d-21              [-1, 1, 3000]             113
================================================================
Total params: 45,201
Trainable params: 45,201
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 5.47
Params size (MB): 0.17
Estimated Total Size (MB): 5.65
----------------------------------------------------------------
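
The normalization step brings each batch to zero mean and unit variance, as the statistics above show; a rough sketch of that kind of preprocessing (the standardize helper here is illustrative, not the exact preprocessing code):

import torch

def standardize(x, eps = 1e-08):
    # zero-mean, unit-variance scaling over the batch (illustrative helper);
    # produces statistics like the "Normalized mean/variance" printed above
    return (x - x.mean()) / (x.std() + eps)

x = standardize(torch.randn(8, 1, 3000))
print(x.mean().item(), x.var().item())   # approximately 0 and 1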

And here is the autoencoder architecture itself:

import torch

class PTanh(torch.nn.Module):
    
    # PTanh(x) = a * tanh(b * x)
    
    def __init__(self, in_channels):
        
        super().__init__()
        
        # needed for the __repr__ method
        
        self.in_channels = in_channels
        
        # initialize the per-channel scale a to 1.7159
        
        a = torch.full((in_channels, 1), 1.7159)
        self.a = torch.nn.Parameter(a, requires_grad = True)
        
        # initialize the per-channel slope b to 2/3
        
        b = torch.full((in_channels, 1), 2/3)
        self.b = torch.nn.Parameter(b, requires_grad = True)
    
    def __repr__(self):
        return 'PTanh(' + str(self.in_channels) + ')'
    
    def forward(self, x):
        # a * tanh(b * x), matching the definition above
        return self.a * torch.tanh(self.b * x)
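
A quick check of the activation (a throwaway snippet, not part of the model code): it preserves shape and contributes 2 * in_channels learnable scalars. Note that the summary table above reports 0 parameters for the PTanh layers because the summary tool only counts attributes named weight and bias, so a and b don't show up there:

act = PTanh(16)
x = torch.randn(8, 16, 100)
print(act(x).shape)                               # torch.Size([8, 16, 100])
print(sum(p.numel() for p in act.parameters()))   # 32 (16 a's + 16 b's)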

class Autoencoder(torch.nn.Module):
    
    def __init__(self,batch_norm = False):
        
        super().__init__()
        
        # encoder --------------------------------------------------------
        
        encoder = []
        
        # first layer
        
        layer = [torch.nn.Conv1d(in_channels = 1,
                                 out_channels = 16,
                                 kernel_size = 3,
                                 stride = 1,
                                 bias = True),
                 PTanh(16),
                 torch.nn.Conv1d(in_channels = 16,
                                 out_channels = 16,
                                 kernel_size = 2,
                                 stride = 2,
                                 bias = True)]
        
        encoder.extend(layer)
        
        if batch_norm:
            encoder.append(torch.nn.BatchNorm1d(num_features = 16,
                                                eps = 1e-08,
                                                momentum = 0.1,
                                                affine = True,
                                                track_running_stats = True))
        
        # second layer
        
        layer = [torch.nn.Conv1d(in_channels = 16,
                                 out_channels = 32,
                                 kernel_size = 5,
                                 stride = 1,
                                 bias = True),
                 PTanh(32),
                 torch.nn.Conv1d(in_channels = 32,
                                 out_channels = 32,
                                 kernel_size = 2,
                                 stride = 2,
                                 bias = True)]
        
        encoder.extend(layer)
        
        if batch_norm:
            encoder.append(torch.nn.BatchNorm1d(num_features = 32,
                                                eps = 1e-08,
                                                momentum = 0.1,
                                                affine = True,
                                                track_running_stats = True))
        
        # third layer
        
        layer = [torch.nn.Conv1d(in_channels = 32,
                                 out_channels = 64,
                                 kernel_size = 7,
                                 stride = 1,
                                 bias = True),
                 PTanh(64),
                 torch.nn.Conv1d(in_channels = 64,
                                 out_channels = 64,
                                 kernel_size = 2,
                                 stride = 2,
                                 bias = True)]
        
        encoder.extend(layer)
        
        if batch_norm:
            encoder.append(torch.nn.BatchNorm1d(num_features = 64,
                                                eps = 1e-08,
                                                momentum = 0.1,
                                                affine = True,
                                                track_running_stats = True))
        
        self.encoder = torch.nn.Sequential(*encoder)
        
        # decoder --------------------------------------------------------
        
        decoder = []
        
        # fourth layer
        
        layer = [torch.nn.Upsample(scale_factor = 2,
                                   mode = 'nearest'),
                 torch.nn.Conv1d(in_channels = 64,
                                 out_channels = 32,
                                 kernel_size = 7,
                                 stride = 1,
                                 bias = True),
                 PTanh(32)]
        
        decoder.extend(layer)
        
        if batch_norm:
            decoder.append(torch.nn.BatchNorm1d(num_features = 32,
                                                eps = 1e-08,
                                                momentum = 0.1,
                                                affine = True,
                                                track_running_stats = True))
        
        # fifth layer
        
        layer = [torch.nn.Upsample(scale_factor = 2,
                                   mode = 'nearest'),
                 torch.nn.Conv1d(in_channels = 32,
                                 out_channels = 16,
                                 kernel_size = 5,
                                 stride = 1,
                                 bias = True),
                 PTanh(16)]
        
        decoder.extend(layer)
        
        if batch_norm:
            decoder.append(torch.nn.BatchNorm1d(num_features = 16,
                                                eps = 1e-08,
                                                momentum = 0.1,
                                                affine = True,
                                                track_running_stats = True))
        
        # sixth layer
        
        layer = [torch.nn.Upsample(size = 3006,
                                   mode = 'nearest'),
                 PTanh(16),
                 torch.nn.Conv1d(in_channels = 16,
                                 out_channels = 1,
                                 kernel_size = 7,
                                 stride = 1,
                                 bias = True)]
        
        decoder.extend(layer)
        
        self.decoder = torch.nn.Sequential(*decoder)
        
    def forward(self,x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
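
As a quick sanity check (a throwaway snippet), a random batch of the input shape passes through the network and comes back at the original length, matching the summary above:

net = Autoencoder(batch_norm = True)
x = torch.randn(8, 1, 3000)
print(net(x).shape)   # torch.Size([8, 1, 3000])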

I have trained this autoencoder for 30 epochs using a batch size of 8, the SmoothL1Loss loss function, and the Adam optimizer as follows:

net = Autoencoder(batch_norm = True)   # batch norm enabled, matching the summary above

loss_func = torch.nn.SmoothL1Loss(reduction = 'mean',
                                  beta = 1.0)
optimizer = torch.optim.Adam(params = net.parameters(),
                             lr = 0.0003)
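
The training loop itself is the standard reconstruction setup; here is a minimal sketch, assuming a DataLoader named train_loader that yields batches of shape (8, 1, 3000):

for epoch in range(30):
    for batch in train_loader:
        optimizer.zero_grad()
        recon = net(batch)               # forward pass through encoder + decoder
        loss = loss_func(recon, batch)   # reconstruction loss against the input
        loss.backward()
        optimizer.step()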

However, when I test the autoencoder on new examples, it essentially reconstructs the mean of the signal, i.e., a low-pass filtered version of the input. For example:

The original speech signal is at the top and the reconstructed signal is at the bottom. Any suggestions on how I can solve this problem?