Autoencoder + classifier - chance-level learning

I am implementing a neural net model that has the following architecture:

        # Encoder
        self.encoder = nn.Sequential(encoder_config)

        # Decoder
        self.decoder = nn.Sequential(decoder_config)

        # Classifier
        self.classifier = nn.Sequential(classifier_config)

The idea is to first train the autoencoder part of the model, then train the classifier on the encoded representations. I have tried many different layer configurations, and the autoencoder portion never learns anything. Before training the autoencoder, I freeze the classifier as follows:

    def freeze(self, network):
        # Disable gradient updates for every parameter of the given sub-network
        for p in network.parameters():
            p.requires_grad = False

This is done by calling model.freeze(model.classifier).
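
For context, the forward pass is roughly the following (a simplified sketch; the class name is illustrative, but decode=True selects the encoder-decoder path as used in the training loop below):

    class AEClassifier(nn.Module):  # class name is illustrative
        def __init__(self):
            super().__init__()
            self.encoder = nn.Sequential(encoder_config)
            self.decoder = nn.Sequential(decoder_config)
            self.classifier = nn.Sequential(classifier_config)

        def forward(self, x, decode=False):
            z = self.encoder(x)
            # decode=True -> reconstruction (autoencoder); else classify the code
            return self.decoder(z) if decode else self.classifier(z)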

Here is an example of the latest layer configuration I used:

    # Neural net layer config
    from collections import OrderedDict
    import torch.nn as nn

    encoder_config = OrderedDict({'lin1_encoder': nn.Linear(256, 216), 'norm1_encoder': nn.BatchNorm1d(216),
                                  'sig1_encoder': nn.Sigmoid(), 'lin2_encoder': nn.Linear(216, 196),
                                  'norm2_encoder': nn.BatchNorm1d(196), 'sig2_encoder': nn.Sigmoid()})
    decoder_config = OrderedDict({'lin1_decoder': nn.Linear(196, 216), 'norm1_decoder': nn.BatchNorm1d(216),
                                  'sig1_decoder': nn.Sigmoid(), 'lin2_decoder': nn.Linear(216, 256),
                                  'norm2_decoder': nn.BatchNorm1d(256), 'sig2_decoder': nn.Sigmoid()})
    classifier_config = OrderedDict({'lin1_classifier': nn.Linear(196, 128), 'norm1_classifier': nn.BatchNorm1d(128),
                                     'sig1_classifier': nn.Sigmoid(), 'lin2_classifier': nn.Linear(128, 32),
                                     'norm2_classifier': nn.BatchNorm1d(32), 'sig2_classifier': nn.Sigmoid(),
                                     'lin3_classifier': nn.Linear(32, 2), 'norm3_classifier': nn.BatchNorm1d(2),
                                     'sig3_classifier': nn.Sigmoid()})
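
The shapes line up when I chain the blocks on a dummy batch, so the configuration itself is wired correctly:

    import torch

    # Quick shape sanity check (BatchNorm1d requires a batch size > 1 in train mode)
    x = torch.rand(8, 256)
    z = nn.Sequential(encoder_config)(x)        # torch.Size([8, 196])
    x_hat = nn.Sequential(decoder_config)(z)    # torch.Size([8, 256])
    y = nn.Sequential(classifier_config)(z)     # torch.Size([8, 2])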

I then train the autoencoder as follows (the data has added Gaussian noise, but even with zero noise the model does not learn):

    def train_autoencoder(self, num_epochs, stimuli, batch_size, noise_factor, optimizer, criterion, scheduler, conv=False, verbose=False):

        running_loss = []
        test_loss = []
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Make sure the model is on the same device as the data
        self.to(device)

        # Load test and training datasets
        testloader = load_dataset_AE(stimuli=stimuli, batch_size=batch_size, noise_factor=0.0, conv=conv)
        trainloader = load_dataset_AE(stimuli=stimuli, batch_size=batch_size, noise_factor=noise_factor, conv=conv)

        for epoch in range(num_epochs):
            cur_running_loss = 0.0
            # Train mode (reset every epoch, in case evaluate_AE switches to eval mode)
            self.train()
            for stimuli_batch, target in trainloader:
                stimuli_batch = stimuli_batch.to(device)
                target = target.to(device)
                # Forward pass through encoder and decoder
                x = self(stimuli_batch, decode=True)
                loss = criterion(x, target)
                # Backward pass (a fresh graph is built each iteration,
                # so retain_graph=True is not needed)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # Add loss of current pass
                cur_running_loss += loss.item()

            cur_running_loss /= len(trainloader)
            running_loss.append(cur_running_loss)

            # Eval loss
            eval_loss = self.evaluate_AE(testloader, criterion)
            test_loss.append(eval_loss)

            if verbose:
                print('Autoencoder --> Running Loss: {} \t Eval loss: {}'.format(cur_running_loss, eval_loss))

            # Scheduler
            scheduler.step(eval_loss)

        return running_loss, test_loss
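
For completeness, this is roughly how I call it (the hyperparameter values here are illustrative; scheduler.step(eval_loss) above implies a metric-based scheduler such as ReduceLROnPlateau):

    # Illustrative setup; exact hyperparameters are placeholders
    model = AEClassifier()
    model.freeze(model.classifier)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)
    running_loss, test_loss = model.train_autoencoder(
        num_epochs=100, stimuli=stimuli,  # stimuli is a dataset identifier (placeholder)
        batch_size=32, noise_factor=0.1,
        optimizer=optimizer, criterion=criterion, scheduler=scheduler, verbose=True)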

No matter how I configure the encoder portion, it learns to output [0.500, 0.500, 0.500, …, 0.500], and the decoder seems to learn to output approximately the zero vector. The training data are binary vectors. I use nn.MSELoss() as the criterion and torch.optim.Adam() as the optimizer. Weirdly enough, for vectors of length 256, I get a very low loss when training the autoencoder (~0.002). Given that the encoder learns nothing, the classifier categorizes at chance level. Lastly, an SVM can classify this data with near-perfect accuracy, so I doubt the problem is with my data. Thanks for your help!
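
For reference, the SVM baseline is essentially the following (X being the (n_samples, 256) binary matrix and y the labels; the names are assumed):

    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    # SVM baseline sketch: X is the (n_samples, 256) binary matrix, y the labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = SVC().fit(X_train, y_train)
    print(clf.score(X_test, y_test))  # near-perfect test accuracy on this data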