Simple gradient-free genetic algorithm -- advice needed

This is not a problem-solving or debugging question. As a beginner, I am seeking advice from experienced ML and AI coders. I have written this very simple genetic algorithm. I have no good machine to test it fully, but fitness does seem to increase with the number of iterations, so it learns.

I wonder what experienced coders would think of it and whether there are suggestions for improvements. I am also searching for a good entry-level textbook on GAs. I found one that I can read, but it is so old that I am afraid many of its approaches are out of date.

Here is my code:

import torch
import torch.nn as nn
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

import numpy as np
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data = datasets.MNIST(
    root = 'data',
    train = True,                         
    transform = ToTensor(), 
    download = True,            
)
test_data = datasets.MNIST(
    root = 'data', 
    train = False, 
    transform = ToTensor()
)

batch_size = 32

loaders = {
    'train': DataLoader(train_data,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=1),

    'test': DataLoader(test_data,
                       batch_size=batch_size,
                       shuffle=False,   # no need to shuffle the test set
                       num_workers=1),
}
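Since I have no good machine, I have been wondering whether I could evaluate fitness on a small fixed subset instead of the full training set. Something like this sketch, where 2000 is an arbitrary guess:

from torch.utils.data import Subset

# guess: 2000 images may be enough for a stable fitness signal
small_train = Subset(train_data, range(2000))
small_loader = DataLoader(small_train, batch_size=batch_size, shuffle=False)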

class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=5,
                stride=1,
                padding=2)

        self.MaxPool2d = nn.MaxPool2d(2, 2)
        self.BatchNorm2d = nn.BatchNorm2d(16)
        self.Flatten = nn.Flatten()

        # the hidden width is a fixed 256; it should not depend on batch_size
        self.Linear = nn.Linear(16 * 14 * 14, 256)
        self.ReLU = nn.ReLU()
        self.out = nn.Sequential(nn.Linear(256, 10), nn.Softmax(dim=1))

        self.weights_initialization()

    def forward(self, x):
        x = self.conv1(x)
        x = self.MaxPool2d(x)
        x = self.BatchNorm2d(x)
        x = self.Flatten(x)
        x = self.Linear(x)
        x = self.ReLU(x)
        output = self.out(x)
        return output

    def weights_initialization(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)

cnn = CNN().to(device)

def single_run(chromosome):

  # load the flat chromosome back into the three evolved weight tensors
  weights_conv1 = torch.reshape(torch.from_numpy(chromosome[0:400]), (16, 1, 5, 5))
  weights_BatchNorm = torch.from_numpy(chromosome[400:416])
  weights_Linear = torch.reshape(torch.from_numpy(chromosome[416:803232]), (256, 3136))

  state = cnn.state_dict()
  with torch.no_grad():
    state['conv1.weight'].copy_(weights_conv1)
    state['BatchNorm2d.weight'].copy_(weights_BatchNorm)   # key must match the attribute name, BatchNorm2d
    state['Linear.weight'].copy_(weights_Linear)

  cnn.eval()   # no training happens; eval mode keeps the BatchNorm statistics fixed

  # fitness = number of correct predictions; iterating the full training set
  # per individual is expensive, a small Subset (see above) would be faster
  fitness = 0
  with torch.no_grad():
    for images, labels in loaders['train']:
      out_ten = cnn(images.to(device))
      fitness += (out_ten.argmax(dim=1).cpu() == labels).sum().item()

  # read the (unchanged) weights back into a flat chromosome
  conv1_weights = cnn.conv1.weight.detach().cpu().numpy().flatten()            # [16, 1, 5, 5]
  BatchNorm_weights = cnn.BatchNorm2d.weight.detach().cpu().numpy().flatten()  # [16]
  Linear_weights = cnn.Linear.weight.detach().cpu().numpy().flatten()          # [256, 3136]

  chromosome = np.concatenate([conv1_weights, BatchNorm_weights, Linear_weights])

  return fitness, chromosome

# initial chromosome: 400 conv weights + 16 BatchNorm weights + 802816 Linear weights
# (np.random.rand samples from [0, 1); a centred init such as np.random.randn(...) * 0.1 may behave better)
chromosome_init = np.random.rand(803232)

fitness1, chromosome1 = single_run(chromosome_init)

print(fitness1)
print('______')
print(chromosome1, len(chromosome1))
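The hard-coded slice indices above feel fragile to me. I later noticed torch.nn.utils has parameters_to_vector and vector_to_parameters, which I believe could do the flattening for me. A sketch of what I mean (note it would cover every parameter, biases and the second Linear layer included, so the chromosome length would no longer be 803232):

from torch.nn.utils import parameters_to_vector, vector_to_parameters

def get_chromosome(model):
  # flatten every parameter of the model into one numpy vector
  return parameters_to_vector(model.parameters()).detach().cpu().numpy()

def set_chromosome(model, chromosome):
  # write a flat numpy vector back into all model parameters
  vec = torch.from_numpy(chromosome).float().to(next(model.parameters()).device)
  with torch.no_grad():
    vector_to_parameters(vec, model.parameters())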

population_c = []
population_f = []

size=10

def populate(size, population_c, population_f):

  for i in range(size):

    # sample a fresh random chromosome for every individual;
    # reusing one vector would make the whole population identical
    chromosome_init = np.random.rand(803232)

    curr_fitness, curr_chromosome = single_run(chromosome_init)

    population_f = np.append(population_f, curr_fitness)
    if i == 0: population_c = np.append(population_c, curr_chromosome)
    else:
      population_c = np.vstack((population_c, curr_chromosome))

  return population_c, population_f

population_c1, population_f1 = populate(size, population_c, population_f)

def selection(population_c, population_f):

  indices = np.arange(len(population_f))

  # fitness-proportional (roulette-wheel) selection of two parents;
  # these are not true elites, just fitness-weighted picks
  random_ind = random.choices(indices, weights=population_f, k=2)

  elite_c = np.take(population_c, random_ind, 0)
  elite_f = np.take(population_f, random_ind, 0)

  return elite_c, elite_f

elite_c1, elite_f1 = selection(population_c1, population_f1)
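Beside roulette-wheel selection I also thought about real elitism, i.e. carrying the fittest individuals into the next generation unchanged. A sketch of what I had in mind, not yet wired in:

def elitism(population_c, population_f, k=2):
  # indices of the k fittest individuals, to be kept unchanged
  elite_ind = np.argsort(population_f)[-k:]
  return population_c[elite_ind], population_f[elite_ind]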

def crossover(parent_1, parent_2):

  # single-point crossover; copy first so the parent is not modified in place
  child = parent_1.copy()
  index = random.randint(0, len(parent_1))
  child[0:index] = parent_2[0:index]

  return child

child = crossover(elite_c1[0], elite_c1[1])
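From what I have read, GAs usually also include a mutation step, which I have not added yet. I was thinking of something like this, where the rate and scale are just guesses; it could plug in right after the crossover call in new_populate below:

mutation_rate = 0.01    # guess: fraction of genes to perturb
mutation_scale = 0.1    # guess: size of each perturbation

def mutate(chromosome):
  # add small Gaussian noise to a random subset of genes
  child = chromosome.copy()
  mask = np.random.rand(len(child)) < mutation_rate
  child[mask] += np.random.randn(mask.sum()) * mutation_scale
  return child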

def new_populate(size, elite_c):

  population_c = []
  population_f = []

  for i in range(size):

    # breed a fresh child from the two selected parents for every slot
    # (a mutation step could be applied here as well)
    child = crossover(elite_c[0], elite_c[1])

    curr_fitness, curr_chromosome = single_run(child)

    population_f = np.append(population_f, curr_fitness)
    if i == 0: population_c = np.append(population_c, curr_chromosome)
    else:
      population_c = np.vstack((population_c, curr_chromosome))

  return population_c, population_f

fitness2, chromosome2 = single_run(child)

print(fitness2)

over_fitness = []

elite_c, elite_f = elite_c1, elite_f1

for i in range(25):

  # build a new generation from the current parents, then reselect
  population_c2, population_f2 = new_populate(size, elite_c)
  elite_c, elite_f = selection(population_c2, population_f2)

  best_fitness = population_f2.max()
  over_fitness = np.append(over_fitness, best_fitness)

  print(best_fitness)
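I collect over_fitness so that I can later plot the learning curve, e.g.:

import matplotlib.pyplot as plt

plt.plot(over_fitness)   # best fitness per generation
plt.xlabel('generation')
plt.ylabel('fitness (correct predictions)')
plt.show()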

P.S.: I am not using backprop on purpose; I need a gradient-free algorithm.