Quick question on constant loss

I have written the following code. The loss decreases initially, but then settles at a value of 2.30258…, and the accuracy (checked as sketched after the training loop below) never improves beyond 10 percent, right from the beginning. So even while the loss is decreasing, no real learning is happening.
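
For reference, the value the loss settles at looks like exactly ln(10), i.e. the cross-entropy of a uniform prediction over the 10 CIFAR-10 classes (which is what an all-zero output produces), consistent with the ~10% accuracy:

import math
print(math.log(10))  # 2.302585... -- the loss of predicting all 10 classes uniformly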

import torch
import torch.nn
import numpy as np
torch.backends.cudnn.deterministic=True
#from utils import plot_images
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision
import torch.nn.functional as F
from torch.distributions import Categorical

device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
sigm=torch.nn.Sigmoid()

def get_train_valid_loader(data_dir,
                           batch_size,
                           augment,
                           random_seed,
                           valid_size=0.1,
                           shuffle=True,
                           show_sample=False,
                           num_workers=4,
                           pin_memory=False):
    """
    Utility function for loading and returning train and valid
    multi-process iterators over the CIFAR-10 dataset. A sample
    9x9 grid of the images can be optionally displayed.
    If using CUDA, num_workers should be set to 1 and pin_memory to True.
    Params
    ------
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - augment: whether to apply the data augmentation scheme
      mentioned in the paper. Only applied on the train split.
    - random_seed: fix seed for reproducibility.
    - valid_size: percentage split of the training set used for
      the validation set. Should be a float in the range [0, 1].
    - shuffle: whether to shuffle the train/validation indices.
    - show_sample: plot 9x9 sample grid of the dataset.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.
    Returns
    -------
    - train_loader: training set iterator.
    - valid_loader: validation set iterator.
    """
    error_msg = "[!] valid_size should be in the range [0, 1]."
    assert ((valid_size >= 0) and (valid_size <= 1)), error_msg

    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # define transforms
    valid_transform = transforms.Compose([
            transforms.ToTensor(),
            normalize,
    ])
    if augment:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        train_transform = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])

    # load the dataset
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=train_transform,
    )

    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=valid_transform,
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    # visualize some images
    if show_sample:
        sample_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=9, shuffle=shuffle,
            num_workers=num_workers, pin_memory=pin_memory,
        )
        data_iter = iter(sample_loader)
        images, labels = next(data_iter)
        X = images.numpy().transpose([0, 2, 3, 1])
        plot_images(X, labels)

    return (train_loader, valid_loader)

train_loader,valid_loader=get_train_valid_loader(data_dir='C://Users//AEON-LAB PC//.spyder-py3//CIFAR_10',
                           batch_size=256,
                           augment=True,
                           random_seed=999,
                           valid_size=0.2,
                           shuffle=True,
                           show_sample=False,
                           num_workers=1,
                           pin_memory=True)

class Repcnn(torch.nn.Module):
  def __init__(self,wfp):
    super(Repcnn,self).__init__()
    self.a,self.b=self.initialize(wfp)
    
  def initialize(self,wfp):
    wtilde=wfp/torch.std(wfp)
    sigma_a=0.95-((0.95-0.05)*torch.abs(wtilde))
    sigma_b=0.5*(1+(wfp/(1-sigma_a)))
    sigma_a=torch.clamp(sigma_a,0.05,0.95)
    sigma_b=torch.clamp(sigma_b,0.05,0.95)
    a=torch.log(sigma_a/(1-sigma_a)).requires_grad_().cuda()
    b=torch.log(sigma_b/(1-sigma_b)).requires_grad_().cuda()
    return torch.nn.Parameter(a),torch.nn.Parameter(b)
  
  def forward(self,x):
    
    weight_m= (2*sigm(self.b)-(2*sigm(self.a)*sigm(self.b))-1+sigm(self.a))
    weight_v=(1-sigm(self.a))-weight_m**2
    assert torch.all(weight_v>=0)
    om=F.conv2d(x,weight_m,padding=1)
    ov=F.conv2d(x**2,weight_v,padding=1)
    assert torch.all(ov>=0)
    e=torch.randn_like(ov).cuda()
    z=om+(ov*e)
    return z
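
# Sanity check on the weight_m / weight_v expressions used above (and the identical ones in Repfc below).
# Assumption (based on the Shayer-style parameterization this is meant to follow):
# w is ternary in {-1, 0, +1} with P(w=0) = sigmoid(a) and P(w=+1 | w != 0) = sigmoid(b).
def check_weight_stats(a_val=0.3, b_val=-0.7, n=1_000_000):
  pa, pb = torch.sigmoid(torch.tensor(a_val)), torch.sigmoid(torch.tensor(b_val))
  m = 2*pb - 2*pa*pb - 1 + pa          # same expression as weight_m
  v = (1 - pa) - m**2                  # same expression as weight_v
  nonzero = (torch.rand(n) > pa).float()
  sign = (torch.rand(n) < pb).float()*2 - 1
  w = nonzero*sign                     # samples from the assumed ternary distribution
  print('mean  analytic {:.4f} empirical {:.4f}'.format(m.item(), w.mean().item()))
  print('var   analytic {:.4f} empirical {:.4f}'.format(v.item(), w.var().item()))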
  

class Repfc(torch.nn.Module):
  def __init__(self,wfp):
    super(Repfc,self).__init__()
    self.a1,self.b1=self.initialize(wfp)
    
  def initialize(self,wfp):
    
    wtilde=wfp/torch.std(wfp)
    sigma_a=0.95-((0.95-0.05)*torch.abs(wtilde))
    sigma_b=0.5*(1+(wfp/(1-sigma_a)))
    sigma_a=torch.clamp(sigma_a,0.05,0.95)
    sigma_b=torch.clamp(sigma_b,0.05,0.95)
    a=torch.log(sigma_a/(1-sigma_a))
    b=torch.log(sigma_b/(1-sigma_b))
    return torch.nn.Parameter(a),torch.nn.Parameter(b) 
  
  
  def forward(self,x):
    
    weight_m=(2*sigm(self.b1)-(2*sigm(self.a1)*sigm(self.b1))-1+sigm(self.a1))
    weight_v=(1-sigm(self.a1))-weight_m**2
    om=torch.matmul(weight_m,x)
    ov=torch.matmul(weight_v,x**2)
    e=torch.randn(ov.shape).cuda()
    z=om+(ov*e)
    
    return z
 

model=torch.load('/content/cifar_fullprecison_vgg19_valid_shayer_change.pth',map_location='cpu')
wfp=[]
wfp.append(model['layer1.0.weight'])
wfp.append(model['layer1.3.weight'])
wfp.append(model['layer2.0.weight'])
wfp.append(model['layer2.3.weight'])
wfp.append(model['layer3.0.weight'])
wfp.append(model['layer3.3.weight'])
wfp.append(model['layer4.0.weight'])
wfp.append(model['layer4.3.weight'])

for i in range(len(wfp)):
  wfp[i]=torch.Tensor(wfp[i])
  
  
class Conv_Net(torch.nn.Module):
  def __init__(self,wfp):
    super(Conv_Net,self).__init__()
    self.hidden=torch.nn.ModuleList([])
    self.batchnorm=torch.nn.ModuleList([])
    for i in range(6):
      cnn=Repcnn(wfp[i])
      self.hidden.append(cnn)
    for j in range(2):
      fc=Repfc(wfp[i+1])
      i+=1
      self.hidden.append(fc)
    batch_dim=[128,256,512]
    for i in batch_dim:
      self.batchnorm.append(torch.nn.BatchNorm2d(i))
    self.mp=torch.nn.MaxPool2d(kernel_size=2,stride=2)
  def forward(self,x):
    op=x
    j=0
    while(j<6):
      obj=self.hidden[j]
      obj_next=self.hidden[j+1]
      b=self.batchnorm[j//2]
      j+=2
      op=self.mp(b(F.relu(obj_next(b(F.relu(obj(op)))))))
    op=op.view(op.size(0),-1)
    op=torch.t(op)
    obj=self.hidden[j]
    op=F.dropout(F.relu(obj(op)))
    j+=1
    obj=self.hidden[j]
    yout=obj(op)
    yout=torch.t(yout)
    return yout

del model
import gc
gc.collect()
net=Conv_Net(wfp).to(device)

def l2_reg():
  # sum of the L2 norms of all parameters (extra regularization term added to the loss)
  total=0
  for p in net.parameters():
    total+=p.norm(2)
  return total

l_rate=0.05
beta_param=1e-11
weight_decay=1e-4
lr_decay=30
optimizer=torch.optim.Adam(net.parameters(),lr=l_rate,weight_decay=weight_decay)
criterion=torch.nn.CrossEntropyLoss()
net.train()
num_epochs=290
for epoch in range(num_epochs):
  for i,(images,labels) in enumerate(train_loader):
    images=images.to(device)
    labels=labels.to(device)
    optimizer.zero_grad()
    yout=net(images)
    for param_group in optimizer.param_groups:
      param_group['lr']=l_rate*0.5**(epoch//lr_decay)
    loss_batch=criterion(yout,labels)+(beta_param*l2_reg())
    loss_batch.backward()
    optimizer.step()
    
  print('epoch {}'.format(epoch),'loss {}'.format(loss_batch.item()))

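The accuracy I mention at the top is computed roughly like this (a sketch of my own helper, not shown above; valid_loader is the held-out split returned by get_train_valid_loader):

def evaluate(loader):
  net.eval()
  correct, total = 0, 0
  with torch.no_grad():
    for images, labels in loader:
      images, labels = images.to(device), labels.to(device)
      preds = net(images).argmax(dim=1)
      correct += (preds == labels).sum().item()
      total += labels.size(0)
  net.train()
  return 100.0*correct/total

print('valid accuracy {:.2f}%'.format(evaluate(valid_loader)))  # stays around 10%
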
The loss in each epoch is:
epoch 0 loss 17519124.0
epoch 1 loss 2177401.25
epoch 2 loss 1200699.875
epoch 3 loss 510016.46875
epoch 4 loss 245595.390625
epoch 5 loss 169389.859375
epoch 6 loss 153158.8125
epoch 7 loss 64970.9140625
epoch 8 loss 48519.734375
epoch 9 loss 31702.677734375
epoch 10 loss 24409.251953125
epoch 11 loss 6872.13232421875
epoch 12 loss 11279.4140625
epoch 13 loss 12181.693359375
epoch 14 loss 18632.748046875
epoch 15 loss 9640.330078125
epoch 16 loss 6570.16357421875
epoch 17 loss 26879.6015625
...
epoch 67 loss 155.22543334960938
epoch 68 loss 2.3025851249694824
epoch 69 loss 2.2666075229644775
epoch 70 loss 2.3025851249694824
epoch 71 loss 2.3025851249694824
epoch 72 loss 2.3025851249694824

The output yout in the training loop is all zeros for every batch. Any input on why this is happening would be very helpful.
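
This is roughly how I confirm that the outputs collapse (printed inside the training loop, right after yout is computed):

with torch.no_grad():
  print('yout abs max {:.3e} mean {:.3e}'.format(yout.abs().max().item(), yout.mean().item()))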

I have tried a few changes, such as the batch size, the learning rate, and even swapping the order of the batch norm and ReLU layers, but the output always converges to zero. Is there an obvious reason I am missing? I am unsure how to debug this further; any leads would be very helpful.
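
Is printing per-parameter gradient and weight norms right after loss_batch.backward() a reasonable place to start debugging? Something like:

for name, p in net.named_parameters():
  if p.grad is not None:
    print('{:40s} grad norm {:.3e} weight norm {:.3e}'.format(name, p.grad.norm().item(), p.norm().item()))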