Q. Inference speed stays the same even after quantizing and pruning the weights

Hey guys, even after quantizing and pruning the model's weights, the model's inference speed stays exactly the same: no improvement, no degradation. (Apologies if my English is off.) Here is my full code; the way I measure speed is sketched at the end of the post.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms

from google.colab import drive
drive.mount('/content/drive')
def LoadMNIST(batch_size=32, validation=True, num_workers=1):
    root = "data"
    transform = transforms.Compose([transforms.ToTensor()])
    test_set = torchvision.datasets.MNIST(
        root=root, train=False, transform=transform, download=True
    )
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=int(batch_size), shuffle=False, num_workers=num_workers
    )
    return test_loader
class Net(nn.Module):
    def __init__(self, q_bit_weight, q_bit_act, p_ratio):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 4, 5, 1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(4, 16, 5, 1)
        self.relu2 = nn.ReLU()
        self.fc1 = nn.Linear(4*4*16, 32)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(32, 10)
        self.q_bit_weight = q_bit_weight
        self.q_bit_act = q_bit_act
        self.p_ratio = p_ratio

    def forward(self, x):
        x1 = self.relu1(self.conv1(x))
        x1 = F.max_pool2d(x1, 2, 2)
        x2 = self.relu2(self.conv2(x1))
        x2 = F.max_pool2d(x2, 2, 2)
        x2 = x2.view(-1, 4*4*16)
        x3 = self.relu3(self.fc1(x2))
        x4 = self.fc2(x3)

        return x4
def test(model, test_loader, epoch):
    correct = 0

    with torch.no_grad():  # inference only, no gradients needed
        for data, target in test_loader:
            output = model(data)
            pred = output.argmax(dim=1)
            correct += pred.eq(target.view_as(pred)).sum().item()

    acc = 100. * float(correct) / float(len(test_loader.dataset))
    print(f"epoch: {str(epoch).zfill(2)} Accuracy: {correct}/{len(test_loader.dataset)} ({acc:.2f}%)")

    return acc
def quantize_weight(layer, q_bit): #signed
    q_weight = quantize_layer(layer.weight, q_bit)
    layer.weight = nn.Parameter(q_weight)

def quantize_act(input, q_bit):  #unsigned
    q_act = quantize_layer(input, q_bit)
    return q_act

def pruning_weight(layer, ratio):
    p_weight = prune_layer(layer.weight, ratio)
    layer.weight = nn.Parameter(p_weight)

def quantize_layer(layer, q_bit):
    scale_factor = 2 ** q_bit - 1  
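    # e.g. q_bit = 8 gives scale_factor = 255, so a weight of 0.1 becomes
    # round(0.1 * 255) / 255 = 26 / 255 ≈ 0.10196 (still a float tensor,
    # just snapped onto a uniform grid)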
    quantized_layer = (layer * scale_factor).round() / scale_factor  
    return quantized_layer

def prune_layer(layer, ratio):
    num_zeros = int(layer.numel() * ratio)  # how many weights to zero out
    if num_zeros == 0:
        return layer
    # threshold = magnitude of the num_zeros-th smallest weight
    threshold = layer.view(-1).abs().sort()[0][num_zeros - 1]
    pruned_layer = torch.where(layer.abs() <= threshold, torch.zeros_like(layer), layer)
    return pruned_layer


test_loader = LoadMNIST(batch_size=256, validation=True)
ckpt_load_path = '/content/drive/MyDrive/Colab Notebooks/model.pth'

q_bit_weight = [8, 16, 16, 32]  # per-layer weight quantization bits
q_bit_act = 8  # activation quantization bits
p_ratio = [0.2, 0.2, 0.2, 0.2]  # per-layer pruning ratio

model = Net(q_bit_weight=q_bit_weight, q_bit_act=q_bit_act, p_ratio=p_ratio)
model.load_state_dict(torch.load(ckpt_load_path, map_location=torch.device('cpu')))
model.eval()

quantize_weight(model.conv1, q_bit_weight[0])
quantize_weight(model.conv2, q_bit_weight[1])
quantize_weight(model.fc1, q_bit_weight[2])
quantize_weight(model.fc2, q_bit_weight[3])

pruning_weight(model.conv1, p_ratio[0])
pruning_weight(model.conv2, p_ratio[1])
pruning_weight(model.fc1, p_ratio[2])
pruning_weight(model.fc2, p_ratio[3])

test_acc = test(model, test_loader, 0)
print(f"Maximum Test Accuracy = {test_acc:.4f}")


I want to prune the weights of a specific layer by setting the lowest-magnitude 20% of its weights to zero, and then assess how robust each layer is to pruning; the sweep I have in mind is sketched below.
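Concretely, the sweep looks like this (just a sketch built on the functions above; using `copy.deepcopy` to restore the trained weights between layers is my own assumption about how to isolate each layer):

```python
import copy

# Prune one layer at a time from a fresh copy of the trained model,
# then re-run the test set to see which layer is most sensitive.
for name in ["conv1", "conv2", "fc1", "fc2"]:
    pruned_model = copy.deepcopy(model)               # reset to loaded weights
    pruning_weight(getattr(pruned_model, name), 0.2)  # zero the lowest 20%
    acc = test(pruned_model, test_loader, 0)
    print(f"pruned {name} by 20% -> accuracy {acc:.2f}%")
```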

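For reference, this is roughly how I time the model; the timing code is not in the listing above, so treat `time_inference` below as a sketch of my measurement setup rather than the exact code I ran:

```python
import time

import torch

def time_inference(model, test_loader, warmup=2, runs=5):
    """Average wall-clock seconds for one full pass over the test set."""
    model.eval()
    with torch.no_grad():
        for _ in range(warmup):       # warm-up passes, not timed
            for data, _ in test_loader:
                model(data)
        start = time.perf_counter()
        for _ in range(runs):
            for data, _ in test_loader:
                model(data)
        return (time.perf_counter() - start) / runs

# compared before vs. after quantize_weight / pruning_weight:
# print(time_inference(model, test_loader))
```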