Validation loss and training loss stay the same in every epoch

Neither the validation loss nor the training loss is decreasing. I tried applying L2 regularization in the form of a weight decay of 0.05, and I have also removed it and trained again in case that was the reason.
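
For reference, the weight decay was passed to the optimizer roughly like this (a sketch; weight_decay=0.05 on the RMSprop call is my reconstruction of the line that has since been removed from the code below):

    # L2 regularization via weight decay (since removed again)
    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.99,
                                    eps=1e-08, momentum=0.9, weight_decay=0.05)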

custom_dataset.py

import os
import cv2
import torch
from torch.utils.data import Dataset
from torchvision import transforms


class CustomDataset(Dataset):

        def __init__(self, root_folder_path):

            self.root_folder_path = root_folder_path
            self.image_files = []
            self.labels = []


            # Collect image paths and corresponding labels

            folders = sorted([f for f in os.listdir(root_folder_path) if os.path.isdir(os.path.join(root_folder_path, f))])
            self.label_dict = {folder: i for i, folder in enumerate(folders)}


            for folder in folders:

                folder_path = os.path.join(root_folder_path, folder)
                image_files = sorted([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.jpg')])
                self.image_files.extend([os.path.join(folder_path, img) for img in image_files])
                self.labels.extend([self.label_dict[folder]] * len(image_files))


            self.transform = transforms.Compose([
                transforms.ToPILImage(),
                transforms.Resize((900, 300)),
                transforms.Grayscale(),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5], std=[0.5])
            ])

            

        def __len__(self):

            return len(self.image_files)


        def __getitem__(self, idx):

            image_path = self.image_files[idx]
            label = self.labels[idx]
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            image = self.transform(image)
            #print("Image shape:", image.shape)  # Print the shape of the image
            one_hot_label = torch.zeros(len(self.label_dict))
            one_hot_label[label] = 1

            return image, one_hot_label

Main script:

import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from custom_dataset import CustomDataset

if __name__ == '__main__':
    
    
    # Instantiate the custom dataset
    root_folder_path = r'W:\MASTER_BAGCHI_SCHALDACH\THESIS\code and dataset\image_dataset_300_900_10_classes'
    dataset = CustomDataset(root_folder_path)

    print("Labels:", sorted(dataset.label_dict.keys()))
    print("Total number of labels:", len(dataset.label_dict))


    # Display some images from each folder
    n_images_to_display = 4
    n_folders = len(dataset.label_dict)
    fig, ax = plt.subplots(n_images_to_display, n_folders, figsize=(n_folders * 4, n_images_to_display * 4))

    for i, (folder, label) in enumerate(dataset.label_dict.items()):
        folder_images = [dataset[i][0] for i, lbl in enumerate(dataset.labels) if lbl == label]
        indices_to_display = random.sample(range(len(folder_images)), min(n_images_to_display, len(folder_images)))
        for j, ind in enumerate(indices_to_display):
            ax[j, i].imshow(folder_images[ind].squeeze(), cmap='gray')  # Squeeze to remove the channel dimension for grayscale images
            ax[j, i].axis('off')
        ax[0, i].set_title(folder, fontsize=30)

    fig.tight_layout(pad=0, w_pad=0, h_pad=0)
    plt.show()

  

    from torch.utils.data import DataLoader, Subset
    from sklearn.model_selection import train_test_split

    TEST_SIZE = 0.2
    BATCH_SIZE = 32
    SEED = 42

    # Get the labels from the dataset
    labels = np.array([label for _, label in dataset])
    

    # generate indices: split on integer indices rather than on the data itself
    train_indices, test_indices, _, _ = train_test_split(
        range(len(dataset)),
        labels,
        stratify=labels,
        test_size=TEST_SIZE,
        random_state=SEED
    )

    # generate subset based on indices
    train_split = Subset(dataset, train_indices)
    test_split = Subset(dataset, test_indices)
    print('Length of train_batch:',len(train_split))
    print('Length of test_batch:',len(test_split))

   
    # create batches
    train_loader = DataLoader(train_split, batch_size=BATCH_SIZE, num_workers=6,shuffle=True,pin_memory=True)
    test_loader = DataLoader(test_split, batch_size=BATCH_SIZE,num_workers=6,pin_memory=True)

    for batch in train_loader:
        images, labels = batch
        #print('Train batch size:', images.size())
        #print('Shape of labels array:',labels.size())

    for batch in test_loader:
        images, labels = batch
        #print('Test batch size:', images.size())
        #print('Shape of labels array:',labels.size())
    
    class ImageClassificationBase(nn.Module):
        
        def training_step(self, batch):
            images, labels = batch 
            out = self(images)                  # Generate predictions
            loss = F.cross_entropy(out, labels) # Calculate loss
            return loss
        
        def accuracy(self,outputs, labels):
            #_, preds = torch.max(outputs, dim=1)
            preds = torch.argmax(outputs, dim=1)
            #preds_one_hot = F.one_hot(preds, num_classes=labels.shape[1])  # Convert predictions to one-hot encoding
            #print("Shape of preds:", preds.shape)  # Check the shape of preds
            #correct=(preds_one_hot == labels).float().sum() # Count the number of correct predictions
            correct = (preds == torch.argmax(labels, dim=1)).float().sum()  # Count the number of correct predictions
            total = len(labels)  # Total number of samples
            acc = correct / total  # Calculate accuracy
            return acc
            #return torch.sum(preds_one_hot == labels).float().mean()          
                   
           
        def validation_step(self, batch):
            images, labels = batch 
            out = self(images)                    # Generate predictions
            loss = F.cross_entropy(out, labels)   # Calculate loss
            acc = self.accuracy(out, labels)           # Calculate accuracy
            #batch_size = labels.shape[0]
            #acc = self.accuracy(out, labels, batch_size)
            return {'val_loss': loss.detach(), 'val_acc': acc}
            
        def validation_epoch_end(self, outputs):
            batch_losses = [x['val_loss'] for x in outputs]
            epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
            batch_accs = [x['val_acc'] for x in outputs]
            epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
            return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}
        
        def epoch_end(self, epoch, result):
            print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
                epoch, result['train_loss'], result['val_loss'], result['val_acc']))
            

    import torch.nn.init as init
    class ImageClassification(ImageClassificationBase):
        def __init__(self):
            super().__init__()
            self.network = nn.Sequential(
                #image size is [1,900,300] as [channel, height,width]
                nn.Conv2d(1, 32, kernel_size = 3, padding = 1),
                nn.LeakyReLU(0.01),
                nn.BatchNorm2d(32),
                nn.AvgPool2d(kernel_size=2, stride=2),

                nn.Conv2d(32,32, kernel_size = 3,  padding = 1),
                nn.LeakyReLU(0.01),
                nn.BatchNorm2d(32),
                nn.AvgPool2d(kernel_size=2, stride=2),
            
                nn.Conv2d(32, 64, kernel_size = 3, padding = 1),
                nn.LeakyReLU(0.01),
                nn.BatchNorm2d(64),
                nn.AvgPool2d(kernel_size=2, stride=2),
            
                nn.Conv2d(64 ,64, kernel_size = 3, padding = 1),
                nn.LeakyReLU(0.01),
                nn.BatchNorm2d(64),
                nn.AvgPool2d(kernel_size=2, stride=2),
                                    
                nn.Flatten(),                
                nn.Dropout(0.3),

                nn.Linear(64 * 56 * 18, 64),  # Assuming input size after convolutional layers is 64 * 56 * 18
                nn.LeakyReLU(0.01),
                nn.BatchNorm1d(64),
                nn.Dropout(0.2),
            
                nn.Linear(64, 64),
                nn.LeakyReLU(0.01),
                nn.BatchNorm1d(64),
                nn.Dropout(0.2),
            
                nn.Linear(64, 10)  # Output layer
            )
            # Initialize the weights of convolutional layers
            self._initialize_weights()

        def _initialize_weights(self):
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
        
        def forward(self, xb):
            return self.network(xb)

    def get_default_device():
        #Set Device to GPU or CPU
        if torch.cuda.is_available():
            return torch.device('cuda')
        else:
            return torch.device('cpu')
        

    def to_device(data, device):
        "Move data to the device"
        if isinstance(data,(list,tuple)):
            return [to_device(x,device) for x in data]
        return data.to(device,non_blocking = True)

    class DeviceDataLoader():
        #Wrap a dataloader to move data to a device
        
        def __init__(self, dl, device):
            self.dl = dl
            self.device = device
        
        def __iter__(self):
            #Yield a batch of data after moving it to device
            for b in self.dl:
                yield to_device(b,self.device)
                
        def __len__(self):
            #Number of batches
            return len(self.dl)

    device = get_default_device()
    print(device)

    torch.cuda.empty_cache()
    model = ImageClassification()

    random_seed = 42
    torch.manual_seed(random_seed)

    train_loader = DeviceDataLoader(train_loader, device)
    test_loader = DeviceDataLoader(test_loader, device)

    to_device(model, device)

    @torch.no_grad()
    def evaluate(model, test_loader):
        model.eval()
        outputs = [model.validation_step(batch) for batch in test_loader]
        return model.validation_epoch_end(outputs)
        
    
    # Define the RMSprop optimizer
    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.99, eps=1e-08, momentum=0.9)
    from torch.optim.lr_scheduler import LambdaLR

    # Define the custom scheduler function
    def lr_schedule(epoch, lr):
        if epoch < 10:
            return lr
        else:
            return lr * torch.exp(torch.tensor(-0.1))

    # Create a LambdaLR scheduler using the custom function
    scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: lr_schedule(epoch, lr=0.001))

    
    def fit(epochs, model, train_loader, test_loader, optimizer):
        history = []
        for epoch in range(epochs):
            # Training Phase 
            model.train()
            train_losses = []

            correct_train = 0
            total_train = 0

            for batch in train_loader:
                #images,labels = batch
                #out = model(images)
                #loss = F.cross_entropy(out,labels)
                loss = model.training_step(batch)
                #train_losses.append(loss.item())
                train_losses.append(loss)

                # Calculate training accuracy
                #preds = torch.argmax(out, dim=1)
                #correct_train += (preds == torch.argmax(labels, dim=1)).sum().item()
                #total_train += labels.size(0)

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            scheduler.step()
            # Validation phase
            result = evaluate(model, test_loader)
            result['train_loss'] = torch.stack(train_losses).mean().item()
            #result['train_loss'] = torch.tensor(train_losses).mean().item()
            #result['train_acc'] = correct_train / total_train
            model.epoch_end(epoch, result)
            history.append(result)
        return history

    model=to_device(ImageClassification(),device)

    # Initial evaluation of the model
    initial_result = evaluate(model, test_loader)
    accuracy_percentage = initial_result['val_acc'] * 100
    print('Initial Test Loss: {:.4f}, Initial Test Accuracy: {:.4f}%'.format(initial_result['val_loss'], accuracy_percentage))

    # set the number of epochs
    num_epochs = 10

    # fit the model on the training data and record the result after each epoch
    history = fit(num_epochs, model, train_loader, test_loader, optimizer)

Output:

Labels: ['120', '144', '168', '192', '216', '24', '240', '48', '72', '96']
Total number of labels: 10
Length of train_batch: 1835
Length of test_batch: 459
Initial Test Loss: 2.3067, Initial Test Accuracy: 9.5833%
Epoch [0], train_loss: 2.4796, val_loss: 2.3525, val_acc: 0.1229
Epoch [1], train_loss: 2.4644, val_loss: 2.3175, val_acc: 0.1188
Epoch [2], train_loss: 2.4614, val_loss: 2.3247, val_acc: 0.1083
Epoch [3], train_loss: 2.4695, val_loss: 2.3192, val_acc: 0.1167
Epoch [4], train_loss: 2.4771, val_loss: 2.3155, val_acc: 0.1292
Epoch [5], train_loss: 2.4994, val_loss: 2.3175, val_acc: 0.1292
Epoch [6], train_loss: 2.4528, val_loss: 2.3189, val_acc: 0.1125
Epoch [7], train_loss: 2.4887, val_loss: 2.3146, val_acc: 0.1331
Epoch [8], train_loss: 2.4908, val_loss: 2.3149, val_acc: 0.1208
Epoch [9], train_loss: 2.4809, val_loss: 2.3195, val_acc: 0.1208

I have also printed the gradients to check whether they are changing between batches, and indeed they are.
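
The gradient values below were produced with a loop along these lines, run right after loss.backward() (a sketch; that the numbers are summed absolute gradients per parameter is an assumption, the exact reduction may differ):

    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f"Gradient - {name}: {param.grad.abs().sum().item()}")

The printed values for the first few batches: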

Gradient - network.4.weight: 1972.0595703125
Gradient - network.4.bias: 65.66154479980469
Gradient - network.6.weight: 39.97140121459961
Gradient - network.6.bias: 16.263952255249023
Gradient - network.8.weight: 1124.482177734375
Gradient - network.8.bias: 29.2763729095459
Gradient - network.10.weight: 26.414426803588867
Gradient - network.10.bias: 12.346122741699219
Gradient - network.12.weight: 1166.0787353515625
Gradient - network.12.bias: 21.25786590576172
Gradient - network.14.weight: 21.803403854370117
Gradient - network.14.bias: 6.328159332275391
Gradient - network.18.weight: 8477.4990234375
Gradient - network.18.bias: 8.84211254119873
Gradient - network.20.weight: 20.904268264770508
Gradient - network.20.bias: 4.960602283477783
Gradient - network.22.weight: 295.7684020996094
Gradient - network.22.bias: 7.291604995727539
Gradient - network.24.weight: 32.42948913574219
Gradient - network.24.bias: 7.2995924949646
Gradient - network.26.weight: 315.2677307128906
Gradient - network.26.bias: 13.069473266601562
Gradient - network.0.weight: 879.0718994140625
Gradient - network.0.bias: 633.0789184570312
Gradient - network.2.weight: 85.5080795288086
Gradient - network.2.bias: 43.94591522216797
Gradient - network.4.weight: 1975.424072265625
Gradient - network.4.bias: 65.96759033203125
Gradient - network.6.weight: 40.16535186767578
Gradient - network.6.bias: 16.365644454956055
Gradient - network.8.weight: 1127.8577880859375
Gradient - network.8.bias: 29.406267166137695
Gradient - network.10.weight: 26.385618209838867
Gradient - network.10.bias: 12.453120231628418
Gradient - network.12.weight: 1168.868896484375
Gradient - network.12.bias: 21.41246223449707
Gradient - network.14.weight: 21.852977752685547
Gradient - network.14.bias: 6.353606224060059
Gradient - network.18.weight: 8493.0283203125
Gradient - network.18.bias: 8.883261680603027
Gradient - network.20.weight: 20.931377410888672
Gradient - network.20.bias: 4.95845365524292
Gradient - network.22.weight: 296.2810974121094
Gradient - network.22.bias: 7.28194522857666
Gradient - network.24.weight: 32.507911682128906
Gradient - network.24.bias: 7.35975456237793
Gradient - network.26.weight: 315.92791748046875
Gradient - network.26.bias: 13.163238525390625
Gradient - network.0.weight: 880.11669921875
Gradient - network.0.bias: 634.5462036132812
Gradient - network.2.weight: 85.60417175292969
Gradient - network.2.bias: 44.08628463745117
Gradient - network.4.weight: 1978.5992431640625
Gradient - network.4.bias: 66.08138275146484
Gradient - network.6.weight: 40.269412994384766
Gradient - network.6.bias: 16.405197143554688
Gradient - network.8.weight: 1131.3485107421875
Gradient - network.8.bias: 29.45978546142578
Gradient - network.10.weight: 26.44884490966797
Gradient - network.10.bias: 12.506694793701172
Gradient - network.12.weight: 1171.60302734375
Gradient - network.12.bias: 21.473613739013672
Gradient - network.14.weight: 21.900503158569336
Gradient - network.14.bias: 6.34891939163208
Gradient - network.18.weight: 8515.4375
Gradient - network.18.bias: 8.869505882263184
Gradient - network.20.weight: 20.98282241821289
Gradient - network.20.bias: 4.969213485717773
Gradient - network.22.weight: 296.9770812988281
Gradient - network.22.bias: 7.2847700119018555
Gradient - network.24.weight: 32.582645416259766
Gradient - network.24.bias: 7.3756022453308105
Gradient - network.26.weight: 316.69842529296875
Gradient - network.26.bias: 13.197066307067871
(The gradient magnitudes for the remaining batches look very similar, drifting only slightly from batch to batch, so they are omitted here.)

The final epoch of this run again shows the same behaviour:

Epoch [9], train_loss: 2.5131, val_loss: 2.3465, val_acc: 0.1142