I am using VGG16 to classify CIFAR-10 but the accuracy is not improving

I trained this VGG16 model to classify CIFAR-10, but the accuracy is not improving even though the training loss (loss_values in the code below) drops well. How can I improve the accuracy? I have seen papers that reach 96% or more.

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt


class VGG16(nn.Module):
    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()

        self.block1_output = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.block2_output = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.block3_output = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.block4_output = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.block5_output = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.classifier = nn.Sequential(
            # Much smaller than the original VGG16 head (25088 -> 4096 -> 4096 -> 1000),
            # since the 32x32 input leaves only a 512x1x1 feature map after block5.
            nn.Linear(512, 512),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(512, 32),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(32, num_classes),
        )

    def forward(self, x):
        x = self.block1_output(x)
        x = self.block2_output(x)
        x = self.block3_output(x)
        x = self.block4_output(x)
        x = self.block5_output(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x
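
For a 32x32 CIFAR-10 input, the five max-pool layers halve the spatial size 32 -> 16 -> 8 -> 4 -> 2 -> 1, so block5_output emits an (N, 512, 1, 1) tensor and the flattened vector entering the classifier has exactly 512 features, which is why nn.Linear(512, 512) works here. A quick shape check (a minimal sketch, assuming the class and imports above):

model = VGG16()
dummy = torch.zeros(2, 3, 32, 32)  # fake batch of two CIFAR-10 images
print(model(dummy).shape)          # expected: torch.Size([2, 10])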

def train(model, train_loader, device):
    # cross-entropy loss
    criterion = nn.CrossEntropyLoss()
    model.train()
    # AdamW optimizer
    optimizer = optim.AdamW(model.parameters(), lr=0.004)
    loss_values = []
    for epoch in range(100):
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs.to(device))
            loss = criterion(outputs, labels.to(device))
            # backpropagation
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if i % 500 == 499:
                # record the average loss over the last 500 batches
                # (one full epoch at batch size 100 on 50000 images)
                loss_values.append(running_loss / 500)
                running_loss = 0.0
    plt.plot(loss_values)
    plt.show()
    return model
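
Since the training loss keeps dropping while the final accuracy stalls, logging validation accuracy once per epoch would show whether the model is overfitting. A minimal sketch of such a loop (evaluate is a hypothetical helper with the same logic as the val function below, except that it returns the accuracy instead of printing it):

def evaluate(model, loader, device):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for images, labels in loader:
            preds = model(images.to(device)).argmax(dim=1)
            correct += (preds == labels.to(device)).sum().item()
            total += labels.size(0)
    return 100.0 * correct / total

def train_with_val(model, train_loader, val_loader, device, epochs=100):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.004)
    for epoch in range(epochs):
        model.train()  # re-enable dropout/batchnorm updates after evaluate()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs.to(device)), labels.to(device))
            loss.backward()
            optimizer.step()
        print('epoch %d: val accuracy %.1f %%'
              % (epoch + 1, evaluate(model, val_loader, device)))
    return model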

def val(model, val_dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in val_dataloader:
            images, labels = data
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network on images: %d %%' % (100 * correct / total))
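
A per-class breakdown of the same predictions can also show whether a few hard classes (cat vs. dog is a classic CIFAR-10 confusion) are dragging the average down. A minimal sketch along the lines of val above:

def val_per_class(model, val_dataloader, device, num_classes=10):
    model.eval()
    correct = [0] * num_classes
    total = [0] * num_classes
    with torch.no_grad():
        for images, labels in val_dataloader:
            labels = labels.to(device)
            predicted = model(images.to(device)).argmax(dim=1)
            for c in range(num_classes):
                mask = labels == c
                total[c] += mask.sum().item()
                correct[c] += (predicted[mask] == c).sum().item()
    for c in range(num_classes):
        print('class %d: %.1f %%' % (c, 100.0 * correct[c] / max(total[c], 1)))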

if __name__ == '__main__':
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = VGG16().to(device)
    # ToTensor converts the image to a tensor and scales RGB values from 0-255 to 0-1;
    # Normalize standardizes each channel with mean and std hard-coded to 0.5
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # download the training data
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=100, shuffle=True, num_workers=0)
    # download the test data
    testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
    val_dataloader = torch.utils.data.DataLoader(testset, batch_size=10, shuffle=True, num_workers=0)
    print("VGG16")
    model = train(model,train_loader,device)
    
    val(model,val_dataloader,device)
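
One clear difference from the setups that reach ~96% is the input pipeline: those models are usually trained with random crops and horizontal flips, and normalized with per-channel CIFAR-10 statistics instead of a flat 0.5. A sketch of such a training transform (the mean/std values are the commonly quoted CIFAR-10 statistics, not something recomputed here; the test set should keep only ToTensor + Normalize, without the augmentation):

train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),   # pad by 4, then crop back to 32x32
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),   # commonly quoted CIFAR-10 mean
                         (0.2470, 0.2435, 0.2616)),  # and per-channel std
])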

This plot shows how the training loss (loss_values) evolves:

[Plot: training loss curve, which drops steadily over the 100 epochs]

Output:

Accuracy of the network on images: 76 %