How can I improve the training process?

Hi! I am working with torchaudio (SPEECHCOMMANDS). I studied the example in the documentation and started working on the idea of using a mel-spectrogram. How can I improve the effectiveness of the training process? During training I get accuracies of 13% - 12% - 13% - 13% - 12% …, but sometimes when I launch the script I get 14% - 17% - 21%. Maybe there is a mistake in my model?
Here is my code:
Can someone look at my code and tell me where I made a mistake? Should I use a different function (e.g. another optimizer)?

# Load the "training" and "testing" splits.
train_set = Data_Loader("training")
test_set = Data_Loader("testing")


# Unpack the first training example; its sample rate is reused below as the
# source rate of the resampler.
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]


# Gather every distinct label (element 2 of each datapoint) and sort them;
# a label's position in `labels` is used as its class index.
l = list({datapoint[2] for datapoint in train_set})
labels = sorted(l)

# Resampler from the rate of the first example down to 8 kHz, tried out on
# that same example.
new_sample_rate = 8000
transform = torchaudio.transforms.Resample(
    orig_freq=sample_rate,
    new_freq=new_sample_rate,
)
transformed = transform(waveform)

ipd.Audio(transformed.numpy(), rate=new_sample_rate)

def label_to_index(word):
    """Return the position of ``word`` in ``labels`` as a 0-dim tensor.

    Raises:
        ValueError: If ``word`` is not present in ``labels``.
    """
    position = labels.index(word)
    return torch.tensor(position)

def index_to_label(index):
    """Return the label stored at position ``index`` of ``labels``."""
    word = labels[index]
    return word

def pad_sequence(batch):
    """Zero-pad a list of 2-D tensors to a common length and stack them.

    Each item is transposed so that its second dimension comes first, the
    items are right-padded with zeros to the longest one, and the padded
    dimension is moved back to the last position of the stacked result.

    Args:
        batch: List of 2-D tensors whose last dimension may differ in size.

    Returns:
        A 3-D tensor holding one padded item per entry of ``batch``.
    """
    transposed = [item.t() for item in batch]
    padded = torch.nn.utils.rnn.pad_sequence(
        transposed,
        batch_first=True,
        padding_value=0.,
    )
    # Undo the per-item transpose on the stacked result.
    return padded.permute(0, 2, 1)

# Mel-spectrogram transform applied to the *resampled* audio. Its sample rate
# is taken from `new_sample_rate` so the mel filterbank always matches the
# output rate of the resampler instead of relying on a duplicated constant.
mel_spectogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=new_sample_rate,
    n_fft=1024,
    hop_length=512,
    n_mels=64,
)

def collate_fn(batch):
    """Turn a list of dataset items into a ``(features, targets)`` pair.

    Only the first element (waveform) and third element (label) of every item
    are used. Waveforms are zero-padded to a common length by ``pad_sequence``
    and converted by ``mel_process``; labels are mapped to class indices with
    ``label_to_index`` and stacked into one tensor.

    Args:
        batch: List of dataset items, each with at least three elements.

    Returns:
        Tuple ``(tensors, targets)``.
    """
    waveforms = []
    class_indices = []

    for waveform, _, label, *_ in batch:
        waveforms.append(waveform)
        class_indices.append(label_to_index(label))

    features = mel_process(pad_sequence(waveforms))
    targets = torch.stack(class_indices)

    return features, targets

def mel_process(tensors):
    """Resample every item of ``tensors`` and convert it to a mel-spectrogram.

    Each item obtained by iterating over ``tensors`` is passed through the
    module-level ``transform`` and then through ``mel_spectogram``; the
    results are stacked along a new leading dimension.

    Args:
        tensors: Iterable of waveforms (e.g. a batched tensor).

    Returns:
        A tensor with one spectrogram per input item.
    """
    spectrograms = [mel_spectogram(transform(clip)) for clip in tensors]
    return torch.stack(spectrograms)

batch_size = 256

# One worker and pinned memory when running on CUDA, otherwise load in the
# main process.
# NOTE(review): this comparison only matches when `device` is the plain string
# "cuda" -- confirm how `device` is defined.
num_workers, pin_memory = (1, True) if device == "cuda" else (0, False)


# Training batches are reshuffled on every pass.
train_loader = torch.utils.data.DataLoader(
    train_set,
    collate_fn=collate_fn,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=pin_memory,
)

# Evaluation keeps the dataset order and the final, possibly smaller, batch.
test_loader = torch.utils.data.DataLoader(
    test_set,
    collate_fn=collate_fn,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
    pin_memory=pin_memory,
)


class CNN(nn.Module):
    """Four-stage 2-D convolutional classifier that returns log-probabilities.

    Every stage is ``Conv2d(kernel_size=3, padding=1) -> ReLU -> MaxPool2d(2)``.
    The channel count grows 16 -> 32 -> 64 -> 128 while each pooling step
    halves both spatial dimensions. The flattened result feeds a single
    linear layer followed by ``log_softmax``.

    Args:
        n_input: Number of channels of the input tensor.
        n_output: Number of classes to predict. Set this to the number of
            distinct labels in the dataset.
        flat_features: Number of features entering the linear layer, i.e.
            ``128 * (height // 16) * (width // 16)``. The default
            ``128 * 4 * 1`` corresponds to ``64 x 16`` inputs.
    """

    def __init__(self, n_input=1, n_output=8, flat_features=128 * 4 * 1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=n_input, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.RL1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.RL2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(2)

        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.RL3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(2)

        self.conv4 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.RL4 = nn.ReLU()
        self.pool4 = nn.MaxPool2d(2)

        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(flat_features, n_output)

    def forward(self, x):
        """Classify a batch of 4-D inputs ``(batch, n_input, height, width)``.

        Returns:
            Log-probabilities of shape ``(batch, 1, n_output)``.
        """
        stages = (
            (self.conv1, self.RL1, self.pool1),
            (self.conv2, self.RL2, self.pool2),
            (self.conv3, self.RL3, self.pool3),
            (self.conv4, self.RL4, self.pool4),
        )
        for conv, activation, pool in stages:
            x = pool(activation(conv(x)))

        x = self.fc1(self.flatten(x))
        x = F.log_softmax(x, dim=1)
        # Callers expect a singleton dimension between batch and classes.
        return x.unsqueeze(1)

# Build the network on the target device and print a layer summary for a
# single-channel 64 x 16 input.
model = CNN().to(device)
summary(model, (1, 64, 16))
# NOTE(review): lr=0.01 is ten times Adam's documented default (1e-3); if the
# accuracy stays near chance level, a smaller value is worth trying first.
optimizer = optim.Adam(model.parameters(), lr=0.01)

def train(model, epoch, log_interval):
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``train_loader``, ``optimizer`` and ``device``.

    Args:
        model: Network whose output has a singleton dimension at index 1
            (``(batch, 1, classes)`` log-probabilities).
        epoch: Epoch number, used only in the progress message.
        log_interval: Print the current loss every ``log_interval`` batches.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device)
        output = model(data)

        # Drop only the singleton dimension at index 1. A bare squeeze() would
        # also remove the batch dimension when the final batch holds a single
        # sample, leaving shapes that nll_loss rejects.
        loss = F.nll_loss(output.squeeze(1), target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}")


def number_of_correct(pred, target):
    """Count the positions where ``pred`` (singleton dims removed) equals ``target``.

    Returns:
        The number of matching entries as a Python number.
    """
    flattened = torch.squeeze(pred)
    matches = torch.eq(flattened, target)
    return matches.sum().item()


def get_likely_index(tensor):
    """Return, along the last dimension, the index of the largest value."""
    return torch.argmax(tensor, dim=-1)

def test(model, epoch):
    """Evaluate ``model`` on ``test_loader`` and report its accuracy.

    Uses the module-level ``test_loader`` and ``device``.

    Args:
        model: Network to evaluate.
        epoch: Epoch number, used only in the printed summary.

    Returns:
        Accuracy over the whole test dataset, in percent.
    """
    model.eval()
    correct = 0
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every batch.
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)

            output = model(data)

            pred = get_likely_index(output)
            correct += number_of_correct(pred, target)

    total = len(test_loader.dataset)
    accuracy = 100. * correct / total
    print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{total} ({accuracy:.0f}%)\n")
    return accuracy
    
    
log_interval = 20
n_epoch = 500

losses = []

# NOTE: `transform` deliberately stays on the CPU. It is applied inside
# `collate_fn` (through `mel_process`) to batches that have not been moved to
# `device` yet, so relocating the resampler to a GPU would make that call fail
# with a device mismatch.
accuracy = []

# Train for `n_epoch` epochs, recording the test accuracy after each one.
for epoch in range(1, n_epoch + 1):
    train(model, epoch, log_interval)
    acc = test(model, epoch)
    accuracy.append(acc)

# Plot test accuracy (percent) against the zero-based epoch index.
x = list(range(len(accuracy)))
plt.plot(x, accuracy)
plt.show()