Why does my model sometimes not train?

Hi, I am trying to find the best dropout value for my model, and I use k-fold cross-validation for it. Sometimes the model does not train at all, but if I retrain on the same fold, with the same data and the same everything, the problem disappears. The same thing also happens if I swap the dropout for batch norm.
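
Since nothing changes between a failing run and a good rerun, I assume the difference comes from the random weight initialization and/or the shuffling order. Below is a small sketch of how I could fix all the seeds to try to reproduce a bad run (the exact seed value does not matter), calling it once right before train_model in every fold:

import random

import numpy as np
import torch

def seed_everything(seed=0):
    # fix every RNG involved so that two runs on the same fold start identically
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # optional: make cuDNN deterministic (slower, but removes one more source of randomness)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False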
Here is my training loop:

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import MultiStepLR
from tqdm import tqdm

def train_model(dataloader_train, drpt):

    model = TorchModel(drpt)
    model = model.to(Config.device)
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1.0e-2, weight_decay=1e-3)

    epochs = tqdm(range(70), leave=True)
    epochs.set_description("Epoch")
    epochs_loss = []
    # scheduler = StepLR(optimizer, step_size=40, gamma=0.1)
    scheduler = MultiStepLR(optimizer, milestones=[25], gamma=0.1)
    for epoch in epochs:
        running_loss = 0.0
        for data in dataloader_train:
            X = data["X"].to(Config.device)
            y = data["y"].to(Config.device)
            optimizer.zero_grad()
            preds = model(X)
            loss_value = loss(preds, y)
            loss_value.backward()
            optimizer.step()
            # CrossEntropyLoss already averages over the batch
            running_loss += loss_value.item() / y.shape[0]
        scheduler.step()
        epochs.set_postfix(
            epoch=epoch, loss=running_loss, lr=scheduler.get_last_lr()[0]
        )
        epochs_loss.append(loss_value.item())  # loss of the last batch of the epoch

    return model, epochs_loss

And here is my k-fold loop:

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader

X_train, y_train = get_df(path_train)
columns = X_train.columns
kfolds = KFold(n_splits=5)
splits = kfolds.split(X_train, y_train)
result = []
for train_index, valid_index in splits:
    X_tr, X_val = X_train[columns].iloc[train_index], X_train[columns].iloc[valid_index]
    y_tr, y_val = y_train[train_index], y_train[valid_index]

    dataset_train = CustomDataset(X_tr, y_tr, 'Train')
    dataset_val = CustomDataset(X_val, y_val, 'Val')

    dataloader_train = DataLoader(dataset_train, batch_size=128, shuffle=True, num_workers=0, pin_memory=True)
    dataloader_val = DataLoader(dataset_val, batch_size=128, shuffle=False, num_workers=0, pin_memory=True)

    model, epochs_loss = train_model(dataloader_train, drpt)
    recall, y_pred_valid = get_recall(model, dataloader_val, y_val)
    cm = confusion_matrix(y_true=y_val, y_pred=y_pred_valid)
    result.append({
        'recall': recall,
        'cm': cm,
        'epochs_loss': epochs_loss,
        # 'model': model,
    })

Sometimes I get results like this (note the third fold, where the loss stays high and the recall is only 0.1491):

Epoch: 100%|██████████| 70/70 [00:57<00:00,  1.22it/s, epoch=69, loss=0.0194, lr=0.001]
tensor(0.9462)
Epoch: 100%|██████████| 70/70 [01:02<00:00,  1.11it/s, epoch=69, loss=0.0204, lr=0.001]
tensor(0.9627)
Epoch: 100%|██████████| 70/70 [00:58<00:00,  1.20it/s, epoch=69, loss=0.537, lr=0.001]
tensor(0.1491)
Epoch: 100%|██████████| 70/70 [00:55<00:00,  1.27it/s, epoch=69, loss=0.0262, lr=0.001]
tensor(0.9555)
Epoch: 100%|██████████| 70/70 [1:02:23<00:00, 53.48s/it, epoch=69, loss=0.0214, lr=0.001]    
tensor(0.9493)

How can I fix this problem? Maybe I need gradient clipping or something similar?

My model is simple: it consists of Conv1d, ReLU, Dropout1d, … and Linear layers.

Update: I tried to use

torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2, error_if_nonfinite=True)

but it does not help.
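
For context, gradient clipping is supposed to go between loss_value.backward() and optimizer.step(). Here is a minimal self-contained sketch of one training step with clipping; the tiny linear model and the random batch are only stand-ins so the snippet runs on its own:

import torch
import torch.nn as nn

# stand-in model and batch, only so this snippet is runnable by itself
model = nn.Linear(8, 3)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=1e-3)
X = torch.randn(128, 8)
y = torch.randint(0, 3, (128,))

optimizer.zero_grad()
preds = model(X)
loss_value = loss(preds, y)
loss_value.backward()
# clip right after backward() and before the optimizer update
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2, error_if_nonfinite=True)
optimizer.step()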

Maybe the problem is that the learning rate is too large, but I am not sure yet.
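
If it really is the learning rate, the next thing I plan to try is a smaller starting lr plus a short warm-up before the usual step decay. A rough sketch of what I mean, where the warm-up length and lr values are just guesses and nn.Linear again stands in for the real model:

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import LinearLR, MultiStepLR, SequentialLR

model = nn.Linear(8, 3)  # stand-in for TorchModel, only so the snippet runs

# 10x smaller starting lr than the 1e-2 I use now
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)

# linear warm-up over the first 5 epochs, then hand over to the step decay
warmup = LinearLR(optimizer, start_factor=0.1, total_iters=5)
decay = MultiStepLR(optimizer, milestones=[25], gamma=0.1)
scheduler = SequentialLR(optimizer, schedulers=[warmup, decay], milestones=[5])

# scheduler.step() is still called once per epoch, exactly as in train_model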