High loss when resuming training

Hi all,

I am getting a high train loss when I try to resume training from the saved .pth file.

Please let me know what could be the reason for it.

You could have forgotten to load the state_dict of the optimizer after restoring the training, which could cause the training to diverge. If that’s not the case, you might have changed the preprocessing of the data etc., so that the model sees “new” samples.
Without knowing more details, these would be my guesses.
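
For reference, restoring a checkpoint usually means rebuilding the model and optimizer first and then loading both state_dicts; a minimal sketch (the model, hyperparameters, and file name below are placeholders, not the ones from this thread):

import torch
import torch.nn as nn
from torch import optim

model = nn.Linear(10, 1)  # placeholder model
optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

# Save both state_dicts in one checkpoint file
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, 'checkpoint.pth')

# Resume: build the model and optimizer again, then restore both states
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])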

Hi ptrblck,

I am loading the state_dict of the optimizer as well. There is some randomness in the data preparation.

Could that be the reason?

If the randomness is introduced only after resuming the training, it could be the reason.
However, if the input tensors use the same data augmentation / transformation, it shouldn’t increase the loss unexpectedly.
I would recommend checking the model with a static input tensor, e.g. torch.ones, before saving and after loading the model to make sure the output is the same (call model.eval() to disable e.g. dropout layers). If these outputs don’t match (up to floating point precision), the model loading itself seems to fail.
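
For concreteness, a minimal sketch of that check (the model and input shape below are placeholders; use your own model and an input of the matching shape):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 10), nn.Dropout(0.5), nn.Linear(10, 1))  # placeholder

model.eval()            # disable dropout so the check is deterministic
x = torch.ones(1, 10)   # static input tensor

with torch.no_grad():
    out_before = model(x)

torch.save(model.state_dict(), 'model.pth')
model.load_state_dict(torch.load('model.pth'))
model.eval()

with torch.no_grad():
    out_after = model(x)

print(torch.allclose(out_before, out_after))  # expected: True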

The randomness in the data preparation was already present. There are no torch.ones used in the model. When I call model.eval(), the val results are reproducible, but the loss is high when I call model.train().

When continuing training, try lowering the starting learning rate. What optimizer are you using?
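
A minimal sketch of one way to do that after restoring the optimizer (the 0.1 factor is arbitrary and the model/optimizer are placeholders):

import torch.nn as nn
from torch import optim

model = nn.Linear(10, 1)  # placeholder
optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])  # after restoring

# Scale every param group's learning rate down before resuming
for param_group in optimizer.param_groups:
    param_group['lr'] *= 0.1

print(optimizer.param_groups[0]['lr'])  # now ~1e-3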

@J_Johnson I am using SGD:

optimizer = optim.SGD(centerface.parameters(), lr=1e-2, momentum=0.9, weight_decay=0.0005)
exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[30, 90, 140], gamma=0.1)
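
A side note based only on the snippet above (not on anything confirmed in the thread): if exp_lr_scheduler is recreated from scratch when resuming instead of being restored from the checkpoint, its milestone countdown starts again at epoch 0 and the learning-rate schedule no longer matches the original run. A sketch of round-tripping the scheduler state as well:

import torch
import torch.nn as nn
from torch import optim
from torch.optim import lr_scheduler

net = nn.Linear(8, 1)  # stand-in for centerface
optimizer = optim.SGD(net.parameters(), lr=1e-2, momentum=0.9, weight_decay=0.0005)
exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[30, 90, 140], gamma=0.1)

# ... training happens here, calling exp_lr_scheduler.step() once per epoch ...

# Save the scheduler state next to the model/optimizer state_dicts
torch.save({'scheduler_state_dict': exp_lr_scheduler.state_dict()}, 'sched.pth')

# On resume, rebuild optimizer and scheduler, then restore the saved state
state = torch.load('sched.pth')
exp_lr_scheduler.load_state_dict(state['scheduler_state_dict'])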

Hi, I am having the same problem. I save the model and the optimizer state_dicts. Then I load them both, and if I call model.eval() and compute the test loss, it’s the same as before; but when I try to resume training after model.train(), the train loss is back to 1 and it doesn’t improve with more epochs. I’ve tried changing the learning rate, but the loss is still the same.

I am using the same data as I did when first training the model.

For saving:


    if attribute == 'arousal':
        idx = 0
    elif attribute == 'dominance':
        idx = 1
    else:
        idx = 2 # Valence
    
    print("Attribute: ", attribute , "Idx: ", idx, "\n")
    
    # Move the model to the GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print("Running in device: ",device)

    # Training loop
    best_val_loss = float('inf')
    best_model = None
    epochsNotImproving = 0

    patience = 30

    for epoch in range(num_epochs):
        
        model.train()
        train_loss = 0.0

        print("\nEpoch: ",epoch +1 )

        for i, (inputs, labels) in enumerate(train_loader): 
            labels = labels[idx] # Gets the labels for arousal, dominance, or valence only
            inputs, labels = inputs.to(device), labels.to(device)

            if i%100 == 0: print("Training: ",i)

            # Zero the parameter gradients
            optimizer.zero_grad()
            outputs = model(inputs) # forward 
            loss = ccc_loss(labels, outputs) 
            # print(f"Loss {i}",loss.item())
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        print(f"Training: {epoch+1} finished")

        # Evaluation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for i, (inputs, labels) in enumerate(develop_loader):
                labels = labels[idx]
                if i%100 == 0: print("Validating: ", i)
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = ccc_loss(labels, outputs)
                val_loss += loss.item()

            val_loss /= len(develop_loader)
            print(f"Validating: {epoch+1} finished")

        # Print training and evaluation loss
        print(f"Epoch {epoch+1}/{200}, train loss: {train_loss:.4f}, val loss: {val_loss:.4f}")
        
        # Check for early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochsNotImproving = 0
            best_model = model.state_dict()
            best_optimizer = optimizer.state_dict()
            torch.save({'model_state_dict': best_model, 'optimizer_state_dict' : best_optimizer}, savePath)
            print("Model saved because it is the best until now!")
        else:
            epochsNotImproving +=1 
            if epochsNotImproving >= patience:
                print(f"Early stopping triggered. No improvement in the last {patience} epochs.")
                break
        
        print("Epochs not improving: ",epochsNotImproving)

    # Evaluate on the test set
    checkpoint = torch.load(savePath, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    model.eval()

    test_loss = 0.0
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(test_loader):
            labels = labels[idx]
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = ccc_loss(labels, outputs)
            test_loss += loss.item()
        test_loss /= len(test_loader)
    print(f"\nTest loss: {test_loss:.4f}")

# Loss function: 1 - concordance correlation coefficient (CCC)
def ccc_loss(x, y):

    std_x = torch.std(x)
    std_y = torch.std(y)
    mean_x = torch.mean(x)
    mean_y = torch.mean(y)

    # Covariance of x and y; note that only y is squeezed, so the argument
    # order matters if x and y have different shapes (e.g. (N,) vs (N, 1))
    cov = torch.mean((x - x.mean()) * (y.squeeze() - y.squeeze().mean()))
    # Pearson correlation coefficient
    ro = cov / (std_x * std_y)

    # CCC = 2 * ro * std_x * std_y / (std_x^2 + std_y^2 + (mean_x - mean_y)^2)
    loss = 1 - 2 * ro * std_x * std_y / (std_x**2 + std_y**2 + (mean_x - mean_y)**2)
    return loss
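
As a quick sanity check on the loss function itself (a sketch that only uses ccc_loss as posted above): identical predictions and targets drive the loss towards 0, while uncorrelated predictions give a loss close to 1, because the concordance term vanishes.

import torch

torch.manual_seed(0)
labels = torch.randn(1000)

print(ccc_loss(labels, labels).item())              # close to 0: perfect agreement
print(ccc_loss(labels, torch.randn(1000)).item())   # close to 1: no correlation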

For loading:

    # Attribute choice
    if attribute == 'arousal':
        idx = 0
    elif attribute == 'dominance':
        idx = 1
    else:
        idx = 2  # Valence

    patience = 15
    model = NeuralNetwork(dropout_rate).double()

    # Move the model to the GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print("Running on device:", device)
    checkpoint = torch.load(modelPath, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer = optim.SGD(model.parameters(), learning_rate, momentum=0.9)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    model.eval()

    test_loss = 0.0
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(test_loader):
            labels = labels[idx]
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = ccc_loss(labels, outputs)
            print(f"Loss {i}",loss.item())
            test_loss += loss.item()
        test_loss /= len(test_loader)
    print(f"Test loss: {test_loss:.4f}")

    # Training loop
    best_dev_loss = float('inf')
    epochs_not_improving = 0

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        print("Epoch:", epoch + 1)
        # print(f"\nWeight layer 1 before epoch {epoch +1 }",model.layer1.weight)
        # print(f"Weight output layer before epoch {epoch +1 }", model.output_layer.weight, "\n")

        for i, (inputs, labels) in enumerate(adaptation_loader):
            labels = labels[idx]
            inputs, labels = inputs.to(device), labels.to(device) 

            if i % 100 == 0: print("Training:", i)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = ccc_loss(outputs, labels)
            print(loss.item())
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(adaptation_loader)
        print(f"Training: {epoch + 1} finished")

        # Evaluation
        model.eval()
        dev_loss = 0.0
        with torch.no_grad():
            for i, (inputs, labels) in enumerate(development_loader):
                labels = labels[idx]
                if i % 100 == 0: print("Validating:", i)
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = ccc_loss(outputs, labels)
                dev_loss += loss.item()

            dev_loss /= len(development_loader)
            print(f"Validating: {epoch + 1} finished")

        # Print training and evaluation loss
        print(f"Epoch {epoch + 1}/{num_epochs}, train loss: {train_loss:.4f}, val loss: {dev_loss:.4f}")

        # Early stopping
        if dev_loss < best_dev_loss:
            best_dev_loss = dev_loss
            epochs_not_improving = 0
            best_model = model.state_dict()
            torch.save(best_model, savePath)
            print("Model saved because it is the best until now!")
        else:
            epochs_not_improving += 1

        if epochs_not_improving >= patience:
            print(f"Early stopping triggered. No improvement in the last {patience} epochs.")
            break

        print("Epochs not improving:", epochs_not_improving)

    # Evaluate on the test set
    model.load_state_dict(torch.load(savePath, map_location=device))
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(test_loader):
            labels = labels[idx]
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = ccc_loss(labels, outputs)
            test_loss += loss.item()
        test_loss /= len(test_loader)
    print(f"Test loss: {test_loss:.4f}")

The train and validation losses are 1 all the time.
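
One way to narrow this down, extending the earlier static-input suggestion to train mode: run one cached batch through the model in both eval and train mode (seeding the RNG so dropout draws the same mask) right before saving and right after loading; the outputs should match up to floating point precision. A self-contained sketch with a placeholder model and batch:

import torch
import torch.nn as nn

torch.manual_seed(0)
model = nn.Sequential(nn.Linear(10, 10), nn.Dropout(0.5), nn.Linear(10, 1)).double()  # placeholder
cached_inputs = torch.randn(4, 10, dtype=torch.double)  # one batch kept around for comparison

def fixed_batch_outputs(net):
    # Same computation before saving and after loading; seeded so dropout repeats
    torch.manual_seed(0)
    net.train()
    with torch.no_grad():
        out_train = net(cached_inputs)
    net.eval()
    with torch.no_grad():
        out_eval = net(cached_inputs)
    return out_train, out_eval

before = fixed_batch_outputs(model)
torch.save(model.state_dict(), 'model.pth')
model.load_state_dict(torch.load('model.pth'))
after = fixed_batch_outputs(model)

print(torch.allclose(before[0], after[0]), torch.allclose(before[1], after[1]))  # expected: True True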