Hi all,
I am getting a high train loss when i try to resume it from the saved .pth file
Please let me know what could be the reason for it
Hi all,
I am getting a high train loss when i try to resume it from the saved .pth file
Please let me know what could be the reason for it
You could have forgotten to load the state_dict
of the optimizer after restoring the training, which could diverge the training. If that’s not the case, you might have changed the preprocessing of the data etc. so that the model sees “new” samples.
Without knowing more details these would be my guesses.
Hi ptrblck,
I am loading the state_dict of optimizer also. There are some randomness in data preparation.
Could that be the reason??
If the randomness is introduced only after resuming the training, it could be the reason.
However, if the input tensors use the same data augmentation / transformation it shouldn’t increase the loss unexpectedly.
I would recommend to check the model with a static input tensor, e.g. torch.ones
before saving and after loading the model to make sure the output is the same (call model.eval()
to disable e.g. dropout layers). If these outputs don’t match (up to floating point precision), the model loading itself seems to fail.
The randomness for data preparation was already present. There are no torch.ones used in the model. When I call model.eval() the val results are reproducible but the loss is high when I call model.train()
When continuing training, try lowering the starting learning rate. What optimizer are you using?
@J_Johnson I am using SDG
→ optimizer = optim.SGD(centerface.parameters(), lr=1e-2, momentum=0.9, weight_decay=0.0005)
→ exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer, milestones= [30, 90, 140], gamma=0.1)
Hi, I am having the same problem. I save the model and the optimizer state dicts. Then I load them both and if I call model.eval() and compute the test loss its the same as previously, but when I try to resume training after model.train(), the train loss is back to 1 and it doesn’t improve with more epochs. I’ve tried changing the learning rate but the loss is still the same.
I am using the same data as I did when first training the model.
For saving:
if attribute == 'arousal':
idx = 0
elif attribute == 'dominance':
idx = 1
else:
idx = 2 # Valence
print("Attribute: ", attribute , "Idx: ", idx, "\n")
# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Running in device: ",device)
# Training loop
best_val_loss = float('inf')
best_model = None
patience = 30
for epoch in range(num_epochs):
model.train()
train_loss = 0.0
print("\nEpoch: ",epoch +1 )
for i, (inputs, labels) in enumerate(train_loader):
labels = labels[idx] # Gets the labels for arousal, dominance of valence only
inputs, labels = inputs.to(device), labels.to(device)
if i%100 == 0: print("Training: ",i)
# Zero the parameter gradients
optimizer.zero_grad()
outputs = model(inputs) # forward
loss = ccc_loss(labels, outputs)
# print(f"Loss {i}",loss.item())
loss.backward()
optimizer.step()
train_loss += loss.item()
train_loss /= len(train_loader)
print(f"Training: {epoch+1} finished")
# Evaluation
model.eval()
val_loss = 0.0
with torch.no_grad():
for i, (inputs, labels) in enumerate(develop_loader):
labels = labels[idx]
if i%100 == 0: print("Validating: ", i)
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
loss = ccc_loss(labels, outputs)
val_loss += loss.item()
val_loss /= len(develop_loader)
print(f"Validating: {epoch+1} finished")
# Print training and evaluation loss
print(f"Epoch {epoch+1}/{200}, train loss: {train_loss:.4f}, val loss: {val_loss:.4f}")
# Check for early stopping
if val_loss < best_val_loss:
best_val_loss = val_loss
epochsNotImproving = 0
best_model = model.state_dict()
best_optimizer = optimizer.state_dict()
torch.save({'model_state_dict': best_model, 'optimizer_state_dict' : best_optimizer}, savePath)
print("Model saved because it is the best until now!")
else:
epochsNotImproving +=1
if epochsNotImproving >= patience:
print(f"Early stopping triggered. No improvement in the last {patience} epochs.")
break
print("Epochs not improving: ",epochsNotImproving)
# Evaluate on the test set
checkpoint = torch.load(savePath ,map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
test_loss = 0.0
with torch.no_grad():
for i, (inputs, labels) in enumerate(test_loader):
labels = labels[idx]
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
loss = ccc_loss(labels, outputs)
test_loss += loss.item()
test_loss /= len(test_loader)
print(f"\nTest loss: {test_loss:.4f}")
# Loss function
def ccc_loss(x, y):
std_x = torch.std(x)
std_y = torch.std(y)
mean_x = torch.mean(x)
mean_y = torch.mean(y)
cov = torch.mean((x - x.mean()) * (y.squeeze() - y.squeeze().mean()))
ro = cov / (std_x * std_y)
loss = 1 - 2 * ro * std_x * std_y / (std_x**2 + std_y**2 + (mean_x - mean_y)**2)
return loss
For loading:
# Attribute choice
if attribute == 'arousal':
idx = 0
elif attribute == 'dominance':
idx = 1
else:
idx = 2 # Valence
patience=15
model = NeuralNetwork(dropout_rate).double()
# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Running on device:", device)
checkpoint = torch.load(modelPath,map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer = optim.SGD(model.parameters(), learning_rate, momentum=0.9)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
model.eval()
test_loss = 0.0
with torch.no_grad():
for i, (inputs, labels) in enumerate(test_loader):
labels = labels[idx]
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
loss = ccc_loss(labels, outputs)
print(f"Loss {i}",loss.item())
test_loss += loss.item()
test_loss /= len(test_loader)
print(f"Test loss: {test_loss:.4f}")
# Training loop
best_dev_loss = float('inf')
epochs_not_improving = 0
for epoch in range(num_epochs):
model.train()
train_loss = 0.0
print("Epoch:", epoch + 1)
# print(f"\nWeight layer 1 before epoch {epoch +1 }",model.layer1.weight)
# print(f"Weight output layer before epoch {epoch +1 }", model.output_layer.weight, "\n")
for i, (inputs, labels) in enumerate(adaptation_loader):
labels = labels[idx]
inputs, labels = inputs.to(device), labels.to(device)
if i % 100 == 0: print("Training:", i)
optimizer.zero_grad()
outputs = model(inputs)
loss = ccc_loss(outputs, labels)
print(loss.item())
loss.backward()
optimizer.step()
train_loss += loss.item()
train_loss /= len(adaptation_loader)
print(f"Training: {epoch + 1} finished")
# Evaluation
model.eval()
dev_loss = 0.0
with torch.no_grad():
for i, (inputs, labels) in enumerate(development_loader):
labels = labels[idx]
if i % 100 == 0: print("Validating:", i)
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
loss = ccc_loss(outputs, labels)
dev_loss += loss.item()
dev_loss /= len(development_loader)
print(f"Validating: {epoch + 1} finished")
# Print training and evaluation loss
print(f"Epoch {epoch + 1}/{num_epochs}, train loss: {train_loss:.4f}, val loss: {dev_loss:.4f}")
# Early stopping
if dev_loss < best_dev_loss:
best_dev_loss = dev_loss
epochs_not_improving = 0
best_model = model.state_dict()
torch.save(best_model, savePath)
print("Model saved because it is the best until now!")
else:
epochs_not_improving += 1
if epochs_not_improving >= patience:
print(f"Early stopping triggered. No improvement in the last {patience} epochs.")
break
print("Epochs not improving:", epochs_not_improving)
# Evaluate on the test set
model.load_state_dict(torch.load(savePath ,map_location=device))
model.eval()
test_loss = 0.0
with torch.no_grad():
for i, (inputs, labels) in enumerate(test_loader):
labels = labels[idx]
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
loss = ccc_loss(labels, outputs)
test_loss += loss.item()
test_loss /= len(test_loader)
print(f"Test loss: {test_loss:.4f}")
The train and validation loss are 1 all the time.