Hi everyone,
I’m new to PyTorch and am currently training a model for line detection on the NKL dataset. I have run into an issue where my training loss remains almost constant across epochs. Below are the details of my training process and some of the logs.
Any guidance on how to resolve this would be greatly appreciated. Thank you!
**Command to run training**:
python NKL_train.py --epochs 20 --data-dir './data' --train-file 'train.txt' --val-file 'val.txt' --npy-path './data/training/NKL_resize_100_100' --model-save-path './models' --batch-size 13 --learning-rate 0.0001 --accumulation-steps 2
**Training log output**:
2024-05-27 14:38:10,454 - INFO - Epoch [1/20], Training Loss: 0.0007, Validation Loss: 0.0003
2024-05-27 14:45:49,872 - INFO - Epoch [2/20], Training Loss: 0.0004, Validation Loss: 0.0003
2024-05-27 14:53:33,134 - INFO - Epoch [3/20], Training Loss: 0.0003, Validation Loss: 0.0003
2024-05-27 15:01:16,966 - INFO - Epoch [4/20], Training Loss: 0.0003, Validation Loss: 0.0003
2024-05-27 15:08:55,049 - INFO - Epoch [5/20], Training Loss: 0.0003, Validation Loss: 0.0003
**Here is the relevant part of my training loop**:
# Train/validate loop with mixed precision and gradient accumulation.
#
# Relies on names defined earlier in the script: model, optimizer, scaler,
# criterion, train_loader, val_loader, device, epochs, accumulation_steps
# and model_save_path.
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    num_batches = len(train_loader)
    optimizer.zero_grad()

    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device).float()

        with autocast():
            outputs = model(images)
            # Ensure output is 100x100
            outputs = F.interpolate(outputs, size=(100, 100), mode='bilinear', align_corners=False)
            loss = criterion(outputs, labels)

        # Divide by the window size so the accumulated gradient is the mean
        # (not the sum) of the micro-batch gradients; otherwise the effective
        # step size grows with accumulation_steps.
        scaler.scale(loss / accumulation_steps).backward()

        # Step at the end of every accumulation window, and also on the last
        # batch so a trailing partial window is applied instead of being
        # discarded by the next epoch's zero_grad(). A partial window is still
        # divided by accumulation_steps, so it is slightly down-weighted.
        window_complete = (i + 1) % accumulation_steps == 0
        last_batch = (i + 1) == num_batches
        if window_complete or last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Track the unscaled loss so the reported numbers keep their meaning.
        batch_loss = loss.item()
        running_loss += batch_loss
        if i % 10 == 0:
            logging.info(f"Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(train_loader)}], Loss: {batch_loss:.4f}")

    # Release cached CUDA blocks once per epoch rather than on every batch;
    # per-batch calls only slow the loop down.
    torch.cuda.empty_cache()

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            # Ensure labels are float
            images, labels = images.to(device), labels.to(device).float()
            outputs = model(images)
            # Ensure output is 100x100
            outputs = F.interpolate(outputs, size=(100, 100), mode='bilinear', align_corners=False)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    logging.info(f"Epoch [{epoch+1}/{epochs}], Training Loss: {running_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader):.4f}")

    # Save the model checkpoint
    torch.save(model.state_dict(), os.path.join(model_save_path, f"model_epoch_{epoch+1}.pth"))
    logging.info(f"Model saved to {model_save_path}/model_epoch_{epoch+1}.pth")