Hi,
When training with DDP, should validation after each epoch run only on rank 0, or on all processes? With my current code I get a validation loss printed on every process. Which is the correct way to validate with DDP? This is my code (I am still prototyping, so the training and validation loops share the same dataloader). Below the validate function and at the end of the post I've sketched the two alternatives I'm considering.
def validate(model, train_loader):
    # Validate the model.
    model.eval()
    validation_loss = 0.0
    with torch.no_grad():
        for i, (images, labels, wsi_id) in enumerate(train_loader):
            # Move images and labels to the GPU.
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            images = torch.squeeze(images).contiguous()
            # Forward pass with autocasting/mixed precision;
            # GradScaler is not needed for inference.
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                logits, Y_prob, Y_hat, _ = model(images)
                vloss = loss_fn(logits, labels)
            validation_loss += vloss.item()
    print(f'Validation loss {validation_loss / (i + 1):.4f}')
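The first alternative I'm considering is to keep running validation on all ranks but reduce the loss across processes so that only rank 0 reports it. Just a rough, untested sketch of what I mean (the names validate_all_ranks and valid_loader are made up, and I'm assuming the validation set would be split across ranks with a DistributedSampler):

def validate_all_ranks(model, valid_loader):
    model.eval()
    # Keep the running loss as a CUDA tensor so it can be all-reduced.
    validation_loss = torch.zeros(1, device='cuda')
    with torch.no_grad():
        for i, (images, labels, wsi_id) in enumerate(valid_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            images = torch.squeeze(images).contiguous()
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                logits, Y_prob, Y_hat, _ = model(images)
                vloss = loss_fn(logits, labels)
            validation_loss += vloss.detach()
    # Per-rank mean loss, then average over all processes.
    validation_loss /= (i + 1)
    dist.all_reduce(validation_loss, op=dist.ReduceOp.SUM)
    validation_loss /= dist.get_world_size()
    if dist.get_rank() == 0:
        print(f'Validation loss {validation_loss.item():.4f}')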
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Don't worry be happy!")
    parser.add_argument('--df_path', type=str, help='', default='')
    parser.add_argument('--shard_path', type=str, help='', default='')
    parser.add_argument('--model_output_path', type=str, help='Model checkpoint output path', default='')
    parser.add_argument('--batch_size', type=int, help='Batch size per GPU. Default batch_size is at bag level (1 WSI)', default=1)
    parser.add_argument('--epochs', type=int, help='Number of training epochs', default=1000000)
    parser.add_argument('--lr', type=float, help='Learning rate (default: 0.0001)', default=0.0001)
    parser.add_argument('--num_workers', type=int, help='Number of dataloader processes for each GPU', default=8)
    parser.add_argument('--seed', type=int, help='Random seed (default: 1)', default=1)
    args = parser.parse_args()
    df_path = args.df_path
    shard_path = args.shard_path
    model_output_path = args.model_output_path
    epochs = args.epochs
    batch_size = args.batch_size
    num_workers = args.num_workers
    lr = args.lr
    shard_df = pd.read_csv(df_path)
    seed_torch(args.seed)
    # Environment variables set by torchrun.
    local_rank = int(os.environ['LOCAL_RANK'])
    global_rank = int(os.environ["RANK"])
    world_size = int(os.environ['WORLD_SIZE'])
    # Initialize PyTorch DistributedDataParallel.
    init_distributed(local_rank, global_rank, world_size)
    # Initialize train/valid data loaders.
    train_loader = get_dataloader(shard_path, shard_df)
    # Load the model.
    model = Network().cuda()
    model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
    # Define loss function (criterion) and optimizer.
    loss_fn = nn.CrossEntropyLoss().cuda()
    optimizer = optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=0.0005)
    # Create a GradScaler for training with mixed precision.
    scaler = GradScaler()
    # Start training.
    start_train = time.time()
    for epoch in range(epochs):
        start_time = time.time()
        train_loader.sampler.set_epoch(epoch)
        # Make sure gradient tracking is on, and do a pass over the data.
        model.train(True)
        train(model, train_loader)
        dist.barrier()
        print(f"Epoch took: {(time.time() - start_time) / 60:.2f} mins")
        model.train(False)
        validate(model, train_loader)
    epoch_time = (time.time() - start_train) / 60
    print(f"Training took: {epoch_time:.2f} mins")
    cleanup()
    if is_main_process():
        # Save the GradScaler state as well, so training can be resumed with
        # AMP mixed precision.
        checkpoint = {"model": model.state_dict(),
                      "optimizer": optimizer.state_dict(),
                      "scaler": scaler.state_dict()}
        # All processes see the same parameters, since they start from the same
        # random initialization and gradients are synchronized in the backward pass,
        # so saving the checkpoint from the main process only should be sufficient.
        save_on_master(checkpoint, model_output_path + "/model_prototype.pt")
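The second alternative I'm considering is to run validation only on rank 0 and make the other ranks wait at a barrier. A rough, untested sketch of how the epoch loop would change (I pass model.module so the DDP wrapper isn't involved in a forward that only one rank executes, but I'm not sure whether that is actually required):

    for epoch in range(epochs):
        train_loader.sampler.set_epoch(epoch)
        model.train(True)
        train(model, train_loader)
        dist.barrier()
        model.train(False)
        # Only the main process runs validation; the other ranks wait so the
        # next epoch starts with all processes in sync.
        if dist.get_rank() == 0:
            validate(model.module, train_loader)
        dist.barrier()

Is one of these the recommended pattern, or is there a better way?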