Hi,
I have been trying to use PyTorch to train a model on a single GPU for the past few days. The data is huge, so I can only fit a small model with a batch size of 2 in GPU memory. The GPU has around 45 GB of memory (both the data and the model are 3D).
Recently I added a second 45 GB GPU, and since then I have been trying to use DDP to train the model on both GPUs. But now the same model, with the same parameters and hyperparameters and a batch size of 1, cannot even fit in the combined 90 GB of GPU memory. Whenever I run it with python3 it gives an out-of-memory error, and when I tried to run it with torchrun it crashed my system.
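For reference, the two ways I have been launching it look roughly like this (train_ddp.py is just a placeholder for my actual script name):

# single-process launch: fails with a CUDA out-of-memory error
python3 train_ddp.py

# torchrun launch, one process per GPU: crashes the machine
torchrun --nproc_per_node=2 train_ddp.py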
After spending two days searching the internet I still could not find an answer, only other people's posts describing the same issue (e.g. Memory issue when training in DDP mode · Lightning-AI/pytorch-lightning · Discussion #18525 · GitHub). As far as I understand there is no error in my code, since I have seen the same calls used in various articles and tutorials on implementing DDP. Here is the relevant part of my script (binary_model, inp_shape, train_dataset, val_dataset, batch_size and num_workers are defined earlier in the file and omitted here):
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import numpy as np
import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Initialize the distributed process group
    dist.init_process_group(
        backend='nccl',          # nccl is preferred for GPUs
        # init_method='env://',  # Initialization method (can be modified for multi-node)
        world_size=world_size,
        rank=rank
    )
    torch.cuda.set_device(rank)  # Set the device for this process (GPU)
def cleanup():
    dist.destroy_process_group()
def train(rank, world_size):
    setup(rank, world_size)

    # binary_model and inp_shape come from earlier in the script; use a local
    # name here, move the model to this process's GPU and do one dummy forward pass
    model = binary_model.to(rank)
    dummy_input = torch.randn(inp_shape).to(rank)
    _ = model(dummy_input)
    model = DDP(model, device_ids=[rank])

    # Dataloaders for our training and validation sets
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=False, drop_last=False)
    train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=num_workers, drop_last=False, sampler=train_sampler)
    val_sampler = DistributedSampler(val_dataset, num_replicas=world_size, rank=rank, shuffle=False, drop_last=False)
    val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=num_workers, drop_last=False, sampler=val_sampler)

    # gc.collect()
    # torch.cuda.empty_cache()

    loss1 = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0002, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=5, verbose=True)

    # Training loop
    best_val_loss = np.inf  # Initializing the validation loss value with infinity
    best_model = None       # Variable for collecting the best model (with the lowest validation loss)
    num_epochs = 80         # Number of epochs
    save_epochs = 10        # Saving the model after every 'save_epochs' number of epochs

    for epoch in range(num_epochs):
        print("-" * 10)
        print("Epoch:", epoch + 1)
        train_sampler.set_epoch(epoch)
        model.train()  # Go to training mode

        # Training
        for img, lab in tqdm.tqdm(train_dl):
            img, lab = img.float().to(rank, non_blocking=True), lab.to(rank, non_blocking=True)
            optimizer.zero_grad()              # Make the gradients 0
            output = model(img)                # Predict
            output_loss1 = loss1(output, lab)  # Calculate cross-entropy loss
            output_loss1.backward()            # Calculate the derivative with respect to each parameter
            optimizer.step()                   # Update the weights

        # Validation
        model.eval()  # Go to evaluation mode
        val_sampler.set_epoch(epoch)
        val_epoch_loss = 0.0     # Accumulated validation loss for this epoch
        with torch.no_grad():    # Disable any gradient calculation, accumulation, etc.
            for img, lab in tqdm.tqdm(val_dl):
                img, lab = img.float().to(rank, non_blocking=True), lab.to(rank, non_blocking=True)
                output = model(img)                    # Make prediction
                output_loss1 = loss1(output, lab)      # Calculate cross-entropy loss
                val_epoch_loss += output_loss1.item()  # Accumulate the per-epoch loss
                pred = torch.argmax(output, dim=1)     # Get the prediction result

        scheduler.step(val_epoch_loss)  # Update scheduler, used for ReduceLROnPlateau

    cleanup()
if __name__ == "__main__":
    world_size = 2  # Number of GPUs
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)
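In case it helps with debugging, this is roughly how I was planning to log per-rank memory usage (just a sketch using the standard torch.cuda utilities, not part of the run that produced the error):

def log_gpu_memory(rank, tag=""):
    # Report how much CUDA memory this process has allocated/reserved so far
    alloc = torch.cuda.memory_allocated(rank) / 1024**3
    reserved = torch.cuda.memory_reserved(rank) / 1024**3
    peak = torch.cuda.max_memory_allocated(rank) / 1024**3
    print(f"[rank {rank}] {tag}: allocated={alloc:.2f} GB, reserved={reserved:.2f} GB, peak={peak:.2f} GB")

# For example, called right after wrapping the model in DDP and after the first backward pass:
# log_gpu_memory(rank, "after DDP wrap")
# log_gpu_memory(rank, "after first backward")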
Can someone please suggest what I should do?