CUDA out of memory at the second epoch

Hi,

Sorry, I am new to PyTorch, so I may be missing something about how the framework works.

I am facing a weird problem while training my model: it raises a CUDA out of memory error in the second epoch, even though the first epoch runs normally. I did some research on the forum; the usual cause is some variable in the code still holding a reference to the computation graph, which makes memory accumulate. However, I cannot find anything like that in my code. Please help me figure out why.
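For reference, the pattern described on the forum usually looks something like the toy sketch below (not my actual code): keeping the loss tensor itself keeps the whole computation graph alive, while keeping loss.item() does not.

# Toy sketch of the pitfall mentioned above (hypothetical, not my code).
import torch
import torch.nn as nn

model = nn.Linear(10, 1).cuda()
criterion = nn.MSELoss()
losses_bad, losses_ok = [], []
for _ in range(100):
    x = torch.randn(32, 10, device='cuda')
    y = torch.randn(32, 1, device='cuda')
    loss = criterion(model(x), y)
    losses_bad.append(loss)        # keeps the graph alive -> memory accumulates
    losses_ok.append(loss.item())  # plain Python float -> nothing is retained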

PyTorch version: 1.7.0
CUDA version: 11.0

These are my train and validation functions:

import numpy as np
import torch
from tqdm import tqdm


def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    train_loss = []
    bar = tqdm(loader)
    optimizer.zero_grad()

    for i, (image, target) in enumerate(bar):
        image, target = image.to(device), target.to(device)
        output = model(image)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # keep only the scalar value so no graph is retained
        loss_np = loss.detach().cpu().item()
        train_loss.append(loss_np)
        bar.set_description('loss: %.5f' % loss_np)

    train_loss = np.mean(train_loss)
    return train_loss

def val_epoch(model, loader, criterion, device):
    model.eval()
    RMSE = 0
    num_sample = 0
    bar = tqdm(loader)
    with torch.no_grad():
        for (image, target) in bar:
            image = image.to(device)
            num_sample += image.size()[0]
            output = model(image).cpu().numpy()
            target = target.numpy()
            # accumulate the sum of squared errors over the whole epoch
            RMSE += np.sum((output - target) ** 2)
    RMSE = np.sqrt(RMSE / num_sample)  # root mean squared error
    print('RMSE: %.5f' % RMSE)
    return RMSE

And my main.py:

import pickle

import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.optim import SGD

# WindDataset, get_transforms, SimpleModel and the two epoch functions above
# are defined in my own modules; those imports are omitted here.


class Hparameter(object):
    def __init__(self):
        self.batch_size = 512
        self.lr = 1e-2
        self.num_workers = 8
        self.num_epochs = 50
        # self.image_size = 368
        self.image_size = 224
        self.save_path = './weights/'

if __name__ == "__main__":
    args = Hparameter()
    device = torch.device('cuda')

    df = pd.read_csv('./data/training_set_labels.csv')
    # target = np.array(df.wind_speed).astype(np.float32)

    image_id = df.image_id.to_list()
    target = df.wind_speed.to_list()

    train, val, y_train, y_val = train_test_split(image_id, target, test_size = 0.2, random_state = 42, shuffle = True)

    transforms_train, transforms_val = get_transforms(args.image_size, gray = True)

    dataset_train = WindDataset(
        image_list = train, 
        target = y_train,
        test = False, 
        transform=transforms_train,
        gray = True
        )
    dataset_valid = WindDataset(
        image_list = val,
        target = y_val,
        test = False, 
        transform=transforms_val,
        gray = True
        )
    train_loader = torch.utils.data.DataLoader(
        dataset_train, 
        batch_size=args.batch_size, 
        shuffle = True,
        num_workers=args.num_workers
        )
    valid_loader = torch.utils.data.DataLoader(
        dataset_valid,
        batch_size=args.batch_size*2, 
        num_workers=args.num_workers,
        shuffle=False
        )

    model = SimpleModel()
    model.to(device)

    optimizer = SGD(model.parameters(), lr = args.lr , momentum=0.9, nesterov= True)

    criterion = nn.MSELoss()
    best_rmse = 12.

    rmse = []
    train_loss_overall = []

    for epoch in range(args.num_epochs):
        model.train()
        torch.cuda.synchronize()
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        model.eval()
        torch.cuda.synchronize()
        RMSE = val_epoch(model, valid_loader, criterion, device)
        rmse.append(RMSE)
        train_loss_overall.append(train_loss)
        pick = {'train': train_loss_overall, 'val':rmse}
        with open('./plot.pkl', 'wb') as f:
            pickle.dump(pick, f)
        if RMSE < best_rmse:
            best_rmse = RMSE  # update the best score so far
            name = args.save_path + 'epoch_%d_%.5f.pth'%(epoch, RMSE)
            print('Saving model...')
            torch.save(model.state_dict(), name)
    torch.save(model.state_dict(), args.save_path + 'last_epoch_%.5f.pth'%(RMSE))

After validation finishes in the first epoch, the GPU memory reaches 21.2/24 GB, and then it raises CUDA out of memory in the second epoch.
Then I reduced the batch size to 256 to see what happens: memory stays around 11 GB during the first epoch, rises to 18 GB in the second, and stays there until the end of training.
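
For reference, the peak allocation per phase can be checked roughly like this (a small sketch reusing the names from main.py above; both calls exist in 1.7.0):

# Rough sketch: measure the peak GPU allocation separately for train and val.
for epoch in range(args.num_epochs):
    torch.cuda.reset_peak_memory_stats(device)
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print('train peak: %.2f GB' % (torch.cuda.max_memory_allocated(device) / 1024 ** 3))

    torch.cuda.reset_peak_memory_stats(device)
    RMSE = val_epoch(model, valid_loader, criterion, device)
    print('val peak:   %.2f GB' % (torch.cuda.max_memory_allocated(device) / 1024 ** 3))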

Keep reducing the batch size or the spatial resolution of your data if you are working with images
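
For example (hypothetical values, halving each knob in the Hparameter class above):

args = Hparameter()
args.batch_size = 256   # down from 512
args.image_size = 112   # down from 224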

@moreshud As you can see above, if I reduce the batch size to 256 my GPU can handle it. But my problem is that it runs out of memory only in the second epoch. If there were not enough memory for training, it should raise the error in the first epoch already, right?

Try this order:

optimizer.step()
optimizer.zero_grad(set_to_none=True)

and add the drop_last=True argument to your DataLoaders (to avoid variable tensor shapes and the ensuing memory fragmentation).
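
Roughly like this, applied to your loop and loaders (untested sketch, reusing your variable names):

for i, (image, target) in enumerate(bar):
    image, target = image.to(device), target.to(device)
    output = model(image)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)  # sets .grad to None, freeing the gradient buffers
    loss_np = loss.item()
    train_loss.append(loss_np)
    bar.set_description('loss: %.5f' % loss_np)

train_loader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.num_workers,
    drop_last=True  # every batch has the same shape, which avoids fragmentation
    )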


Thank you. I tried your suggestion, but it does not solve the problem with the code above.

However, I found the problem: I had set the validation batch size to twice the training batch size. When I set them equal, the GPU memory usage is very stable.
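
Concretely, the only change was making the validation loader use the same batch size as training (same names as in main.py above):

valid_loader = torch.utils.data.DataLoader(
    dataset_valid,
    batch_size=args.batch_size,  # was args.batch_size * 2
    num_workers=args.num_workers,
    shuffle=False
    )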

Worked like a charm. Thank you.