Hi,
Apologies in advance, I am new to PyTorch, so I may be missing something basic about the framework.
I am facing a strange problem while training a model: it raises a CUDA out-of-memory error in the second epoch, even though the first epoch runs normally. From my research on this forum, the usual cause is some variable in the code that still holds a reference to the computation graph, which makes memory accumulate, but I cannot find anything like that in my code. Please help me figure out why.
PyTorch version: 1.7.0
CUDA version: 11.0
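For reference, this is the kind of graph-holding pattern I was looking for but could not find (a minimal hypothetical sketch, not my actual code), where appending the loss tensor itself instead of its Python value keeps the whole computation graph alive across iterations:

losses = []
for image, target in loader:
    output = model(image.to(device))
    loss = criterion(output, target.to(device))
    losses.append(loss)          # leak: the stored tensor still references the graph
    # losses.append(loss.item()) # fine: stores only a Python float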
These are my training and validation functions:
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    train_loss = []
    bar = tqdm(loader)
    running_loss = 0
    optimizer.zero_grad()
    for i, (image, target) in enumerate(bar):
        image, target = image.to(device), target.to(device)
        b_size = image.size()[0]
        output = model(image)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_np = loss.detach().cpu().item()
        train_loss.append(loss_np)
        bar.set_description('loss: %.5f' % (loss_np))
    train_loss = np.mean(train_loss)
    return train_loss
def val_epoch(model, loader, criterion, device):
    model.eval()
    RMSE = 0
    num_sample = 0
    bar = tqdm(loader)
    with torch.no_grad():
        for (image, target) in bar:
            image, target = image.to(device), target.to(device)
            num_sample += image.size()[0]
            output = model(image).detach().cpu().numpy()
            target = target.detach().cpu().numpy()
            RMSE += np.sum((output - target)**2)
    RMSE /= num_sample
    print('RMSE: %.5f' % (RMSE))
    return RMSE
And my main.py:
class Hparameter(object):
    def __init__(self):
        self.batch_size = 512
        self.lr = 1e-2
        self.num_workers = 8
        self.num_epochs = 50
        # self.image_size = 368
        self.image_size = 224
        self.save_path = './weights/'

if __name__ == "__main__":
    args = Hparameter()
    device = torch.device('cuda')
    df = pd.read_csv('./data/training_set_labels.csv')
    # target = np.array(df.wind_speed).astype(np.float32)
    image_id = df.image_id.to_list()
    target = df.wind_speed.to_list()
    train, val, y_train, y_val = train_test_split(image_id, target, test_size=0.2, random_state=42, shuffle=True)
    transforms_train, transforms_val = get_transforms(args.image_size, gray=True)
    dataset_train = WindDataset(
        image_list=train,
        target=y_train,
        test=False,
        transform=transforms_train,
        gray=True
    )
    dataset_valid = WindDataset(
        image_list=val,
        target=y_val,
        test=False,
        transform=transforms_val,
        gray=True
    )
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset_valid,
        batch_size=args.batch_size*2,
        num_workers=args.num_workers,
        shuffle=False
    )
    model = SimpleModel()
    model.to(device)
    optimizer = SGD(model.parameters(), lr=args.lr, momentum=0.9, nesterov=True)
    criterion = nn.MSELoss()
    best_rmse = 12.
    rmse = []
    train_loss_overall = []
    for epoch in range(args.num_epochs):
        model.train()
        torch.cuda.synchronize()
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        model.eval()
        torch.cuda.synchronize()
        RMSE = val_epoch(model, valid_loader, criterion, device)
        rmse.append(RMSE)
        train_loss_overall.append(train_loss)
        pick = {'train': train_loss_overall, 'val': rmse}
        with open('./plot.pkl', 'wb') as f:
            pickle.dump(pick, f)
        if RMSE < best_rmse:
            name = args.save_path + 'epoch_%d_%.5f.pth' % (epoch, RMSE)
            print('Saving model...')
            torch.save(model.state_dict(), name)
        torch.save(model.state_dict(), args.save_path + 'last_epoch_%.5f.pth' % (RMSE))
In the first epoch, after validation finishes, GPU memory usage reaches 21.2/24 GB, and then the second epoch raises CUDA out of memory.
To see what happens, I reduced the batch size to 256: memory stays around 11 GB during the first epoch, then rises to 18 GB in the second epoch and stays there until the end of training.
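If it helps, I can add per-epoch memory logging with torch.cuda.memory_allocated and torch.cuda.max_memory_allocated to pin down exactly where the usage grows. A rough sketch (log_gpu_memory is just a hypothetical helper I would add to main.py):

def log_gpu_memory(tag):
    # Report current and peak allocated memory (in GB) for this process.
    alloc = torch.cuda.memory_allocated() / 1024**3
    peak = torch.cuda.max_memory_allocated() / 1024**3
    print('[%s] allocated: %.2f GB, peak: %.2f GB' % (tag, alloc, peak))

for epoch in range(args.num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    log_gpu_memory('after train epoch %d' % epoch)
    RMSE = val_epoch(model, valid_loader, criterion, device)
    log_gpu_memory('after val epoch %d' % epoch)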