Hello everyone. I am training a CNN classification model with the following loop:
```python
best_loss = 9999
best_epoch = 0
for epoch in range(num_epochs):
    # Each epoch has a training and validation phase
    for phase in ["train", "val"]:
        if phase == "train":
            model.train()  # Set model to training mode
        else:
            model.eval()  # Set model to evaluate mode
        running_loss = 0.0
        running_corrects = 0.0
        for data in tqdm(image_datasets[phase]):  # yields batches
            inputs, labels = data
            now_batch_size, c, h, w = inputs.shape
            if now_batch_size < batch_size:  # skip the last (incomplete) batch
                continue
            inputs = inputs.to(device)
            labels = labels.to(device)
            # zero the parameter gradients
            optimizer.zero_grad()
            if phase == "val":
                with torch.no_grad():
                    outputs = model(inputs)
            else:
                outputs = model(inputs)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs.data, 1)
            preds.detach()  # no-op: detach() returns a new tensor, and .data is already graph-free
            # backward + optimize only if in training phase
            if epoch < warm_epoch and phase == "train":
                warm_up = min(1.0, warm_up + 0.9 / warm_iteration)
                loss *= warm_up
            if phase == "train":
                loss.backward()
                optimizer.step()
                scheduler.step()
            running_loss += loss.item()
            # accumulates per-batch accuracy, so dataset_sizes[phase] below
            # is expected to be the number of batches
            running_corrects += float(torch.sum(preds == labels.data)) / now_batch_size
            del outputs, inputs, preds, labels, loss
        epoch_loss = running_loss / dataset_sizes[phase]
        epoch_acc = running_corrects / dataset_sizes[phase]
        print("{} Loss: {:.4f} Acc: {:.4f}".format(phase, epoch_loss, epoch_acc))
        if epoch_loss < best_loss and phase == "val":
            print("new best model, saving...")
            best_loss = epoch_loss
            best_epoch = epoch
            last_model_wts = model.state_dict()
            torch.save(last_model_wts, path)
```
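For completeness, the loop assumes the usual setup; here is a rough sketch with placeholder values (my real model, optimizer, scheduler, and loaders are defined earlier in the script):

```python
import torch
from torch import nn
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()  # placeholder: my actual criterion
# model, optimizer, scheduler, image_datasets (one loader per phase),
# dataset_sizes, batch_size, num_epochs, warm_epoch, warm_iteration,
# warm_up, and path are all defined earlier in the script.
```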
But RAM usage grows with every epoch: it started at 7.20 GB, and by epoch 8 it has already reached 8.02 GB. What am I doing wrong?
I have read many threads about this problem but still haven't solved it. Can someone help, please?
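In case the numbers matter, here is roughly how I track the usage per epoch (a minimal sketch; psutil and the log_ram helper are additions for measuring, not part of the training code above):

```python
import os
import psutil  # assumption: psutil is installed (pip install psutil)

_process = psutil.Process(os.getpid())

def log_ram(tag: str) -> None:
    # Resident set size (RSS) of this Python process, in GiB
    rss_gib = _process.memory_info().rss / 1024 ** 3
    print(f"[{tag}] RAM: {rss_gib:.2f} GiB")

# Called once per epoch, e.g. at the top of the epoch loop:
# log_ram(f"epoch {epoch}")
```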