I am trying to train a vision model; the actual implementation is a lot more elaborate, but I made this bare-bones version for troubleshooting. While training, process RAM keeps increasing until I eventually get an OOM error. For my dataset, with a batch size of 2 (I am loading 1024x1024 images), it grows by about 12 MB per mini-batch.
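For reference, get_memory_usage() is a small helper from the full script (not shown); a rough equivalent using psutil would be:

    import os
    import psutil

    def get_memory_usage():
        # Resident set size (RSS) of the current process, in MB.
        return psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)

This is my training class: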
import tracemalloc
import objgraph
import torch
from torch.utils.tensorboard import SummaryWriter

class trainModel():
    def __init__(self, model, train_loader, val_loader, optimizer, criterion, num_epochs, device,
                 save_path, save_train_data_path, segmentation, reducelr=None, logdir=None):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.optimizer = optimizer
        self.criterion = criterion
        self.num_epochs = num_epochs
        self.device = device
        self.save_path = save_path
        self.save_train_data_path = save_train_data_path
        self.reducelr = reducelr
        self.segmentation = segmentation
        self.memoryStore = []
        self.log_dir = logdir
        if activateTensorboard:  # module-level flag set in the full script
            self.summary_writer = SummaryWriter(self.log_dir)
        self.csvFieldNames = ['Epoch',
                              'Train loss',
                              'Val loss',
                              'Train Class Loss',
                              'Val Class Loss',
                              'Train Regr Loss',
                              'Val Regr Loss',
                              'Train IoU',
                              'Val IoU']
        self.memUse = get_memory_usage()  # helper from the full script (see sketch above)
        ## for combined loss
        self.alpha = 0.5             ## weight for classification loss
        self.beta = 1 - self.alpha   ## weight for regression loss
    def train(self):
        model = self.model.to(self.device)
        lastmem = 0
        for epoch in range(self.num_epochs):
            torch.cuda.empty_cache()
            objgraph.show_growth()
            for i, batch in enumerate(self.train_loader):
                print(f'train batch number: {i}')
                inputs = batch['img'].to(self.device)
                labels = batch['annot'].to(self.device)
                input = [inputs, labels]  ## inputting labels for label bounding box creation
                with torch.set_grad_enabled(True):
                    # print("Before forward pass - GPU memory allocated:", torch.cuda.memory_allocated())
                    before = torch.cuda.memory_allocated()
                    score, objClass, bboxData, regression, classification, anchors = model(input, isTraining=True)
                    classLoss, regressionLoss = self.criterion(classification, regression, anchors, labels)
                    loss = self.alpha * classLoss + self.beta * regressionLoss
                    loss.backward()
                    self.optimizer.step()
                torch.cuda.empty_cache()
                print(torch.cuda.memory_allocated(self.device))
                print('mem difference', torch.cuda.memory_allocated(self.device) - lastmem)
                lastmem = torch.cuda.memory_allocated(self.device)
                self.memoryStore.append([torch.cuda.memory_allocated(self.device),
                                         torch.cuda.memory_cached(self.device)])
                snapshot = tracemalloc.take_snapshot()  # tracemalloc.start() is called earlier in the full script
                topstats = snapshot.statistics('lineno')
                for line in topstats[:25]:
                    print(line)
            with torch.no_grad():
                for i, batch in enumerate(self.val_loader):
                    print(f'val batch number: {i}')
                    inputs = batch['img'].to(self.device)
                    labels = batch['annot'].to(self.device)
                    input = [inputs, labels]  ## inputting labels for label bounding box creation
                    with torch.set_grad_enabled(False):
                        score, objClass, bboxData, regression, classification, anchors = model(input, isTraining=True)
                        classLoss, regressionLoss = self.criterion(classification, regression, anchors, labels)
                    torch.cuda.empty_cache()
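For completeness, the trainer is constructed and launched roughly like this (the model, loss, loaders, paths and hyperparameter values below are simplified placeholders for my actual setup):

    import torch

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = MyDetectionModel()     # placeholder for my actual model
    criterion = MyDetectionLoss()  # placeholder for my custom loss
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

    # train_loader / val_loader are DataLoaders yielding {'img': ..., 'annot': ...}
    # dicts of 1024x1024 images with batch_size=2
    trainer = trainModel(model, train_loader, val_loader, optimizer, criterion,
                         num_epochs=50, device=device,
                         save_path='checkpoints/',
                         save_train_data_path='train_data/',
                         segmentation=False,
                         reducelr=None,
                         logdir='runs/debug')
    trainer.train()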
My criterion is a custom loss function, while the optimizer is PyTorch's SGD (I am planning to use Adam eventually, but I wanted to get a simpler optimizer working first in case Adam's extra cost was contributing to the issues).
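The only thing this stripped-down version relies on is the criterion's interface: it is called as criterion(classification, regression, anchors, labels) and returns the classification and regression losses separately. A skeleton with the same signature (not my real loss, which is far more involved) would be:

    import torch.nn as nn

    class MyDetectionLoss(nn.Module):
        # Same call signature as my real criterion; the bodies here are dummies.
        def forward(self, classification, regression, anchors, labels):
            # The real loss uses `anchors` and `labels`; here we just return
            # two scalar terms with the right structure.
            classLoss = classification.float().mean()
            regressionLoss = regression.float().mean()
            return classLoss, regressionLoss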
I've tried a few different things to release variables from memory, such as gc.collect() and del on the intermediate tensors, along with a few other approaches (one variant is sketched below). None of these have worked so far, and I am running out of ideas. Any assistance would be really appreciated!
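To give a concrete example of the kind of cleanup I mean, one variant added this at the end of each training batch, right after self.optimizer.step() (it made no noticeable difference):

    import gc

    # explicit per-batch cleanup
    del inputs, labels, input
    del score, objClass, bboxData, regression, classification, anchors
    del classLoss, regressionLoss, loss
    gc.collect()
    torch.cuda.empty_cache()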