Clearing CUDA memory after error

Hi. I'm a newbie adjusting a kernel I took from Kaggle. I use the transformers library with a pretrained XLM-RoBERTa model as the backbone.

I train my model, but it fails when computing the loss function because some dimensions are wrong. I tried an adjustment and ran again, but got this error:

RuntimeError: CUDA out of memory. Tried to allocate 42.00 MiB (GPU 0; 7.79 GiB total capacity; 6.45 GiB already allocated; 37.56 MiB free; 6.76 GiB reserved in total by PyTorch)

So I figured I should delete the variables that might hold tensors. I use the code below, but it's no use; I still get output like this:

Parameter: GPU pinned 2
Parameter: GPU pinned 2 × 1536
Tensor: GPU pinned 16 × 224

So nothing is actually deleted. How can I clear the CUDA memory for good? I want to eradicate, kill, vanquish, etc. all tensors. See the code below. Is this working as designed? It seems like misbehavior to me. I did some PyTorch two years ago and I don't remember this issue, nor anything like it in Keras. Could it be related to the pretrained RoBERTa model? Should I file an issue with huggingface/transformers?
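
For reference, here is the fuller teardown I understand should be needed (a sketch, assuming the optimizer state, the outputs/loss of the failed step, and, in a notebook, the saved exception traceback also keep references alive; empty_cache() can only return blocks that nothing references anymore):

import gc
import sys
import torch

# sketch: drop every name that can still reach a CUDA tensor, including
# the optimizer state and the outputs/loss of the step that crashed
for name in ['loss', 'outputs', 'optimizer', 'scheduler', 'model', 'backbone']:
    if name in globals():
        del globals()[name]

# in Jupyter/IPython the traceback of the last exception pins the frames
# (and thus the tensors) of the failed call
sys.last_traceback = None

gc.collect()               # collect the now-unreachable tensors first...
torch.cuda.empty_cache()   # ...then hand the cached blocks back to the driver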

Edit: I have fixed the dimension bug, and apparently my GPU doesn't have enough RAM to train the model beyond one batch with batch size 16. I still don't understand why I'm unable to clear the tensors, though.
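
Side note on the RAM limit: one workaround I'm looking at is gradient accumulation, i.e. running smaller batches and stepping the optimizer every few of them so the effective batch size stays at 16. A minimal sketch (ACCUM_STEPS and the smaller-batch loader are assumptions, not part of my kernel):

ACCUM_STEPS = 4  # assumed: 4 micro-batches of 4 ≈ one batch of 16

optimizer.zero_grad()
for step, (ids, inputs, attention_masks, targets) in enumerate(data_loader):
    outputs = model(inputs.cuda(), attention_masks.cuda())
    loss = loss_fn(outputs, targets.cuda())
    (loss / ACCUM_STEPS).backward()   # scale so the summed grads average out

    if (step + 1) % ACCUM_STEPS == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()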

I use Python 3.8 and PyTorch 1.5.

import gc
import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from transformers import AutoModel, AdamW, get_linear_schedule_with_warmup

def pretty_size(size):
    """Pretty-print a torch.Size object."""
    assert isinstance(size, torch.Size)
    return " × ".join(map(str, size))

def dump_tensors(gpu_only=True):
    """Print the tensors currently tracked by the garbage collector."""
    total_size = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj):
                if not gpu_only or obj.is_cuda:
                    # is_pinned is a method: without the parentheses the bound
                    # method is always truthy, which is why every line printed
                    # "pinned"; pinning only applies to CPU tensors anyway
                    pinned = not obj.is_cuda and obj.is_pinned()
                    print("%s:%s%s %s" % (type(obj).__name__,
                                          " GPU" if obj.is_cuda else "",
                                          " pinned" if pinned else "",
                                          pretty_size(obj.size())))
                    total_size += obj.numel()
            elif hasattr(obj, "data") and torch.is_tensor(obj.data):
                # legacy Variables; the deprecated .volatile check is dropped,
                # it has been gone since PyTorch 0.4
                if not gpu_only or obj.is_cuda:
                    pinned = not obj.data.is_cuda and obj.data.is_pinned()
                    print("%s → %s:%s%s%s %s" % (type(obj).__name__,
                                                 type(obj.data).__name__,
                                                 " GPU" if obj.is_cuda else "",
                                                 " pinned" if pinned else "",
                                                 " grad" if obj.requires_grad else "",
                                                 pretty_size(obj.data.size())))
                    total_size += obj.data.numel()
        except Exception:
            pass
    print("Total size:", total_size)

# Deleting the names is not enough on its own: the optimizer state, the
# outputs/loss of the failed step, and (in a notebook) the saved exception
# traceback can all still hold references to the CUDA tensors.
del _loss_fn, model, train_loader, val_loader, backbone, test_dataset, val_dataset, train_dataset
gc.collect()                # collect the unreachable tensors first...
torch.cuda.empty_cache()    # ...then return the cached blocks to the driver
dump_tensors()
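
# To see whether anything was actually freed, the allocator counters are more
# direct than walking gc objects (a sketch; memory_allocated counts bytes in
# live tensors, memory_reserved the cache PyTorch holds on top of them):
def print_gpu_memory(tag=""):
    alloc = torch.cuda.memory_allocated() / 1024 ** 2
    reserved = torch.cuda.memory_reserved() / 1024 ** 2
    print(f"{tag}: allocated={alloc:.1f} MiB, reserved={reserved:.1f} MiB")

print_gpu_memory("after cleanup")
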
class ToxicSimpleNNModel(nn.Module):
    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(
            in_features=self.backbone.pooler.dense.out_features * 2,
            out_features=2,
        )

    def forward(self, input_ids, attention_masks):
        # last hidden state: (batch, seq_len, hidden); the pooler output is ignored
        seq_x, _ = self.backbone(input_ids=input_ids, attention_mask=attention_masks)
        apool = torch.mean(seq_x, 1)      # average pooling over the sequence
        mpool, _ = torch.max(seq_x, 1)    # max pooling over the sequence
        x = torch.cat((apool, mpool), 1)  # (batch, 2 * hidden)
        x = self.dropout(x)
        return self.linear(x)
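
# Shape sanity check for the pooling head (a sketch: 768 is the assumed hidden
# size of an xlm-roberta-base backbone, so the head sees 2 * 768 = 1536
# features, which matches the "2 × 1536" parameter in the dump above):
dummy = torch.randn(4, 128, 768)                         # (batch, seq_len, hidden)
pooled = torch.cat((dummy.mean(1), dummy.max(1)[0]), 1)
assert pooled.shape == (4, 1536)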


backbone = AutoModel.from_pretrained(MODEL)

def train_loop_fn(data_loader, model, loss_fn, optimizer, scheduler, device="cuda"):
    # the scheduler is now created once outside (see below), so the warmup
    # no longer restarts at every epoch
    model.train()
    start_time = time.time()

    avg_loss = 0

    for step, (ids, inputs, attention_masks, targets) in enumerate(data_loader):
        inputs = inputs.to(device, dtype=torch.long)
        attention_masks = attention_masks.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.long)

        # one zero_grad per step is enough (the original called it three times)
        optimizer.zero_grad()
        outputs = model(inputs, attention_masks)

        loss = loss_fn(outputs, targets)
        loss.backward()

        avg_loss += loss.item() / len(data_loader)
        if step % 50 == 0:
            print(f'Training step {step}, time: {(time.time() - start_time):.5f}, avg_loss: {avg_loss}')

        optimizer.step()
        scheduler.step()

    return avg_loss
    
def eval_loop_fn(data_loader, model, loss_fn, device="cuda"):
    model.eval()
    # the dict used to be created with a 'conspiratory' key but extended under
    # 'conspiratory_preds'/'conspiratory_truths', which raised a KeyError
    result = {'id': [], 'conspiratory_preds': [], 'conspiratory_truths': []}
    avg_val_loss = 0.

    with torch.no_grad():
        for ids, inputs, attention_masks, targets in data_loader:
            inputs = inputs.to(device, dtype=torch.long)
            attention_masks = attention_masks.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.long)

            outputs = model(inputs, attention_masks)

            loss = loss_fn(outputs, targets)
            avg_val_loss += loss.item() / len(data_loader)

            # probability of the positive class
            conspiratories = nn.functional.softmax(outputs, dim=1)[:, 1].cpu().numpy()

            result['id'].extend(ids.numpy())
            result['conspiratory_preds'].extend(conspiratories)
            # targets live on the GPU here, so move them back before .numpy()
            result['conspiratory_truths'].extend(targets.cpu().numpy())

    return result, avg_val_loss

model = ToxicSimpleNNModel(backbone).cuda()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

optimizer = AdamW(model.parameters(), lr=LR)

# Build the scheduler once for the whole run; it used to be rebuilt inside
# train_loop_fn, which restarted the warmup every epoch.
num_training_steps = EPOCHS * len(train_loader)
num_warmup_steps = int(WARMUP_PROP * num_training_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

_loss_fn = nn.CrossEntropyLoss().cuda()

def loss_fn(outputs, targets):
    # outputs: (batch, 2) logits, targets: (batch,) class indices.
    # This was the dimension bug: BCEWithLogitsLoss on outputs.view(-1, 1)
    # never matched the (batch,) targets; with two logits per sample,
    # CrossEntropyLoss lines up directly.
    return _loss_fn(outputs, targets)
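
# Worked shape example for the loss (a sketch with an assumed batch of 4):
# two logits per row, one class index per row, no reshaping needed.
logits = torch.randn(4, 2).cuda()
labels = torch.tensor([0, 1, 1, 0]).cuda()
print(loss_fn(logits, labels))   # a scalar loss tensor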

start_time = time.time()

for epoch in range(EPOCHS):
    avg_loss = train_loop_fn(train_loader, model, loss_fn, optimizer, scheduler)

    result, avg_val_loss = eval_loop_fn(val_loader, model, loss_fn)
    score = roc_auc_score(result['conspiratory_truths'], result['conspiratory_preds'])

    dt = time.time() - start_time

    lr = scheduler.get_last_lr()[0]

    print(f'Epoch {epoch + 1}/{EPOCHS} \t lr={lr:.1e} \t t={dt:.0f}s \t loss={avg_loss:.4f} \t val_loss={avg_val_loss:.4f} \t val_auc={score:.4f}')