Clearing CUDA memory after error

Hi. I'm a newbie and I'm adjusting a kernel I took from Kaggle. I use the transformers library with the pretrained XLM-RoBERTa model as the backbone.

I train my model, but it fails when calculating the loss function: some dimensions are wrong. I tried an adjustment and ran again, but received this error:

RuntimeError: CUDA out of memory. Tried to allocate 42.00 MiB (GPU 0; 7.79 GiB total capacity; 6.45 GiB already allocated; 37.56 MiB free; 6.76 GiB reserved in total by PyTorch)

So I figured I should delete the variables that might contain tensors. I used the code that follows, but it's no use; I still receive output like this:

Parameter: GPU pinned 2
Parameter: GPU pinned 2 × 1536
Tensor: GPU pinned 16 × 224

So nothing is deleted. How can I clear the CUDA memory for good? I want to eradicate all tensors. See the code below. Is this working as designed? It seems like bad behavior to me. I did some PyTorch two years ago and I don't remember this issue, nor do I remember it in Keras. Could this be related to the XLM-RoBERTa pretrained model? Should I open an issue with Hugging Face transformers?
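
For completeness, besides the del in the code below I also tried forcing a garbage collection pass, emptying the CUDA cache, and reading the allocator counters. This is roughly what I ran (I'm assuming torch.cuda.memory_allocated() and torch.cuda.memory_reserved() are the right calls to check this on 1.5):

import gc
import torch

gc.collect()                # drop unreachable Python objects first
torch.cuda.empty_cache()    # then release cached blocks back to the driver

# both counters stay high, so something must still be holding references
print(torch.cuda.memory_allocated() / 1024 ** 2, "MiB allocated")
print(torch.cuda.memory_reserved() / 1024 ** 2, "MiB reserved")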

Edit: I have fixed the dimension bug, and apparently my GPU doesn't have enough RAM to train the model beyond one batch with batch size 16. I still don't understand why I'm unable to clear the tensors, though.

I use Python 3.8 and PyTorch 1.5.

import gc
import time

import torch
import torch.nn as nn
from import DataLoader
from sklearn.metrics import roc_auc_score
from transformers import AutoModel, AdamW, get_linear_schedule_with_warmup

def pretty_size(size):
    """Formats a torch.Size object as a human-readable string."""
    assert isinstance(size, torch.Size)
    return " × ".join(map(str, size))

def dump_tensors(gpu_only=True):
    """Prints a list of the Tensors being tracked by the garbage collector."""
    import gc
    total_size = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj):
                if not gpu_only or obj.is_cuda:
                    print("%s:%s%s %s" % (type(obj).__name__,
                                          " GPU" if obj.is_cuda else "",
                                          " pinned" if obj.is_pinned() else "",
                                          pretty_size(obj.size())))
                    total_size += obj.numel()
            # legacy Variable-style objects that wrap a tensor in .data
            elif hasattr(obj, "data") and torch.is_tensor(
                if not gpu_only or obj.is_cuda:
                    print("%s → %s:%s%s%s %s" % (type(obj).__name__,
                                                 type(,
                                                 " GPU" if obj.is_cuda else "",
                                                 " pinned" if else "",
                                                 " grad" if obj.requires_grad else "",
                                                 pretty_size(
                    total_size +=
        except Exception:
            pass  # some tracked objects raise on attribute access
    print("Total size:", total_size)

# my attempt to free everything after the crash:
del _loss_fn, model, train_loader, val_loader, backbone, test_dataset, val_dataset, train_dataset
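
# even after the del, dump_tensors() keeps printing the same lines quoted at the top:
dump_tensors()
# Parameter: GPU pinned 2
# Parameter: GPU pinned 2 × 1536
# Tensor: GPU pinned 16 × 224
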
class ToxicSimpleNNModel(nn.Module):
    def __init__(self, backbone):
        super(ToxicSimpleNNModel, self).__init__()
        self.backbone = backbone
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(backbone.config.hidden_size * 2, 2)  # 2 × 1536 weight, matching the dump

    def forward(self, input_ids, attention_masks):
        bs, seq_length = input_ids.shape
        seq_x, _ = self.backbone(input_ids=input_ids, attention_mask=attention_masks)
        apool = torch.mean(seq_x, 1)
        mpool, _ = torch.max(seq_x, 1)
        x =, mpool), 1)
        x = self.dropout(x)
        return self.linear(x)
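
# sanity check on the pooling shapes (assuming the base model's hidden size of 768;
# the 16 × 224 Tensor in the dump is presumably a batch of input_ids, and 2 × 1536 is self.linear.weight):
bs, seq_length, hidden = 16, 224, 768
seq_x = torch.randn(bs, seq_length, hidden)  # stand-in for the backbone output
apool = torch.mean(seq_x, 1)                 # (16, 768)
mpool, _ = torch.max(seq_x, 1)               # (16, 768)
x =, mpool), 1)            # (16, 1536), matching the 2 × 1536 weight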

backbone = AutoModel.from_pretrained(MODEL)

def train_loop_fn(data_loader, model, loss_fn, optimizer, device="cuda"):
    start_time = time.time()
    avg_loss = 0
    model.train()
    num_warmup_steps = int(WARMUP_PROP * EPOCHS * len(data_loader))
    num_training_steps = EPOCHS * len(data_loader)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
    for step, (ids, inputs, attention_masks, targets) in enumerate(data_loader):
        inputs =, dtype=torch.long)
        attention_masks =, dtype=torch.long)
        targets =, dtype=torch.float)  # BCEWithLogitsLoss expects float targets

        outputs = model(inputs, attention_masks)
        loss = loss_fn(outputs, targets)
        avg_loss += loss.item() / len(data_loader)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        if step % 50 == 0:
            print(f'Step {step}, time: {(time.time() - start_time):.5f}, avg_loss: {avg_loss}')
    return avg_loss, scheduler.get_last_lr()[0]

def eval_loop_fn(data_loader, model, loss_fn, device="cuda"):
    result = {'id': [], 'conspiratory_preds': [], 'conspiratory_truths': []}
    avg_val_loss = 0.
    model.eval()
    with torch.no_grad():
        for ids, inputs, attention_masks, targets in data_loader:
            inputs =, dtype=torch.long)
            attention_masks =, dtype=torch.long)
            targets =, dtype=torch.float)

            outputs = model(inputs, attention_masks)
            loss = loss_fn(outputs.detach().view(-1).float(), targets)
            avg_val_loss += loss.item() / len(data_loader)
            conspiratories = nn.functional.softmax(outputs, dim=1).cpu().numpy()[:, 1]
            result['id'].extend(ids)
            result['conspiratory_preds'].extend(conspiratories)
            result['conspiratory_truths'].extend(targets.cpu().numpy())
    return result, avg_val_loss

model = ToxicSimpleNNModel(backbone).cuda()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

optimizer = AdamW(model.parameters(), lr=LR)

_loss_fn = nn.BCEWithLogitsLoss().cuda()

def loss_fn(outputs, targets):
    return _loss_fn(outputs.view(-1, 1), targets)
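
# for context, the dimension bug I mentioned at the top was a shape mismatch
# along these lines (toy numbers; my reconstruction, not the exact crash):
example_outputs = torch.randn(4, 2)  # two logits per sample, like the head above
example_targets = torch.rand(4)      # but only one label per sample
# example_outputs.view(-1, 1) has shape (8, 1), which BCEWithLogitsLoss cannot
# match against the (4,) targets, hence the size-mismatch error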

start_time = time.time()

for epoch in range(EPOCHS):
    avg_loss, lr = train_loop_fn(train_loader, model, loss_fn, optimizer)
    result, avg_val_loss = eval_loop_fn(val_loader, model, loss_fn)
    score = roc_auc_score(result["conspiratory_truths"], result["conspiratory_preds"])
    dt = time.time() - start_time
    print(f'Epoch {epoch + 1}/{EPOCHS} \t lr={lr:.1e} \t t={dt:.0f}s \t loss={avg_loss:.4f} \t val_loss={avg_val_loss:.4f} \t val_auc={score:.4f}')