hi. i’m a newbie and adjusting some kernel I took from kaggle. I use the transformers library with the xla roberto pretrained model as backbone.
I train my model, but it fails when calculating loss function. some dimensions are wrong. I try an adjustment and run again. but receive this error:
RuntimeError: CUDA out of memory. Tried to allocate 42.00 MiB (GPU 0; 7.79 GiB total capacity; 6.45 GiB already allocated; 37.56 MiB free; 6.76 GiB reserved in total by PyTorch)
so I think maybe I should remove variables that might contain tensors. I use the code that follows. but it’s no use. i receive output like this
Parameter: GPU pinned 2
Parameter: GPU pinned 2 × 1536
Tensor: GPU pinned 16 × 224
so nothing is deleted. how can i clear the cuda memory for good? I want to eradicate, kill, vanquish, etc. all tensors. see code below. is this wad? seems like ill behavior to me. i did some pytorch 2 years ago and I dont remember this issue. neither on keras. perhaps this issue is related to roberto pretrained model? should I put issue to shitface transformers?
edit: i have fixed the dimension bug and apparently my gpu doesnt have enough ram to train the model beyond one batch with batch size 16… i still dont understand why im unable to clear the tensors though.
i use python 3.8 and pytorch 1.5
def pretty_size(size):
"""Pretty prints a torch.Size object"""
assert(isinstance(size, torch.Size))
return " × ".join(map(str, size))
def dump_tensors(gpu_only=True):
"""Prints a list of the Tensors being tracked by the garbage collector."""
import gc
total_size = 0
for obj in gc.get_objects():
try:
if torch.is_tensor(obj):
if not gpu_only or obj.is_cuda:
print("%s:%s%s %s" % (type(obj).__name__,
" GPU" if obj.is_cuda else "",
" pinned" if obj.is_pinned else "",
pretty_size(obj.size())))
total_size += obj.numel()
elif hasattr(obj, "data") and torch.is_tensor(obj.data):
if not gpu_only or obj.is_cuda:
print("%s → %s:%s%s%s%s %s" % (type(obj).__name__,
type(obj.data).__name__,
" GPU" if obj.is_cuda else "",
" pinned" if obj.data.is_pinned else "",
" grad" if obj.requires_grad else "",
" volatile" if obj.volatile else "",
pretty_size(obj.data.size())))
total_size += obj.data.numel()
except Exception as e:
pass
print("Total size:", total_size)
del _loss_fn, model, train_loader, val_loader, backbone, test_dataset, val_dataset, train_dataset
torch.cuda.empty_cache()
dump_tensors()
class ToxicSimpleNNModel(nn.Module):
def __init__(self, backbone):
super(ToxicSimpleNNModel, self).__init__()
self.backbone = backbone
self.dropout = nn.Dropout(0.3)
self.linear = nn.Linear(
in_features=self.backbone.pooler.dense.out_features*2,
out_features=2,
)
def forward(self, input_ids, attention_masks):
bs, seq_length = input_ids.shape
seq_x, _ = self.backbone(input_ids=input_ids, attention_mask=attention_masks)
apool = torch.mean(seq_x, 1)
mpool, _ = torch.max(seq_x, 1)
x = torch.cat((apool, mpool), 1)
x = self.dropout(x)
return self.linear(x)
backbone = AutoModel.from_pretrained(MODEL)
def train_loop_fn(data_loader, model, loss_fn, optimizer, device="cuda"):
model.train()
start_time = time.time()
avg_loss = 0
num_warmup_steps = int(WARMUP_PROP * EPOCHS * len(data_loader))
num_training_steps = EPOCHS * len(data_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
for step, (ids, inputs, attention_masks, targets) in enumerate(data_loader):
model.zero_grad()
optimizer.zero_grad()
inputs = inputs.to(device, dtype=torch.long)
attention_masks = attention_masks.to(device, dtype=torch.long)
targets = targets.to(device, dtype=torch.long)
optimizer.zero_grad()
outputs = model(inputs, attention_masks)
loss = loss_fn(outputs, targets)
loss.backward()
avg_loss += loss.item() / len(data_loader)
if step % 50 == 0:
print(f'Prediction Step {step}, time: {(time.time() - start_time):.5f}, avg_loss: {avg_loss}')
optimizer.step()
scheduler.step()
model.eval()
def eval_loop_fn(data_loader, model, loss_fn, device="cuda"):
result = {'id': [], 'conspiratory': []}
avg_val_loss = 0.
with torch.no_grad():
for ids, inputs, attention_masks, targets in data_loader:
inputs = inputs.to(device, dtype=torch.long)
attention_masks = attention_masks.to(device, dtype=torch.long)
targets = targets.to(device, dtype=torch.float)
outputs = model(inputs, attention_masks)
loss = loss_fn(outputs.detach().view(-1).float(), targets.float().cuda())
avg_val_loss += loss.item() / len(data_loader)
conspiratories = nn.functional.softmax(outputs, dim=1).data.cpu().numpy()[:,1]
result['id'].extend(ids.numpy())
result['conspiratory_preds'].extend(conspiratories)
result['conspiratory_truths'].extend(targets.numpy())
return result
model = ToxicSimpleNNModel(backbone).cuda()
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
optimizer = AdamW(model.parameters(), lr=LR)
_loss_fn = nn.BCEWithLogitsLoss().cuda()
def loss_fn(outputs, targets):
print(outputs.size())
print(targets.size())
return _loss_fn(outputs.view(-1, 1), targets)
start_time = time.time()
for epoch in range(EPOCHS):
train_loop_fn(train_loader, model, loss_fn, optimizer)
result = eval_loop_fn(val_loader, model, loss_fn)
score = roc_auc_score(result["conspiratory_truths"], result["conspiratory_preds"])
dt = time.time() - start_time
lr = scheduler.get_last_lr()[0]
print(f'Epoch {epoch + 1}/{epochs} \t lr={lr:.1e} \t t={dt:.0f}s \t loss={avg_loss:.4f} \t val_loss={avg_val_loss:.4f} \t val_auc={score:.4f}')