Why am I still running out of GPU memory even after deleting everything and clearing the cache?

I’m trying to run inference on a small set of 100 prompts, but I keep getting a CUDA out-of-memory error after only 6 examples, even though I delete every intermediate variable and clear the cache at the end of each iteration. The code, console output, and full traceback are below:


with torch.no_grad():
  for prompt in tqdm(prompts):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to('cuda')
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=None,
        temperature=None,
        top_p=None,
        use_cache=True,
        eos_token_id=0,
        pad_token_id=0,
        return_dict_in_generate=False,
    ).to('cpu')

    # decode the full sequence, then strip the prompt so only the newly generated text is kept
    text = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    prompt_length = len(
        tokenizer.decode(
            inputs[0].ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
    )
    new_text = text[prompt_length:]
    responses.append(new_text)

    # drop every per-iteration reference, then release unused cached blocks
    del inputs
    del outputs
    del text
    del prompt_length
    del new_text
    torch.cuda.empty_cache()
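For reference, this is the kind of per-iteration check I was planning to add to see whether the growth comes from live tensors or from blocks held by the caching allocator. It is only a sketch of mine (the helper name is made up), built on torch.cuda.memory_allocated and torch.cuda.memory_reserved:

import torch

def log_cuda_memory(tag):
    # memory_allocated(): bytes currently held by live tensors
    # memory_reserved(): bytes held by PyTorch's caching allocator, including unused cached blocks
    allocated = torch.cuda.memory_allocated() / 1024 ** 3
    reserved = torch.cuda.memory_reserved() / 1024 ** 3
    print(f"[{tag}] allocated={allocated:.2f} GiB, reserved={reserved:.2f} GiB")

The output below is from the run without this logging.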

  0%|          | 0/100 [00:00<?, ?it/s]/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
  6%|▌         | 6/100 [00:30<08:00,  5.11s/it]
---------------------------------------------------------------------------
OutOfMemoryError                          Traceback (most recent call last)
<ipython-input-9-b5760b1fea10> in <cell line: 3>()
      4   for prompt in tqdm(prompts):
      5     inputs = tokenizer(prompt, return_tensors="pt", padding=True).to('cuda')
----> 6     outputs = model.generate(
      7         **inputs,
      8         max_new_tokens=100,

15 frames
~/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-30b-instruct/2abf1163dd8c9b11f07d805c06e6ec90a1f2037e/attention.py in scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_value, softmax_scale, attn_bias, key_padding_mask, is_causal, dropout_p, training, needs_weights, multiquery)
     39         if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
     40             raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
---> 41         attn_weight = attn_weight + attn_bias
     42     min_val = torch.finfo(q.dtype).min
     43     if key_padding_mask is not None:

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.75 GiB (GPU 0; 39.56 GiB total capacity; 33.09 GiB already allocated; 1.43 GiB free; 36.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
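The last line of the error suggests setting max_split_size_mb when reserved memory is much larger than allocated memory. Would configuring the allocator like this be worth trying here, or is the growth coming from somewhere that empty_cache() can't reach? This is just a sketch on my part; I believe PYTORCH_CUDA_ALLOC_CONF has to be set before the first CUDA allocation, and the 128 MiB value is an arbitrary guess, not a recommendation:

import os

# must run before anything touches the GPU; 128 MiB max split size is an arbitrary starting point
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch  # torch is imported (and the model loaded) only after the env var is set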