I’m trying to run inference on a small set of 100 prompts (mosaicml/mpt-30b-instruct loaded in 8-bit with bitsandbytes, on a 40 GB GPU) with the code below, but I keep getting GPU out-of-memory errors after only 6 examples, even though I delete every variable and clear the CUDA cache at the end of each iteration:
with torch.no_grad():
    for prompt in tqdm(prompts):
        inputs = tokenizer(prompt, return_tensors="pt", padding=True).to('cuda')
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=None,
            temperature=None,
            top_p=None,
            use_cache=True,
            eos_token_id=0,
            pad_token_id=0,
            return_dict_in_generate=False,
        ).to('cpu')
        text = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        prompt_length = len(
            tokenizer.decode(
                inputs[0].ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        )
        new_text = text[prompt_length:]
        responses.append(new_text)
        del inputs
        del outputs
        del text
        del prompt_length
        del new_text
        torch.cuda.empty_cache()
0%| | 0/100 [00:00<?, ?it/s]/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
6%|▌ | 6/100 [00:30<08:00, 5.11s/it]
---------------------------------------------------------------------------
OutOfMemoryError Traceback (most recent call last)
<ipython-input-9-b5760b1fea10> in <cell line: 3>()
4 for prompt in tqdm(prompts):
5 inputs = tokenizer(prompt, return_tensors="pt", padding=True).to('cuda')
----> 6 outputs = model.generate(
7 **inputs,
8 max_new_tokens=100,
15 frames
~/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-30b-instruct/2abf1163dd8c9b11f07d805c06e6ec90a1f2037e/attention.py in scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_value, softmax_scale, attn_bias, key_padding_mask, is_causal, dropout_p, training, needs_weights, multiquery)
39 if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
40 raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
---> 41 attn_weight = attn_weight + attn_bias
42 min_val = torch.finfo(q.dtype).min
43 if key_padding_mask is not None:
OutOfMemoryError: CUDA out of memory. Tried to allocate 1.75 GiB (GPU 0; 39.56 GiB total capacity; 33.09 GiB already allocated; 1.43 GiB free; 36.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
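I don't understand what else could be holding on to GPU memory between iterations. As a next step I'm thinking of logging allocated/reserved memory and the prompt length on every iteration, to see whether usage creeps up steadily (suggesting something is being retained) or whether a single long prompt simply spikes the attention memory. A rough, untested sketch of what I have in mind, reusing the same model, tokenizer, and prompts as above:

    import torch
    from tqdm import tqdm

    # Diagnostic sketch: record GPU memory around each generate() call so I can
    # tell whether memory grows across iterations or spikes on particular
    # (long) prompts.
    with torch.no_grad():
        for i, prompt in enumerate(tqdm(prompts)):
            inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
            before = torch.cuda.memory_allocated() / 1024 ** 3
            outputs = model.generate(**inputs, max_new_tokens=100, pad_token_id=0)
            after = torch.cuda.memory_allocated() / 1024 ** 3
            reserved = torch.cuda.memory_reserved() / 1024 ** 3
            print(f"[{i}] prompt_tokens={inputs['input_ids'].shape[-1]} "
                  f"allocated {before:.2f} GiB -> {after:.2f} GiB, "
                  f"reserved {reserved:.2f} GiB")
            del inputs, outputs
            torch.cuda.empty_cache()

What could be causing memory to accumulate like this despite the explicit deletes and the empty_cache() call on every iteration?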