I am currently experimenting with modifying the KV cache of the LLaVA model in order to perform controlled interventions during generation (similar to cache-steering methods in recent research). The goal is to alter the cached key-value tensors after the prefill phase and then continue decoding from the modified cache.
However, whenever I try to resume generation using model.generate() with my modified past_key_values, I consistently encounter the following error:
Code:
import torch

def generate_with_steering(model, processor, image, prompt_text, steering_k_list, steering_v_list, coeff_k, coeff_v):
    """
    Generates a caption with one-shot KV cache steering, as described in the paper.[1]
    """
    prompt = f"USER: <image>\n{prompt_text}\nASSISTANT:"
    inputs = processor(text=prompt, images=image, return_tensors='pt').to("cuda", torch.float16)

    # 1. Prefill the KV cache by running a forward pass over the full prompt
    with torch.no_grad():
        outputs = model(**inputs, use_cache=True)
    past_key_values = outputs.past_key_values

    # 2. Modify the KV cache object in place
    for i in range(len(steering_k_list)):
        k, v = past_key_values[i]  # each: [batch, num_heads, seq_len, head_dim]
        num_heads = k.shape[1]
        head_dim = k.shape[3]
        reshaped_k = steering_k_list[i].reshape(num_heads, head_dim)
        reshaped_v = steering_v_list[i].reshape(num_heads, head_dim)
        # Apply the steering vector IN-PLACE to the cache of the *last* token.
        # This modifies the tensors *inside* the past_key_values object.
        k[0, :, -1, :] += coeff_k * reshaped_k
        v[0, :, -1, :] += coeff_v * reshaped_v

    # 3. Generate text using the modified cache
    output = model.generate(
        input_ids=inputs['input_ids'][:, -1:],
        past_key_values=past_key_values,  # Pass the original, modified Cache object
        max_new_tokens=100,
        do_sample=False,
    )
    full_response_list = processor.batch_decode(output, skip_special_tokens=True)
    # The output from generate() when using past_key_values might not include the prompt
    return full_response_list[0].strip()
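For context, each entry in past_key_values is a (key, value) pair with the usual [batch, num_heads, seq_len, head_dim] layout, which the reshape and indexing above rely on. A quick sanity check right after the prefill (a minimal sketch; the exact seq_len depends on the prompt and the number of image patch tokens):

# Sanity-check the cache layout the steering loop assumes: every layer's
# key/value tensors should be 4-D with matching shapes.
for i, (k, v) in enumerate(past_key_values):
    assert k.dim() == 4 and k.shape == v.shape, f"unexpected shapes at layer {i}"
print(past_key_values[0][0].shape)
# e.g. torch.Size([1, 32, S, 128]) on a 7B backbone, where S covers the
# image patch tokens plus the text prompt tokens

This check passes on my side, so the steering loop itself runs without complaint; the failure only appears once generate() is called.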
Error:
--- 2. Generating WITH STEERING (k_coeff=0.1, v_coeff=2.0) ---
Traceback (most recent call last):
  File "/home/gpuuser3/Pulkit/CACHE_STEERING/kv-steering-for-vlm/src/verify_steering_vector.py", line 138, in <module>
    steered_caption = generate_with_steering(
                      ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gpuuser3/Pulkit/CACHE_STEERING/kv-steering-for-vlm/src/verify_steering_vector.py", line 85, in generate_with_steering
    output = model.generate(
             ^^^^^^^^^^^^^^^
  File "/home/gpuuser3/.pyenv/versions/kv-steering-vlm/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/gpuuser3/.pyenv/versions/kv-steering-vlm/lib/python3.11/site-packages/transformers/generation/utils.py", line 2564, in generate
    result = decoding_method(
             ^^^^^^^^^^^^^^^^
  File "/home/gpuuser3/.pyenv/versions/kv-steering-vlm/lib/python3.11/site-packages/transformers/generation/utils.py", line 2781, in _sample
    model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gpuuser3/.pyenv/versions/kv-steering-vlm/lib/python3.11/site-packages/transformers/models/llava/modeling_llava.py", line 466, in prepare_inputs_for_generation
    model_inputs = super().prepare_inputs_for_generation(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gpuuser3/.pyenv/versions/kv-steering-vlm/lib/python3.11/site-packages/transformers/generation/utils.py", line 574, in prepare_inputs_for_generation
    inputs_embeds, input_ids = self._cache_dependant_input_preparation(
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gpuuser3/.pyenv/versions/kv-steering-vlm/lib/python3.11/site-packages/transformers/generation/utils.py", line 476, in _cache_dependant_input_preparation
    or (cache_position[-1] >= input_ids.shape[1]) # Exception 3
        ~~~~~~~~~~~~~~^^^^
IndexError: index -1 is out of bounds for dimension 0 with size 0
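My current suspicion, from the frame in generation/utils.py where this fires: generate() derives cache_position from the input_ids it receives and then slices off the positions already covered by the cache. Because I pass only the last prompt token while the cache already holds the entire prompt, that slice comes out empty, and cache_position[-1] fails with exactly this IndexError. A minimal, self-contained illustration of what I believe is happening (the lengths are hypothetical, not taken from my run):

import torch

# cache_position starts as one position per *new* input token (length 1 here),
# then the positions already present in the cache are sliced away.
input_len = 1    # I pass inputs['input_ids'][:, -1:]
past_len = 600   # hypothetical length of the prefilled cache
cache_position = torch.arange(input_len)[past_len:]  # empty: tensor([], dtype=torch.int64)
cache_position[-1]  # IndexError: index -1 is out of bounds for dimension 0 with size 0

What is the correct way to resume model.generate() from a manually modified past_key_values in this situation?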