KV blending
How to blend the KV cache using LMCache

The script below precomputes the KV cache for two text chunks offline, then blends those precomputed caches into a new prompt that combines both chunks with a question.
import time

import lmcache_vllm
import torch
from lmcache_vllm.blend_adapter import (OfflineKVPreCompute,
                                        combine_input_prompt_chunks)
from lmcache_vllm.vllm import LLM, SamplingParams

torch.multiprocessing.set_start_method('spawn')

# Read the two context chunks whose KV caches will be precomputed.
context_files = ["chunk1.txt", "chunk2.txt"]
chunks = []
for context_file in context_files:
    with open(context_file, "r") as fin:
        context = fin.read()
    chunks.append(context)

sys_prompt = "Here's a document from the user: "
question = "Question: What does this document mainly talk about? Answer: "

llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
          gpu_memory_utilization=0.7,
          tensor_parallel_size=1)

sampling_params_generation = SamplingParams(temperature=0.0,
                                            top_p=0.95,
                                            max_tokens=30)

print("-------------- Pre-computing KV cache for chunks -------------------")
# Precompute and store the KV cache of each chunk so it can be blended later.
offline_precompute = OfflineKVPreCompute(llm)
for chunk in chunks:
    offline_precompute.precompute_kv(chunk)

# Give the cache engine a moment to finish storing the precomputed KV.
time.sleep(3)

print("Running the real query here!")
# Pass the prompt as separate chunks so LMCache can reuse their KV caches.
user_prompt = [sys_prompt, chunks[0], chunks[1], question]
user_prompt = combine_input_prompt_chunks(user_prompt)
outputs = llm.generate(user_prompt, sampling_params_generation)

for output in outputs:
    generated_text = output.outputs[0].text
    print(f"Newly generated text: {generated_text!r}")
    ttft = output.metrics.first_token_time - output.metrics.first_scheduled_time
    print(f"Time to first token: {ttft:.3f} seconds")

# Graceful exit
lmcache_vllm.close_lmcache_engine()
Save the code above to a file, e.g., kv_blend.py.
LMCache is configured through a YAML file; the following settings enable KV blending:

chunk_size: 256
local_device: "cpu"
# Enables KV blending
enable_blending: True

Save the configuration above to a file, e.g., kv_blend.yaml.
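If you prefer to keep the settings in code, the optional sketch below writes the same three settings with PyYAML. The helper file name write_config.py and the use of PyYAML are illustrative assumptions, not part of the LMCache example.

# write_config.py -- optional helper (illustrative only): generates
# kv_blend.yaml with the same three settings shown above.
import yaml  # requires PyYAML (pip install pyyaml)

config = {
    "chunk_size": 256,        # number of tokens per KV cache chunk
    "local_device": "cpu",    # keep the cache in CPU memory
    "enable_blending": True,  # turn on KV blending
}

with open("kv_blend.yaml", "w") as fout:
    yaml.safe_dump(config, fout, sort_keys=False)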
You will also need the two context files read by the script, chunk1.txt and chunk2.txt, which are provided with this example.
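If you only want to smoke-test the pipeline before fetching the real chunks, a stand-in script like the sketch below can create two small placeholder files. The file name make_chunks.py and the placeholder texts are made up for illustration and will not reproduce the example's answer.

# make_chunks.py -- optional stand-in (illustrative only): writes two small
# text files so kv_blend.py can run end to end without the original chunks.
placeholder_texts = {
    "chunk1.txt": "LMCache keeps the KV cache of reusable text so it is not recomputed.",
    "chunk2.txt": "KV blending combines precomputed caches of several chunks in one prompt.",
}

for filename, text in placeholder_texts.items():
    with open(filename, "w") as fout:
        fout.write(text)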
Now you can run the following command to blend the KV cache using LMCache:
$ LMCACHE_CONFIG_FILE=kv_blend.yaml CUDA_VISIBLE_DEVICES=0 python kv_blend.py
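To see what blending buys you, it can help to compare the reported time to first token against a plain vLLM run that neither precomputes nor blends any KV cache. The sketch below is a hypothetical companion script, not part of the LMCache example: the file name baseline_ttft.py and the single-string prompt are assumptions, and output.metrics may be unavailable in some vLLM versions.

# baseline_ttft.py -- hypothetical comparison script (illustrative only):
# the same prompt served by plain vLLM, with no precomputed or blended KV.
from vllm import LLM, SamplingParams

chunks = [open(f, "r").read() for f in ["chunk1.txt", "chunk2.txt"]]
sys_prompt = "Here's a document from the user: "
question = "Question: What does this document mainly talk about? Answer: "

llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
          gpu_memory_utilization=0.7,
          tensor_parallel_size=1)
sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=30)

# One flat prompt string: the whole context is prefilled from scratch.
prompt = sys_prompt + chunks[0] + chunks[1] + question
outputs = llm.generate(prompt, sampling_params)

for output in outputs:
    ttft = output.metrics.first_token_time - output.metrics.first_scheduled_time
    print(f"Baseline time to first token: {ttft:.3f} seconds")

Run it in a separate process without LMCACHE_CONFIG_FILE set, so that no cached KV is reused.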
Note

More advanced examples of KV blending can be found here.