Basic usage#

LMCache can be launched in multiple ways. Here are some examples:

How to:#

  • Launch a single vLLM instance with LMCache?

import copy
import json
import os
import time

import lmcache_vllm
from lmcache_vllm.vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
context_file = os.path.join(os.pardir, 'ffmpeg.txt')
output_file = "offline_inference_outputs.jsonl"

context_text = None
with open(context_file, 'r') as f:
   context_text = f.read()
assert context_text is not None
tokenizer = AutoTokenizer.from_pretrained(model_name)

context_messages = [
   {
      "role": "user",
      "content": "I've got a document, "
                 f"here's the content:```\n{context_text}\n```."
   },
   {
      "role": "assistant",
      "content": "I've got your document"
   },
]
user_inputs_batch = [
   "Give me a concise description for the format"
   " of ffmpeg command in one line.",
]


def get_context_length(tokenizer, context_messages):
   return len(tokenizer.apply_chat_template(context_messages, tokenize=False))


def gen_prompts(tokenizer, context_messages, user_inputs_of_batch):
   generated_prompts = []
   for user_input in user_inputs_of_batch:
      copied_context_messages = copy.deepcopy(context_messages)
      copied_context_messages.append({"role": "user", "content": user_input})
      generated_prompts.append(
            tokenizer.apply_chat_template(copied_context_messages,
                                          tokenize=False))
   return generated_prompts


def append_outputs(output_file_name, outputs, context_length, time_taken):
   user_inputs = []
   generated_texts = []
   for output in outputs:
      prompt = output.prompt
      user_input = prompt[context_length:]
      user_inputs.append(user_input)
      generated_text = output.outputs[0].text
      generated_texts.append(f"{generated_text!r}")
   json_dict = {
      "user_inputs": user_inputs,
      "generated_texts": generated_texts,
      "time in seconds": time_taken
   }
   with open(output_file_name, "a") as f:
      f.write(json.dumps(json_dict) + '\n')


context_length = get_context_length(tokenizer, context_messages)
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
prompts = gen_prompts(tokenizer, context_messages, user_inputs_batch)
# Create an LLM.
llm = LLM(model=model_name,
          gpu_memory_utilization=0.8,
          enable_chunked_prefill=False,
          max_model_len=32768)

# Clear output file.
with open(output_file, "w") as f:
   pass

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
t1 = time.perf_counter()
first_outputs = llm.generate(prompts, sampling_params)
t2 = time.perf_counter()
print(f"\n\nFirst request Time: {t2 - t1} seconds\n\n")
append_outputs(output_file, first_outputs, context_length, t2 - t1)
t3 = time.perf_counter()
second_outputs = llm.generate(prompts, sampling_params)
t4 = time.perf_counter()
print(f"\n\nSecond request Time: {t4 - t3} seconds\n\n")
append_outputs(output_file, second_outputs, context_length, t4 - t3)

# Graceful exit
lmcache_vllm.close_lmcache_engine()

Save the code above to a file, e.g., offline_inference.py. Then, create an LMCache configuration file with the following contents:

chunk_size: 256
local_device: "cpu"
remote_url: "lm://localhost:65432"
remote_serde: "cachegen"

# Whether retrieve() is pipelined or not
pipelined_backend: False

Save the configuration above to a file, e.g., offline.yaml.

Now you can run the following command to launch a vLLM instance with LMCache:

# Start the LMCache server (keep it running, e.g., in a separate terminal)
$ lmcache_server localhost 65432

# Run the offline inference script with the LMCache config file
$ LMCACHE_CONFIG_FILE=offline.yaml CUDA_VISIBLE_DEVICES=0 python offline_inference.py

Note

The example above requires one GPU and uses port 65432 for the LMCache server. You can change the port number if needed.
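
After the script runs, offline_inference_outputs.jsonl contains one JSON line per request with the keys written by append_outputs. The second request should be noticeably faster than the first because the KV cache for the shared context is reused. As a quick check, you can compare the two timings with a small sketch like the following (the file name and keys come from the script above; everything else is illustrative):

import json

# Each line of the output file is one record written by append_outputs().
with open("offline_inference_outputs.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

# The script writes two records: the first (cold) request and the second (cached) request.
first, second = records[0], records[1]
print(f"First request:  {first['time in seconds']:.2f} s")
print(f"Second request: {second['time in seconds']:.2f} s")
print(f"Speedup with LMCache: {first['time in seconds'] / second['time in seconds']:.1f}x")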

  • Launch a vLLM instance with LMCache and share the KV cache across multiple vLLM instances?

LMCache can share the KV cache across multiple vLLM instances through the lmcache.server module. First, create an example LMCache config file, e.g., example.yaml:

chunk_size: 256
local_device: "cpu"
remote_url: "lm://localhost:65432"
remote_serde: "cachegen"

# Whether retrieve() is pipelined or not
pipelined_backend: False

Then, start the LMCache server and multiple vLLM instances with the LMCache config file.

Here is a quick example:

# Start lmcache server
$ lmcache_server localhost 65432

# Then, start two vLLM instances with the LMCache config file

# start the first vLLM instance
$ LMCACHE_CONFIG_FILE=example.yaml CUDA_VISIBLE_DEVICES=0 lmcache_vllm serve lmsys/longchat-7b-16k --gpu-memory-utilization 0.8 --port 8000

# start the second vLLM instance
$ LMCACHE_CONFIG_FILE=example.yaml CUDA_VISIBLE_DEVICES=1 lmcache_vllm serve lmsys/longchat-7b-16k --gpu-memory-utilization 0.8 --port 8001
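
To see the sharing in action, you can send the same long prompt to both instances and compare response times: the second instance should retrieve the KV cache stored by the first one through the LMCache server instead of recomputing the prefill. Here is a minimal Python sketch; it assumes that lmcache_vllm serve exposes vLLM's usual OpenAI-compatible API under /v1 on the ports used above, and the prompt placeholder is purely illustrative:

import time

import requests


# Hypothetical helper: POST the same prompt to one vLLM instance and time the request.
# Assumes `lmcache_vllm serve` exposes vLLM's usual OpenAI-compatible API under /v1.
def timed_completion(port, prompt):
    start = time.perf_counter()
    response = requests.post(
        f"http://localhost:{port}/v1/completions",
        json={
            "model": "lmsys/longchat-7b-16k",
            "prompt": prompt,
            "max_tokens": 64,
        },
    )
    response.raise_for_status()
    return time.perf_counter() - start


# Illustrative placeholder: in practice this would be a long shared context.
long_prompt = "<long shared context>\n\nQuestion: summarize the context in one line."

# The first instance computes the prefill and stores the KV cache via lm://localhost:65432.
t_first = timed_completion(8000, long_prompt)
# The second instance should fetch that KV cache from the LMCache server instead of
# recomputing the prefill for the shared prefix.
t_second = timed_completion(8001, long_prompt)

print(f"Instance :8000 took {t_first:.2f} s, instance :8001 took {t_second:.2f} s")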