lmcache.storage_backend.serde package#

Submodules#

lmcache.storage_backend.serde.cachegen_basics module#

class lmcache.storage_backend.serde.cachegen_basics.CacheGenConfig(nlayers: int, kspecs: List[lmcache.storage_backend.serde.cachegen_basics.QuantizationSpec], vspecs: List[lmcache.storage_backend.serde.cachegen_basics.QuantizationSpec])[source]#

static from_model_name(model_name: str) → CacheGenConfig[source]#

kspecs: List[QuantizationSpec]#

nlayers: int#

vspecs: List[QuantizationSpec]#

class lmcache.storage_backend.serde.cachegen_basics.CacheGenEncoderOutput(bytestream: bytes, start_indices: torch.Tensor, cdf: torch.Tensor, max_tensors_key: torch.Tensor, max_tensors_value: torch.Tensor, num_heads: int, head_size: int)[source]#

bytestream: bytes#

cdf: Tensor#

static from_bytes(bs: bytes) → CacheGenEncoderOutput[source]#

head_size: int#

max_tensors_key: Tensor#

max_tensors_value: Tensor#

num_heads: int#

start_indices: Tensor#

to_bytes() → bytes[source]#: Save the output to a file

class lmcache.storage_backend.serde.cachegen_basics.CacheGenGPUBytestream(bytestream: torch.Tensor, bytestream_lengths: torch.Tensor, ntokens: int)[source]#

bytestream: Tensor#

bytestream_lengths: Tensor#

ntokens: int#

class lmcache.storage_backend.serde.cachegen_basics.CacheGenGPUEncoderOutput(data_chunks: List[lmcache.storage_backend.serde.cachegen_basics.CacheGenGPUBytestream], cdf: torch.Tensor, max_tensors_key: torch.Tensor, max_tensors_value: torch.Tensor, num_heads: int, head_size: int)[source]#

cdf: Tensor#

data_chunks: List[CacheGenGPUBytestream]#

debug_print_device()[source]#

static from_bytes(bs: bytes) → CacheGenGPUEncoderOutput[source]#

head_size: int#

max_tensors_key: Tensor#

max_tensors_value: Tensor#

num_heads: int#

to_bytes() → bytes[source]#: Save the output to a file

class lmcache.storage_backend.serde.cachegen_basics.QuantizationSpec(start_layer: int, end_layer: int, bins: int)[source]#

bins: int#

end_layer: int#

start_layer: int#

lmcache.storage_backend.serde.cachegen_decoder module#

class lmcache.storage_backend.serde.cachegen_decoder.CacheGenDeserializer(config: LMCacheEngineConfig, metadata: LMCacheEngineMetadata, dtype)[source]#

Bases: Deserializer

from_bytes(bs: bytes) → Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:: bytes: a stream of bytes
Output:: torch.Tensor: the deserialized pytorch tensor

get_output_buffer(nlayers: int, nchannels: int, ntokens: int)[source]#

make_key_bins(config: CacheGenConfig) → Tensor[source]#

make_value_bins(config: CacheGenConfig) → Tensor[source]#

lmcache.storage_backend.serde.cachegen_decoder.decode_chunk(cdf: Tensor, data_chunk: CacheGenGPUBytestream, target_buffer: Tensor) → None[source]#: Write the decode output in target_buffer Expected shape: [nlayers (kv in total), ntokens, nchannels]

lmcache.storage_backend.serde.cachegen_decoder.decode_function_gpu(cdf: Tensor, data_chunks: List[CacheGenGPUBytestream], layers_in_key: int, chunk_size: int, output: Tensor)[source]#

Given the path to the encoded KV bytestream, decode the KV cache

Inputs:: cdf: the cdf tensor, in shape [2 * nlayers, nchannels, bins + 1] data_chunks: the data_chunks in the encoder’s output layers_in_key: number of layers in K (or V) (K/V should have the same number of layers) chunk_size: the chunk_size output: output buffer, in shape [ntokens, 2 * nlayers * nchannels]
Outputs:: key: the decoded key tensor in the shape of (layers, tokens, nchannels) value: the decoded value tensor in the shape of (layers, tokens, nchannels)

lmcache.storage_backend.serde.cachegen_decoder.do_dequantize(t: Tensor, bins: Tensor, maxtensors: Tensor)[source]#: t: [nlayers, ntokens, nchannels] bins: [nlayers] maxtensors: [nlayers, ntokens, 1]

lmcache.storage_backend.serde.cachegen_decoder.quant(bins: int, xq: Tensor, max1: float)[source]#

lmcache.storage_backend.serde.cachegen_decoder.recombine_bytes(bytes_tensor, output_lengths) → Tensor[source]#

lmcache.storage_backend.serde.cachegen_encoder module#

class lmcache.storage_backend.serde.cachegen_encoder.CacheGenEncoderImpl(**kwargs)[source]#

compute_cdf(is_key)[source]#: Compute the CDF based on the quantized tensors Field: - start_layer: the start layer to compute the CDF - end_layer: the end layer to compute the CDF

quantize()[source]#: Quantize the key and value tensors (self.fp_k and self.fp_v)

class lmcache.storage_backend.serde.cachegen_encoder.CacheGenSerializer(config: LMCacheEngineConfig, metadata: LMCacheEngineMetadata)[source]#

Bases: Serializer

make_key_bins(config: CacheGenConfig) → Tensor[source]#

make_value_bins(config: CacheGenConfig) → Tensor[source]#

to_bytes(tensor: Tensor) → bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape,: with any dtype

Returns:: the serialized bytes
Return type:: bytes

lmcache.storage_backend.serde.cachegen_encoder.collect_bytes(output_buffer, output_lengths) → Tensor[source]#: Collect a byte tensor from the output_buffer + output_lengths

lmcache.storage_backend.serde.cachegen_encoder.concat_max(max1)[source]#: Given a dict of max tensors, concatenate them into a single tensor

lmcache.storage_backend.serde.cachegen_encoder.encode_function(kv: Tensor, config: CacheGenConfig, key_bins: Tensor, value_bins: Tensor, chunk_size: int) → CacheGenGPUEncoderOutput[source]#: Given the path to the original key value cache, encode the KV cache

lmcache.storage_backend.serde.cachegen_encoder.encode_ntokens(cdf_int, encode_input, output_buffer, output_lengths) → Tensor[source]#

Encode a batch of ntokens.

Parameters:

cdf_int – int16 tensor on GPU with shape [nlayers, nchannels, Lp]
encode_input – int8 tensor on GPU with shape

:param [nlayers, ntokens, nchannels] :param output_buffer: uint8 tensor on GPU with shape

[nlayers, nchannels, BUFFER_SIZE]

Parameters:: output_lengths – int32 tensor on GPU with shape [nlayers, nchannels]
Return byte_tensor:: the byte tensor

lmcache.storage_backend.serde.cachegen_encoder.torch_quant(bins: int, qA: Tensor) → Tuple[Tensor, Tensor][source]#

Quantize a float tensor to fixed number of bins

Input:: bins: number of bins qA: the input tensor

Returns:: the quantized tensor, in float32 max1: the maximum value of the tensor
Return type:: xq

lmcache.storage_backend.serde.cachegen_encoder.torch_quant_vectorized(bins: Tensor, input_groups: Tensor) → Tuple[Tensor, Tensor][source]#

Quantize each group of a tensor to fixed number of bins

Input:: bins: number of bins for different layers, with shape [nlayer] input_groups: with shape [nlayers, ntokens, nchannels]

Returns:: [nlayers, ntokens, nchannels] maxes: [nlayers, ntokens, 1]
Return type:: quantized groups

lmcache.storage_backend.serde.fast_serde module#

class lmcache.storage_backend.serde.fast_serde.FastDeserializer(dtype)[source]#

Bases: Deserializer

from_bytes(b: bytes) → Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:: bytes: a stream of bytes
Output:: torch.Tensor: the deserialized pytorch tensor

from_bytes_normal(b: bytes) → Tensor[source]#

class lmcache.storage_backend.serde.fast_serde.FastSerializer[source]#

Bases: Serializer

to_bytes(t: Tensor) → bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape,: with any dtype

Returns:: the serialized bytes
Return type:: bytes

lmcache.storage_backend.serde.safe_serde module#

class lmcache.storage_backend.serde.safe_serde.SafeDeserializer(dtype)[source]#

Bases: Deserializer

from_bytes(b: bytearray | bytes) → Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:: bytes: a stream of bytes
Output:: torch.Tensor: the deserialized pytorch tensor

from_bytes_normal(b: bytearray | bytes) → Tensor[source]#

class lmcache.storage_backend.serde.safe_serde.SafeSerializer[source]#

Bases: Serializer

to_bytes(t: Tensor) → bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape,: with any dtype

Returns:: the serialized bytes
Return type:: bytes

lmcache.storage_backend.serde.serde module#

class lmcache.storage_backend.serde.serde.Deserializer(dtype)[source]#

abstract from_bytes(bs: bytes) → Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:: bytes: a stream of bytes
Output:: torch.Tensor: the deserialized pytorch tensor

class lmcache.storage_backend.serde.serde.DeserializerDebugWrapper(d: Deserializer)[source]#

Bases: Deserializer

from_bytes(t: bytes) → Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:: bytes: a stream of bytes
Output:: torch.Tensor: the deserialized pytorch tensor

class lmcache.storage_backend.serde.serde.Serializer[source]#

abstract to_bytes(t: Tensor) → bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape,: with any dtype

Returns:: the serialized bytes
Return type:: bytes

class lmcache.storage_backend.serde.serde.SerializerDebugWrapper(s: Serializer)[source]#

Bases: Serializer

to_bytes(t: Tensor) → bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape,: with any dtype

Returns:: the serialized bytes
Return type:: bytes

lmcache.storage_backend.serde.torch_serde module#

class lmcache.storage_backend.serde.torch_serde.TorchDeserializer(dtype)[source]#

Bases: Deserializer

from_bytes(b: bytes) → Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:: bytes: a stream of bytes
Output:: torch.Tensor: the deserialized pytorch tensor

from_bytes_normal(b: bytes) → Tensor[source]#

class lmcache.storage_backend.serde.torch_serde.TorchSerializer[source]#

Bases: Serializer

to_bytes(t: Tensor) → bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape,: with any dtype

Returns:: the serialized bytes
Return type:: bytes

lmcache.storage_backend.connector package

lmcache.storage_backend.evictor package