lmcache.storage_backend.serde package#

Submodules#

lmcache.storage_backend.serde.cachegen_basics module#

class lmcache.storage_backend.serde.cachegen_basics.CacheGenConfig(nlayers: int, kspecs: List[lmcache.storage_backend.serde.cachegen_basics.QuantizationSpec], vspecs: List[lmcache.storage_backend.serde.cachegen_basics.QuantizationSpec])[source]#
static from_model_name(model_name: str) CacheGenConfig[source]#
kspecs: List[QuantizationSpec]#
nlayers: int#
vspecs: List[QuantizationSpec]#
class lmcache.storage_backend.serde.cachegen_basics.CacheGenEncoderOutput(bytestream: bytes, start_indices: torch.Tensor, cdf: torch.Tensor, max_tensors_key: torch.Tensor, max_tensors_value: torch.Tensor, num_heads: int, head_size: int)[source]#
bytestream: bytes#
cdf: Tensor#
static from_bytes(bs: bytes) CacheGenEncoderOutput[source]#
head_size: int#
max_tensors_key: Tensor#
max_tensors_value: Tensor#
num_heads: int#
start_indices: Tensor#
to_bytes() bytes[source]#

Serialize the output to bytes (for example, so that it can be saved to a file)

class lmcache.storage_backend.serde.cachegen_basics.CacheGenGPUBytestream(bytestream: torch.Tensor, bytestream_lengths: torch.Tensor, ntokens: int)[source]#
bytestream: Tensor#
bytestream_lengths: Tensor#
ntokens: int#
class lmcache.storage_backend.serde.cachegen_basics.CacheGenGPUEncoderOutput(data_chunks: List[lmcache.storage_backend.serde.cachegen_basics.CacheGenGPUBytestream], cdf: torch.Tensor, max_tensors_key: torch.Tensor, max_tensors_value: torch.Tensor, num_heads: int, head_size: int)[source]#
cdf: Tensor#
data_chunks: List[CacheGenGPUBytestream]#
debug_print_device()[source]#
static from_bytes(bs: bytes) CacheGenGPUEncoderOutput[source]#
head_size: int#
max_tensors_key: Tensor#
max_tensors_value: Tensor#
num_heads: int#
to_bytes() bytes[source]#

Serialize the output to bytes (for example, so that it can be saved to a file)

class lmcache.storage_backend.serde.cachegen_basics.QuantizationSpec(start_layer: int, end_layer: int, bins: int)[source]#
bins: int#
end_layer: int#
start_layer: int#

lmcache.storage_backend.serde.cachegen_decoder module#

class lmcache.storage_backend.serde.cachegen_decoder.CacheGenDeserializer(config: LMCacheEngineConfig, metadata: LMCacheEngineMetadata, dtype)[source]#

Bases: Deserializer

from_bytes(bs: bytes) Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:

bytes: a stream of bytes

Output:

torch.Tensor: the deserialized pytorch tensor

get_output_buffer(nlayers: int, nchannels: int, ntokens: int)[source]#
make_key_bins(config: CacheGenConfig) Tensor[source]#
make_value_bins(config: CacheGenConfig) Tensor[source]#
lmcache.storage_backend.serde.cachegen_decoder.decode_chunk(cdf: Tensor, data_chunk: CacheGenGPUBytestream, target_buffer: Tensor) None[source]#

Write the decoded output into target_buffer. Expected shape: [nlayers (kv in total), ntokens, nchannels]

lmcache.storage_backend.serde.cachegen_decoder.decode_function_gpu(cdf: Tensor, data_chunks: List[CacheGenGPUBytestream], layers_in_key: int, chunk_size: int, output: Tensor)[source]#

Given the encoded KV bytestream chunks, decode the KV cache

Inputs:

cdf: the cdf tensor, in shape [2 * nlayers, nchannels, bins + 1]
data_chunks: the data_chunks in the encoder’s output
layers_in_key: number of layers in K (or V) (K/V should have the same number of layers)
chunk_size: the chunk_size
output: output buffer, in shape [ntokens, 2 * nlayers * nchannels]

Outputs:

key: the decoded key tensor in the shape of (layers, tokens, nchannels)
value: the decoded value tensor in the shape of (layers, tokens, nchannels)

lmcache.storage_backend.serde.cachegen_decoder.do_dequantize(t: Tensor, bins: Tensor, maxtensors: Tensor)[source]#

t: [nlayers, ntokens, nchannels]
bins: [nlayers]
maxtensors: [nlayers, ntokens, 1]

lmcache.storage_backend.serde.cachegen_decoder.quant(bins: int, xq: Tensor, max1: float)[source]#
lmcache.storage_backend.serde.cachegen_decoder.recombine_bytes(bytes_tensor, output_lengths) Tensor[source]#

lmcache.storage_backend.serde.cachegen_encoder module#

class lmcache.storage_backend.serde.cachegen_encoder.CacheGenEncoderImpl(**kwargs)[source]#
compute_cdf(is_key)[source]#

Compute the CDF based on the quantized tensors.

Fields:

- start_layer: the start layer to compute the CDF
- end_layer: the end layer to compute the CDF

quantize()[source]#

Quantize the key and value tensors (self.fp_k and self.fp_v)

class lmcache.storage_backend.serde.cachegen_encoder.CacheGenSerializer(config: LMCacheEngineConfig, metadata: LMCacheEngineMetadata)[source]#

Bases: Serializer

make_key_bins(config: CacheGenConfig) Tensor[source]#
make_value_bins(config: CacheGenConfig) Tensor[source]#
to_bytes(tensor: Tensor) bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

tensor: the input pytorch tensor, can be on any device, in any shape, with any dtype

Returns:

the serialized bytes

Return type:

bytes

lmcache.storage_backend.serde.cachegen_encoder.collect_bytes(output_buffer, output_lengths) Tensor[source]#

Collect a byte tensor from the output_buffer + output_lengths

lmcache.storage_backend.serde.cachegen_encoder.concat_max(max1)[source]#

Given a dict of max tensors, concatenate them into a single tensor

lmcache.storage_backend.serde.cachegen_encoder.encode_function(kv: Tensor, config: CacheGenConfig, key_bins: Tensor, value_bins: Tensor, chunk_size: int) CacheGenGPUEncoderOutput[source]#

Given the original key value cache tensor, encode the KV cache

lmcache.storage_backend.serde.cachegen_encoder.encode_ntokens(cdf_int, encode_input, output_buffer, output_lengths) Tensor[source]#

Encode a batch of ntokens.

Parameters:
  • cdf_int – int16 tensor on GPU with shape [nlayers, nchannels, Lp]

  • encode_input – int8 tensor on GPU with shape [nlayers, ntokens, nchannels]

  • output_buffer – uint8 tensor on GPU with shape [nlayers, nchannels, BUFFER_SIZE]

  • output_lengths – int32 tensor on GPU with shape [nlayers, nchannels]

Return byte_tensor:

the byte tensor

lmcache.storage_backend.serde.cachegen_encoder.torch_quant(bins: int, qA: Tensor) Tuple[Tensor, Tensor][source]#

Quantize a float tensor to fixed number of bins

Input:

bins: number of bins
qA: the input tensor

Returns:

xq: the quantized tensor, in float32
max1: the maximum value of the tensor

Return type:

Tuple[Tensor, Tensor]

lmcache.storage_backend.serde.cachegen_encoder.torch_quant_vectorized(bins: Tensor, input_groups: Tensor) Tuple[Tensor, Tensor][source]#

Quantize each group of a tensor to fixed number of bins

Input:

bins: number of bins for different layers, with shape [nlayer]
input_groups: with shape [nlayers, ntokens, nchannels]

Returns:

quantized groups: [nlayers, ntokens, nchannels]
maxes: [nlayers, ntokens, 1]

Return type:

Tuple[Tensor, Tensor]

lmcache.storage_backend.serde.fast_serde module#

class lmcache.storage_backend.serde.fast_serde.FastDeserializer(dtype)[source]#

Bases: Deserializer

from_bytes(b: bytes) Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:

bytes: a stream of bytes

Output:

torch.Tensor: the deserialized pytorch tensor

from_bytes_normal(b: bytes) Tensor[source]#
class lmcache.storage_backend.serde.fast_serde.FastSerializer[source]#

Bases: Serializer

to_bytes(t: Tensor) bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape, with any dtype

Returns:

the serialized bytes

Return type:

bytes

lmcache.storage_backend.serde.safe_serde module#

class lmcache.storage_backend.serde.safe_serde.SafeDeserializer(dtype)[source]#

Bases: Deserializer

from_bytes(b: bytearray | bytes) Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:

bytes: a stream of bytes

Output:

torch.Tensor: the deserialized pytorch tensor

from_bytes_normal(b: bytearray | bytes) Tensor[source]#
class lmcache.storage_backend.serde.safe_serde.SafeSerializer[source]#

Bases: Serializer

to_bytes(t: Tensor) bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape, with any dtype

Returns:

the serialized bytes

Return type:

bytes

lmcache.storage_backend.serde.serde module#

class lmcache.storage_backend.serde.serde.Deserializer(dtype)[source]#
abstract from_bytes(bs: bytes) Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:

bytes: a stream of bytes

Output:

torch.Tensor: the deserialized pytorch tensor

class lmcache.storage_backend.serde.serde.DeserializerDebugWrapper(d: Deserializer)[source]#

Bases: Deserializer

from_bytes(t: bytes) Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:

bytes: a stream of bytes

Output:

torch.Tensor: the deserialized pytorch tensor

class lmcache.storage_backend.serde.serde.Serializer[source]#
abstract to_bytes(t: Tensor) bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape, with any dtype

Returns:

the serialized bytes

Return type:

bytes

class lmcache.storage_backend.serde.serde.SerializerDebugWrapper(s: Serializer)[source]#

Bases: Serializer

to_bytes(t: Tensor) bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape, with any dtype

Returns:

the serialized bytes

Return type:

bytes

lmcache.storage_backend.serde.torch_serde module#

class lmcache.storage_backend.serde.torch_serde.TorchDeserializer(dtype)[source]#

Bases: Deserializer

from_bytes(b: bytes) Tensor[source]#

Deserialize a pytorch tensor from bytes.

Input:

bytes: a stream of bytes

Output:

torch.Tensor: the deserialized pytorch tensor

from_bytes_normal(b: bytes) Tensor[source]#
class lmcache.storage_backend.serde.torch_serde.TorchSerializer[source]#

Bases: Serializer

to_bytes(t: Tensor) bytes[source]#

Serialize a pytorch tensor to bytes. The serialized bytes should contain both the data and the metadata (shape, dtype, etc.) of the tensor.

Input:

t: the input pytorch tensor, can be on any device, in any shape, with any dtype

Returns:

the serialized bytes

Return type:

bytes