Skip to content

vllm.v1.kv_offload.spec

logger module-attribute

logger = init_logger(__name__)

OffloadingSpec

Bases: ABC

Spec for an offloading connector

Source code in vllm/v1/kv_offload/spec.py
class OffloadingSpec(ABC):
    """Abstract base class describing an offloading connector.

    The constructor stores the vLLM config, picks up the KV connector's
    extra config, and resolves the GPU and offloaded block sizes.
    Subclasses provide the manager and the handlers.
    """

    def __init__(self, vllm_config: "VllmConfig"):
        """Store the config and resolve the block sizes.

        The offloaded block size is taken from the ``"block_size"`` key of
        the KV connector extra config, falling back to the GPU block size
        when that key is absent.

        Args:
            vllm_config: The vLLM config; its ``kv_transfer_config`` must
                not be None.
        """
        logger.warning(
            "Initializing OffloadingSpec. "
            "This API is experimental and subject to change "
            "in the future as we iterate the design.")
        self.vllm_config = vllm_config

        transfer_cfg = vllm_config.kv_transfer_config
        assert transfer_cfg is not None
        self.extra_config = transfer_cfg.kv_connector_extra_config

        self.gpu_block_size = vllm_config.cache_config.block_size
        requested_block_size = self.extra_config.get("block_size",
                                                     self.gpu_block_size)
        self.offloaded_block_size = int(requested_block_size)

        # The offloaded block size must be an exact multiple of the GPU
        # block size.
        assert self.offloaded_block_size % self.gpu_block_size == 0

    @abstractmethod
    def get_manager(self) -> OffloadingManager:
        """
        Return the OffloadingManager that the scheduler-side offloading
        connector uses to track offloaded blocks and manage evictions.
        """
        ...

    @abstractmethod
    def get_handlers(
        self, kv_caches: dict[str, torch.Tensor]
    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
                        OffloadingHandler]]:
        """
        Produce the offloading handlers, each paired with its source and
        destination types.

        Args:
            kv_caches: Mapping of layer_name -> gpu_kv_cache tensor.

        Yields:
            (src_type, dst_type, offloading_handler) tuples.
        """
        ...

extra_config instance-attribute

extra_config = kv_connector_extra_config

gpu_block_size instance-attribute

gpu_block_size = block_size

offloaded_block_size instance-attribute

offloaded_block_size = int(
    get("block_size", gpu_block_size)
)

vllm_config instance-attribute

vllm_config = vllm_config

__init__

__init__(vllm_config: VllmConfig)
Source code in vllm/v1/kv_offload/spec.py
def __init__(self, vllm_config: "VllmConfig"):
    """Store the config and resolve the block sizes.

    The offloaded block size is taken from the ``"block_size"`` key of
    the KV connector extra config, falling back to the GPU block size
    when that key is absent.

    Args:
        vllm_config: The vLLM config; its ``kv_transfer_config`` must
            not be None.
    """
    logger.warning(
        "Initializing OffloadingSpec. "
        "This API is experimental and subject to change "
        "in the future as we iterate the design.")
    self.vllm_config = vllm_config

    transfer_cfg = vllm_config.kv_transfer_config
    assert transfer_cfg is not None
    self.extra_config = transfer_cfg.kv_connector_extra_config

    self.gpu_block_size = vllm_config.cache_config.block_size
    requested_block_size = self.extra_config.get("block_size",
                                                 self.gpu_block_size)
    self.offloaded_block_size = int(requested_block_size)

    # The offloaded block size must be an exact multiple of the GPU
    # block size.
    assert self.offloaded_block_size % self.gpu_block_size == 0

get_handlers abstractmethod

get_handlers(
    kv_caches: dict[str, Tensor],
) -> Iterator[
    tuple[
        type[LoadStoreSpec],
        type[LoadStoreSpec],
        OffloadingHandler,
    ]
]

Get offloading handlers along with their respective src and dst types.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `kv_caches` | `dict[str, Tensor]` | A dictionary of layer_name -> gpu_kv_cache tensor. | required |

Yields:

| Type | Description |
| --- | --- |
| `tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]` | Tuples of (src_type, dst_type, offloading_handler). |

Source code in vllm/v1/kv_offload/spec.py
@abstractmethod
def get_handlers(
    self, kv_caches: dict[str, torch.Tensor]
) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
                    OffloadingHandler]]:
    """
    Produce the offloading handlers, each paired with its source and
    destination types.

    Args:
        kv_caches: Mapping of layer_name -> gpu_kv_cache tensor.

    Yields:
        (src_type, dst_type, offloading_handler) tuples.
    """
    ...

get_manager abstractmethod

get_manager() -> OffloadingManager

Get an OffloadingManager that will be used by the scheduler-side offloading connector to track offloaded blocks and manage evictions.

Source code in vllm/v1/kv_offload/spec.py
@abstractmethod
def get_manager(self) -> OffloadingManager:
    """
    Return the OffloadingManager that the scheduler-side offloading
    connector uses to track offloaded blocks and manage evictions.
    """
    ...