vllm.v1.outputs

EMPTY_MODEL_RUNNER_OUTPUT module-attribute

EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
    req_ids=[],
    req_id_to_index={},
    sampled_token_ids=[],
    logprobs=None,
    prompt_logprobs_dict={},
    pooler_output=[],
    num_nans_in_logits=None,
)
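
The constant is a shared module-level sentinel. A minimal, illustrative sketch (assuming vllm is installed): derive a modified copy with dataclasses.replace rather than mutating the sentinel in place.

from dataclasses import replace

from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT

# replace() builds a new ModelRunnerOutput; fields not overridden keep the
# sentinel's values (note that container fields such as prompt_logprobs_dict
# are then shared with the sentinel, so override them before mutating).
output = replace(
    EMPTY_MODEL_RUNNER_OUTPUT,
    req_ids=["req-0"],
    req_id_to_index={"req-0": 0},
    sampled_token_ids=[[42]],
    pooler_output=[None],
)
assert EMPTY_MODEL_RUNNER_OUTPUT.req_ids == []  # sentinel is untouched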

PoolerOutput module-attribute

PoolerOutput = Union[Tensor, list[Tensor]]
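
Since the alias admits both a bare Tensor and a list of Tensors, downstream code typically normalizes it. A small hedged helper sketch; the name as_tensor_list is illustrative, not part of vllm:

import torch

from vllm.v1.outputs import PoolerOutput


def as_tensor_list(out: PoolerOutput) -> list[torch.Tensor]:
    # A single Tensor is wrapped; a list is copied as-is.
    return [out] if isinstance(out, torch.Tensor) else list(out)


assert len(as_tensor_list(torch.zeros(4))) == 1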

AsyncModelRunnerOutput

Bases: ABC

Source code in vllm/v1/outputs.py
class AsyncModelRunnerOutput(ABC):

    @abstractmethod
    def get_output(self) -> ModelRunnerOutput:
        """Get the ModelRunnerOutput for this async output.

        This is a blocking call that waits until the results are ready, which
        might involve copying device tensors to the host.
        This method should only be called once per AsyncModelRunnerOutput.
        """
        pass

get_output abstractmethod

get_output() -> ModelRunnerOutput

Get the ModelRunnerOutput for this async output.

This is a blocking call that waits until the results are ready, which might involve copying device tensors to the host. This method should only be called once per AsyncModelRunnerOutput.

Source code in vllm/v1/outputs.py
@abstractmethod
def get_output(self) -> ModelRunnerOutput:
    """Get the ModelRunnerOutput for this async output.

    This is a blocking call that waits until the results are ready, which
    might involve copying device tensors to the host.
    This method should only be called once per AsyncModelRunnerOutput.
    """
    pass
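
A hypothetical concrete subclass, for illustration only (vllm's real async outputs differ): it wraps a concurrent.futures.Future that a worker resolves with the finished ModelRunnerOutput, making the blocking, call-once contract of get_output() concrete.

from concurrent.futures import Future

from vllm.v1.outputs import AsyncModelRunnerOutput, ModelRunnerOutput


class FutureModelRunnerOutput(AsyncModelRunnerOutput):
    """Illustrative subclass backed by a Future (not part of vllm)."""

    def __init__(self, future: "Future[ModelRunnerOutput]"):
        self._future = future

    def get_output(self) -> ModelRunnerOutput:
        # Blocks until the producer calls future.set_result(...).
        return self._future.result()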

DraftTokenIds dataclass

Source code in vllm/v1/outputs.py
@dataclass
class DraftTokenIds:

    # [num_reqs]
    req_ids: list[str]
    # num_reqs x num_draft_tokens
    draft_token_ids: list[list[int]]

draft_token_ids instance-attribute

draft_token_ids: list[list[int]]

req_ids instance-attribute

req_ids: list[str]

__init__

__init__(
    req_ids: list[str], draft_token_ids: list[list[int]]
) -> None
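
A minimal construction sketch: two requests with a different number of draft tokens each (the inner lists are ragged by design).

from vllm.v1.outputs import DraftTokenIds

drafts = DraftTokenIds(
    req_ids=["req-0", "req-1"],
    draft_token_ids=[[11, 12, 13], [21]],
)
# The two lists are parallel along the request dimension.
assert len(drafts.req_ids) == len(drafts.draft_token_ids)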

KVConnectorOutput dataclass

Source code in vllm/v1/outputs.py
@dataclass
class KVConnectorOutput:
    # [req_ids]
    finished_sending: Optional[set[str]] = None
    finished_recving: Optional[set[str]] = None
    kv_connector_stats: Optional["KVConnectorStats"] = None

    def is_empty(self):
        return (not self.finished_sending and not self.finished_recving
                and not self.kv_connector_stats)

finished_recving class-attribute instance-attribute

finished_recving: Optional[set[str]] = None

finished_sending class-attribute instance-attribute

finished_sending: Optional[set[str]] = None

kv_connector_stats class-attribute instance-attribute

kv_connector_stats: Optional[KVConnectorStats] = None

__init__

__init__(
    finished_sending: Optional[set[str]] = None,
    finished_recving: Optional[set[str]] = None,
    kv_connector_stats: Optional[KVConnectorStats] = None,
) -> None

is_empty

is_empty()
Source code in vllm/v1/outputs.py
def is_empty(self):
    return (not self.finished_sending and not self.finished_recving
            and not self.kv_connector_stats)
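
A small sketch of the emptiness check: a default-constructed KVConnectorOutput reports empty, and recording any finished transfer flips it.

from vllm.v1.outputs import KVConnectorOutput

assert KVConnectorOutput().is_empty()
assert not KVConnectorOutput(finished_sending={"req-0"}).is_empty()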

LogprobsLists

Bases: NamedTuple

Source code in vllm/v1/outputs.py
class LogprobsLists(NamedTuple):

    # [num_reqs, max_num_logprobs + 1]
    logprob_token_ids: list[list[int]]
    # [num_reqs, max_num_logprobs + 1]
    logprobs: list[list[float]]
    # [num_reqs]
    sampled_token_ranks: list[int]

    def slice(self, start: int, end: int):
        return LogprobsLists(
            self.logprob_token_ids[start:end],
            self.logprobs[start:end],
            self.sampled_token_ranks[start:end],
        )

logprob_token_ids instance-attribute

logprob_token_ids: list[list[int]]

logprobs instance-attribute

logprobs: list[list[float]]

sampled_token_ranks instance-attribute

sampled_token_ranks: list[int]

slice

slice(start: int, end: int)
Source code in vllm/v1/outputs.py
def slice(self, start: int, end: int):
    return LogprobsLists(
        self.logprob_token_ids[start:end],
        self.logprobs[start:end],
        self.sampled_token_ranks[start:end],
    )
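
A sketch of slice(): all three parallel lists are sliced along the request dimension, yielding a new LogprobsLists for the sub-batch.

from vllm.v1.outputs import LogprobsLists

lists = LogprobsLists(
    logprob_token_ids=[[5, 7], [9, 3], [2, 8]],
    logprobs=[[-0.1, -2.3], [-0.4, -1.7], [-0.2, -3.0]],
    sampled_token_ranks=[0, 0, 1],
)
# Keep only the first two requests.
assert lists.slice(0, 2).sampled_token_ranks == [0, 0]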

LogprobsTensors

Bases: NamedTuple

Source code in vllm/v1/outputs.py
class LogprobsTensors(NamedTuple):

    # [num_reqs, max_num_logprobs + 1]
    logprob_token_ids: torch.Tensor
    # [num_reqs, max_num_logprobs + 1]
    logprobs: torch.Tensor
    # [num_reqs]
    selected_token_ranks: torch.Tensor

    def tolists(self):
        return LogprobsLists(
            self.logprob_token_ids.tolist(),
            self.logprobs.tolist(),
            self.selected_token_ranks.tolist(),
        )

    @staticmethod
    def empty_cpu(num_positions: int,
                  num_tokens_per_position: int) -> "LogprobsTensors":
        """Create empty LogprobsTensors on CPU."""

        logprob_token_ids = torch.empty(
            (num_positions, num_tokens_per_position),
            dtype=torch.int32,
            device="cpu")
        logprobs = torch.empty_like(logprob_token_ids, dtype=torch.float32)
        selected_token_ranks = torch.empty(num_positions,
                                           dtype=torch.int32,
                                           device="cpu")
        return LogprobsTensors(
            logprob_token_ids=logprob_token_ids,
            logprobs=logprobs,
            selected_token_ranks=selected_token_ranks,
        )

logprob_token_ids instance-attribute

logprob_token_ids: Tensor

logprobs instance-attribute

logprobs: Tensor

selected_token_ranks instance-attribute

selected_token_ranks: Tensor

empty_cpu staticmethod

empty_cpu(
    num_positions: int, num_tokens_per_position: int
) -> LogprobsTensors

Create empty LogprobsTensors on CPU.

Source code in vllm/v1/outputs.py
@staticmethod
def empty_cpu(num_positions: int,
              num_tokens_per_position: int) -> "LogprobsTensors":
    """Create empty LogprobsTensors on CPU."""

    logprob_token_ids = torch.empty(
        (num_positions, num_tokens_per_position),
        dtype=torch.int32,
        device="cpu")
    logprobs = torch.empty_like(logprob_token_ids, dtype=torch.float32)
    selected_token_ranks = torch.empty(num_positions,
                                       dtype=torch.int32,
                                       device="cpu")
    return LogprobsTensors(
        logprob_token_ids=logprob_token_ids,
        logprobs=logprobs,
        selected_token_ranks=selected_token_ranks,
    )

tolists

tolists()
Source code in vllm/v1/outputs.py
def tolists(self):
    return LogprobsLists(
        self.logprob_token_ids.tolist(),
        self.logprobs.tolist(),
        self.selected_token_ranks.tolist(),
    )
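
A sketch tying empty_cpu() and tolists() together: allocate placeholder CPU tensors for 4 positions with 3 tokens per position, then convert them to the list-based LogprobsLists view (values are uninitialized; only the shapes are meaningful here).

from vllm.v1.outputs import LogprobsTensors

tensors = LogprobsTensors.empty_cpu(num_positions=4,
                                    num_tokens_per_position=3)
assert tensors.logprob_token_ids.shape == (4, 3)
assert tensors.selected_token_ranks.shape == (4,)

lists = tensors.tolists()
assert len(lists.logprobs) == 4 and len(lists.logprobs[0]) == 3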

ModelRunnerOutput dataclass

Source code in vllm/v1/outputs.py
@dataclass
class ModelRunnerOutput:

    # [num_reqs]
    req_ids: list[str]
    # req_id -> index
    req_id_to_index: dict[str, int]

    # num_reqs x num_generated_tokens
    # num_generated_tokens is the number of tokens
    # generated in the current step. It can be different for
    # each request due to speculative/jump decoding.
    sampled_token_ids: list[list[int]]

    # logprob_token_ids: [num_reqs, max_num_logprobs + 1]
    # logprobs: [num_reqs, max_num_logprobs + 1]
    # sampled_token_ranks: [num_reqs]
    logprobs: Optional[LogprobsLists]

    # req_id -> (token_ids, logprobs, ranks)
    # token_ids: [prompt_len, num_prompt_logprobs]
    # logprobs: [prompt_len, num_prompt_logprobs]
    # ranks: [prompt_len]
    prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]]

    # [num_reqs, hidden_size]
    pooler_output: list[Optional[torch.Tensor]]

    kv_connector_output: Optional[KVConnectorOutput] = None

    # req_id -> num_nans_in_logits
    num_nans_in_logits: Optional[dict[str, int]] = None

kv_connector_output class-attribute instance-attribute

kv_connector_output: Optional[KVConnectorOutput] = None

logprobs instance-attribute

logprobs: Optional[LogprobsLists]

num_nans_in_logits class-attribute instance-attribute

num_nans_in_logits: Optional[dict[str, int]] = None

pooler_output instance-attribute

pooler_output: list[Optional[Tensor]]

prompt_logprobs_dict instance-attribute

prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]]

req_id_to_index instance-attribute

req_id_to_index: dict[str, int]

req_ids instance-attribute

req_ids: list[str]

sampled_token_ids instance-attribute

sampled_token_ids: list[list[int]]

__init__

__init__(
    req_ids: list[str],
    req_id_to_index: dict[str, int],
    sampled_token_ids: list[list[int]],
    logprobs: Optional[LogprobsLists],
    prompt_logprobs_dict: dict[
        str, Optional[LogprobsTensors]
    ],
    pooler_output: list[Optional[Tensor]],
    kv_connector_output: Optional[KVConnectorOutput] = None,
    num_nans_in_logits: Optional[dict[str, int]] = None,
) -> None
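
A minimal, type-valid construction sketch for a single decode-only request that sampled one token, with no logprobs or pooling requested (field values here are illustrative, not taken from a real run):

from vllm.v1.outputs import ModelRunnerOutput

output = ModelRunnerOutput(
    req_ids=["req-0"],
    req_id_to_index={"req-0": 0},
    sampled_token_ids=[[42]],  # one token generated this step
    logprobs=None,             # sampling logprobs not requested
    prompt_logprobs_dict={},   # no prompt logprobs requested
    pooler_output=[None],      # generation request, no pooled output
)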

SamplerOutput dataclass

Source code in vllm/v1/outputs.py
@dataclass
class SamplerOutput:

    # [num_reqs, max_num_generated_tokens]
    # Different requests can have different numbers of generated tokens.
    # All requests are padded to max_num_generated_tokens.
    # PLACEHOLDER_TOKEN_ID (-1 by default) is used for padding.
    sampled_token_ids: torch.Tensor
    logprobs_tensors: Optional[LogprobsTensors]

logprobs_tensors instance-attribute

logprobs_tensors: Optional[LogprobsTensors]

sampled_token_ids instance-attribute

sampled_token_ids: Tensor

__init__

__init__(
    sampled_token_ids: Tensor,
    logprobs_tensors: Optional[LogprobsTensors],
) -> None
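
A sketch of the padding convention: two requests, where request 0 produced two tokens and request 1 produced one, padded with -1 (the documented PLACEHOLDER_TOKEN_ID default) up to max_num_generated_tokens.

import torch

from vllm.v1.outputs import SamplerOutput

sampler_output = SamplerOutput(
    sampled_token_ids=torch.tensor([[11, 12],
                                    [21, -1]]),  # -1 pads request 1
    logprobs_tensors=None,  # logprobs were not requested
)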