vllm

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Modules:

Name Description
assets
attention
beam_search
benchmarks
collect_env
compilation
config
connections
device_allocator
distributed
engine
entrypoints
env_override
envs
executor
forward_context
inputs
logger

Logging configuration for vLLM.

logging_utils
logits_process
logprobs
lora
model_executor
multimodal
outputs
platforms
plugins
pooling_params
profiler
ray
reasoning
sampling_params

Sampling parameters for text generation.

scalar_type
scripts
sequence

Sequence and its related classes.

tasks
test_utils
third_party
tracing
transformers_utils
triton_utils
usage
utils
v1
version

AsyncLLMEngine module-attribute

AsyncLLMEngine = AsyncLLM

LLMEngine module-attribute

LLMEngine = LLMEngine

MODULE_ATTRS module-attribute

MODULE_ATTRS = {
    "bc_linter_skip": "._bc_linter:bc_linter_skip",
    "bc_linter_include": "._bc_linter:bc_linter_include",
    "AsyncEngineArgs": ".engine.arg_utils:AsyncEngineArgs",
    "EngineArgs": ".engine.arg_utils:EngineArgs",
    "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine",
    "LLMEngine": ".engine.llm_engine:LLMEngine",
    "LLM": ".entrypoints.llm:LLM",
    "initialize_ray_cluster": ".executor.ray_utils:initialize_ray_cluster",
    "PromptType": ".inputs:PromptType",
    "TextPrompt": ".inputs:TextPrompt",
    "TokensPrompt": ".inputs:TokensPrompt",
    "ModelRegistry": ".model_executor.models:ModelRegistry",
    "SamplingParams": ".sampling_params:SamplingParams",
    "PoolingParams": ".pooling_params:PoolingParams",
    "ClassificationOutput": ".outputs:ClassificationOutput",
    "ClassificationRequestOutput": ".outputs:ClassificationRequestOutput",
    "CompletionOutput": ".outputs:CompletionOutput",
    "EmbeddingOutput": ".outputs:EmbeddingOutput",
    "EmbeddingRequestOutput": ".outputs:EmbeddingRequestOutput",
    "PoolingOutput": ".outputs:PoolingOutput",
    "PoolingRequestOutput": ".outputs:PoolingRequestOutput",
    "RequestOutput": ".outputs:RequestOutput",
    "ScoringOutput": ".outputs:ScoringOutput",
    "ScoringRequestOutput": ".outputs:ScoringRequestOutput",
}
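
MODULE_ATTRS maps the package's public names to the submodules that define them, so the top-level attributes above can be resolved lazily on first access and `import vllm` stays cheap. A minimal usage sketch of the lazily exported names (the model name is only illustrative):

from vllm import LLM, SamplingParams

# "facebook/opt-125m" is only an illustrative model name.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

for output in llm.generate(["Hello, my name is"], params):
    # Each RequestOutput holds one CompletionOutput per sampled sequence.
    print(output.outputs[0].text)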

ModelRegistry module-attribute

ModelRegistry = _ModelRegistry(
    {
        model_arch: _LazyRegisteredModel(
            module_name=f"vllm.model_executor.models.{mod_relname}",
            class_name=cls_name,
        )
        for model_arch, (mod_relname, cls_name) in _VLLM_MODELS.items()
    }
)
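
The registry maps Hugging Face architecture names to lazily loaded implementation classes. A hedged sketch of how it is typically queried or extended out of tree (the custom class and module path are hypothetical):

from vllm import ModelRegistry

# List the architectures vLLM can load.
print(ModelRegistry.get_supported_archs())

# Register an out-of-tree implementation; "MyLlamaForCausalLM" and the
# module path below are hypothetical placeholders.
ModelRegistry.register_model(
    "MyLlamaForCausalLM", "my_package.modeling:MyLlamaForCausalLM")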

PromptType module-attribute

Set of possible schemas for an LLM input, covering both decoder-only and encoder/decoder input types.
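
TextPrompt and TokensPrompt are TypedDicts, so prompts are plain dictionaries; a sketch of both forms (the token IDs are illustrative):

from vllm import TextPrompt, TokensPrompt

# Raw text goes under "prompt".
text_prompt = TextPrompt(prompt="The capital of France is")

# Pre-tokenized input goes under "prompt_token_ids" (IDs are illustrative).
tokens_prompt = TokensPrompt(prompt_token_ids=[1, 450, 7483, 310])

# Either form, or a plain str, is a valid PromptType for LLM.generate().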

__all__ module-attribute

__all__ = [
    "__version__",
    "bc_linter_skip",
    "bc_linter_include",
    "__version_tuple__",
    "LLM",
    "ModelRegistry",
    "PromptType",
    "TextPrompt",
    "TokensPrompt",
    "SamplingParams",
    "RequestOutput",
    "CompletionOutput",
    "PoolingOutput",
    "PoolingRequestOutput",
    "EmbeddingOutput",
    "EmbeddingRequestOutput",
    "ClassificationOutput",
    "ClassificationRequestOutput",
    "ScoringOutput",
    "ScoringRequestOutput",
    "LLMEngine",
    "EngineArgs",
    "AsyncLLMEngine",
    "AsyncEngineArgs",
    "initialize_ray_cluster",
    "PoolingParams",
]

AsyncEngineArgs dataclass

Bases: EngineArgs

Arguments for asynchronous vLLM engine.

Source code in vllm/engine/arg_utils.py
@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    enable_log_requests: bool = False

    @property
    @deprecated(
        "`disable_log_requests` is deprecated and has been replaced with "
        "`enable_log_requests`. This will be removed in v0.12.0. Please use "
        "`enable_log_requests` instead.")
    def disable_log_requests(self) -> bool:
        return not self.enable_log_requests

    @disable_log_requests.setter
    @deprecated(
        "`disable_log_requests` is deprecated and has been replaced with "
        "`enable_log_requests`. This will be removed in v0.12.0. Please use "
        "`enable_log_requests` instead.")
    def disable_log_requests(self, value: bool):
        self.enable_log_requests = not value

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser,
                     async_args_only: bool = False) -> FlexibleArgumentParser:
        # Initialize plugins to update the parser. For example, a plugin may
        # add a new kind of quantization method to the --quantization argument
        # or a new device to the --device argument.
        load_general_plugins()
        if not async_args_only:
            parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--enable-log-requests',
                            action=argparse.BooleanOptionalAction,
                            default=AsyncEngineArgs.enable_log_requests,
                            help='Enable logging requests.')
        parser.add_argument('--disable-log-requests',
                            action=argparse.BooleanOptionalAction,
                            default=not AsyncEngineArgs.enable_log_requests,
                            help='[DEPRECATED] Disable logging requests.',
                            deprecated=True)
        current_platform.pre_register_and_update(parser)
        return parser
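
Since disable_log_requests is deprecated in favor of enable_log_requests, new code should set the latter. A hedged construction sketch (the model name is illustrative):

from vllm import AsyncEngineArgs, AsyncLLMEngine

# Prefer enable_log_requests; the disable_log_requests property only exists
# for backwards compatibility and is slated for removal in v0.12.0.
engine_args = AsyncEngineArgs(model="facebook/opt-125m",
                              enable_log_requests=True)
engine = AsyncLLMEngine.from_engine_args(engine_args)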

disable_log_requests property writable

disable_log_requests: bool

enable_log_requests class-attribute instance-attribute

enable_log_requests: bool = False

__init__

__init__(
    model: str = model,
    served_model_name: Optional[
        Union[str, List[str]]
    ] = served_model_name,
    tokenizer: Optional[str] = tokenizer,
    hf_config_path: Optional[str] = hf_config_path,
    runner: RunnerOption = runner,
    convert: ConvertOption = convert,
    task: Optional[TaskOption] = task,
    skip_tokenizer_init: bool = skip_tokenizer_init,
    enable_prompt_embeds: bool = enable_prompt_embeds,
    tokenizer_mode: TokenizerMode = tokenizer_mode,
    trust_remote_code: bool = trust_remote_code,
    allowed_local_media_path: str = allowed_local_media_path,
    allowed_media_domains: Optional[
        list[str]
    ] = allowed_media_domains,
    download_dir: Optional[str] = download_dir,
    safetensors_load_strategy: str = safetensors_load_strategy,
    load_format: Union[str, LoadFormats] = load_format,
    config_format: str = config_format,
    dtype: ModelDType = dtype,
    kv_cache_dtype: CacheDType = cache_dtype,
    seed: Optional[int] = seed,
    max_model_len: Optional[int] = max_model_len,
    cuda_graph_sizes: list[int] = get_field(
        SchedulerConfig, "cuda_graph_sizes"
    ),
    distributed_executor_backend: Optional[
        Union[
            str,
            DistributedExecutorBackend,
            Type[ExecutorBase],
        ]
    ] = distributed_executor_backend,
    pipeline_parallel_size: int = pipeline_parallel_size,
    tensor_parallel_size: int = tensor_parallel_size,
    decode_context_parallel_size: int = decode_context_parallel_size,
    data_parallel_size: int = data_parallel_size,
    data_parallel_rank: Optional[int] = None,
    data_parallel_start_rank: Optional[int] = None,
    data_parallel_size_local: Optional[int] = None,
    data_parallel_address: Optional[str] = None,
    data_parallel_rpc_port: Optional[int] = None,
    data_parallel_hybrid_lb: bool = False,
    data_parallel_backend: str = data_parallel_backend,
    enable_expert_parallel: bool = enable_expert_parallel,
    enable_dbo: bool = enable_dbo,
    dbo_decode_token_threshold: int = dbo_decode_token_threshold,
    dbo_prefill_token_threshold: int = dbo_prefill_token_threshold,
    eplb_config: EPLBConfig = get_field(
        ParallelConfig, "eplb_config"
    ),
    enable_eplb: bool = enable_eplb,
    expert_placement_strategy: ExpertPlacementStrategy = expert_placement_strategy,
    _api_process_count: int = _api_process_count,
    _api_process_rank: int = _api_process_rank,
    num_redundant_experts: int = num_redundant_experts,
    eplb_window_size: int = window_size,
    eplb_step_interval: int = step_interval,
    eplb_log_balancedness: bool = log_balancedness,
    max_parallel_loading_workers: Optional[
        int
    ] = max_parallel_loading_workers,
    block_size: Optional[BlockSize] = block_size,
    enable_prefix_caching: Optional[
        bool
    ] = enable_prefix_caching,
    prefix_caching_hash_algo: PrefixCachingHashAlgo = prefix_caching_hash_algo,
    disable_sliding_window: bool = disable_sliding_window,
    disable_cascade_attn: bool = disable_cascade_attn,
    swap_space: float = swap_space,
    cpu_offload_gb: float = cpu_offload_gb,
    gpu_memory_utilization: float = gpu_memory_utilization,
    kv_cache_memory_bytes: Optional[
        int
    ] = kv_cache_memory_bytes,
    max_num_batched_tokens: Optional[
        int
    ] = max_num_batched_tokens,
    max_num_partial_prefills: int = max_num_partial_prefills,
    max_long_partial_prefills: int = max_long_partial_prefills,
    long_prefill_token_threshold: int = long_prefill_token_threshold,
    max_num_seqs: Optional[int] = max_num_seqs,
    max_logprobs: int = max_logprobs,
    logprobs_mode: LogprobsMode = logprobs_mode,
    disable_log_stats: bool = False,
    revision: Optional[str] = revision,
    code_revision: Optional[str] = code_revision,
    rope_scaling: dict[str, Any] = get_field(
        ModelConfig, "rope_scaling"
    ),
    rope_theta: Optional[float] = rope_theta,
    hf_token: Optional[Union[bool, str]] = hf_token,
    hf_overrides: HfOverrides = get_field(
        ModelConfig, "hf_overrides"
    ),
    tokenizer_revision: Optional[str] = tokenizer_revision,
    quantization: Optional[
        QuantizationMethods
    ] = quantization,
    enforce_eager: bool = enforce_eager,
    disable_custom_all_reduce: bool = disable_custom_all_reduce,
    limit_mm_per_prompt: dict[str, int] = get_field(
        MultiModalConfig, "limit_per_prompt"
    ),
    interleave_mm_strings: bool = interleave_mm_strings,
    media_io_kwargs: dict[str, dict[str, Any]] = get_field(
        MultiModalConfig, "media_io_kwargs"
    ),
    mm_processor_kwargs: Optional[
        Dict[str, Any]
    ] = mm_processor_kwargs,
    disable_mm_preprocessor_cache: bool = False,
    mm_processor_cache_gb: float = mm_processor_cache_gb,
    mm_processor_cache_type: Optional[
        MMCacheType
    ] = mm_processor_cache_type,
    mm_shm_cache_max_object_size_mb: int = mm_shm_cache_max_object_size_mb,
    mm_encoder_tp_mode: MMEncoderTPMode = mm_encoder_tp_mode,
    io_processor_plugin: Optional[str] = None,
    skip_mm_profiling: bool = skip_mm_profiling,
    video_pruning_rate: float = video_pruning_rate,
    enable_lora: bool = False,
    enable_lora_bias: bool = bias_enabled,
    max_loras: int = max_loras,
    max_lora_rank: int = max_lora_rank,
    default_mm_loras: Optional[
        Dict[str, str]
    ] = default_mm_loras,
    fully_sharded_loras: bool = fully_sharded_loras,
    max_cpu_loras: Optional[int] = max_cpu_loras,
    lora_dtype: Optional[Union[str, dtype]] = lora_dtype,
    lora_extra_vocab_size: int = lora_extra_vocab_size,
    ray_workers_use_nsight: bool = ray_workers_use_nsight,
    num_gpu_blocks_override: Optional[
        int
    ] = num_gpu_blocks_override,
    num_lookahead_slots: int = num_lookahead_slots,
    model_loader_extra_config: dict = get_field(
        LoadConfig, "model_loader_extra_config"
    ),
    ignore_patterns: Optional[
        Union[str, List[str]]
    ] = ignore_patterns,
    enable_chunked_prefill: Optional[
        bool
    ] = enable_chunked_prefill,
    disable_chunked_mm_input: bool = disable_chunked_mm_input,
    disable_hybrid_kv_cache_manager: bool = disable_hybrid_kv_cache_manager,
    structured_outputs_config: StructuredOutputsConfig = get_field(
        VllmConfig, "structured_outputs_config"
    ),
    reasoning_parser: str = reasoning_parser,
    guided_decoding_backend: Optional[str] = None,
    guided_decoding_disable_fallback: Optional[bool] = None,
    guided_decoding_disable_any_whitespace: Optional[
        bool
    ] = None,
    guided_decoding_disable_additional_properties: Optional[
        bool
    ] = None,
    logits_processor_pattern: Optional[
        str
    ] = logits_processor_pattern,
    speculative_config: Optional[Dict[str, Any]] = None,
    show_hidden_metrics_for_version: Optional[
        str
    ] = show_hidden_metrics_for_version,
    otlp_traces_endpoint: Optional[
        str
    ] = otlp_traces_endpoint,
    collect_detailed_traces: Optional[
        list[DetailedTraceModules]
    ] = collect_detailed_traces,
    scheduling_policy: SchedulerPolicy = policy,
    scheduler_cls: Union[str, Type[object]] = scheduler_cls,
    pooler_config: Optional[PoolerConfig] = pooler_config,
    override_pooler_config: Optional[
        Union[dict, PoolerConfig]
    ] = override_pooler_config,
    compilation_config: CompilationConfig = get_field(
        VllmConfig, "compilation_config"
    ),
    worker_cls: str = worker_cls,
    worker_extension_cls: str = worker_extension_cls,
    kv_transfer_config: Optional[KVTransferConfig] = None,
    kv_events_config: Optional[KVEventsConfig] = None,
    generation_config: str = generation_config,
    enable_sleep_mode: bool = enable_sleep_mode,
    override_generation_config: dict[str, Any] = get_field(
        ModelConfig, "override_generation_config"
    ),
    model_impl: str = model_impl,
    override_attention_dtype: str = override_attention_dtype,
    calculate_kv_scales: bool = calculate_kv_scales,
    mamba_cache_dtype: MambaDType = mamba_cache_dtype,
    mamba_ssm_cache_dtype: MambaDType = mamba_ssm_cache_dtype,
    additional_config: dict[str, Any] = get_field(
        VllmConfig, "additional_config"
    ),
    use_tqdm_on_load: bool = use_tqdm_on_load,
    pt_load_map_location: str = pt_load_map_location,
    enable_multimodal_encoder_data_parallel: bool = False,
    logits_processors: Optional[
        list[Union[str, type[LogitsProcessor]]]
    ] = logits_processors,
    async_scheduling: bool = async_scheduling,
    kv_sharing_fast_prefill: bool = kv_sharing_fast_prefill,
    enable_log_requests: bool = False,
) -> None

add_cli_args staticmethod

add_cli_args(
    parser: FlexibleArgumentParser,
    async_args_only: bool = False,
) -> FlexibleArgumentParser
Source code in vllm/engine/arg_utils.py
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser,
                 async_args_only: bool = False) -> FlexibleArgumentParser:
    # Initialize plugins to update the parser. For example, a plugin may
    # add a new kind of quantization method to the --quantization argument
    # or a new device to the --device argument.
    load_general_plugins()
    if not async_args_only:
        parser = EngineArgs.add_cli_args(parser)
    parser.add_argument('--enable-log-requests',
                        action=argparse.BooleanOptionalAction,
                        default=AsyncEngineArgs.enable_log_requests,
                        help='Enable logging requests.')
    parser.add_argument('--disable-log-requests',
                        action=argparse.BooleanOptionalAction,
                        default=not AsyncEngineArgs.enable_log_requests,
                        help='[DEPRECATED] Disable logging requests.',
                        deprecated=True)
    current_platform.pre_register_and_update(parser)
    return parser
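
A sketch of driving this from a standalone script: build a parser, let add_cli_args attach the flags, then recover the dataclass with from_cli_args (inherited from EngineArgs). The FlexibleArgumentParser import path is an assumption and may differ between versions:

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser  # assumed import path

parser = FlexibleArgumentParser(description="vLLM async engine demo")
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args(["--model", "facebook/opt-125m",
                          "--enable-log-requests"])
engine_args = AsyncEngineArgs.from_cli_args(args)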

ClassificationOutput dataclass

The output data of one classification output of a request.

Parameters:

Name Type Description Default
probs list[float]

The probability vector, which is a list of floats. Its length depends on the number of classes.

required
Source code in vllm/outputs.py
@dataclass
class ClassificationOutput:
    """The output data of one classification output of a request.

    Args:
        probs: The probability vector, which is a list of floats.
            Its length depends on the number of classes.
    """
    probs: list[float]

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        # pooling_output shape: (num_classes)
        pooled_data = pooling_output.data
        if pooled_data.ndim != 1:
            raise ValueError("pooled_data should be a 1-D probability vector")

        return ClassificationOutput(pooled_data.tolist())

    @property
    def num_classes(self) -> int:
        return len(self.probs)

    def __repr__(self) -> str:
        return f"ClassificationOutput(num_classes={self.num_classes})"

num_classes property

num_classes: int

probs instance-attribute

probs: list[float]

__init__

__init__(probs: list[float]) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return f"ClassificationOutput(num_classes={self.num_classes})"

from_base staticmethod

from_base(pooling_output: PoolingOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(pooling_output: PoolingOutput):
    # pooling_output shape: (num_classes)
    pooled_data = pooling_output.data
    if pooled_data.ndim != 1:
        raise ValueError("pooled_data should be a 1-D probability vector")

    return ClassificationOutput(pooled_data.tolist())

ClassificationRequestOutput

Bases: PoolingRequestOutput[ClassificationOutput]

Source code in vllm/outputs.py
class ClassificationRequestOutput(PoolingRequestOutput[ClassificationOutput]):

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        return ClassificationRequestOutput(
            request_id=request_output.request_id,
            outputs=ClassificationOutput.from_base(request_output.outputs),
            prompt_token_ids=request_output.prompt_token_ids,
            finished=request_output.finished,
        )

from_base staticmethod

from_base(request_output: PoolingRequestOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(request_output: PoolingRequestOutput):
    return ClassificationRequestOutput(
        request_id=request_output.request_id,
        outputs=ClassificationOutput.from_base(request_output.outputs),
        prompt_token_ids=request_output.prompt_token_ids,
        finished=request_output.finished,
    )
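
These outputs are normally produced by LLM.classify() with a sequence-classification model; a hedged sketch (the model name is illustrative):

from vllm import LLM

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach")  # illustrative classifier
(result,) = llm.classify(["vLLM makes serving LLMs easy."])

output = result.outputs          # a ClassificationOutput
print(output.num_classes, output.probs)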

CompletionOutput dataclass

The output data of one completion output of a request.

Parameters:

Name Type Description Default
index int

The index of the output in the request.

required
text str

The generated output text.

required
token_ids Sequence[int]

The token IDs of the generated output text.

required
cumulative_logprob Optional[float]

The cumulative log probability of the generated output text.

required
logprobs Optional[SampleLogprobs]

The log probabilities of the top probability words at each position if the logprobs are requested.

required
finish_reason Optional[str]

The reason why the sequence is finished.

None
stop_reason Union[int, str, None]

The stop string or token id that caused the completion to stop, None if the completion finished for some other reason including encountering the EOS token.

None
lora_request Optional[LoRARequest]

The LoRA request that was used to generate the output.

None
Source code in vllm/outputs.py
@dataclass
class CompletionOutput:
    """The output data of one completion output of a request.

    Args:
        index: The index of the output in the request.
        text: The generated output text.
        token_ids: The token IDs of the generated output text.
        cumulative_logprob: The cumulative log probability of the generated
            output text.
        logprobs: The log probabilities of the top probability words at each
            position if the logprobs are requested.
        finish_reason: The reason why the sequence is finished.
        stop_reason: The stop string or token id that caused the completion
            to stop, None if the completion finished for some other reason
            including encountering the EOS token.
        lora_request: The LoRA request that was used to generate the output.
    """

    index: int
    text: str
    token_ids: GenericSequence[int]
    cumulative_logprob: Optional[float]
    logprobs: Optional[SampleLogprobs]
    finish_reason: Optional[str] = None
    stop_reason: Union[int, str, None] = None
    lora_request: Optional[LoRARequest] = None

    def finished(self) -> bool:
        return self.finish_reason is not None

    def __repr__(self) -> str:
        return (f"CompletionOutput(index={self.index}, "
                f"text={self.text!r}, "
                f"token_ids={self.token_ids}, "
                f"cumulative_logprob={self.cumulative_logprob}, "
                f"logprobs={self.logprobs}, "
                f"finish_reason={self.finish_reason}, "
                f"stop_reason={self.stop_reason})")

cumulative_logprob instance-attribute

cumulative_logprob: Optional[float]

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

index instance-attribute

index: int

logprobs instance-attribute

lora_request class-attribute instance-attribute

lora_request: Optional[LoRARequest] = None

stop_reason class-attribute instance-attribute

stop_reason: Union[int, str, None] = None

text instance-attribute

text: str

token_ids instance-attribute

token_ids: Sequence[int]

__init__

__init__(
    index: int,
    text: str,
    token_ids: Sequence[int],
    cumulative_logprob: Optional[float],
    logprobs: Optional[SampleLogprobs],
    finish_reason: Optional[str] = None,
    stop_reason: Union[int, str, None] = None,
    lora_request: Optional[LoRARequest] = None,
) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return (f"CompletionOutput(index={self.index}, "
            f"text={self.text!r}, "
            f"token_ids={self.token_ids}, "
            f"cumulative_logprob={self.cumulative_logprob}, "
            f"logprobs={self.logprobs}, "
            f"finish_reason={self.finish_reason}, "
            f"stop_reason={self.stop_reason})")

finished

finished() -> bool
Source code in vllm/outputs.py
def finished(self) -> bool:
    return self.finish_reason is not None
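
Each RequestOutput returned by LLM.generate() carries one CompletionOutput per sampled sequence; a short sketch of reading its fields (the model name is illustrative):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # illustrative model
(request_output,) = llm.generate(["The capital of France is"],
                                 SamplingParams(max_tokens=16))

completion = request_output.outputs[0]   # a CompletionOutput
print(completion.text)
print(completion.token_ids)
print(completion.finish_reason, completion.stop_reason)
print(completion.finished())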

EmbeddingOutput dataclass

The output data of one embedding output of a request.

Parameters:

Name Type Description Default
embedding list[float]

The embedding vector, which is a list of floats. Its length depends on the hidden dimension of the model.

required
Source code in vllm/outputs.py
@dataclass
class EmbeddingOutput:
    """The output data of one embedding output of a request.

    Args:
        embedding: The embedding vector, which is a list of floats.
            Its length depends on the hidden dimension of the model.
    """
    embedding: list[float]

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        pooled_data = pooling_output.data
        if pooled_data.ndim != 1:
            raise ValueError("pooled_data should be a 1-D embedding vector")

        return EmbeddingOutput(pooled_data.tolist())

    @property
    def hidden_size(self) -> int:
        return len(self.embedding)

    def __repr__(self) -> str:
        return f"EmbeddingOutput(hidden_size={self.hidden_size})"

embedding instance-attribute

embedding: list[float]

hidden_size property

hidden_size: int

__init__

__init__(embedding: list[float]) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return f"EmbeddingOutput(hidden_size={self.hidden_size})"

from_base staticmethod

from_base(pooling_output: PoolingOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(pooling_output: PoolingOutput):
    pooled_data = pooling_output.data
    if pooled_data.ndim != 1:
        raise ValueError("pooled_data should be a 1-D embedding vector")

    return EmbeddingOutput(pooled_data.tolist())

EmbeddingRequestOutput

Bases: PoolingRequestOutput[EmbeddingOutput]

Source code in vllm/outputs.py
class EmbeddingRequestOutput(PoolingRequestOutput[EmbeddingOutput]):

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        return EmbeddingRequestOutput(
            request_id=request_output.request_id,
            outputs=EmbeddingOutput.from_base(request_output.outputs),
            prompt_token_ids=request_output.prompt_token_ids,
            finished=request_output.finished,
        )

from_base staticmethod

from_base(request_output: PoolingRequestOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(request_output: PoolingRequestOutput):
    return EmbeddingRequestOutput(
        request_id=request_output.request_id,
        outputs=EmbeddingOutput.from_base(request_output.outputs),
        prompt_token_ids=request_output.prompt_token_ids,
        finished=request_output.finished,
    )
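
Embedding outputs are normally produced by LLM.embed() with a pooling/embedding model; a hedged sketch (the model name is illustrative):

from vllm import LLM

llm = LLM(model="intfloat/e5-small-v2")  # illustrative embedding model
(result,) = llm.embed(["vLLM is fast."])

output = result.outputs            # an EmbeddingOutput
print(output.hidden_size)          # length of output.embedding
print(output.embedding[:4])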

EngineArgs dataclass

Arguments for vLLM engine.
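
A hedged sketch of the typical offline flow: construct EngineArgs, build an LLMEngine from them, then drive the add_request/step loop (the model name and request ID are illustrative):

from vllm import EngineArgs, LLMEngine, SamplingParams

engine_args = EngineArgs(model="facebook/opt-125m",   # illustrative model
                         gpu_memory_utilization=0.8,
                         max_model_len=2048)
engine = LLMEngine.from_engine_args(engine_args)

engine.add_request("req-0", "Hello, my name is",
                   SamplingParams(max_tokens=16))
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)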

Source code in vllm/engine/arg_utils.py
@dataclass
class EngineArgs:
    """Arguments for vLLM engine."""
    model: str = ModelConfig.model
    served_model_name: Optional[Union[
        str, List[str]]] = ModelConfig.served_model_name
    tokenizer: Optional[str] = ModelConfig.tokenizer
    hf_config_path: Optional[str] = ModelConfig.hf_config_path
    runner: RunnerOption = ModelConfig.runner
    convert: ConvertOption = ModelConfig.convert
    task: Optional[TaskOption] = ModelConfig.task
    skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
    enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
    tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
    trust_remote_code: bool = ModelConfig.trust_remote_code
    allowed_local_media_path: str = ModelConfig.allowed_local_media_path
    allowed_media_domains: Optional[
        list[str]] = ModelConfig.allowed_media_domains
    download_dir: Optional[str] = LoadConfig.download_dir
    safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
    load_format: Union[str, LoadFormats] = LoadConfig.load_format
    config_format: str = ModelConfig.config_format
    dtype: ModelDType = ModelConfig.dtype
    kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
    seed: Optional[int] = ModelConfig.seed
    max_model_len: Optional[int] = ModelConfig.max_model_len
    cuda_graph_sizes: list[int] = get_field(SchedulerConfig,
                                            "cuda_graph_sizes")
    # Note: Specifying a custom executor backend by passing a class
    # is intended for expert use only. The API may change without
    # notice.
    distributed_executor_backend: Optional[Union[
        str, DistributedExecutorBackend,
        Type[ExecutorBase]]] = ParallelConfig.distributed_executor_backend
    # number of P/D disaggregation (or other disaggregation) workers
    pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
    tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
    decode_context_parallel_size: int = \
        ParallelConfig.decode_context_parallel_size
    data_parallel_size: int = ParallelConfig.data_parallel_size
    data_parallel_rank: Optional[int] = None
    data_parallel_start_rank: Optional[int] = None
    data_parallel_size_local: Optional[int] = None
    data_parallel_address: Optional[str] = None
    data_parallel_rpc_port: Optional[int] = None
    data_parallel_hybrid_lb: bool = False
    data_parallel_backend: str = ParallelConfig.data_parallel_backend
    enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
    enable_dbo: bool = ParallelConfig.enable_dbo
    dbo_decode_token_threshold: int = \
        ParallelConfig.dbo_decode_token_threshold
    dbo_prefill_token_threshold: int = \
        ParallelConfig.dbo_prefill_token_threshold
    eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
    enable_eplb: bool = ParallelConfig.enable_eplb
    expert_placement_strategy: ExpertPlacementStrategy = \
        ParallelConfig.expert_placement_strategy
    _api_process_count: int = ParallelConfig._api_process_count
    _api_process_rank: int = ParallelConfig._api_process_rank
    num_redundant_experts: int = EPLBConfig.num_redundant_experts
    eplb_window_size: int = EPLBConfig.window_size
    eplb_step_interval: int = EPLBConfig.step_interval
    eplb_log_balancedness: bool = EPLBConfig.log_balancedness
    max_parallel_loading_workers: Optional[
        int] = ParallelConfig.max_parallel_loading_workers
    block_size: Optional[BlockSize] = CacheConfig.block_size
    enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
    prefix_caching_hash_algo: PrefixCachingHashAlgo = \
        CacheConfig.prefix_caching_hash_algo
    disable_sliding_window: bool = ModelConfig.disable_sliding_window
    disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
    swap_space: float = CacheConfig.swap_space
    cpu_offload_gb: float = CacheConfig.cpu_offload_gb
    gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
    kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes
    max_num_batched_tokens: Optional[
        int] = SchedulerConfig.max_num_batched_tokens
    max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
    max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
    long_prefill_token_threshold: int = \
        SchedulerConfig.long_prefill_token_threshold
    max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs
    max_logprobs: int = ModelConfig.max_logprobs
    logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
    disable_log_stats: bool = False
    revision: Optional[str] = ModelConfig.revision
    code_revision: Optional[str] = ModelConfig.code_revision
    rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
    rope_theta: Optional[float] = ModelConfig.rope_theta
    hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token
    hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
    tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision
    quantization: Optional[QuantizationMethods] = ModelConfig.quantization
    enforce_eager: bool = ModelConfig.enforce_eager
    disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
    limit_mm_per_prompt: dict[str, int] = \
        get_field(MultiModalConfig, "limit_per_prompt")
    interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
    media_io_kwargs: dict[str, dict[str,
                                    Any]] = get_field(MultiModalConfig,
                                                      "media_io_kwargs")
    mm_processor_kwargs: Optional[Dict[str, Any]] = \
        MultiModalConfig.mm_processor_kwargs
    disable_mm_preprocessor_cache: bool = False  # DEPRECATED
    mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
    mm_processor_cache_type: Optional[MMCacheType] = \
        MultiModalConfig.mm_processor_cache_type
    mm_shm_cache_max_object_size_mb: int = \
        MultiModalConfig.mm_shm_cache_max_object_size_mb
    mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
    io_processor_plugin: Optional[str] = None
    skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
    video_pruning_rate: float = MultiModalConfig.video_pruning_rate
    # LoRA fields
    enable_lora: bool = False
    enable_lora_bias: bool = LoRAConfig.bias_enabled
    max_loras: int = LoRAConfig.max_loras
    max_lora_rank: int = LoRAConfig.max_lora_rank
    default_mm_loras: Optional[Dict[str, str]] = \
        LoRAConfig.default_mm_loras
    fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
    max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
    lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
    lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size

    ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
    num_gpu_blocks_override: Optional[
        int] = CacheConfig.num_gpu_blocks_override
    num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
    model_loader_extra_config: dict = \
        get_field(LoadConfig, "model_loader_extra_config")
    ignore_patterns: Optional[Union[str,
                                    List[str]]] = LoadConfig.ignore_patterns

    enable_chunked_prefill: Optional[
        bool] = SchedulerConfig.enable_chunked_prefill
    disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input

    disable_hybrid_kv_cache_manager: bool = (
        SchedulerConfig.disable_hybrid_kv_cache_manager)

    structured_outputs_config: StructuredOutputsConfig = get_field(
        VllmConfig, "structured_outputs_config")
    reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
    # Deprecated guided decoding fields
    guided_decoding_backend: Optional[str] = None
    guided_decoding_disable_fallback: Optional[bool] = None
    guided_decoding_disable_any_whitespace: Optional[bool] = None
    guided_decoding_disable_additional_properties: Optional[bool] = None

    logits_processor_pattern: Optional[
        str] = ModelConfig.logits_processor_pattern

    speculative_config: Optional[Dict[str, Any]] = None

    show_hidden_metrics_for_version: Optional[str] = \
        ObservabilityConfig.show_hidden_metrics_for_version
    otlp_traces_endpoint: Optional[str] = \
        ObservabilityConfig.otlp_traces_endpoint
    collect_detailed_traces: Optional[list[DetailedTraceModules]] = \
        ObservabilityConfig.collect_detailed_traces
    scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
    scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls

    pooler_config: Optional[PoolerConfig] = ModelConfig.pooler_config
    override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
        ModelConfig.override_pooler_config
    compilation_config: CompilationConfig = \
        get_field(VllmConfig, "compilation_config")
    worker_cls: str = ParallelConfig.worker_cls
    worker_extension_cls: str = ParallelConfig.worker_extension_cls

    kv_transfer_config: Optional[KVTransferConfig] = None
    kv_events_config: Optional[KVEventsConfig] = None

    generation_config: str = ModelConfig.generation_config
    enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
    override_generation_config: dict[str, Any] = \
        get_field(ModelConfig, "override_generation_config")
    model_impl: str = ModelConfig.model_impl
    override_attention_dtype: str = ModelConfig.override_attention_dtype

    calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
    mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype
    mamba_ssm_cache_dtype: MambaDType = CacheConfig.mamba_ssm_cache_dtype

    additional_config: dict[str, Any] = \
        get_field(VllmConfig, "additional_config")

    use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
    pt_load_map_location: str = LoadConfig.pt_load_map_location

    # DEPRECATED
    enable_multimodal_encoder_data_parallel: bool = False

    logits_processors: Optional[list[Union[
        str, type[LogitsProcessor]]]] = ModelConfig.logits_processors
    """Custom logitproc types"""

    async_scheduling: bool = SchedulerConfig.async_scheduling

    kv_sharing_fast_prefill: bool = \
        CacheConfig.kv_sharing_fast_prefill

    def __post_init__(self):
        # support `EngineArgs(compilation_config={...})`
        # without having to manually construct a
        # CompilationConfig object
        if isinstance(self.compilation_config, dict):
            self.compilation_config = CompilationConfig(
                **self.compilation_config)
        if isinstance(self.eplb_config, dict):
            self.eplb_config = EPLBConfig(**self.eplb_config)
        # Setup plugins
        from vllm.plugins import load_general_plugins
        load_general_plugins()
        # When HF_HUB_OFFLINE is set, replace the model ID with the local
        # model path.
        if huggingface_hub.constants.HF_HUB_OFFLINE:
            model_id = self.model
            self.model = get_model_path(self.model, self.revision)
            logger.info(
                "HF_HUB_OFFLINE is True, replace model_id [%s] " \
                "to model_path [%s]",model_id, self.model)

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Shared CLI arguments for vLLM engine."""

        # Model arguments
        model_kwargs = get_kwargs(ModelConfig)
        model_group = parser.add_argument_group(
            title="ModelConfig",
            description=ModelConfig.__doc__,
        )
        if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
            model_group.add_argument("--model", **model_kwargs["model"])
        model_group.add_argument("--runner", **model_kwargs["runner"])
        model_group.add_argument("--convert", **model_kwargs["convert"])
        model_group.add_argument("--task",
                                 **model_kwargs["task"],
                                 deprecated=True)
        model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
        model_group.add_argument("--tokenizer-mode",
                                 **model_kwargs["tokenizer_mode"])
        model_group.add_argument("--trust-remote-code",
                                 **model_kwargs["trust_remote_code"])
        model_group.add_argument("--dtype", **model_kwargs["dtype"])
        model_group.add_argument("--seed", **model_kwargs["seed"])
        model_group.add_argument("--hf-config-path",
                                 **model_kwargs["hf_config_path"])
        model_group.add_argument("--allowed-local-media-path",
                                 **model_kwargs["allowed_local_media_path"])
        model_group.add_argument("--allowed-media-domains",
                                 **model_kwargs["allowed_media_domains"])
        model_group.add_argument("--revision", **model_kwargs["revision"])
        model_group.add_argument("--code-revision",
                                 **model_kwargs["code_revision"])
        model_group.add_argument("--rope-scaling",
                                 **model_kwargs["rope_scaling"])
        model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"])
        model_group.add_argument("--tokenizer-revision",
                                 **model_kwargs["tokenizer_revision"])
        model_group.add_argument("--max-model-len",
                                 **model_kwargs["max_model_len"])
        model_group.add_argument("--quantization", "-q",
                                 **model_kwargs["quantization"])
        model_group.add_argument("--enforce-eager",
                                 **model_kwargs["enforce_eager"])
        model_group.add_argument("--max-logprobs",
                                 **model_kwargs["max_logprobs"])
        model_group.add_argument("--logprobs-mode",
                                 **model_kwargs["logprobs_mode"])
        model_group.add_argument("--disable-sliding-window",
                                 **model_kwargs["disable_sliding_window"])
        model_group.add_argument("--disable-cascade-attn",
                                 **model_kwargs["disable_cascade_attn"])
        model_group.add_argument("--skip-tokenizer-init",
                                 **model_kwargs["skip_tokenizer_init"])
        model_group.add_argument("--enable-prompt-embeds",
                                 **model_kwargs["enable_prompt_embeds"])
        model_group.add_argument("--served-model-name",
                                 **model_kwargs["served_model_name"])
        model_group.add_argument("--config-format",
                                 **model_kwargs["config_format"])
        # This one is a special case because it can be a bool or a str.
        # TODO: Handle this in get_kwargs
        model_group.add_argument("--hf-token",
                                 type=str,
                                 nargs="?",
                                 const=True,
                                 default=model_kwargs["hf_token"]["default"],
                                 help=model_kwargs["hf_token"]["help"])
        model_group.add_argument("--hf-overrides",
                                 **model_kwargs["hf_overrides"])
        model_group.add_argument("--pooler-config",
                                 **model_kwargs["pooler_config"])
        model_group.add_argument("--override-pooler-config",
                                 **model_kwargs["override_pooler_config"],
                                 deprecated=True)
        model_group.add_argument("--logits-processor-pattern",
                                 **model_kwargs["logits_processor_pattern"])
        model_group.add_argument("--generation-config",
                                 **model_kwargs["generation_config"])
        model_group.add_argument("--override-generation-config",
                                 **model_kwargs["override_generation_config"])
        model_group.add_argument("--enable-sleep-mode",
                                 **model_kwargs["enable_sleep_mode"])
        model_group.add_argument("--model-impl", **model_kwargs["model_impl"])
        model_group.add_argument("--override-attention-dtype",
                                 **model_kwargs["override_attention_dtype"])
        model_group.add_argument("--logits-processors",
                                 **model_kwargs["logits_processors"])
        model_group.add_argument("--io-processor-plugin",
                                 **model_kwargs["io_processor_plugin"])

        # Model loading arguments
        load_kwargs = get_kwargs(LoadConfig)
        load_group = parser.add_argument_group(
            title="LoadConfig",
            description=LoadConfig.__doc__,
        )
        load_group.add_argument("--load-format", **load_kwargs["load_format"])
        load_group.add_argument("--download-dir",
                                **load_kwargs["download_dir"])
        load_group.add_argument("--safetensors-load-strategy",
                                **load_kwargs["safetensors_load_strategy"])
        load_group.add_argument("--model-loader-extra-config",
                                **load_kwargs["model_loader_extra_config"])
        load_group.add_argument("--ignore-patterns",
                                **load_kwargs["ignore_patterns"])
        load_group.add_argument("--use-tqdm-on-load",
                                **load_kwargs["use_tqdm_on_load"])
        load_group.add_argument('--pt-load-map-location',
                                **load_kwargs["pt_load_map_location"])

        # Structured outputs arguments
        structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig)
        structured_outputs_group = parser.add_argument_group(
            title="StructuredOutputsConfig",
            description=StructuredOutputsConfig.__doc__,
        )
        structured_outputs_group.add_argument(
            "--reasoning-parser",
            # This choice is a special case because it's not static
            choices=list(ReasoningParserManager.reasoning_parsers),
            **structured_outputs_kwargs["reasoning_parser"])
        # Deprecated guided decoding arguments
        for arg, type in [
            ("--guided-decoding-backend", str),
            ("--guided-decoding-disable-fallback", bool),
            ("--guided-decoding-disable-any-whitespace", bool),
            ("--guided-decoding-disable-additional-properties", bool),
        ]:
            structured_outputs_group.add_argument(
                arg,
                type=type,
                help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."),
                deprecated=True)

        # Parallel arguments
        parallel_kwargs = get_kwargs(ParallelConfig)
        parallel_group = parser.add_argument_group(
            title="ParallelConfig",
            description=ParallelConfig.__doc__,
        )
        parallel_group.add_argument(
            "--distributed-executor-backend",
            **parallel_kwargs["distributed_executor_backend"])
        parallel_group.add_argument(
            "--pipeline-parallel-size", "-pp",
            **parallel_kwargs["pipeline_parallel_size"])
        parallel_group.add_argument("--tensor-parallel-size", "-tp",
                                    **parallel_kwargs["tensor_parallel_size"])
        parallel_group.add_argument(
            "--decode-context-parallel-size", "-dcp",
            **parallel_kwargs["decode_context_parallel_size"])
        parallel_group.add_argument("--data-parallel-size", "-dp",
                                    **parallel_kwargs["data_parallel_size"])
        parallel_group.add_argument(
            '--data-parallel-rank',
            '-dpn',
            type=int,
            help='Data parallel rank of this instance. '
            'When set, enables external load balancer mode.')
        parallel_group.add_argument('--data-parallel-start-rank',
                                    '-dpr',
                                    type=int,
                                    help='Starting data parallel rank '
                                    'for secondary nodes.')
        parallel_group.add_argument('--data-parallel-size-local',
                                    '-dpl',
                                    type=int,
                                    help='Number of data parallel replicas '
                                    'to run on this node.')
        parallel_group.add_argument('--data-parallel-address',
                                    '-dpa',
                                    type=str,
                                    help='Address of data parallel cluster '
                                    'head-node.')
        parallel_group.add_argument('--data-parallel-rpc-port',
                                    '-dpp',
                                    type=int,
                                    help='Port for data parallel RPC '
                                    'communication.')
        parallel_group.add_argument('--data-parallel-backend',
                                    '-dpb',
                                    type=str,
                                    default='mp',
                                    help='Backend for data parallel, either '
                                    '"mp" or "ray".')
        parallel_group.add_argument(
            "--data-parallel-hybrid-lb",
            **parallel_kwargs["data_parallel_hybrid_lb"])
        parallel_group.add_argument(
            "--enable-expert-parallel",
            **parallel_kwargs["enable_expert_parallel"])
        parallel_group.add_argument("--enable-dbo",
                                    **parallel_kwargs["enable_dbo"])
        parallel_group.add_argument(
            "--dbo-decode-token-threshold",
            **parallel_kwargs["dbo_decode_token_threshold"])
        parallel_group.add_argument(
            "--dbo-prefill-token-threshold",
            **parallel_kwargs["dbo_prefill_token_threshold"])
        parallel_group.add_argument("--enable-eplb",
                                    **parallel_kwargs["enable_eplb"])
        parallel_group.add_argument("--eplb-config",
                                    **parallel_kwargs["eplb_config"])
        parallel_group.add_argument(
            "--expert-placement-strategy",
            **parallel_kwargs["expert_placement_strategy"])
        parallel_group.add_argument(
            "--num-redundant-experts",
            type=int,
            help=
            "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.",
            deprecated=True)
        parallel_group.add_argument(
            "--eplb-window-size",
            type=int,
            help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.",
            deprecated=True)
        parallel_group.add_argument(
            "--eplb-step-interval",
            type=int,
            help=
            "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.",
            deprecated=True)
        parallel_group.add_argument(
            "--eplb-log-balancedness",
            action=argparse.BooleanOptionalAction,
            help=
            "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.",
            deprecated=True)

        parallel_group.add_argument(
            "--max-parallel-loading-workers",
            **parallel_kwargs["max_parallel_loading_workers"])
        parallel_group.add_argument(
            "--ray-workers-use-nsight",
            **parallel_kwargs["ray_workers_use_nsight"])
        parallel_group.add_argument(
            "--disable-custom-all-reduce",
            **parallel_kwargs["disable_custom_all_reduce"])
        parallel_group.add_argument("--worker-cls",
                                    **parallel_kwargs["worker_cls"])
        parallel_group.add_argument("--worker-extension-cls",
                                    **parallel_kwargs["worker_extension_cls"])
        parallel_group.add_argument(
            "--enable-multimodal-encoder-data-parallel",
            action="store_true",
            deprecated=True)

        # KV cache arguments
        cache_kwargs = get_kwargs(CacheConfig)
        cache_group = parser.add_argument_group(
            title="CacheConfig",
            description=CacheConfig.__doc__,
        )
        cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
        cache_group.add_argument("--gpu-memory-utilization",
                                 **cache_kwargs["gpu_memory_utilization"])
        cache_group.add_argument("--kv-cache-memory-bytes",
                                 **cache_kwargs["kv_cache_memory_bytes"])
        cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
        cache_group.add_argument("--kv-cache-dtype",
                                 **cache_kwargs["cache_dtype"])
        cache_group.add_argument("--num-gpu-blocks-override",
                                 **cache_kwargs["num_gpu_blocks_override"])
        cache_group.add_argument("--enable-prefix-caching",
                                 **cache_kwargs["enable_prefix_caching"])
        cache_group.add_argument("--prefix-caching-hash-algo",
                                 **cache_kwargs["prefix_caching_hash_algo"])
        cache_group.add_argument("--cpu-offload-gb",
                                 **cache_kwargs["cpu_offload_gb"])
        cache_group.add_argument("--calculate-kv-scales",
                                 **cache_kwargs["calculate_kv_scales"])
        cache_group.add_argument("--kv-sharing-fast-prefill",
                                 **cache_kwargs["kv_sharing_fast_prefill"])
        cache_group.add_argument("--mamba-cache-dtype",
                                 **cache_kwargs["mamba_cache_dtype"])
        cache_group.add_argument("--mamba-ssm-cache-dtype",
                                 **cache_kwargs["mamba_ssm_cache_dtype"])

        # Multimodal related configs
        multimodal_kwargs = get_kwargs(MultiModalConfig)
        multimodal_group = parser.add_argument_group(
            title="MultiModalConfig",
            description=MultiModalConfig.__doc__,
        )
        multimodal_group.add_argument("--limit-mm-per-prompt",
                                      **multimodal_kwargs["limit_per_prompt"])
        multimodal_group.add_argument("--media-io-kwargs",
                                      **multimodal_kwargs["media_io_kwargs"])
        multimodal_group.add_argument(
            "--mm-processor-kwargs",
            **multimodal_kwargs["mm_processor_kwargs"])
        multimodal_group.add_argument(
            "--mm-processor-cache-gb",
            **multimodal_kwargs["mm_processor_cache_gb"])
        multimodal_group.add_argument("--disable-mm-preprocessor-cache",
                                      action="store_true",
                                      deprecated=True)
        multimodal_group.add_argument(
            "--mm-processor-cache-type",
            **multimodal_kwargs["mm_processor_cache_type"])
        multimodal_group.add_argument(
            "--mm-shm-cache-max-object-size-mb",
            **multimodal_kwargs["mm_shm_cache_max_object_size_mb"])
        multimodal_group.add_argument(
            "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"])
        multimodal_group.add_argument(
            "--interleave-mm-strings",
            **multimodal_kwargs["interleave_mm_strings"])
        multimodal_group.add_argument("--skip-mm-profiling",
                                      **multimodal_kwargs["skip_mm_profiling"])

        multimodal_group.add_argument(
            "--video-pruning-rate", **multimodal_kwargs["video_pruning_rate"])

        # LoRA related configs
        lora_kwargs = get_kwargs(LoRAConfig)
        lora_group = parser.add_argument_group(
            title="LoRAConfig",
            description=LoRAConfig.__doc__,
        )
        lora_group.add_argument(
            "--enable-lora",
            action=argparse.BooleanOptionalAction,
            help="If True, enable handling of LoRA adapters.")
        lora_group.add_argument("--enable-lora-bias",
                                **lora_kwargs["bias_enabled"])
        lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
        lora_group.add_argument("--max-lora-rank",
                                **lora_kwargs["max_lora_rank"])
        lora_group.add_argument("--lora-extra-vocab-size",
                                **lora_kwargs["lora_extra_vocab_size"])
        lora_group.add_argument(
            "--lora-dtype",
            **lora_kwargs["lora_dtype"],
        )
        lora_group.add_argument("--max-cpu-loras",
                                **lora_kwargs["max_cpu_loras"])
        lora_group.add_argument("--fully-sharded-loras",
                                **lora_kwargs["fully_sharded_loras"])
        lora_group.add_argument("--default-mm-loras",
                                **lora_kwargs["default_mm_loras"])

        # Observability arguments
        observability_kwargs = get_kwargs(ObservabilityConfig)
        observability_group = parser.add_argument_group(
            title="ObservabilityConfig",
            description=ObservabilityConfig.__doc__,
        )
        observability_group.add_argument(
            "--show-hidden-metrics-for-version",
            **observability_kwargs["show_hidden_metrics_for_version"])
        observability_group.add_argument(
            "--otlp-traces-endpoint",
            **observability_kwargs["otlp_traces_endpoint"])
        # TODO: generalise this special case
        choices = observability_kwargs["collect_detailed_traces"]["choices"]
        metavar = f"{{{','.join(choices)}}}"
        observability_kwargs["collect_detailed_traces"]["metavar"] = metavar
        observability_kwargs["collect_detailed_traces"]["choices"] += [
            ",".join(p)
            for p in permutations(get_args(DetailedTraceModules), r=2)
        ]
        observability_group.add_argument(
            "--collect-detailed-traces",
            **observability_kwargs["collect_detailed_traces"])
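        # Illustrative example (not part of the original source): with the
        # permutations added above, and assuming "model" and "worker" are
        # among DetailedTraceModules, both of the following CLI forms are
        # accepted:
        #   --collect-detailed-traces model
        #   --collect-detailed-traces model,worker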

        # Scheduler arguments
        scheduler_kwargs = get_kwargs(SchedulerConfig)
        scheduler_group = parser.add_argument_group(
            title="SchedulerConfig",
            description=SchedulerConfig.__doc__,
        )
        scheduler_group.add_argument(
            "--max-num-batched-tokens",
            **scheduler_kwargs["max_num_batched_tokens"])
        scheduler_group.add_argument("--max-num-seqs",
                                     **scheduler_kwargs["max_num_seqs"])
        scheduler_group.add_argument(
            "--max-num-partial-prefills",
            **scheduler_kwargs["max_num_partial_prefills"])
        scheduler_group.add_argument(
            "--max-long-partial-prefills",
            **scheduler_kwargs["max_long_partial_prefills"])
        scheduler_group.add_argument('--cuda-graph-sizes',
                                     **scheduler_kwargs["cuda_graph_sizes"])
        scheduler_group.add_argument(
            "--long-prefill-token-threshold",
            **scheduler_kwargs["long_prefill_token_threshold"])
        scheduler_group.add_argument("--num-lookahead-slots",
                                     **scheduler_kwargs["num_lookahead_slots"])
        # multi-step scheduling has been removed; corresponding arguments
        # are no longer supported.
        scheduler_group.add_argument("--scheduling-policy",
                                     **scheduler_kwargs["policy"])
        scheduler_group.add_argument(
            "--enable-chunked-prefill",
            **scheduler_kwargs["enable_chunked_prefill"])
        scheduler_group.add_argument(
            "--disable-chunked-mm-input",
            **scheduler_kwargs["disable_chunked_mm_input"])
        scheduler_group.add_argument("--scheduler-cls",
                                     **scheduler_kwargs["scheduler_cls"])
        scheduler_group.add_argument(
            "--disable-hybrid-kv-cache-manager",
            **scheduler_kwargs["disable_hybrid_kv_cache_manager"])
        scheduler_group.add_argument("--async-scheduling",
                                     **scheduler_kwargs["async_scheduling"])

        # vLLM arguments
        vllm_kwargs = get_kwargs(VllmConfig)
        vllm_group = parser.add_argument_group(
            title="VllmConfig",
            description=VllmConfig.__doc__,
        )
        # We construct SpeculativeConfig using fields from other configs in
        # create_engine_config. So we set the type to a JSON string here to
        # delay the Pydantic validation that comes with SpeculativeConfig.
        vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads)
        vllm_group.add_argument("--speculative-config",
                                **vllm_kwargs["speculative_config"])
        vllm_group.add_argument("--kv-transfer-config",
                                **vllm_kwargs["kv_transfer_config"])
        vllm_group.add_argument('--kv-events-config',
                                **vllm_kwargs["kv_events_config"])
        vllm_group.add_argument("--compilation-config", "-O",
                                **vllm_kwargs["compilation_config"])
        vllm_group.add_argument("--additional-config",
                                **vllm_kwargs["additional_config"])
        vllm_group.add_argument('--structured-outputs-config',
                                **vllm_kwargs["structured_outputs_config"])

        # Other arguments
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='Disable logging statistics.')

        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{
            attr: getattr(args, attr)
            for attr in attrs if hasattr(args, attr)
        })
        return engine_args
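    # Illustrative sketch (not part of the original source): how
    # `add_cli_args` and `from_cli_args` are typically combined. It assumes
    # `FlexibleArgumentParser` from `vllm.utils`; the model name is only an
    # example.
    #
    #     from vllm.utils import FlexibleArgumentParser
    #     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    #     args = parser.parse_args(["--model", "facebook/opt-125m",
    #                               "--max-num-seqs", "64"])
    #     engine_args = EngineArgs.from_cli_args(args)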

    def create_model_config(self) -> ModelConfig:
        # gguf file needs a specific model loader and doesn't use hf_repo
        if check_gguf_file(self.model):
            self.quantization = self.load_format = "gguf"

        # NOTE: This is to allow model loading from S3 in CI
        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
                and self.model in MODELS_ON_S3 and self.load_format == "auto"):
            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"

        if self.disable_mm_preprocessor_cache:
            logger.warning(
                "`--disable-mm-preprocessor-cache` is deprecated "
                "and will be removed in v0.13. "
                "Please use `--mm-processor-cache-gb 0` instead.", )

            self.mm_processor_cache_gb = 0
        elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
            logger.warning(
                "VLLM_MM_INPUT_CACHE_GIB` is deprecated "
                "and will be removed in v0.13. "
                "Please use `--mm-processor-cache-gb %d` instead.",
                envs.VLLM_MM_INPUT_CACHE_GIB,
            )

            self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB

        if self.enable_multimodal_encoder_data_parallel:
            logger.warning(
                "--enable-multimodal-encoder-data-parallel` is deprecated "
                "and will be removed in v0.13. "
                "Please use `--mm-encoder-tp-mode data` instead.")

            self.mm_encoder_tp_mode = "data"

        return ModelConfig(
            model=self.model,
            hf_config_path=self.hf_config_path,
            runner=self.runner,
            convert=self.convert,
            task=self.task,
            tokenizer=self.tokenizer,
            tokenizer_mode=self.tokenizer_mode,
            trust_remote_code=self.trust_remote_code,
            allowed_local_media_path=self.allowed_local_media_path,
            allowed_media_domains=self.allowed_media_domains,
            dtype=self.dtype,
            seed=self.seed,
            revision=self.revision,
            code_revision=self.code_revision,
            rope_scaling=self.rope_scaling,
            rope_theta=self.rope_theta,
            hf_token=self.hf_token,
            hf_overrides=self.hf_overrides,
            tokenizer_revision=self.tokenizer_revision,
            max_model_len=self.max_model_len,
            quantization=self.quantization,
            enforce_eager=self.enforce_eager,
            max_logprobs=self.max_logprobs,
            logprobs_mode=self.logprobs_mode,
            disable_sliding_window=self.disable_sliding_window,
            disable_cascade_attn=self.disable_cascade_attn,
            skip_tokenizer_init=self.skip_tokenizer_init,
            enable_prompt_embeds=self.enable_prompt_embeds,
            served_model_name=self.served_model_name,
            limit_mm_per_prompt=self.limit_mm_per_prompt,
            interleave_mm_strings=self.interleave_mm_strings,
            media_io_kwargs=self.media_io_kwargs,
            skip_mm_profiling=self.skip_mm_profiling,
            config_format=self.config_format,
            mm_processor_kwargs=self.mm_processor_kwargs,
            mm_processor_cache_gb=self.mm_processor_cache_gb,
            mm_processor_cache_type=self.mm_processor_cache_type,
            mm_shm_cache_max_object_size_mb=self.
            mm_shm_cache_max_object_size_mb,
            mm_encoder_tp_mode=self.mm_encoder_tp_mode,
            pooler_config=self.pooler_config,
            override_pooler_config=self.override_pooler_config,
            logits_processor_pattern=self.logits_processor_pattern,
            generation_config=self.generation_config,
            override_generation_config=self.override_generation_config,
            enable_sleep_mode=self.enable_sleep_mode,
            model_impl=self.model_impl,
            override_attention_dtype=self.override_attention_dtype,
            logits_processors=self.logits_processors,
            video_pruning_rate=self.video_pruning_rate,
            io_processor_plugin=self.io_processor_plugin,
        )

    def validate_tensorizer_args(self):
        from vllm.model_executor.model_loader.tensorizer import (
            TensorizerConfig)
        for key in self.model_loader_extra_config:
            if key in TensorizerConfig._fields:
                self.model_loader_extra_config["tensorizer_config"][
                    key] = self.model_loader_extra_config[key]

    def create_load_config(self) -> LoadConfig:

        if self.quantization == "bitsandbytes":
            self.load_format = "bitsandbytes"

        if self.load_format == "tensorizer":
            if hasattr(self.model_loader_extra_config, "to_serializable"):
                self.model_loader_extra_config = (
                    self.model_loader_extra_config.to_serializable())
            self.model_loader_extra_config["tensorizer_config"] = {}
            self.model_loader_extra_config["tensorizer_config"][
                "tensorizer_dir"] = self.model
            self.validate_tensorizer_args()

        return LoadConfig(
            load_format=self.load_format,
            download_dir=self.download_dir,
            safetensors_load_strategy=self.safetensors_load_strategy,
            device="cpu"
            if is_online_quantization(self.quantization) else None,
            model_loader_extra_config=self.model_loader_extra_config,
            ignore_patterns=self.ignore_patterns,
            use_tqdm_on_load=self.use_tqdm_on_load,
            pt_load_map_location=self.pt_load_map_location,
        )

    def create_speculative_config(
        self,
        target_model_config: ModelConfig,
        target_parallel_config: ParallelConfig,
        enable_chunked_prefill: bool,
        disable_log_stats: bool,
    ) -> Optional["SpeculativeConfig"]:
        """Initializes and returns a SpeculativeConfig object based on
        `speculative_config`.

        This function utilizes `speculative_config` to create a
        SpeculativeConfig object. The `speculative_config` can either be
        provided as a JSON string input via CLI arguments or directly as a
        dictionary from the engine.
        """
        if self.speculative_config is None:
            return None

        # Note(Shangming): These parameters are not obtained from the cli arg
        # '--speculative-config' and must be passed in when creating the engine
        # config.
        self.speculative_config.update({
            "target_model_config": target_model_config,
            "target_parallel_config": target_parallel_config,
            "enable_chunked_prefill": enable_chunked_prefill,
            "disable_log_stats": disable_log_stats,
        })
        return SpeculativeConfig(**self.speculative_config)
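    # Illustrative example (not part of the original source): the two common
    # ways `speculative_config` reaches this method; the field values are
    # hypothetical.
    #
    #   CLI:
    #     --speculative-config '{"method": "ngram", "num_speculative_tokens": 4}'
    #   Python:
    #     EngineArgs(model="...",
    #                speculative_config={"method": "ngram",
    #                                    "num_speculative_tokens": 4})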

    def create_engine_config(
        self,
        usage_context: Optional[UsageContext] = None,
        headless: bool = False,
    ) -> VllmConfig:
        """
        Create the VllmConfig.

        NOTE: for autoselection of V0 vs V1 engine, we need to
        create the ModelConfig first, since ModelConfig's attrs
        (e.g. the model arch) are needed to make the decision.

        This function sets VLLM_USE_V1=X if VLLM_USE_V1 is
        unspecified by the user.

        If VLLM_USE_V1 is specified by the user but the VllmConfig
        is incompatible, we raise an error.
        """
        current_platform.pre_register_and_update()

        device_config = DeviceConfig(
            device=cast(Device, current_platform.device_type))

        (self.model, self.tokenizer,
         self.speculative_config) = maybe_override_with_speculators(
             model=self.model,
             tokenizer=self.tokenizer,
             revision=self.revision,
             trust_remote_code=self.trust_remote_code,
             vllm_speculative_config=self.speculative_config,
         )
        model_config = self.create_model_config()

        # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
        #   and fall back to V0 for experimental or unsupported features.
        # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
        #   features and raise error for unsupported features.
        # * If VLLM_USE_V1=0, we disable V1.
        use_v1 = False
        try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
        if try_v1 and self._is_v1_supported_oracle(model_config):
            use_v1 = True

        # If user explicitly set VLLM_USE_V1, sanity check we respect it.
        if envs.is_set("VLLM_USE_V1"):
            assert use_v1 == envs.VLLM_USE_V1
        # Otherwise, set the VLLM_USE_V1 variable globally.
        else:
            envs.set_vllm_use_v1(use_v1)

        # Set default arguments for V1 Engine.
        self._set_default_args(usage_context, model_config)
        # Disable chunked prefill for POWER (ppc64le)/ARM/s390x/RISCV CPUs in V1
        if current_platform.is_cpu() and current_platform.get_cpu_architecture(
        ) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM,
              CpuArchEnum.RISCV):
            logger.info("Chunked prefill is not supported for ARM and POWER, "
                        "S390X and RISC-V CPUs; "
                        "disabling it for V1 backend.")
            self.enable_chunked_prefill = False
        assert self.enable_chunked_prefill is not None

        sliding_window: Optional[int] = None
        if not is_interleaved(model_config.hf_text_config):
            # Only set CacheConfig.sliding_window if the model is all sliding
            # window. Otherwise CacheConfig.sliding_window will override the
            # global layers in interleaved sliding window models.
            sliding_window = model_config.get_sliding_window()

        # Note(hc): In the current implementation of decode context
        # parallel(DCP), tp_size needs to be divisible by dcp_size,
        # because the world size does not change by dcp, it simply
        # reuses the GPUs of TP group, and split one TP group into
        # tp_size//dcp_size DCP groups.
        assert self.tensor_parallel_size % self.decode_context_parallel_size \
            == 0, (
            f"tp_size={self.tensor_parallel_size} must be divisible by "
            f"dcp_size={self.decode_context_parallel_size}."
        )
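        # For example (illustrative): tensor_parallel_size=8 with
        # decode_context_parallel_size=2 reuses the 8 TP GPUs as
        # 8 // 2 = 4 DCP groups, whereas decode_context_parallel_size=3
        # would fail the assertion above.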

        cache_config = CacheConfig(
            block_size=self.block_size,
            gpu_memory_utilization=self.gpu_memory_utilization,
            kv_cache_memory_bytes=self.kv_cache_memory_bytes,
            swap_space=self.swap_space,
            cache_dtype=self.kv_cache_dtype,
            is_attention_free=model_config.is_attention_free,
            num_gpu_blocks_override=self.num_gpu_blocks_override,
            sliding_window=sliding_window,
            enable_prefix_caching=self.enable_prefix_caching,
            prefix_caching_hash_algo=self.prefix_caching_hash_algo,
            cpu_offload_gb=self.cpu_offload_gb,
            calculate_kv_scales=self.calculate_kv_scales,
            kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
            mamba_cache_dtype=self.mamba_cache_dtype,
            mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
        )

        ray_runtime_env = None
        if is_ray_initialized():
            # Ray Serve LLM calls `create_engine_config` in the context
            # of a Ray task, therefore we check is_ray_initialized()
            # as opposed to is_in_ray_actor().
            import ray
            ray_runtime_env = ray.get_runtime_context().runtime_env
            logger.info("Using ray runtime env: %s", ray_runtime_env)

        # Get the current placement group if Ray is initialized and
        # we are in a Ray actor. If so, then the placement group will be
        # passed to spawned processes.
        placement_group = None
        if is_in_ray_actor():
            import ray

            # This call initializes Ray automatically if it is not initialized,
            # but we should not do this here.
            placement_group = ray.util.get_current_placement_group()

        assert not headless or not self.data_parallel_hybrid_lb, (
            "data_parallel_hybrid_lb is not applicable in "
            "headless mode")

        data_parallel_external_lb = self.data_parallel_rank is not None
        # If the DP rank is set explicitly, the local DP size must be 1 and
        # pure external load balancing is used.
        if data_parallel_external_lb:
            assert self.data_parallel_size_local in (1, None), (
                "data_parallel_size_local must be 1 when data_parallel_rank "
                "is set")
            data_parallel_size_local = 1
            # Use full external lb if we have local_size of 1.
            self.data_parallel_hybrid_lb = False
        elif self.data_parallel_size_local is not None:
            data_parallel_size_local = self.data_parallel_size_local

            if self.data_parallel_start_rank and not headless:
                # Infer hybrid LB mode.
                self.data_parallel_hybrid_lb = True

            if self.data_parallel_hybrid_lb and data_parallel_size_local == 1:
                # Use full external lb if we have local_size of 1.
                data_parallel_external_lb = True
                self.data_parallel_hybrid_lb = False

            if data_parallel_size_local == self.data_parallel_size:
                # Disable hybrid LB mode if set for a single node
                self.data_parallel_hybrid_lb = False

            self.data_parallel_rank = self.data_parallel_start_rank or 0
        else:
            assert not self.data_parallel_hybrid_lb, (
                "data_parallel_size_local must be set to use "
                "data_parallel_hybrid_lb.")

            # Local DP size defaults to global DP size if not set.
            data_parallel_size_local = self.data_parallel_size

        # DP address, used in multi-node case for torch distributed group
        # and ZMQ sockets.
        if self.data_parallel_address is None:
            if self.data_parallel_backend == "ray":
                host_ip = get_ip()
                logger.info(
                    "Using host IP %s as ray-based data parallel address",
                    host_ip)
                data_parallel_address = host_ip
            else:
                assert self.data_parallel_backend == "mp", (
                    "data_parallel_backend can only be ray or mp, got %s",
                    self.data_parallel_backend)
                data_parallel_address = ParallelConfig.data_parallel_master_ip
        else:
            data_parallel_address = self.data_parallel_address

        # This port is only used when there are remote data parallel engines,
        # otherwise the local IPC transport is used.
        data_parallel_rpc_port = self.data_parallel_rpc_port if (
            self.data_parallel_rpc_port
            is not None) else ParallelConfig.data_parallel_rpc_port

        if self.async_scheduling:
            # Async scheduling does not work with the uniprocess backend.
            if self.distributed_executor_backend is None:
                self.distributed_executor_backend = "mp"
                logger.info("Defaulting to mp-based distributed executor "
                            "backend for async scheduling.")
            if self.pipeline_parallel_size > 1:
                raise ValueError("Async scheduling is not supported with "
                                 "pipeline-parallel-size > 1.")

            # Currently, async scheduling does not support speculative decoding.
            # TODO(woosuk): Support it.
            if self.speculative_config is not None:
                raise ValueError(
                    "Currently, speculative decoding is not supported with "
                    "async scheduling.")

        # Forward the deprecated CLI args to the EPLB config.
        if self.num_redundant_experts is not None:
            self.eplb_config.num_redundant_experts = self.num_redundant_experts
        if self.eplb_window_size is not None:
            self.eplb_config.window_size = self.eplb_window_size
        if self.eplb_step_interval is not None:
            self.eplb_config.step_interval = self.eplb_step_interval
        if self.eplb_log_balancedness is not None:
            self.eplb_config.log_balancedness = self.eplb_log_balancedness
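        # Illustrative example (not part of the original source): the
        # deprecated `--num-redundant-experts 2` ends up here as
        # `self.eplb_config.num_redundant_experts = 2`, the same effect as
        # configuring the field through `--eplb-config` directly.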

        parallel_config = ParallelConfig(
            pipeline_parallel_size=self.pipeline_parallel_size,
            tensor_parallel_size=self.tensor_parallel_size,
            data_parallel_size=self.data_parallel_size,
            data_parallel_rank=self.data_parallel_rank or 0,
            data_parallel_external_lb=data_parallel_external_lb,
            data_parallel_size_local=data_parallel_size_local,
            data_parallel_master_ip=data_parallel_address,
            data_parallel_rpc_port=data_parallel_rpc_port,
            data_parallel_backend=self.data_parallel_backend,
            data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
            enable_expert_parallel=self.enable_expert_parallel,
            enable_dbo=self.enable_dbo,
            dbo_decode_token_threshold=self.dbo_decode_token_threshold,
            dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
            enable_eplb=self.enable_eplb,
            eplb_config=self.eplb_config,
            expert_placement_strategy=self.expert_placement_strategy,
            max_parallel_loading_workers=self.max_parallel_loading_workers,
            disable_custom_all_reduce=self.disable_custom_all_reduce,
            ray_workers_use_nsight=self.ray_workers_use_nsight,
            ray_runtime_env=ray_runtime_env,
            placement_group=placement_group,
            distributed_executor_backend=self.distributed_executor_backend,
            worker_cls=self.worker_cls,
            worker_extension_cls=self.worker_extension_cls,
            decode_context_parallel_size=self.decode_context_parallel_size,
            _api_process_count=self._api_process_count,
            _api_process_rank=self._api_process_rank,
        )

        speculative_config = self.create_speculative_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            enable_chunked_prefill=self.enable_chunked_prefill,
            disable_log_stats=self.disable_log_stats,
        )

        # make sure num_lookahead_slots is set appropriately depending on
        # whether speculative decoding is enabled
        num_lookahead_slots = self.num_lookahead_slots
        if speculative_config is not None:
            num_lookahead_slots = speculative_config.num_lookahead_slots

        scheduler_config = SchedulerConfig(
            runner_type=model_config.runner_type,
            max_num_batched_tokens=self.max_num_batched_tokens,
            max_num_seqs=self.max_num_seqs,
            max_model_len=model_config.max_model_len,
            cuda_graph_sizes=self.cuda_graph_sizes,
            num_lookahead_slots=num_lookahead_slots,
            enable_chunked_prefill=self.enable_chunked_prefill,
            disable_chunked_mm_input=self.disable_chunked_mm_input,
            is_multimodal_model=model_config.is_multimodal_model,
            is_encoder_decoder=model_config.is_encoder_decoder,
            send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                             and parallel_config.use_ray),
            policy=self.scheduling_policy,
            scheduler_cls=self.scheduler_cls,
            max_num_partial_prefills=self.max_num_partial_prefills,
            max_long_partial_prefills=self.max_long_partial_prefills,
            long_prefill_token_threshold=self.long_prefill_token_threshold,
            disable_hybrid_kv_cache_manager=self.
            disable_hybrid_kv_cache_manager,
            async_scheduling=self.async_scheduling,
        )

        if not model_config.is_multimodal_model and self.default_mm_loras:
            raise ValueError(
                "Default modality-specific LoRA(s) were provided for a "
                "non multimodal model")

        lora_config = LoRAConfig(
            bias_enabled=self.enable_lora_bias,
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            default_mm_loras=self.default_mm_loras,
            fully_sharded_loras=self.fully_sharded_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
            and self.max_cpu_loras > 0 else None) if self.enable_lora else None

        # bitsandbytes pre-quantized models need a specific model loader
        if model_config.quantization == "bitsandbytes":
            self.quantization = self.load_format = "bitsandbytes"

        load_config = self.create_load_config()

        # Pass reasoning_parser into StructuredOutputsConfig
        if self.reasoning_parser:
            self.structured_outputs_config.reasoning_parser = \
                self.reasoning_parser

        # Forward the deprecated CLI args to the StructuredOutputsConfig
        so_config = self.structured_outputs_config
        if self.guided_decoding_backend is not None:
            so_config.guided_decoding_backend = self.guided_decoding_backend
        if self.guided_decoding_disable_fallback is not None:
            so_config.guided_decoding_disable_fallback = (
                self.guided_decoding_disable_fallback)
        if self.guided_decoding_disable_any_whitespace is not None:
            so_config.guided_decoding_disable_any_whitespace = (
                self.guided_decoding_disable_any_whitespace)
        if self.guided_decoding_disable_additional_properties is not None:
            so_config.guided_decoding_disable_additional_properties = (
                self.guided_decoding_disable_additional_properties)
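        # Illustrative example (not part of the original source): a deprecated
        # flag such as `--guided-decoding-backend <name>` is forwarded here to
        # `so_config.guided_decoding_backend`, mirroring what
        # `--structured-outputs-config` would set directly.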

        observability_config = ObservabilityConfig(
            show_hidden_metrics_for_version=(
                self.show_hidden_metrics_for_version),
            otlp_traces_endpoint=self.otlp_traces_endpoint,
            collect_detailed_traces=self.collect_detailed_traces,
        )

        config = VllmConfig(
            model_config=model_config,
            cache_config=cache_config,
            parallel_config=parallel_config,
            scheduler_config=scheduler_config,
            device_config=device_config,
            lora_config=lora_config,
            speculative_config=speculative_config,
            load_config=load_config,
            structured_outputs_config=self.structured_outputs_config,
            observability_config=observability_config,
            compilation_config=self.compilation_config,
            kv_transfer_config=self.kv_transfer_config,
            kv_events_config=self.kv_events_config,
            additional_config=self.additional_config,
        )

        return config
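    # Illustrative sketch (not part of the original source): typical use of
    # `create_engine_config`; the model name is only an example.
    #
    #     from vllm.usage.usage_lib import UsageContext
    #     engine_args = EngineArgs(model="facebook/opt-125m",
    #                              tensor_parallel_size=1)
    #     vllm_config = engine_args.create_engine_config(
    #         usage_context=UsageContext.LLM_CLASS)
    #     # vllm_config bundles the ModelConfig, CacheConfig, ParallelConfig,
    #     # SchedulerConfig, LoadConfig, etc. built above.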

    def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
        """Oracle for whether to use V0 or V1 Engine by default."""

        #############################################################
        # Unsupported Feature Flags on V1.

        if (self.logits_processor_pattern
                != EngineArgs.logits_processor_pattern):
            _raise_or_fallback(feature_name="--logits-processor-pattern",
                               recommend_to_remove=False)
            return False

        # No Mamba or Encoder-Decoder so far.
        if not model_config.is_v1_compatible:
            _raise_or_fallback(feature_name=model_config.architectures,
                               recommend_to_remove=False)
            return False

        # No Concurrent Partial Prefills so far.
        if (self.max_num_partial_prefills
                != SchedulerConfig.max_num_partial_prefills
                or self.max_long_partial_prefills
                != SchedulerConfig.max_long_partial_prefills):
            _raise_or_fallback(feature_name="Concurrent Partial Prefill",
                               recommend_to_remove=False)
            return False

        # V1 supports N-gram, Medusa, and Eagle speculative decoding.
        if self.speculative_config is not None:
            # speculative_config could still be a dict at this point
            if isinstance(self.speculative_config, dict):
                method = self.speculative_config.get("method", None)
            else:
                method = self.speculative_config.method

            if method == "draft_model":
                raise NotImplementedError(
                    "Draft model speculative decoding is not supported yet. "
                    "Please consider using other speculative decoding methods "
                    "such as ngram, medusa, eagle, or mtp.")

        V1_BACKENDS = [
            "FLASH_ATTN",
            "PALLAS",
            "TRITON_ATTN",
            "TRITON_MLA",
            "CUTLASS_MLA",
            "FLASHMLA",
            "FLASH_ATTN_MLA",
            "FLASHINFER",
            "FLASHINFER_MLA",
            "ROCM_AITER_MLA",
            "TORCH_SDPA",
            "FLEX_ATTENTION",
            "TREE_ATTN",
            "XFORMERS",
            "ROCM_ATTN",
        ]
        if (envs.is_set("VLLM_ATTENTION_BACKEND")
                and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
            name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
            _raise_or_fallback(feature_name=name, recommend_to_remove=True)
            return False
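        # Illustrative note (not part of the original source): e.g.
        # VLLM_ATTENTION_BACKEND=FLASHINFER keeps V1 eligible because it is in
        # V1_BACKENDS, while a backend outside that list either falls back to
        # V0 or raises, depending on whether VLLM_USE_V1 was explicitly set.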

        #############################################################
        # Experimental Features - allow users to opt in.

        if self.pipeline_parallel_size > 1:
            supports_pp = getattr(self.distributed_executor_backend,
                                  'supports_pp', False)
            if not supports_pp and self.distributed_executor_backend not in (
                    ParallelConfig.distributed_executor_backend, "ray", "mp",
                    "external_launcher"):
                name = "Pipeline Parallelism without Ray distributed " \
                        "executor or multiprocessing executor or external " \
                        "launcher"
                _raise_or_fallback(feature_name=name,
                                   recommend_to_remove=False)
                return False

        if (current_platform.is_cpu()
                and model_config.get_sliding_window() is not None):
            _raise_or_fallback(feature_name="sliding window (CPU backend)",
                               recommend_to_remove=False)
            return False

        #############################################################

        return True

    def _set_default_args(self, usage_context: UsageContext,
                          model_config: ModelConfig) -> None:
        """Set Default Arguments for V1 Engine."""

        # V1 always uses chunked prefills and prefix caching
        # for non-pooling tasks.
        # For pooling tasks the default is False
        if model_config.runner_type != "pooling":
            self.enable_chunked_prefill = True

            # TODO: When prefix caching supports prompt embeds inputs, this
            # check can be removed.
            if (self.enable_prompt_embeds
                    and self.enable_prefix_caching is not False):
                logger.warning(
                    "--enable-prompt-embeds and --enable-prefix-caching "
                    "are not supported together in V1. Prefix caching has "
                    "been disabled.")
                self.enable_prefix_caching = False

            if self.enable_prefix_caching is None:
                self.enable_prefix_caching = True
        else:

            pooling_type = model_config.pooler_config.pooling_type
            is_causal = getattr(model_config.hf_config, "is_causal", True)
            incremental_prefill_supported = (pooling_type is not None
                                             and pooling_type.lower() == "last"
                                             and is_causal)

            action = "Enabling" if \
                incremental_prefill_supported else "Disabling"

            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = incremental_prefill_supported
                logger.info("(%s) chunked prefill by default", action)
            if self.enable_prefix_caching is None:
                self.enable_prefix_caching = incremental_prefill_supported
                logger.info("(%s) prefix caching by default", action)

        # V1 should use the new scheduler by default.
        # Swap it only if this arg is set to the original V0 default
        if self.scheduler_cls == EngineArgs.scheduler_cls:
            self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"

        # When no user override, set the default values based on the usage
        # context.
        # Use different default values for different hardware.

        # Try to query the device name on the current platform. If it fails,
        # it may be because the platform that imports vLLM is not the same
        # as the platform that vLLM is running on (e.g. the case of scaling
        # vLLM with Ray) and has no GPUs. In this case we use the default
        # values for non-H100/H200 GPUs.
        try:
            device_memory = current_platform.get_device_total_memory()
            device_name = current_platform.get_device_name().lower()
        except Exception:
            # These values are only used to pick default_max_num_batched_tokens
            # and default_max_num_seqs; fall back to the conservative defaults.
            device_memory = 0
            device_name = ""

        # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
        # throughput, see PR #17885 for more details.
        # So here we do an extra device name check to prevent such regression.
        from vllm.usage.usage_lib import UsageContext
        if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
            # For GPUs like H100 and MI300x, use larger default values.
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 16384,
                UsageContext.OPENAI_API_SERVER: 8192,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 1024,
                UsageContext.OPENAI_API_SERVER: 1024,
            }
        else:
            # TODO(woosuk): Tune the default values for other hardware.
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 8192,
                UsageContext.OPENAI_API_SERVER: 2048,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 256,
                UsageContext.OPENAI_API_SERVER: 256,
            }

        # tpu specific default values.
        if current_platform.is_tpu():
            default_max_num_batched_tokens_tpu = {
                UsageContext.LLM_CLASS: {
                    'V6E': 2048,
                    'V5E': 1024,
                    'V5P': 512,
                },
                UsageContext.OPENAI_API_SERVER: {
                    'V6E': 1024,
                    'V5E': 512,
                    'V5P': 256,
                }
            }

        # cpu specific default values.
        if current_platform.is_cpu():
            world_size = self.pipeline_parallel_size * self.tensor_parallel_size
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 4096 * world_size,
                UsageContext.OPENAI_API_SERVER: 2048 * world_size,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 256 * world_size,
                UsageContext.OPENAI_API_SERVER: 128 * world_size,
            }

        use_context_value = usage_context.value if usage_context else None
        if (self.max_num_batched_tokens is None
                and usage_context in default_max_num_batched_tokens):
            if current_platform.is_tpu():
                chip_name = current_platform.get_device_name()
                if chip_name in default_max_num_batched_tokens_tpu[
                        usage_context]:
                    self.max_num_batched_tokens = \
                        default_max_num_batched_tokens_tpu[
                            usage_context][chip_name]
                else:
                    self.max_num_batched_tokens = \
                        default_max_num_batched_tokens[usage_context]
            else:
                if not self.enable_chunked_prefill:
                    self.max_num_batched_tokens = model_config.max_model_len
                else:
                    self.max_num_batched_tokens = \
                        default_max_num_batched_tokens[usage_context]
            logger.debug(
                "Setting max_num_batched_tokens to %d for %s usage context.",
                self.max_num_batched_tokens, use_context_value)

        if (self.max_num_seqs is None
                and usage_context in default_max_num_seqs):
            self.max_num_seqs = min(default_max_num_seqs[usage_context],
                                    self.max_num_batched_tokens or sys.maxsize)

            logger.debug("Setting max_num_seqs to %d for %s usage context.",
                         self.max_num_seqs, use_context_value)
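        # Illustrative example (not part of the original source): on a GPU
        # with >= 70 GiB of memory that is not an A100 (e.g. an H100), the
        # OPENAI_API_SERVER usage context defaults to
        # max_num_batched_tokens=8192 and max_num_seqs=1024 unless the user
        # overrides them.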

_api_process_count class-attribute instance-attribute

_api_process_count: int = _api_process_count

_api_process_rank class-attribute instance-attribute

_api_process_rank: int = _api_process_rank

additional_config class-attribute instance-attribute

additional_config: dict[str, Any] = get_field(
    VllmConfig, "additional_config"
)

allowed_local_media_path class-attribute instance-attribute

allowed_local_media_path: str = allowed_local_media_path

allowed_media_domains class-attribute instance-attribute

allowed_media_domains: Optional[list[str]] = (
    allowed_media_domains
)

async_scheduling class-attribute instance-attribute

async_scheduling: bool = async_scheduling

block_size class-attribute instance-attribute

block_size: Optional[BlockSize] = block_size

calculate_kv_scales class-attribute instance-attribute

calculate_kv_scales: bool = calculate_kv_scales

code_revision class-attribute instance-attribute

code_revision: Optional[str] = code_revision

collect_detailed_traces class-attribute instance-attribute

compilation_config class-attribute instance-attribute

compilation_config: CompilationConfig = get_field(
    VllmConfig, "compilation_config"
)

config_format class-attribute instance-attribute

config_format: str = config_format

convert class-attribute instance-attribute

convert: ConvertOption = convert

cpu_offload_gb class-attribute instance-attribute

cpu_offload_gb: float = cpu_offload_gb

cuda_graph_sizes class-attribute instance-attribute

cuda_graph_sizes: list[int] = get_field(
    SchedulerConfig, "cuda_graph_sizes"
)

data_parallel_address class-attribute instance-attribute

data_parallel_address: Optional[str] = None

data_parallel_backend class-attribute instance-attribute

data_parallel_backend: str = data_parallel_backend

data_parallel_hybrid_lb class-attribute instance-attribute

data_parallel_hybrid_lb: bool = False

data_parallel_rank class-attribute instance-attribute

data_parallel_rank: Optional[int] = None

data_parallel_rpc_port class-attribute instance-attribute

data_parallel_rpc_port: Optional[int] = None

data_parallel_size class-attribute instance-attribute

data_parallel_size: int = data_parallel_size

data_parallel_size_local class-attribute instance-attribute

data_parallel_size_local: Optional[int] = None

data_parallel_start_rank class-attribute instance-attribute

data_parallel_start_rank: Optional[int] = None

dbo_decode_token_threshold class-attribute instance-attribute

dbo_decode_token_threshold: int = dbo_decode_token_threshold

dbo_prefill_token_threshold class-attribute instance-attribute

dbo_prefill_token_threshold: int = (
    dbo_prefill_token_threshold
)

decode_context_parallel_size class-attribute instance-attribute

decode_context_parallel_size: int = (
    decode_context_parallel_size
)

default_mm_loras class-attribute instance-attribute

default_mm_loras: Optional[Dict[str, str]] = (
    default_mm_loras
)

disable_cascade_attn class-attribute instance-attribute

disable_cascade_attn: bool = disable_cascade_attn

disable_chunked_mm_input class-attribute instance-attribute

disable_chunked_mm_input: bool = disable_chunked_mm_input

disable_custom_all_reduce class-attribute instance-attribute

disable_custom_all_reduce: bool = disable_custom_all_reduce

disable_hybrid_kv_cache_manager class-attribute instance-attribute

disable_hybrid_kv_cache_manager: bool = (
    disable_hybrid_kv_cache_manager
)

disable_log_stats class-attribute instance-attribute

disable_log_stats: bool = False

disable_mm_preprocessor_cache class-attribute instance-attribute

disable_mm_preprocessor_cache: bool = False

disable_sliding_window class-attribute instance-attribute

disable_sliding_window: bool = disable_sliding_window

distributed_executor_backend class-attribute instance-attribute

distributed_executor_backend: Optional[
    Union[
        str,
        DistributedExecutorBackend,
        Type[ExecutorBase],
    ]
] = distributed_executor_backend

download_dir class-attribute instance-attribute

download_dir: Optional[str] = download_dir

dtype class-attribute instance-attribute

dtype: ModelDType = dtype

enable_chunked_prefill class-attribute instance-attribute

enable_chunked_prefill: Optional[bool] = (
    enable_chunked_prefill
)

enable_dbo class-attribute instance-attribute

enable_dbo: bool = enable_dbo

enable_eplb class-attribute instance-attribute

enable_eplb: bool = enable_eplb

enable_expert_parallel class-attribute instance-attribute

enable_expert_parallel: bool = enable_expert_parallel

enable_lora class-attribute instance-attribute

enable_lora: bool = False

enable_lora_bias class-attribute instance-attribute

enable_lora_bias: bool = bias_enabled

enable_multimodal_encoder_data_parallel class-attribute instance-attribute

enable_multimodal_encoder_data_parallel: bool = False

enable_prefix_caching class-attribute instance-attribute

enable_prefix_caching: Optional[bool] = (
    enable_prefix_caching
)

enable_prompt_embeds class-attribute instance-attribute

enable_prompt_embeds: bool = enable_prompt_embeds

enable_sleep_mode class-attribute instance-attribute

enable_sleep_mode: bool = enable_sleep_mode

enforce_eager class-attribute instance-attribute

enforce_eager: bool = enforce_eager

eplb_config class-attribute instance-attribute

eplb_config: EPLBConfig = get_field(
    ParallelConfig, "eplb_config"
)

eplb_log_balancedness class-attribute instance-attribute

eplb_log_balancedness: bool = log_balancedness

eplb_step_interval class-attribute instance-attribute

eplb_step_interval: int = step_interval

eplb_window_size class-attribute instance-attribute

eplb_window_size: int = window_size

expert_placement_strategy class-attribute instance-attribute

expert_placement_strategy: ExpertPlacementStrategy = (
    expert_placement_strategy
)

fully_sharded_loras class-attribute instance-attribute

fully_sharded_loras: bool = fully_sharded_loras

generation_config class-attribute instance-attribute

generation_config: str = generation_config

gpu_memory_utilization class-attribute instance-attribute

gpu_memory_utilization: float = gpu_memory_utilization

guided_decoding_backend class-attribute instance-attribute

guided_decoding_backend: Optional[str] = None

guided_decoding_disable_additional_properties class-attribute instance-attribute

guided_decoding_disable_additional_properties: Optional[
    bool
] = None

guided_decoding_disable_any_whitespace class-attribute instance-attribute

guided_decoding_disable_any_whitespace: Optional[bool] = (
    None
)

guided_decoding_disable_fallback class-attribute instance-attribute

guided_decoding_disable_fallback: Optional[bool] = None

hf_config_path class-attribute instance-attribute

hf_config_path: Optional[str] = hf_config_path

hf_overrides class-attribute instance-attribute

hf_overrides: HfOverrides = get_field(
    ModelConfig, "hf_overrides"
)

hf_token class-attribute instance-attribute

hf_token: Optional[Union[bool, str]] = hf_token

ignore_patterns class-attribute instance-attribute

ignore_patterns: Optional[Union[str, List[str]]] = (
    ignore_patterns
)

interleave_mm_strings class-attribute instance-attribute

interleave_mm_strings: bool = interleave_mm_strings

io_processor_plugin class-attribute instance-attribute

io_processor_plugin: Optional[str] = None

kv_cache_dtype class-attribute instance-attribute

kv_cache_dtype: CacheDType = cache_dtype

kv_cache_memory_bytes class-attribute instance-attribute

kv_cache_memory_bytes: Optional[int] = kv_cache_memory_bytes

kv_events_config class-attribute instance-attribute

kv_events_config: Optional[KVEventsConfig] = None

kv_sharing_fast_prefill class-attribute instance-attribute

kv_sharing_fast_prefill: bool = kv_sharing_fast_prefill

kv_transfer_config class-attribute instance-attribute

kv_transfer_config: Optional[KVTransferConfig] = None

limit_mm_per_prompt class-attribute instance-attribute

limit_mm_per_prompt: dict[str, int] = get_field(
    MultiModalConfig, "limit_per_prompt"
)

load_format class-attribute instance-attribute

load_format: Union[str, LoadFormats] = load_format

logits_processor_pattern class-attribute instance-attribute

logits_processor_pattern: Optional[str] = (
    logits_processor_pattern
)

logits_processors class-attribute instance-attribute

Custom logitproc types

logprobs_mode class-attribute instance-attribute

logprobs_mode: LogprobsMode = logprobs_mode

long_prefill_token_threshold class-attribute instance-attribute

long_prefill_token_threshold: int = (
    long_prefill_token_threshold
)

lora_dtype class-attribute instance-attribute

lora_dtype: Optional[Union[str, dtype]] = lora_dtype

lora_extra_vocab_size class-attribute instance-attribute

lora_extra_vocab_size: int = lora_extra_vocab_size

mamba_cache_dtype class-attribute instance-attribute

mamba_cache_dtype: MambaDType = mamba_cache_dtype

mamba_ssm_cache_dtype class-attribute instance-attribute

mamba_ssm_cache_dtype: MambaDType = mamba_ssm_cache_dtype

max_cpu_loras class-attribute instance-attribute

max_cpu_loras: Optional[int] = max_cpu_loras

max_logprobs class-attribute instance-attribute

max_logprobs: int = max_logprobs

max_long_partial_prefills class-attribute instance-attribute

max_long_partial_prefills: int = max_long_partial_prefills

max_lora_rank class-attribute instance-attribute

max_lora_rank: int = max_lora_rank

max_loras class-attribute instance-attribute

max_loras: int = max_loras

max_model_len class-attribute instance-attribute

max_model_len: Optional[int] = max_model_len

max_num_batched_tokens class-attribute instance-attribute

max_num_batched_tokens: Optional[int] = (
    max_num_batched_tokens
)

max_num_partial_prefills class-attribute instance-attribute

max_num_partial_prefills: int = max_num_partial_prefills

max_num_seqs class-attribute instance-attribute

max_num_seqs: Optional[int] = max_num_seqs

max_parallel_loading_workers class-attribute instance-attribute

max_parallel_loading_workers: Optional[int] = (
    max_parallel_loading_workers
)

media_io_kwargs class-attribute instance-attribute

media_io_kwargs: dict[str, dict[str, Any]] = get_field(
    MultiModalConfig, "media_io_kwargs"
)

mm_encoder_tp_mode class-attribute instance-attribute

mm_encoder_tp_mode: MMEncoderTPMode = mm_encoder_tp_mode

mm_processor_cache_gb class-attribute instance-attribute

mm_processor_cache_gb: float = mm_processor_cache_gb

mm_processor_cache_type class-attribute instance-attribute

mm_processor_cache_type: Optional[MMCacheType] = (
    mm_processor_cache_type
)

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[Dict[str, Any]] = (
    mm_processor_kwargs
)

mm_shm_cache_max_object_size_mb class-attribute instance-attribute

mm_shm_cache_max_object_size_mb: int = (
    mm_shm_cache_max_object_size_mb
)

model class-attribute instance-attribute

model: str = model

model_impl class-attribute instance-attribute

model_impl: str = model_impl

model_loader_extra_config class-attribute instance-attribute

model_loader_extra_config: dict = get_field(
    LoadConfig, "model_loader_extra_config"
)

num_gpu_blocks_override class-attribute instance-attribute

num_gpu_blocks_override: Optional[int] = (
    num_gpu_blocks_override
)

num_lookahead_slots class-attribute instance-attribute

num_lookahead_slots: int = num_lookahead_slots

num_redundant_experts class-attribute instance-attribute

num_redundant_experts: int = num_redundant_experts

otlp_traces_endpoint class-attribute instance-attribute

otlp_traces_endpoint: Optional[str] = otlp_traces_endpoint

override_attention_dtype class-attribute instance-attribute

override_attention_dtype: str = override_attention_dtype

override_generation_config class-attribute instance-attribute

override_generation_config: dict[str, Any] = get_field(
    ModelConfig, "override_generation_config"
)

override_pooler_config class-attribute instance-attribute

override_pooler_config: Optional[
    Union[dict, PoolerConfig]
] = override_pooler_config

pipeline_parallel_size class-attribute instance-attribute

pipeline_parallel_size: int = pipeline_parallel_size

pooler_config class-attribute instance-attribute

prefix_caching_hash_algo class-attribute instance-attribute

prefix_caching_hash_algo: PrefixCachingHashAlgo = (
    prefix_caching_hash_algo
)

pt_load_map_location class-attribute instance-attribute

pt_load_map_location: str = pt_load_map_location

quantization class-attribute instance-attribute

ray_workers_use_nsight class-attribute instance-attribute

ray_workers_use_nsight: bool = ray_workers_use_nsight

reasoning_parser class-attribute instance-attribute

reasoning_parser: str = reasoning_parser

revision class-attribute instance-attribute

revision: Optional[str] = revision

rope_scaling class-attribute instance-attribute

rope_scaling: dict[str, Any] = get_field(
    ModelConfig, "rope_scaling"
)

rope_theta class-attribute instance-attribute

rope_theta: Optional[float] = rope_theta

runner class-attribute instance-attribute

runner: RunnerOption = runner

safetensors_load_strategy class-attribute instance-attribute

safetensors_load_strategy: str = safetensors_load_strategy

scheduler_cls class-attribute instance-attribute

scheduler_cls: Union[str, Type[object]] = scheduler_cls

scheduling_policy class-attribute instance-attribute

scheduling_policy: SchedulerPolicy = policy

seed class-attribute instance-attribute

seed: Optional[int] = seed

served_model_name class-attribute instance-attribute

served_model_name: Optional[Union[str, List[str]]] = (
    served_model_name
)

show_hidden_metrics_for_version class-attribute instance-attribute

show_hidden_metrics_for_version: Optional[str] = (
    show_hidden_metrics_for_version
)

skip_mm_profiling class-attribute instance-attribute

skip_mm_profiling: bool = skip_mm_profiling

skip_tokenizer_init class-attribute instance-attribute

skip_tokenizer_init: bool = skip_tokenizer_init

speculative_config class-attribute instance-attribute

speculative_config: Optional[Dict[str, Any]] = None

structured_outputs_config class-attribute instance-attribute

structured_outputs_config: StructuredOutputsConfig = (
    get_field(VllmConfig, "structured_outputs_config")
)

swap_space class-attribute instance-attribute

swap_space: float = swap_space

task class-attribute instance-attribute

task: Optional[TaskOption] = task

tensor_parallel_size class-attribute instance-attribute

tensor_parallel_size: int = tensor_parallel_size

tokenizer class-attribute instance-attribute

tokenizer: Optional[str] = tokenizer

tokenizer_mode class-attribute instance-attribute

tokenizer_mode: TokenizerMode = tokenizer_mode

tokenizer_revision class-attribute instance-attribute

tokenizer_revision: Optional[str] = tokenizer_revision

trust_remote_code class-attribute instance-attribute

trust_remote_code: bool = trust_remote_code

use_tqdm_on_load class-attribute instance-attribute

use_tqdm_on_load: bool = use_tqdm_on_load

video_pruning_rate class-attribute instance-attribute

video_pruning_rate: float = video_pruning_rate

worker_cls class-attribute instance-attribute

worker_cls: str = worker_cls

worker_extension_cls class-attribute instance-attribute

worker_extension_cls: str = worker_extension_cls

__init__

__init__(
    model: str = model,
    served_model_name: Optional[
        Union[str, List[str]]
    ] = served_model_name,
    tokenizer: Optional[str] = tokenizer,
    hf_config_path: Optional[str] = hf_config_path,
    runner: RunnerOption = runner,
    convert: ConvertOption = convert,
    task: Optional[TaskOption] = task,
    skip_tokenizer_init: bool = skip_tokenizer_init,
    enable_prompt_embeds: bool = enable_prompt_embeds,
    tokenizer_mode: TokenizerMode = tokenizer_mode,
    trust_remote_code: bool = trust_remote_code,
    allowed_local_media_path: str = allowed_local_media_path,
    allowed_media_domains: Optional[
        list[str]
    ] = allowed_media_domains,
    download_dir: Optional[str] = download_dir,
    safetensors_load_strategy: str = safetensors_load_strategy,
    load_format: Union[str, LoadFormats] = load_format,
    config_format: str = config_format,
    dtype: ModelDType = dtype,
    kv_cache_dtype: CacheDType = cache_dtype,
    seed: Optional[int] = seed,
    max_model_len: Optional[int] = max_model_len,
    cuda_graph_sizes: list[int] = get_field(
        SchedulerConfig, "cuda_graph_sizes"
    ),
    distributed_executor_backend: Optional[
        Union[
            str,
            DistributedExecutorBackend,
            Type[ExecutorBase],
        ]
    ] = distributed_executor_backend,
    pipeline_parallel_size: int = pipeline_parallel_size,
    tensor_parallel_size: int = tensor_parallel_size,
    decode_context_parallel_size: int = decode_context_parallel_size,
    data_parallel_size: int = data_parallel_size,
    data_parallel_rank: Optional[int] = None,
    data_parallel_start_rank: Optional[int] = None,
    data_parallel_size_local: Optional[int] = None,
    data_parallel_address: Optional[str] = None,
    data_parallel_rpc_port: Optional[int] = None,
    data_parallel_hybrid_lb: bool = False,
    data_parallel_backend: str = data_parallel_backend,
    enable_expert_parallel: bool = enable_expert_parallel,
    enable_dbo: bool = enable_dbo,
    dbo_decode_token_threshold: int = dbo_decode_token_threshold,
    dbo_prefill_token_threshold: int = dbo_prefill_token_threshold,
    eplb_config: EPLBConfig = get_field(
        ParallelConfig, "eplb_config"
    ),
    enable_eplb: bool = enable_eplb,
    expert_placement_strategy: ExpertPlacementStrategy = expert_placement_strategy,
    _api_process_count: int = _api_process_count,
    _api_process_rank: int = _api_process_rank,
    num_redundant_experts: int = num_redundant_experts,
    eplb_window_size: int = window_size,
    eplb_step_interval: int = step_interval,
    eplb_log_balancedness: bool = log_balancedness,
    max_parallel_loading_workers: Optional[
        int
    ] = max_parallel_loading_workers,
    block_size: Optional[BlockSize] = block_size,
    enable_prefix_caching: Optional[
        bool
    ] = enable_prefix_caching,
    prefix_caching_hash_algo: PrefixCachingHashAlgo = prefix_caching_hash_algo,
    disable_sliding_window: bool = disable_sliding_window,
    disable_cascade_attn: bool = disable_cascade_attn,
    swap_space: float = swap_space,
    cpu_offload_gb: float = cpu_offload_gb,
    gpu_memory_utilization: float = gpu_memory_utilization,
    kv_cache_memory_bytes: Optional[
        int
    ] = kv_cache_memory_bytes,
    max_num_batched_tokens: Optional[
        int
    ] = max_num_batched_tokens,
    max_num_partial_prefills: int = max_num_partial_prefills,
    max_long_partial_prefills: int = max_long_partial_prefills,
    long_prefill_token_threshold: int = long_prefill_token_threshold,
    max_num_seqs: Optional[int] = max_num_seqs,
    max_logprobs: int = max_logprobs,
    logprobs_mode: LogprobsMode = logprobs_mode,
    disable_log_stats: bool = False,
    revision: Optional[str] = revision,
    code_revision: Optional[str] = code_revision,
    rope_scaling: dict[str, Any] = get_field(
        ModelConfig, "rope_scaling"
    ),
    rope_theta: Optional[float] = rope_theta,
    hf_token: Optional[Union[bool, str]] = hf_token,
    hf_overrides: HfOverrides = get_field(
        ModelConfig, "hf_overrides"
    ),
    tokenizer_revision: Optional[str] = tokenizer_revision,
    quantization: Optional[
        QuantizationMethods
    ] = quantization,
    enforce_eager: bool = enforce_eager,
    disable_custom_all_reduce: bool = disable_custom_all_reduce,
    limit_mm_per_prompt: dict[str, int] = get_field(
        MultiModalConfig, "limit_per_prompt"
    ),
    interleave_mm_strings: bool = interleave_mm_strings,
    media_io_kwargs: dict[str, dict[str, Any]] = get_field(
        MultiModalConfig, "media_io_kwargs"
    ),
    mm_processor_kwargs: Optional[
        Dict[str, Any]
    ] = mm_processor_kwargs,
    disable_mm_preprocessor_cache: bool = False,
    mm_processor_cache_gb: float = mm_processor_cache_gb,
    mm_processor_cache_type: Optional[
        MMCacheType
    ] = mm_processor_cache_type,
    mm_shm_cache_max_object_size_mb: int = mm_shm_cache_max_object_size_mb,
    mm_encoder_tp_mode: MMEncoderTPMode = mm_encoder_tp_mode,
    io_processor_plugin: Optional[str] = None,
    skip_mm_profiling: bool = skip_mm_profiling,
    video_pruning_rate: float = video_pruning_rate,
    enable_lora: bool = False,
    enable_lora_bias: bool = bias_enabled,
    max_loras: int = max_loras,
    max_lora_rank: int = max_lora_rank,
    default_mm_loras: Optional[
        Dict[str, str]
    ] = default_mm_loras,
    fully_sharded_loras: bool = fully_sharded_loras,
    max_cpu_loras: Optional[int] = max_cpu_loras,
    lora_dtype: Optional[Union[str, dtype]] = lora_dtype,
    lora_extra_vocab_size: int = lora_extra_vocab_size,
    ray_workers_use_nsight: bool = ray_workers_use_nsight,
    num_gpu_blocks_override: Optional[
        int
    ] = num_gpu_blocks_override,
    num_lookahead_slots: int = num_lookahead_slots,
    model_loader_extra_config: dict = get_field(
        LoadConfig, "model_loader_extra_config"
    ),
    ignore_patterns: Optional[
        Union[str, List[str]]
    ] = ignore_patterns,
    enable_chunked_prefill: Optional[
        bool
    ] = enable_chunked_prefill,
    disable_chunked_mm_input: bool = disable_chunked_mm_input,
    disable_hybrid_kv_cache_manager: bool = disable_hybrid_kv_cache_manager,
    structured_outputs_config: StructuredOutputsConfig = get_field(
        VllmConfig, "structured_outputs_config"
    ),
    reasoning_parser: str = reasoning_parser,
    guided_decoding_backend: Optional[str] = None,
    guided_decoding_disable_fallback: Optional[bool] = None,
    guided_decoding_disable_any_whitespace: Optional[
        bool
    ] = None,
    guided_decoding_disable_additional_properties: Optional[
        bool
    ] = None,
    logits_processor_pattern: Optional[
        str
    ] = logits_processor_pattern,
    speculative_config: Optional[Dict[str, Any]] = None,
    show_hidden_metrics_for_version: Optional[
        str
    ] = show_hidden_metrics_for_version,
    otlp_traces_endpoint: Optional[
        str
    ] = otlp_traces_endpoint,
    collect_detailed_traces: Optional[
        list[DetailedTraceModules]
    ] = collect_detailed_traces,
    scheduling_policy: SchedulerPolicy = policy,
    scheduler_cls: Union[str, Type[object]] = scheduler_cls,
    pooler_config: Optional[PoolerConfig] = pooler_config,
    override_pooler_config: Optional[
        Union[dict, PoolerConfig]
    ] = override_pooler_config,
    compilation_config: CompilationConfig = get_field(
        VllmConfig, "compilation_config"
    ),
    worker_cls: str = worker_cls,
    worker_extension_cls: str = worker_extension_cls,
    kv_transfer_config: Optional[KVTransferConfig] = None,
    kv_events_config: Optional[KVEventsConfig] = None,
    generation_config: str = generation_config,
    enable_sleep_mode: bool = enable_sleep_mode,
    override_generation_config: dict[str, Any] = get_field(
        ModelConfig, "override_generation_config"
    ),
    model_impl: str = model_impl,
    override_attention_dtype: str = override_attention_dtype,
    calculate_kv_scales: bool = calculate_kv_scales,
    mamba_cache_dtype: MambaDType = mamba_cache_dtype,
    mamba_ssm_cache_dtype: MambaDType = mamba_ssm_cache_dtype,
    additional_config: dict[str, Any] = get_field(
        VllmConfig, "additional_config"
    ),
    use_tqdm_on_load: bool = use_tqdm_on_load,
    pt_load_map_location: str = pt_load_map_location,
    enable_multimodal_encoder_data_parallel: bool = False,
    logits_processors: Optional[
        list[Union[str, type[LogitsProcessor]]]
    ] = logits_processors,
    async_scheduling: bool = async_scheduling,
    kv_sharing_fast_prefill: bool = kv_sharing_fast_prefill,
) -> None
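
For context, a minimal construction sketch (illustrative only; the model id and values below are placeholders, and any of the keyword arguments in the signature above can be supplied the same way):

from vllm import EngineArgs

# Placeholder model id; any Hugging Face model id or local path works the same way.
engine_args = EngineArgs(
    model="facebook/opt-125m",
    dtype="auto",                # pick the dtype recorded in the checkpoint
    tensor_parallel_size=1,      # number of GPUs to shard the model across
    max_model_len=2048,          # cap the context length
    gpu_memory_utilization=0.9,  # fraction of GPU memory reserved for the engine
)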

__post_init__

__post_init__()

Source code in vllm/engine/arg_utils.py
def __post_init__(self):
    # support `EngineArgs(compilation_config={...})`
    # without having to manually construct a
    # CompilationConfig object
    if isinstance(self.compilation_config, dict):
        self.compilation_config = CompilationConfig(
            **self.compilation_config)
    if isinstance(self.eplb_config, dict):
        self.eplb_config = EPLBConfig(**self.eplb_config)
    # Setup plugins
    from vllm.plugins import load_general_plugins
    load_general_plugins()
    # When using HF offline, replace the model id with the local model path
    if huggingface_hub.constants.HF_HUB_OFFLINE:
        model_id = self.model
        self.model = get_model_path(self.model, self.revision)
        logger.info(
            "HF_HUB_OFFLINE is True, replace model_id [%s] " \
            "to model_path [%s]",model_id, self.model)

_is_v1_supported_oracle

_is_v1_supported_oracle(model_config: ModelConfig) -> bool

Oracle for whether to use V0 or V1 Engine by default.

Source code in vllm/engine/arg_utils.py
def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
    """Oracle for whether to use V0 or V1 Engine by default."""

    #############################################################
    # Unsupported Feature Flags on V1.

    if (self.logits_processor_pattern
            != EngineArgs.logits_processor_pattern):
        _raise_or_fallback(feature_name="--logits-processor-pattern",
                           recommend_to_remove=False)
        return False

    # No Mamba or Encoder-Decoder so far.
    if not model_config.is_v1_compatible:
        _raise_or_fallback(feature_name=model_config.architectures,
                           recommend_to_remove=False)
        return False

    # No Concurrent Partial Prefills so far.
    if (self.max_num_partial_prefills
            != SchedulerConfig.max_num_partial_prefills
            or self.max_long_partial_prefills
            != SchedulerConfig.max_long_partial_prefills):
        _raise_or_fallback(feature_name="Concurrent Partial Prefill",
                           recommend_to_remove=False)
        return False

    # V1 supports N-gram, Medusa, and Eagle speculative decoding.
    if self.speculative_config is not None:
        # speculative_config could still be a dict at this point
        if isinstance(self.speculative_config, dict):
            method = self.speculative_config.get("method", None)
        else:
            method = self.speculative_config.method

        if method == "draft_model":
            raise NotImplementedError(
                "Draft model speculative decoding is not supported yet. "
                "Please consider using other speculative decoding methods "
                "such as ngram, medusa, eagle, or mtp.")

    V1_BACKENDS = [
        "FLASH_ATTN",
        "PALLAS",
        "TRITON_ATTN",
        "TRITON_MLA",
        "CUTLASS_MLA",
        "FLASHMLA",
        "FLASH_ATTN_MLA",
        "FLASHINFER",
        "FLASHINFER_MLA",
        "ROCM_AITER_MLA",
        "TORCH_SDPA",
        "FLEX_ATTENTION",
        "TREE_ATTN",
        "XFORMERS",
        "ROCM_ATTN",
    ]
    if (envs.is_set("VLLM_ATTENTION_BACKEND")
            and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
        name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
        _raise_or_fallback(feature_name=name, recommend_to_remove=True)
        return False

    #############################################################
    # Experimental Features - allow users to opt in.

    if self.pipeline_parallel_size > 1:
        supports_pp = getattr(self.distributed_executor_backend,
                              'supports_pp', False)
        if not supports_pp and self.distributed_executor_backend not in (
                ParallelConfig.distributed_executor_backend, "ray", "mp",
                "external_launcher"):
            name = "Pipeline Parallelism without Ray distributed " \
                    "executor or multiprocessing executor or external " \
                    "launcher"
            _raise_or_fallback(feature_name=name,
                               recommend_to_remove=False)
            return False

    if (current_platform.is_cpu()
            and model_config.get_sliding_window() is not None):
        _raise_or_fallback(feature_name="sliding window (CPU backend)",
                           recommend_to_remove=False)
        return False

    #############################################################

    return True

_set_default_args

_set_default_args(
    usage_context: UsageContext, model_config: ModelConfig
) -> None

Set Default Arguments for V1 Engine.

Source code in vllm/engine/arg_utils.py
def _set_default_args(self, usage_context: UsageContext,
                      model_config: ModelConfig) -> None:
    """Set Default Arguments for V1 Engine."""

    # V1 always uses chunked prefills and prefix caching
    # for non-pooling tasks.
    # For pooling tasks the default is False
    if model_config.runner_type != "pooling":
        self.enable_chunked_prefill = True

        # TODO: When prefix caching supports prompt embeds inputs, this
        # check can be removed.
        if (self.enable_prompt_embeds
                and self.enable_prefix_caching is not False):
            logger.warning(
                "--enable-prompt-embeds and --enable-prefix-caching "
                "are not supported together in V1. Prefix caching has "
                "been disabled.")
            self.enable_prefix_caching = False

        if self.enable_prefix_caching is None:
            self.enable_prefix_caching = True
    else:

        pooling_type = model_config.pooler_config.pooling_type
        is_causal = getattr(model_config.hf_config, "is_causal", True)
        incremental_prefill_supported = (pooling_type is not None
                                         and pooling_type.lower() == "last"
                                         and is_causal)

        action = "Enabling" if \
            incremental_prefill_supported else "Disabling"

        if self.enable_chunked_prefill is None:
            self.enable_chunked_prefill = incremental_prefill_supported
            logger.info("(%s) chunked prefill by default", action)
        if self.enable_prefix_caching is None:
            self.enable_prefix_caching = incremental_prefill_supported
            logger.info("(%s) prefix caching by default", action)

    # V1 should use the new scheduler by default.
    # Swap it only if this arg is set to the original V0 default
    if self.scheduler_cls == EngineArgs.scheduler_cls:
        self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"

    # When no user override, set the default values based on the usage
    # context.
    # Use different default values for different hardware.

    # Try to query the device name on the current platform. If it fails,
    # it may be because the platform that imports vLLM is not the same
    # as the platform that vLLM is running on (e.g. the case of scaling
    # vLLM with Ray) and has no GPUs. In this case we use the default
    # values for non-H100/H200 GPUs.
    try:
        device_memory = current_platform.get_device_total_memory()
        device_name = current_platform.get_device_name().lower()
    except Exception:
        # This is only used to set default_max_num_batched_tokens
        device_memory = 0

    # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
    # throughput, see PR #17885 for more details.
    # So here we do an extra device name check to prevent such regression.
    from vllm.usage.usage_lib import UsageContext
    if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
        # For GPUs like H100 and MI300x, use larger default values.
        default_max_num_batched_tokens = {
            UsageContext.LLM_CLASS: 16384,
            UsageContext.OPENAI_API_SERVER: 8192,
        }
        default_max_num_seqs = {
            UsageContext.LLM_CLASS: 1024,
            UsageContext.OPENAI_API_SERVER: 1024,
        }
    else:
        # TODO(woosuk): Tune the default values for other hardware.
        default_max_num_batched_tokens = {
            UsageContext.LLM_CLASS: 8192,
            UsageContext.OPENAI_API_SERVER: 2048,
        }
        default_max_num_seqs = {
            UsageContext.LLM_CLASS: 256,
            UsageContext.OPENAI_API_SERVER: 256,
        }

    # tpu specific default values.
    if current_platform.is_tpu():
        default_max_num_batched_tokens_tpu = {
            UsageContext.LLM_CLASS: {
                'V6E': 2048,
                'V5E': 1024,
                'V5P': 512,
            },
            UsageContext.OPENAI_API_SERVER: {
                'V6E': 1024,
                'V5E': 512,
                'V5P': 256,
            }
        }

    # cpu specific default values.
    if current_platform.is_cpu():
        world_size = self.pipeline_parallel_size * self.tensor_parallel_size
        default_max_num_batched_tokens = {
            UsageContext.LLM_CLASS: 4096 * world_size,
            UsageContext.OPENAI_API_SERVER: 2048 * world_size,
        }
        default_max_num_seqs = {
            UsageContext.LLM_CLASS: 256 * world_size,
            UsageContext.OPENAI_API_SERVER: 128 * world_size,
        }

    use_context_value = usage_context.value if usage_context else None
    if (self.max_num_batched_tokens is None
            and usage_context in default_max_num_batched_tokens):
        if current_platform.is_tpu():
            chip_name = current_platform.get_device_name()
            if chip_name in default_max_num_batched_tokens_tpu[
                    usage_context]:
                self.max_num_batched_tokens = \
                    default_max_num_batched_tokens_tpu[
                        usage_context][chip_name]
            else:
                self.max_num_batched_tokens = \
                    default_max_num_batched_tokens[usage_context]
        else:
            if not self.enable_chunked_prefill:
                self.max_num_batched_tokens = model_config.max_model_len
            else:
                self.max_num_batched_tokens = \
                    default_max_num_batched_tokens[usage_context]
        logger.debug(
            "Setting max_num_batched_tokens to %d for %s usage context.",
            self.max_num_batched_tokens, use_context_value)

    if (self.max_num_seqs is None
            and usage_context in default_max_num_seqs):
        self.max_num_seqs = min(default_max_num_seqs[usage_context],
                                self.max_num_batched_tokens or sys.maxsize)

        logger.debug("Setting max_num_seqs to %d for %s usage context.",
                     self.max_num_seqs, use_context_value)
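
To make the device-dependent defaults concrete, here is a small standalone sketch of the same max_num_batched_tokens lookup for the non-TPU, non-CPU path (illustrative only; the authoritative values are the tables in the source above):

# Illustrative reproduction of the default selection above (non-TPU, non-CPU path).
GiB_bytes = 1 << 30

def pick_default_max_num_batched_tokens(device_memory: int, device_name: str,
                                        openai_api_server: bool) -> int:
    if device_memory >= 70 * GiB_bytes and "a100" not in device_name.lower():
        # H100/H200/MI300x-class devices get the larger defaults.
        return 8192 if openai_api_server else 16384
    return 2048 if openai_api_server else 8192

# e.g. an 80 GiB H100 serving the OpenAI-compatible API -> 8192
print(pick_default_max_num_batched_tokens(80 * GiB_bytes, "NVIDIA H100 80GB", True))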

add_cli_args staticmethod

add_cli_args(
    parser: FlexibleArgumentParser,
) -> FlexibleArgumentParser

Shared CLI arguments for vLLM engine.

Source code in vllm/engine/arg_utils.py
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Shared CLI arguments for vLLM engine."""

    # Model arguments
    model_kwargs = get_kwargs(ModelConfig)
    model_group = parser.add_argument_group(
        title="ModelConfig",
        description=ModelConfig.__doc__,
    )
    if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
        model_group.add_argument("--model", **model_kwargs["model"])
    model_group.add_argument("--runner", **model_kwargs["runner"])
    model_group.add_argument("--convert", **model_kwargs["convert"])
    model_group.add_argument("--task",
                             **model_kwargs["task"],
                             deprecated=True)
    model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
    model_group.add_argument("--tokenizer-mode",
                             **model_kwargs["tokenizer_mode"])
    model_group.add_argument("--trust-remote-code",
                             **model_kwargs["trust_remote_code"])
    model_group.add_argument("--dtype", **model_kwargs["dtype"])
    model_group.add_argument("--seed", **model_kwargs["seed"])
    model_group.add_argument("--hf-config-path",
                             **model_kwargs["hf_config_path"])
    model_group.add_argument("--allowed-local-media-path",
                             **model_kwargs["allowed_local_media_path"])
    model_group.add_argument("--allowed-media-domains",
                             **model_kwargs["allowed_media_domains"])
    model_group.add_argument("--revision", **model_kwargs["revision"])
    model_group.add_argument("--code-revision",
                             **model_kwargs["code_revision"])
    model_group.add_argument("--rope-scaling",
                             **model_kwargs["rope_scaling"])
    model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"])
    model_group.add_argument("--tokenizer-revision",
                             **model_kwargs["tokenizer_revision"])
    model_group.add_argument("--max-model-len",
                             **model_kwargs["max_model_len"])
    model_group.add_argument("--quantization", "-q",
                             **model_kwargs["quantization"])
    model_group.add_argument("--enforce-eager",
                             **model_kwargs["enforce_eager"])
    model_group.add_argument("--max-logprobs",
                             **model_kwargs["max_logprobs"])
    model_group.add_argument("--logprobs-mode",
                             **model_kwargs["logprobs_mode"])
    model_group.add_argument("--disable-sliding-window",
                             **model_kwargs["disable_sliding_window"])
    model_group.add_argument("--disable-cascade-attn",
                             **model_kwargs["disable_cascade_attn"])
    model_group.add_argument("--skip-tokenizer-init",
                             **model_kwargs["skip_tokenizer_init"])
    model_group.add_argument("--enable-prompt-embeds",
                             **model_kwargs["enable_prompt_embeds"])
    model_group.add_argument("--served-model-name",
                             **model_kwargs["served_model_name"])
    model_group.add_argument("--config-format",
                             **model_kwargs["config_format"])
    # This one is a special case because it can be a bool
    # or str. TODO: Handle this in get_kwargs
    model_group.add_argument("--hf-token",
                             type=str,
                             nargs="?",
                             const=True,
                             default=model_kwargs["hf_token"]["default"],
                             help=model_kwargs["hf_token"]["help"])
    model_group.add_argument("--hf-overrides",
                             **model_kwargs["hf_overrides"])
    model_group.add_argument("--pooler-config",
                             **model_kwargs["pooler_config"])
    model_group.add_argument("--override-pooler-config",
                             **model_kwargs["override_pooler_config"],
                             deprecated=True)
    model_group.add_argument("--logits-processor-pattern",
                             **model_kwargs["logits_processor_pattern"])
    model_group.add_argument("--generation-config",
                             **model_kwargs["generation_config"])
    model_group.add_argument("--override-generation-config",
                             **model_kwargs["override_generation_config"])
    model_group.add_argument("--enable-sleep-mode",
                             **model_kwargs["enable_sleep_mode"])
    model_group.add_argument("--model-impl", **model_kwargs["model_impl"])
    model_group.add_argument("--override-attention-dtype",
                             **model_kwargs["override_attention_dtype"])
    model_group.add_argument("--logits-processors",
                             **model_kwargs["logits_processors"])
    model_group.add_argument("--io-processor-plugin",
                             **model_kwargs["io_processor_plugin"])

    # Model loading arguments
    load_kwargs = get_kwargs(LoadConfig)
    load_group = parser.add_argument_group(
        title="LoadConfig",
        description=LoadConfig.__doc__,
    )
    load_group.add_argument("--load-format", **load_kwargs["load_format"])
    load_group.add_argument("--download-dir",
                            **load_kwargs["download_dir"])
    load_group.add_argument("--safetensors-load-strategy",
                            **load_kwargs["safetensors_load_strategy"])
    load_group.add_argument("--model-loader-extra-config",
                            **load_kwargs["model_loader_extra_config"])
    load_group.add_argument("--ignore-patterns",
                            **load_kwargs["ignore_patterns"])
    load_group.add_argument("--use-tqdm-on-load",
                            **load_kwargs["use_tqdm_on_load"])
    load_group.add_argument('--pt-load-map-location',
                            **load_kwargs["pt_load_map_location"])

    # Structured outputs arguments
    structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig)
    structured_outputs_group = parser.add_argument_group(
        title="StructuredOutputsConfig",
        description=StructuredOutputsConfig.__doc__,
    )
    structured_outputs_group.add_argument(
        "--reasoning-parser",
        # This choice is a special case because it's not static
        choices=list(ReasoningParserManager.reasoning_parsers),
        **structured_outputs_kwargs["reasoning_parser"])
    # Deprecated guided decoding arguments
    for arg, type in [
        ("--guided-decoding-backend", str),
        ("--guided-decoding-disable-fallback", bool),
        ("--guided-decoding-disable-any-whitespace", bool),
        ("--guided-decoding-disable-additional-properties", bool),
    ]:
        structured_outputs_group.add_argument(
            arg,
            type=type,
            help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."),
            deprecated=True)

    # Parallel arguments
    parallel_kwargs = get_kwargs(ParallelConfig)
    parallel_group = parser.add_argument_group(
        title="ParallelConfig",
        description=ParallelConfig.__doc__,
    )
    parallel_group.add_argument(
        "--distributed-executor-backend",
        **parallel_kwargs["distributed_executor_backend"])
    parallel_group.add_argument(
        "--pipeline-parallel-size", "-pp",
        **parallel_kwargs["pipeline_parallel_size"])
    parallel_group.add_argument("--tensor-parallel-size", "-tp",
                                **parallel_kwargs["tensor_parallel_size"])
    parallel_group.add_argument(
        "--decode-context-parallel-size", "-dcp",
        **parallel_kwargs["decode_context_parallel_size"])
    parallel_group.add_argument("--data-parallel-size", "-dp",
                                **parallel_kwargs["data_parallel_size"])
    parallel_group.add_argument(
        '--data-parallel-rank',
        '-dpn',
        type=int,
        help='Data parallel rank of this instance. '
        'When set, enables external load balancer mode.')
    parallel_group.add_argument('--data-parallel-start-rank',
                                '-dpr',
                                type=int,
                                help='Starting data parallel rank '
                                'for secondary nodes.')
    parallel_group.add_argument('--data-parallel-size-local',
                                '-dpl',
                                type=int,
                                help='Number of data parallel replicas '
                                'to run on this node.')
    parallel_group.add_argument('--data-parallel-address',
                                '-dpa',
                                type=str,
                                help='Address of data parallel cluster '
                                'head-node.')
    parallel_group.add_argument('--data-parallel-rpc-port',
                                '-dpp',
                                type=int,
                                help='Port for data parallel RPC '
                                'communication.')
    parallel_group.add_argument('--data-parallel-backend',
                                '-dpb',
                                type=str,
                                default='mp',
                                help='Backend for data parallel, either '
                                '"mp" or "ray".')
    parallel_group.add_argument(
        "--data-parallel-hybrid-lb",
        **parallel_kwargs["data_parallel_hybrid_lb"])
    parallel_group.add_argument(
        "--enable-expert-parallel",
        **parallel_kwargs["enable_expert_parallel"])
    parallel_group.add_argument("--enable-dbo",
                                **parallel_kwargs["enable_dbo"])
    parallel_group.add_argument(
        "--dbo-decode-token-threshold",
        **parallel_kwargs["dbo_decode_token_threshold"])
    parallel_group.add_argument(
        "--dbo-prefill-token-threshold",
        **parallel_kwargs["dbo_prefill_token_threshold"])
    parallel_group.add_argument("--enable-eplb",
                                **parallel_kwargs["enable_eplb"])
    parallel_group.add_argument("--eplb-config",
                                **parallel_kwargs["eplb_config"])
    parallel_group.add_argument(
        "--expert-placement-strategy",
        **parallel_kwargs["expert_placement_strategy"])
    parallel_group.add_argument(
        "--num-redundant-experts",
        type=int,
        help=
        "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.",
        deprecated=True)
    parallel_group.add_argument(
        "--eplb-window-size",
        type=int,
        help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.",
        deprecated=True)
    parallel_group.add_argument(
        "--eplb-step-interval",
        type=int,
        help=
        "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.",
        deprecated=True)
    parallel_group.add_argument(
        "--eplb-log-balancedness",
        action=argparse.BooleanOptionalAction,
        help=
        "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.",
        deprecated=True)

    parallel_group.add_argument(
        "--max-parallel-loading-workers",
        **parallel_kwargs["max_parallel_loading_workers"])
    parallel_group.add_argument(
        "--ray-workers-use-nsight",
        **parallel_kwargs["ray_workers_use_nsight"])
    parallel_group.add_argument(
        "--disable-custom-all-reduce",
        **parallel_kwargs["disable_custom_all_reduce"])
    parallel_group.add_argument("--worker-cls",
                                **parallel_kwargs["worker_cls"])
    parallel_group.add_argument("--worker-extension-cls",
                                **parallel_kwargs["worker_extension_cls"])
    parallel_group.add_argument(
        "--enable-multimodal-encoder-data-parallel",
        action="store_true",
        deprecated=True)

    # KV cache arguments
    cache_kwargs = get_kwargs(CacheConfig)
    cache_group = parser.add_argument_group(
        title="CacheConfig",
        description=CacheConfig.__doc__,
    )
    cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
    cache_group.add_argument("--gpu-memory-utilization",
                             **cache_kwargs["gpu_memory_utilization"])
    cache_group.add_argument("--kv-cache-memory-bytes",
                             **cache_kwargs["kv_cache_memory_bytes"])
    cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
    cache_group.add_argument("--kv-cache-dtype",
                             **cache_kwargs["cache_dtype"])
    cache_group.add_argument("--num-gpu-blocks-override",
                             **cache_kwargs["num_gpu_blocks_override"])
    cache_group.add_argument("--enable-prefix-caching",
                             **cache_kwargs["enable_prefix_caching"])
    cache_group.add_argument("--prefix-caching-hash-algo",
                             **cache_kwargs["prefix_caching_hash_algo"])
    cache_group.add_argument("--cpu-offload-gb",
                             **cache_kwargs["cpu_offload_gb"])
    cache_group.add_argument("--calculate-kv-scales",
                             **cache_kwargs["calculate_kv_scales"])
    cache_group.add_argument("--kv-sharing-fast-prefill",
                             **cache_kwargs["kv_sharing_fast_prefill"])
    cache_group.add_argument("--mamba-cache-dtype",
                             **cache_kwargs["mamba_cache_dtype"])
    cache_group.add_argument("--mamba-ssm-cache-dtype",
                             **cache_kwargs["mamba_ssm_cache_dtype"])

    # Multimodal related configs
    multimodal_kwargs = get_kwargs(MultiModalConfig)
    multimodal_group = parser.add_argument_group(
        title="MultiModalConfig",
        description=MultiModalConfig.__doc__,
    )
    multimodal_group.add_argument("--limit-mm-per-prompt",
                                  **multimodal_kwargs["limit_per_prompt"])
    multimodal_group.add_argument("--media-io-kwargs",
                                  **multimodal_kwargs["media_io_kwargs"])
    multimodal_group.add_argument(
        "--mm-processor-kwargs",
        **multimodal_kwargs["mm_processor_kwargs"])
    multimodal_group.add_argument(
        "--mm-processor-cache-gb",
        **multimodal_kwargs["mm_processor_cache_gb"])
    multimodal_group.add_argument("--disable-mm-preprocessor-cache",
                                  action="store_true",
                                  deprecated=True)
    multimodal_group.add_argument(
        "--mm-processor-cache-type",
        **multimodal_kwargs["mm_processor_cache_type"])
    multimodal_group.add_argument(
        "--mm-shm-cache-max-object-size-mb",
        **multimodal_kwargs["mm_shm_cache_max_object_size_mb"])
    multimodal_group.add_argument(
        "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"])
    multimodal_group.add_argument(
        "--interleave-mm-strings",
        **multimodal_kwargs["interleave_mm_strings"])
    multimodal_group.add_argument("--skip-mm-profiling",
                                  **multimodal_kwargs["skip_mm_profiling"])

    multimodal_group.add_argument(
        "--video-pruning-rate", **multimodal_kwargs["video_pruning_rate"])

    # LoRA related configs
    lora_kwargs = get_kwargs(LoRAConfig)
    lora_group = parser.add_argument_group(
        title="LoRAConfig",
        description=LoRAConfig.__doc__,
    )
    lora_group.add_argument(
        "--enable-lora",
        action=argparse.BooleanOptionalAction,
        help="If True, enable handling of LoRA adapters.")
    lora_group.add_argument("--enable-lora-bias",
                            **lora_kwargs["bias_enabled"])
    lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
    lora_group.add_argument("--max-lora-rank",
                            **lora_kwargs["max_lora_rank"])
    lora_group.add_argument("--lora-extra-vocab-size",
                            **lora_kwargs["lora_extra_vocab_size"])
    lora_group.add_argument(
        "--lora-dtype",
        **lora_kwargs["lora_dtype"],
    )
    lora_group.add_argument("--max-cpu-loras",
                            **lora_kwargs["max_cpu_loras"])
    lora_group.add_argument("--fully-sharded-loras",
                            **lora_kwargs["fully_sharded_loras"])
    lora_group.add_argument("--default-mm-loras",
                            **lora_kwargs["default_mm_loras"])

    # Observability arguments
    observability_kwargs = get_kwargs(ObservabilityConfig)
    observability_group = parser.add_argument_group(
        title="ObservabilityConfig",
        description=ObservabilityConfig.__doc__,
    )
    observability_group.add_argument(
        "--show-hidden-metrics-for-version",
        **observability_kwargs["show_hidden_metrics_for_version"])
    observability_group.add_argument(
        "--otlp-traces-endpoint",
        **observability_kwargs["otlp_traces_endpoint"])
    # TODO: generalise this special case
    choices = observability_kwargs["collect_detailed_traces"]["choices"]
    metavar = f"{{{','.join(choices)}}}"
    observability_kwargs["collect_detailed_traces"]["metavar"] = metavar
    observability_kwargs["collect_detailed_traces"]["choices"] += [
        ",".join(p)
        for p in permutations(get_args(DetailedTraceModules), r=2)
    ]
    observability_group.add_argument(
        "--collect-detailed-traces",
        **observability_kwargs["collect_detailed_traces"])

    # Scheduler arguments
    scheduler_kwargs = get_kwargs(SchedulerConfig)
    scheduler_group = parser.add_argument_group(
        title="SchedulerConfig",
        description=SchedulerConfig.__doc__,
    )
    scheduler_group.add_argument(
        "--max-num-batched-tokens",
        **scheduler_kwargs["max_num_batched_tokens"])
    scheduler_group.add_argument("--max-num-seqs",
                                 **scheduler_kwargs["max_num_seqs"])
    scheduler_group.add_argument(
        "--max-num-partial-prefills",
        **scheduler_kwargs["max_num_partial_prefills"])
    scheduler_group.add_argument(
        "--max-long-partial-prefills",
        **scheduler_kwargs["max_long_partial_prefills"])
    scheduler_group.add_argument('--cuda-graph-sizes',
                                 **scheduler_kwargs["cuda_graph_sizes"])
    scheduler_group.add_argument(
        "--long-prefill-token-threshold",
        **scheduler_kwargs["long_prefill_token_threshold"])
    scheduler_group.add_argument("--num-lookahead-slots",
                                 **scheduler_kwargs["num_lookahead_slots"])
    # multi-step scheduling has been removed; corresponding arguments
    # are no longer supported.
    scheduler_group.add_argument("--scheduling-policy",
                                 **scheduler_kwargs["policy"])
    scheduler_group.add_argument(
        "--enable-chunked-prefill",
        **scheduler_kwargs["enable_chunked_prefill"])
    scheduler_group.add_argument(
        "--disable-chunked-mm-input",
        **scheduler_kwargs["disable_chunked_mm_input"])
    scheduler_group.add_argument("--scheduler-cls",
                                 **scheduler_kwargs["scheduler_cls"])
    scheduler_group.add_argument(
        "--disable-hybrid-kv-cache-manager",
        **scheduler_kwargs["disable_hybrid_kv_cache_manager"])
    scheduler_group.add_argument("--async-scheduling",
                                 **scheduler_kwargs["async_scheduling"])

    # vLLM arguments
    vllm_kwargs = get_kwargs(VllmConfig)
    vllm_group = parser.add_argument_group(
        title="VllmConfig",
        description=VllmConfig.__doc__,
    )
    # We construct SpeculativeConfig using fields from other configs in
    # create_engine_config. So we set the type to a JSON string here to
    # delay the Pydantic validation that comes with SpeculativeConfig.
    vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads)
    vllm_group.add_argument("--speculative-config",
                            **vllm_kwargs["speculative_config"])
    vllm_group.add_argument("--kv-transfer-config",
                            **vllm_kwargs["kv_transfer_config"])
    vllm_group.add_argument('--kv-events-config',
                            **vllm_kwargs["kv_events_config"])
    vllm_group.add_argument("--compilation-config", "-O",
                            **vllm_kwargs["compilation_config"])
    vllm_group.add_argument("--additional-config",
                            **vllm_kwargs["additional_config"])
    vllm_group.add_argument('--structured-outputs-config',
                            **vllm_kwargs["structured_outputs_config"])

    # Other arguments
    parser.add_argument('--disable-log-stats',
                        action='store_true',
                        help='Disable logging statistics.')

    return parser
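
A typical usage sketch for add_cli_args is shown below. The FlexibleArgumentParser import path and the from_cli_args helper are assumed from elsewhere in vLLM and are not part of the source shown above.

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser  # assumed import path

parser = FlexibleArgumentParser(description="vLLM engine arguments demo")
parser = EngineArgs.add_cli_args(parser)

# Parse a hypothetical command line and rebuild EngineArgs from the namespace.
args = parser.parse_args(["--model", "facebook/opt-125m",
                          "--tensor-parallel-size", "1"])
engine_args = EngineArgs.from_cli_args(args)  # assumed helper on EngineArgs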

create_engine_config

create_engine_config(
    usage_context: Optional[UsageContext] = None,
    headless: bool = False,
) -> VllmConfig

Create the VllmConfig.

NOTE: for autoselection of V0 vs V1 engine, we need to create the ModelConfig first, since ModelConfig's attrs (e.g. the model arch) are needed to make the decision.

This function sets VLLM_USE_V1=X if VLLM_USE_V1 is unspecified by the user.

If VLLM_USE_V1 is specified by the user but the VllmConfig is incompatible, we raise an error.

Source code in vllm/engine/arg_utils.py
def create_engine_config(
    self,
    usage_context: Optional[UsageContext] = None,
    headless: bool = False,
) -> VllmConfig:
    """
    Create the VllmConfig.

    NOTE: for autoselection of V0 vs V1 engine, we need to
    create the ModelConfig first, since ModelConfig's attrs
    (e.g. the model arch) are needed to make the decision.

    This function sets VLLM_USE_V1=X if VLLM_USE_V1 is
    unspecified by the user.

    If VLLM_USE_V1 is specified by the user but the VllmConfig
    is incompatible, we raise an error.
    """
    current_platform.pre_register_and_update()

    device_config = DeviceConfig(
        device=cast(Device, current_platform.device_type))

    (self.model, self.tokenizer,
     self.speculative_config) = maybe_override_with_speculators(
         model=self.model,
         tokenizer=self.tokenizer,
         revision=self.revision,
         trust_remote_code=self.trust_remote_code,
         vllm_speculative_config=self.speculative_config,
     )
    model_config = self.create_model_config()

    # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
    #   and fall back to V0 for experimental or unsupported features.
    # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
    #   features and raise error for unsupported features.
    # * If VLLM_USE_V1=0, we disable V1.
    use_v1 = False
    try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
    if try_v1 and self._is_v1_supported_oracle(model_config):
        use_v1 = True

    # If user explicitly set VLLM_USE_V1, sanity check we respect it.
    if envs.is_set("VLLM_USE_V1"):
        assert use_v1 == envs.VLLM_USE_V1
    # Otherwise, set the VLLM_USE_V1 variable globally.
    else:
        envs.set_vllm_use_v1(use_v1)

    # Set default arguments for V1 Engine.
    self._set_default_args(usage_context, model_config)
    # Disable chunked prefill for POWER (ppc64le)/ARM/s390x/RISCV CPUs in V1
    if current_platform.is_cpu() and current_platform.get_cpu_architecture(
    ) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM,
          CpuArchEnum.RISCV):
        logger.info("Chunked prefill is not supported for ARM and POWER, "
                    "S390X and RISC-V CPUs; "
                    "disabling it for V1 backend.")
        self.enable_chunked_prefill = False
    assert self.enable_chunked_prefill is not None

    sliding_window: Optional[int] = None
    if not is_interleaved(model_config.hf_text_config):
        # Only set CacheConfig.sliding_window if the model is all sliding
        # window. Otherwise CacheConfig.sliding_window will override the
        # global layers in interleaved sliding window models.
        sliding_window = model_config.get_sliding_window()

    # Note(hc): In the current implementation of decode context
    # parallel(DCP), tp_size needs to be divisible by dcp_size,
    # because the world size does not change by dcp, it simply
    # reuses the GPUs of TP group, and split one TP group into
    # tp_size//dcp_size DCP groups.
    assert self.tensor_parallel_size % self.decode_context_parallel_size \
        == 0, (
        f"tp_size={self.tensor_parallel_size} must be divisible by"
        f"dcp_size={self.decode_context_parallel_size}."
    )

    cache_config = CacheConfig(
        block_size=self.block_size,
        gpu_memory_utilization=self.gpu_memory_utilization,
        kv_cache_memory_bytes=self.kv_cache_memory_bytes,
        swap_space=self.swap_space,
        cache_dtype=self.kv_cache_dtype,
        is_attention_free=model_config.is_attention_free,
        num_gpu_blocks_override=self.num_gpu_blocks_override,
        sliding_window=sliding_window,
        enable_prefix_caching=self.enable_prefix_caching,
        prefix_caching_hash_algo=self.prefix_caching_hash_algo,
        cpu_offload_gb=self.cpu_offload_gb,
        calculate_kv_scales=self.calculate_kv_scales,
        kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
        mamba_cache_dtype=self.mamba_cache_dtype,
        mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
    )

    ray_runtime_env = None
    if is_ray_initialized():
        # Ray Serve LLM calls `create_engine_config` in the context
        # of a Ray task, therefore we check is_ray_initialized()
        # as opposed to is_in_ray_actor().
        import ray
        ray_runtime_env = ray.get_runtime_context().runtime_env
        logger.info("Using ray runtime env: %s", ray_runtime_env)

    # Get the current placement group if Ray is initialized and
    # we are in a Ray actor. If so, then the placement group will be
    # passed to spawned processes.
    placement_group = None
    if is_in_ray_actor():
        import ray

        # This call initializes Ray automatically if it is not initialized,
        # but we should not do this here.
        placement_group = ray.util.get_current_placement_group()

    assert not headless or not self.data_parallel_hybrid_lb, (
        "data_parallel_hybrid_lb is not applicable in "
        "headless mode")

    data_parallel_external_lb = self.data_parallel_rank is not None
    # Local DP rank = 1, use pure-external LB.
    if data_parallel_external_lb:
        assert self.data_parallel_size_local in (1, None), (
            "data_parallel_size_local must be 1 when data_parallel_rank "
            "is set")
        data_parallel_size_local = 1
        # Use full external lb if we have local_size of 1.
        self.data_parallel_hybrid_lb = False
    elif self.data_parallel_size_local is not None:
        data_parallel_size_local = self.data_parallel_size_local

        if self.data_parallel_start_rank and not headless:
            # Infer hybrid LB mode.
            self.data_parallel_hybrid_lb = True

        if self.data_parallel_hybrid_lb and data_parallel_size_local == 1:
            # Use full external lb if we have local_size of 1.
            data_parallel_external_lb = True
            self.data_parallel_hybrid_lb = False

        if data_parallel_size_local == self.data_parallel_size:
            # Disable hybrid LB mode if set for a single node
            self.data_parallel_hybrid_lb = False

        self.data_parallel_rank = self.data_parallel_start_rank or 0
    else:
        assert not self.data_parallel_hybrid_lb, (
            "data_parallel_size_local must be set to use "
            "data_parallel_hybrid_lb.")

        # Local DP size defaults to global DP size if not set.
        data_parallel_size_local = self.data_parallel_size

    # DP address, used in multi-node case for torch distributed group
    # and ZMQ sockets.
    if self.data_parallel_address is None:
        if self.data_parallel_backend == "ray":
            host_ip = get_ip()
            logger.info(
                "Using host IP %s as ray-based data parallel address",
                host_ip)
            data_parallel_address = host_ip
        else:
            assert self.data_parallel_backend == "mp", (
                "data_parallel_backend can only be ray or mp, got %s",
                self.data_parallel_backend)
            data_parallel_address = ParallelConfig.data_parallel_master_ip
    else:
        data_parallel_address = self.data_parallel_address

    # This port is only used when there are remote data parallel engines,
    # otherwise the local IPC transport is used.
    data_parallel_rpc_port = self.data_parallel_rpc_port if (
        self.data_parallel_rpc_port
        is not None) else ParallelConfig.data_parallel_rpc_port

    if self.async_scheduling:
        # Async scheduling does not work with the uniprocess backend.
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "mp"
            logger.info("Defaulting to mp-based distributed executor "
                        "backend for async scheduling.")
        if self.pipeline_parallel_size > 1:
            raise ValueError("Async scheduling is not supported with "
                             "pipeline-parallel-size > 1.")

        # Currently, async scheduling does not support speculative decoding.
        # TODO(woosuk): Support it.
        if self.speculative_config is not None:
            raise ValueError(
                "Currently, speculative decoding is not supported with "
                "async scheduling.")

    # Forward the deprecated CLI args to the EPLB config.
    if self.num_redundant_experts is not None:
        self.eplb_config.num_redundant_experts = self.num_redundant_experts
    if self.eplb_window_size is not None:
        self.eplb_config.window_size = self.eplb_window_size
    if self.eplb_step_interval is not None:
        self.eplb_config.step_interval = self.eplb_step_interval
    if self.eplb_log_balancedness is not None:
        self.eplb_config.log_balancedness = self.eplb_log_balancedness

    parallel_config = ParallelConfig(
        pipeline_parallel_size=self.pipeline_parallel_size,
        tensor_parallel_size=self.tensor_parallel_size,
        data_parallel_size=self.data_parallel_size,
        data_parallel_rank=self.data_parallel_rank or 0,
        data_parallel_external_lb=data_parallel_external_lb,
        data_parallel_size_local=data_parallel_size_local,
        data_parallel_master_ip=data_parallel_address,
        data_parallel_rpc_port=data_parallel_rpc_port,
        data_parallel_backend=self.data_parallel_backend,
        data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
        enable_expert_parallel=self.enable_expert_parallel,
        enable_dbo=self.enable_dbo,
        dbo_decode_token_threshold=self.dbo_decode_token_threshold,
        dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
        enable_eplb=self.enable_eplb,
        eplb_config=self.eplb_config,
        expert_placement_strategy=self.expert_placement_strategy,
        max_parallel_loading_workers=self.max_parallel_loading_workers,
        disable_custom_all_reduce=self.disable_custom_all_reduce,
        ray_workers_use_nsight=self.ray_workers_use_nsight,
        ray_runtime_env=ray_runtime_env,
        placement_group=placement_group,
        distributed_executor_backend=self.distributed_executor_backend,
        worker_cls=self.worker_cls,
        worker_extension_cls=self.worker_extension_cls,
        decode_context_parallel_size=self.decode_context_parallel_size,
        _api_process_count=self._api_process_count,
        _api_process_rank=self._api_process_rank,
    )

    speculative_config = self.create_speculative_config(
        target_model_config=model_config,
        target_parallel_config=parallel_config,
        enable_chunked_prefill=self.enable_chunked_prefill,
        disable_log_stats=self.disable_log_stats,
    )

    # make sure num_lookahead_slots is set appropriately depending on
    # whether speculative decoding is enabled
    num_lookahead_slots = self.num_lookahead_slots
    if speculative_config is not None:
        num_lookahead_slots = speculative_config.num_lookahead_slots

    scheduler_config = SchedulerConfig(
        runner_type=model_config.runner_type,
        max_num_batched_tokens=self.max_num_batched_tokens,
        max_num_seqs=self.max_num_seqs,
        max_model_len=model_config.max_model_len,
        cuda_graph_sizes=self.cuda_graph_sizes,
        num_lookahead_slots=num_lookahead_slots,
        enable_chunked_prefill=self.enable_chunked_prefill,
        disable_chunked_mm_input=self.disable_chunked_mm_input,
        is_multimodal_model=model_config.is_multimodal_model,
        is_encoder_decoder=model_config.is_encoder_decoder,
        send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                         and parallel_config.use_ray),
        policy=self.scheduling_policy,
        scheduler_cls=self.scheduler_cls,
        max_num_partial_prefills=self.max_num_partial_prefills,
        max_long_partial_prefills=self.max_long_partial_prefills,
        long_prefill_token_threshold=self.long_prefill_token_threshold,
        disable_hybrid_kv_cache_manager=self.
        disable_hybrid_kv_cache_manager,
        async_scheduling=self.async_scheduling,
    )

    if not model_config.is_multimodal_model and self.default_mm_loras:
        raise ValueError(
            "Default modality-specific LoRA(s) were provided for a "
            "non multimodal model")

    lora_config = LoRAConfig(
        bias_enabled=self.enable_lora_bias,
        max_lora_rank=self.max_lora_rank,
        max_loras=self.max_loras,
        default_mm_loras=self.default_mm_loras,
        fully_sharded_loras=self.fully_sharded_loras,
        lora_extra_vocab_size=self.lora_extra_vocab_size,
        lora_dtype=self.lora_dtype,
        max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
        and self.max_cpu_loras > 0 else None) if self.enable_lora else None

    # bitsandbytes pre-quantized model need a specific model loader
    if model_config.quantization == "bitsandbytes":
        self.quantization = self.load_format = "bitsandbytes"

    load_config = self.create_load_config()

    # Pass reasoning_parser into StructuredOutputsConfig
    if self.reasoning_parser:
        self.structured_outputs_config.reasoning_parser = \
            self.reasoning_parser

    # Forward the deprecated CLI args to the StructuredOutputsConfig
    so_config = self.structured_outputs_config
    if self.guided_decoding_backend is not None:
        so_config.guided_decoding_backend = \
        self.guided_decoding_backend
    if self.guided_decoding_disable_fallback is not None:
        so_config.guided_decoding_disable_fallback = \
        self.guided_decoding_disable_fallback
    if self.guided_decoding_disable_any_whitespace is not None:
        so_config.guided_decoding_disable_any_whitespace = \
        self.guided_decoding_disable_any_whitespace
    if self.guided_decoding_disable_additional_properties is not None:
        so_config.guided_decoding_disable_additional_properties = \
        self.guided_decoding_disable_additional_properties

    observability_config = ObservabilityConfig(
        show_hidden_metrics_for_version=(
            self.show_hidden_metrics_for_version),
        otlp_traces_endpoint=self.otlp_traces_endpoint,
        collect_detailed_traces=self.collect_detailed_traces,
    )

    config = VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        device_config=device_config,
        lora_config=lora_config,
        speculative_config=speculative_config,
        load_config=load_config,
        structured_outputs_config=self.structured_outputs_config,
        observability_config=observability_config,
        compilation_config=self.compilation_config,
        kv_transfer_config=self.kv_transfer_config,
        kv_events_config=self.kv_events_config,
        additional_config=self.additional_config,
    )

    return config
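
A hedged usage sketch (not part of the generated page above), assuming the surrounding method is EngineArgs.create_engine_config and that the example model is reachable locally or on the HuggingFace Hub:

from vllm import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",   # illustrative model; any HF model path works
    tensor_parallel_size=1,
    max_num_seqs=64,
)
# create_engine_config() bundles ModelConfig, ParallelConfig, SchedulerConfig,
# LoRAConfig, LoadConfig, etc. into a single VllmConfig, as built above.
vllm_config = engine_args.create_engine_config()
print(type(vllm_config).__name__)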

create_load_config

create_load_config() -> LoadConfig
Source code in vllm/engine/arg_utils.py
def create_load_config(self) -> LoadConfig:

    if self.quantization == "bitsandbytes":
        self.load_format = "bitsandbytes"

    if self.load_format == "tensorizer":
        if hasattr(self.model_loader_extra_config, "to_serializable"):
            self.model_loader_extra_config = (
                self.model_loader_extra_config.to_serializable())
        self.model_loader_extra_config["tensorizer_config"] = {}
        self.model_loader_extra_config["tensorizer_config"][
            "tensorizer_dir"] = self.model
        self.validate_tensorizer_args()

    return LoadConfig(
        load_format=self.load_format,
        download_dir=self.download_dir,
        safetensors_load_strategy=self.safetensors_load_strategy,
        device="cpu"
        if is_online_quantization(self.quantization) else None,
        model_loader_extra_config=self.model_loader_extra_config,
        ignore_patterns=self.ignore_patterns,
        use_tqdm_on_load=self.use_tqdm_on_load,
        pt_load_map_location=self.pt_load_map_location,
    )
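
A rough sketch of producing the load configuration on its own from the same EngineArgs instance; the model name and download directory below are purely illustrative:

from vllm import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",          # illustrative model
    download_dir="/tmp/vllm-weights",   # hypothetical weight cache directory
)
load_config = engine_args.create_load_config()
print(load_config.load_format, load_config.download_dir)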

create_model_config

create_model_config() -> ModelConfig
Source code in vllm/engine/arg_utils.py
def create_model_config(self) -> ModelConfig:
    # gguf file needs a specific model loader and doesn't use hf_repo
    if check_gguf_file(self.model):
        self.quantization = self.load_format = "gguf"

    # NOTE: This is to allow model loading from S3 in CI
    if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
            and self.model in MODELS_ON_S3 and self.load_format == "auto"):
        self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"

    if self.disable_mm_preprocessor_cache:
        logger.warning(
            "`--disable-mm-preprocessor-cache` is deprecated "
            "and will be removed in v0.13. "
            "Please use `--mm-processor-cache-gb 0` instead.", )

        self.mm_processor_cache_gb = 0
    elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
        logger.warning(
            "VLLM_MM_INPUT_CACHE_GIB` is deprecated "
            "and will be removed in v0.13. "
            "Please use `--mm-processor-cache-gb %d` instead.",
            envs.VLLM_MM_INPUT_CACHE_GIB,
        )

        self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB

    if self.enable_multimodal_encoder_data_parallel:
        logger.warning(
            "--enable-multimodal-encoder-data-parallel` is deprecated "
            "and will be removed in v0.13. "
            "Please use `--mm-encoder-tp-mode data` instead.")

        self.mm_encoder_tp_mode = "data"

    return ModelConfig(
        model=self.model,
        hf_config_path=self.hf_config_path,
        runner=self.runner,
        convert=self.convert,
        task=self.task,
        tokenizer=self.tokenizer,
        tokenizer_mode=self.tokenizer_mode,
        trust_remote_code=self.trust_remote_code,
        allowed_local_media_path=self.allowed_local_media_path,
        allowed_media_domains=self.allowed_media_domains,
        dtype=self.dtype,
        seed=self.seed,
        revision=self.revision,
        code_revision=self.code_revision,
        rope_scaling=self.rope_scaling,
        rope_theta=self.rope_theta,
        hf_token=self.hf_token,
        hf_overrides=self.hf_overrides,
        tokenizer_revision=self.tokenizer_revision,
        max_model_len=self.max_model_len,
        quantization=self.quantization,
        enforce_eager=self.enforce_eager,
        max_logprobs=self.max_logprobs,
        logprobs_mode=self.logprobs_mode,
        disable_sliding_window=self.disable_sliding_window,
        disable_cascade_attn=self.disable_cascade_attn,
        skip_tokenizer_init=self.skip_tokenizer_init,
        enable_prompt_embeds=self.enable_prompt_embeds,
        served_model_name=self.served_model_name,
        limit_mm_per_prompt=self.limit_mm_per_prompt,
        interleave_mm_strings=self.interleave_mm_strings,
        media_io_kwargs=self.media_io_kwargs,
        skip_mm_profiling=self.skip_mm_profiling,
        config_format=self.config_format,
        mm_processor_kwargs=self.mm_processor_kwargs,
        mm_processor_cache_gb=self.mm_processor_cache_gb,
        mm_processor_cache_type=self.mm_processor_cache_type,
        mm_shm_cache_max_object_size_mb=self.
        mm_shm_cache_max_object_size_mb,
        mm_encoder_tp_mode=self.mm_encoder_tp_mode,
        pooler_config=self.pooler_config,
        override_pooler_config=self.override_pooler_config,
        logits_processor_pattern=self.logits_processor_pattern,
        generation_config=self.generation_config,
        override_generation_config=self.override_generation_config,
        enable_sleep_mode=self.enable_sleep_mode,
        model_impl=self.model_impl,
        override_attention_dtype=self.override_attention_dtype,
        logits_processors=self.logits_processors,
        video_pruning_rate=self.video_pruning_rate,
        io_processor_plugin=self.io_processor_plugin,
    )
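
A minimal sketch of calling this method directly (the model name is illustrative; resolving the HuggingFace config requires the model to be reachable):

from vllm import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",   # illustrative model
    dtype="bfloat16",
    max_model_len=4096,
)
model_config = engine_args.create_model_config()
print(model_config.max_model_len, model_config.dtype)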

create_speculative_config

create_speculative_config(
    target_model_config: ModelConfig,
    target_parallel_config: ParallelConfig,
    enable_chunked_prefill: bool,
    disable_log_stats: bool,
) -> Optional[SpeculativeConfig]

Initializes and returns a SpeculativeConfig object based on speculative_config.

This function utilizes speculative_config to create a SpeculativeConfig object. The speculative_config can either be provided as a JSON string input via CLI arguments or directly as a dictionary from the engine.

Source code in vllm/engine/arg_utils.py
def create_speculative_config(
    self,
    target_model_config: ModelConfig,
    target_parallel_config: ParallelConfig,
    enable_chunked_prefill: bool,
    disable_log_stats: bool,
) -> Optional["SpeculativeConfig"]:
    """Initializes and returns a SpeculativeConfig object based on
    `speculative_config`.

    This function utilizes `speculative_config` to create a
    SpeculativeConfig object. The `speculative_config` can either be
    provided as a JSON string input via CLI arguments or directly as a
    dictionary from the engine.
    """
    if self.speculative_config is None:
        return None

    # Note(Shangming): These parameters are not obtained from the cli arg
    # '--speculative-config' and must be passed in when creating the engine
    # config.
    self.speculative_config.update({
        "target_model_config": target_model_config,
        "target_parallel_config": target_parallel_config,
        "enable_chunked_prefill": enable_chunked_prefill,
        "disable_log_stats": disable_log_stats,
    })
    return SpeculativeConfig(**self.speculative_config)
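
In practice, speculative_config usually arrives as a plain dict (for example from the --speculative-config JSON CLI flag), and the target model/parallel configs are injected here before SpeculativeConfig validates it. A hedged sketch using n-gram speculation; the dict keys are assumptions about SpeculativeConfig fields, which are not shown on this page:

from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",   # illustrative target model
    speculative_config={         # assumed field names, validated by SpeculativeConfig(**...)
        "method": "ngram",
        "num_speculative_tokens": 3,
        "prompt_lookup_max": 4,
    },
)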

from_cli_args classmethod

from_cli_args(args: Namespace)
Source code in vllm/engine/arg_utils.py
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
    # Get the list of attributes of this dataclass.
    attrs = [attr.name for attr in dataclasses.fields(cls)]
    # Set the attributes from the parsed arguments.
    engine_args = cls(**{
        attr: getattr(args, attr)
        for attr in attrs if hasattr(args, attr)
    })
    return engine_args
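
Because the comprehension above copies only the namespace attributes that match EngineArgs fields, a sparse argparse.Namespace is enough; anything missing keeps its dataclass default. A minimal sketch (model name illustrative):

import argparse

from vllm import EngineArgs

args = argparse.Namespace(model="facebook/opt-125m")  # only 'model' is provided
engine_args = EngineArgs.from_cli_args(args)
print(engine_args.model, engine_args.tensor_parallel_size)  # defaults fill the rest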

validate_tensorizer_args

validate_tensorizer_args()
Source code in vllm/engine/arg_utils.py
def validate_tensorizer_args(self):
    from vllm.model_executor.model_loader.tensorizer import (
        TensorizerConfig)
    for key in self.model_loader_extra_config:
        if key in TensorizerConfig._fields:
            self.model_loader_extra_config["tensorizer_config"][
                key] = self.model_loader_extra_config[key]

LLM

An LLM for generating texts from given prompts and sampling parameters.

This class includes a tokenizer, a language model (possibly distributed across multiple GPUs), and GPU memory space allocated for intermediate states (aka KV cache). Given a batch of prompts and sampling parameters, this class generates texts from the model, using an intelligent batching mechanism and efficient memory management.

Parameters:

model (str, required)
    The name or path of a HuggingFace Transformers model.

tokenizer (Optional[str], default: None)
    The name or path of a HuggingFace Transformers tokenizer.

tokenizer_mode (TokenizerMode, default: 'auto')
    The tokenizer mode. "auto" will use the fast tokenizer if available, and "slow" will always use the slow tokenizer.

skip_tokenizer_init (bool, default: False)
    If true, skip initialization of the tokenizer and detokenizer. The input is then expected to provide valid prompt_token_ids and None for the prompt.

trust_remote_code (bool, default: False)
    Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.

allowed_local_media_path (str, default: '')
    Allow API requests to read local images or videos from directories specified by the server file system. This is a security risk and should only be enabled in trusted environments.

allowed_media_domains (Optional[list[str]], default: None)
    If set, only media URLs that belong to these domains can be used for multi-modal inputs.

tensor_parallel_size (int, default: 1)
    The number of GPUs to use for distributed execution with tensor parallelism.

dtype (ModelDType, default: 'auto')
    The data type for the model weights and activations. Currently, we support float32, float16, and bfloat16. If auto, we use the torch_dtype attribute specified in the model config file. However, if the torch_dtype in the config is float32, we will use float16 instead.

quantization (Optional[QuantizationMethods], default: None)
    The method used to quantize the model weights. Currently, we support "awq", "gptq", and "fp8" (experimental). If None, we first check the quantization_config attribute in the model config file. If that is None, we assume the model weights are not quantized and use dtype to determine the data type of the weights.

revision (Optional[str], default: None)
    The specific model version to use. It can be a branch name, a tag name, or a commit id.

tokenizer_revision (Optional[str], default: None)
    The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id.

seed (Optional[int], default: None)
    The seed to initialize the random number generator for sampling.

gpu_memory_utilization (float, default: 0.9)
    The ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache. Higher values increase the KV cache size and thus improve the model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors.

kv_cache_memory_bytes (Optional[int], default: None)
    Size of the KV cache per GPU in bytes. By default this is None and vLLM automatically infers the KV cache size based on gpu_memory_utilization. Users may instead specify the KV cache memory size manually; kv_cache_memory_bytes allows more fine-grained control over how much memory is used than gpu_memory_utilization. Note that kv_cache_memory_bytes (when not None) ignores gpu_memory_utilization.

swap_space (float, default: 4)
    The size (GiB) of CPU memory per GPU to use as swap space. This can be used for temporarily storing the states of requests whose best_of sampling parameter is larger than 1. If all requests will have best_of=1, you can safely set this to 0; note that best_of is only supported in V0. Otherwise, too small a value may cause out-of-memory (OOM) errors.

cpu_offload_gb (float, default: 0)
    The size (GiB) of CPU memory to use for offloading the model weights. This virtually increases the GPU memory space you can use to hold the model weights, at the cost of CPU-GPU data transfer for every forward pass.

enforce_eager (bool, default: False)
    Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid.

disable_custom_all_reduce (bool, default: False)
    See ParallelConfig.

hf_token (Optional[Union[bool, str]], default: None)
    The token to use as HTTP bearer authorization for remote files. If True, will use the token generated when running huggingface-cli login (stored in ~/.huggingface).

hf_overrides (Optional[HfOverrides], default: None)
    If a dictionary, contains arguments to be forwarded to the HuggingFace config. If a callable, it is called to update the HuggingFace config.

mm_processor_kwargs (Optional[dict[str, Any]], default: None)
    Arguments to be forwarded to the model's processor for multi-modal data, e.g., the image processor. These override the multi-modal processor obtained from AutoProcessor.from_pretrained. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: {"num_crops": 4}.

pooler_config (Optional[PoolerConfig], default: None)
    Initialize a non-default pooling config for the pooling model, e.g. PoolerConfig(pooling_type="mean", normalize=False).

override_pooler_config (Optional[PoolerConfig], default: None)
    [DEPRECATED] Use pooler_config instead. This argument is deprecated and will be removed in v0.12.0 or v1.0.0, whichever is sooner.

compilation_config (Optional[Union[int, dict[str, Any], CompilationConfig]], default: None)
    Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration.

**kwargs (Any, default: {})
    Arguments for EngineArgs.
Note

This class is intended to be used for offline inference. For online serving, use the AsyncLLMEngine class instead.
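
A minimal offline-inference sketch for this class (the model name is illustrative):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")                     # illustrative model
params = SamplingParams(temperature=0.8, max_tokens=32)
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)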

Source code in vllm/entrypoints/llm.py
class LLM:
    """An LLM for generating texts from given prompts and sampling parameters.

    This class includes a tokenizer, a language model (possibly distributed
    across multiple GPUs), and GPU memory space allocated for intermediate
    states (aka KV cache). Given a batch of prompts and sampling parameters,
    this class generates texts from the model, using an intelligent batching
    mechanism and efficient memory management.

    Args:
        model: The name or path of a HuggingFace Transformers model.
        tokenizer: The name or path of a HuggingFace Transformers tokenizer.
        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
            if available, and "slow" will always use the slow tokenizer.
        skip_tokenizer_init: If true, skip initialization of tokenizer and
            detokenizer. Expect valid prompt_token_ids and None for prompt
            from the input.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        allowed_local_media_path: Allowing API requests to read local images
            or videos from directories specified by the server file system.
            This is a security risk. Should only be enabled in trusted
            environments.
        allowed_media_domains: If set, only media URLs that belong to this 
            domain can be used for multi-modal inputs.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
            the `torch_dtype` attribute specified in the model config file.
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq", "gptq", and "fp8" (experimental).
            If None, we first check the `quantization_config` attribute in the
            model config file. If that is None, we assume the model weights are
            not quantized and use `dtype` to determine the data type of
            the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id.
        seed: The seed to initialize the random number generator for sampling.
        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
            reserve for the model weights, activations, and KV cache. Higher
            values will increase the KV cache size and thus improve the model's
            throughput. However, if the value is too high, it may cause out-of-
            memory (OOM) errors.
        kv_cache_memory_bytes: Size of KV Cache per GPU in bytes. By default,
            this is set to None and vllm can automatically infer the kv cache
            size based on gpu_memory_utilization. However, users may want to
            manually specify the kv cache memory size. kv_cache_memory_bytes
            allows more fine-grained control of how much memory gets used when
            compared with using gpu_memory_utilization. Note that
            kv_cache_memory_bytes (when not None) ignores
            gpu_memory_utilization.
        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
            This can be used for temporarily storing the states of the requests
            when their `best_of` sampling parameters are larger than 1. If all
            requests will have `best_of=1`, you can safely set this to 0.
            Noting that `best_of` is only supported in V0. Otherwise, too small
            values may cause out-of-memory (OOM) errors.
        cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
            the model weights. This virtually increases the GPU memory space
            you can use to hold the model weights, at the cost of CPU-GPU data
            transfer for every forward pass.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        disable_custom_all_reduce: See
            [ParallelConfig][vllm.config.ParallelConfig].
        hf_token: The token to use as HTTP bearer authorization for remote files
            . If `True`, will use the token generated when running
            `huggingface-cli login` (stored in `~/.huggingface`).
        hf_overrides: If a dictionary, contains arguments to be forwarded to the
            HuggingFace config. If a callable, it is called to update the
            HuggingFace config.
        mm_processor_kwargs: Arguments to be forwarded to the model's processor
            for multi-modal data, e.g., image processor. Overrides for the
            multi-modal processor obtained from `AutoProcessor.from_pretrained`.
            The available overrides depend on the model that is being run.
            For example, for Phi-3-Vision: `{"num_crops": 4}`.
        pooler_config: Initialize non-default pooling config for the pooling
            model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
        override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This
            argument is deprecated and will be removed in v0.12.0 or v1.0.0,
            whichever is sooner.
        compilation_config: Either an integer or a dictionary. If it is an
            integer, it is used as the level of compilation optimization. If it
            is a dictionary, it can specify the full compilation configuration.
        **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].

    Note:
        This class is intended to be used for offline inference. For online
        serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead.
    """

    def __init__(
        self,
        model: str,
        *,
        runner: RunnerOption = "auto",
        convert: ConvertOption = "auto",
        tokenizer: Optional[str] = None,
        tokenizer_mode: TokenizerMode = "auto",
        skip_tokenizer_init: bool = False,
        trust_remote_code: bool = False,
        allowed_local_media_path: str = "",
        allowed_media_domains: Optional[list[str]] = None,
        tensor_parallel_size: int = 1,
        dtype: ModelDType = "auto",
        quantization: Optional[QuantizationMethods] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: Optional[int] = None,
        gpu_memory_utilization: float = 0.9,
        swap_space: float = 4,
        cpu_offload_gb: float = 0,
        enforce_eager: bool = False,
        disable_custom_all_reduce: bool = False,
        hf_token: Optional[Union[bool, str]] = None,
        hf_overrides: Optional[HfOverrides] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
        pooler_config: Optional[PoolerConfig] = None,
        override_pooler_config: Optional[PoolerConfig] = None,
        structured_outputs_config: Optional[Union[dict[
            str, Any], StructuredOutputsConfig]] = None,
        kv_cache_memory_bytes: Optional[int] = None,
        compilation_config: Optional[Union[int, dict[str, Any],
                                           CompilationConfig]] = None,
        logits_processors: Optional[list[Union[str,
                                               type[LogitsProcessor]]]] = None,
        **kwargs: Any,
    ) -> None:
        """LLM constructor."""

        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True

        if "worker_cls" in kwargs:
            worker_cls = kwargs["worker_cls"]
            # if the worker_cls is not qualified string name,
            # we serialize it using cloudpickle to avoid pickling issues
            if isinstance(worker_cls, type):
                kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)

        if "kv_transfer_config" in kwargs and isinstance(
                kwargs["kv_transfer_config"], dict):
            from vllm.config.kv_transfer import KVTransferConfig
            raw_config_dict = kwargs["kv_transfer_config"]
            try:
                kwargs["kv_transfer_config"] = KVTransferConfig(
                    **raw_config_dict)
            except ValidationError as e:
                logger.error(
                    "Failed to convert 'kv_transfer_config' dict to "
                    "KVTransferConfig object. Dict: %s. Error: %s",
                    raw_config_dict, e)
                # Consider re-raising a more specific vLLM error or ValueError
                # to provide better context to the user.
                raise ValueError(
                    f"Invalid 'kv_transfer_config' provided: {e}") from e

        if hf_overrides is None:
            hf_overrides = {}

        if compilation_config is not None:
            if isinstance(compilation_config, int):
                compilation_config_instance = CompilationConfig(
                    level=compilation_config)
            elif isinstance(compilation_config, dict):
                compilation_config_instance = CompilationConfig(
                    **{
                        k: v
                        for k, v in compilation_config.items()
                        if is_init_field(CompilationConfig, k)
                    })
            else:
                compilation_config_instance = compilation_config
        else:
            compilation_config_instance = CompilationConfig()

        if structured_outputs_config is not None:
            if isinstance(structured_outputs_config, dict):
                structured_outputs_instance = StructuredOutputsConfig(
                    **{
                        k: v
                        for k, v in structured_outputs_config.items()
                        if is_init_field(StructuredOutputsConfig, k)
                    })
            else:
                structured_outputs_instance = structured_outputs_config
        else:
            structured_outputs_instance = StructuredOutputsConfig()

        engine_args = EngineArgs(
            model=model,
            runner=runner,
            convert=convert,
            tokenizer=tokenizer,
            tokenizer_mode=tokenizer_mode,
            skip_tokenizer_init=skip_tokenizer_init,
            trust_remote_code=trust_remote_code,
            allowed_local_media_path=allowed_local_media_path,
            allowed_media_domains=allowed_media_domains,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            kv_cache_memory_bytes=kv_cache_memory_bytes,
            swap_space=swap_space,
            cpu_offload_gb=cpu_offload_gb,
            enforce_eager=enforce_eager,
            disable_custom_all_reduce=disable_custom_all_reduce,
            hf_token=hf_token,
            hf_overrides=hf_overrides,
            mm_processor_kwargs=mm_processor_kwargs,
            pooler_config=pooler_config,
            override_pooler_config=override_pooler_config,
            structured_outputs_config=structured_outputs_instance,
            compilation_config=compilation_config_instance,
            logits_processors=logits_processors,
            **kwargs,
        )

        log_non_default_args(engine_args)

        # Create the Engine (autoselects V0 vs V1)
        self.llm_engine = LLMEngine.from_engine_args(
            engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
        self.engine_class = type(self.llm_engine)

        self.request_counter = Counter()
        self.default_sampling_params: Union[dict[str, Any], None] = None

        supported_tasks = self.llm_engine.get_supported_tasks()  # type: ignore

        logger.info("Supported_tasks: %s", supported_tasks)

        self.supported_tasks = supported_tasks

        # Load the Input/Output processor plugin if any
        io_processor_plugin = self.llm_engine.model_config.io_processor_plugin
        self.io_processor = get_io_processor(self.llm_engine.vllm_config,
                                             io_processor_plugin)

    def get_tokenizer(self) -> AnyTokenizer:
        return self.llm_engine.get_tokenizer()

    def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
        # While CachedTokenizer is dynamic, have no choice but
        # compare class name. Misjudgment will arise from
        # user-defined tokenizer started with 'Cached'
        if tokenizer.__class__.__name__.startswith("Cached"):
            self.llm_engine.tokenizer = tokenizer
        else:
            self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)

    def get_default_sampling_params(self) -> SamplingParams:
        if self.default_sampling_params is None:
            self.default_sampling_params = (
                self.llm_engine.model_config.get_diff_sampling_param())
        if self.default_sampling_params:
            return SamplingParams.from_optional(**self.default_sampling_params)
        return SamplingParams()

    def generate(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        sampling_params: Optional[Union[SamplingParams,
                                        Sequence[SamplingParams]]] = None,
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
        priority: Optional[list[int]] = None,
    ) -> list[RequestOutput]:
        """Generates the completions for the input prompts.

        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompt.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
                When it is a single value, it is applied to every prompt.
                When it is a list, the list must have the same length as the
                prompts and it is paired one by one with the prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            priority: The priority of the requests, if any.
                Only applicable when priority scheduling policy is enabled.

        Returns:
            A list of `RequestOutput` objects containing the
            generated completions in the same order as the input prompts.

        Note:
            Using `prompts` and `prompt_token_ids` as keyword parameters is
            considered legacy and may be deprecated in the future. You should
            instead pass them via the `inputs` parameter.
        """
        model_config = self.llm_engine.model_config
        runner_type = model_config.runner_type
        if runner_type != "generate":
            raise ValueError(
                "LLM.generate() is only supported for generative models. "
                "Try passing `--runner generate` to use the model as a "
                "generative model.")

        if sampling_params is None:
            # Use default sampling params.
            sampling_params = self.get_default_sampling_params()

        # Add any modality specific loras to the corresponding prompts
        lora_request = self._get_modality_specific_lora_reqs(
            prompts, lora_request)

        self._validate_and_add_requests(
            prompts=prompts,
            params=sampling_params,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            priority=priority,
        )

        outputs = self._run_engine(use_tqdm=use_tqdm)
        return self.engine_class.validate_outputs(outputs, RequestOutput)

    def _get_modality_specific_lora_reqs(
            self, prompts: Union[PromptType, Sequence[PromptType]],
            lora_request: Optional[Union[list[LoRARequest], LoRARequest]]):
        # Grab the lora config off the vllm config on the engine,
        # since this is the same for both v0 & v1.
        lora_config = self.llm_engine.vllm_config.lora_config

        # If there's no lora config / default_mm_loras, or the model
        # isn't multimodal, leave the lora as is.
        if (lora_config is None
                or not self.llm_engine.model_config.is_multimodal_model
                or (lora_config and lora_config.default_mm_loras is None)):
            return lora_request

        if not isinstance(prompts, Sequence):
            prompts = [prompts]

        optional_loras = ([lora_request] * len(prompts)
                          if not isinstance(lora_request, Sequence) else
                          lora_request)

        return [
            self._resolve_single_prompt_mm_lora(
                prompt,
                opt_lora_req,
                lora_config.default_mm_loras,
            ) for prompt, opt_lora_req in zip(prompts, optional_loras)
        ]

    def _resolve_single_prompt_mm_lora(self, prompt: PromptType,
                                       lora_request: Optional[LoRARequest],
                                       default_mm_loras: Optional[dict[str,
                                                                       str]]):
        if (not default_mm_loras or not isinstance(prompt, dict)
                or "multi_modal_data" not in prompt):
            return lora_request

        prompt = cast(Union[TextPrompt, TokensPrompt], prompt)

        intersection = set(prompt["multi_modal_data"].keys()) \
            .intersection(default_mm_loras.keys())
        if not intersection:
            return lora_request
        if len(intersection) > 1:
            # TODO: Would be nice to be able to have multiple loras per prompt
            logger.warning(
                "Multiple modality specific loras were registered and would be"
                " used by a single prompt consuming several modalities; "
                " currently we only support one lora per request; as such,"
                " lora(s) registered with modalities: %s"
                " will be skipped", intersection)
            return lora_request

        # Build the LoRA request; the ID of the default mm lora is the
        # index of the modality name sorted alphabetically + 1.
        modality_name = intersection.pop()
        modality_lora_path = default_mm_loras[modality_name]
        modality_lora_id = sorted(default_mm_loras).index(modality_name) + 1

        # If we have a collision, warn if there is a collision,
        # but always send the explicitly provided request.
        if lora_request:
            if lora_request.lora_int_id != modality_lora_id:
                logger.warning(
                    "A modality with a registered lora and a lora_request "
                    "with a different ID were provided; falling back to the "
                    "lora_request as we only apply one LoRARequest per prompt")
            return lora_request

        return LoRARequest(
            modality_name,
            modality_lora_id,
            modality_lora_path,
        )

    def collective_rpc(self,
                       method: Union[str, Callable[..., _R]],
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
        """
        Execute an RPC call on all workers.

        Args:
            method: Name of the worker method to execute, or a callable that
                is serialized and sent to all workers to execute.

                If the method is a callable, it should accept an additional
                `self` argument, in addition to the arguments passed in `args`
                and `kwargs`. The `self` argument will be the worker object.
            timeout: Maximum time in seconds to wait for execution. Raises a
                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
            args: Positional arguments to pass to the worker method.
            kwargs: Keyword arguments to pass to the worker method.

        Returns:
            A list containing the results from each worker.

        Note:
            It is recommended to use this API to only pass control messages,
            and set up data-plane communication to pass data.
        """

        return self.llm_engine.collective_rpc(method, timeout, args, kwargs)

    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """
        Run a function directly on the model inside each worker,
        returning the result for each of them.

        !!! warning
            To reduce the overhead of data transfer, avoid returning large
            arrays or tensors from this method. If you must return them,
            make sure you move them to CPU first to avoid taking up additional
            VRAM!
        """
        return self.llm_engine.apply_model(func)

    def _get_beam_search_lora_requests(
        self,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]],
        prompts: list[Union[TokensPrompt, TextPrompt]],
    ) -> list[Optional[LoRARequest]]:
        """Get the optional lora request corresponding to each prompt."""
        if isinstance(lora_request,
                      Sequence) and len(lora_request) != len(prompts):
            raise ValueError(
                "Lora request list should be the same length as the prompts")

        if lora_request is None or isinstance(lora_request, LoRARequest):
            return [lora_request] * len(prompts)

        raise TypeError(f"Invalid lora_request type {type(lora_request)}")

    def beam_search(
        self,
        prompts: list[Union[TokensPrompt, TextPrompt]],
        params: BeamSearchParams,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
        use_tqdm: bool = False,
        concurrency_limit: Optional[int] = None,
    ) -> list[BeamSearchOutput]:
        """
        Generate sequences using beam search.

        Args:
            prompts: A list of prompts. Each prompt can be a string or a list
                of token IDs.
            params: The beam search parameters.
            lora_request: LoRA request to use for generation, if any.
            use_tqdm: Whether to use tqdm to display the progress bar.
            concurrency_limit: The maximum number of concurrent requests.
                If None, the number of concurrent requests is unlimited.
        """
        # TODO: how does beam search work together with length penalty,
        # frequency, penalty, and stopping criteria, etc.?
        beam_width = params.beam_width
        max_tokens = params.max_tokens
        temperature = params.temperature
        ignore_eos = params.ignore_eos
        length_penalty = params.length_penalty

        lora_requests = self._get_beam_search_lora_requests(
            lora_request, prompts)

        tokenizer = self.get_tokenizer()
        sort_beams_key = create_sort_beams_key_function(
            tokenizer.eos_token_id,
            length_penalty,
        )

        if use_tqdm and concurrency_limit is not None:
            logger.warning(
                "Progress bar is not supported when using concurrency_limit. "
                "Disabling progress bar.")
            use_tqdm = False

        if concurrency_limit is None:
            concurrency_limit = len(prompts)

        def create_tokens_prompt_from_beam(
                beam: BeamSearchSequence) -> TokensPrompt:
            token_prompt_kwargs: TokensPrompt = {
                "prompt_token_ids": beam.tokens
            }
            if beam.multi_modal_data is not None:
                token_prompt_kwargs["multi_modal_data"] = beam.multi_modal_data

            if beam.mm_processor_kwargs is not None:
                token_prompt_kwargs[
                    "mm_processor_kwargs"] = beam.mm_processor_kwargs
            return TokensPrompt(**token_prompt_kwargs)

        # generate 2 * beam_width candidates at each step
        # following the huggingface transformers implementation
        # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa
        beam_search_params = SamplingParams(logprobs=2 * beam_width,
                                            max_tokens=1,
                                            temperature=temperature)
        instances: list[BeamSearchInstance] = []

        for lora_req, prompt in zip(lora_requests, prompts):
            # Add multimodal processor kwargs & data
            mm_kwargs = {}
            if "multi_modal_data" in prompt:
                mm_kwargs["multi_modal_data"] = prompt["multi_modal_data"]
            if "mm_processor_kwargs" in prompt:
                mm_kwargs["mm_processor_kwargs"] = prompt[
                    "mm_processor_kwargs"]

            if "prompt_token_ids" in prompt:
                prompt = cast(TokensPrompt, prompt)  # Needed for mypy
                prompt_tokens = prompt["prompt_token_ids"]
            else:
                prompt_tokens = tokenizer.encode(prompt["prompt"])

            instances.append(
                BeamSearchInstance(
                    prompt_tokens,
                    lora_request=lora_req,
                    logprobs=None,
                    **mm_kwargs,
                ), )

        for prompt_start in range(0, len(prompts), concurrency_limit):
            instances_batch = instances[prompt_start:prompt_start +
                                        concurrency_limit]

            token_iter = range(max_tokens)
            if use_tqdm:
                token_iter = tqdm(token_iter,
                                  desc="Beam search",
                                  unit="token",
                                  unit_scale=False)
                logger.warning(
                    "The progress bar shows the upper bound on token steps and "
                    "may finish early due to stopping conditions. It does not "
                    "reflect instance-level progress.")
            for _ in token_iter:
                all_beams: list[BeamSearchSequence] = list(
                    sum((instance.beams for instance in instances_batch), []))
                pos = [0] + list(
                    itertools.accumulate(
                        len(instance.beams) for instance in instances_batch))
                instance_start_and_end: list[tuple[int, int]] = list(
                    zip(pos[:-1], pos[1:]))

                if len(all_beams) == 0:
                    break

                # create corresponding batch entries for prompt & optional lora
                prompts_batch, lora_req_batch = zip(
                    *[(create_tokens_prompt_from_beam(beam), beam.lora_request)
                      for beam in all_beams])

                # only runs for one step
                # we don't need to use tqdm here
                output = self.generate(prompts_batch,
                                       sampling_params=beam_search_params,
                                       use_tqdm=False,
                                       lora_request=lora_req_batch)

                for (start, end), instance in zip(instance_start_and_end,
                                                  instances_batch):
                    instance_new_beams = []
                    for i in range(start, end):
                        current_beam = all_beams[i]
                        result = output[i]

                        if result.outputs[0].logprobs is not None:
                            # if `result.outputs[0].logprobs` is None, it means
                            # the sequence is completed because of the
                            # max-model-len or abortion. we don't need to add
                            # it to the new beams.
                            logprobs = result.outputs[0].logprobs[0]
                            for token_id, logprob_obj in logprobs.items():
                                new_beam = BeamSearchSequence(
                                    tokens=current_beam.tokens + [token_id],
                                    logprobs=current_beam.logprobs +
                                    [logprobs],
                                    lora_request=current_beam.lora_request,
                                    cum_logprob=current_beam.cum_logprob +
                                    logprob_obj.logprob,
                                    multi_modal_data=current_beam.
                                    multi_modal_data,
                                    mm_processor_kwargs=current_beam.
                                    mm_processor_kwargs)

                                if token_id == tokenizer.eos_token_id and \
                                    not ignore_eos:
                                    instance.completed.append(new_beam)
                                else:
                                    instance_new_beams.append(new_beam)
                    sorted_beams = sorted(instance_new_beams,
                                          key=sort_beams_key,
                                          reverse=True)
                    instance.beams = sorted_beams[:beam_width]

        outputs = []
        for instance in instances:
            instance.completed.extend(instance.beams)
            sorted_completed = sorted(instance.completed,
                                      key=sort_beams_key,
                                      reverse=True)
            best_beams = sorted_completed[:beam_width]

            for beam in best_beams:
                beam.text = tokenizer.decode(beam.tokens)
            outputs.append(BeamSearchOutput(sequences=best_beams))

        return outputs

    def preprocess_chat(
        self,
        messages: Union[list[ChatCompletionMessageParam],
                        list[list[ChatCompletionMessageParam]]],
        chat_template: Optional[str] = None,
        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
        add_generation_prompt: bool = True,
        continue_final_message: bool = False,
        tools: Optional[list[dict[str, Any]]] = None,
        chat_template_kwargs: Optional[dict[str, Any]] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
    ) -> list[TokensPrompt]:
        """
        Generate prompt for a chat conversation. The pre-processed
        prompt can then be used as input for the other LLM methods.

        Refer to `chat` for a complete description of the arguments.
        Returns:
            A list of `TokensPrompts` objects containing the tokenized
            prompt after chat template interpolation, and the
            pre-processed multi-modal inputs.
        """
        list_of_messages: list[list[ChatCompletionMessageParam]]

        # Handle multi and single conversations
        if is_list_of(messages, list):
            # messages is list[list[...]]
            list_of_messages = cast(list[list[ChatCompletionMessageParam]],
                                    messages)
        else:
            # messages is list[...]
            list_of_messages = [
                cast(list[ChatCompletionMessageParam], messages)
            ]

        tokenizer = self.get_tokenizer()
        model_config = self.llm_engine.get_model_config()
        resolved_content_format = resolve_chat_template_content_format(
            chat_template,
            tools,
            chat_template_content_format,
            tokenizer,
            model_config=model_config,
        )

        _chat_template_kwargs: dict[str, Any] = dict(
            chat_template=chat_template,
            add_generation_prompt=add_generation_prompt,
            continue_final_message=continue_final_message,
            tools=tools,
        )
        _chat_template_kwargs.update(chat_template_kwargs or {})

        prompts: list[TokensPrompt] = []

        for msgs in list_of_messages:
            # NOTE: _parse_chat_message_content_parts() currently doesn't
            # handle mm_processor_kwargs, since there is no implementation in
            # the chat message parsing for it.
            conversation, mm_data, mm_uuids = parse_chat_messages(
                msgs,
                model_config,
                tokenizer,
                content_format=resolved_content_format,
            )

            if isinstance(tokenizer, MistralTokenizer):
                prompt_token_ids = apply_mistral_chat_template(
                    tokenizer,
                    messages=msgs,
                    **_chat_template_kwargs,
                )
            else:
                prompt_str = apply_hf_chat_template(
                    tokenizer=tokenizer,
                    conversation=conversation,
                    model_config=model_config,
                    **_chat_template_kwargs,
                )
                # Special tokens are already included in chat templates so
                # should not be added by the tokenizer in this case.
                prompt_token_ids = tokenizer.encode(prompt_str,
                                                    add_special_tokens=False)

            prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)

            if mm_data is not None:
                prompt["multi_modal_data"] = mm_data

            if mm_uuids is not None:
                prompt["multi_modal_uuids"] = mm_uuids

            if mm_processor_kwargs is not None:
                prompt["mm_processor_kwargs"] = mm_processor_kwargs

            prompts.append(prompt)

        return prompts
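
A sketch of feeding `preprocess_chat` output directly into `generate` (the
model name is a placeholder for any chat-capable model):

from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # placeholder chat model

# Tokenize the conversation once, then reuse the TokensPrompt with generate().
prompts = llm.preprocess_chat(
    [{"role": "user", "content": "Give me a one-line fun fact."}],
)
outputs = llm.generate(prompts, SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)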

    def chat(
        self,
        messages: Union[list[ChatCompletionMessageParam],
                        list[list[ChatCompletionMessageParam]]],
        sampling_params: Optional[Union[SamplingParams,
                                        list[SamplingParams]]] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[LoRARequest] = None,
        chat_template: Optional[str] = None,
        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
        add_generation_prompt: bool = True,
        continue_final_message: bool = False,
        tools: Optional[list[dict[str, Any]]] = None,
        chat_template_kwargs: Optional[dict[str, Any]] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
    ) -> list[RequestOutput]:
        """
        Generate responses for a chat conversation.

        The chat conversation is converted into a text prompt using the
        tokenizer, and the [generate][vllm.LLM.generate] method is called to
        generate the responses.

        Multi-modal inputs can be passed in the same way you would pass them
        to the OpenAI API.

        Args:
            messages: A list of conversations or a single conversation.

                - Each conversation is represented as a list of messages.
                - Each message is a dictionary with 'role' and 'content' keys.

            sampling_params: The sampling parameters for text generation.
                If None, we use the default sampling parameters. When it
                is a single value, it is applied to every prompt. When it
                is a list, the list must have the same length as the
                prompts and it is paired one by one with the prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            chat_template: The template to use for structuring the chat.
                If not provided, the model's default chat template will be used.
            chat_template_content_format: The format to render message content.

                - "string" will render the content as a string.
                  Example: `"Who are you?"`
                - "openai" will render the content as a list of dictionaries,
                  similar to OpenAI schema.
                  Example: `[{"type": "text", "text": "Who are you?"}]`

            add_generation_prompt: If True, appends the chat template's
                generation prompt (the start of an assistant turn) to the
                formatted conversation.
            continue_final_message: If True, continues the final message in
                the conversation instead of starting a new one. Cannot be
                `True` if `add_generation_prompt` is also `True`.
            tools: A list of tool definitions (e.g. OpenAI-style function
                schemas) to make available to the chat template, if any.
            chat_template_kwargs: Additional kwargs to pass to the chat
                template.
            mm_processor_kwargs: Multimodal processor kwarg overrides for this
                chat request. Only used for offline requests.

        Returns:
            A list of `RequestOutput` objects containing the generated
            responses in the same order as the input messages.
        """

        prompts = self.preprocess_chat(
            messages=messages,
            chat_template=chat_template,
            chat_template_content_format=chat_template_content_format,
            add_generation_prompt=add_generation_prompt,
            continue_final_message=continue_final_message,
            tools=tools,
            chat_template_kwargs=chat_template_kwargs,
            mm_processor_kwargs=mm_processor_kwargs,
        )

        return self.generate(
            prompts,
            sampling_params=sampling_params,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
        )
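
For example, a minimal chat call (placeholder model; messages follow the
OpenAI role/content schema):

from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # placeholder chat model

conversation = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]

outputs = llm.chat(conversation, SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)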

    def encode(
        self,
        prompts: Union[PromptType, Sequence[PromptType], DataPrompt],
        pooling_params: Optional[Union[PoolingParams,
                                       Sequence[PoolingParams]]] = None,
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
        pooling_task: PoolingTask = "encode",
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> list[PoolingRequestOutput]:
        """Apply pooling to the hidden states corresponding to the input
        prompts.

        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompt.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            pooling_task: Override the pooling task to use.
            tokenization_kwargs: Overrides the `tokenization_kwargs` set in
                `pooling_params`.

        Returns:
            A list of `PoolingRequestOutput` objects containing the
            pooled hidden states in the same order as the input prompts.

        Note:
            Using `prompts` and `prompt_token_ids` as keyword parameters is
            considered legacy and may be deprecated in the future. You should
            instead pass them via the `inputs` parameter.
        """

        if self.supported_tasks == ["encode"] and pooling_task is None:
            pooling_task = "encode"

        if pooling_task is None:
            if "embed" in self.supported_tasks:
                pooling_task = "embed"
            else:
                pooling_task = "encode"

            logger.warning_once(
                "`LLM.encode` is currently using `pooling_task = %s`.\n"
                "Please use one of the more specific methods or set the "
                "task directly when using `LLM.encode`:\n"
                "  - For embeddings, use `LLM.embed(...)` "
                "or `pooling_task=\"embed\"`.\n"
                "  - For classification logits, use `LLM.classify(...)` "
                "or `pooling_task=\"classify\"`.\n"
                "  - For rewards, use `LLM.reward(...)` "
                "or `pooling_task=\"reward\"`\n"
                "  - For similarity scores, use `LLM.score(...)`.",
                pooling_task)

        model_config = self.llm_engine.model_config
        runner_type = model_config.runner_type
        if runner_type != "pooling":
            raise ValueError(
                "LLM.encode() is only supported for pooling models. "
                "Try passing `--runner pooling` to use the model as a "
                "pooling model.")

        if pooling_task not in self.supported_tasks:
            raise ValueError(
                f"pooling_task must be one of {self.supported_tasks}.")

        if pooling_params is None:
            # Use default pooling params.
            pooling_params = PoolingParams()

        for param in as_iter(pooling_params):
            param.verify(pooling_task, model_config)
            # for backwards compatibility
            if truncate_prompt_tokens is not None:
                param.truncate_prompt_tokens = truncate_prompt_tokens

        io_processor_prompt = False
        if isinstance(prompts, dict) and "data" in prompts:
            io_processor_prompt = True
            if self.io_processor is None:
                raise ValueError(
                    "No IOProcessor plugin installed. Please refer "
                    "to the documentation and to the "
                    "'prithvi_geospatial_mae_io_processor' "
                    "offline inference example for more details.")

            # Validate the request data is valid for the loaded plugin
            validated_prompt = self.io_processor.parse_request(prompts)

            # obtain the actual model prompts from the pre-processor
            prompts = self.io_processor.pre_process(prompt=validated_prompt)

        self._validate_and_add_requests(
            prompts=prompts,
            params=pooling_params,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
        )

        outputs = self._run_engine(use_tqdm=use_tqdm)

        model_outputs = self.engine_class.validate_outputs(
            outputs, PoolingRequestOutput)

        if io_processor_prompt:
            # get the post-processed model outputs
            assert self.io_processor is not None
            processed_outputs = self.io_processor.post_process(
                model_output=model_outputs)

            return [
                PoolingRequestOutput[Any](request_id="",
                                          outputs=processed_outputs,
                                          prompt_token_ids=[],
                                          finished=True)
            ]
        else:
            return model_outputs
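
A usage sketch for `encode` with an explicit `pooling_task` (placeholder
embedding model; the model must run with the pooling runner):

from vllm import LLM

# Placeholder embedding model; any model supported by the pooling runner works.
llm = LLM(model="BAAI/bge-small-en-v1.5", runner="pooling")

outputs = llm.encode(
    ["The quick brown fox", "jumps over the lazy dog"],
    pooling_task="embed",
)
for out in outputs:
    # Each PoolingRequestOutput carries the pooled hidden states in `outputs`.
    print(out.outputs)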

    def embed(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[Union[PoolingParams,
                                       Sequence[PoolingParams]]] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[EmbeddingRequestOutput]:
        """
        Generate an embedding vector for each prompt.

        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompt.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.

        Returns:
            A list of `EmbeddingRequestOutput` objects containing the
            embedding vectors in the same order as the input prompts.
        """
        if "embed" not in self.supported_tasks:
            raise ValueError(
                "Embedding API is not supported by this model. "
                "Try converting the model using `--convert embed`.")

        items = self.encode(
            prompts,
            truncate_prompt_tokens=truncate_prompt_tokens,
            use_tqdm=use_tqdm,
            pooling_params=pooling_params,
            lora_request=lora_request,
            pooling_task="embed",
        )

        return [EmbeddingRequestOutput.from_base(item) for item in items]
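
A short sketch (placeholder embedding model):

from vllm import LLM

llm = LLM(model="BAAI/bge-small-en-v1.5", runner="pooling")  # placeholder

outputs = llm.embed(["Hello, my name is", "The capital of France is"])
for out in outputs:
    # EmbeddingRequestOutput.outputs.embedding is the embedding vector.
    print(len(out.outputs.embedding))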

    def classify(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[Union[PoolingParams,
                                       Sequence[PoolingParams]]] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ClassificationRequestOutput]:
        """
        Generate class logits for each prompt.

        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
        Returns:
            A list of `ClassificationRequestOutput` objects containing the
            class logits in the same order as the input prompts.
        """
        if "classify" not in self.supported_tasks:
            raise ValueError(
                "Classification API is not supported by this model. "
                "Try converting the model using `--convert classify`.")

        items = self.encode(
            prompts,
            use_tqdm=use_tqdm,
            pooling_params=pooling_params,
            lora_request=lora_request,
            pooling_task="classify",
        )

        return [ClassificationRequestOutput.from_base(item) for item in items]
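
A short sketch (placeholder sequence-classification model; the `probs` field
is assumed to hold the per-class probabilities):

from vllm import LLM

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", runner="pooling")  # placeholder

outputs = llm.classify(["vLLM is wonderful!"])
print(outputs[0].outputs.probs)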

    def reward(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        /,
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[Union[PoolingParams,
                                       Sequence[PoolingParams]]] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[PoolingRequestOutput]:
        """
        Generate rewards for each prompt.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
        Returns:
            A list of `PoolingRequestOutput` objects containing the
            pooled hidden states in the same order as the input prompts.
        """

        return self.encode(
            prompts,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            pooling_params=pooling_params,
            truncate_prompt_tokens=truncate_prompt_tokens,
            pooling_task="encode",
        )

    def _embedding_score(
        self,
        tokenizer: AnyTokenizer,
        text_1: list[Union[str, TextPrompt, TokensPrompt]],
        text_2: list[Union[str, TextPrompt, TokensPrompt]],
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[PoolingParams] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ScoringRequestOutput]:

        encoded_output: list[PoolingRequestOutput] = self.encode(
            text_1 + text_2,
            truncate_prompt_tokens=truncate_prompt_tokens,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            pooling_params=pooling_params,
            pooling_task="embed",
        )

        encoded_output_1: list[PoolingRequestOutput] = encoded_output[
            0:len(text_1)]
        encoded_output_2: list[PoolingRequestOutput] = encoded_output[
            len(text_1):]

        if len(encoded_output_1) == 1:
            encoded_output_1 = encoded_output_1 * len(encoded_output_2)

        scores = _cosine_similarity(tokenizer=tokenizer,
                                    embed_1=encoded_output_1,
                                    embed_2=encoded_output_2)

        items = self.engine_class.validate_outputs(scores,
                                                   PoolingRequestOutput)
        return [ScoringRequestOutput.from_base(item) for item in items]

    def _cross_encoding_score(
        self,
        tokenizer: AnyTokenizer,
        data_1: Union[list[str], list[ScoreContentPartParam]],
        data_2: Union[list[str], list[ScoreContentPartParam]],
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[PoolingParams] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ScoringRequestOutput]:
        model_config = self.llm_engine.model_config

        if isinstance(tokenizer, MistralTokenizer):
            raise ValueError(
                "Score API is not supported for Mistral tokenizer")

        if len(data_1) == 1:
            data_1 = data_1 * len(data_2)

        if pooling_params is None:
            pooling_params = PoolingParams(task="score")

        model_config = self.llm_engine.model_config
        pooling_params.verify("score", model_config)
        pooling_params_list = list[PoolingParams]()

        tokenization_kwargs: dict[str, Any] = {}

        _validate_truncation_size(model_config.max_model_len,
                                  truncate_prompt_tokens, tokenization_kwargs)

        prompts = list[PromptType]()

        input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

        model_config = self.llm_engine.model_config

        for q, d in input_pairs:
            _, engine_prompt = get_score_prompt(
                model_config=model_config,
                data_1=q,
                data_2=d,
                tokenizer=tokenizer,
                tokenization_kwargs=tokenization_kwargs,
            )

            if (token_type_ids := engine_prompt.pop("token_type_ids", None)):
                params = pooling_params.clone()
                compressed = compress_token_type_ids(token_type_ids)
                params.extra_kwargs = {"compressed_token_type_ids": compressed}
                pooling_params_list.append(params)
            else:
                pooling_params_list.append(pooling_params)

            prompts.append(engine_prompt)

        self._validate_and_add_requests(
            prompts=prompts,
            params=pooling_params_list,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
        )

        outputs = self._run_engine(use_tqdm=use_tqdm)
        items = self.engine_class.validate_outputs(outputs,
                                                   PoolingRequestOutput)

        return [ScoringRequestOutput.from_base(item) for item in items]

    def score(
        self,
        data_1: Union[SingletonPrompt, Sequence[SingletonPrompt],
                      ScoreMultiModalParam],
        data_2: Union[SingletonPrompt, Sequence[SingletonPrompt],
                      ScoreMultiModalParam],
        /,
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[PoolingParams] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ScoringRequestOutput]:
        """Generate similarity scores for all pairs `<text,text_pair>` or
          `<multi-modal data, multi-modal data pair>`.

        The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
        In the `1 -> N` case the `data_1` input will be replicated `N`
        times to pair with the `data_2` inputs.
        The input pairs are used to build a list of prompts for the
        cross encoder model. This class automatically batches the prompts,
        considering the memory constraint. For the best performance, put all
        of your inputs into a single list and pass it to this method.

        Supports both text and multi-modal data (images, etc.) when used with
        appropriate multi-modal models. For multi-modal inputs, ensure the
        prompt structure matches the model's expected input format.

        Args:
            data_1: Can be a single prompt, a list of prompts or
                `ScoreMultiModalParam`, which can contain either text or
                multi-modal data. When a list, it must have the same length as
                the `data_2` list.
            data_2: The data to pair with the query to form the input to
                the LLM. Can be text or multi-modal data. See [PromptType]
                [vllm.inputs.PromptType] for more details about the format of
                each prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
        Returns:
            A list of `ScoringRequestOutput` objects containing the
            generated scores in the same order as the input prompts.
        """
        model_config = self.llm_engine.model_config
        runner_type = model_config.runner_type
        if runner_type != "pooling":
            raise ValueError(
                "LLM.score() is only supported for pooling models. "
                "Try passing `--runner pooling` to use the model as a "
                "pooling model.")

        supported_tasks = self.supported_tasks
        if all(t not in supported_tasks for t in ("embed", "classify")):
            raise ValueError("Score API is not supported by this model. "
                             "Try converting the model using "
                             "`--convert embed` or `--convert classify`.")

        if (model_config.is_cross_encoder
                and getattr(model_config.hf_config, "num_labels", 0) != 1):
            raise ValueError("Score API is only enabled for num_labels == 1.")

        # the tokenizer for models such as
        # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
        # lists of tokens to the `text` and `text_pair` kwargs
        tokenizer = self.get_tokenizer()

        if not model_config.is_multimodal_model:

            def check_data_type(data: Union[SingletonPrompt,
                                            Sequence[SingletonPrompt],
                                            ScoreMultiModalParam]):
                if isinstance(data, dict) and "content" in data:
                    raise ValueError("ScoreMultiModalParam is not supported "
                                     f"for {model_config.architecture}")

            check_data_type(data_1)
            check_data_type(data_2)

            def ensure_str(prompt: SingletonPrompt):
                if isinstance(prompt, dict):
                    if "multi_modal_data" in prompt:
                        raise ValueError("Multi-modal prompt is not "
                                         "supported for scoring")
                    elif "prompt_token_ids" in prompt:
                        prompt = tokenizer.decode(
                            cast(TokensPrompt, prompt)["prompt_token_ids"])
                    elif "prompt" in prompt:
                        prompt = cast(TextPrompt, prompt)["prompt"]
                assert type(prompt) is str
                return prompt

            if isinstance(data_1, (str, dict)):
                # Convert a single prompt to a list.
                data_1 = [data_1]  # type: ignore[list-item]

            data_1 = [ensure_str(t) for t in data_1]

            if isinstance(data_2, (str, dict)):
                # Convert a single prompt to a list.
                data_2 = [data_2]  # type: ignore[list-item]

            data_2 = [ensure_str(t) for t in data_2]

        if isinstance(data_1, dict) and "content" in data_1:
            data_1 = data_1.get("content")  # type: ignore[assignment]
        elif isinstance(data_1, str):
            data_1 = [data_1]

        if isinstance(data_2, dict) and "content" in data_2:
            data_2 = data_2.get("content")  # type: ignore[assignment]
        elif isinstance(data_2, str):
            data_2 = [data_2]

        _validate_score_input_lens(data_1, data_2)  # type: ignore[arg-type]

        if model_config.is_cross_encoder:
            return self._cross_encoding_score(
                tokenizer,
                data_1,  # type: ignore[arg-type]
                data_2,  # type: ignore[arg-type]
                truncate_prompt_tokens,
                use_tqdm,
                pooling_params,
                lora_request)
        else:
            return self._embedding_score(
                tokenizer,
                data_1,  # type: ignore[arg-type]
                data_2,  # type: ignore[arg-type]
                truncate_prompt_tokens,
                use_tqdm,
                pooling_params,
                lora_request)
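
A short sketch of the `1 -> N` case with a cross-encoder (placeholder model
with `num_labels == 1`):

from vllm import LLM

llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", runner="pooling")

query = "What is the capital of France?"
documents = [
    "Paris is the capital and most populous city of France.",
    "The Eiffel Tower is made of iron.",
]

# data_1 is replicated to pair with every entry in data_2.
outputs = llm.score(query, documents)
for doc, out in zip(documents, outputs):
    # out.outputs.score holds the relevance score for the pair.
    print(f"{out.outputs.score:.3f}  {doc}")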

    def start_profile(self) -> None:
        self.llm_engine.start_profile()

    def stop_profile(self) -> None:
        self.llm_engine.stop_profile()

    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        return self.llm_engine.reset_prefix_cache(device)

    def sleep(self, level: int = 1):
        """
        Put the engine to sleep. The engine should not process any requests.
        The caller should guarantee that no requests are being processed
        during the sleep period, before `wake_up` is called.

        Args:
            level: The sleep level. Level 1 sleep will offload the model
                weights and discard the kv cache. The content of kv cache
                is forgotten. Level 1 sleep is good for sleeping and waking
                up the engine to run the same model again. The model weights
                are backed up in CPU memory. Please make sure there's enough
                CPU memory to store the model weights. Level 2 sleep will
                discard both the model weights and the kv cache. The content
                of both the model weights and kv cache is forgotten. Level 2
                sleep is good for sleeping and waking up the engine to run a
                different model or update the model, where previous model
                weights are not needed. It reduces CPU memory pressure.
        """
        self.reset_prefix_cache()
        self.llm_engine.sleep(level=level)

    def wake_up(self, tags: Optional[list[str]] = None):
        """
        Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
        method for more details.

        Args:
            tags: An optional list of tags to reallocate the engine memory
                for specific memory allocations. Values must be in
                `("weights", "kv_cache")`. If None, all memory is reallocated.
                wake_up should be called with all tags (or None) before the
                engine is used again.
        """
        self.llm_engine.wake_up(tags)
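
A sketch of a sleep/wake cycle (level 1 keeps the weights in CPU memory; the
`enable_sleep_mode` constructor flag is assumed):

from vllm import LLM, SamplingParams

# Placeholder model; sleep mode is assumed to require enable_sleep_mode=True.
llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)

print(llm.generate(["Hello"], SamplingParams(max_tokens=8))[0].outputs[0].text)

llm.sleep(level=1)   # offload weights to CPU, discard the KV cache
# ... the freed GPU memory can be used for something else here ...
llm.wake_up()        # restore weights before serving requests again

print(llm.generate(["Hi"], SamplingParams(max_tokens=8))[0].outputs[0].text)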

    def get_metrics(self) -> list["Metric"]:
        """Return a snapshot of aggregated metrics from Prometheus.

        Returns:
            A list of ``Metric`` objects capturing the current state
            of all aggregated metrics from Prometheus.

        Note:
            This method is only available with the V1 LLM engine.
        """
        return self.llm_engine.get_metrics()
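
A sketch of reading the metrics snapshot after a few requests (V1 engine only):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model
llm.generate(["Hello"], SamplingParams(max_tokens=8))

for metric in llm.get_metrics():
    # Each entry is a Metric object; printing it shows its name and value(s).
    print(metric)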

    def _validate_and_add_requests(
        self,
        prompts: Union[PromptType, Sequence[PromptType], DataPrompt],
        params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                      Sequence[PoolingParams]],
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
        priority: Optional[list[int]] = None,
    ) -> None:
        if isinstance(prompts, (str, dict)):
            # Convert a single prompt to a list.
            prompts = [prompts]  # type: ignore[list-item]

        num_requests = len(prompts)
        if isinstance(params, Sequence) and len(params) != num_requests:
            raise ValueError("The lengths of prompts and params "
                             "must be the same.")
        if isinstance(lora_request,
                      Sequence) and len(lora_request) != num_requests:
            raise ValueError("The lengths of prompts and lora_request "
                             "must be the same.")

        for sp in params if isinstance(params, Sequence) else (params, ):
            if isinstance(sp, SamplingParams):
                # We only care about the final output
                sp.output_kind = RequestOutputKind.FINAL_ONLY

        # Add requests to the engine.
        it = prompts
        if use_tqdm:
            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
            it = tqdm_func(it, desc="Adding requests")

        model_config = self.llm_engine.model_config

        for i, prompt in enumerate(it):

            if isinstance(prompt, dict):
                self._validate_mm_data_and_uuids(
                    prompt.get("multi_modal_data"),
                    prompt.get("multi_modal_uuids"))

            param = params[i] if isinstance(params, Sequence) else params

            tokenization_kwargs: dict[str, Any] = {}
            _validate_truncation_size(model_config.max_model_len,
                                      param.truncate_prompt_tokens,
                                      tokenization_kwargs)

            self._add_request(
                prompt,
                params[i] if isinstance(params, Sequence) else params,
                tokenization_kwargs=tokenization_kwargs,
                lora_request=lora_request[i] if isinstance(
                    lora_request, Sequence) else lora_request,
                priority=priority[i] if priority else 0,
            )

    def _validate_mm_data_and_uuids(
            self,
            multi_modal_data: Optional[Any],  # MultiModalDataDict
            multi_modal_uuids: Optional[Any],  # MultiModalUUIDDict
    ):
        """
        Validate that if any multi-modal data is skipped (i.e. None),
        then its corresponding UUID must be set.
        """
        if multi_modal_data is None:
            return

        for modality, data in multi_modal_data.items():
            if isinstance(data, list):
                for i, d in enumerate(data):
                    if d is None:
                        if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[  # noqa: E501
                                modality] is None:
                            raise ValueError(
                                f"Multi-modal data for {modality} is None "
                                f"but UUID is not provided")
                        else:
                            if len(
                                    multi_modal_uuids[modality]
                            ) <= i or multi_modal_uuids[modality][i] is None:
                                raise ValueError(
                                    f"Multi-modal data for {modality} is None "
                                    f"but UUID is not provided")
            else:
                if data is None and (multi_modal_uuids is None
                                     or modality not in multi_modal_uuids
                                     or multi_modal_uuids[modality] is None):
                    raise ValueError(f"Multi-modal data for {modality} is None"
                                     f" but UUID is not provided")

    def _add_request(
        self,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        priority: int = 0,
    ) -> None:
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(
            request_id,
            prompt,
            params,
            lora_request=lora_request,
            tokenization_kwargs=tokenization_kwargs,
            priority=priority,
        )

    def _run_engine(
        self,
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True
    ) -> list[Union[RequestOutput, PoolingRequestOutput]]:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
            pbar = tqdm_func(
                total=num_requests,
                desc="Processed prompts",
                dynamic_ncols=True,
                postfix=(f"est. speed input: {0:.2f} toks/s, "
                         f"output: {0:.2f} toks/s"),
            )

        # Run the engine.
        outputs: list[Union[RequestOutput, PoolingRequestOutput]] = []
        total_in_toks = 0
        total_out_toks = 0
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        if isinstance(output, RequestOutput):
                            # Calculate tokens only for RequestOutput
                            n = len(output.outputs)
                            assert output.prompt_token_ids is not None
                            total_in_toks += len(output.prompt_token_ids) * n
                            in_spd = total_in_toks / pbar.format_dict["elapsed"]
                            total_out_toks += sum(
                                len(stp.token_ids) for stp in output.outputs)
                            out_spd = (total_out_toks /
                                       pbar.format_dict["elapsed"])
                            pbar.postfix = (
                                f"est. speed input: {in_spd:.2f} toks/s, "
                                f"output: {out_spd:.2f} toks/s")
                            pbar.update(n)
                        else:
                            pbar.update(1)
                        if pbar.n == num_requests:
                            pbar.refresh()

        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may finish earlier than
        # requests that were submitted before them.
        return sorted(outputs, key=lambda x: int(x.request_id))

default_sampling_params instance-attribute

default_sampling_params: Union[dict[str, Any], None] = None

engine_class instance-attribute

engine_class = type(llm_engine)

io_processor instance-attribute

io_processor = get_io_processor(
    vllm_config, io_processor_plugin
)

llm_engine instance-attribute

llm_engine = from_engine_args(
    engine_args=engine_args, usage_context=LLM_CLASS
)

request_counter instance-attribute

request_counter = Counter()

supported_tasks instance-attribute

supported_tasks = supported_tasks

__init__

__init__(
    model: str,
    *,
    runner: RunnerOption = "auto",
    convert: ConvertOption = "auto",
    tokenizer: Optional[str] = None,
    tokenizer_mode: TokenizerMode = "auto",
    skip_tokenizer_init: bool = False,
    trust_remote_code: bool = False,
    allowed_local_media_path: str = "",
    allowed_media_domains: Optional[list[str]] = None,
    tensor_parallel_size: int = 1,
    dtype: ModelDType = "auto",
    quantization: Optional[QuantizationMethods] = None,
    revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    seed: Optional[int] = None,
    gpu_memory_utilization: float = 0.9,
    swap_space: float = 4,
    cpu_offload_gb: float = 0,
    enforce_eager: bool = False,
    disable_custom_all_reduce: bool = False,
    hf_token: Optional[Union[bool, str]] = None,
    hf_overrides: Optional[HfOverrides] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
    pooler_config: Optional[PoolerConfig] = None,
    override_pooler_config: Optional[PoolerConfig] = None,
    structured_outputs_config: Optional[
        Union[dict[str, Any], StructuredOutputsConfig]
    ] = None,
    kv_cache_memory_bytes: Optional[int] = None,
    compilation_config: Optional[
        Union[int, dict[str, Any], CompilationConfig]
    ] = None,
    logits_processors: Optional[
        list[Union[str, type[LogitsProcessor]]]
    ] = None,
    **kwargs: Any,
) -> None

LLM constructor.

Source code in vllm/entrypoints/llm.py
def __init__(
    self,
    model: str,
    *,
    runner: RunnerOption = "auto",
    convert: ConvertOption = "auto",
    tokenizer: Optional[str] = None,
    tokenizer_mode: TokenizerMode = "auto",
    skip_tokenizer_init: bool = False,
    trust_remote_code: bool = False,
    allowed_local_media_path: str = "",
    allowed_media_domains: Optional[list[str]] = None,
    tensor_parallel_size: int = 1,
    dtype: ModelDType = "auto",
    quantization: Optional[QuantizationMethods] = None,
    revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    seed: Optional[int] = None,
    gpu_memory_utilization: float = 0.9,
    swap_space: float = 4,
    cpu_offload_gb: float = 0,
    enforce_eager: bool = False,
    disable_custom_all_reduce: bool = False,
    hf_token: Optional[Union[bool, str]] = None,
    hf_overrides: Optional[HfOverrides] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
    pooler_config: Optional[PoolerConfig] = None,
    override_pooler_config: Optional[PoolerConfig] = None,
    structured_outputs_config: Optional[Union[dict[
        str, Any], StructuredOutputsConfig]] = None,
    kv_cache_memory_bytes: Optional[int] = None,
    compilation_config: Optional[Union[int, dict[str, Any],
                                       CompilationConfig]] = None,
    logits_processors: Optional[list[Union[str,
                                           type[LogitsProcessor]]]] = None,
    **kwargs: Any,
) -> None:
    """LLM constructor."""

    if "disable_log_stats" not in kwargs:
        kwargs["disable_log_stats"] = True

    if "worker_cls" in kwargs:
        worker_cls = kwargs["worker_cls"]
        # if the worker_cls is not qualified string name,
        # we serialize it using cloudpickle to avoid pickling issues
        if isinstance(worker_cls, type):
            kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)

    if "kv_transfer_config" in kwargs and isinstance(
            kwargs["kv_transfer_config"], dict):
        from vllm.config.kv_transfer import KVTransferConfig
        raw_config_dict = kwargs["kv_transfer_config"]
        try:
            kwargs["kv_transfer_config"] = KVTransferConfig(
                **raw_config_dict)
        except ValidationError as e:
            logger.error(
                "Failed to convert 'kv_transfer_config' dict to "
                "KVTransferConfig object. Dict: %s. Error: %s",
                raw_config_dict, e)
            # Consider re-raising a more specific vLLM error or ValueError
            # to provide better context to the user.
            raise ValueError(
                f"Invalid 'kv_transfer_config' provided: {e}") from e

    if hf_overrides is None:
        hf_overrides = {}

    if compilation_config is not None:
        if isinstance(compilation_config, int):
            compilation_config_instance = CompilationConfig(
                level=compilation_config)
        elif isinstance(compilation_config, dict):
            compilation_config_instance = CompilationConfig(
                **{
                    k: v
                    for k, v in compilation_config.items()
                    if is_init_field(CompilationConfig, k)
                })
        else:
            compilation_config_instance = compilation_config
    else:
        compilation_config_instance = CompilationConfig()

    if structured_outputs_config is not None:
        if isinstance(structured_outputs_config, dict):
            structured_outputs_instance = StructuredOutputsConfig(
                **{
                    k: v
                    for k, v in structured_outputs_config.items()
                    if is_init_field(StructuredOutputsConfig, k)
                })
        else:
            structured_outputs_instance = structured_outputs_config
    else:
        structured_outputs_instance = StructuredOutputsConfig()

    engine_args = EngineArgs(
        model=model,
        runner=runner,
        convert=convert,
        tokenizer=tokenizer,
        tokenizer_mode=tokenizer_mode,
        skip_tokenizer_init=skip_tokenizer_init,
        trust_remote_code=trust_remote_code,
        allowed_local_media_path=allowed_local_media_path,
        allowed_media_domains=allowed_media_domains,
        tensor_parallel_size=tensor_parallel_size,
        dtype=dtype,
        quantization=quantization,
        revision=revision,
        tokenizer_revision=tokenizer_revision,
        seed=seed,
        gpu_memory_utilization=gpu_memory_utilization,
        kv_cache_memory_bytes=kv_cache_memory_bytes,
        swap_space=swap_space,
        cpu_offload_gb=cpu_offload_gb,
        enforce_eager=enforce_eager,
        disable_custom_all_reduce=disable_custom_all_reduce,
        hf_token=hf_token,
        hf_overrides=hf_overrides,
        mm_processor_kwargs=mm_processor_kwargs,
        pooler_config=pooler_config,
        override_pooler_config=override_pooler_config,
        structured_outputs_config=structured_outputs_instance,
        compilation_config=compilation_config_instance,
        logits_processors=logits_processors,
        **kwargs,
    )

    log_non_default_args(engine_args)

    # Create the Engine (autoselects V0 vs V1)
    self.llm_engine = LLMEngine.from_engine_args(
        engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
    self.engine_class = type(self.llm_engine)

    self.request_counter = Counter()
    self.default_sampling_params: Union[dict[str, Any], None] = None

    supported_tasks = self.llm_engine.get_supported_tasks()  # type: ignore

    logger.info("Supported_tasks: %s", supported_tasks)

    self.supported_tasks = supported_tasks

    # Load the Input/Output processor plugin if any
    io_processor_plugin = self.llm_engine.model_config.io_processor_plugin
    self.io_processor = get_io_processor(self.llm_engine.vllm_config,
                                         io_processor_plugin)
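
A constructor usage sketch combining a few of the common arguments above (the
model name is a placeholder):

from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    tensor_parallel_size=1,
    dtype="auto",
    gpu_memory_utilization=0.9,
    enforce_eager=False,
    seed=0,
)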

_add_request

_add_request(
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    lora_request: Optional[LoRARequest] = None,
    priority: int = 0,
) -> None
Source code in vllm/entrypoints/llm.py
def _add_request(
    self,
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    lora_request: Optional[LoRARequest] = None,
    priority: int = 0,
) -> None:
    request_id = str(next(self.request_counter))
    self.llm_engine.add_request(
        request_id,
        prompt,
        params,
        lora_request=lora_request,
        tokenization_kwargs=tokenization_kwargs,
        priority=priority,
    )

_cross_encoding_score

_cross_encoding_score(
    tokenizer: AnyTokenizer,
    data_1: Union[list[str], list[ScoreContentPartParam]],
    data_2: Union[list[str], list[ScoreContentPartParam]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[ScoringRequestOutput]
Source code in vllm/entrypoints/llm.py
def _cross_encoding_score(
    self,
    tokenizer: AnyTokenizer,
    data_1: Union[list[str], list[ScoreContentPartParam]],
    data_2: Union[list[str], list[ScoreContentPartParam]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]:
    model_config = self.llm_engine.model_config

    if isinstance(tokenizer, MistralTokenizer):
        raise ValueError(
            "Score API is not supported for Mistral tokenizer")

    if len(data_1) == 1:
        data_1 = data_1 * len(data_2)

    if pooling_params is None:
        pooling_params = PoolingParams(task="score")

    model_config = self.llm_engine.model_config
    pooling_params.verify("score", model_config)
    pooling_params_list = list[PoolingParams]()

    tokenization_kwargs: dict[str, Any] = {}

    _validate_truncation_size(model_config.max_model_len,
                              truncate_prompt_tokens, tokenization_kwargs)

    prompts = list[PromptType]()

    input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

    model_config = self.llm_engine.model_config

    for q, d in input_pairs:
        _, engine_prompt = get_score_prompt(
            model_config=model_config,
            data_1=q,
            data_2=d,
            tokenizer=tokenizer,
            tokenization_kwargs=tokenization_kwargs,
        )

        if (token_type_ids := engine_prompt.pop("token_type_ids", None)):
            params = pooling_params.clone()
            compressed = compress_token_type_ids(token_type_ids)
            params.extra_kwargs = {"compressed_token_type_ids": compressed}
            pooling_params_list.append(params)
        else:
            pooling_params_list.append(pooling_params)

        prompts.append(engine_prompt)

    self._validate_and_add_requests(
        prompts=prompts,
        params=pooling_params_list,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
    )

    outputs = self._run_engine(use_tqdm=use_tqdm)
    items = self.engine_class.validate_outputs(outputs,
                                               PoolingRequestOutput)

    return [ScoringRequestOutput.from_base(item) for item in items]

_embedding_score

_embedding_score(
    tokenizer: AnyTokenizer,
    text_1: list[Union[str, TextPrompt, TokensPrompt]],
    text_2: list[Union[str, TextPrompt, TokensPrompt]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[ScoringRequestOutput]
Source code in vllm/entrypoints/llm.py
def _embedding_score(
    self,
    tokenizer: AnyTokenizer,
    text_1: list[Union[str, TextPrompt, TokensPrompt]],
    text_2: list[Union[str, TextPrompt, TokensPrompt]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]:

    encoded_output: list[PoolingRequestOutput] = self.encode(
        text_1 + text_2,
        truncate_prompt_tokens=truncate_prompt_tokens,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        pooling_params=pooling_params,
        pooling_task="embed",
    )

    encoded_output_1: list[PoolingRequestOutput] = encoded_output[
        0:len(text_1)]
    encoded_output_2: list[PoolingRequestOutput] = encoded_output[
        len(text_1):]

    if len(encoded_output_1) == 1:
        encoded_output_1 = encoded_output_1 * len(encoded_output_2)

    scores = _cosine_similarity(tokenizer=tokenizer,
                                embed_1=encoded_output_1,
                                embed_2=encoded_output_2)

    items = self.engine_class.validate_outputs(scores,
                                               PoolingRequestOutput)
    return [ScoringRequestOutput.from_base(item) for item in items]

_get_beam_search_lora_requests

_get_beam_search_lora_requests(
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ],
    prompts: list[Union[TokensPrompt, TextPrompt]],
) -> list[Optional[LoRARequest]]

Get the optional lora request corresponding to each prompt.

Source code in vllm/entrypoints/llm.py
def _get_beam_search_lora_requests(
    self,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]],
    prompts: list[Union[TokensPrompt, TextPrompt]],
) -> list[Optional[LoRARequest]]:
    """Get the optional lora request corresponding to each prompt."""
    if isinstance(lora_request,
                  Sequence) and len(lora_request) != len(prompts):
        raise ValueError(
            "Lora request list should be the same length as the prompts")

    if lora_request is None or isinstance(lora_request, LoRARequest):
        return [lora_request] * len(prompts)

    raise TypeError(f"Invalid lora_request type {type(lora_request)}")

_get_modality_specific_lora_reqs

_get_modality_specific_lora_reqs(
    prompts: Union[PromptType, Sequence[PromptType]],
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ],
)
Source code in vllm/entrypoints/llm.py
def _get_modality_specific_lora_reqs(
        self, prompts: Union[PromptType, Sequence[PromptType]],
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]]):
    # Grab the lora config off the vllm config on the engine,
    # since this is the same for both v0 & v1.
    lora_config = self.llm_engine.vllm_config.lora_config

    # If there's no lora config / default_mm_loras, or the model
    # isn't multimodal, leave the lora as is.
    if (lora_config is None
            or not self.llm_engine.model_config.is_multimodal_model
            or (lora_config and lora_config.default_mm_loras is None)):
        return lora_request

    if not isinstance(prompts, Sequence):
        prompts = [prompts]

    optional_loras = ([lora_request] * len(prompts)
                      if not isinstance(lora_request, Sequence) else
                      lora_request)

    return [
        self._resolve_single_prompt_mm_lora(
            prompt,
            opt_lora_req,
            lora_config.default_mm_loras,
        ) for prompt, opt_lora_req in zip(prompts, optional_loras)
    ]

_resolve_single_prompt_mm_lora

_resolve_single_prompt_mm_lora(
    prompt: PromptType,
    lora_request: Optional[LoRARequest],
    default_mm_loras: Optional[dict[str, str]],
)
Source code in vllm/entrypoints/llm.py
def _resolve_single_prompt_mm_lora(self, prompt: PromptType,
                                   lora_request: Optional[LoRARequest],
                                   default_mm_loras: Optional[dict[str,
                                                                   str]]):
    if (not default_mm_loras or not isinstance(prompt, dict)
            or "multi_modal_data" not in prompt):
        return lora_request

    prompt = cast(Union[TextPrompt, TokensPrompt], prompt)

    intersection = set(prompt["multi_modal_data"].keys()) \
        .intersection(default_mm_loras.keys())
    if not intersection:
        return lora_request
    if len(intersection) > 1:
        # TODO: Would be nice to be able to have multiple loras per prompt
        logger.warning(
            "Multiple modality specific loras were registered and would be"
            " used by a single prompt consuming several modalities; "
            " currently we only support one lora per request; as such,"
            " lora(s) registered with modalities: %s"
            " will be skipped", intersection)
        return lora_request

    # Build the LoRA request; the ID of the default mm lora is the
    # index of the modality name sorted alphabetically + 1.
    modality_name = intersection.pop()
    modality_lora_path = default_mm_loras[modality_name]
    modality_lora_id = sorted(default_mm_loras).index(modality_name) + 1

    # If there is an ID collision, warn, but always send the
    # explicitly provided request.
    if lora_request:
        if lora_request.lora_int_id != modality_lora_id:
            logger.warning(
                "A modality with a registered lora and a lora_request "
                "with a different ID were provided; falling back to the "
                "lora_request as we only apply one LoRARequest per prompt")
        return lora_request

    return LoRARequest(
        modality_name,
        modality_lora_id,
        modality_lora_path,
    )

_run_engine

_run_engine(
    *, use_tqdm: Union[bool, Callable[..., tqdm]] = True
) -> list[Union[RequestOutput, PoolingRequestOutput]]
Source code in vllm/entrypoints/llm.py
def _run_engine(
    self,
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True
) -> list[Union[RequestOutput, PoolingRequestOutput]]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        pbar = tqdm_func(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, "
                     f"output: {0:.2f} toks/s"),
        )

    # Run the engine.
    outputs: list[Union[RequestOutput, PoolingRequestOutput]] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        step_outputs = self.llm_engine.step()
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        n = len(output.outputs)
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids) * n
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs)
                        out_spd = (total_out_toks /
                                   pbar.format_dict["elapsed"])
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s")
                        pbar.update(n)
                    else:
                        pbar.update(1)
                    if pbar.n == num_requests:
                        pbar.refresh()

    if use_tqdm:
        pbar.close()
    # Sort the outputs by request ID.
    # This is necessary because some requests may finish earlier than
    # requests that were added before them.
    return sorted(outputs, key=lambda x: int(x.request_id))

_validate_and_add_requests

_validate_and_add_requests(
    prompts: Union[
        PromptType, Sequence[PromptType], DataPrompt
    ],
    params: Union[
        SamplingParams,
        Sequence[SamplingParams],
        PoolingParams,
        Sequence[PoolingParams],
    ],
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[
        Union[Sequence[LoRARequest], LoRARequest]
    ],
    priority: Optional[list[int]] = None,
) -> None
Source code in vllm/entrypoints/llm.py
def _validate_and_add_requests(
    self,
    prompts: Union[PromptType, Sequence[PromptType], DataPrompt],
    params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                  Sequence[PoolingParams]],
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
    priority: Optional[list[int]] = None,
) -> None:
    if isinstance(prompts, (str, dict)):
        # Convert a single prompt to a list.
        prompts = [prompts]  # type: ignore[list-item]

    num_requests = len(prompts)
    if isinstance(params, Sequence) and len(params) != num_requests:
        raise ValueError("The lengths of prompts and params "
                         "must be the same.")
    if isinstance(lora_request,
                  Sequence) and len(lora_request) != num_requests:
        raise ValueError("The lengths of prompts and lora_request "
                         "must be the same.")

    for sp in params if isinstance(params, Sequence) else (params, ):
        if isinstance(sp, SamplingParams):
            # We only care about the final output
            sp.output_kind = RequestOutputKind.FINAL_ONLY

    # Add requests to the engine.
    it = prompts
    if use_tqdm:
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        it = tqdm_func(it, desc="Adding requests")

    model_config = self.llm_engine.model_config

    for i, prompt in enumerate(it):

        if isinstance(prompt, dict):
            self._validate_mm_data_and_uuids(
                prompt.get("multi_modal_data"),
                prompt.get("multi_modal_uuids"))

        param = params[i] if isinstance(params, Sequence) else params

        tokenization_kwargs: dict[str, Any] = {}
        _validate_truncation_size(model_config.max_model_len,
                                  param.truncate_prompt_tokens,
                                  tokenization_kwargs)

        self._add_request(
            prompt,
            param,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request[i] if isinstance(
                lora_request, Sequence) else lora_request,
            priority=priority[i] if priority else 0,
        )

_validate_mm_data_and_uuids

_validate_mm_data_and_uuids(
    multi_modal_data: Optional[Any],
    multi_modal_uuids: Optional[Any],
)

Validate that if any multi-modal data is skipped (i.e. None), then its corresponding UUID must be set.

Source code in vllm/entrypoints/llm.py
def _validate_mm_data_and_uuids(
        self,
        multi_modal_data: Optional[Any],  # MultiModalDataDict
        multi_modal_uuids: Optional[Any],  # MultiModalUUIDDict
):
    """
    Validate that if any multi-modal data is skipped (i.e. None),
    then its corresponding UUID must be set.
    """
    if multi_modal_data is None:
        return

    for modality, data in multi_modal_data.items():
        if isinstance(data, list):
            for i, d in enumerate(data):
                if d is None:
                    if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[  # noqa: E501
                            modality] is None:
                        raise ValueError(
                            f"Multi-modal data for {modality} is None "
                            f"but UUID is not provided")
                    else:
                        if len(
                                multi_modal_uuids[modality]
                        ) <= i or multi_modal_uuids[modality][i] is None:
                            raise ValueError(
                                f"Multi-modal data for {modality} is None "
                                f"but UUID is not provided")
        else:
            if data is None and (multi_modal_uuids is None
                                 or modality not in multi_modal_uuids
                                 or multi_modal_uuids[modality] is None):
                raise ValueError(f"Multi-modal data for {modality} is None"
                                 f" but UUID is not provided")

apply_model

apply_model(func: Callable[[Module], _R]) -> list[_R]

Run a function directly on the model inside each worker, returning the result for each of them.

Warning

To reduce the overhead of data transfer, avoid returning large arrays or tensors from this method. If you must return them, make sure you move them to CPU first to avoid taking up additional VRAM!

Source code in vllm/entrypoints/llm.py
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
    """
    Run a function directly on the model inside each worker,
    returning the result for each of them.

    !!! warning
        To reduce the overhead of data transfer, avoid returning large
        arrays or tensors from this method. If you must return them,
        make sure you move them to CPU first to avoid taking up additional
        VRAM!
    """
    return self.llm_engine.apply_model(func)
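
Example (a minimal usage sketch; the model name is illustrative and assumed to be available locally or from the Hugging Face Hub):

from vllm import LLM

llm = LLM(model="facebook/opt-125m")

def count_params(model) -> int:
    # Return a scalar per worker; scalars are cheap to transfer back,
    # in line with the warning above about avoiding large tensors.
    return sum(p.numel() for p in model.parameters())

print(llm.apply_model(count_params))  # one entry per worker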

beam_search

beam_search(
    prompts: list[Union[TokensPrompt, TextPrompt]],
    params: BeamSearchParams,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
    use_tqdm: bool = False,
    concurrency_limit: Optional[int] = None,
) -> list[BeamSearchOutput]

Generate sequences using beam search.

Parameters:

Name Type Description Default
prompts list[Union[TokensPrompt, TextPrompt]]

A list of prompts. Each prompt can be a string or a list of token IDs.

required
params BeamSearchParams

The beam search parameters.

required
lora_request Optional[Union[list[LoRARequest], LoRARequest]]

LoRA request to use for generation, if any.

None
use_tqdm bool

Whether to use tqdm to display the progress bar.

False
concurrency_limit Optional[int]

The maximum number of concurrent requests. If None, the number of concurrent requests is unlimited.

None
Source code in vllm/entrypoints/llm.py
def beam_search(
    self,
    prompts: list[Union[TokensPrompt, TextPrompt]],
    params: BeamSearchParams,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    use_tqdm: bool = False,
    concurrency_limit: Optional[int] = None,
) -> list[BeamSearchOutput]:
    """
    Generate sequences using beam search.

    Args:
        prompts: A list of prompts. Each prompt can be a string or a list
            of token IDs.
        params: The beam search parameters.
        lora_request: LoRA request to use for generation, if any.
        use_tqdm: Whether to use tqdm to display the progress bar.
        concurrency_limit: The maximum number of concurrent requests.
            If None, the number of concurrent requests is unlimited.
    """
    # TODO: how does beam search interact with length penalty,
    # frequency penalty, stopping criteria, etc.?
    beam_width = params.beam_width
    max_tokens = params.max_tokens
    temperature = params.temperature
    ignore_eos = params.ignore_eos
    length_penalty = params.length_penalty

    lora_requests = self._get_beam_search_lora_requests(
        lora_request, prompts)

    tokenizer = self.get_tokenizer()
    sort_beams_key = create_sort_beams_key_function(
        tokenizer.eos_token_id,
        length_penalty,
    )

    if use_tqdm and concurrency_limit is not None:
        logger.warning(
            "Progress bar is not supported when using concurrency_limit. "
            "Disabling progress bar.")
        use_tqdm = False

    if concurrency_limit is None:
        concurrency_limit = len(prompts)

    def create_tokens_prompt_from_beam(
            beam: BeamSearchSequence) -> TokensPrompt:
        token_prompt_kwargs: TokensPrompt = {
            "prompt_token_ids": beam.tokens
        }
        if beam.multi_modal_data is not None:
            token_prompt_kwargs["multi_modal_data"] = beam.multi_modal_data

        if beam.mm_processor_kwargs is not None:
            token_prompt_kwargs[
                "mm_processor_kwargs"] = beam.mm_processor_kwargs
        return TokensPrompt(**token_prompt_kwargs)

    # generate 2 * beam_width candidates at each step
    # following the huggingface transformers implementation
    # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa
    beam_search_params = SamplingParams(logprobs=2 * beam_width,
                                        max_tokens=1,
                                        temperature=temperature)
    instances: list[BeamSearchInstance] = []

    for lora_req, prompt in zip(lora_requests, prompts):
        # Add multimodal processor kwargs & data
        mm_kwargs = {}
        if "multi_modal_data" in prompt:
            mm_kwargs["multi_modal_data"] = prompt["multi_modal_data"]
        if "mm_processor_kwargs" in prompt:
            mm_kwargs["mm_processor_kwargs"] = prompt[
                "mm_processor_kwargs"]

        if "prompt_token_ids" in prompt:
            prompt = cast(TokensPrompt, prompt)  # Needed for mypy
            prompt_tokens = prompt["prompt_token_ids"]
        else:
            prompt_tokens = tokenizer.encode(prompt["prompt"])

        instances.append(
            BeamSearchInstance(
                prompt_tokens,
                lora_request=lora_req,
                logprobs=None,
                **mm_kwargs,
            ), )

    for prompt_start in range(0, len(prompts), concurrency_limit):
        instances_batch = instances[prompt_start:prompt_start +
                                    concurrency_limit]

        token_iter = range(max_tokens)
        if use_tqdm:
            token_iter = tqdm(token_iter,
                              desc="Beam search",
                              unit="token",
                              unit_scale=False)
            logger.warning(
                "The progress bar shows the upper bound on token steps and "
                "may finish early due to stopping conditions. It does not "
                "reflect instance-level progress.")
        for _ in token_iter:
            all_beams: list[BeamSearchSequence] = list(
                sum((instance.beams for instance in instances_batch), []))
            pos = [0] + list(
                itertools.accumulate(
                    len(instance.beams) for instance in instances_batch))
            instance_start_and_end: list[tuple[int, int]] = list(
                zip(pos[:-1], pos[1:]))

            if len(all_beams) == 0:
                break

            # create corresponding batch entries for prompt & optional lora
            prompts_batch, lora_req_batch = zip(
                *[(create_tokens_prompt_from_beam(beam), beam.lora_request)
                  for beam in all_beams])

            # only runs for one step
            # we don't need to use tqdm here
            output = self.generate(prompts_batch,
                                   sampling_params=beam_search_params,
                                   use_tqdm=False,
                                   lora_request=lora_req_batch)

            for (start, end), instance in zip(instance_start_and_end,
                                              instances_batch):
                instance_new_beams = []
                for i in range(start, end):
                    current_beam = all_beams[i]
                    result = output[i]

                    if result.outputs[0].logprobs is not None:
                        # if `result.outputs[0].logprobs` is None, it means
                        # the sequence is completed because of the
                        # max-model-len or abortion. we don't need to add
                        # it to the new beams.
                        logprobs = result.outputs[0].logprobs[0]
                        for token_id, logprob_obj in logprobs.items():
                            new_beam = BeamSearchSequence(
                                tokens=current_beam.tokens + [token_id],
                                logprobs=current_beam.logprobs +
                                [logprobs],
                                lora_request=current_beam.lora_request,
                                cum_logprob=current_beam.cum_logprob +
                                logprob_obj.logprob,
                                multi_modal_data=current_beam.
                                multi_modal_data,
                                mm_processor_kwargs=current_beam.
                                mm_processor_kwargs)

                            if token_id == tokenizer.eos_token_id and \
                                not ignore_eos:
                                instance.completed.append(new_beam)
                            else:
                                instance_new_beams.append(new_beam)
                sorted_beams = sorted(instance_new_beams,
                                      key=sort_beams_key,
                                      reverse=True)
                instance.beams = sorted_beams[:beam_width]

    outputs = []
    for instance in instances:
        instance.completed.extend(instance.beams)
        sorted_completed = sorted(instance.completed,
                                  key=sort_beams_key,
                                  reverse=True)
        best_beams = sorted_completed[:beam_width]

        for beam in best_beams:
            beam.text = tokenizer.decode(beam.tokens)
        outputs.append(BeamSearchOutput(sequences=best_beams))

    return outputs
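
Example (a minimal usage sketch; the model name is illustrative, and the BeamSearchParams import path from vllm.sampling_params is an assumption):

from vllm import LLM
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="facebook/opt-125m")
params = BeamSearchParams(beam_width=4, max_tokens=32)

outputs = llm.beam_search([{"prompt": "The capital of France is"}], params)
for output in outputs:
    # Sequences are sorted best-first; each carries its decoded text.
    print(output.sequences[0].text)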

chat

chat(
    messages: Union[
        list[ChatCompletionMessageParam],
        list[list[ChatCompletionMessageParam]],
    ],
    sampling_params: Optional[
        Union[SamplingParams, list[SamplingParams]]
    ] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[LoRARequest] = None,
    chat_template: Optional[str] = None,
    chat_template_content_format: ChatTemplateContentFormatOption = "auto",
    add_generation_prompt: bool = True,
    continue_final_message: bool = False,
    tools: Optional[list[dict[str, Any]]] = None,
    chat_template_kwargs: Optional[dict[str, Any]] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
) -> list[RequestOutput]

Generate responses for a chat conversation.

The chat conversation is converted into a text prompt using the tokenizer, and the generate method is called to produce the responses.

Multi-modal inputs can be passed in the same way you would pass them to the OpenAI API.

Parameters:

Name Type Description Default
messages Union[list[ChatCompletionMessageParam], list[list[ChatCompletionMessageParam]]]

A list of conversations or a single conversation.

  • Each conversation is represented as a list of messages.
  • Each message is a dictionary with 'role' and 'content' keys.
required
sampling_params Optional[Union[SamplingParams, list[SamplingParams]]]

The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it is a list, the list must have the same length as the prompts and it is paired one by one with the prompt.

None
use_tqdm Union[bool, Callable[..., tqdm]]

If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created.

True
lora_request Optional[LoRARequest]

LoRA request to use for generation, if any.

None
chat_template Optional[str]

The template to use for structuring the chat. If not provided, the model's default chat template will be used.

None
chat_template_content_format ChatTemplateContentFormatOption

The format to render message content.

  • "string" will render the content as a string. Example: "Who are you?"
  • "openai" will render the content as a list of dictionaries, similar to OpenAI schema. Example: [{"type": "text", "text": "Who are you?"}]
'auto'
add_generation_prompt bool

If True, adds a generation template to each message.

True
continue_final_message bool

If True, continues the final message in the conversation instead of starting a new one. Cannot be True if add_generation_prompt is also True.

False
chat_template_kwargs Optional[dict[str, Any]]

Additional kwargs to pass to the chat template.

None
mm_processor_kwargs Optional[dict[str, Any]]

Multimodal processor kwarg overrides for this chat request. Only used for offline requests.

None

Returns:

Type Description
list[RequestOutput]

A list of RequestOutput objects containing the generated responses in the same order as the input messages.

Source code in vllm/entrypoints/llm.py
def chat(
    self,
    messages: Union[list[ChatCompletionMessageParam],
                    list[list[ChatCompletionMessageParam]]],
    sampling_params: Optional[Union[SamplingParams,
                                    list[SamplingParams]]] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[LoRARequest] = None,
    chat_template: Optional[str] = None,
    chat_template_content_format: ChatTemplateContentFormatOption = "auto",
    add_generation_prompt: bool = True,
    continue_final_message: bool = False,
    tools: Optional[list[dict[str, Any]]] = None,
    chat_template_kwargs: Optional[dict[str, Any]] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
) -> list[RequestOutput]:
    """
    Generate responses for a chat conversation.

    The chat conversation is converted into a text prompt using the
    tokenizer and calls the [generate][vllm.LLM.generate] method to generate
    the responses.

    Multi-modal inputs can be passed in the same way you would pass them
    to the OpenAI API.

    Args:
        messages: A list of conversations or a single conversation.

            - Each conversation is represented as a list of messages.
            - Each message is a dictionary with 'role' and 'content' keys.

        sampling_params: The sampling parameters for text generation.
            If None, we use the default sampling parameters. When it
            is a single value, it is applied to every prompt. When it
            is a list, the list must have the same length as the
            prompts and it is paired one by one with the prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        chat_template: The template to use for structuring the chat.
            If not provided, the model's default chat template will be used.
        chat_template_content_format: The format to render message content.

            - "string" will render the content as a string.
              Example: `"Who are you?"`
            - "openai" will render the content as a list of dictionaries,
              similar to OpenAI schema.
              Example: `[{"type": "text", "text": "Who are you?"}]`

        add_generation_prompt: If True, adds a generation template
            to each message.
        continue_final_message: If True, continues the final message in
            the conversation instead of starting a new one. Cannot be
            `True` if `add_generation_prompt` is also `True`.
        chat_template_kwargs: Additional kwargs to pass to the chat
            template.
        mm_processor_kwargs: Multimodal processor kwarg overrides for this
            chat request. Only used for offline requests.

    Returns:
        A list of `RequestOutput` objects containing the generated
        responses in the same order as the input messages.
    """

    prompts = self.preprocess_chat(
        messages=messages,
        chat_template=chat_template,
        chat_template_content_format=chat_template_content_format,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
        tools=tools,
        chat_template_kwargs=chat_template_kwargs,
        mm_processor_kwargs=mm_processor_kwargs,
    )

    return self.generate(
        prompts,
        sampling_params=sampling_params,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
    )
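
Example (a minimal usage sketch; the instruct model name is illustrative, and any model with a chat template can be substituted):

from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about the sea."},
]

outputs = llm.chat(conversation, SamplingParams(temperature=0.7, max_tokens=64))
print(outputs[0].outputs[0].text)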

classify

classify(
    prompts: Union[PromptType, Sequence[PromptType]],
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[
        Union[PoolingParams, Sequence[PoolingParams]]
    ] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[ClassificationRequestOutput]

Generate class logits for each prompt.

This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method.

Parameters:

Name Type Description Default
prompts Union[PromptType, Sequence[PromptType]]

The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt.

required
use_tqdm Union[bool, Callable[..., tqdm]]

If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created.

True
lora_request Optional[Union[list[LoRARequest], LoRARequest]]

LoRA request to use for generation, if any.

None
pooling_params Optional[Union[PoolingParams, Sequence[PoolingParams]]]

The pooling parameters for pooling. If None, we use the default pooling parameters.

None

Returns: A list of ClassificationRequestOutput objects containing the class probabilities in the same order as the input prompts.

Source code in vllm/entrypoints/llm.py
def classify(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[Union[PoolingParams,
                                   Sequence[PoolingParams]]] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ClassificationRequestOutput]:
    """
    Generate class logits for each prompt.

    This class automatically batches the given prompts, considering
    the memory constraint. For the best performance, put all of your prompts
    into a single list and pass it to this method.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
    Returns:
        A list of `ClassificationRequestOutput` objects containing the
        class probabilities in the same order as the input prompts.
    """
    if "classify" not in self.supported_tasks:
        raise ValueError(
            "Classification API is not supported by this model. "
            "Try converting the model using `--convert classify`.")

    items = self.encode(
        prompts,
        use_tqdm=use_tqdm,
        pooling_params=pooling_params,
        lora_request=lora_request,
        pooling_task="classify",
    )

    return [ClassificationRequestOutput.from_base(item) for item in items]
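
Example (a minimal usage sketch; the classification model name is illustrative, and depending on your vLLM version you may need additional engine arguments to select the pooling runner):

from vllm import LLM

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach")
outputs = llm.classify(["vLLM makes offline batch inference easy."])

# ClassificationOutput exposes the per-class probabilities.
print(outputs[0].outputs.probs)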

collective_rpc

collective_rpc(
    method: Union[str, Callable[..., _R]],
    timeout: Optional[float] = None,
    args: tuple = (),
    kwargs: Optional[dict[str, Any]] = None,
) -> list[_R]

Execute an RPC call on all workers.

Parameters:

Name Type Description Default
method Union[str, Callable[..., _R]]

Name of the worker method to execute, or a callable that is serialized and sent to all workers to execute.

If the method is a callable, it should accept an additional self argument, in addition to the arguments passed in args and kwargs. The self argument will be the worker object.

required
timeout Optional[float]

Maximum time in seconds to wait for execution. Raises a TimeoutError on timeout. None means wait indefinitely.

None
args tuple

Positional arguments to pass to the worker method.

()
kwargs Optional[dict[str, Any]]

Keyword arguments to pass to the worker method.

None

Returns:

Type Description
list[_R]

A list containing the results from each worker.

Note

It is recommended to use this API to only pass control messages, and set up data-plane communication to pass data.

Source code in vllm/entrypoints/llm.py
def collective_rpc(self,
                   method: Union[str, Callable[..., _R]],
                   timeout: Optional[float] = None,
                   args: tuple = (),
                   kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
    """
    Execute an RPC call on all workers.

    Args:
        method: Name of the worker method to execute, or a callable that
            is serialized and sent to all workers to execute.

            If the method is a callable, it should accept an additional
            `self` argument, in addition to the arguments passed in `args`
            and `kwargs`. The `self` argument will be the worker object.
        timeout: Maximum time in seconds to wait for execution. Raises a
            [`TimeoutError`][] on timeout. `None` means wait indefinitely.
        args: Positional arguments to pass to the worker method.
        kwargs: Keyword arguments to pass to the worker method.

    Returns:
        A list containing the results from each worker.

    Note:
        It is recommended to use this API to only pass control messages,
        and set up data-plane communication to pass data.
    """

    return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
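
Example (a minimal usage sketch; the model name is illustrative):

from vllm import LLM

llm = LLM(model="facebook/opt-125m")

def worker_class_name(worker) -> str:
    # The callable runs on each worker and receives the worker object
    # as its first argument.
    return type(worker).__name__

print(llm.collective_rpc(worker_class_name))  # one entry per worker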

embed

embed(
    prompts: Union[PromptType, Sequence[PromptType]],
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[
        Union[PoolingParams, Sequence[PoolingParams]]
    ] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[EmbeddingRequestOutput]

Generate an embedding vector for each prompt.

This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method.

Parameters:

Name Type Description Default
prompts Union[PromptType, Sequence[PromptType]]

The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt.

required
pooling_params Optional[Union[PoolingParams, Sequence[PoolingParams]]]

The pooling parameters for pooling. If None, we use the default pooling parameters.

None
use_tqdm Union[bool, Callable[..., tqdm]]

If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created.

True
lora_request Optional[Union[list[LoRARequest], LoRARequest]]

LoRA request to use for generation, if any.

None

Returns:

Type Description
list[EmbeddingRequestOutput]

A list of EmbeddingRequestOutput objects containing the embedding vectors in the same order as the input prompts.

Source code in vllm/entrypoints/llm.py
def embed(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[Union[PoolingParams,
                                   Sequence[PoolingParams]]] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[EmbeddingRequestOutput]:
    """
    Generate an embedding vector for each prompt.

    This class automatically batches the given prompts, considering
    the memory constraint. For the best performance, put all of your prompts
    into a single list and pass it to this method.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.

    Returns:
        A list of `EmbeddingRequestOutput` objects containing the
        embedding vectors in the same order as the input prompts.
    """
    if "embed" not in self.supported_tasks:
        raise ValueError(
            "Embedding API is not supported by this model. "
            "Try converting the model using `--convert embed`.")

    items = self.encode(
        prompts,
        truncate_prompt_tokens=truncate_prompt_tokens,
        use_tqdm=use_tqdm,
        pooling_params=pooling_params,
        lora_request=lora_request,
        pooling_task="embed",
    )

    return [EmbeddingRequestOutput.from_base(item) for item in items]
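
Example (a minimal usage sketch; the embedding model name is illustrative, and depending on your vLLM version you may need additional engine arguments to select the pooling runner):

from vllm import LLM

llm = LLM(model="intfloat/e5-mistral-7b-instruct")
outputs = llm.embed(["Hello, my name is", "The capital of France is"])

for output in outputs:
    # EmbeddingOutput.embedding is a list of floats.
    print(len(output.outputs.embedding))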

encode

encode(
    prompts: Union[
        PromptType, Sequence[PromptType], DataPrompt
    ],
    pooling_params: Optional[
        Union[PoolingParams, Sequence[PoolingParams]]
    ] = None,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
    pooling_task: PoolingTask = "encode",
    tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> list[PoolingRequestOutput]

Apply pooling to the hidden states corresponding to the input prompts.

This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method.

Parameters:

Name Type Description Default
prompts Union[PromptType, Sequence[PromptType], DataPrompt]

The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt.

required
pooling_params Optional[Union[PoolingParams, Sequence[PoolingParams]]]

The pooling parameters for pooling. If None, we use the default pooling parameters.

None
use_tqdm Union[bool, Callable[..., tqdm]]

If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created.

True
lora_request Optional[Union[list[LoRARequest], LoRARequest]]

LoRA request to use for generation, if any.

None
pooling_task PoolingTask

Override the pooling task to use.

'encode'
tokenization_kwargs Optional[dict[str, Any]]

Overrides the tokenization_kwargs set in pooling_params.

None

Returns:

Type Description
list[PoolingRequestOutput]

A list of PoolingRequestOutput objects containing the pooled hidden states in the same order as the input prompts.

Note

Using prompts and prompt_token_ids as keyword parameters is considered legacy and may be deprecated in the future. You should instead pass them via the inputs parameter.

Source code in vllm/entrypoints/llm.py
def encode(
    self,
    prompts: Union[PromptType, Sequence[PromptType], DataPrompt],
    pooling_params: Optional[Union[PoolingParams,
                                   Sequence[PoolingParams]]] = None,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    pooling_task: PoolingTask = "encode",
    tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> list[PoolingRequestOutput]:
    """Apply pooling to the hidden states corresponding to the input
    prompts.

    This class automatically batches the given prompts, considering
    the memory constraint. For the best performance, put all of your prompts
    into a single list and pass it to this method.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        pooling_task: Override the pooling task to use.
        tokenization_kwargs: overrides tokenization_kwargs set in
            pooling_params

    Returns:
        A list of `PoolingRequestOutput` objects containing the
        pooled hidden states in the same order as the input prompts.

    Note:
        Using `prompts` and `prompt_token_ids` as keyword parameters is
        considered legacy and may be deprecated in the future. You should
        instead pass them via the `inputs` parameter.
    """

    if self.supported_tasks == ["encode"] and pooling_task is None:
        pooling_task = "encode"

    if pooling_task is None:
        if "embed" in self.supported_tasks:
            pooling_task = "embed"
        else:
            pooling_task = "encode"

        logger.warning_once(
            "`LLM.encode` is currently using `pooling_task = %s`.\n"
            "Please use one of the more specific methods or set the "
            "task directly when using `LLM.encode`:\n"
            "  - For embeddings, use `LLM.embed(...)` "
            "or `pooling_task=\"embed\"`.\n"
            "  - For classification logits, use `LLM.classify(...)` "
            "or `pooling_task=\"classify\"`.\n"
            "  - For rewards, use `LLM.reward(...)` "
            "or `pooling_task=\"reward\"`\n"
            "  - For similarity scores, use `LLM.score(...)`.",
            pooling_task)

    model_config = self.llm_engine.model_config
    runner_type = model_config.runner_type
    if runner_type != "pooling":
        raise ValueError(
            "LLM.encode() is only supported for pooling models. "
            "Try passing `--runner pooling` to use the model as a "
            "pooling model.")

    if pooling_task not in self.supported_tasks:
        raise ValueError(
            f"pooling_task must be one of {self.supported_tasks}.")

    if pooling_params is None:
        # Use default pooling params.
        pooling_params = PoolingParams()

    for param in as_iter(pooling_params):
        param.verify(pooling_task, model_config)
        # for backwards compatibility
        if truncate_prompt_tokens is not None:
            param.truncate_prompt_tokens = truncate_prompt_tokens

    io_processor_prompt = False
    if isinstance(prompts, dict) and "data" in prompts:
        io_processor_prompt = True
        if self.io_processor is None:
            raise ValueError(
                "No IOProcessor plugin installed. Please refer "
                "to the documentation and to the "
                "'prithvi_geospatial_mae_io_processor' "
                "offline inference example for more details.")

        # Validate the request data is valid for the loaded plugin
        validated_prompt = self.io_processor.parse_request(prompts)

        # obtain the actual model prompts from the pre-processor
        prompts = self.io_processor.pre_process(prompt=validated_prompt)

    self._validate_and_add_requests(
        prompts=prompts,
        params=pooling_params,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
    )

    outputs = self._run_engine(use_tqdm=use_tqdm)

    model_outputs = self.engine_class.validate_outputs(
        outputs, PoolingRequestOutput)

    if io_processor_prompt:
        # get the post-processed model outputs
        assert self.io_processor is not None
        processed_outputs = self.io_processor.post_process(
            model_output=model_outputs)

        return [
            PoolingRequestOutput[Any](request_id="",
                                      outputs=processed_outputs,
                                      prompt_token_ids=[],
                                      finished=True)
        ]
    else:
        return model_outputs
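
Example (a minimal usage sketch; the pooling model name is illustrative):

from vllm import LLM

llm = LLM(model="intfloat/e5-mistral-7b-instruct")

# Explicitly request the "embed" pooling task rather than relying on the
# default, as recommended by the warning emitted above.
outputs = llm.encode(["vLLM is a fast inference engine."], pooling_task="embed")

# PoolingOutput.data holds the pooled hidden states as a tensor.
print(outputs[0].outputs.data.shape)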

generate

generate(
    prompts: Union[PromptType, Sequence[PromptType]],
    sampling_params: Optional[
        Union[SamplingParams, Sequence[SamplingParams]]
    ] = None,
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
    priority: Optional[list[int]] = None,
) -> list[RequestOutput]

Generates the completions for the input prompts.

This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method.

Parameters:

Name Type Description Default
prompts Union[PromptType, Sequence[PromptType]]

The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt.

required
sampling_params Optional[Union[SamplingParams, Sequence[SamplingParams]]]

The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it is a list, the list must have the same length as the prompts and it is paired one by one with the prompt.

None
use_tqdm Union[bool, Callable[..., tqdm]]

If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created.

True
lora_request Optional[Union[list[LoRARequest], LoRARequest]]

LoRA request to use for generation, if any.

None
priority Optional[list[int]]

The priority of the requests, if any. Only applicable when priority scheduling policy is enabled.

None

Returns:

Type Description
list[RequestOutput]

A list of RequestOutput objects containing the generated completions in the same order as the input prompts.

Note

Using prompts and prompt_token_ids as keyword parameters is considered legacy and may be deprecated in the future. You should instead pass them via the inputs parameter.

Source code in vllm/entrypoints/llm.py
def generate(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    sampling_params: Optional[Union[SamplingParams,
                                    Sequence[SamplingParams]]] = None,
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    priority: Optional[list[int]] = None,
) -> list[RequestOutput]:
    """Generates the completions for the input prompts.

    This class automatically batches the given prompts, considering
    the memory constraint. For the best performance, put all of your prompts
    into a single list and pass it to this method.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        sampling_params: The sampling parameters for text generation. If
            None, we use the default sampling parameters.
            When it is a single value, it is applied to every prompt.
            When it is a list, the list must have the same length as the
            prompts and it is paired one by one with the prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        priority: The priority of the requests, if any.
            Only applicable when priority scheduling policy is enabled.

    Returns:
        A list of `RequestOutput` objects containing the
        generated completions in the same order as the input prompts.

    Note:
        Using `prompts` and `prompt_token_ids` as keyword parameters is
        considered legacy and may be deprecated in the future. You should
        instead pass them via the `inputs` parameter.
    """
    model_config = self.llm_engine.model_config
    runner_type = model_config.runner_type
    if runner_type != "generate":
        raise ValueError(
            "LLM.generate() is only supported for generative models. "
            "Try passing `--runner generate` to use the model as a "
            "generative model.")

    if sampling_params is None:
        # Use default sampling params.
        sampling_params = self.get_default_sampling_params()

    # Add any modality specific loras to the corresponding prompts
    lora_request = self._get_modality_specific_lora_reqs(
        prompts, lora_request)

    self._validate_and_add_requests(
        prompts=prompts,
        params=sampling_params,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        priority=priority,
    )

    outputs = self._run_engine(use_tqdm=use_tqdm)
    return self.engine_class.validate_outputs(outputs, RequestOutput)
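
Example (a minimal usage sketch; the model name is illustrative):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

outputs = llm.generate(["Hello, my name is", "The future of AI is"], params)
for output in outputs:
    print(output.prompt, "->", output.outputs[0].text)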

get_default_sampling_params

get_default_sampling_params() -> SamplingParams
Source code in vllm/entrypoints/llm.py
def get_default_sampling_params(self) -> SamplingParams:
    if self.default_sampling_params is None:
        self.default_sampling_params = (
            self.llm_engine.model_config.get_diff_sampling_param())
    if self.default_sampling_params:
        return SamplingParams.from_optional(**self.default_sampling_params)
    return SamplingParams()

get_metrics

get_metrics() -> list[Metric]

Return a snapshot of aggregated metrics from Prometheus.

Returns:

Type Description
list[Metric]

A list of Metric objects capturing the current state of all aggregated metrics from Prometheus.

Note

This method is only available with the V1 LLM engine.

Source code in vllm/entrypoints/llm.py
def get_metrics(self) -> list["Metric"]:
    """Return a snapshot of aggregated metrics from Prometheus.

    Returns:
        A ``MetricSnapshot`` instance capturing the current state
        of all aggregated metrics from Prometheus.

    Note:
        This method is only available with the V1 LLM engine.
    """
    return self.llm_engine.get_metrics()
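
Example (a minimal usage sketch; the model name is illustrative, and the attribute access is a simplification since concrete Metric subclasses expose different fields):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))

for metric in llm.get_metrics():
    # Counter/Gauge metrics carry a single value; histograms carry buckets.
    print(metric.name, getattr(metric, "value", None))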

get_tokenizer

get_tokenizer() -> AnyTokenizer
Source code in vllm/entrypoints/llm.py
def get_tokenizer(self) -> AnyTokenizer:
    return self.llm_engine.get_tokenizer()

preprocess_chat

preprocess_chat(
    messages: Union[
        list[ChatCompletionMessageParam],
        list[list[ChatCompletionMessageParam]],
    ],
    chat_template: Optional[str] = None,
    chat_template_content_format: ChatTemplateContentFormatOption = "auto",
    add_generation_prompt: bool = True,
    continue_final_message: bool = False,
    tools: Optional[list[dict[str, Any]]] = None,
    chat_template_kwargs: Optional[dict[str, Any]] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
) -> list[TokensPrompt]

Generate prompt for a chat conversation. The pre-processed prompt can then be used as input for the other LLM methods.

Refer to chat for a complete description of the arguments.

Returns: A list of TokensPrompt objects containing the tokenized prompts after chat template interpolation, together with the pre-processed multi-modal inputs.

Source code in vllm/entrypoints/llm.py
def preprocess_chat(
    self,
    messages: Union[list[ChatCompletionMessageParam],
                    list[list[ChatCompletionMessageParam]]],
    chat_template: Optional[str] = None,
    chat_template_content_format: ChatTemplateContentFormatOption = "auto",
    add_generation_prompt: bool = True,
    continue_final_message: bool = False,
    tools: Optional[list[dict[str, Any]]] = None,
    chat_template_kwargs: Optional[dict[str, Any]] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
) -> list[TokensPrompt]:
    """
    Generate prompt for a chat conversation. The pre-processed
    prompt can then be used as input for the other LLM methods.

    Refer to `chat` for a complete description of the arguments.
    Returns:
        A list of `TokensPrompts` objects containing the tokenized
        prompt after chat template interpolation, and the
        pre-processed multi-modal inputs.
    """
    list_of_messages: list[list[ChatCompletionMessageParam]]

    # Handle multi and single conversations
    if is_list_of(messages, list):
        # messages is list[list[...]]
        list_of_messages = cast(list[list[ChatCompletionMessageParam]],
                                messages)
    else:
        # messages is list[...]
        list_of_messages = [
            cast(list[ChatCompletionMessageParam], messages)
        ]

    tokenizer = self.get_tokenizer()
    model_config = self.llm_engine.get_model_config()
    resolved_content_format = resolve_chat_template_content_format(
        chat_template,
        tools,
        chat_template_content_format,
        tokenizer,
        model_config=model_config,
    )

    _chat_template_kwargs: dict[str, Any] = dict(
        chat_template=chat_template,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
        tools=tools,
    )
    _chat_template_kwargs.update(chat_template_kwargs or {})

    prompts: list[TokensPrompt] = []

    for msgs in list_of_messages:
        # NOTE: _parse_chat_message_content_parts() currently doesn't
        # handle mm_processor_kwargs, since there is no implementation in
        # the chat message parsing for it.
        conversation, mm_data, mm_uuids = parse_chat_messages(
            msgs,
            model_config,
            tokenizer,
            content_format=resolved_content_format,
        )

        if isinstance(tokenizer, MistralTokenizer):
            prompt_token_ids = apply_mistral_chat_template(
                tokenizer,
                messages=msgs,
                **_chat_template_kwargs,
            )
        else:
            prompt_str = apply_hf_chat_template(
                tokenizer=tokenizer,
                conversation=conversation,
                model_config=model_config,
                **_chat_template_kwargs,
            )
            # Special tokens are already included in chat templates so
            # should not be added by the tokenizer in this case.
            prompt_token_ids = tokenizer.encode(prompt_str,
                                                add_special_tokens=False)

        prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)

        if mm_data is not None:
            prompt["multi_modal_data"] = mm_data

        if mm_uuids is not None:
            prompt["multi_modal_uuids"] = mm_uuids

        if mm_processor_kwargs is not None:
            prompt["mm_processor_kwargs"] = mm_processor_kwargs

        prompts.append(prompt)

    return prompts
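
Example (a minimal usage sketch; the instruct model name is illustrative):

from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
messages = [{"role": "user", "content": "Name three prime numbers."}]

# Pre-tokenize the conversation once, then reuse the TokensPrompt inputs
# with generate() or other LLM methods.
prompts = llm.preprocess_chat(messages)
outputs = llm.generate(prompts, SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)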

reset_prefix_cache

reset_prefix_cache(device: Optional[Device] = None) -> bool
Source code in vllm/entrypoints/llm.py
def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
    return self.llm_engine.reset_prefix_cache(device)

reward

reward(
    prompts: Union[PromptType, Sequence[PromptType]],
    /,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[
        Union[PoolingParams, Sequence[PoolingParams]]
    ] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[PoolingRequestOutput]

Generate rewards for each prompt.

Parameters:

Name Type Description Default
prompts Union[PromptType, Sequence[PromptType]]

The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt.

required
use_tqdm Union[bool, Callable[..., tqdm]]

If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created.

True
lora_request Optional[Union[list[LoRARequest], LoRARequest]]

LoRA request to use for generation, if any.

None
pooling_params Optional[Union[PoolingParams, Sequence[PoolingParams]]]

The pooling parameters for pooling. If None, we use the default pooling parameters.

None

Returns: A list of PoolingRequestOutput objects containing the pooled hidden states in the same order as the input prompts.

Source code in vllm/entrypoints/llm.py
def reward(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    /,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[Union[PoolingParams,
                                   Sequence[PoolingParams]]] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[PoolingRequestOutput]:
    """
    Generate rewards for each prompt.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
    Returns:
        A list of `PoolingRequestOutput` objects containing the
        pooled hidden states in the same order as the input prompts.
    """

    return self.encode(
        prompts,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        pooling_params=pooling_params,
        truncate_prompt_tokens=truncate_prompt_tokens,
        pooling_task="encode",
    )
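
Example (a minimal usage sketch; the reward model name and the trust_remote_code flag are assumptions for illustration):

from vllm import LLM

llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True)
outputs = llm.reward(["The quick brown fox jumps over the lazy dog."])

# PoolingOutput.data holds the pooled reward tensor for the prompt.
print(outputs[0].outputs.data)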

score

score(
    data_1: Union[
        SingletonPrompt,
        Sequence[SingletonPrompt],
        ScoreMultiModalParam,
    ],
    data_2: Union[
        SingletonPrompt,
        Sequence[SingletonPrompt],
        ScoreMultiModalParam,
    ],
    /,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[ScoringRequestOutput]

Generate similarity scores for all pairs <text,text_pair> or <multi-modal data, multi-modal data pair>.

The inputs can be 1 -> 1, 1 -> N or N -> N. In the 1 -> N case the data_1 input will be replicated N times to pair with the data_2 inputs. The input pairs are used to build a list of prompts for the cross encoder model. This class automatically batches the prompts, considering the memory constraint. For the best performance, put all of your inputs into a single list and pass it to this method.

Supports both text and multi-modal data (images, etc.) when used with appropriate multi-modal models. For multi-modal inputs, ensure the prompt structure matches the model's expected input format.

Parameters:

Name Type Description Default
data_1 Union[SingletonPrompt, Sequence[SingletonPrompt], ScoreMultiModalParam]

Can be a single prompt, a list of prompts or ScoreMultiModalParam, which can contain either text or multi-modal data. When a list, it must have the same length as the data_2 list.

required
data_2 Union[SingletonPrompt, Sequence[SingletonPrompt], ScoreMultiModalParam]

The data to pair with the query to form the input to the LLM. Can be text or multi-modal data. See PromptType for more details about the format of each prompt.

required
use_tqdm Union[bool, Callable[..., tqdm]]

If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created.

True
lora_request Optional[Union[list[LoRARequest], LoRARequest]]

LoRA request to use for generation, if any.

None
pooling_params Optional[PoolingParams]

The pooling parameters for pooling. If None, we use the default pooling parameters.

None

Returns: A list of ScoringRequestOutput objects containing the generated scores in the same order as the input prompts.

Source code in vllm/entrypoints/llm.py
def score(
    self,
    data_1: Union[SingletonPrompt, Sequence[SingletonPrompt],
                  ScoreMultiModalParam],
    data_2: Union[SingletonPrompt, Sequence[SingletonPrompt],
                  ScoreMultiModalParam],
    /,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]:
    """Generate similarity scores for all pairs `<text,text_pair>` or
      `<multi-modal data, multi-modal data pair>`.

    The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
    In the `1 - N` case the `data_1` input will be replicated `N`
    times to pair with the `data_2` inputs.
    The input pairs are used to build a list of prompts for the
    cross encoder model. This class automatically batches the prompts,
    considering the memory constraint. For the best performance, put all
    of your inputs into a single list and pass it to this method.

    Supports both text and multi-modal data (images, etc.) when used with
    appropriate multi-modal models. For multi-modal inputs, ensure the
    prompt structure matches the model's expected input format.

    Args:
        data_1: Can be a single prompt, a list of prompts or
            `ScoreMultiModalParam`, which can contain either text or
            multi-modal data. When a list, it must have the same length as
            the `data_2` list.
        data_2: The data to pair with the query to form the input to
            the LLM. Can be text or multi-modal data. See [PromptType]
            [vllm.inputs.PromptType] for more details about the format of
            each prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
    Returns:
        A list of `ScoringRequestOutput` objects containing the
        generated scores in the same order as the input prompts.
    """
    model_config = self.llm_engine.model_config
    runner_type = model_config.runner_type
    if runner_type != "pooling":
        raise ValueError(
            "LLM.score() is only supported for pooling models. "
            "Try passing `--runner pooling` to use the model as a "
            "pooling model.")

    supported_tasks = self.supported_tasks
    if all(t not in supported_tasks for t in ("embed", "classify")):
        raise ValueError("Score API is not supported by this model. "
                         "Try converting the model using "
                         "`--convert embed` or `--convert classify`.")

    if (model_config.is_cross_encoder
            and getattr(model_config.hf_config, "num_labels", 0) != 1):
        raise ValueError("Score API is only enabled for num_labels == 1.")

    # the tokenizer for models such as
    # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
    # lists of tokens to the `text` and `text_pair` kwargs
    tokenizer = self.get_tokenizer()

    if not model_config.is_multimodal_model:

        def check_data_type(data: Union[SingletonPrompt,
                                        Sequence[SingletonPrompt],
                                        ScoreMultiModalParam]):
            if isinstance(data, dict) and "content" in data:
                raise ValueError("ScoreMultiModalParam is not supported "
                                 f"for {model_config.architecture}")

        check_data_type(data_1)
        check_data_type(data_2)

        def ensure_str(prompt: SingletonPrompt):
            if isinstance(prompt, dict):
                if "multi_modal_data" in prompt:
                    raise ValueError("Multi-modal prompt is not "
                                     "supported for scoring")
                elif "prompt_token_ids" in prompt:
                    prompt = tokenizer.decode(
                        cast(TokensPrompt, prompt)["prompt_token_ids"])
                elif "prompt" in prompt:
                    prompt = cast(TextPrompt, prompt)["prompt"]
            assert type(prompt) is str
            return prompt

        if isinstance(data_1, (str, dict)):
            # Convert a single prompt to a list.
            data_1 = [data_1]  # type: ignore[list-item]

        data_1 = [ensure_str(t) for t in data_1]

        if isinstance(data_2, (str, dict)):
            # Convert a single prompt to a list.
            data_2 = [data_2]  # type: ignore[list-item]

        data_2 = [ensure_str(t) for t in data_2]

    if isinstance(data_1, dict) and "content" in data_1:
        data_1 = data_1.get("content")  # type: ignore[assignment]
    elif isinstance(data_1, str):
        data_1 = [data_1]

    if isinstance(data_2, dict) and "content" in data_2:
        data_2 = data_2.get("content")  # type: ignore[assignment]
    elif isinstance(data_2, str):
        data_2 = [data_2]

    _validate_score_input_lens(data_1, data_2)  # type: ignore[arg-type]

    if model_config.is_cross_encoder:
        return self._cross_encoding_score(
            tokenizer,
            data_1,  # type: ignore[arg-type]
            data_2,  # type: ignore[arg-type]
            truncate_prompt_tokens,
            use_tqdm,
            pooling_params,
            lora_request)
    else:
        return self._embedding_score(
            tokenizer,
            data_1,  # type: ignore[arg-type]
            data_2,  # type: ignore[arg-type]
            truncate_prompt_tokens,
            use_tqdm,
            pooling_params,
            lora_request)
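
Example (illustrative sketch): scoring a query against several passages with a cross-encoder reranker. The model name follows the one referenced in the source above; whether you need to select the pooling runner explicitly depends on your vLLM version.

from vllm import LLM

# Illustrative only: any reranker / cross-encoder that vLLM can run as a
# pooling model would work here.
llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "What is the capital of France?"
passages = [
    "Paris is the capital and largest city of France.",
    "The Great Wall of China is thousands of kilometres long.",
]

# 1 -> N: the single query is replicated to pair with each passage.
outputs = llm.score(query, passages)
for passage, output in zip(passages, outputs):
    # Each ScoringRequestOutput carries one relevance score.
    print(f"{output.outputs.score:.4f}  {passage}")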

set_tokenizer

set_tokenizer(tokenizer: AnyTokenizer) -> None
Source code in vllm/entrypoints/llm.py
def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
    # While CachedTokenizer is dynamic, have no choice but
    # compare class name. Misjudgment will arise from
    # user-defined tokenizer started with 'Cached'
    if tokenizer.__class__.__name__.startswith("Cached"):
        self.llm_engine.tokenizer = tokenizer
    else:
        self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
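
Example (illustrative sketch, model name is a placeholder): replacing the engine tokenizer with one loaded manually via transformers.

from transformers import AutoTokenizer
from vllm import LLM

llm = LLM(model="facebook/opt-125m")  # placeholder model

# Swap in a tokenizer loaded manually; anything that is not already a
# Cached* tokenizer gets wrapped by get_cached_tokenizer() as shown above.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
llm.set_tokenizer(tokenizer)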

sleep

sleep(level: int = 1)

Put the engine to sleep. The engine should not process any requests. The caller should guarantee that no requests are being processed during the sleep period, before wake_up is called.

Parameters:

Name Type Description Default
level int

The sleep level. Level 1 sleep will offload the model weights and discard the kv cache. The content of kv cache is forgotten. Level 1 sleep is good for sleeping and waking up the engine to run the same model again. The model weights are backed up in CPU memory. Please make sure there's enough CPU memory to store the model weights. Level 2 sleep will discard both the model weights and the kv cache. The content of both the model weights and kv cache is forgotten. Level 2 sleep is good for sleeping and waking up the engine to run a different model or update the model, where previous model weights are not needed. It reduces CPU memory pressure.

1
Source code in vllm/entrypoints/llm.py
def sleep(self, level: int = 1):
    """
    Put the engine to sleep. The engine should not process any requests.
    The caller should guarantee that no requests are being processed
    during the sleep period, before `wake_up` is called.

    Args:
        level: The sleep level. Level 1 sleep will offload the model
            weights and discard the kv cache. The content of kv cache
            is forgotten. Level 1 sleep is good for sleeping and waking
            up the engine to run the same model again. The model weights
            are backed up in CPU memory. Please make sure there's enough
            CPU memory to store the model weights. Level 2 sleep will
            discard both the model weights and the kv cache. The content
            of both the model weights and kv cache is forgotten. Level 2
            sleep is good for sleeping and waking up the engine to run a
            different model or update the model, where previous model
            weights are not needed. It reduces CPU memory pressure.
    """
    self.reset_prefix_cache()
    self.llm_engine.sleep(level=level)

start_profile

start_profile() -> None
Source code in vllm/entrypoints/llm.py
def start_profile(self) -> None:
    self.llm_engine.start_profile()

stop_profile

stop_profile() -> None
Source code in vllm/entrypoints/llm.py
def stop_profile(self) -> None:
    self.llm_engine.stop_profile()
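
Example (illustrative sketch): wrapping a generation call between start_profile and stop_profile. It assumes the torch profiler has been enabled, e.g. through the VLLM_TORCH_PROFILER_DIR environment variable; the model name and output path are placeholders.

import os

# Assumption: the torch profiler is enabled via this environment variable
# before the engine is created (see vllm.envs); adjust the path as needed.
os.environ["VLLM_TORCH_PROFILER_DIR"] = "/tmp/vllm_profile"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")

llm.start_profile()
llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
llm.stop_profile()  # traces are written to the directory configured above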

wake_up

wake_up(tags: Optional[list[str]] = None)

Wake up the engine from sleep mode. See the sleep method for more details.

Parameters:

Name Type Description Default
tags Optional[list[str]]

An optional list of tags to reallocate the engine memory for specific memory allocations. Values must be in ("weights", "kv_cache"). If None, all memory is reallocated. wake_up should be called with all tags (or None) before the engine is used again.

None
Source code in vllm/entrypoints/llm.py
def wake_up(self, tags: Optional[list[str]] = None):
    """
    Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
    method for more details.

    Args:
        tags: An optional list of tags to reallocate the engine memory
            for specific memory allocations. Values must be in
            `("weights", "kv_cache")`. If None, all memory is reallocated.
            wake_up should be called with all tags (or None) before the
            engine is used again.
    """
    self.llm_engine.wake_up(tags)
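
Example (illustrative sketch): a sleep/wake_up cycle. It assumes the engine was created with sleep mode enabled (enable_sleep_mode=True) and that no requests are in flight while the engine is asleep; the model name is a placeholder.

from vllm import LLM, SamplingParams

# Assumption: sleep mode must be enabled when the engine is created.
llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)

llm.generate(["Hello"], SamplingParams(max_tokens=8))

# Level 1: offload weights to CPU and discard the KV cache, so the same
# model can be woken up later. Use level=2 to also drop the weights.
llm.sleep(level=1)

# ... the GPU memory is now available for other work ...

# Reallocate everything; alternatively pass tags=["weights"] first and
# tags=["kv_cache"] later, as long as all tags are restored before use.
llm.wake_up()
llm.generate(["Hello again"], SamplingParams(max_tokens=8))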

PoolingOutput dataclass

The output data of one pooling output of a request.

Parameters:

Name Type Description Default
data Tensor

The extracted hidden states.

required
Source code in vllm/outputs.py
@dataclass
class PoolingOutput:
    """The output data of one pooling output of a request.

    Args:
        data: The extracted hidden states.
    """
    data: torch.Tensor

    def __repr__(self) -> str:
        return (f"PoolingOutput(data={self.data})")

    def __eq__(self, other: object) -> bool:
        return (isinstance(other, self.__class__) and bool(
            (self.data == other.data).all()))

data instance-attribute

data: Tensor

__eq__

__eq__(other: object) -> bool
Source code in vllm/outputs.py
def __eq__(self, other: object) -> bool:
    return (isinstance(other, self.__class__) and bool(
        (self.data == other.data).all()))

__init__

__init__(data: Tensor) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return (f"PoolingOutput(data={self.data})")

PoolingParams

Bases: Struct

API parameters for pooling models.

Attributes:

Name Type Description
truncate_prompt_tokens Optional[Annotated[int, Meta(ge=-1)]]

Controls prompt truncation. Set to -1 to use the model's default truncation size. Set to k to keep only the last k tokens (left truncation). Set to None to disable truncation.

normalize Optional[bool]

Whether to normalize the embeddings outputs.

dimensions Optional[int]

Reduce the dimensionality of the embeddings if the model supports matryoshka representations.

activation Optional[bool]

Whether to apply an activation function to the classification outputs.

softmax Optional[bool]

Whether to apply softmax to the reward outputs.

Source code in vllm/pooling_params.py
class PoolingParams(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        array_like=True):  # type: ignore[call-arg]
    """API parameters for pooling models.

    Attributes:
        truncate_prompt_tokens: Controls prompt truncation.
            Set to -1 to use the model's default truncation size.
            Set to k to keep only the last k tokens (left truncation).
            Set to None to disable truncation.         
        normalize: Whether to normalize the embeddings outputs.
        dimensions: Reduce the dimensions of embeddings
            if model support matryoshka representation.
        activation: Whether to apply activation function to
            the classification outputs.
        softmax: Whether to apply softmax to the reward outputs.
    """

    # --8<-- [start:common-pooling-params]
    truncate_prompt_tokens: Optional[Annotated[int,
                                               msgspec.Meta(ge=-1)]] = None
    # --8<-- [end:common-pooling-params]

    ## for embeddings models
    # --8<-- [start:embedding-pooling-params]
    dimensions: Optional[int] = None
    normalize: Optional[bool] = None
    # --8<-- [end:embedding-pooling-params]

    ## for classification, scoring and rerank
    # --8<-- [start:classification-pooling-params]
    activation: Optional[bool] = None
    # --8<-- [end:classification-pooling-params]

    ## for reward models
    softmax: Optional[bool] = None
    step_tag_id: Optional[int] = None
    returned_token_ids: Optional[list[int]] = None

    task: Optional[PoolingTask] = None
    """Internal use only."""

    requires_token_ids: bool = False
    """Internal use only."""

    extra_kwargs: Optional[dict[str, Any]] = None
    """Internal use only."""

    output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY

    @property
    def all_parameters(self) -> list[str]:
        return [
            "dimensions", "normalize", "activation", "softmax", "step_tag_id",
            "returned_token_ids"
        ]

    @property
    def valid_parameters(self):
        return {
            "embed": ["dimensions", "normalize"],
            "classify": ["activation"],
            "score": ["activation"],
            "encode": ["softmax", "step_tag_id", "returned_token_ids"],
        }

    def clone(self) -> "PoolingParams":
        """Returns a deep copy of the PoolingParams instance."""
        return deepcopy(self)

    def verify(self,
               task: PoolingTask,
               model_config: Optional["ModelConfig"] = None) -> None:

        if self.task is None:
            self.task = task
        elif self.task != task:
            msg = f"You cannot overwrite {self.task=!r} with {task=!r}!"
            raise ValueError(msg)

        # NOTE: Task validation needs to be done against the model instance,
        # which is not available in model config. So, it's not included
        # in this method

        self._merge_default_parameters(model_config)
        self._set_default_parameters(model_config)
        self._verify_valid_parameters()

    def _merge_default_parameters(self,
                                  model_config: Optional["ModelConfig"] = None
                                  ) -> None:

        if model_config is None:
            return

        pooler_config = model_config.pooler_config
        if pooler_config is None:
            return

        assert self.task is not None, "task must be set"
        valid_parameters = self.valid_parameters[self.task]

        for k in valid_parameters:
            if getattr(pooler_config, k, None) is None:
                continue

            if getattr(self, k, None) is None:
                setattr(self, k, getattr(pooler_config, k))

    def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
        if self.task == "embed":
            if self.normalize is None:
                self.normalize = True

            if self.dimensions is not None and model_config is not None:
                if not model_config.is_matryoshka:
                    raise ValueError(
                        f'Model "{model_config.served_model_name}" does not '
                        f'support matryoshka representation, '
                        f'changing output dimensions will lead to poor results.'
                    )

                mds = model_config.matryoshka_dimensions
                if mds is not None:
                    if self.dimensions not in mds:
                        raise ValueError(
                            f'Model "{model_config.served_model_name}" '
                            f'only supports {str(mds)} matryoshka dimensions, '
                            f'use other output dimensions will '
                            f'lead to poor results.')
                elif self.dimensions < 1:
                    raise ValueError("Dimensions must be greater than 0")

        elif self.task in ["classify", "score"]:
            if self.activation is None:
                self.activation = True

        elif self.task == "encode":
            if self.softmax is None:
                self.softmax = True
        else:
            raise ValueError(f"Unknown pooling task: {self.task}")

    def _verify_valid_parameters(self):
        assert self.task is not None, "task must be set"
        valid_parameters = self.valid_parameters[self.task]
        invalid_parameters = []
        for k in self.all_parameters:
            if k in valid_parameters:
                continue

            if getattr(self, k, None) is not None:
                invalid_parameters.append(k)

        if invalid_parameters:
            raise ValueError(
                f"Task {self.task} only supports {valid_parameters} "
                f"parameters, does not support "
                f"{invalid_parameters} parameters")

    def __repr__(self) -> str:
        return (f"PoolingParams("
                f"task={self.task}, "
                f"normalize={self.normalize}, "
                f"dimensions={self.dimensions}, "
                f"activation={self.activation}, "
                f"softmax={self.softmax}, "
                f"step_tag_id={self.step_tag_id}, "
                f"returned_token_ids={self.returned_token_ids}, "
                f"requires_token_ids={self.requires_token_ids}, "
                f"extra_kwargs={self.extra_kwargs})")

    def __post_init__(self) -> None:
        assert self.output_kind == RequestOutputKind.FINAL_ONLY,\
            "For pooling output_kind has to be FINAL_ONLY"

activation class-attribute instance-attribute

activation: Optional[bool] = None

all_parameters property

all_parameters: list[str]

dimensions class-attribute instance-attribute

dimensions: Optional[int] = None

extra_kwargs class-attribute instance-attribute

extra_kwargs: Optional[dict[str, Any]] = None

Internal use only.

normalize class-attribute instance-attribute

normalize: Optional[bool] = None

output_kind class-attribute instance-attribute

requires_token_ids class-attribute instance-attribute

requires_token_ids: bool = False

Internal use only.

returned_token_ids class-attribute instance-attribute

returned_token_ids: Optional[list[int]] = None

softmax class-attribute instance-attribute

softmax: Optional[bool] = None

step_tag_id class-attribute instance-attribute

step_tag_id: Optional[int] = None

task class-attribute instance-attribute

task: Optional[PoolingTask] = None

Internal use only.

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Meta(ge=-1)]
] = None

valid_parameters property

valid_parameters

__post_init__

__post_init__() -> None
Source code in vllm/pooling_params.py
def __post_init__(self) -> None:
    assert self.output_kind == RequestOutputKind.FINAL_ONLY,\
        "For pooling output_kind has to be FINAL_ONLY"

__repr__

__repr__() -> str
Source code in vllm/pooling_params.py
def __repr__(self) -> str:
    return (f"PoolingParams("
            f"task={self.task}, "
            f"normalize={self.normalize}, "
            f"dimensions={self.dimensions}, "
            f"activation={self.activation}, "
            f"softmax={self.softmax}, "
            f"step_tag_id={self.step_tag_id}, "
            f"returned_token_ids={self.returned_token_ids}, "
            f"requires_token_ids={self.requires_token_ids}, "
            f"extra_kwargs={self.extra_kwargs})")

_merge_default_parameters

_merge_default_parameters(
    model_config: Optional[ModelConfig] = None,
) -> None
Source code in vllm/pooling_params.py
def _merge_default_parameters(self,
                              model_config: Optional["ModelConfig"] = None
                              ) -> None:

    if model_config is None:
        return

    pooler_config = model_config.pooler_config
    if pooler_config is None:
        return

    assert self.task is not None, "task must be set"
    valid_parameters = self.valid_parameters[self.task]

    for k in valid_parameters:
        if getattr(pooler_config, k, None) is None:
            continue

        if getattr(self, k, None) is None:
            setattr(self, k, getattr(pooler_config, k))

_set_default_parameters

_set_default_parameters(
    model_config: Optional[ModelConfig],
)
Source code in vllm/pooling_params.py
def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
    if self.task == "embed":
        if self.normalize is None:
            self.normalize = True

        if self.dimensions is not None and model_config is not None:
            if not model_config.is_matryoshka:
                raise ValueError(
                    f'Model "{model_config.served_model_name}" does not '
                    f'support matryoshka representation, '
                    f'changing output dimensions will lead to poor results.'
                )

            mds = model_config.matryoshka_dimensions
            if mds is not None:
                if self.dimensions not in mds:
                    raise ValueError(
                        f'Model "{model_config.served_model_name}" '
                        f'only supports {str(mds)} matryoshka dimensions, '
                        f'use other output dimensions will '
                        f'lead to poor results.')
            elif self.dimensions < 1:
                raise ValueError("Dimensions must be greater than 0")

    elif self.task in ["classify", "score"]:
        if self.activation is None:
            self.activation = True

    elif self.task == "encode":
        if self.softmax is None:
            self.softmax = True
    else:
        raise ValueError(f"Unknown pooling task: {self.task}")

_verify_valid_parameters

_verify_valid_parameters()
Source code in vllm/pooling_params.py
def _verify_valid_parameters(self):
    assert self.task is not None, "task must be set"
    valid_parameters = self.valid_parameters[self.task]
    invalid_parameters = []
    for k in self.all_parameters:
        if k in valid_parameters:
            continue

        if getattr(self, k, None) is not None:
            invalid_parameters.append(k)

    if invalid_parameters:
        raise ValueError(
            f"Task {self.task} only supports {valid_parameters} "
            f"parameters, does not support "
            f"{invalid_parameters} parameters")

clone

clone() -> PoolingParams

Returns a deep copy of the PoolingParams instance.

Source code in vllm/pooling_params.py
def clone(self) -> "PoolingParams":
    """Returns a deep copy of the PoolingParams instance."""
    return deepcopy(self)

verify

verify(
    task: PoolingTask,
    model_config: Optional[ModelConfig] = None,
) -> None
Source code in vllm/pooling_params.py
def verify(self,
           task: PoolingTask,
           model_config: Optional["ModelConfig"] = None) -> None:

    if self.task is None:
        self.task = task
    elif self.task != task:
        msg = f"You cannot overwrite {self.task=!r} with {task=!r}!"
        raise ValueError(msg)

    # NOTE: Task validation needs to be done against the model instance,
    # which is not available in model config. So, it's not included
    # in this method

    self._merge_default_parameters(model_config)
    self._set_default_parameters(model_config)
    self._verify_valid_parameters()
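
Example (illustrative sketch, model name is a placeholder): passing PoolingParams to an embedding call to control normalization and prompt truncation.

from vllm import LLM, PoolingParams

# Placeholder embedding model; any model served with the pooling runner works.
llm = LLM(model="BAAI/bge-base-en-v1.5")

# Normalize the embedding and truncate prompts to the model's maximum length;
# `dimensions` could also be set here, but only for matryoshka-capable models.
params = PoolingParams(normalize=True, truncate_prompt_tokens=-1)
outputs = llm.embed(
    ["vLLM is a high-throughput inference engine."],
    pooling_params=params,
)
print(len(outputs[0].outputs.embedding))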

PoolingRequestOutput

Bases: Generic[_O]

The output data of a pooling request to the LLM.

Parameters:

Name Type Description Default
request_id str

A unique identifier for the pooling request.

required
outputs PoolingOutput

The pooling results for the given input.

required
prompt_token_ids list[int]

A list of token IDs used in the prompt.

required
finished bool

A flag indicating whether the pooling is completed.

required
Source code in vllm/outputs.py
class PoolingRequestOutput(Generic[_O]):
    """
    The output data of a pooling request to the LLM.

    Args:
        request_id (str): A unique identifier for the pooling request.
        outputs (PoolingOutput): The pooling results for the given input.
        prompt_token_ids (list[int]): A list of token IDs used in the prompt.
        finished (bool): A flag indicating whether the pooling is completed.
    """

    def __init__(self, request_id: str, outputs: _O,
                 prompt_token_ids: list[int], finished: bool):
        self.request_id = request_id
        self.prompt_token_ids = prompt_token_ids
        self.finished = finished
        self.outputs = outputs

    def __repr__(self):
        return (f"{type(self).__name__}(request_id={self.request_id!r}, "
                f"outputs={self.outputs!r}, "
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"finished={self.finished})")

finished instance-attribute

finished = finished

outputs instance-attribute

outputs = outputs

prompt_token_ids instance-attribute

prompt_token_ids = prompt_token_ids

request_id instance-attribute

request_id = request_id

__init__

__init__(
    request_id: str,
    outputs: _O,
    prompt_token_ids: list[int],
    finished: bool,
)
Source code in vllm/outputs.py
def __init__(self, request_id: str, outputs: _O,
             prompt_token_ids: list[int], finished: bool):
    self.request_id = request_id
    self.prompt_token_ids = prompt_token_ids
    self.finished = finished
    self.outputs = outputs

__repr__

__repr__()
Source code in vllm/outputs.py
def __repr__(self):
    return (f"{type(self).__name__}(request_id={self.request_id!r}, "
            f"outputs={self.outputs!r}, "
            f"prompt_token_ids={self.prompt_token_ids}, "
            f"finished={self.finished})")

RequestOutput

The output data of a completion request to the LLM.

Parameters:

Name Type Description Default
request_id str

The unique ID of the request.

required
prompt Optional[str]

The prompt string of the request. For encoder/decoder models, this is the decoder input prompt.

required
prompt_token_ids Optional[list[int]]

The token IDs of the prompt. For encoder/decoder models, this is the decoder input prompt token ids.

required
prompt_logprobs Optional[PromptLogprobs]

The log probabilities to return per prompt token.

required
outputs list[CompletionOutput]

The output sequences of the request.

required
finished bool

Whether the whole request is finished.

required
metrics Optional[RequestMetrics]

Metrics associated with the request.

None
lora_request Optional[LoRARequest]

The LoRA request that was used to generate the output.

None
encoder_prompt Optional[str]

The encoder prompt string of the request. None if decoder-only.

None
encoder_prompt_token_ids Optional[list[int]]

The token IDs of the encoder prompt. None if decoder-only.

None
num_cached_tokens Optional[int]

The number of tokens with prefix cache hit.

None
kv_transfer_params Optional[dict[str, Any]]

The params for remote K/V transfer.

None
Source code in vllm/outputs.py
class RequestOutput:
    """The output data of a completion request to the LLM.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string of the request.
                For encoder/decoder models, this is the
                decoder input prompt.
        prompt_token_ids: The token IDs of the prompt.
                          For encoder/decoder models, this is the
                          decoder input prompt token ids.
        prompt_logprobs: The log probabilities to return per prompt token.
        outputs: The output sequences of the request.
        finished: Whether the whole request is finished.
        metrics: Metrics associated with the request.
        lora_request: The LoRA request that was used to generate the output.
        encoder_prompt: The encoder prompt string of the request.
                        None if decoder-only.
        encoder_prompt_token_ids: The token IDs of the encoder prompt.
                                  None if decoder-only.
        num_cached_tokens: The number of tokens with prefix cache hit.
        kv_transfer_params: The params for remote K/V transfer.
    """

    def __init__(
        self,
        request_id: str,
        prompt: Optional[str],
        prompt_token_ids: Optional[list[int]],
        prompt_logprobs: Optional[PromptLogprobs],
        outputs: list[CompletionOutput],
        finished: bool,
        metrics: Optional[RequestMetrics] = None,
        lora_request: Optional[LoRARequest] = None,
        encoder_prompt: Optional[str] = None,
        encoder_prompt_token_ids: Optional[list[int]] = None,
        num_cached_tokens: Optional[int] = None,
        *,
        multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None,
        kv_transfer_params: Optional[dict[str, Any]] = None,
        # Forward compatibility, code that uses args added in new release can
        # still run with older versions of vLLM without breaking.
        **kwargs: Any,
    ) -> None:
        if kwargs:
            logger.warning_once("RequestOutput: Ignoring extra arguments: %s",
                                str(kwargs))
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.multi_modal_placeholders = multi_modal_placeholders or {}
        self.prompt_logprobs = prompt_logprobs
        self.outputs = outputs
        self.finished = finished
        self.metrics = metrics
        self.lora_request = lora_request
        self.encoder_prompt = encoder_prompt
        self.encoder_prompt_token_ids = encoder_prompt_token_ids
        self.num_cached_tokens = num_cached_tokens
        self.kv_transfer_params = kv_transfer_params

    def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
        """Merge subsequent RequestOutput into this one"""

        self.finished |= next_output.finished
        self.kv_transfer_params = next_output.kv_transfer_params

        for next_completion in next_output.outputs:
            for i, completion in enumerate(self.outputs):
                if completion.index == next_completion.index:
                    if aggregate:
                        # Merge outputs with same index
                        completion.text += next_completion.text
                        if not isinstance(completion.token_ids,
                                          MutableSequence):
                            completion.token_ids = list(completion.token_ids)
                        completion.token_ids.extend(next_completion.token_ids)
                        if next_completion.logprobs:
                            assert completion.logprobs is not None
                            completion.logprobs.extend(
                                next_completion.logprobs)
                        completion.cumulative_logprob = (
                            next_completion.cumulative_logprob)
                        completion.finish_reason = next_completion.finish_reason
                        completion.stop_reason = next_completion.stop_reason
                    else:
                        # Replace the output with the new one
                        self.outputs[i] = next_completion
                    break
            else:
                self.outputs.append(next_completion)

    def __repr__(self) -> str:
        return (f"RequestOutput(request_id={self.request_id}, "
                f"prompt={self.prompt!r}, "
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"encoder_prompt={self.encoder_prompt!r}, "
                f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
                f"prompt_logprobs={self.prompt_logprobs}, "
                f"outputs={self.outputs}, "
                f"finished={self.finished}, "
                f"metrics={self.metrics}, "
                f"lora_request={self.lora_request}, "
                f"num_cached_tokens={self.num_cached_tokens}, "
                f"multi_modal_placeholders={self.multi_modal_placeholders})")

encoder_prompt instance-attribute

encoder_prompt = encoder_prompt

encoder_prompt_token_ids instance-attribute

encoder_prompt_token_ids = encoder_prompt_token_ids

finished instance-attribute

finished = finished

kv_transfer_params instance-attribute

kv_transfer_params = kv_transfer_params

lora_request instance-attribute

lora_request = lora_request

metrics instance-attribute

metrics = metrics

multi_modal_placeholders instance-attribute

multi_modal_placeholders = multi_modal_placeholders or {}

num_cached_tokens instance-attribute

num_cached_tokens = num_cached_tokens

outputs instance-attribute

outputs = outputs

prompt instance-attribute

prompt = prompt

prompt_logprobs instance-attribute

prompt_logprobs = prompt_logprobs

prompt_token_ids instance-attribute

prompt_token_ids = prompt_token_ids

request_id instance-attribute

request_id = request_id

__init__

__init__(
    request_id: str,
    prompt: Optional[str],
    prompt_token_ids: Optional[list[int]],
    prompt_logprobs: Optional[PromptLogprobs],
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: Optional[RequestMetrics] = None,
    lora_request: Optional[LoRARequest] = None,
    encoder_prompt: Optional[str] = None,
    encoder_prompt_token_ids: Optional[list[int]] = None,
    num_cached_tokens: Optional[int] = None,
    *,
    multi_modal_placeholders: Optional[
        MultiModalPlaceholderDict
    ] = None,
    kv_transfer_params: Optional[dict[str, Any]] = None,
    **kwargs: Any,
) -> None
Source code in vllm/outputs.py
def __init__(
    self,
    request_id: str,
    prompt: Optional[str],
    prompt_token_ids: Optional[list[int]],
    prompt_logprobs: Optional[PromptLogprobs],
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: Optional[RequestMetrics] = None,
    lora_request: Optional[LoRARequest] = None,
    encoder_prompt: Optional[str] = None,
    encoder_prompt_token_ids: Optional[list[int]] = None,
    num_cached_tokens: Optional[int] = None,
    *,
    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None,
    kv_transfer_params: Optional[dict[str, Any]] = None,
    # Forward compatibility, code that uses args added in new release can
    # still run with older versions of vLLM without breaking.
    **kwargs: Any,
) -> None:
    if kwargs:
        logger.warning_once("RequestOutput: Ignoring extra arguments: %s",
                            str(kwargs))
    self.request_id = request_id
    self.prompt = prompt
    self.prompt_token_ids = prompt_token_ids
    self.multi_modal_placeholders = multi_modal_placeholders or {}
    self.prompt_logprobs = prompt_logprobs
    self.outputs = outputs
    self.finished = finished
    self.metrics = metrics
    self.lora_request = lora_request
    self.encoder_prompt = encoder_prompt
    self.encoder_prompt_token_ids = encoder_prompt_token_ids
    self.num_cached_tokens = num_cached_tokens
    self.kv_transfer_params = kv_transfer_params

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return (f"RequestOutput(request_id={self.request_id}, "
            f"prompt={self.prompt!r}, "
            f"prompt_token_ids={self.prompt_token_ids}, "
            f"encoder_prompt={self.encoder_prompt!r}, "
            f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
            f"prompt_logprobs={self.prompt_logprobs}, "
            f"outputs={self.outputs}, "
            f"finished={self.finished}, "
            f"metrics={self.metrics}, "
            f"lora_request={self.lora_request}, "
            f"num_cached_tokens={self.num_cached_tokens}, "
            f"multi_modal_placeholders={self.multi_modal_placeholders})")

add

add(next_output: RequestOutput, aggregate: bool) -> None

Merge subsequent RequestOutput into this one

Source code in vllm/outputs.py
def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
    """Merge subsequent RequestOutput into this one"""

    self.finished |= next_output.finished
    self.kv_transfer_params = next_output.kv_transfer_params

    for next_completion in next_output.outputs:
        for i, completion in enumerate(self.outputs):
            if completion.index == next_completion.index:
                if aggregate:
                    # Merge outputs with same index
                    completion.text += next_completion.text
                    if not isinstance(completion.token_ids,
                                      MutableSequence):
                        completion.token_ids = list(completion.token_ids)
                    completion.token_ids.extend(next_completion.token_ids)
                    if next_completion.logprobs:
                        assert completion.logprobs is not None
                        completion.logprobs.extend(
                            next_completion.logprobs)
                    completion.cumulative_logprob = (
                        next_completion.cumulative_logprob)
                    completion.finish_reason = next_completion.finish_reason
                    completion.stop_reason = next_completion.stop_reason
                else:
                    # Replace the output with the new one
                    self.outputs[i] = next_completion
                break
        else:
            self.outputs.append(next_completion)
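
Example (illustrative sketch, model name is a placeholder): consuming the RequestOutput objects returned by LLM.generate.

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model

results = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=16),
)

for request_output in results:
    # One RequestOutput per prompt; each holds `n` CompletionOutputs.
    print(request_output.request_id, request_output.finished)
    for completion in request_output.outputs:
        print(completion.index, repr(completion.text), completion.finish_reason)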

SamplingParams

Bases: Struct

Sampling parameters for text generation.

Overall, we follow the sampling parameters from the OpenAI text completion API (https://platform.openai.com/docs/api-reference/completions/create). In addition, we support beam search, which is not supported by OpenAI.
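
Example (illustrative sketch, values are arbitrary): constructing SamplingParams for sampled and greedy decoding.

from vllm import SamplingParams

params = SamplingParams(
    n=2,              # return two completions per prompt
    temperature=0.8,
    top_p=0.95,
    max_tokens=64,
    stop=["\n\n"],    # stop strings are not included in the returned text
    seed=1234,        # make sampling reproducible
)

# Zero temperature selects greedy sampling; top_p/top_k/min_p are reset
# internally in __post_init__ (see the source below).
greedy = SamplingParams(temperature=0.0, max_tokens=32)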

Source code in vllm/sampling_params.py
class SamplingParams(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        # required for @cached_property.
        dict=True):  # type: ignore[call-arg]
    """Sampling parameters for text generation.

    Overall, we follow the sampling parameters from the OpenAI text completion
    API (https://platform.openai.com/docs/api-reference/completions/create).
    In addition, we support beam search, which is not supported by OpenAI.
    """

    n: int = 1
    """Number of outputs to return for the given prompt request.

    NOTE:
        `AsyncLLM` streams outputs by default. When `n > 1`, all `n` outputs
        are generated and streamed cumulatively per request. To see all `n`
        outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY`
        in `SamplingParams`."""
    best_of: Optional[int] = None
    """Number of output sequences that are generated from the prompt. From
    these `best_of` sequences, the top `n` sequences are returned. `best_of`
    must be greater than or equal to `n`. By default, `best_of` is set to `n`.
    Warning, this is only supported in V0."""
    _real_n: Optional[int] = None
    presence_penalty: float = 0.0
    """Penalizes new tokens based on whether they appear in the generated text
    so far. Values > 0 encourage the model to use new tokens, while values < 0
    encourage the model to repeat tokens."""
    frequency_penalty: float = 0.0
    """Penalizes new tokens based on their frequency in the generated text so
    far. Values > 0 encourage the model to use new tokens, while values < 0
    encourage the model to repeat tokens."""
    repetition_penalty: float = 1.0
    """Penalizes new tokens based on whether they appear in the prompt and the
    generated text so far. Values > 1 encourage the model to use new tokens,
    while values < 1 encourage the model to repeat tokens."""
    temperature: float = 1.0
    """Controls the randomness of the sampling. Lower values make the model
    more deterministic, while higher values make the model more random. Zero
    means greedy sampling."""
    top_p: float = 1.0
    """Controls the cumulative probability of the top tokens to consider. Must
    be in (0, 1]. Set to 1 to consider all tokens."""
    top_k: int = 0
    """Controls the number of top tokens to consider. Set to 0 (or -1) to
    consider all tokens."""
    min_p: float = 0.0
    """Represents the minimum probability for a token to be considered,
    relative to the probability of the most likely token. Must be in [0, 1].
    Set to 0 to disable this."""
    seed: Optional[int] = None
    """Random seed to use for the generation."""
    stop: Optional[Union[str, list[str]]] = None
    """String(s) that stop the generation when they are generated. The returned
    output will not contain the stop strings."""
    stop_token_ids: Optional[list[int]] = None
    """Token IDs that stop the generation when they are generated. The returned
    output will contain the stop tokens unless the stop tokens are special
    tokens."""
    ignore_eos: bool = False
    """Whether to ignore the EOS token and continue generating
    tokens after the EOS token is generated."""
    max_tokens: Optional[int] = 16
    """Maximum number of tokens to generate per output sequence."""
    min_tokens: int = 0
    """Minimum number of tokens to generate per output sequence before EOS or
    `stop_token_ids` can be generated"""
    logprobs: Optional[int] = None
    """Number of log probabilities to return per output token. When set to
    `None`, no probability is returned. If set to a non-`None` value, the
    result includes the log probabilities of the specified number of most
    likely tokens, as well as the chosen tokens. Note that the implementation
    follows the OpenAI API: The API will always return the log probability of
    the sampled token, so there may be up to `logprobs+1` elements in the
    response. When set to -1, return all `vocab_size` log probabilities."""
    prompt_logprobs: Optional[int] = None
    """Number of log probabilities to return per prompt token.
    When set to -1, return all `vocab_size` log probabilities."""
    # NOTE: This parameter is only exposed at the engine level for now.
    # It is not exposed in the OpenAI API server, as the OpenAI API does
    # not support returning only a list of token IDs.
    detokenize: bool = True
    """Whether to detokenize the output."""
    skip_special_tokens: bool = True
    """Whether to skip special tokens in the output."""
    spaces_between_special_tokens: bool = True
    """Whether to add spaces between special tokens in the output."""
    # Optional[list[LogitsProcessor]] type. We use Any here because
    # Optional[list[LogitsProcessor]] type is not supported by msgspec.
    logits_processors: Optional[Any] = None
    """Functions that modify logits based on previously generated tokens, and
    optionally prompt tokens as a first argument."""
    include_stop_str_in_output: bool = False
    """Whether to include the stop strings in output text."""
    truncate_prompt_tokens: Optional[Annotated[int,
                                               msgspec.Meta(ge=-1)]] = None
    """If set to -1, will use the truncation size supported by the model. If
    set to an integer k, will use only the last k tokens from the prompt
    (i.e., left truncation). If set to `None`, truncation is disabled."""
    output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE

    # The below fields are not supposed to be used as an input.
    # They are set in post_init.
    output_text_buffer_length: int = 0
    _all_stop_token_ids: set[int] = msgspec.field(default_factory=set)

    # Fields used to construct logits processors
    structured_outputs: Optional[StructuredOutputsParams] = None
    """Parameters for configuring structured outputs."""
    guided_decoding: Optional[GuidedDecodingParams] = None
    """Deprecated alias for structured_outputs."""
    logit_bias: Optional[dict[int, float]] = None
    """If provided, the engine will construct a logits processor that applies
    these logit biases."""
    allowed_token_ids: Optional[list[int]] = None
    """If provided, the engine will construct a logits processor which only
    retains scores for the given token ids."""
    extra_args: Optional[dict[str, Any]] = None
    """Arbitrary additional args, that can be used by custom sampling
    implementations, plugins, etc. Not used by any in-tree sampling
    implementations."""

    # Fields used for bad words
    bad_words: Optional[list[str]] = None
    """Words that are not allowed to be generated. More precisely, only the
    last token of a corresponding token sequence is not allowed when the next
    generated token can complete the sequence."""
    _bad_words_token_ids: Optional[list[list[int]]] = None

    @staticmethod
    def from_optional(
        n: Optional[int] = 1,
        best_of: Optional[int] = None,
        presence_penalty: Optional[float] = 0.0,
        frequency_penalty: Optional[float] = 0.0,
        repetition_penalty: Optional[float] = 1.0,
        temperature: Optional[float] = 1.0,
        top_p: Optional[float] = 1.0,
        top_k: int = 0,
        min_p: float = 0.0,
        seed: Optional[int] = None,
        stop: Optional[Union[str, list[str]]] = None,
        stop_token_ids: Optional[list[int]] = None,
        bad_words: Optional[list[str]] = None,
        include_stop_str_in_output: bool = False,
        ignore_eos: bool = False,
        max_tokens: Optional[int] = 16,
        min_tokens: int = 0,
        logprobs: Optional[int] = None,
        prompt_logprobs: Optional[int] = None,
        detokenize: bool = True,
        skip_special_tokens: bool = True,
        spaces_between_special_tokens: bool = True,
        logits_processors: Optional[list[LogitsProcessor]] = None,
        truncate_prompt_tokens: Optional[Annotated[int,
                                                   msgspec.Meta(
                                                       ge=-1)]] = None,
        output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
        structured_outputs: Optional[StructuredOutputsParams] = None,
        guided_decoding: Optional[GuidedDecodingParams] = None,
        logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None,
        allowed_token_ids: Optional[list[int]] = None,
        extra_args: Optional[dict[str, Any]] = None,
    ) -> "SamplingParams":
        if logit_bias is not None:
            # Convert token_id to integer
            # Clamp the bias between -100 and 100 per OpenAI API spec
            logit_bias = {
                int(token): min(100.0, max(-100.0, bias))
                for token, bias in logit_bias.items()
            }
        if guided_decoding is not None:
            warnings.warn(
                "guided_decoding is deprecated. This will be removed in "
                "v0.12.0 or v1.0.0, which ever is soonest. Please use "
                "structured_outputs instead.",
                DeprecationWarning,
                stacklevel=2)
            structured_outputs = guided_decoding
            guided_decoding = None

        return SamplingParams(
            n=1 if n is None else n,
            best_of=best_of,
            presence_penalty=0.0
            if presence_penalty is None else presence_penalty,
            frequency_penalty=0.0
            if frequency_penalty is None else frequency_penalty,
            repetition_penalty=1.0
            if repetition_penalty is None else repetition_penalty,
            temperature=1.0 if temperature is None else temperature,
            top_p=1.0 if top_p is None else top_p,
            top_k=top_k,
            min_p=min_p,
            seed=seed,
            stop=stop,
            stop_token_ids=stop_token_ids,
            bad_words=bad_words,
            include_stop_str_in_output=include_stop_str_in_output,
            ignore_eos=ignore_eos,
            max_tokens=max_tokens,
            min_tokens=min_tokens,
            logprobs=logprobs,
            prompt_logprobs=prompt_logprobs,
            detokenize=detokenize,
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
            logits_processors=logits_processors,
            truncate_prompt_tokens=truncate_prompt_tokens,
            output_kind=output_kind,
            structured_outputs=structured_outputs,
            logit_bias=logit_bias,
            allowed_token_ids=allowed_token_ids,
            extra_args=extra_args,
        )

    def __post_init__(self) -> None:
        # how we deal with `best_of`:
        # if `best_of` is not set, we default to `n`;
        # if `best_of` is set, we set `n` to `best_of`,
        # and set `_real_n` to the original `n`.
        # when we return the result, we will check
        # if we need to return `n` or `_real_n` results
        if self.best_of:
            if self.best_of < self.n:
                raise ValueError(
                    f"best_of must be greater than or equal to n, "
                    f"got n={self.n} and best_of={self.best_of}.")
            if not self._real_n:
                self._real_n = self.n
                self.n = self.best_of

        if 0 < self.temperature < _MAX_TEMP:
            logger.warning(
                "temperature %s is less than %s, which may cause numerical "
                "errors nan or inf in tensors. We have maxed it out to %s.",
                self.temperature, _MAX_TEMP, _MAX_TEMP)
            self.temperature = max(self.temperature, _MAX_TEMP)

        if self.seed == -1:
            self.seed = None

        if self.stop is None:
            self.stop = []
        elif isinstance(self.stop, str):
            self.stop = [self.stop]

        if self.stop_token_ids is None:
            self.stop_token_ids = []

        if self.bad_words is None:
            self.bad_words = []

        if self.logprobs is True:
            self.logprobs = 1

        if self.prompt_logprobs is True:
            self.prompt_logprobs = 1

        # Number of characters to hold back for stop string evaluation
        # until sequence is finished.
        if self.stop and not self.include_stop_str_in_output:
            self.output_text_buffer_length = max(len(s) for s in self.stop) - 1

        self._verify_args()

        if self.temperature < _SAMPLING_EPS:
            # Zero temperature means greedy sampling.
            self.top_p = 1.0
            self.top_k = 0
            self.min_p = 0.0
            self._verify_greedy_sampling()

        # eos_token_id is added to this by the engine
        self._all_stop_token_ids.update(self.stop_token_ids)

        if self.guided_decoding is not None:
            warnings.warn(
                "guided_decoding is deprecated. This will be removed in "
                "v0.12.0 or v1.0.0, which ever is soonest. Please use "
                "structured_outputs instead.",
                DeprecationWarning,
                stacklevel=2)
            self.structured_outputs = self.guided_decoding
            self.guided_decoding = None

    def _verify_args(self) -> None:
        if not isinstance(self.n, int):
            raise ValueError(f"n must be an int, but is of "
                             f"type {type(self.n)}")
        if self.n < 1:
            raise ValueError(f"n must be at least 1, got {self.n}.")
        if self.best_of is not None:
            if not isinstance(self.best_of, int):
                raise ValueError(
                    f"best_of must be an integer, got {type(self.best_of)}")
            if self.best_of < 1:
                raise ValueError(
                    f"best_of must be at least 1, got {self.best_of}")
            if self.best_of < self.n:
                raise ValueError(
                    f"best_of must be greater than or equal to n, "
                    f"got n={self.n} and best_of={self.best_of}.")
        if not -2.0 <= self.presence_penalty <= 2.0:
            raise ValueError("presence_penalty must be in [-2, 2], got "
                             f"{self.presence_penalty}.")
        if not -2.0 <= self.frequency_penalty <= 2.0:
            raise ValueError("frequency_penalty must be in [-2, 2], got "
                             f"{self.frequency_penalty}.")
        if self.repetition_penalty <= 0.0:
            raise ValueError(
                "repetition_penalty must be greater than zero, got "
                f"{self.repetition_penalty}.")
        if self.temperature < 0.0:
            raise ValueError(
                f"temperature must be non-negative, got {self.temperature}.")
        if not 0.0 < self.top_p <= 1.0:
            raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
        # quietly accept -1 as disabled, but prefer 0
        if self.top_k < -1:
            raise ValueError(f"top_k must be 0 (disable), or at least 1, "
                             f"got {self.top_k}.")
        if not isinstance(self.top_k, int):
            raise TypeError(
                f"top_k must be an integer, got {type(self.top_k).__name__}")
        if not 0.0 <= self.min_p <= 1.0:
            raise ValueError("min_p must be in [0, 1], got "
                             f"{self.min_p}.")
        if self.max_tokens is not None and self.max_tokens < 1:
            raise ValueError(
                f"max_tokens must be at least 1, got {self.max_tokens}.")
        if self.min_tokens < 0:
            raise ValueError(f"min_tokens must be greater than or equal to 0, "
                             f"got {self.min_tokens}.")
        if self.max_tokens is not None and self.min_tokens > self.max_tokens:
            raise ValueError(
                f"min_tokens must be less than or equal to "
                f"max_tokens={self.max_tokens}, got {self.min_tokens}.")
        if (self.logprobs is not None and self.logprobs != -1
                and self.logprobs < 0):
            raise ValueError(
                f"logprobs must be non-negative or -1, got {self.logprobs}.")
        if (self.prompt_logprobs is not None and self.prompt_logprobs != -1
                and self.prompt_logprobs < 0):
            raise ValueError(
                f"prompt_logprobs must be non-negative or -1, got "
                f"{self.prompt_logprobs}.")
        if (self.truncate_prompt_tokens is not None
                and (self.truncate_prompt_tokens == 0
                     or self.truncate_prompt_tokens < -1)):
            raise ValueError(
                f"truncate_prompt_tokens must be an integer >= 1 or -1, "
                f"got {self.truncate_prompt_tokens}")
        assert isinstance(self.stop_token_ids, list)
        if not all(isinstance(st_id, int) for st_id in self.stop_token_ids):
            raise ValueError(f"stop_token_ids must contain only integers, "
                             f"got {self.stop_token_ids}.")
        assert isinstance(self.stop, list)
        if any(not stop_str for stop_str in self.stop):
            raise ValueError("stop cannot contain an empty string.")
        if self.stop and not self.detokenize:
            raise ValueError(
                "stop strings are only supported when detokenize is True. "
                "Set detokenize=True to use stop.")
        if self.best_of != self._real_n and self.output_kind == (
                RequestOutputKind.DELTA):
            raise ValueError("best_of must equal n to use output_kind=DELTA")

    def _verify_greedy_sampling(self) -> None:
        if self.n > 1:
            raise ValueError("n must be 1 when using greedy sampling, "
                             f"got {self.n}.")

    def update_from_generation_config(
            self,
            generation_config: dict[str, Any],
            model_eos_token_id: Optional[int] = None) -> None:
        """Update if there are non-default values from generation_config"""

        if model_eos_token_id is not None:
            # Add the eos token id into the sampling_params to support
            # min_tokens processing.
            self._all_stop_token_ids.add(model_eos_token_id)

        # Update eos_token_id for generation
        if (eos_ids := generation_config.get("eos_token_id")) is not None:
            # it can be either int or list of int
            eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
            if model_eos_token_id is not None:
                # We don't need to include the primary eos_token_id in
                # stop_token_ids since it's handled separately for stopping
                # purposes.
                eos_ids.discard(model_eos_token_id)
            if eos_ids:
                self._all_stop_token_ids.update(eos_ids)
                if not self.ignore_eos:
                    eos_ids.update(self.stop_token_ids)
                    self.stop_token_ids = list(eos_ids)

    def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None:
        if not self.bad_words:
            return
        self._bad_words_token_ids = []
        for bad_word in self.bad_words:
            # To prohibit words both at the beginning
            # and in the middle of text
            # (related to add_prefix_space tokenizer parameter)
            for add_prefix_space in [False, True]:
                prefix = " " if add_prefix_space else ""
                prompt = prefix + bad_word.lstrip()
                prompt_token_ids = tokenizer.encode(text=prompt,
                                                    add_special_tokens=False)

                # If no space at the beginning
                # or if prefix space produces a new word token
                if (not add_prefix_space) or (
                        add_prefix_space and prompt_token_ids[0]
                        != self._bad_words_token_ids[-1][0]
                        and len(prompt_token_ids) == len(
                            self._bad_words_token_ids[-1])):
                    self._bad_words_token_ids.append(prompt_token_ids)

        invalid_token_ids = [
            token_id for bad_words_token_ids in self._bad_words_token_ids
            for token_id in bad_words_token_ids
            if token_id < 0 or token_id > tokenizer.max_token_id
        ]
        if len(invalid_token_ids) > 0:
            raise ValueError(
                f"The model vocabulary size is {tokenizer.max_token_id+1},"
                f" but the following tokens"
                f" were specified as bad: {invalid_token_ids}."
                f" All token id values should be integers satisfying:"
                f" 0 <= token_id <= {tokenizer.max_token_id}.")

    @cached_property
    def sampling_type(self) -> SamplingType:
        if self.temperature < _SAMPLING_EPS:
            return SamplingType.GREEDY
        if self.seed is not None:
            return SamplingType.RANDOM_SEED
        return SamplingType.RANDOM

    @property
    def all_stop_token_ids(self) -> set[int]:
        return self._all_stop_token_ids

    @property
    def bad_words_token_ids(self) -> Optional[list[list[int]]]:
        # For internal use only. Backward compatibility not guaranteed
        return self._bad_words_token_ids

    def clone(self) -> "SamplingParams":
        """Deep copy, but maybe not the LogitsProcessor objects.

        LogitsProcessor objects may contain an arbitrary, nontrivial amount of
        data that is expensive to copy. However, if not copied, the processor
        needs to support parallel decoding for multiple sequences
        See https://github.com/vllm-project/vllm/issues/3087
        """

        logit_processor_refs = None if self.logits_processors is None else {
            id(lp): lp.clone() if hasattr(lp, 'clone') else lp
            for lp in self.logits_processors
        }
        return copy.deepcopy(self, memo=logit_processor_refs)

    def __repr__(self) -> str:
        return (
            f"SamplingParams(n={self.n}, "
            f"presence_penalty={self.presence_penalty}, "
            f"frequency_penalty={self.frequency_penalty}, "
            f"repetition_penalty={self.repetition_penalty}, "
            f"temperature={self.temperature}, "
            f"top_p={self.top_p}, "
            f"top_k={self.top_k}, "
            f"min_p={self.min_p}, "
            f"seed={self.seed}, "
            f"stop={self.stop}, "
            f"stop_token_ids={self.stop_token_ids}, "
            f"bad_words={self.bad_words}, "
            f"include_stop_str_in_output={self.include_stop_str_in_output}, "
            f"ignore_eos={self.ignore_eos}, "
            f"max_tokens={self.max_tokens}, "
            f"min_tokens={self.min_tokens}, "
            f"logprobs={self.logprobs}, "
            f"prompt_logprobs={self.prompt_logprobs}, "
            f"skip_special_tokens={self.skip_special_tokens}, "
            "spaces_between_special_tokens="
            f"{self.spaces_between_special_tokens}, "
            f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
            f"structured_outputs={self.structured_outputs}, "
            f"extra_args={self.extra_args})")

_all_stop_token_ids class-attribute instance-attribute

_all_stop_token_ids: set[int] = field(default_factory=set)

_bad_words_token_ids class-attribute instance-attribute

_bad_words_token_ids: Optional[list[list[int]]] = None

_real_n class-attribute instance-attribute

_real_n: Optional[int] = None

all_stop_token_ids property

all_stop_token_ids: set[int]

allowed_token_ids class-attribute instance-attribute

allowed_token_ids: Optional[list[int]] = None

If provided, the engine will construct a logits processor which only retains scores for the given token ids.

bad_words class-attribute instance-attribute

bad_words: Optional[list[str]] = None

Words that are not allowed to be generated. More precisely, only the last token of a corresponding token sequence is not allowed when the next generated token can complete the sequence.

bad_words_token_ids property

bad_words_token_ids: Optional[list[list[int]]]

best_of class-attribute instance-attribute

best_of: Optional[int] = None

Number of output sequences that are generated from the prompt. From these best_of sequences, the top n sequences are returned. best_of must be greater than or equal to n. By default, best_of is set to n. Warning: this is only supported in V0.

detokenize class-attribute instance-attribute

detokenize: bool = True

Whether to detokenize the output.

extra_args class-attribute instance-attribute

extra_args: Optional[dict[str, Any]] = None

Arbitrary additional args, that can be used by custom sampling implementations, plugins, etc. Not used by any in-tree sampling implementations.

frequency_penalty class-attribute instance-attribute

frequency_penalty: float = 0.0

Penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens.

guided_decoding class-attribute instance-attribute

guided_decoding: Optional[GuidedDecodingParams] = None

Deprecated alias for structured_outputs.

ignore_eos class-attribute instance-attribute

ignore_eos: bool = False

Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.

include_stop_str_in_output class-attribute instance-attribute

include_stop_str_in_output: bool = False

Whether to include the stop strings in output text.

logit_bias class-attribute instance-attribute

logit_bias: Optional[dict[int, float]] = None

If provided, the engine will construct a logits processor that applies these logit biases.

logits_processors class-attribute instance-attribute

logits_processors: Optional[Any] = None

Functions that modify logits based on previously generated tokens, and optionally prompt tokens as a first argument.

logprobs class-attribute instance-attribute

logprobs: Optional[int] = None

Number of log probabilities to return per output token. When set to None, no probability is returned. If set to a non-None value, the result includes the log probabilities of the specified number of most likely tokens, as well as the chosen tokens. Note that the implementation follows the OpenAI API: The API will always return the log probability of the sampled token, so there may be up to logprobs+1 elements in the response. When set to -1, return all vocab_size log probabilities.

max_tokens class-attribute instance-attribute

max_tokens: Optional[int] = 16

Maximum number of tokens to generate per output sequence.

min_p class-attribute instance-attribute

min_p: float = 0.0

Represents the minimum probability for a token to be considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable this.

min_tokens class-attribute instance-attribute

min_tokens: int = 0

Minimum number of tokens to generate per output sequence before EOS or stop_token_ids can be generated.

n class-attribute instance-attribute

n: int = 1

Number of outputs to return for the given prompt request.

NOTE

AsyncLLM streams outputs by default. When n > 1, all n outputs are generated and streamed cumulatively per request. To see all n outputs upon completion, use output_kind=RequestOutputKind.FINAL_ONLY in SamplingParams.
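
The note above can be followed with a plain SamplingParams instance; a minimal sketch in which the vllm.sampling_params import path for RequestOutputKind is assumed and the values are illustrative:

from vllm import SamplingParams
from vllm.sampling_params import RequestOutputKind

# Ask for three parallel samples per request; with AsyncLLM streaming, deliver
# the outputs only once the request has finished instead of cumulatively.
params = SamplingParams(
    n=3,
    temperature=0.8,
    max_tokens=64,
    output_kind=RequestOutputKind.FINAL_ONLY,
)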

output_kind class-attribute instance-attribute

output_kind: RequestOutputKind = CUMULATIVE

output_text_buffer_length class-attribute instance-attribute

output_text_buffer_length: int = 0

presence_penalty class-attribute instance-attribute

presence_penalty: float = 0.0

Penalizes new tokens based on whether they appear in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens.

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[int] = None

Number of log probabilities to return per prompt token. When set to -1, return all vocab_size log probabilities.

repetition_penalty class-attribute instance-attribute

repetition_penalty: float = 1.0

Penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > 1 encourage the model to use new tokens, while values < 1 encourage the model to repeat tokens.

sampling_type cached property

sampling_type: SamplingType
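
The sampling type follows directly from temperature and seed, as in the class source shown earlier; a small sketch, assuming SamplingType is importable from vllm.sampling_params:

from vllm import SamplingParams
from vllm.sampling_params import SamplingType

assert SamplingParams(temperature=0.0).sampling_type == SamplingType.GREEDY  # zero temperature
assert SamplingParams(seed=42).sampling_type == SamplingType.RANDOM_SEED     # seeded sampling
assert SamplingParams().sampling_type == SamplingType.RANDOM                 # default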

seed class-attribute instance-attribute

seed: Optional[int] = None

Random seed to use for the generation.

skip_special_tokens class-attribute instance-attribute

skip_special_tokens: bool = True

Whether to skip special tokens in the output.

spaces_between_special_tokens class-attribute instance-attribute

spaces_between_special_tokens: bool = True

Whether to add spaces between special tokens in the output.

stop class-attribute instance-attribute

stop: Optional[Union[str, list[str]]] = None

String(s) that stop the generation when they are generated. The returned output will not contain the stop strings.

stop_token_ids class-attribute instance-attribute

stop_token_ids: Optional[list[int]] = None

Token IDs that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens.

structured_outputs class-attribute instance-attribute

structured_outputs: Optional[StructuredOutputsParams] = None

Parameters for configuring structured outputs.

temperature class-attribute instance-attribute

temperature: float = 1.0

Controls the randomness of the sampling. Lower values make the model more deterministic, while higher values make the model more random. Zero means greedy sampling.

top_k class-attribute instance-attribute

top_k: int = 0

Controls the number of top tokens to consider. Set to 0 (or -1) to consider all tokens.

top_p class-attribute instance-attribute

top_p: float = 1.0

Controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens.

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Meta(ge=-1)]
] = None

If set to -1, will use the truncation size supported by the model. If set to an integer k, will use only the last k tokens from the prompt (i.e., left truncation). If set to None, truncation is disabled.
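
Putting several of the fields above together; a minimal end-to-end sketch in which the model name is a placeholder and the generated text depends on the model:

from vllm import LLM, SamplingParams

params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
    max_tokens=128,
    min_tokens=8,
    stop=["\n\n"],          # stop strings are stripped from the returned text
    logprobs=5,             # also return the 5 most likely tokens per step
    seed=1234,
)

llm = LLM(model="facebook/opt-125m")  # placeholder model
outputs = llm.generate(["Write a haiku about the sea:"], params)
completion = outputs[0].outputs[0]
print(completion.text)
print(completion.logprobs)  # per-token logprob information when logprobs is set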

__post_init__

__post_init__() -> None
Source code in vllm/sampling_params.py
def __post_init__(self) -> None:
    # how we deal with `best_of``:
    # if `best_of`` is not set, we default to `n`;
    # if `best_of`` is set, we set `n`` to `best_of`,
    # and set `_real_n`` to the original `n`.
    # when we return the result, we will check
    # if we need to return `n` or `_real_n` results
    if self.best_of:
        if self.best_of < self.n:
            raise ValueError(
                f"best_of must be greater than or equal to n, "
                f"got n={self.n} and best_of={self.best_of}.")
        if not self._real_n:
            self._real_n = self.n
            self.n = self.best_of

    if 0 < self.temperature < _MAX_TEMP:
        logger.warning(
            "temperature %s is less than %s, which may cause numerical "
            "errors nan or inf in tensors. We have maxed it out to %s.",
            self.temperature, _MAX_TEMP, _MAX_TEMP)
        self.temperature = max(self.temperature, _MAX_TEMP)

    if self.seed == -1:
        self.seed = None

    if self.stop is None:
        self.stop = []
    elif isinstance(self.stop, str):
        self.stop = [self.stop]

    if self.stop_token_ids is None:
        self.stop_token_ids = []

    if self.bad_words is None:
        self.bad_words = []

    if self.logprobs is True:
        self.logprobs = 1

    if self.prompt_logprobs is True:
        self.prompt_logprobs = 1

    # Number of characters to hold back for stop string evaluation
    # until sequence is finished.
    if self.stop and not self.include_stop_str_in_output:
        self.output_text_buffer_length = max(len(s) for s in self.stop) - 1

    self._verify_args()

    if self.temperature < _SAMPLING_EPS:
        # Zero temperature means greedy sampling.
        self.top_p = 1.0
        self.top_k = 0
        self.min_p = 0.0
        self._verify_greedy_sampling()

    # eos_token_id is added to this by the engine
    self._all_stop_token_ids.update(self.stop_token_ids)

    if self.guided_decoding is not None:
        warnings.warn(
            "guided_decoding is deprecated. This will be removed in "
            "v0.12.0 or v1.0.0, which ever is soonest. Please use "
            "structured_outputs instead.",
            DeprecationWarning,
            stacklevel=2)
        self.structured_outputs = self.guided_decoding
        self.guided_decoding = None
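
The normalization above can be observed directly on a constructed instance; the following assertions follow from the source shown:

from vllm import SamplingParams

p = SamplingParams(stop="###", seed=-1, temperature=0.0, top_p=0.5, top_k=40)
assert p.stop == ["###"]                 # a single stop string is wrapped in a list
assert p.seed is None                    # seed=-1 means "no seed"
assert p.top_p == 1.0 and p.top_k == 0   # zero temperature resets top_p/top_k for greedy sampling
assert p.output_text_buffer_length == len("###") - 1  # characters held back for stop matching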

__repr__

__repr__() -> str
Source code in vllm/sampling_params.py
def __repr__(self) -> str:
    return (
        f"SamplingParams(n={self.n}, "
        f"presence_penalty={self.presence_penalty}, "
        f"frequency_penalty={self.frequency_penalty}, "
        f"repetition_penalty={self.repetition_penalty}, "
        f"temperature={self.temperature}, "
        f"top_p={self.top_p}, "
        f"top_k={self.top_k}, "
        f"min_p={self.min_p}, "
        f"seed={self.seed}, "
        f"stop={self.stop}, "
        f"stop_token_ids={self.stop_token_ids}, "
        f"bad_words={self.bad_words}, "
        f"include_stop_str_in_output={self.include_stop_str_in_output}, "
        f"ignore_eos={self.ignore_eos}, "
        f"max_tokens={self.max_tokens}, "
        f"min_tokens={self.min_tokens}, "
        f"logprobs={self.logprobs}, "
        f"prompt_logprobs={self.prompt_logprobs}, "
        f"skip_special_tokens={self.skip_special_tokens}, "
        "spaces_between_special_tokens="
        f"{self.spaces_between_special_tokens}, "
        f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
        f"structured_outputs={self.structured_outputs}, "
        f"extra_args={self.extra_args})")

_verify_args

_verify_args() -> None
Source code in vllm/sampling_params.py
def _verify_args(self) -> None:
    if not isinstance(self.n, int):
        raise ValueError(f"n must be an int, but is of "
                         f"type {type(self.n)}")
    if self.n < 1:
        raise ValueError(f"n must be at least 1, got {self.n}.")
    if self.best_of is not None:
        if not isinstance(self.best_of, int):
            raise ValueError(
                f"best_of must be an integer, got {type(self.best_of)}")
        if self.best_of < 1:
            raise ValueError(
                f"best_of must be at least 1, got {self.best_of}")
        if self.best_of < self.n:
            raise ValueError(
                f"best_of must be greater than or equal to n, "
                f"got n={self.n} and best_of={self.best_of}.")
    if not -2.0 <= self.presence_penalty <= 2.0:
        raise ValueError("presence_penalty must be in [-2, 2], got "
                         f"{self.presence_penalty}.")
    if not -2.0 <= self.frequency_penalty <= 2.0:
        raise ValueError("frequency_penalty must be in [-2, 2], got "
                         f"{self.frequency_penalty}.")
    if self.repetition_penalty <= 0.0:
        raise ValueError(
            "repetition_penalty must be greater than zero, got "
            f"{self.repetition_penalty}.")
    if self.temperature < 0.0:
        raise ValueError(
            f"temperature must be non-negative, got {self.temperature}.")
    if not 0.0 < self.top_p <= 1.0:
        raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
    # quietly accept -1 as disabled, but prefer 0
    if self.top_k < -1:
        raise ValueError(f"top_k must be 0 (disable), or at least 1, "
                         f"got {self.top_k}.")
    if not isinstance(self.top_k, int):
        raise TypeError(
            f"top_k must be an integer, got {type(self.top_k).__name__}")
    if not 0.0 <= self.min_p <= 1.0:
        raise ValueError("min_p must be in [0, 1], got "
                         f"{self.min_p}.")
    if self.max_tokens is not None and self.max_tokens < 1:
        raise ValueError(
            f"max_tokens must be at least 1, got {self.max_tokens}.")
    if self.min_tokens < 0:
        raise ValueError(f"min_tokens must be greater than or equal to 0, "
                         f"got {self.min_tokens}.")
    if self.max_tokens is not None and self.min_tokens > self.max_tokens:
        raise ValueError(
            f"min_tokens must be less than or equal to "
            f"max_tokens={self.max_tokens}, got {self.min_tokens}.")
    if (self.logprobs is not None and self.logprobs != -1
            and self.logprobs < 0):
        raise ValueError(
            f"logprobs must be non-negative or -1, got {self.logprobs}.")
    if (self.prompt_logprobs is not None and self.prompt_logprobs != -1
            and self.prompt_logprobs < 0):
        raise ValueError(
            f"prompt_logprobs must be non-negative or -1, got "
            f"{self.prompt_logprobs}.")
    if (self.truncate_prompt_tokens is not None
            and (self.truncate_prompt_tokens == 0
                 or self.truncate_prompt_tokens < -1)):
        raise ValueError(
            f"truncate_prompt_tokens must be an integer >= 1 or -1, "
            f"got {self.truncate_prompt_tokens}")
    assert isinstance(self.stop_token_ids, list)
    if not all(isinstance(st_id, int) for st_id in self.stop_token_ids):
        raise ValueError(f"stop_token_ids must contain only integers, "
                         f"got {self.stop_token_ids}.")
    assert isinstance(self.stop, list)
    if any(not stop_str for stop_str in self.stop):
        raise ValueError("stop cannot contain an empty string.")
    if self.stop and not self.detokenize:
        raise ValueError(
            "stop strings are only supported when detokenize is True. "
            "Set detokenize=True to use stop.")
    if self.best_of != self._real_n and self.output_kind == (
            RequestOutputKind.DELTA):
        raise ValueError("best_of must equal n to use output_kind=DELTA")

_verify_greedy_sampling

_verify_greedy_sampling() -> None
Source code in vllm/sampling_params.py
def _verify_greedy_sampling(self) -> None:
    if self.n > 1:
        raise ValueError("n must be 1 when using greedy sampling, "
                         f"got {self.n}.")

clone

clone() -> SamplingParams

Deep copy, but maybe not the LogitsProcessor objects.

LogitsProcessor objects may contain an arbitrary, nontrivial amount of data that is expensive to copy. However, if not copied, the processor needs to support parallel decoding for multiple sequences. See https://github.com/vllm-project/vllm/issues/3087.

Source code in vllm/sampling_params.py
def clone(self) -> "SamplingParams":
    """Deep copy, but maybe not the LogitsProcessor objects.

    LogitsProcessor objects may contain an arbitrary, nontrivial amount of
    data that is expensive to copy. However, if not copied, the processor
    needs to support parallel decoding for multiple sequences
    See https://github.com/vllm-project/vllm/issues/3087
    """

    logit_processor_refs = None if self.logits_processors is None else {
        id(lp): lp.clone() if hasattr(lp, 'clone') else lp
        for lp in self.logits_processors
    }
    return copy.deepcopy(self, memo=logit_processor_refs)
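
A sketch of the clone() contract with a toy logits processor; CountingProcessor is a hypothetical helper, not part of vLLM:

from vllm import SamplingParams

class CountingProcessor:
    """Toy logits processor exposing the optional clone() hook."""

    def __init__(self) -> None:
        self.calls = 0

    def __call__(self, token_ids, logits):
        self.calls += 1
        return logits

    def clone(self):
        return CountingProcessor()

proc = CountingProcessor()
params = SamplingParams(logits_processors=[proc])
cloned = params.clone()

assert cloned is not params
# The processor was swapped for its clone() result instead of being deep-copied.
assert cloned.logits_processors[0] is not proc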

from_optional staticmethod

from_optional(
    n: Optional[int] = 1,
    best_of: Optional[int] = None,
    presence_penalty: Optional[float] = 0.0,
    frequency_penalty: Optional[float] = 0.0,
    repetition_penalty: Optional[float] = 1.0,
    temperature: Optional[float] = 1.0,
    top_p: Optional[float] = 1.0,
    top_k: int = 0,
    min_p: float = 0.0,
    seed: Optional[int] = None,
    stop: Optional[Union[str, list[str]]] = None,
    stop_token_ids: Optional[list[int]] = None,
    bad_words: Optional[list[str]] = None,
    include_stop_str_in_output: bool = False,
    ignore_eos: bool = False,
    max_tokens: Optional[int] = 16,
    min_tokens: int = 0,
    logprobs: Optional[int] = None,
    prompt_logprobs: Optional[int] = None,
    detokenize: bool = True,
    skip_special_tokens: bool = True,
    spaces_between_special_tokens: bool = True,
    logits_processors: Optional[
        list[LogitsProcessor]
    ] = None,
    truncate_prompt_tokens: Optional[
        Annotated[int, Meta(ge=-1)]
    ] = None,
    output_kind: RequestOutputKind = CUMULATIVE,
    structured_outputs: Optional[
        StructuredOutputsParams
    ] = None,
    guided_decoding: Optional[GuidedDecodingParams] = None,
    logit_bias: Optional[
        Union[dict[int, float], dict[str, float]]
    ] = None,
    allowed_token_ids: Optional[list[int]] = None,
    extra_args: Optional[dict[str, Any]] = None,
) -> SamplingParams
Source code in vllm/sampling_params.py
@staticmethod
def from_optional(
    n: Optional[int] = 1,
    best_of: Optional[int] = None,
    presence_penalty: Optional[float] = 0.0,
    frequency_penalty: Optional[float] = 0.0,
    repetition_penalty: Optional[float] = 1.0,
    temperature: Optional[float] = 1.0,
    top_p: Optional[float] = 1.0,
    top_k: int = 0,
    min_p: float = 0.0,
    seed: Optional[int] = None,
    stop: Optional[Union[str, list[str]]] = None,
    stop_token_ids: Optional[list[int]] = None,
    bad_words: Optional[list[str]] = None,
    include_stop_str_in_output: bool = False,
    ignore_eos: bool = False,
    max_tokens: Optional[int] = 16,
    min_tokens: int = 0,
    logprobs: Optional[int] = None,
    prompt_logprobs: Optional[int] = None,
    detokenize: bool = True,
    skip_special_tokens: bool = True,
    spaces_between_special_tokens: bool = True,
    logits_processors: Optional[list[LogitsProcessor]] = None,
    truncate_prompt_tokens: Optional[Annotated[int,
                                               msgspec.Meta(
                                                   ge=-1)]] = None,
    output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
    structured_outputs: Optional[StructuredOutputsParams] = None,
    guided_decoding: Optional[GuidedDecodingParams] = None,
    logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None,
    allowed_token_ids: Optional[list[int]] = None,
    extra_args: Optional[dict[str, Any]] = None,
) -> "SamplingParams":
    if logit_bias is not None:
        # Convert token_id to integer
        # Clamp the bias between -100 and 100 per OpenAI API spec
        logit_bias = {
            int(token): min(100.0, max(-100.0, bias))
            for token, bias in logit_bias.items()
        }
    if guided_decoding is not None:
        warnings.warn(
            "guided_decoding is deprecated. This will be removed in "
            "v0.12.0 or v1.0.0, which ever is soonest. Please use "
            "structured_outputs instead.",
            DeprecationWarning,
            stacklevel=2)
        structured_outputs = guided_decoding
        guided_decoding = None

    return SamplingParams(
        n=1 if n is None else n,
        best_of=best_of,
        presence_penalty=0.0
        if presence_penalty is None else presence_penalty,
        frequency_penalty=0.0
        if frequency_penalty is None else frequency_penalty,
        repetition_penalty=1.0
        if repetition_penalty is None else repetition_penalty,
        temperature=1.0 if temperature is None else temperature,
        top_p=1.0 if top_p is None else top_p,
        top_k=top_k,
        min_p=min_p,
        seed=seed,
        stop=stop,
        stop_token_ids=stop_token_ids,
        bad_words=bad_words,
        include_stop_str_in_output=include_stop_str_in_output,
        ignore_eos=ignore_eos,
        max_tokens=max_tokens,
        min_tokens=min_tokens,
        logprobs=logprobs,
        prompt_logprobs=prompt_logprobs,
        detokenize=detokenize,
        skip_special_tokens=skip_special_tokens,
        spaces_between_special_tokens=spaces_between_special_tokens,
        logits_processors=logits_processors,
        truncate_prompt_tokens=truncate_prompt_tokens,
        output_kind=output_kind,
        structured_outputs=structured_outputs,
        logit_bias=logit_bias,
        allowed_token_ids=allowed_token_ids,
        extra_args=extra_args,
    )
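
The coercion above matters mostly when building SamplingParams from OpenAI-style requests, where many fields may arrive as None; an illustrative call:

from vllm import SamplingParams

params = SamplingParams.from_optional(
    temperature=None,             # falls back to 1.0
    top_p=None,                   # falls back to 1.0
    max_tokens=64,
    logit_bias={"50256": 250.0},  # keys become ints, biases are clamped to [-100, 100]
)
assert params.temperature == 1.0
assert params.top_p == 1.0
assert params.logit_bias == {50256: 100.0}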

update_from_generation_config

update_from_generation_config(
    generation_config: dict[str, Any],
    model_eos_token_id: Optional[int] = None,
) -> None

Update if there are non-default values from generation_config

Source code in vllm/sampling_params.py
def update_from_generation_config(
        self,
        generation_config: dict[str, Any],
        model_eos_token_id: Optional[int] = None) -> None:
    """Update if there are non-default values from generation_config"""

    if model_eos_token_id is not None:
        # Add the eos token id into the sampling_params to support
        # min_tokens processing.
        self._all_stop_token_ids.add(model_eos_token_id)

    # Update eos_token_id for generation
    if (eos_ids := generation_config.get("eos_token_id")) is not None:
        # it can be either int or list of int
        eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
        if model_eos_token_id is not None:
            # We don't need to include the primary eos_token_id in
            # stop_token_ids since it's handled separately for stopping
            # purposes.
            eos_ids.discard(model_eos_token_id)
        if eos_ids:
            self._all_stop_token_ids.update(eos_ids)
            if not self.ignore_eos:
                eos_ids.update(self.stop_token_ids)
                self.stop_token_ids = list(eos_ids)
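
This method is normally invoked by the engine with the model's generation_config; calling it directly shows how extra EOS ids are merged, as the source implies:

from vllm import SamplingParams

params = SamplingParams(stop_token_ids=[7])
params.update_from_generation_config(
    generation_config={"eos_token_id": [2, 32000]},
    model_eos_token_id=2,
)
# The primary EOS id (2) is tracked internally but kept out of stop_token_ids;
# the secondary EOS id (32000) is merged with the user-supplied stop id.
assert sorted(params.stop_token_ids) == [7, 32000]
assert params.all_stop_token_ids == {2, 7, 32000}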

update_from_tokenizer

update_from_tokenizer(tokenizer: AnyTokenizer) -> None
Source code in vllm/sampling_params.py
def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None:
    if not self.bad_words:
        return
    self._bad_words_token_ids = []
    for bad_word in self.bad_words:
        # To prohibit words both at the beginning
        # and in the middle of text
        # (related to add_prefix_space tokenizer parameter)
        for add_prefix_space in [False, True]:
            prefix = " " if add_prefix_space else ""
            prompt = prefix + bad_word.lstrip()
            prompt_token_ids = tokenizer.encode(text=prompt,
                                                add_special_tokens=False)

            # If no space at the beginning
            # or if prefix space produces a new word token
            if (not add_prefix_space) or (
                    add_prefix_space and prompt_token_ids[0]
                    != self._bad_words_token_ids[-1][0]
                    and len(prompt_token_ids) == len(
                        self._bad_words_token_ids[-1])):
                self._bad_words_token_ids.append(prompt_token_ids)

    invalid_token_ids = [
        token_id for bad_words_token_ids in self._bad_words_token_ids
        for token_id in bad_words_token_ids
        if token_id < 0 or token_id > tokenizer.max_token_id
    ]
    if len(invalid_token_ids) > 0:
        raise ValueError(
            f"The model vocabulary size is {tokenizer.max_token_id+1},"
            f" but the following tokens"
            f" were specified as bad: {invalid_token_ids}."
            f" All token id values should be integers satisfying:"
            f" 0 <= token_id <= {tokenizer.max_token_id}.")

ScoringOutput dataclass

The output data of one scoring output of a request.

Parameters:

Name Type Description Default
score float

The similarity score, which is a scalar value.

required
Source code in vllm/outputs.py
@dataclass
class ScoringOutput:
    """The output data of one scoring output of a request.

    Args:
        score: The similarity score, which is a scalar value.
    """
    score: float

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        # pooling_output shape:
        #   classify task: (num_classes) num_classes == 1
        #   embed task: a scalar value
        pooled_data = pooling_output.data.squeeze()
        if pooled_data.ndim != 0:
            raise ValueError("pooled_data should be a scalar score")

        return ScoringOutput(pooled_data.item())

    def __repr__(self) -> str:
        return f"ScoringOutput(score={self.score})"

score instance-attribute

score: float

__init__

__init__(score: float) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return f"ScoringOutput(score={self.score})"

from_base staticmethod

from_base(pooling_output: PoolingOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(pooling_output: PoolingOutput):
    # pooling_output shape:
    #   classify task: (num_classes) num_classes == 1
    #   embed task: a scalar value
    pooled_data = pooling_output.data.squeeze()
    if pooled_data.ndim != 0:
        raise ValueError("pooled_data should be a scalar score")

    return ScoringOutput(pooled_data.item())

ScoringRequestOutput

Bases: PoolingRequestOutput[ScoringOutput]

Source code in vllm/outputs.py
class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]):

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        return ScoringRequestOutput(
            request_id=request_output.request_id,
            outputs=ScoringOutput.from_base(request_output.outputs),
            prompt_token_ids=request_output.prompt_token_ids,
            finished=request_output.finished,
        )

from_base staticmethod

from_base(request_output: PoolingRequestOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(request_output: PoolingRequestOutput):
    return ScoringRequestOutput(
        request_id=request_output.request_id,
        outputs=ScoringOutput.from_base(request_output.outputs),
        prompt_token_ids=request_output.prompt_token_ids,
        finished=request_output.finished,
    )
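
A hedged sketch of how these scoring outputs are typically produced; the model name, the task argument, and the positional (query, texts) form of LLM.score() are illustrative assumptions:

from vllm import LLM

llm = LLM(model="BAAI/bge-reranker-base", task="score")  # assumed cross-encoder setup
query = "What is the capital of France?"
docs = ["Paris is the capital of France.", "The Moon orbits the Earth."]

for request_output in llm.score(query, docs):  # each item is a ScoringRequestOutput
    print(request_output.outputs.score)        # scalar similarity score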

TextPrompt

Bases: TypedDict

Schema for a text prompt.

Source code in vllm/inputs/data.py
class TextPrompt(TypedDict):
    """Schema for a text prompt."""

    prompt: str
    """The input text to be tokenized before passing to the model."""

    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
    """

    mm_processor_kwargs: NotRequired[dict[str, Any]]
    """
    Optional multi-modal processor kwargs to be forwarded to the
    multimodal input mapper & processor. Note that if multiple modalities
    have registered mappers etc for the model being considered, we attempt
    to pass the mm_processor_kwargs to each of them.
    """

    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
    """
    Optional user-specified UUIDs for multimodal items, mapped by modality.
    Lists must match the number of items per modality and may contain `None`.
    For `None` entries, the hasher will compute IDs automatically; non-None
    entries override the default hashes for caching, and MUST be unique per
    multimodal item.
    """

    cache_salt: NotRequired[str]
    """
    Optional cache salt to be used for prefix caching.
    """

cache_salt instance-attribute

cache_salt: NotRequired[str]

Optional cache salt to be used for prefix caching.

mm_processor_kwargs instance-attribute

mm_processor_kwargs: NotRequired[dict[str, Any]]

Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them.

multi_modal_data instance-attribute

multi_modal_data: NotRequired[MultiModalDataDict]

Optional multi-modal data to pass to the model, if the model supports it.

multi_modal_uuids instance-attribute

multi_modal_uuids: NotRequired[MultiModalUUIDDict]

Optional user-specified UUIDs for multimodal items, mapped by modality. Lists must match the number of items per modality and may contain None. For None entries, the hasher will compute IDs automatically; non-None entries override the default hashes for caching, and MUST be unique per multimodal item.

prompt instance-attribute

prompt: str

The input text to be tokenized before passing to the model.
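
TextPrompt is a TypedDict, so a dict with these keys can be passed to LLM.generate() in place of a bare string; a sketch with a placeholder model name and an illustrative cache salt:

from vllm import LLM, TextPrompt

llm = LLM(model="facebook/opt-125m")  # placeholder model
prompt = TextPrompt(
    prompt="Summarize the plot of Hamlet in one sentence.",
    cache_salt="tenant-a",  # isolates prefix-cache entries for this caller
)
outputs = llm.generate(prompt)
print(outputs[0].outputs[0].text)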

TokensPrompt

Bases: TypedDict

Schema for a tokenized prompt.

Source code in vllm/inputs/data.py
class TokensPrompt(TypedDict):
    """Schema for a tokenized prompt."""

    prompt_token_ids: list[int]
    """A list of token IDs to pass to the model."""

    prompt: NotRequired[str]
    """The prompt text corresponding to the token IDs, if available."""

    token_type_ids: NotRequired[list[int]]
    """A list of token type IDs to pass to the cross encoder model."""

    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
    """

    mm_processor_kwargs: NotRequired[dict[str, Any]]
    """
    Optional multi-modal processor kwargs to be forwarded to the
    multimodal input mapper & processor. Note that if multiple modalities
    have registered mappers etc for the model being considered, we attempt
    to pass the mm_processor_kwargs to each of them.
    """

    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
    """
    Optional user-specified UUIDs for multimodal items, mapped by modality.
    Lists must match the number of items per modality and may contain `None`.
    For `None` entries, the hasher will compute IDs automatically; non-None
    entries override the default hashes for caching.
    """

    cache_salt: NotRequired[str]
    """
    Optional cache salt to be used for prefix caching.
    """

cache_salt instance-attribute

cache_salt: NotRequired[str]

Optional cache salt to be used for prefix caching.

mm_processor_kwargs instance-attribute

mm_processor_kwargs: NotRequired[dict[str, Any]]

Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them.

multi_modal_data instance-attribute

multi_modal_data: NotRequired[MultiModalDataDict]

Optional multi-modal data to pass to the model, if the model supports it.

multi_modal_uuids instance-attribute

multi_modal_uuids: NotRequired[MultiModalUUIDDict]

Optional user-specified UUIDs for multimodal items, mapped by modality. Lists must match the number of items per modality and may contain None. For None entries, the hasher will compute IDs automatically; non-None entries override the default hashes for caching.

prompt instance-attribute

prompt: NotRequired[str]

The prompt text corresponding to the token IDs, if available.

prompt_token_ids instance-attribute

prompt_token_ids: list[int]

A list of token IDs to pass to the model.

token_type_ids instance-attribute

token_type_ids: NotRequired[list[int]]

A list of token type IDs to pass to the cross encoder model.
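
TokensPrompt accepts pre-tokenized input; in this sketch the tokenizer and model names are placeholders and must share a vocabulary:

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams, TokensPrompt

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
llm = LLM(model="facebook/opt-125m")

prompt = TokensPrompt(prompt_token_ids=tokenizer.encode("The quick brown fox"))
outputs = llm.generate(prompt, SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)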

__getattr__

__getattr__(name: str) -> Any
Source code in vllm/__init__.py
def __getattr__(name: str) -> typing.Any:
    from importlib import import_module

    if name in MODULE_ATTRS:
        module_name, attr_name = MODULE_ATTRS[name].split(":")
        module = import_module(module_name, __package__)
        return getattr(module, attr_name)
    else:
        raise AttributeError(
            f'module {__package__} has no attribute {name}')
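
Because of this hook, top-level names are imported lazily on first access, and unknown names raise AttributeError; for example:

import vllm

params_cls = vllm.SamplingParams   # first access triggers the submodule import
print(params_cls(max_tokens=8))

try:
    vllm.does_not_exist
except AttributeError as e:
    print(e)                       # module vllm has no attribute does_not_exist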

bc_linter_include

bc_linter_include(obj: T) -> T
bc_linter_include(
    *, reason: str | None = ...
) -> Callable[[T], T]
bc_linter_include(
    obj: Any = None, *, reason: str | None = None
)
Usage

@bc_linter_include
def public_api(...): ...

Source code in vllm/_bc_linter.py
def bc_linter_include(obj: Any = None, *, reason: str | None = None):
    """
    Usage:
        @bc_linter_include
        def public_api(...): ...
    """

    def _wrap(x: T) -> T:
        return x

    return _wrap if obj is None else obj

bc_linter_skip

bc_linter_skip(obj: T) -> T
bc_linter_skip(
    *, reason: str | None = ...
) -> Callable[[T], T]
bc_linter_skip(
    obj: Any = None, *, reason: str | None = None
)

No-op decorator to mark symbols/files for BC-linter suppression.

Usage

@bc_linter_skip
def legacy_api(...): ...

Source code in vllm/_bc_linter.py
def bc_linter_skip(obj: Any = None, *, reason: str | None = None):
    """
    No-op decorator to mark symbols/files for BC-linter suppression.

    Usage:
        @bc_linter_skip
        def legacy_api(...): ...
    """

    def _wrap(x: T) -> T:
        return x

    return _wrap if obj is None else obj

initialize_ray_cluster

initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
)

Initialize the distributed cluster with Ray.

It connects to the Ray cluster and creates a placement group for the workers, which specifies the resources for each distributed worker.

Parameters:

Name Type Description Default
parallel_config ParallelConfig

The configurations for parallel execution.

required
ray_address Optional[str]

The address of the Ray cluster. If None, uses the default Ray cluster address.

None
Source code in vllm/executor/ray_utils.py
def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
):
    """Initialize the distributed cluster with Ray.

    it will connect to the Ray cluster and create a placement group
    for the workers, which includes the specification of the resources
    for each distributed worker.

    Args:
        parallel_config: The configurations for parallel execution.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.
    """
    assert_ray_available()
    from vllm.platforms import current_platform

    if ray.is_initialized():
        logger.info("Ray is already initialized. Skipping Ray initialization.")
    elif current_platform.is_rocm() or current_platform.is_xpu():
        # Try to connect existing ray instance and create a new one if not found
        try:
            ray.init("auto")
        except ConnectionError:
            logger.warning(
                "No existing RAY instance detected. "
                "A new instance will be launched with current node resources.")
            ray.init(address=ray_address,
                     num_gpus=parallel_config.world_size,
                     runtime_env=parallel_config.ray_runtime_env)
    else:
        ray.init(address=ray_address,
                 runtime_env=parallel_config.ray_runtime_env)

    device_str = current_platform.ray_device_key
    if not device_str:
        raise ValueError(
            f"current platform {current_platform.device_name} does not "
            "support ray.")

    # Create or get the placement group for worker processes
    if parallel_config.placement_group:
        current_placement_group = parallel_config.placement_group
    else:
        current_placement_group = ray.util.get_current_placement_group()

    if current_placement_group:
        logger.info("Using the existing placement group")

        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
        device_bundles = 0
        for bundle in bundles:
            bundle_devices = bundle.get(device_str, 0)
            if bundle_devices > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 "
                    f"{device_str}.")
            if bundle_devices:
                device_bundles += 1
        if parallel_config.world_size > device_bundles:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the placement group. "
                f"Required number of devices: {parallel_config.world_size}. "
                f"Total number of devices: {device_bundles}.")
    else:
        logger.info("No current placement group found. "
                    "Creating a new placement group.")
        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
        # Log a warning message and delay resource allocation failure response.
        # Avoid immediate rejection to allow user-initiated placement group
        # created and wait cluster to be ready
        if parallel_config.world_size > num_devices_in_cluster:
            logger.warning(
                "The number of required %ss exceeds the total "
                "number of available %ss in the placement group.", device_str,
                device_str)
        # Create a new placement group
        placement_group_specs: List[Dict[str, float]] = ([{
            device_str: 1.0
        } for _ in range(parallel_config.world_size)])

        # vLLM engine is also a worker to execute model with an accelerator,
        # so it requires to have the device in a current node. Check if
        # the current node has at least one device.
        current_ip = get_ip()
        current_node_id = ray.get_runtime_context().get_node_id()
        current_node_resource = available_resources_per_node()[current_node_id]
        if current_node_resource.get(device_str, 0) < 1:
            raise ValueError(
                f"Current node has no {device_str} available. "
                f"{current_node_resource=}. vLLM engine cannot start without "
                f"{device_str}. Make sure you have at least 1 {device_str} "
                f"available in a node {current_node_id=} {current_ip=}.")
        # This way, at least bundle is required to be created in a current
        # node.
        placement_group_specs[0][f"node:{current_ip}"] = 0.001

        # By default, Ray packs resources as much as possible.
        current_placement_group = ray.util.placement_group(
            placement_group_specs, strategy="PACK")
        _wait_until_pg_ready(current_placement_group)

    assert current_placement_group is not None
    _verify_bundles(current_placement_group, parallel_config, device_str)
    # Set the placement group in the parallel config
    parallel_config.placement_group = current_placement_group
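
A hedged sketch of calling this directly; normally the Ray-based executor invokes it. The ParallelConfig import path and its keyword field are assumptions inferred from the attributes read above (world_size, ray_runtime_env, placement_group), and a running Ray setup with enough accelerators is required:

from vllm import initialize_ray_cluster
from vllm.config import ParallelConfig  # assumed import path

parallel_config = ParallelConfig(tensor_parallel_size=2)  # assumed field name
initialize_ray_cluster(parallel_config, ray_address="auto")

# On success, the placement group used by the workers is attached to the config.
print(parallel_config.placement_group)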