vllm.entrypoints.openai.protocol

AnyResponseFormat module-attribute

AudioResponseFormat module-attribute

AudioResponseFormat: TypeAlias = Literal[
    "json", "text", "srt", "verbose_json", "vtt"
]

BatchRequestInputBody module-attribute

EmbeddingRequest module-attribute

LogitsProcessors module-attribute

LogitsProcessors = list[
    Union[str, LogitsProcessorConstructor]
]
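
Entries of this list may be plain qualified-name strings or constructor objects. A hedged sketch of both forms, following the `logits_processors` field description on `ChatCompletionRequest` below; `my_module.MyLogitsProcessor` is a hypothetical user-defined processor:

# Illustrative only: each entry is either a qualified-name string or a
# constructor object with a required 'qualname' and optional 'args'/'kwargs'.
example_logits_processors = [
    "my_module.MyLogitsProcessor",
    {
        "qualname": "my_module.MyLogitsProcessor",
        "args": [1, 2],
        "kwargs": {"param": "value"},
    },
]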

PoolingChatRequest module-attribute

PoolingChatRequest = EmbeddingChatRequest

PoolingCompletionRequest module-attribute

PoolingCompletionRequest = EmbeddingCompletionRequest

PoolingRequest module-attribute

ResponseInputOutputItem module-attribute

ResponseInputOutputItem: TypeAlias = Union[
    ResponseInputItemParam,
    ResponseReasoningItem,
    ResponseFunctionToolCall,
]

StreamingResponsesResponse module-attribute

StreamingResponsesResponse: TypeAlias = Union[
    ResponseCreatedEvent,
    ResponseInProgressEvent,
    ResponseCompletedEvent,
    ResponseOutputItemAddedEvent,
    ResponseOutputItemDoneEvent,
    ResponseContentPartAddedEvent,
    ResponseContentPartDoneEvent,
    ResponseReasoningTextDeltaEvent,
    ResponseReasoningTextDoneEvent,
    ResponseReasoningPartAddedEvent,
    ResponseReasoningPartDoneEvent,
    ResponseCodeInterpreterCallInProgressEvent,
    ResponseCodeInterpreterCallCodeDeltaEvent,
    ResponseWebSearchCallInProgressEvent,
    ResponseWebSearchCallSearchingEvent,
    ResponseWebSearchCallCompletedEvent,
    ResponseCodeInterpreterCallCodeDoneEvent,
    ResponseCodeInterpreterCallInterpretingEvent,
    ResponseCodeInterpreterCallCompletedEvent,
]

T module-attribute

T = TypeVar('T')

TokenizeRequest module-attribute

_LONG_INFO module-attribute

_LONG_INFO = iinfo(long)

logger module-attribute

logger = init_logger(__name__)

BatchRequestInput

Bases: OpenAIBaseModel

The per-line object of the batch input file.

NOTE: Currently only the /v1/chat/completions endpoint is supported.

Source code in vllm/entrypoints/openai/protocol.py
class BatchRequestInput(OpenAIBaseModel):
    """
    The per-line object of the batch input file.

    NOTE: Currently only the `/v1/chat/completions` endpoint is supported.
    """

    # A developer-provided per-request id that will be used to match outputs to
    # inputs. Must be unique for each request in a batch.
    custom_id: str

    # The HTTP method to be used for the request. Currently only POST is
    # supported.
    method: str

    # The OpenAI API relative URL to be used for the request. Currently
    # /v1/chat/completions is supported.
    url: str

    # The parameters of the request.
    body: BatchRequestInputBody

    @field_validator('body', mode='plain')
    @classmethod
    def check_type_for_url(cls, value: Any, info: ValidationInfo):
        # Use url to disambiguate models
        url: str = info.data["url"]
        if url == "/v1/chat/completions":
            return ChatCompletionRequest.model_validate(value)
        if url == "/v1/embeddings":
            return TypeAdapter(EmbeddingRequest).validate_python(value)
        if url.endswith("/score"):
            return ScoreRequest.model_validate(value)
        if url.endswith("/rerank"):
            return RerankRequest.model_validate(value)
        return TypeAdapter(BatchRequestInputBody).validate_python(value)

body instance-attribute

body: BatchRequestInputBody

custom_id instance-attribute

custom_id: str

method instance-attribute

method: str

url instance-attribute

url: str

check_type_for_url classmethod

check_type_for_url(value: Any, info: ValidationInfo)
Source code in vllm/entrypoints/openai/protocol.py
@field_validator('body', mode='plain')
@classmethod
def check_type_for_url(cls, value: Any, info: ValidationInfo):
    # Use url to disambiguate models
    url: str = info.data["url"]
    if url == "/v1/chat/completions":
        return ChatCompletionRequest.model_validate(value)
    if url == "/v1/embeddings":
        return TypeAdapter(EmbeddingRequest).validate_python(value)
    if url.endswith("/score"):
        return ScoreRequest.model_validate(value)
    if url.endswith("/rerank"):
        return RerankRequest.model_validate(value)
    return TypeAdapter(BatchRequestInputBody).validate_python(value)
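
A minimal sketch of validating one batch-input line; the `url` value routes the `body` to `ChatCompletionRequest` via `check_type_for_url`. The model name and message content are placeholders:

# Minimal sketch: parse one JSONL line of a batch input file.
from vllm.entrypoints.openai.protocol import BatchRequestInput

line = {
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "example-model",  # placeholder model name
        "messages": [{"role": "user", "content": "Hello!"}],
    },
}
batch_request = BatchRequestInput.model_validate(line)
# `body` is now a ChatCompletionRequest because url == "/v1/chat/completions"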

BatchRequestOutput

Bases: OpenAIBaseModel

The per-line object of the batch output and error files

Source code in vllm/entrypoints/openai/protocol.py
class BatchRequestOutput(OpenAIBaseModel):
    """
    The per-line object of the batch output and error files
    """

    id: str

    # A developer-provided per-request id that will be used to match outputs to
    # inputs.
    custom_id: str

    response: Optional[BatchResponseData]

    # For requests that failed with a non-HTTP error, this will contain more
    # information on the cause of the failure.
    error: Optional[Any]
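
For reference, a hedged sketch of the shape of one output-file line implied by the fields above; all identifiers are placeholders and the response `body` is elided:

# Illustrative shape only; ids are placeholders and `body` would normally
# hold e.g. a ChatCompletionResponse.
output_line = {
    "id": "batch-output-id",         # placeholder
    "custom_id": "request-1",
    "response": {
        "status_code": 200,
        "request_id": "request-id",  # placeholder
        "body": None,
    },
    "error": None,
}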

custom_id instance-attribute

custom_id: str

error instance-attribute

error: Optional[Any]

id instance-attribute

id: str

response instance-attribute

response: Optional[BatchResponseData]

BatchResponseData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class BatchResponseData(OpenAIBaseModel):
    # HTTP status code of the response.
    status_code: int = 200

    # An unique identifier for the API request.
    request_id: str

    # The body of the response.
    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
                         ScoreResponse, RerankResponse]] = None

body class-attribute instance-attribute

body: Optional[
    Union[
        ChatCompletionResponse,
        EmbeddingResponse,
        ScoreResponse,
        RerankResponse,
    ]
] = None

request_id instance-attribute

request_id: str

status_code class-attribute instance-attribute

status_code: int = 200

ChatCompletionLogProb

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionLogProb(OpenAIBaseModel):
    token: str
    logprob: float = -9999.0
    bytes: Optional[list[int]] = None

bytes class-attribute instance-attribute

bytes: Optional[list[int]] = None

logprob class-attribute instance-attribute

logprob: float = -9999.0

token instance-attribute

token: str

ChatCompletionLogProbs

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionLogProbs(OpenAIBaseModel):
    content: Optional[list[ChatCompletionLogProbsContent]] = None

content class-attribute instance-attribute

content: Optional[
    list[ChatCompletionLogProbsContent]
] = None

ChatCompletionLogProbsContent

Bases: ChatCompletionLogProb

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionLogProbsContent(ChatCompletionLogProb):
    # Workaround: redefine fields name cache so that it's not
    # shared with the super class.
    field_names: ClassVar[Optional[set[str]]] = None
    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)

field_names class-attribute

field_names: Optional[set[str]] = None

top_logprobs class-attribute instance-attribute

top_logprobs: list[ChatCompletionLogProb] = Field(
    default_factory=list
)

ChatCompletionNamedFunction

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionNamedFunction(OpenAIBaseModel):
    name: str

name instance-attribute

name: str

ChatCompletionNamedToolChoiceParam

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
    function: ChatCompletionNamedFunction
    type: Literal["function"] = "function"
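
The accepted wire shape for a named tool choice matches the usage cited by `check_tool_usage` on `ChatCompletionRequest` below; `my_function` is a placeholder name:

# Sketch of the dict form accepted for `tool_choice`:
named_tool_choice = {
    "type": "function",
    "function": {"name": "my_function"},  # placeholder function name
}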

function instance-attribute

function: ChatCompletionNamedFunction

type class-attribute instance-attribute

type: Literal['function'] = 'function'

ChatCompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/chat/create
    messages: list[ChatCompletionMessageParam]
    model: Optional[str] = None
    frequency_penalty: Optional[float] = 0.0
    logit_bias: Optional[dict[str, float]] = None
    logprobs: Optional[bool] = False
    top_logprobs: Optional[int] = 0
    max_tokens: Optional[int] = Field(
        default=None,
        deprecated=
        'max_tokens is deprecated in favor of the max_completion_tokens field')
    max_completion_tokens: Optional[int] = None
    n: Optional[int] = 1
    presence_penalty: Optional[float] = 0.0
    response_format: Optional[AnyResponseFormat] = None
    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    stop: Optional[Union[str, list[str]]] = []
    stream: Optional[bool] = False
    stream_options: Optional[StreamOptions] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    tools: Optional[list[ChatCompletionToolsParam]] = None
    tool_choice: Optional[Union[
        Literal["none"],
        Literal["auto"],
        Literal["required"],
        ChatCompletionNamedToolChoiceParam,
    ]] = "none"
    reasoning_effort: Optional[Literal["low", "medium", "high"]] = None
    include_reasoning: bool = True

    # NOTE this will be ignored by vLLM -- the model determines the behavior
    parallel_tool_calls: Optional[bool] = False
    user: Optional[str] = None

    # --8<-- [start:chat-completion-sampling-params]
    best_of: Optional[int] = None
    use_beam_search: bool = False
    top_k: Optional[int] = None
    min_p: Optional[float] = None
    repetition_penalty: Optional[float] = None
    length_penalty: float = 1.0
    stop_token_ids: Optional[list[int]] = []
    include_stop_str_in_output: bool = False
    ignore_eos: bool = False
    min_tokens: int = 0
    skip_special_tokens: bool = True
    spaces_between_special_tokens: bool = True
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
    prompt_logprobs: Optional[int] = None
    allowed_token_ids: Optional[list[int]] = None
    bad_words: list[str] = Field(default_factory=list)
    # --8<-- [end:chat-completion-sampling-params]

    # --8<-- [start:chat-completion-extra-params]
    echo: bool = Field(
        default=False,
        description=(
            "If true, the new message will be prepended with the last message "
            "if they belong to the same role."),
    )
    add_generation_prompt: bool = Field(
        default=True,
        description=
        ("If true, the generation prompt will be added to the chat template. "
         "This is a parameter used by chat template in tokenizer config of the "
         "model."),
    )
    continue_final_message: bool = Field(
        default=False,
        description=
        ("If this is set, the chat will be formatted so that the final "
         "message in the chat is open-ended, without any EOS tokens. The "
         "model will continue this message rather than starting a new one. "
         "This allows you to \"prefill\" part of the model's response for it. "
         "Cannot be used at the same time as `add_generation_prompt`."),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."),
    )
    documents: Optional[list[dict[str, str]]] = Field(
        default=None,
        description=
        ("A list of dicts representing documents that will be accessible to "
         "the model if it is performing RAG (retrieval-augmented generation)."
         " If the template does not support RAG, this argument will have no "
         "effect. We recommend that each document should be a dict containing "
         "\"title\" and \"text\" keys."),
    )
    chat_template: Optional[str] = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."),
    )
    chat_template_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."),
    )
    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    structured_outputs: Optional[StructuredOutputsParams] = Field(
        default=None,
        description="Additional kwargs for structured outputs",
    )
    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
        default=None,
        description=(
            "`guided_json` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `json` to `structured_outputs` instead."),
    )
    guided_regex: Optional[str] = Field(
        default=None,
        description=(
            "`guided_regex` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `regex` to `structured_outputs` instead."),
    )
    guided_choice: Optional[list[str]] = Field(
        default=None,
        description=(
            "`guided_choice` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `choice` to `structured_outputs` instead."),
    )
    guided_grammar: Optional[str] = Field(
        default=None,
        description=(
            "`guided_grammar` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `grammar` to `structured_outputs` instead."),
    )
    structural_tag: Optional[str] = Field(
        default=None,
        description=(
            "`structural_tag` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `structural_tag` to `structured_outputs` instead."),
    )
    guided_decoding_backend: Optional[str] = Field(
        default=None,
        description=(
            "`guided_decoding_backend` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please remove it from your request."),
    )
    guided_whitespace_pattern: Optional[str] = Field(
        default=None,
        description=(
            "`guided_whitespace_pattern` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `whitespace_pattern` to `structured_outputs` instead."
        ),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."),
    )
    logits_processors: Optional[LogitsProcessors] = Field(
        default=None,
        description=(
            "A list of either qualified names of logits processors, or "
            "constructor objects, to apply when sampling. A constructor is "
            "a JSON object with a required 'qualname' field specifying the "
            "qualified name of the processor class/factory, and optional "
            "'args' and 'kwargs' fields containing positional and keyword "
            "arguments. For example: {'qualname': "
            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
            "{'param': 'value'}}."))
    return_tokens_as_token_ids: Optional[bool] = Field(
        default=None,
        description=(
            "If specified with 'logprobs', tokens are represented "
            " as strings of the form 'token_id:{token_id}' so that tokens "
            "that are not JSON-encodable can be identified."))
    return_token_ids: Optional[bool] = Field(
        default=None,
        description=(
            "If specified, the result will include token IDs alongside the "
            "generated text. In streaming mode, prompt_token_ids is included "
            "only in the first chunk, and token_ids contains the delta tokens "
            "for each chunk. This is useful for debugging or when you "
            "need to map generated text back to input tokens."))
    cache_salt: Optional[str] = Field(
        default=None,
        description=(
            "If specified, the prefix cache will be salted with the provided "
            "string to prevent an attacker to guess prompts in multi-user "
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit). Not supported by vLLM engine V0."))
    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.")

    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
        default=None,
        description=("Additional request parameters with string or "
                     "numeric values, used by custom extensions."),
    )

    # --8<-- [end:chat-completion-extra-params]

    # Default sampling parameters for chat completion requests
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_beam_search_params(
            self, max_tokens: int,
            default_sampling_params: dict) -> BeamSearchParams:

        n = self.n if self.n is not None else 1
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

        return BeamSearchParams(
            beam_width=n,
            max_tokens=max_tokens,
            ignore_eos=self.ignore_eos,
            temperature=temperature,
            length_penalty=self.length_penalty,
            include_stop_str_in_output=self.include_stop_str_in_output,
        )

    def to_sampling_params(
        self,
        max_tokens: int,
        logits_processor_pattern: Optional[str],
        default_sampling_params: dict,
    ) -> SamplingParams:

        # Default parameters
        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
            )
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

        prompt_logprobs = self.prompt_logprobs
        if prompt_logprobs is None and self.echo:
            prompt_logprobs = self.top_logprobs

        # Forward deprecated guided_* parameters to structured_outputs
        if self.structured_outputs is None:
            kwargs = dict[str, Any](
                json=self.guided_json,
                regex=self.guided_regex,
                choice=self.guided_choice,
                grammar=self.guided_grammar,
                whitespace_pattern=self.guided_whitespace_pattern,
                structural_tag=self.structural_tag,
            )
            kwargs = {k: v for k, v in kwargs.items() if v is not None}
            if len(kwargs) > 0:
                self.structured_outputs = StructuredOutputsParams(**kwargs)

        response_format = self.response_format
        json_schema_from_tool = self._get_json_schema_from_tool()
        if response_format is not None or json_schema_from_tool is not None:
            # If structured outputs wasn't already enabled,
            # we must enable it for these features to work
            if self.structured_outputs is None:
                self.structured_outputs = StructuredOutputsParams()

            # Set structured output params for response format
            if response_format is not None:
                if response_format.type == "json_object":
                    self.structured_outputs.json_object = True
                elif response_format.type == "json_schema":
                    json_schema = response_format.json_schema
                    assert json_schema is not None
                    self.structured_outputs.json = json_schema.json_schema
                elif response_format.type == "structural_tag":
                    structural_tag = response_format
                    assert structural_tag is not None and isinstance(
                        structural_tag, StructuralTagResponseFormat)
                    s_tag_obj = structural_tag.model_dump(by_alias=True)
                    self.structured_outputs.structural_tag = json.dumps(
                        s_tag_obj)

            # Set structured output params for tool calling
            if json_schema_from_tool is not None:
                self.structured_outputs.json = json_schema_from_tool

        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
        if self.kv_transfer_params:
            # Pass in kv_transfer_params via extra_args
            extra_args["kv_transfer_params"] = self.kv_transfer_params
        return SamplingParams.from_optional(
            n=self.n,
            best_of=self.best_of,
            presence_penalty=self.presence_penalty,
            frequency_penalty=self.frequency_penalty,
            repetition_penalty=repetition_penalty,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            seed=self.seed,
            stop=self.stop,
            stop_token_ids=self.stop_token_ids,
            logprobs=self.top_logprobs if self.logprobs else None,
            prompt_logprobs=prompt_logprobs,
            ignore_eos=self.ignore_eos,
            max_tokens=max_tokens,
            min_tokens=self.min_tokens,
            skip_special_tokens=self.skip_special_tokens,
            spaces_between_special_tokens=self.spaces_between_special_tokens,
            logits_processors=get_logits_processors(self.logits_processors,
                                                    logits_processor_pattern),
            include_stop_str_in_output=self.include_stop_str_in_output,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA if self.stream \
                else RequestOutputKind.FINAL_ONLY,
            structured_outputs=self.structured_outputs,
            logit_bias=self.logit_bias,
            bad_words=self.bad_words,
            allowed_token_ids=self.allowed_token_ids,
            extra_args=extra_args or None,
        )

    def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]:
        # user has chosen to not use any tool
        if self.tool_choice == "none" or self.tools is None:
            return None

        # user has chosen to use a named tool
        if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
            tool_name = self.tool_choice.function.name
            tools = {tool.function.name: tool.function for tool in self.tools}
            if tool_name not in tools:
                raise ValueError(
                    f"Tool '{tool_name}' has not been passed in `tools`.")
            tool = tools[tool_name]
            return tool.parameters

        if self.tool_choice == "required":
            # Pydantic schema generation cannot be used since the JSON schema
            # has to be constructed for a specific instantiation of a tool list
            # so that parameters of a function are correctly generated
            # based on the chosen function name
            def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
                return {
                    "properties": {
                        "name": {
                            "type": "string",
                            "enum": [tool.function.name]
                        },
                        # parameters are always generated as '{}' in the final
                        # output if they are missing from the request
                        # (i.e. are None or '{}') so the schema is
                        # updated to produce an empty object in that case
                        "parameters": tool.function.parameters
                        if tool.function.parameters else {
                            "type": "object",
                            "properties": {}
                        }
                    },
                    "required": ["name", "parameters"]
                }

            def get_tool_schema_defs(
                    tools: list[ChatCompletionToolsParam]) -> dict:
                all_defs = dict[str, dict[str, Any]]()
                for tool in tools:
                    if tool.function.parameters is None:
                        continue
                    defs = tool.function.parameters.pop("$defs", {})
                    for def_name, def_schema in defs.items():
                        if def_name in all_defs and all_defs[
                                def_name] != def_schema:
                            raise ValueError(
                                f"Tool definition '{def_name}' has "
                                "multiple schemas, which is not "
                                "supported.")
                        else:
                            all_defs[def_name] = def_schema
                return all_defs

            json_schema = {
                "type": "array",
                "minItems": 1,
                "items": {
                    "type": "object",
                    "anyOf": [get_tool_schema(tool) for tool in self.tools]
                }
            }
            json_schema_defs = get_tool_schema_defs(self.tools)
            if json_schema_defs:
                json_schema["$defs"] = json_schema_defs
            return json_schema

        return None

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        if data.get("stream_options") and not data.get("stream"):
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data

    @model_validator(mode="before")
    @classmethod
    def check_logprobs(cls, data):
        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
            if data.get("stream") and (prompt_logprobs > 0
                                       or prompt_logprobs == -1):
                raise ValueError(
                    "`prompt_logprobs` are not available when `stream=True`.")

            if prompt_logprobs < 0 and prompt_logprobs != -1:
                raise ValueError(
                    "`prompt_logprobs` must be a positive value or -1.")
            if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
                raise ValueError("`prompt_logprobs=-1` is only supported with "
                                 "vLLM engine V1.")
        if (top_logprobs := data.get("top_logprobs")) is not None:
            if top_logprobs < 0 and top_logprobs != -1:
                raise ValueError(
                    "`top_logprobs` must be a positive value or -1.")

            if (top_logprobs == -1
                    or top_logprobs > 0) and not data.get("logprobs"):
                raise ValueError(
                    "when using `top_logprobs`, `logprobs` must be set to true."
                )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_structured_outputs_count(cls, data):
        if isinstance(data, ValueError):
            raise data

        if data.get("structured_outputs", None) is None:
            return data

        structured_outputs_kwargs = data['structured_outputs']
        count = sum(
            structured_outputs_kwargs.get(k) is not None
            for k in ("json", "regex", "choice"))
        # you can only use one kind of constraints for structured outputs
        if count > 1:
            raise ValueError(
                "You can only use one kind of constraints for structured "
                "outputs ('json', 'regex' or 'choice').")
        # you can only either use structured outputs or tools, not both
        if count > 1 and data.get("tool_choice", "none") not in (
                "none",
                "auto",
                "required",
        ):
            raise ValueError(
                "You can only either use constraints for structured outputs "
                "or tools, not both.")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_tool_usage(cls, data):

        # if "tool_choice" is not specified but tools are provided,
        # default to "auto" tool_choice
        if "tool_choice" not in data and data.get("tools"):
            data["tool_choice"] = "auto"

        # if "tool_choice" is "none" -- no validation is needed for tools
        if "tool_choice" in data and data["tool_choice"] == "none":
            return data

        # if "tool_choice" is specified -- validation
        if "tool_choice" in data and data["tool_choice"] is not None:

            # ensure that if "tool choice" is specified, tools are present
            if "tools" not in data or data["tools"] is None:
                raise ValueError(
                    "When using `tool_choice`, `tools` must be set.")

            # make sure that tool choice is either a named tool
            # OR that it's set to "auto" or "required"
            if data["tool_choice"] not in [
                    "auto", "required"
            ] and not isinstance(data["tool_choice"], dict):
                raise ValueError(
                    f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
                    'Only named tools, "none", "auto" or "required" '\
                    'are supported.'
                )

            # if tool_choice is "required" but the "tools" list is empty,
            # override the data to behave like "none" to align with
            # OpenAI’s behavior.
            if data["tool_choice"] == "required" and isinstance(
                    data["tools"], list) and len(data["tools"]) == 0:
                data["tool_choice"] = "none"
                del data["tools"]
                return data

            # ensure that if "tool_choice" is specified as an object,
            # it matches a valid tool
            correct_usage_message = 'Correct usage: `{"type": "function",' \
                ' "function": {"name": "my_function"}}`'
            if isinstance(data["tool_choice"], dict):
                valid_tool = False
                function = data["tool_choice"].get("function")
                if not isinstance(function, dict):
                    raise ValueError(
                        f"Invalid value for `function`: `{function}` in "
                        f"`tool_choice`! {correct_usage_message}")
                if "name" not in function:
                    raise ValueError(f"Expected field `name` in `function` in "
                                     f"`tool_choice`! {correct_usage_message}")
                function_name = function["name"]
                if not isinstance(function_name,
                                  str) or len(function_name) == 0:
                    raise ValueError(
                        f"Invalid `name` in `function`: `{function_name}`"
                        f" in `tool_choice`! {correct_usage_message}")
                for tool in data["tools"]:
                    if tool["function"]["name"] == function_name:
                        valid_tool = True
                        break
                if not valid_tool:
                    raise ValueError(
                        "The tool specified in `tool_choice` does not match any"
                        " of the specified `tools`")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        if data.get("continue_final_message") and data.get(
                "add_generation_prompt"):
            raise ValueError("Cannot set both `continue_final_message` and "
                             "`add_generation_prompt` to True.")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_cache_salt_support(cls, data):
        if data.get("cache_salt") is not None:
            if not envs.VLLM_USE_V1:
                raise ValueError(
                    "Parameter 'cache_salt' is not supported with "
                    "this instance of vLLM, which uses engine V0.")
            if not isinstance(data["cache_salt"],
                              str) or not data["cache_salt"]:
                raise ValueError("Parameter 'cache_salt' must be a "
                                 "non-empty string if provided.")
        return data
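
A minimal sketch of parsing a raw request dict, which triggers the `mode="before"` validators defined above. The model name and messages are placeholders, and `include_usage` is assumed here to be a valid `StreamOptions` field:

# Minimal sketch: the before-mode validators run during model_validate.
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

req = ChatCompletionRequest.model_validate({
    "model": "example-model",
    "messages": [{"role": "user", "content": "Hi"}],
    "temperature": 0.2,
})

# validate_stream_options rejects stream_options when stream is not set:
try:
    ChatCompletionRequest.model_validate({
        "model": "example-model",
        "messages": [{"role": "user", "content": "Hi"}],
        "stream_options": {"include_usage": True},  # assumed StreamOptions field
    })
except Exception as exc:  # pydantic wraps the ValueError in a ValidationError
    print(type(exc).__name__)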

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,
    "min_p": 0.0,
}

add_generation_prompt class-attribute instance-attribute

add_generation_prompt: bool = Field(
    default=True,
    description="If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.",
)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=False,
    description="If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).",
)

allowed_token_ids class-attribute instance-attribute

allowed_token_ids: Optional[list[int]] = None

bad_words class-attribute instance-attribute

bad_words: list[str] = Field(default_factory=list)

best_of class-attribute instance-attribute

best_of: Optional[int] = None

cache_salt class-attribute instance-attribute

cache_salt: Optional[str] = Field(
    default=None,
    description="If specified, the prefix cache will be salted with the provided string to prevent an attacker to guess prompts in multi-user environments. The salt should be random, protected from access by 3rd parties, and long enough to be unpredictable (e.g., 43 characters base64-encoded, corresponding to 256 bit). Not supported by vLLM engine V0.",
)

chat_template class-attribute instance-attribute

chat_template: Optional[str] = Field(
    default=None,
    description="A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.",
)

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional keyword args to pass to the template renderer. Will be accessible by the chat template.",
)

continue_final_message class-attribute instance-attribute

continue_final_message: bool = Field(
    default=False,
    description='If this is set, the chat will be formatted so that the final message in the chat is open-ended, without any EOS tokens. The model will continue this message rather than starting a new one. This allows you to "prefill" part of the model\'s response for it. Cannot be used at the same time as `add_generation_prompt`.',
)

documents class-attribute instance-attribute

documents: Optional[list[dict[str, str]]] = Field(
    default=None,
    description='A list of dicts representing documents that will be accessible to the model if it is performing RAG (retrieval-augmented generation). If the template does not support RAG, this argument will have no effect. We recommend that each document should be a dict containing "title" and "text" keys.',
)

echo class-attribute instance-attribute

echo: bool = Field(
    default=False,
    description="If true, the new message will be prepended with the last message if they belong to the same role.",
)

frequency_penalty class-attribute instance-attribute

frequency_penalty: Optional[float] = 0.0

guided_choice class-attribute instance-attribute

guided_choice: Optional[list[str]] = Field(
    default=None,
    description="`guided_choice` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `choice` to `structured_outputs` instead.",
)

guided_decoding_backend class-attribute instance-attribute

guided_decoding_backend: Optional[str] = Field(
    default=None,
    description="`guided_decoding_backend` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please remove it from your request.",
)

guided_grammar class-attribute instance-attribute

guided_grammar: Optional[str] = Field(
    default=None,
    description="`guided_grammar` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `grammar` to `structured_outputs` instead.",
)

guided_json class-attribute instance-attribute

guided_json: Optional[Union[str, dict, BaseModel]] = Field(
    default=None,
    description="`guided_json` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `json` to `structured_outputs` instead.",
)

guided_regex class-attribute instance-attribute

guided_regex: Optional[str] = Field(
    default=None,
    description="`guided_regex` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `regex` to `structured_outputs` instead.",
)

guided_whitespace_pattern class-attribute instance-attribute

guided_whitespace_pattern: Optional[str] = Field(
    default=None,
    description="`guided_whitespace_pattern` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `whitespace_pattern` to `structured_outputs` instead.",
)

ignore_eos class-attribute instance-attribute

ignore_eos: bool = False

include_reasoning class-attribute instance-attribute

include_reasoning: bool = True

include_stop_str_in_output class-attribute instance-attribute

include_stop_str_in_output: bool = False

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: Optional[dict[str, Any]] = Field(
    default=None,
    description="KVTransfer parameters used for disaggregated serving.",
)

length_penalty class-attribute instance-attribute

length_penalty: float = 1.0

logit_bias class-attribute instance-attribute

logit_bias: Optional[dict[str, float]] = None

logits_processors class-attribute instance-attribute

logits_processors: Optional[LogitsProcessors] = Field(
    default=None,
    description="A list of either qualified names of logits processors, or constructor objects, to apply when sampling. A constructor is a JSON object with a required 'qualname' field specifying the qualified name of the processor class/factory, and optional 'args' and 'kwargs' fields containing positional and keyword arguments. For example: {'qualname': 'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': {'param': 'value'}}.",
)

logprobs class-attribute instance-attribute

logprobs: Optional[bool] = False

max_completion_tokens class-attribute instance-attribute

max_completion_tokens: Optional[int] = None

max_tokens class-attribute instance-attribute

max_tokens: Optional[int] = Field(
    default=None,
    deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
)

messages instance-attribute

messages: list[ChatCompletionMessageParam]

min_p class-attribute instance-attribute

min_p: Optional[float] = None

min_tokens class-attribute instance-attribute

min_tokens: int = 0

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: Optional[str] = None

n class-attribute instance-attribute

n: Optional[int] = 1

parallel_tool_calls class-attribute instance-attribute

parallel_tool_calls: Optional[bool] = False

presence_penalty class-attribute instance-attribute

presence_penalty: Optional[float] = 0.0

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[int] = None

reasoning_effort class-attribute instance-attribute

reasoning_effort: Optional[
    Literal["low", "medium", "high"]
] = None

repetition_penalty class-attribute instance-attribute

repetition_penalty: Optional[float] = None

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

response_format class-attribute instance-attribute

response_format: Optional[AnyResponseFormat] = None

return_token_ids class-attribute instance-attribute

return_token_ids: Optional[bool] = Field(
    default=None,
    description="If specified, the result will include token IDs alongside the generated text. In streaming mode, prompt_token_ids is included only in the first chunk, and token_ids contains the delta tokens for each chunk. This is useful for debugging or when you need to map generated text back to input tokens.",
)

return_tokens_as_token_ids class-attribute instance-attribute

return_tokens_as_token_ids: Optional[bool] = Field(
    default=None,
    description="If specified with 'logprobs', tokens are represented  as strings of the form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be identified.",
)

seed class-attribute instance-attribute

seed: Optional[int] = Field(
    None, ge=_LONG_INFO.min, le=_LONG_INFO.max
)

skip_special_tokens class-attribute instance-attribute

skip_special_tokens: bool = True

spaces_between_special_tokens class-attribute instance-attribute

spaces_between_special_tokens: bool = True

stop class-attribute instance-attribute

stop: Optional[Union[str, list[str]]] = []

stop_token_ids class-attribute instance-attribute

stop_token_ids: Optional[list[int]] = []

stream class-attribute instance-attribute

stream: Optional[bool] = False

stream_options class-attribute instance-attribute

stream_options: Optional[StreamOptions] = None

structural_tag class-attribute instance-attribute

structural_tag: Optional[str] = Field(
    default=None,
    description="`structural_tag` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `structural_tag` to `structured_outputs` instead.",
)

structured_outputs class-attribute instance-attribute

structured_outputs: Optional[StructuredOutputsParams] = (
    Field(
        default=None,
        description="Additional kwargs for structured outputs",
    )
)

temperature class-attribute instance-attribute

temperature: Optional[float] = None

tool_choice class-attribute instance-attribute

tool_choice: Optional[
    Union[
        Literal["none"],
        Literal["auto"],
        Literal["required"],
        ChatCompletionNamedToolChoiceParam,
    ]
] = "none"

tools class-attribute instance-attribute

tools: Optional[list[ChatCompletionToolsParam]] = None

top_k class-attribute instance-attribute

top_k: Optional[int] = None

top_logprobs class-attribute instance-attribute

top_logprobs: Optional[int] = 0

top_p class-attribute instance-attribute

top_p: Optional[float] = None

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

use_beam_search class-attribute instance-attribute

use_beam_search: bool = False

user class-attribute instance-attribute

user: Optional[str] = None

vllm_xargs class-attribute instance-attribute

vllm_xargs: Optional[dict[str, Union[str, int, float]]] = (
    Field(
        default=None,
        description="Additional request parameters with string or numeric values, used by custom extensions.",
    )
)

_get_json_schema_from_tool

_get_json_schema_from_tool() -> Optional[Union[str, dict]]
Source code in vllm/entrypoints/openai/protocol.py
def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]:
    # user has chosen to not use any tool
    if self.tool_choice == "none" or self.tools is None:
        return None

    # user has chosen to use a named tool
    if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
        tool_name = self.tool_choice.function.name
        tools = {tool.function.name: tool.function for tool in self.tools}
        if tool_name not in tools:
            raise ValueError(
                f"Tool '{tool_name}' has not been passed in `tools`.")
        tool = tools[tool_name]
        return tool.parameters

    if self.tool_choice == "required":
        # Pydantic schema generation cannot be used since the JSON schema
        # has to be constructed for a specific instantiation of a tool list
        # so that parameters of a function are correctly generated
        # based on the chosen function name
        def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
            return {
                "properties": {
                    "name": {
                        "type": "string",
                        "enum": [tool.function.name]
                    },
                    # parameters are always generated as '{}' in the final
                    # output if they are missing from the request
                    # (i.e. are None or '{}') so the schema is
                    # updated to produce an empty object in that case
                    "parameters": tool.function.parameters
                    if tool.function.parameters else {
                        "type": "object",
                        "properties": {}
                    }
                },
                "required": ["name", "parameters"]
            }

        def get_tool_schema_defs(
                tools: list[ChatCompletionToolsParam]) -> dict:
            all_defs = dict[str, dict[str, Any]]()
            for tool in tools:
                if tool.function.parameters is None:
                    continue
                defs = tool.function.parameters.pop("$defs", {})
                for def_name, def_schema in defs.items():
                    if def_name in all_defs and all_defs[
                            def_name] != def_schema:
                        raise ValueError(
                            f"Tool definition '{def_name}' has "
                            "multiple schemas, which is not "
                            "supported.")
                    else:
                        all_defs[def_name] = def_schema
            return all_defs

        json_schema = {
            "type": "array",
            "minItems": 1,
            "items": {
                "type": "object",
                "anyOf": [get_tool_schema(tool) for tool in self.tools]
            }
        }
        json_schema_defs = get_tool_schema_defs(self.tools)
        if json_schema_defs:
            json_schema["$defs"] = json_schema_defs
        return json_schema

    return None
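
For illustration, the schema this method would produce for `tool_choice="required"` with a single hypothetical tool named `get_weather` that declares no parameters:

# Resulting schema shape (hedged illustration; "get_weather" is hypothetical):
required_tools_schema = {
    "type": "array",
    "minItems": 1,
    "items": {
        "type": "object",
        "anyOf": [
            {
                "properties": {
                    "name": {"type": "string", "enum": ["get_weather"]},
                    "parameters": {"type": "object", "properties": {}},
                },
                "required": ["name", "parameters"],
            }
        ],
    },
}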

check_cache_salt_support classmethod

check_cache_salt_support(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
    if data.get("cache_salt") is not None:
        if not envs.VLLM_USE_V1:
            raise ValueError(
                "Parameter 'cache_salt' is not supported with "
                "this instance of vLLM, which uses engine V0.")
        if not isinstance(data["cache_salt"],
                          str) or not data["cache_salt"]:
            raise ValueError("Parameter 'cache_salt' must be a "
                             "non-empty string if provided.")
    return data

check_generation_prompt classmethod

check_generation_prompt(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    if data.get("continue_final_message") and data.get(
            "add_generation_prompt"):
        raise ValueError("Cannot set both `continue_final_message` and "
                         "`add_generation_prompt` to True.")
    return data

check_logprobs classmethod

check_logprobs(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_logprobs(cls, data):
    if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
        if data.get("stream") and (prompt_logprobs > 0
                                   or prompt_logprobs == -1):
            raise ValueError(
                "`prompt_logprobs` are not available when `stream=True`.")

        if prompt_logprobs < 0 and prompt_logprobs != -1:
            raise ValueError(
                "`prompt_logprobs` must be a positive value or -1.")
        if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
            raise ValueError("`prompt_logprobs=-1` is only supported with "
                             "vLLM engine V1.")
    if (top_logprobs := data.get("top_logprobs")) is not None:
        if top_logprobs < 0 and top_logprobs != -1:
            raise ValueError(
                "`top_logprobs` must be a positive value or -1.")

        if (top_logprobs == -1
                or top_logprobs > 0) and not data.get("logprobs"):
            raise ValueError(
                "when using `top_logprobs`, `logprobs` must be set to true."
            )

    return data

check_structured_outputs_count classmethod

check_structured_outputs_count(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_structured_outputs_count(cls, data):
    if isinstance(data, ValueError):
        raise data

    if data.get("structured_outputs", None) is None:
        return data

    structured_outputs_kwargs = data['structured_outputs']
    count = sum(
        structured_outputs_kwargs.get(k) is not None
        for k in ("json", "regex", "choice"))
    # you can only use one kind of constraints for structured outputs
    if count > 1:
        raise ValueError(
            "You can only use one kind of constraints for structured "
            "outputs ('json', 'regex' or 'choice').")
    # you can only either use structured outputs or tools, not both
    if count > 1 and data.get("tool_choice", "none") not in (
            "none",
            "auto",
            "required",
    ):
        raise ValueError(
            "You can only either use constraints for structured outputs "
            "or tools, not both.")
    return data

check_tool_usage classmethod

check_tool_usage(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_tool_usage(cls, data):

    # if "tool_choice" is not specified but tools are provided,
    # default to "auto" tool_choice
    if "tool_choice" not in data and data.get("tools"):
        data["tool_choice"] = "auto"

    # if "tool_choice" is "none" -- no validation is needed for tools
    if "tool_choice" in data and data["tool_choice"] == "none":
        return data

    # if "tool_choice" is specified -- validation
    if "tool_choice" in data and data["tool_choice"] is not None:

        # ensure that if "tool choice" is specified, tools are present
        if "tools" not in data or data["tools"] is None:
            raise ValueError(
                "When using `tool_choice`, `tools` must be set.")

        # make sure that tool choice is either a named tool
        # OR that it's set to "auto" or "required"
        if data["tool_choice"] not in [
                "auto", "required"
        ] and not isinstance(data["tool_choice"], dict):
            raise ValueError(
                f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
                'Only named tools, "none", "auto" or "required" '\
                'are supported.'
            )

        # if tool_choice is "required" but the "tools" list is empty,
        # override the data to behave like "none" to align with
        # OpenAI’s behavior.
        if data["tool_choice"] == "required" and isinstance(
                data["tools"], list) and len(data["tools"]) == 0:
            data["tool_choice"] = "none"
            del data["tools"]
            return data

        # ensure that if "tool_choice" is specified as an object,
        # it matches a valid tool
        correct_usage_message = 'Correct usage: `{"type": "function",' \
            ' "function": {"name": "my_function"}}`'
        if isinstance(data["tool_choice"], dict):
            valid_tool = False
            function = data["tool_choice"].get("function")
            if not isinstance(function, dict):
                raise ValueError(
                    f"Invalid value for `function`: `{function}` in "
                    f"`tool_choice`! {correct_usage_message}")
            if "name" not in function:
                raise ValueError(f"Expected field `name` in `function` in "
                                 f"`tool_choice`! {correct_usage_message}")
            function_name = function["name"]
            if not isinstance(function_name,
                              str) or len(function_name) == 0:
                raise ValueError(
                    f"Invalid `name` in `function`: `{function_name}`"
                    f" in `tool_choice`! {correct_usage_message}")
            for tool in data["tools"]:
                if tool["function"]["name"] == function_name:
                    valid_tool = True
                    break
            if not valid_tool:
                raise ValueError(
                    "The tool specified in `tool_choice` does not match any"
                    " of the specified `tools`")
    return data
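
As an illustration of the accepted shapes, the sketch below uses the named-tool form from the correct_usage_message above and also shows the implicit default to "auto" when tools are given without tool_choice. The messages field and the minimal function definition (name only) are assumptions; real requests typically also carry a parameters schema.

from vllm.entrypoints.openai.protocol import ChatCompletionRequest

# A named tool_choice must reference a function listed in `tools`.
payload = {
    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
    "tools": [{"type": "function", "function": {"name": "get_weather"}}],
    "tool_choice": {"type": "function", "function": {"name": "get_weather"}},
}
req = ChatCompletionRequest.model_validate(payload)
print(req.tool_choice)

# Providing tools without tool_choice defaults it to "auto".
auto_req = ChatCompletionRequest.model_validate(
    {k: v for k, v in payload.items() if k != "tool_choice"})
print(auto_req.tool_choice)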

to_beam_search_params

to_beam_search_params(
    max_tokens: int, default_sampling_params: dict
) -> BeamSearchParams
Source code in vllm/entrypoints/openai/protocol.py
def to_beam_search_params(
        self, max_tokens: int,
        default_sampling_params: dict) -> BeamSearchParams:

    n = self.n if self.n is not None else 1
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

    return BeamSearchParams(
        beam_width=n,
        max_tokens=max_tokens,
        ignore_eos=self.ignore_eos,
        temperature=temperature,
        length_penalty=self.length_penalty,
        include_stop_str_in_output=self.include_stop_str_in_output,
    )
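
A brief usage sketch, assuming the n and use_beam_search request fields documented elsewhere in this module: the request's n becomes the beam width, and an empty default_sampling_params dict falls back to the class-level _DEFAULT_SAMPLING_PARAMS.

from vllm.entrypoints.openai.protocol import ChatCompletionRequest

req = ChatCompletionRequest.model_validate({
    "messages": [{"role": "user", "content": "Write a haiku."}],
    "n": 4,
    "use_beam_search": True,
})

beam_params = req.to_beam_search_params(
    max_tokens=64, default_sampling_params={})
print(beam_params.beam_width, beam_params.temperature)  # 4, default temperature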

to_sampling_params

to_sampling_params(
    max_tokens: int,
    logits_processor_pattern: Optional[str],
    default_sampling_params: dict,
) -> SamplingParams
Source code in vllm/entrypoints/openai/protocol.py
def to_sampling_params(
    self,
    max_tokens: int,
    logits_processor_pattern: Optional[str],
    default_sampling_params: dict,
) -> SamplingParams:

    # Default parameters
    if (repetition_penalty := self.repetition_penalty) is None:
        repetition_penalty = default_sampling_params.get(
            "repetition_penalty",
            self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
        )
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
    if (top_k := self.top_k) is None:
        top_k = default_sampling_params.get(
            "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
    if (min_p := self.min_p) is None:
        min_p = default_sampling_params.get(
            "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

    prompt_logprobs = self.prompt_logprobs
    if prompt_logprobs is None and self.echo:
        prompt_logprobs = self.top_logprobs

    # Forward deprecated guided_* parameters to structured_outputs
    if self.structured_outputs is None:
        kwargs = dict[str, Any](
            json=self.guided_json,
            regex=self.guided_regex,
            choice=self.guided_choice,
            grammar=self.guided_grammar,
            whitespace_pattern=self.guided_whitespace_pattern,
            structural_tag=self.structural_tag,
        )
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        if len(kwargs) > 0:
            self.structured_outputs = StructuredOutputsParams(**kwargs)

    response_format = self.response_format
    json_schema_from_tool = self._get_json_schema_from_tool()
    if response_format is not None or json_schema_from_tool is not None:
        # If structured outputs wasn't already enabled,
        # we must enable it for these features to work
        if self.structured_outputs is None:
            self.structured_outputs = StructuredOutputsParams()

        # Set structured output params for response format
        if response_format is not None:
            if response_format.type == "json_object":
                self.structured_outputs.json_object = True
            elif response_format.type == "json_schema":
                json_schema = response_format.json_schema
                assert json_schema is not None
                self.structured_outputs.json = json_schema.json_schema
            elif response_format.type == "structural_tag":
                structural_tag = response_format
                assert structural_tag is not None and isinstance(
                    structural_tag, StructuralTagResponseFormat)
                s_tag_obj = structural_tag.model_dump(by_alias=True)
                self.structured_outputs.structural_tag = json.dumps(
                    s_tag_obj)

        # Set structured output params for tool calling
        if json_schema_from_tool is not None:
            self.structured_outputs.json = json_schema_from_tool

    extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
    if self.kv_transfer_params:
        # Pass in kv_transfer_params via extra_args
        extra_args["kv_transfer_params"] = self.kv_transfer_params
    return SamplingParams.from_optional(
        n=self.n,
        best_of=self.best_of,
        presence_penalty=self.presence_penalty,
        frequency_penalty=self.frequency_penalty,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        seed=self.seed,
        stop=self.stop,
        stop_token_ids=self.stop_token_ids,
        logprobs=self.top_logprobs if self.logprobs else None,
        prompt_logprobs=prompt_logprobs,
        ignore_eos=self.ignore_eos,
        max_tokens=max_tokens,
        min_tokens=self.min_tokens,
        skip_special_tokens=self.skip_special_tokens,
        spaces_between_special_tokens=self.spaces_between_special_tokens,
        logits_processors=get_logits_processors(self.logits_processors,
                                                logits_processor_pattern),
        include_stop_str_in_output=self.include_stop_str_in_output,
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        output_kind=RequestOutputKind.DELTA if self.stream \
            else RequestOutputKind.FINAL_ONLY,
        structured_outputs=self.structured_outputs,
        logit_bias=self.logit_bias,
        bad_words=self.bad_words,
        allowed_token_ids=self.allowed_token_ids,
        extra_args=extra_args or None,
    )
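
The sketch below exercises the deprecated-parameter forwarding shown above: a request that still uses guided_json ends up with structured_outputs populated before SamplingParams is built. The messages field is assumed; the call signature matches the one documented here.

from vllm.entrypoints.openai.protocol import ChatCompletionRequest

req = ChatCompletionRequest.model_validate({
    "messages": [{"role": "user", "content": "Return a JSON object."}],
    "guided_json": {"type": "object"},
    "temperature": 0.0,
})

params = req.to_sampling_params(
    max_tokens=128,
    logits_processor_pattern=None,
    default_sampling_params={},
)
# guided_json was forwarded into structured_outputs before params were built.
print(req.structured_outputs)
print(params)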

validate_stream_options classmethod

validate_stream_options(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    if data.get("stream_options") and not data.get("stream"):
        raise ValueError(
            "Stream options can only be defined when `stream=True`.")

    return data
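
A minimal sketch of the rule above, assuming StreamOptions accepts the OpenAI-style include_usage flag: supplying stream_options while stream is left at its default of False is rejected.

from pydantic import ValidationError

from vllm.entrypoints.openai.protocol import ChatCompletionRequest

try:
    ChatCompletionRequest.model_validate({
        "messages": [{"role": "user", "content": "Hi"}],
        "stream_options": {"include_usage": True},  # stream defaults to False
    })
except (ValidationError, ValueError) as exc:
    print(f"Rejected: {exc}")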

ChatCompletionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion"] = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseChoice]
    service_tier: Optional[Literal["auto", "default", "flex", "scale",
                                   "priority"]] = None
    system_fingerprint: Optional[str] = None
    usage: UsageInfo

    # vLLM-specific fields that are not in OpenAI spec
    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
    prompt_token_ids: Optional[list[int]] = None
    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None, description="KVTransfer parameters.")

choices instance-attribute

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"chatcmpl-{random_uuid()}"
)

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: Optional[dict[str, Any]] = Field(
    default=None, description="KVTransfer parameters."
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal['chat.completion'] = 'chat.completion'

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[
    list[Optional[dict[int, Logprob]]]
] = None

prompt_token_ids class-attribute instance-attribute

prompt_token_ids: Optional[list[int]] = None

service_tier class-attribute instance-attribute

service_tier: Optional[
    Literal["auto", "default", "flex", "scale", "priority"]
] = None

system_fingerprint class-attribute instance-attribute

system_fingerprint: Optional[str] = None

usage instance-attribute

usage: UsageInfo

ChatCompletionResponseChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionResponseChoice(OpenAIBaseModel):
    index: int
    message: ChatMessage
    logprobs: Optional[ChatCompletionLogProbs] = None
    # per OpenAI spec this is the default
    finish_reason: Optional[str] = "stop"
    # not part of the OpenAI spec but included in vLLM for legacy reasons
    stop_reason: Optional[Union[int, str]] = None
    # not part of the OpenAI spec but is useful for tracing the tokens
    # in agent scenarios
    token_ids: Optional[list[int]] = None

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = 'stop'

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: Optional[ChatCompletionLogProbs] = None

message instance-attribute

message: ChatMessage

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = None

token_ids class-attribute instance-attribute

token_ids: Optional[list[int]] = None

ChatCompletionResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
    index: int
    delta: DeltaMessage
    logprobs: Optional[ChatCompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = None
    # not part of the OpenAI spec but for tracing the tokens
    token_ids: Optional[list[int]] = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: Optional[ChatCompletionLogProbs] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = None

token_ids class-attribute instance-attribute

token_ids: Optional[list[int]] = None

ChatCompletionStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)
    # not part of the OpenAI spec but for tracing the tokens
    prompt_token_ids: Optional[list[int]] = None

choices instance-attribute

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"chatcmpl-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal["chat.completion.chunk"] = (
    "chat.completion.chunk"
)

prompt_token_ids class-attribute instance-attribute

prompt_token_ids: Optional[list[int]] = None

usage class-attribute instance-attribute

usage: Optional[UsageInfo] = Field(default=None)

ChatCompletionToolsParam

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatCompletionToolsParam(OpenAIBaseModel):
    type: Literal["function"] = "function"
    function: FunctionDefinition

function instance-attribute

type class-attribute instance-attribute

type: Literal['function'] = 'function'

ChatMessage

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ChatMessage(OpenAIBaseModel):
    role: str
    content: Optional[str] = None
    refusal: Optional[str] = None
    annotations: Optional[OpenAIAnnotation] = None
    audio: Optional[OpenAIChatCompletionAudio] = None
    function_call: Optional[FunctionCall] = None
    tool_calls: list[ToolCall] = Field(default_factory=list)

    # vLLM-specific fields that are not in OpenAI spec
    reasoning_content: Optional[str] = None

annotations class-attribute instance-attribute

annotations: Optional[Annotation] = None

audio class-attribute instance-attribute

audio: Optional[ChatCompletionAudio] = None

content class-attribute instance-attribute

content: Optional[str] = None

function_call class-attribute instance-attribute

function_call: Optional[FunctionCall] = None

reasoning_content class-attribute instance-attribute

reasoning_content: Optional[str] = None

refusal class-attribute instance-attribute

refusal: Optional[str] = None

role instance-attribute

role: str

tool_calls class-attribute instance-attribute

tool_calls: list[ToolCall] = Field(default_factory=list)

ClassificationData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ClassificationData(OpenAIBaseModel):
    index: int
    label: Optional[str]
    probs: list[float]
    num_classes: int

index instance-attribute

index: int

label instance-attribute

label: Optional[str]

num_classes instance-attribute

num_classes: int

probs instance-attribute

probs: list[float]

ClassificationRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ClassificationRequest(OpenAIBaseModel):
    model: Optional[str] = None
    input: Union[list[str], str]
    truncate_prompt_tokens: Optional[int] = None
    user: Optional[str] = None

    # --8<-- [start:classification-extra-params]
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    activation: Optional[bool] = None

    # --8<-- [end:classification-extra-params]

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            activation=self.activation)

activation class-attribute instance-attribute

activation: Optional[bool] = None

input instance-attribute

input: Union[list[str], str]

model class-attribute instance-attribute

model: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[int] = None

user class-attribute instance-attribute

user: Optional[str] = None

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        activation=self.activation)
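
A small usage sketch of the conversion above; the input strings and token limit are illustrative only.

from vllm.entrypoints.openai.protocol import ClassificationRequest

req = ClassificationRequest.model_validate({
    "input": ["great battery life", "screen cracked after a week"],
    "truncate_prompt_tokens": 512,
})

# Only the truncation and activation settings carry over to PoolingParams.
pooling_params = req.to_pooling_params()
print(pooling_params)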

ClassificationResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ClassificationResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ClassificationData]
    usage: UsageInfo

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

data instance-attribute

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"classify-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'list'

usage instance-attribute

usage: UsageInfo

CompletionLogProbs

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionLogProbs(OpenAIBaseModel):
    text_offset: list[int] = Field(default_factory=list)
    token_logprobs: list[Optional[float]] = Field(default_factory=list)
    tokens: list[str] = Field(default_factory=list)
    top_logprobs: list[Optional[dict[str,
                                     float]]] = Field(default_factory=list)

text_offset class-attribute instance-attribute

text_offset: list[int] = Field(default_factory=list)

token_logprobs class-attribute instance-attribute

token_logprobs: list[Optional[float]] = Field(
    default_factory=list
)

tokens class-attribute instance-attribute

tokens: list[str] = Field(default_factory=list)

top_logprobs class-attribute instance-attribute

top_logprobs: list[Optional[dict[str, float]]] = Field(
    default_factory=list
)

CompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/completions/create
    model: Optional[str] = None
    prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
    best_of: Optional[int] = None
    echo: Optional[bool] = False
    frequency_penalty: Optional[float] = 0.0
    logit_bias: Optional[dict[str, float]] = None
    logprobs: Optional[int] = None
    max_tokens: Optional[int] = 16
    n: int = 1
    presence_penalty: Optional[float] = 0.0
    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    stop: Optional[Union[str, list[str]]] = []
    stream: Optional[bool] = False
    stream_options: Optional[StreamOptions] = None
    suffix: Optional[str] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    user: Optional[str] = None

    # --8<-- [start:completion-sampling-params]
    use_beam_search: bool = False
    top_k: Optional[int] = None
    min_p: Optional[float] = None
    repetition_penalty: Optional[float] = None
    length_penalty: float = 1.0
    stop_token_ids: Optional[list[int]] = []
    include_stop_str_in_output: bool = False
    ignore_eos: bool = False
    min_tokens: int = 0
    skip_special_tokens: bool = True
    spaces_between_special_tokens: bool = True
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
    allowed_token_ids: Optional[list[int]] = None
    prompt_logprobs: Optional[int] = None
    # --8<-- [end:completion-sampling-params]

    # --8<-- [start:completion-extra-params]
    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."),
    )
    response_format: Optional[AnyResponseFormat] = Field(
        default=None,
        description=(
            "Similar to chat completion, this parameter specifies the format "
            "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
            ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
        ),
    )
    structured_outputs: Optional[StructuredOutputsParams] = Field(
        default=None,
        description="Additional kwargs for structured outputs",
    )
    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
        default=None,
        description=(
            "`guided_json` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `json` to `structured_outputs` instead."),
    )
    guided_regex: Optional[str] = Field(
        default=None,
        description=(
            "`guided_regex` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `regex` to `structured_outputs` instead."),
    )
    guided_choice: Optional[list[str]] = Field(
        default=None,
        description=(
            "`guided_choice` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `choice` to `structured_outputs` instead."),
    )
    guided_grammar: Optional[str] = Field(
        default=None,
        description=(
            "`guided_grammar` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `grammar` to `structured_outputs` instead."),
    )
    guided_decoding_backend: Optional[str] = Field(
        default=None,
        description=(
            "`guided_decoding_backend` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please remove it from your request."),
    )
    guided_whitespace_pattern: Optional[str] = Field(
        default=None,
        description=(
            "`guided_whitespace_pattern` is deprecated. "
            "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
            "Please pass `whitespace_pattern` to `structured_outputs` instead."
        ),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "throughout the inference process and returned in the response."),
    )
    logits_processors: Optional[LogitsProcessors] = Field(
        default=None,
        description=(
            "A list of either qualified names of logits processors, or "
            "constructor objects, to apply when sampling. A constructor is "
            "a JSON object with a required 'qualname' field specifying the "
            "qualified name of the processor class/factory, and optional "
            "'args' and 'kwargs' fields containing positional and keyword "
            "arguments. For example: {'qualname': "
            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
            "{'param': 'value'}}."))

    return_tokens_as_token_ids: Optional[bool] = Field(
        default=None,
        description=(
            "If specified with 'logprobs', tokens are represented "
            " as strings of the form 'token_id:{token_id}' so that tokens "
            "that are not JSON-encodable can be identified."))
    return_token_ids: Optional[bool] = Field(
        default=None,
        description=(
            "If specified, the result will include token IDs alongside the "
            "generated text. In streaming mode, prompt_token_ids is included "
            "only in the first chunk, and token_ids contains the delta tokens "
            "for each chunk. This is useful for debugging or when you "
            "need to map generated text back to input tokens."))

    cache_salt: Optional[str] = Field(
        default=None,
        description=(
            "If specified, the prefix cache will be salted with the provided "
            "string to prevent an attacker from guessing prompts in "
            "multi-user environments. The salt should be random, protected "
            "from access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, "
            "corresponding to 256 bits). Not supported by vLLM engine V0."))

    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.")

    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
        default=None,
        description=("Additional request parameters with string or "
                     "numeric values, used by custom extensions."),
    )

    # --8<-- [end:completion-extra-params]

    # Default sampling parameters for completion requests
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_beam_search_params(
        self,
        max_tokens: int,
        default_sampling_params: Optional[dict] = None,
    ) -> BeamSearchParams:

        if default_sampling_params is None:
            default_sampling_params = {}
        n = self.n if self.n is not None else 1

        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get("temperature", 1.0)

        return BeamSearchParams(
            beam_width=n,
            max_tokens=max_tokens,
            ignore_eos=self.ignore_eos,
            temperature=temperature,
            length_penalty=self.length_penalty,
            include_stop_str_in_output=self.include_stop_str_in_output,
        )

    def to_sampling_params(
        self,
        max_tokens: int,
        logits_processor_pattern: Optional[str],
        default_sampling_params: Optional[dict] = None,
    ) -> SamplingParams:

        if default_sampling_params is None:
            default_sampling_params = {}

        # Default parameters
        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
            )
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

        prompt_logprobs = self.prompt_logprobs
        if prompt_logprobs is None and self.echo:
            prompt_logprobs = self.logprobs

        echo_without_generation = self.echo and self.max_tokens == 0

        # Forward deprecated guided_* parameters to structured_outputs
        if self.structured_outputs is None:
            kwargs = dict[str, Any](
                json=self.guided_json,
                regex=self.guided_regex,
                choice=self.guided_choice,
                grammar=self.guided_grammar,
                whitespace_pattern=self.guided_whitespace_pattern,
            )
            kwargs = {k: v for k, v in kwargs.items() if v is not None}
            if len(kwargs) > 0:
                self.structured_outputs = StructuredOutputsParams(**kwargs)

        if (self.structured_outputs is not None
                and self.response_format is not None
                and self.response_format.type == "json_object"):
            self.structured_outputs.json_object = True

        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
        if self.kv_transfer_params:
            # Pass in kv_transfer_params via extra_args
            extra_args["kv_transfer_params"] = self.kv_transfer_params
        return SamplingParams.from_optional(
            n=self.n,
            best_of=self.best_of,
            presence_penalty=self.presence_penalty,
            frequency_penalty=self.frequency_penalty,
            repetition_penalty=repetition_penalty,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            seed=self.seed,
            stop=self.stop,
            stop_token_ids=self.stop_token_ids,
            logprobs=self.logprobs,
            ignore_eos=self.ignore_eos,
            max_tokens=max_tokens if not echo_without_generation else 1,
            min_tokens=self.min_tokens,
            prompt_logprobs=prompt_logprobs,
            skip_special_tokens=self.skip_special_tokens,
            spaces_between_special_tokens=self.spaces_between_special_tokens,
            include_stop_str_in_output=self.include_stop_str_in_output,
            logits_processors=get_logits_processors(self.logits_processors,
                                                    logits_processor_pattern),
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA if self.stream \
                else RequestOutputKind.FINAL_ONLY,
            structured_outputs=self.structured_outputs,
            logit_bias=self.logit_bias,
            allowed_token_ids=self.allowed_token_ids,
            extra_args=extra_args or None,
            )

    @model_validator(mode="before")
    @classmethod
    def check_structured_outputs_count(cls, data):
        if data.get("structured_outputs", None) is None:
            return data

        structured_outputs_kwargs = data['structured_outputs']
        count = sum(
            structured_outputs_kwargs.get(k) is not None
            for k in ("json", "regex", "choice"))
        if count > 1:
            raise ValueError(
                "You can only use one kind of constraints for structured "
                "outputs ('json', 'regex' or 'choice').")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_logprobs(cls, data):
        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
            if data.get("stream") and (prompt_logprobs > 0
                                       or prompt_logprobs == -1):
                raise ValueError(
                    "`prompt_logprobs` are not available when `stream=True`.")

            if prompt_logprobs < 0 and prompt_logprobs != -1:
                raise ValueError(
                    "`prompt_logprobs` must be a positive value or -1.")
            if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
                raise ValueError("`prompt_logprobs=-1` is only supported with "
                                 "vLLM engine V1.")
        if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
            raise ValueError("`logprobs` must be a positive value.")

        return data

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        if data.get("stream_options") and not data.get("stream"):
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data

    @model_validator(mode="before")
    @classmethod
    def validate_prompt_and_prompt_embeds(cls, data):
        prompt = data.get("prompt")
        prompt_embeds = data.get("prompt_embeds")

        prompt_is_empty = (prompt is None
                           or (isinstance(prompt, str) and prompt == ""))
        embeds_is_empty = (prompt_embeds is None
                           or (isinstance(prompt_embeds, list)
                               and len(prompt_embeds) == 0))

        if prompt_is_empty and embeds_is_empty:
            raise ValueError(
                "Either prompt or prompt_embeds must be provided and non-empty."
            )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_cache_salt_support(cls, data):
        if data.get("cache_salt") is not None:
            if not envs.VLLM_USE_V1:
                raise ValueError(
                    "Parameter 'cache_salt' is not supported with "
                    "this instance of vLLM, which uses engine V0.")
            if not isinstance(data["cache_salt"],
                              str) or not data["cache_salt"]:
                raise ValueError("Parameter 'cache_salt' must be a "
                                 "non-empty string if provided.")
        return data
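
Before the per-field reference below, a minimal request sketch (values are illustrative): only prompt or prompt_embeds is required, and request_id is generated automatically when the caller does not set it.

from vllm.entrypoints.openai.protocol import CompletionRequest

req = CompletionRequest.model_validate({
    "prompt": "The capital of France is",
    "max_tokens": 8,
})
print(req.n, req.temperature, req.max_tokens)  # 1 None 8
print(req.request_id)  # auto-generated UUID string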

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,
    "min_p": 0.0,
}

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=True,
    description="If true (the default), special tokens (e.g. BOS) will be added to the prompt.",
)

allowed_token_ids class-attribute instance-attribute

allowed_token_ids: Optional[list[int]] = None

best_of class-attribute instance-attribute

best_of: Optional[int] = None

cache_salt class-attribute instance-attribute

cache_salt: Optional[str] = Field(
    default=None,
    description="If specified, the prefix cache will be salted with the provided string to prevent an attacker from guessing prompts in multi-user environments. The salt should be random, protected from access by 3rd parties, and long enough to be unpredictable (e.g., 43 characters base64-encoded, corresponding to 256 bits). Not supported by vLLM engine V0.",
)

echo class-attribute instance-attribute

echo: Optional[bool] = False

frequency_penalty class-attribute instance-attribute

frequency_penalty: Optional[float] = 0.0

guided_choice class-attribute instance-attribute

guided_choice: Optional[list[str]] = Field(
    default=None,
    description="`guided_choice` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `choice` to `structured_outputs` instead.",
)

guided_decoding_backend class-attribute instance-attribute

guided_decoding_backend: Optional[str] = Field(
    default=None,
    description="`guided_decoding_backend` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please remove it from your request.",
)

guided_grammar class-attribute instance-attribute

guided_grammar: Optional[str] = Field(
    default=None,
    description="`guided_grammar` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `grammar` to `structured_outputs` instead.",
)

guided_json class-attribute instance-attribute

guided_json: Optional[Union[str, dict, BaseModel]] = Field(
    default=None,
    description="`guided_json` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `json` to `structured_outputs` instead.",
)

guided_regex class-attribute instance-attribute

guided_regex: Optional[str] = Field(
    default=None,
    description="`guided_regex` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `regex` to `structured_outputs` instead.",
)

guided_whitespace_pattern class-attribute instance-attribute

guided_whitespace_pattern: Optional[str] = Field(
    default=None,
    description="`guided_whitespace_pattern` is deprecated. This will be removed in v0.12.0 or v1.0.0, whichever is soonest. Please pass `whitespace_pattern` to `structured_outputs` instead.",
)

ignore_eos class-attribute instance-attribute

ignore_eos: bool = False

include_stop_str_in_output class-attribute instance-attribute

include_stop_str_in_output: bool = False

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: Optional[dict[str, Any]] = Field(
    default=None,
    description="KVTransfer parameters used for disaggregated serving.",
)

length_penalty class-attribute instance-attribute

length_penalty: float = 1.0

logit_bias class-attribute instance-attribute

logit_bias: Optional[dict[str, float]] = None

logits_processors class-attribute instance-attribute

logits_processors: Optional[LogitsProcessors] = Field(
    default=None,
    description="A list of either qualified names of logits processors, or constructor objects, to apply when sampling. A constructor is a JSON object with a required 'qualname' field specifying the qualified name of the processor class/factory, and optional 'args' and 'kwargs' fields containing positional and keyword arguments. For example: {'qualname': 'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': {'param': 'value'}}.",
)

logprobs class-attribute instance-attribute

logprobs: Optional[int] = None

max_tokens class-attribute instance-attribute

max_tokens: Optional[int] = 16

min_p class-attribute instance-attribute

min_p: Optional[float] = None

min_tokens class-attribute instance-attribute

min_tokens: int = 0

model class-attribute instance-attribute

model: Optional[str] = None

n class-attribute instance-attribute

n: int = 1

presence_penalty class-attribute instance-attribute

presence_penalty: Optional[float] = 0.0

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

prompt class-attribute instance-attribute

prompt: Optional[
    Union[list[int], list[list[int]], str, list[str]]
] = None

prompt_embeds class-attribute instance-attribute

prompt_embeds: Optional[Union[bytes, list[bytes]]] = None

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[int] = None

repetition_penalty class-attribute instance-attribute

repetition_penalty: Optional[float] = None

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used throughout the inference process and returned in the response.",
)

response_format class-attribute instance-attribute

response_format: Optional[AnyResponseFormat] = Field(
    default=None,
    description="Similar to chat completion, this parameter specifies the format of output. Only {'type': 'json_object'}, {'type': 'json_schema'}, {'type': 'structural_tag'}, or {'type': 'text' } is supported.",
)

return_token_ids class-attribute instance-attribute

return_token_ids: Optional[bool] = Field(
    default=None,
    description="If specified, the result will include token IDs alongside the generated text. In streaming mode, prompt_token_ids is included only in the first chunk, and token_ids contains the delta tokens for each chunk. This is useful for debugging or when you need to map generated text back to input tokens.",
)

return_tokens_as_token_ids class-attribute instance-attribute

return_tokens_as_token_ids: Optional[bool] = Field(
    default=None,
    description="If specified with 'logprobs', tokens are represented  as strings of the form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be identified.",
)

seed class-attribute instance-attribute

seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

skip_special_tokens class-attribute instance-attribute

skip_special_tokens: bool = True

spaces_between_special_tokens class-attribute instance-attribute

spaces_between_special_tokens: bool = True

stop class-attribute instance-attribute

stop: Optional[Union[str, list[str]]] = []

stop_token_ids class-attribute instance-attribute

stop_token_ids: Optional[list[int]] = []

stream class-attribute instance-attribute

stream: Optional[bool] = False

stream_options class-attribute instance-attribute

stream_options: Optional[StreamOptions] = None

structured_outputs class-attribute instance-attribute

structured_outputs: Optional[StructuredOutputsParams] = (
    Field(
        default=None,
        description="Additional kwargs for structured outputs",
    )
)

suffix class-attribute instance-attribute

suffix: Optional[str] = None

temperature class-attribute instance-attribute

temperature: Optional[float] = None

top_k class-attribute instance-attribute

top_k: Optional[int] = None

top_p class-attribute instance-attribute

top_p: Optional[float] = None

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

use_beam_search class-attribute instance-attribute

use_beam_search: bool = False

user class-attribute instance-attribute

user: Optional[str] = None

vllm_xargs class-attribute instance-attribute

vllm_xargs: Optional[dict[str, Union[str, int, float]]] = (
    Field(
        default=None,
        description="Additional request parameters with string or numeric values, used by custom extensions.",
    )
)

check_cache_salt_support classmethod

check_cache_salt_support(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
    if data.get("cache_salt") is not None:
        if not envs.VLLM_USE_V1:
            raise ValueError(
                "Parameter 'cache_salt' is not supported with "
                "this instance of vLLM, which uses engine V0.")
        if not isinstance(data["cache_salt"],
                          str) or not data["cache_salt"]:
            raise ValueError("Parameter 'cache_salt' must be a "
                             "non-empty string if provided.")
    return data

check_logprobs classmethod

check_logprobs(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_logprobs(cls, data):
    if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
        if data.get("stream") and (prompt_logprobs > 0
                                   or prompt_logprobs == -1):
            raise ValueError(
                "`prompt_logprobs` are not available when `stream=True`.")

        if prompt_logprobs < 0 and prompt_logprobs != -1:
            raise ValueError(
                "`prompt_logprobs` must be a positive value or -1.")
        if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
            raise ValueError("`prompt_logprobs=-1` is only supported with "
                             "vLLM engine V1.")
    if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
        raise ValueError("`logprobs` must be a positive value.")

    return data
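
A sketch of the streaming restriction above; the payload values are illustrative and the ValueError may surface wrapped in a pydantic ValidationError.

from pydantic import ValidationError

from vllm.entrypoints.openai.protocol import CompletionRequest

try:
    CompletionRequest.model_validate({
        "prompt": "Hello",
        "stream": True,
        "prompt_logprobs": 1,  # not allowed together with stream=True
    })
except (ValidationError, ValueError) as exc:
    print(f"Rejected: {exc}")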

check_structured_outputs_count classmethod

check_structured_outputs_count(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_structured_outputs_count(cls, data):
    if data.get("structured_outputs", None) is None:
        return data

    structured_outputs_kwargs = data['structured_outputs']
    count = sum(
        structured_outputs_kwargs.get(k) is not None
        for k in ("json", "regex", "choice"))
    if count > 1:
        raise ValueError(
            "You can only use one kind of constraints for structured "
            "outputs ('json', 'regex' or 'choice').")
    return data

to_beam_search_params

to_beam_search_params(
    max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> BeamSearchParams
Source code in vllm/entrypoints/openai/protocol.py
def to_beam_search_params(
    self,
    max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> BeamSearchParams:

    if default_sampling_params is None:
        default_sampling_params = {}
    n = self.n if self.n is not None else 1

    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get("temperature", 1.0)

    return BeamSearchParams(
        beam_width=n,
        max_tokens=max_tokens,
        ignore_eos=self.ignore_eos,
        temperature=temperature,
        length_penalty=self.length_penalty,
        include_stop_str_in_output=self.include_stop_str_in_output,
    )

to_sampling_params

to_sampling_params(
    max_tokens: int,
    logits_processor_pattern: Optional[str],
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/protocol.py
def to_sampling_params(
    self,
    max_tokens: int,
    logits_processor_pattern: Optional[str],
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams:

    if default_sampling_params is None:
        default_sampling_params = {}

    # Default parameters
    if (repetition_penalty := self.repetition_penalty) is None:
        repetition_penalty = default_sampling_params.get(
            "repetition_penalty",
            self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
        )
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
    if (top_k := self.top_k) is None:
        top_k = default_sampling_params.get(
            "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
    if (min_p := self.min_p) is None:
        min_p = default_sampling_params.get(
            "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

    prompt_logprobs = self.prompt_logprobs
    if prompt_logprobs is None and self.echo:
        prompt_logprobs = self.logprobs

    echo_without_generation = self.echo and self.max_tokens == 0

    # Forward deprecated guided_* parameters to structured_outputs
    if self.structured_outputs is None:
        kwargs = dict[str, Any](
            json=self.guided_json,
            regex=self.guided_regex,
            choice=self.guided_choice,
            grammar=self.guided_grammar,
            whitespace_pattern=self.guided_whitespace_pattern,
        )
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        if len(kwargs) > 0:
            self.structured_outputs = StructuredOutputsParams(**kwargs)

    if (self.structured_outputs is not None
            and self.response_format is not None
            and self.response_format.type == "json_object"):
        self.structured_outputs.json_object = True

    extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
    if self.kv_transfer_params:
        # Pass in kv_transfer_params via extra_args
        extra_args["kv_transfer_params"] = self.kv_transfer_params
    return SamplingParams.from_optional(
        n=self.n,
        best_of=self.best_of,
        presence_penalty=self.presence_penalty,
        frequency_penalty=self.frequency_penalty,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        seed=self.seed,
        stop=self.stop,
        stop_token_ids=self.stop_token_ids,
        logprobs=self.logprobs,
        ignore_eos=self.ignore_eos,
        max_tokens=max_tokens if not echo_without_generation else 1,
        min_tokens=self.min_tokens,
        prompt_logprobs=prompt_logprobs,
        skip_special_tokens=self.skip_special_tokens,
        spaces_between_special_tokens=self.spaces_between_special_tokens,
        include_stop_str_in_output=self.include_stop_str_in_output,
        logits_processors=get_logits_processors(self.logits_processors,
                                                logits_processor_pattern),
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        output_kind=RequestOutputKind.DELTA if self.stream \
            else RequestOutputKind.FINAL_ONLY,
        structured_outputs=self.structured_outputs,
        logit_bias=self.logit_bias,
        allowed_token_ids=self.allowed_token_ids,
        extra_args=extra_args or None,
        )
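
A usage sketch for the conversion above, with illustrative values. Note that max_tokens is supplied by the caller rather than read from the request inside this method, and logits_processor_pattern is left as None here since the sketch requests no logits processors.

from vllm.entrypoints.openai.protocol import CompletionRequest

req = CompletionRequest.model_validate({
    "prompt": "def fibonacci(n):",
    "max_tokens": 64,
    "temperature": 0.2,
    "logprobs": 5,
})

params = req.to_sampling_params(
    max_tokens=req.max_tokens,
    logits_processor_pattern=None,
)
print(params)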

validate_prompt_and_prompt_embeds classmethod

validate_prompt_and_prompt_embeds(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_prompt_and_prompt_embeds(cls, data):
    prompt = data.get("prompt")
    prompt_embeds = data.get("prompt_embeds")

    prompt_is_empty = (prompt is None
                       or (isinstance(prompt, str) and prompt == ""))
    embeds_is_empty = (prompt_embeds is None
                       or (isinstance(prompt_embeds, list)
                           and len(prompt_embeds) == 0))

    if prompt_is_empty and embeds_is_empty:
        raise ValueError(
            "Either prompt or prompt_embeds must be provided and non-empty."
        )

    return data
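
A minimal sketch of the emptiness check above: a request carrying neither a non-empty prompt nor prompt_embeds is rejected.

from pydantic import ValidationError

from vllm.entrypoints.openai.protocol import CompletionRequest

try:
    CompletionRequest.model_validate({"prompt": ""})  # empty, and no prompt_embeds
except (ValidationError, ValueError) as exc:
    print(f"Rejected: {exc}")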

validate_stream_options classmethod

validate_stream_options(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    if data.get("stream_options") and not data.get("stream"):
        raise ValueError(
            "Stream options can only be defined when `stream=True`.")

    return data

CompletionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: Literal["text_completion"] = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[CompletionResponseChoice]
    service_tier: Optional[Literal["auto", "default", "flex", "scale",
                                   "priority"]] = None
    system_fingerprint: Optional[str] = None
    usage: UsageInfo

    # vLLM-specific fields that are not in OpenAI spec
    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None, description="KVTransfer parameters.")

choices instance-attribute

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"cmpl-{random_uuid()}"
)

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: Optional[dict[str, Any]] = Field(
    default=None, description="KVTransfer parameters."
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal['text_completion'] = 'text_completion'

service_tier class-attribute instance-attribute

service_tier: Optional[
    Literal["auto", "default", "flex", "scale", "priority"]
] = None

system_fingerprint class-attribute instance-attribute

system_fingerprint: Optional[str] = None

usage instance-attribute

usage: UsageInfo

CompletionResponseChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionResponseChoice(OpenAIBaseModel):
    index: int
    text: str
    logprobs: Optional[CompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = Field(
        default=None,
        description=(
            "The stop string or token id that caused the completion "
            "to stop, None if the completion finished for some other reason "
            "including encountering the EOS token"),
    )
    token_ids: Optional[list[int]] = None  # For response
    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
    prompt_token_ids: Optional[list[int]] = None  # For prompt

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: Optional[CompletionLogProbs] = None

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[
    list[Optional[dict[int, Logprob]]]
] = None

prompt_token_ids class-attribute instance-attribute

prompt_token_ids: Optional[list[int]] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = Field(
    default=None,
    description="The stop string or token id that caused the completion to stop, None if the completion finished for some other reason including encountering the EOS token",
)

text instance-attribute

text: str

token_ids class-attribute instance-attribute

token_ids: Optional[list[int]] = None

CompletionResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionResponseStreamChoice(OpenAIBaseModel):
    index: int
    text: str
    logprobs: Optional[CompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = Field(
        default=None,
        description=(
            "The stop string or token id that caused the completion "
            "to stop, None if the completion finished for some other reason "
            "including encountering the EOS token"),
    )
    # not part of the OpenAI spec but for tracing the tokens
    # prompt tokens is put into choice to align with CompletionResponseChoice
    prompt_token_ids: Optional[list[int]] = None
    token_ids: Optional[list[int]] = None

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: Optional[CompletionLogProbs] = None

prompt_token_ids class-attribute instance-attribute

prompt_token_ids: Optional[list[int]] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = Field(
    default=None,
    description="The stop string or token id that caused the completion to stop, None if the completion finished for some other reason including encountering the EOS token",
)

text instance-attribute

text: str

token_ids class-attribute instance-attribute

token_ids: Optional[list[int]] = None

CompletionStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class CompletionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[CompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)

choices instance-attribute

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"cmpl-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'text_completion'

usage class-attribute instance-attribute

usage: Optional[UsageInfo] = Field(default=None)
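
The snippet below is a minimal sketch (not vLLM's own serving loop) of how one streaming chunk can be built from the two models above and serialized as a server-sent-events line; the model name is a placeholder.

from vllm.entrypoints.openai.protocol import (
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
)

# One incremental piece of text; finish_reason stays None until the last chunk.
choice = CompletionResponseStreamChoice(index=0, text="Hello", finish_reason=None)

chunk = CompletionStreamResponse(
    model="my-model",  # placeholder model name
    choices=[choice],
)

# id and created come from default factories; object defaults to "text_completion".
print(f"data: {chunk.model_dump_json()}")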

DeltaFunctionCall

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DeltaFunctionCall(BaseModel):
    name: Optional[str] = None
    arguments: Optional[str] = None

arguments class-attribute instance-attribute

arguments: Optional[str] = None

name class-attribute instance-attribute

name: Optional[str] = None

DeltaMessage

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DeltaMessage(OpenAIBaseModel):
    role: Optional[str] = None
    content: Optional[str] = None
    reasoning_content: Optional[str] = None
    tool_calls: list[DeltaToolCall] = Field(default_factory=list)

content class-attribute instance-attribute

content: Optional[str] = None

reasoning_content class-attribute instance-attribute

reasoning_content: Optional[str] = None

role class-attribute instance-attribute

role: Optional[str] = None

tool_calls class-attribute instance-attribute

tool_calls: list[DeltaToolCall] = Field(
    default_factory=list
)

DeltaToolCall

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DeltaToolCall(OpenAIBaseModel):
    id: Optional[str] = None
    type: Optional[Literal["function"]] = None
    index: int
    function: Optional[DeltaFunctionCall] = None

function class-attribute instance-attribute

function: Optional[DeltaFunctionCall] = None

id class-attribute instance-attribute

id: Optional[str] = None

index instance-attribute

index: int

type class-attribute instance-attribute

type: Optional[Literal['function']] = None
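
A minimal client-side sketch of how streamed tool-call deltas can be accumulated by index; the function name and argument fragments are illustrative only.

from vllm.entrypoints.openai.protocol import (
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
)

deltas = [
    DeltaMessage(tool_calls=[
        DeltaToolCall(index=0, id="call_0", type="function",
                      function=DeltaFunctionCall(name="get_weather", arguments=""))
    ]),
    DeltaMessage(tool_calls=[
        DeltaToolCall(index=0,
                      function=DeltaFunctionCall(arguments='{"city": "Par'))
    ]),
    DeltaMessage(tool_calls=[
        DeltaToolCall(index=0, function=DeltaFunctionCall(arguments='is"}'))
    ]),
]

# Concatenate argument fragments per tool-call index.
arguments: dict[int, str] = {}
for delta in deltas:
    for call in delta.tool_calls:
        if call.function is not None:
            arguments[call.index] = (arguments.get(call.index, "")
                                     + (call.function.arguments or ""))
print(arguments)  # {0: '{"city": "Paris"}'}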

DetokenizeRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DetokenizeRequest(OpenAIBaseModel):
    model: Optional[str] = None
    tokens: list[int]

model class-attribute instance-attribute

model: Optional[str] = None

tokens instance-attribute

tokens: list[int]

DetokenizeResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class DetokenizeResponse(OpenAIBaseModel):
    prompt: str

prompt instance-attribute

prompt: str
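
A minimal sketch of validating a detokenization request body and shaping the reply; the token ids and decoded text are arbitrary examples, and the tokenizer call is stubbed out.

from vllm.entrypoints.openai.protocol import DetokenizeRequest, DetokenizeResponse

payload = {"model": "my-model", "tokens": [9906, 1917]}
request = DetokenizeRequest.model_validate(payload)

# The server would run the tokenizer here; a literal string stands in for it.
decoded = "Hello world"
response = DetokenizeResponse(prompt=decoded)
print(response.model_dump_json())  # {"prompt": "Hello world"}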

EmbeddingChatRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class EmbeddingChatRequest(OpenAIBaseModel):
    model: Optional[str] = None
    messages: list[ChatCompletionMessageParam]

    encoding_format: Literal["float", "base64"] = "float"
    dimensions: Optional[int] = None
    user: Optional[str] = None
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:chat-embedding-extra-params]
    add_generation_prompt: bool = Field(
        default=False,
        description=
        ("If true, the generation prompt will be added to the chat template. "
         "This is a parameter used by chat template in tokenizer config of the "
         "model."),
    )

    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."),
    )
    chat_template: Optional[str] = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."),
    )
    chat_template_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."),
    )
    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."),
    )
    normalize: Optional[bool] = None
    # --8<-- [end:chat-embedding-extra-params]

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        if data.get("continue_final_message") and data.get(
                "add_generation_prompt"):
            raise ValueError("Cannot set both `continue_final_message` and "
                             "`add_generation_prompt` to True.")
        return data

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize)

add_generation_prompt class-attribute instance-attribute

add_generation_prompt: bool = Field(
    default=False,
    description="If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.",
)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=False,
    description="If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).",
)

chat_template class-attribute instance-attribute

chat_template: Optional[str] = Field(
    default=None,
    description="A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.",
)

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional keyword args to pass to the template renderer. Will be accessible by the chat template.",
)

dimensions class-attribute instance-attribute

dimensions: Optional[int] = None

encoding_format class-attribute instance-attribute

encoding_format: Literal['float', 'base64'] = 'float'

messages instance-attribute

messages: list[ChatCompletionMessageParam]

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: Optional[str] = None

normalize class-attribute instance-attribute

normalize: Optional[bool] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

user class-attribute instance-attribute

user: Optional[str] = None

check_generation_prompt classmethod

check_generation_prompt(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    if data.get("continue_final_message") and data.get(
            "add_generation_prompt"):
        raise ValueError("Cannot set both `continue_final_message` and "
                         "`add_generation_prompt` to True.")
    return data

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        dimensions=self.dimensions,
        normalize=self.normalize)
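
A minimal sketch of validating an embedding chat request and converting it into pooling parameters via to_pooling_params; the model name and message content are placeholders.

from vllm.entrypoints.openai.protocol import EmbeddingChatRequest

request = EmbeddingChatRequest.model_validate({
    "model": "my-embedding-model",
    "messages": [{"role": "user", "content": "Represent this sentence."}],
    "dimensions": 256,
    "truncate_prompt_tokens": 128,
})

# Carries truncate_prompt_tokens, dimensions and normalize over to pooling.
pooling_params = request.to_pooling_params()
print(pooling_params)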

EmbeddingCompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class EmbeddingCompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/embeddings
    model: Optional[str] = None
    input: Union[list[int], list[list[int]], str, list[str]]
    encoding_format: Literal["float", "base64"] = "float"
    dimensions: Optional[int] = None
    user: Optional[str] = None
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:embedding-extra-params]
    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."),
    )
    normalize: Optional[bool] = None

    # --8<-- [end:embedding-extra-params]

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=True,
    description="If true (the default), special tokens (e.g. BOS) will be added to the prompt.",
)

dimensions class-attribute instance-attribute

dimensions: Optional[int] = None

encoding_format class-attribute instance-attribute

encoding_format: Literal['float', 'base64'] = 'float'

input instance-attribute

input: Union[list[int], list[list[int]], str, list[str]]

model class-attribute instance-attribute

model: Optional[str] = None

normalize class-attribute instance-attribute

normalize: Optional[bool] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

user class-attribute instance-attribute

user: Optional[str] = None

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        dimensions=self.dimensions,
        normalize=self.normalize)

EmbeddingResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class EmbeddingResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[EmbeddingResponseData]
    usage: UsageInfo

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

data instance-attribute

data: list[EmbeddingResponseData]

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"embd-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'list'

usage instance-attribute

usage: UsageInfo

EmbeddingResponseData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class EmbeddingResponseData(OpenAIBaseModel):
    index: int
    object: str = "embedding"
    embedding: Union[list[float], str]

embedding instance-attribute

embedding: Union[list[float], str]

index instance-attribute

index: int

object class-attribute instance-attribute

object: str = 'embedding'
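
A minimal sketch of shaping a /v1/embeddings reply from the two models above; the vector values and token counts are made up, and UsageInfo is assumed to accept prompt_tokens and total_tokens.

from vllm.entrypoints.openai.protocol import (
    EmbeddingResponse,
    EmbeddingResponseData,
    UsageInfo,
)

response = EmbeddingResponse(
    model="my-embedding-model",
    data=[EmbeddingResponseData(index=0, embedding=[0.12, -0.03, 0.88])],
    usage=UsageInfo(prompt_tokens=5, total_tokens=5),
)
print(response.model_dump_json(indent=2))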

ErrorInfo

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ErrorInfo(OpenAIBaseModel):
    message: str
    type: str
    param: Optional[str] = None
    code: int

code instance-attribute

code: int

message instance-attribute

message: str

param class-attribute instance-attribute

param: Optional[str] = None

type instance-attribute

type: str

ErrorResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ErrorResponse(OpenAIBaseModel):
    error: ErrorInfo

error instance-attribute

error: ErrorInfo
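
A minimal sketch of the error envelope the server returns; the message and status code are illustrative.

from vllm.entrypoints.openai.protocol import ErrorInfo, ErrorResponse

error = ErrorResponse(error=ErrorInfo(
    message="The model `unknown-model` does not exist.",
    type="NotFoundError",
    code=404,
))
print(error.model_dump_json())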

ExtractedToolCallInformation

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ExtractedToolCallInformation(BaseModel):
    # indicate if tools were called
    tools_called: bool

    # extracted tool calls
    tool_calls: list[ToolCall]

    # content - per the OpenAI spec, content AND tool calls should rarely be
    # returned together, but some models do this intentionally
    content: Optional[str] = None

content class-attribute instance-attribute

content: Optional[str] = None

tool_calls instance-attribute

tool_calls: list[ToolCall]

tools_called instance-attribute

tools_called: bool

FunctionCall

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class FunctionCall(OpenAIBaseModel):
    name: str
    arguments: str

arguments instance-attribute

arguments: str

name instance-attribute

name: str

FunctionDefinition

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class FunctionDefinition(OpenAIBaseModel):
    name: str
    description: Optional[str] = None
    parameters: Optional[dict[str, Any]] = None

description class-attribute instance-attribute

description: Optional[str] = None

name instance-attribute

name: str

parameters class-attribute instance-attribute

parameters: Optional[dict[str, Any]] = None
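
A minimal sketch of what a tool-call parser could hand back using these models; it assumes ToolCall (defined elsewhere in this module) wraps a FunctionCall under its function field, and the tool name and arguments are illustrative.

from vllm.entrypoints.openai.protocol import (
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)

info = ExtractedToolCallInformation(
    tools_called=True,
    tool_calls=[ToolCall(function=FunctionCall(
        name="get_weather",
        arguments='{"city": "Paris"}',
    ))],
    content=None,  # no plain-text content alongside the tool call
)
print(info.tool_calls[0].function.name)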

IOProcessorRequest

Bases: OpenAIBaseModel, Generic[T]

Source code in vllm/entrypoints/openai/protocol.py
class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
    model: Optional[str] = None

    priority: int = Field(default=0)
    """
    The priority of the request (lower means earlier handling;
    default: 0). Any priority other than 0 will raise an error
    if the served model does not use priority scheduling.
    """
    data: T
    """
    When using IOProcessor plugins, the actual input is processed
    by the plugin itself. Hence, we use a generic type for the request data.
    """
    softmax: bool = True

    def to_pooling_params(self):
        return PoolingParams(task="encode", softmax=self.softmax)

data instance-attribute

data: T

When using IOProcessor plugins, the actual input is processed by the plugin itself. Hence, we use a generic type for the request data.

model class-attribute instance-attribute

model: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(default=0)

The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.

softmax class-attribute instance-attribute

softmax: bool = True

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(task="encode", softmax=self.softmax)
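
A minimal sketch of riding a plugin-specific payload through the generic IOProcessorRequest; the PointCloud type is purely hypothetical and stands in for whatever an IOProcessor plugin expects.

from pydantic import BaseModel

from vllm.entrypoints.openai.protocol import IOProcessorRequest


class PointCloud(BaseModel):  # hypothetical plugin input type
    points: list[list[float]]


request = IOProcessorRequest[PointCloud](
    data=PointCloud(points=[[0.0, 1.0, 2.0]]),
    softmax=False,
)
# Pooling params with task="encode" and softmax disabled.
print(request.to_pooling_params())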

IOProcessorResponse

Bases: OpenAIBaseModel, Generic[T]

Source code in vllm/entrypoints/openai/protocol.py
class IOProcessorResponse(OpenAIBaseModel, Generic[T]):

    request_id: Optional[str] = None
    """
    The request_id associated with this response
    """
    created_at: int = Field(default_factory=lambda: int(time.time()))

    data: T
    """
    When using IOProcessor plugins, the actual output is generated
    by the plugin itself. Hence, we use a generic type for the response data.
    """

created_at class-attribute instance-attribute

created_at: int = Field(default_factory=lambda: int(time()))

data instance-attribute

data: T

When using IOProcessor plugins, the actual output is generated by the plugin itself. Hence, we use a generic type for the response data.

request_id class-attribute instance-attribute

request_id: Optional[str] = None

The request_id associated with this response

InputTokensDetails

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class InputTokensDetails(OpenAIBaseModel):
    cached_tokens: int

cached_tokens instance-attribute

cached_tokens: int

JsonSchemaResponseFormat

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class JsonSchemaResponseFormat(OpenAIBaseModel):
    name: str
    description: Optional[str] = None
    # schema is the field in openai but that causes conflicts with pydantic so
    # instead use json_schema with an alias
    json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
    strict: Optional[bool] = None

description class-attribute instance-attribute

description: Optional[str] = None

json_schema class-attribute instance-attribute

json_schema: Optional[dict[str, Any]] = Field(
    default=None, alias="schema"
)

name instance-attribute

name: str

strict class-attribute instance-attribute

strict: Optional[bool] = None
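
A minimal sketch showing that the OpenAI-style "schema" key is accepted through the pydantic alias while the attribute is exposed as json_schema; the schema itself is illustrative.

from vllm.entrypoints.openai.protocol import JsonSchemaResponseFormat

fmt = JsonSchemaResponseFormat.model_validate({
    "name": "person",
    "schema": {  # validated into the json_schema field via the alias
        "type": "object",
        "properties": {"name": {"type": "string"}},
        "required": ["name"],
    },
    "strict": True,
})
print(fmt.json_schema["required"])  # ['name']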

LoadLoRAAdapterRequest

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class LoadLoRAAdapterRequest(BaseModel):
    lora_name: str
    lora_path: str

lora_name instance-attribute

lora_name: str

lora_path instance-attribute

lora_path: str

LogitsProcessorConstructor

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class LogitsProcessorConstructor(BaseModel):
    qualname: str
    args: Optional[list[Any]] = None
    kwargs: Optional[dict[str, Any]] = None

    model_config = ConfigDict(extra="forbid")

args class-attribute instance-attribute

args: Optional[list[Any]] = None

kwargs class-attribute instance-attribute

kwargs: Optional[dict[str, Any]] = None

model_config class-attribute instance-attribute

model_config = ConfigDict(extra='forbid')

qualname instance-attribute

qualname: str
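
A minimal sketch of the two accepted forms of a logits-processor entry (see the LogitsProcessors alias): a plain qualified name, or a constructor with args/kwargs. The qualname shown is a placeholder, not a processor shipped with vLLM.

from vllm.entrypoints.openai.protocol import LogitsProcessorConstructor

by_name = "my_pkg.processors.BanWords"  # plain string entry
with_kwargs = LogitsProcessorConstructor(
    qualname="my_pkg.processors.BanWords",
    kwargs={"words": ["foo", "bar"]},
)
logits_processors = [by_name, with_kwargs]  # a valid LogitsProcessors value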

ModelCard

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ModelCard(OpenAIBaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "vllm"
    root: Optional[str] = None
    parent: Optional[str] = None
    max_model_len: Optional[int] = None
    permission: list[ModelPermission] = Field(default_factory=list)

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id instance-attribute

id: str

max_model_len class-attribute instance-attribute

max_model_len: Optional[int] = None

object class-attribute instance-attribute

object: str = 'model'

owned_by class-attribute instance-attribute

owned_by: str = 'vllm'

parent class-attribute instance-attribute

parent: Optional[str] = None

permission class-attribute instance-attribute

permission: list[ModelPermission] = Field(
    default_factory=list
)

root class-attribute instance-attribute

root: Optional[str] = None

ModelList

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ModelList(OpenAIBaseModel):
    object: str = "list"
    data: list[ModelCard] = Field(default_factory=list)

data class-attribute instance-attribute

data: list[ModelCard] = Field(default_factory=list)

object class-attribute instance-attribute

object: str = 'list'
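
A minimal sketch of assembling the /v1/models payload from ModelCard and ModelList; the model id and context length are placeholders.

from vllm.entrypoints.openai.protocol import ModelCard, ModelList

models = ModelList(data=[
    ModelCard(id="my-model", root="my-model", max_model_len=8192),
])
print(models.model_dump_json(indent=2))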

ModelPermission

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ModelPermission(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
    object: str = "model_permission"
    created: int = Field(default_factory=lambda: int(time.time()))
    allow_create_engine: bool = False
    allow_sampling: bool = True
    allow_logprobs: bool = True
    allow_search_indices: bool = False
    allow_view: bool = True
    allow_fine_tuning: bool = False
    organization: str = "*"
    group: Optional[str] = None
    is_blocking: bool = False

allow_create_engine class-attribute instance-attribute

allow_create_engine: bool = False

allow_fine_tuning class-attribute instance-attribute

allow_fine_tuning: bool = False

allow_logprobs class-attribute instance-attribute

allow_logprobs: bool = True

allow_sampling class-attribute instance-attribute

allow_sampling: bool = True

allow_search_indices class-attribute instance-attribute

allow_search_indices: bool = False

allow_view class-attribute instance-attribute

allow_view: bool = True

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

group class-attribute instance-attribute

group: Optional[str] = None

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"modelperm-{random_uuid()}"
)

is_blocking class-attribute instance-attribute

is_blocking: bool = False

object class-attribute instance-attribute

object: str = 'model_permission'

organization class-attribute instance-attribute

organization: str = '*'

OpenAIBaseModel

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class OpenAIBaseModel(BaseModel):
    # OpenAI API does allow extra fields
    model_config = ConfigDict(extra="allow")

    # Cache class field names
    field_names: ClassVar[Optional[set[str]]] = None

    @model_validator(mode="wrap")
    @classmethod
    def __log_extra_fields__(cls, data, handler):
        result = handler(data)
        if not isinstance(data, dict):
            return result
        field_names = cls.field_names
        if field_names is None:
            # Get all class field names and their potential aliases
            field_names = set()
            for field_name, field in cls.model_fields.items():
                field_names.add(field_name)
                if alias := getattr(field, "alias", None):
                    field_names.add(alias)
            cls.field_names = field_names

        # Compare against both field names and aliases
        if any(k not in field_names for k in data):
            logger.warning(
                "The following fields were present in the request "
                "but ignored: %s",
                data.keys() - field_names,
            )
        return result

field_names class-attribute

field_names: Optional[set[str]] = None

model_config class-attribute instance-attribute

model_config = ConfigDict(extra='allow')

__log_extra_fields__ classmethod

__log_extra_fields__(data, handler)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="wrap")
@classmethod
def __log_extra_fields__(cls, data, handler):
    result = handler(data)
    if not isinstance(data, dict):
        return result
    field_names = cls.field_names
    if field_names is None:
        # Get all class field names and their potential aliases
        field_names = set()
        for field_name, field in cls.model_fields.items():
            field_names.add(field_name)
            if alias := getattr(field, "alias", None):
                field_names.add(alias)
        cls.field_names = field_names

    # Compare against both field names and aliases
    if any(k not in field_names for k in data):
        logger.warning(
            "The following fields were present in the request "
            "but ignored: %s",
            data.keys() - field_names,
        )
    return result
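
A minimal sketch of the extra-field behaviour defined here: unknown keys are accepted (extra="allow") but reported through the logger by __log_extra_fields__; DetokenizeRequest is used only as a convenient OpenAIBaseModel subclass.

from vllm.entrypoints.openai.protocol import DetokenizeRequest

request = DetokenizeRequest.model_validate({
    "tokens": [1, 2, 3],
    "not_a_real_field": True,  # logged as "present in the request but ignored"
})
print(request.tokens)  # [1, 2, 3]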

OutputTokensDetails

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class OutputTokensDetails(OpenAIBaseModel):
    reasoning_tokens: int = 0
    tool_output_tokens: int = 0

reasoning_tokens class-attribute instance-attribute

reasoning_tokens: int = 0

tool_output_tokens class-attribute instance-attribute

tool_output_tokens: int = 0

PoolingResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class PoolingResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[PoolingResponseData]
    usage: UsageInfo

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

data instance-attribute

data: list[PoolingResponseData]

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"pool-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'list'

usage instance-attribute

usage: UsageInfo

PoolingResponseData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class PoolingResponseData(OpenAIBaseModel):
    index: int
    object: str = "pooling"
    data: Union[list[list[float]], list[float], str]

data instance-attribute

data: Union[list[list[float]], list[float], str]

index instance-attribute

index: int

object class-attribute instance-attribute

object: str = 'pooling'

PromptTokenUsageInfo

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class PromptTokenUsageInfo(OpenAIBaseModel):
    cached_tokens: Optional[int] = None

cached_tokens class-attribute instance-attribute

cached_tokens: Optional[int] = None

RequestResponseMetadata

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RequestResponseMetadata(BaseModel):
    request_id: str
    final_usage_info: Optional[UsageInfo] = None

final_usage_info class-attribute instance-attribute

final_usage_info: Optional[UsageInfo] = None

request_id instance-attribute

request_id: str

RerankDocument

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankDocument(BaseModel):
    text: Optional[str] = None
    multi_modal: Optional[ScoreContentPartParam] = None

multi_modal class-attribute instance-attribute

multi_modal: Optional[ScoreContentPartParam] = None

text class-attribute instance-attribute

text: Optional[str] = None

RerankRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankRequest(OpenAIBaseModel):
    model: Optional[str] = None
    query: Union[str, ScoreMultiModalParam]
    documents: Union[list[str], ScoreMultiModalParam]
    top_n: int = Field(default_factory=lambda: 0)
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:rerank-extra-params]

    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )

    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    activation: Optional[bool] = None

    # --8<-- [end:rerank-extra-params]

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            activation=self.activation)

activation class-attribute instance-attribute

activation: Optional[bool] = None

documents instance-attribute

documents: Union[list[str], ScoreMultiModalParam]

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

query instance-attribute

query: Union[str, ScoreMultiModalParam]

top_n class-attribute instance-attribute

top_n: int = Field(default_factory=lambda: 0)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        activation=self.activation)

RerankResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankResponse(OpenAIBaseModel):
    id: str
    model: str
    usage: RerankUsage
    results: list[RerankResult]

id instance-attribute

id: str

model instance-attribute

model: str

results instance-attribute

results: list[RerankResult]

usage instance-attribute

usage: RerankUsage

RerankResult

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankResult(BaseModel):
    index: int
    document: RerankDocument
    relevance_score: float

document instance-attribute

document: RerankDocument

index instance-attribute

index: int

relevance_score instance-attribute

relevance_score: float

RerankUsage

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class RerankUsage(BaseModel):
    total_tokens: int

total_tokens instance-attribute

total_tokens: int
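
A minimal sketch of a rerank exchange shaped with the request/response models above; the query, documents, score and token count are illustrative.

from vllm.entrypoints.openai.protocol import (
    RerankDocument,
    RerankRequest,
    RerankResponse,
    RerankResult,
    RerankUsage,
)

request = RerankRequest.model_validate({
    "model": "my-reranker",
    "query": "What is the capital of France?",
    "documents": ["Paris is the capital of France.", "Berlin is in Germany."],
    "top_n": 1,
})

response = RerankResponse(
    id="rerank-123",
    model="my-reranker",
    usage=RerankUsage(total_tokens=42),
    results=[RerankResult(
        index=0,
        document=RerankDocument(text=request.documents[0]),
        relevance_score=0.98,
    )],
)
print(response.results[0].relevance_score)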

ResponseFormat

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ResponseFormat(OpenAIBaseModel):
    # type must be "json_schema", "json_object", or "text"
    type: Literal["text", "json_object", "json_schema"]
    json_schema: Optional[JsonSchemaResponseFormat] = None

json_schema class-attribute instance-attribute

json_schema: Optional[JsonSchemaResponseFormat] = None

type instance-attribute

type: Literal['text', 'json_object', 'json_schema']

ResponseReasoningPartAddedEvent

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
    content_index: int
    """The index of the content part that is done."""

    item_id: str
    """The ID of the output item that the content part was added to."""

    output_index: int
    """The index of the output item that the content part was added to."""

    part: ResponseReasoningTextContent
    """The content part that is done."""

    sequence_number: int
    """The sequence number of this event."""

    type: Literal["response.reasoning_part.added"]
    """The type of the event. Always `response.reasoning_part.added`."""

content_index instance-attribute

content_index: int

The index of the content part that is done.

item_id instance-attribute

item_id: str

The ID of the output item that the content part was added to.

output_index instance-attribute

output_index: int

The index of the output item that the content part was added to.

part instance-attribute

part: Content

The content part that is done.

sequence_number instance-attribute

sequence_number: int

The sequence number of this event.

type instance-attribute

type: Literal['response.reasoning_part.added']

The type of the event. Always response.reasoning_part.added.

ResponseReasoningPartDoneEvent

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
    content_index: int
    """The index of the content part that is done."""

    item_id: str
    """The ID of the output item that the content part was added to."""

    output_index: int
    """The index of the output item that the content part was added to."""

    part: ResponseReasoningTextContent
    """The content part that is done."""

    sequence_number: int
    """The sequence number of this event."""

    type: Literal["response.reasoning_part.done"]
    """The type of the event. Always `response.reasoning_part.done`."""

content_index instance-attribute

content_index: int

The index of the content part that is done.

item_id instance-attribute

item_id: str

The ID of the output item that the content part was added to.

output_index instance-attribute

output_index: int

The index of the output item that the content part was added to.

part instance-attribute

part: Content

The content part that is done.

sequence_number instance-attribute

sequence_number: int

The sequence number of this event.

type instance-attribute

type: Literal['response.reasoning_part.done']

The type of the event. Always response.reasoning_part.done.

ResponseUsage

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ResponseUsage(OpenAIBaseModel):
    input_tokens: int
    input_tokens_details: InputTokensDetails
    output_tokens: int
    output_tokens_details: OutputTokensDetails
    total_tokens: int

input_tokens instance-attribute

input_tokens: int

input_tokens_details instance-attribute

input_tokens_details: InputTokensDetails

output_tokens instance-attribute

output_tokens: int

output_tokens_details instance-attribute

output_tokens_details: OutputTokensDetails

total_tokens instance-attribute

total_tokens: int

ResponsesRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ResponsesRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/responses/create
    background: Optional[bool] = False
    include: Optional[list[
        Literal[
            "code_interpreter_call.outputs",
            "computer_call_output.output.image_url",
            "file_search_call.results",
            "message.input_image.image_url",
            "message.output_text.logprobs",
            "reasoning.encrypted_content",
        ],
    ]] = None
    input: Union[str, list[ResponseInputOutputItem]]
    instructions: Optional[str] = None
    max_output_tokens: Optional[int] = None
    max_tool_calls: Optional[int] = None
    metadata: Optional[Metadata] = None
    model: Optional[str] = None
    parallel_tool_calls: Optional[bool] = True
    previous_response_id: Optional[str] = None
    prompt: Optional[ResponsePrompt] = None
    reasoning: Optional[Reasoning] = None
    service_tier: Literal["auto", "default", "flex", "scale",
                          "priority"] = "auto"
    store: Optional[bool] = True
    stream: Optional[bool] = False
    temperature: Optional[float] = None
    text: Optional[ResponseTextConfig] = None
    tool_choice: ToolChoice = "auto"
    tools: list[Tool] = Field(default_factory=list)
    top_logprobs: Optional[int] = 0
    top_p: Optional[float] = None
    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
    user: Optional[str] = None

    # --8<-- [start:responses-extra-params]
    request_id: str = Field(
        default_factory=lambda: f"resp_{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."),
    )
    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
    cache_salt: Optional[str] = Field(
        default=None,
        description=(
            "If specified, the prefix cache will be salted with the provided "
            "string to prevent an attacker to guess prompts in multi-user "
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit). Not supported by vLLM engine V0."))

    enable_response_messages: bool = Field(
        default=False,
        description=(
            "Dictates whether or not to return messages as part of the "
            "response object. Currently only supported for non-streaming "
            "non-background and gpt-oss only. "))
    # --8<-- [end:responses-extra-params]

    _DEFAULT_SAMPLING_PARAMS = {
        "temperature": 1.0,
        "top_p": 1.0,
    }

    def to_sampling_params(
        self,
        default_max_tokens: int,
        default_sampling_params: Optional[dict] = None,
    ) -> SamplingParams:
        if self.max_output_tokens is None:
            max_tokens = default_max_tokens
        else:
            max_tokens = min(self.max_output_tokens, default_max_tokens)

        default_sampling_params = default_sampling_params or {}
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
        stop_token_ids = default_sampling_params.get("stop_token_ids")

        # Structured output
        structured_outputs = None
        if self.text is not None and self.text.format is not None:
            response_format = self.text.format
            if (response_format.type == "json_schema"
                    and response_format.schema_ is not None):
                structured_outputs = StructuredOutputsParams(
                    json=response_format.schema_)
            elif response_format.type == "json_object":
                raise NotImplementedError("json_object is not supported")

        # TODO: add more parameters
        return SamplingParams.from_optional(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            logprobs=self.top_logprobs
            if self.is_include_output_logprobs() else None,
            stop_token_ids=stop_token_ids,
            output_kind=(RequestOutputKind.DELTA
                         if self.stream else RequestOutputKind.FINAL_ONLY),
            structured_outputs=structured_outputs,
        )

    def is_include_output_logprobs(self) -> bool:
        """Check if the request includes output logprobs."""
        if self.include is None:
            return False
        return isinstance(
            self.include,
            list) and "message.output_text.logprobs" in self.include

    @model_validator(mode="before")
    def validate_background(cls, data):
        if not data.get("background"):
            return data
        if not data.get("store", True):
            raise ValueError(
                "background can only be used when `store` is true")
        return data

    @model_validator(mode="before")
    def validate_prompt(cls, data):
        if data.get("prompt") is not None:
            raise ValueError("prompt template is not supported")
        return data

    @model_validator(mode="before")
    def check_cache_salt_support(cls, data):
        if data.get("cache_salt") is not None:
            if not envs.VLLM_USE_V1:
                raise ValueError(
                    "Parameter 'cache_salt' is not supported with "
                    "this instance of vLLM, which uses engine V0.")
            if not isinstance(data["cache_salt"],
                              str) or not data["cache_salt"]:
                raise ValueError("Parameter 'cache_salt' must be a "
                                 "non-empty string if provided.")
        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS = {
    "temperature": 1.0,
    "top_p": 1.0,
}

background class-attribute instance-attribute

background: Optional[bool] = False

cache_salt class-attribute instance-attribute

cache_salt: Optional[str] = Field(
    default=None,
    description="If specified, the prefix cache will be salted with the provided string to prevent an attacker to guess prompts in multi-user environments. The salt should be random, protected from access by 3rd parties, and long enough to be unpredictable (e.g., 43 characters base64-encoded, corresponding to 256 bit). Not supported by vLLM engine V0.",
)

enable_response_messages class-attribute instance-attribute

enable_response_messages: bool = Field(
    default=False,
    description="Dictates whether or not to return messages as part of the response object. Currently only supported for non-streaming non-background and gpt-oss only. ",
)

include class-attribute instance-attribute

include: Optional[
    list[
        Literal[
            "code_interpreter_call.outputs",
            "computer_call_output.output.image_url",
            "file_search_call.results",
            "message.input_image.image_url",
            "message.output_text.logprobs",
            "reasoning.encrypted_content",
        ],
    ]
] = None

input instance-attribute

input: Union[str, list[ResponseInputOutputItem]]

instructions class-attribute instance-attribute

instructions: Optional[str] = None

max_output_tokens class-attribute instance-attribute

max_output_tokens: Optional[int] = None

max_tool_calls class-attribute instance-attribute

max_tool_calls: Optional[int] = None

metadata class-attribute instance-attribute

metadata: Optional[Metadata] = None

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: Optional[str] = None

parallel_tool_calls class-attribute instance-attribute

parallel_tool_calls: Optional[bool] = True

previous_response_id class-attribute instance-attribute

previous_response_id: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

prompt class-attribute instance-attribute

prompt: Optional[ResponsePrompt] = None

reasoning class-attribute instance-attribute

reasoning: Optional[Reasoning] = None

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"resp_{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

service_tier class-attribute instance-attribute

service_tier: Literal[
    "auto", "default", "flex", "scale", "priority"
] = "auto"

store class-attribute instance-attribute

store: Optional[bool] = True

stream class-attribute instance-attribute

stream: Optional[bool] = False

temperature class-attribute instance-attribute

temperature: Optional[float] = None

text class-attribute instance-attribute

text: Optional[ResponseFormatTextConfig] = None

tool_choice class-attribute instance-attribute

tool_choice: ToolChoice = 'auto'

tools class-attribute instance-attribute

tools: list[Tool] = Field(default_factory=list)

top_logprobs class-attribute instance-attribute

top_logprobs: Optional[int] = 0

top_p class-attribute instance-attribute

top_p: Optional[float] = None

truncation class-attribute instance-attribute

truncation: Optional[Literal["auto", "disabled"]] = (
    "disabled"
)

user class-attribute instance-attribute

user: Optional[str] = None

check_cache_salt_support

check_cache_salt_support(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
def check_cache_salt_support(cls, data):
    if data.get("cache_salt") is not None:
        if not envs.VLLM_USE_V1:
            raise ValueError(
                "Parameter 'cache_salt' is not supported with "
                "this instance of vLLM, which uses engine V0.")
        if not isinstance(data["cache_salt"],
                          str) or not data["cache_salt"]:
            raise ValueError("Parameter 'cache_salt' must be a "
                             "non-empty string if provided.")
    return data

is_include_output_logprobs

is_include_output_logprobs() -> bool

Check if the request includes output logprobs.

Source code in vllm/entrypoints/openai/protocol.py
def is_include_output_logprobs(self) -> bool:
    """Check if the request includes output logprobs."""
    if self.include is None:
        return False
    return isinstance(
        self.include,
        list) and "message.output_text.logprobs" in self.include

to_sampling_params

to_sampling_params(
    default_max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/protocol.py
def to_sampling_params(
    self,
    default_max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams:
    if self.max_output_tokens is None:
        max_tokens = default_max_tokens
    else:
        max_tokens = min(self.max_output_tokens, default_max_tokens)

    default_sampling_params = default_sampling_params or {}
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
    stop_token_ids = default_sampling_params.get("stop_token_ids")

    # Structured output
    structured_outputs = None
    if self.text is not None and self.text.format is not None:
        response_format = self.text.format
        if (response_format.type == "json_schema"
                and response_format.schema_ is not None):
            structured_outputs = StructuredOutputsParams(
                json=response_format.schema_)
        elif response_format.type == "json_object":
            raise NotImplementedError("json_object is not supported")

    # TODO: add more parameters
    return SamplingParams.from_optional(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        logprobs=self.top_logprobs
        if self.is_include_output_logprobs() else None,
        stop_token_ids=stop_token_ids,
        output_kind=(RequestOutputKind.DELTA
                     if self.stream else RequestOutputKind.FINAL_ONLY),
        structured_outputs=structured_outputs,
    )

validate_background

validate_background(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
def validate_background(cls, data):
    if not data.get("background"):
        return data
    if not data.get("store", True):
        raise ValueError(
            "background can only be used when `store` is true")
    return data

validate_prompt

validate_prompt(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
def validate_prompt(cls, data):
    if data.get("prompt") is not None:
        raise ValueError("prompt template is not supported")
    return data
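
A minimal sketch of converting a Responses API request into SamplingParams with to_sampling_params; the 512-token cap stands in for the server-side default and the prompt is illustrative.

from vllm.entrypoints.openai.protocol import ResponsesRequest

request = ResponsesRequest.model_validate({
    "model": "my-model",
    "input": "Write a haiku about the sea.",
    "temperature": 0.2,
    "max_output_tokens": 128,
})

# max_tokens is clamped to the server default; unset knobs fall back to
# _DEFAULT_SAMPLING_PARAMS (e.g. top_p=1.0).
sampling_params = request.to_sampling_params(default_max_tokens=512)
print(sampling_params.temperature, sampling_params.max_tokens)  # 0.2 128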

ResponsesResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ResponsesResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
    created_at: int = Field(default_factory=lambda: int(time.time()))
    # error: Optional[ResponseError] = None
    incomplete_details: Optional[IncompleteDetails] = None
    instructions: Optional[str] = None
    metadata: Optional[Metadata] = None
    model: str
    object: Literal["response"] = "response"
    output: list[ResponseOutputItem]
    # These are populated when enable_response_messages is set to True
    # TODO: Currently an issue where content of harmony messages
    # is not available when these are serialized. Metadata is available
    input_messages: Optional[list[ChatCompletionMessageParam]] = None
    output_messages: Optional[list[ChatCompletionMessageParam]] = None
    parallel_tool_calls: bool
    temperature: float
    tool_choice: ToolChoice
    tools: list[Tool]
    top_p: float
    background: bool
    max_output_tokens: int
    max_tool_calls: Optional[int] = None
    previous_response_id: Optional[str] = None
    prompt: Optional[ResponsePrompt] = None
    reasoning: Optional[Reasoning] = None
    service_tier: Literal["auto", "default", "flex", "scale", "priority"]
    status: ResponseStatus
    text: Optional[ResponseTextConfig] = None
    top_logprobs: Optional[int] = None
    truncation: Literal["auto", "disabled"]
    usage: Optional[ResponseUsage] = None
    user: Optional[str] = None

    @classmethod
    def from_request(
        cls,
        request: ResponsesRequest,
        sampling_params: SamplingParams,
        model_name: str,
        created_time: int,
        output: list[ResponseOutputItem],
        status: ResponseStatus,
        usage: Optional[ResponseUsage] = None,
        input_messages: Optional[list[ChatCompletionMessageParam]] = None,
        output_messages: Optional[list[ChatCompletionMessageParam]] = None,
    ) -> "ResponsesResponse":

        incomplete_details: Optional[IncompleteDetails] = None
        if status == 'incomplete':
            incomplete_details = IncompleteDetails(reason='max_output_tokens')
        # TODO: implement the other reason for incomplete_details,
        # which is content_filter
        # incomplete_details = IncompleteDetails(reason='content_filter')
        return cls(
            id=request.request_id,
            created_at=created_time,
            incomplete_details=incomplete_details,
            instructions=request.instructions,
            metadata=request.metadata,
            model=model_name,
            output=output,
            input_messages=input_messages,
            output_messages=output_messages,
            parallel_tool_calls=request.parallel_tool_calls,
            temperature=sampling_params.temperature,
            tool_choice=request.tool_choice,
            tools=request.tools,
            top_p=sampling_params.top_p,
            background=request.background,
            max_output_tokens=sampling_params.max_tokens,
            max_tool_calls=request.max_tool_calls,
            previous_response_id=request.previous_response_id,
            prompt=request.prompt,
            reasoning=request.reasoning,
            service_tier=request.service_tier,
            status=status,
            text=request.text,
            top_logprobs=sampling_params.logprobs,
            truncation=request.truncation,
            user=request.user,
            usage=usage,
        )

background instance-attribute

background: bool

created_at class-attribute instance-attribute

created_at: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"resp_{random_uuid()}"
)

incomplete_details class-attribute instance-attribute

incomplete_details: Optional[IncompleteDetails] = None

input_messages class-attribute instance-attribute

input_messages: Optional[
    list[ChatCompletionMessageParam]
] = None

instructions class-attribute instance-attribute

instructions: Optional[str] = None

max_output_tokens instance-attribute

max_output_tokens: int

max_tool_calls class-attribute instance-attribute

max_tool_calls: Optional[int] = None

metadata class-attribute instance-attribute

metadata: Optional[Metadata] = None

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal['response'] = 'response'

output instance-attribute

output: list[ResponseOutputItem]

output_messages class-attribute instance-attribute

output_messages: Optional[
    list[ChatCompletionMessageParam]
] = None

parallel_tool_calls instance-attribute

parallel_tool_calls: bool

previous_response_id class-attribute instance-attribute

previous_response_id: Optional[str] = None

prompt class-attribute instance-attribute

prompt: Optional[ResponsePrompt] = None

reasoning class-attribute instance-attribute

reasoning: Optional[Reasoning] = None

service_tier instance-attribute

service_tier: Literal[
    "auto", "default", "flex", "scale", "priority"
]

status instance-attribute

status: ResponseStatus

temperature instance-attribute

temperature: float

text class-attribute instance-attribute

text: Optional[ResponseFormatTextConfig] = None

tool_choice instance-attribute

tool_choice: ToolChoice

tools instance-attribute

tools: list[Tool]

top_logprobs class-attribute instance-attribute

top_logprobs: Optional[int] = None

top_p instance-attribute

top_p: float

truncation instance-attribute

truncation: Literal['auto', 'disabled']

usage class-attribute instance-attribute

usage: Optional[ResponseUsage] = None

user class-attribute instance-attribute

user: Optional[str] = None

from_request classmethod

from_request(
    request: ResponsesRequest,
    sampling_params: SamplingParams,
    model_name: str,
    created_time: int,
    output: list[ResponseOutputItem],
    status: ResponseStatus,
    usage: Optional[ResponseUsage] = None,
    input_messages: Optional[
        list[ChatCompletionMessageParam]
    ] = None,
    output_messages: Optional[
        list[ChatCompletionMessageParam]
    ] = None,
) -> ResponsesResponse
Source code in vllm/entrypoints/openai/protocol.py
@classmethod
def from_request(
    cls,
    request: ResponsesRequest,
    sampling_params: SamplingParams,
    model_name: str,
    created_time: int,
    output: list[ResponseOutputItem],
    status: ResponseStatus,
    usage: Optional[ResponseUsage] = None,
    input_messages: Optional[list[ChatCompletionMessageParam]] = None,
    output_messages: Optional[list[ChatCompletionMessageParam]] = None,
) -> "ResponsesResponse":

    incomplete_details: Optional[IncompleteDetails] = None
    if status == 'incomplete':
        incomplete_details = IncompleteDetails(reason='max_output_tokens')
    # TODO: implement the other reason for incomplete_details,
    # which is content_filter
    # incomplete_details = IncompleteDetails(reason='content_filter')
    return cls(
        id=request.request_id,
        created_at=created_time,
        incomplete_details=incomplete_details,
        instructions=request.instructions,
        metadata=request.metadata,
        model=model_name,
        output=output,
        input_messages=input_messages,
        output_messages=output_messages,
        parallel_tool_calls=request.parallel_tool_calls,
        temperature=sampling_params.temperature,
        tool_choice=request.tool_choice,
        tools=request.tools,
        top_p=sampling_params.top_p,
        background=request.background,
        max_output_tokens=sampling_params.max_tokens,
        max_tool_calls=request.max_tool_calls,
        previous_response_id=request.previous_response_id,
        prompt=request.prompt,
        reasoning=request.reasoning,
        service_tier=request.service_tier,
        status=status,
        text=request.text,
        top_logprobs=sampling_params.logprobs,
        truncation=request.truncation,
        user=request.user,
        usage=usage,
    )

ScoreRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ScoreRequest(OpenAIBaseModel):
    model: Optional[str] = None
    text_1: Union[list[str], str, ScoreMultiModalParam]
    text_2: Union[list[str], str, ScoreMultiModalParam]
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:score-extra-params]

    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )

    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    activation: Optional[bool] = None

    # --8<-- [end:score-extra-params]

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            activation=self.activation)

activation class-attribute instance-attribute

activation: Optional[bool] = None

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: Optional[str] = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

text_1 instance-attribute

text_1: Union[list[str], str, ScoreMultiModalParam]

text_2 instance-attribute

text_2: Union[list[str], str, ScoreMultiModalParam]

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Field(ge=-1)]
] = None

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/openai/protocol.py
def to_pooling_params(self):
    return PoolingParams(
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        activation=self.activation)
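
As a usage sketch (the model name and texts are illustrative, not vLLM defaults), a score request pairs text_1 with one or more text_2 candidates and forwards its truncation and activation settings to PoolingParams:

req = ScoreRequest(
    model="my-cross-encoder",  # hypothetical served model name
    text_1="What is the capital of France?",
    text_2=["Paris is the capital of France.",
            "Berlin is the capital of Germany."],
    truncate_prompt_tokens=512,
)
pooling_params = req.to_pooling_params()
# pooling_params carries truncate_prompt_tokens=512 and activation=None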

ScoreResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ScoreResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ScoreResponseData]
    usage: UsageInfo

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

data instance-attribute

data: list[ScoreResponseData]

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"embd-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'list'

usage instance-attribute

usage: UsageInfo

ScoreResponseData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ScoreResponseData(OpenAIBaseModel):
    index: int
    object: str = "score"
    score: float

index instance-attribute

index: int

object class-attribute instance-attribute

object: str = 'score'

score instance-attribute

score: float
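
For reference, an illustrative response body (all values made up) holds one ScoreResponseData entry per scored pair plus the usual usage accounting:

resp = ScoreResponse(
    model="my-cross-encoder",
    data=[ScoreResponseData(index=0, score=0.93),
          ScoreResponseData(index=1, score=0.04)],
    usage=UsageInfo(prompt_tokens=28, total_tokens=28),
)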

StreamOptions

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class StreamOptions(OpenAIBaseModel):
    include_usage: Optional[bool] = True
    continuous_usage_stats: Optional[bool] = False

continuous_usage_stats class-attribute instance-attribute

continuous_usage_stats: Optional[bool] = False

include_usage class-attribute instance-attribute

include_usage: Optional[bool] = True
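
These options are normally sent alongside stream=True; a sketch of such a request body (the fields other than stream_options are illustrative):

payload = {
    "model": "my-model",
    "messages": [{"role": "user", "content": "Hi"}],
    "stream": True,
    "stream_options": {"include_usage": True,
                       "continuous_usage_stats": False},
}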

StructuralTag

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class StructuralTag(OpenAIBaseModel):
    begin: str
    # schema is the field, but that causes conflicts with pydantic so
    # instead use structural_tag_schema with an alias
    structural_tag_schema: Optional[dict[str, Any]] = Field(default=None,
                                                            alias="schema")
    end: str

begin instance-attribute

begin: str

end instance-attribute

end: str

structural_tag_schema class-attribute instance-attribute

structural_tag_schema: Optional[dict[str, Any]] = Field(
    default=None, alias="schema"
)

StructuralTagResponseFormat

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class StructuralTagResponseFormat(OpenAIBaseModel):
    type: Literal["structural_tag"]
    structures: list[StructuralTag]
    triggers: list[str]

structures instance-attribute

structures: list[StructuralTag]

triggers instance-attribute

triggers: list[str]

type instance-attribute

type: Literal['structural_tag']
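
A sketch of building a structural-tag response format (the tag strings and schema are illustrative); note that the JSON key is schema, which populates structural_tag_schema via the pydantic alias:

fmt = StructuralTagResponseFormat(
    type="structural_tag",
    structures=[
        StructuralTag(
            begin="<function=get_weather>",
            end="</function>",
            **{"schema": {"type": "object",
                          "properties": {"city": {"type": "string"}}}},
        )
    ],
    triggers=["<function="],
)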

TokenizeChatRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TokenizeChatRequest(OpenAIBaseModel):
    model: Optional[str] = None
    messages: list[ChatCompletionMessageParam]

    add_generation_prompt: bool = Field(
        default=True,
        description=
        ("If true, the generation prompt will be added to the chat template. "
         "This is a parameter used by chat template in tokenizer config of the "
         "model."),
    )
    return_token_strs: Optional[bool] = Field(
        default=False,
        description=("If true, also return the token strings "
                     "corresponding to the token ids."),
    )
    continue_final_message: bool = Field(
        default=False,
        description=
        ("If this is set, the chat will be formatted so that the final "
         "message in the chat is open-ended, without any EOS tokens. The "
         "model will continue this message rather than starting a new one. "
         "This allows you to \"prefill\" part of the model's response for it. "
         "Cannot be used at the same time as `add_generation_prompt`."),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."),
    )
    chat_template: Optional[str] = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."),
    )
    chat_template_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."),
    )
    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    tools: Optional[list[ChatCompletionToolsParam]] = Field(
        default=None,
        description=("A list of tools the model may call."),
    )

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        if data.get("continue_final_message") and data.get(
                "add_generation_prompt"):
            raise ValueError("Cannot set both `continue_final_message` and "
                             "`add_generation_prompt` to True.")
        return data

add_generation_prompt class-attribute instance-attribute

add_generation_prompt: bool = Field(
    default=True,
    description="If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.",
)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=False,
    description="If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).",
)

chat_template class-attribute instance-attribute

chat_template: Optional[str] = Field(
    default=None,
    description="A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.",
)

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional keyword args to pass to the template renderer. Will be accessible by the chat template.",
)

continue_final_message class-attribute instance-attribute

continue_final_message: bool = Field(
    default=False,
    description='If this is set, the chat will be formatted so that the final message in the chat is open-ended, without any EOS tokens. The model will continue this message rather than starting a new one. This allows you to "prefill" part of the model\'s response for it. Cannot be used at the same time as `add_generation_prompt`.',
)

messages instance-attribute

messages: list[ChatCompletionMessageParam]

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[dict[str, Any]] = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: Optional[str] = None

return_token_strs class-attribute instance-attribute

return_token_strs: Optional[bool] = Field(
    default=False,
    description="If true, also return the token strings corresponding to the token ids.",
)

tools class-attribute instance-attribute

tools: Optional[list[ChatCompletionToolsParam]] = Field(
    default=None,
    description="A list of tools the model may call.",
)

check_generation_prompt classmethod

check_generation_prompt(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    if data.get("continue_final_message") and data.get(
            "add_generation_prompt"):
        raise ValueError("Cannot set both `continue_final_message` and "
                         "`add_generation_prompt` to True.")
    return data
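
A sketch of a chat-style tokenize request (model name and message are illustrative); per the validator above, continue_final_message and add_generation_prompt cannot both be true:

req = TokenizeChatRequest(
    model="my-model",
    messages=[{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True,
    return_token_strs=True,
)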

TokenizeCompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TokenizeCompletionRequest(OpenAIBaseModel):
    model: Optional[str] = None
    prompt: str

    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."),
    )
    return_token_strs: Optional[bool] = Field(
        default=False,
        description=("If true, also return the token strings "
                     "corresponding to the token ids."),
    )

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=True,
    description="If true (the default), special tokens (e.g. BOS) will be added to the prompt.",
)

model class-attribute instance-attribute

model: Optional[str] = None

prompt instance-attribute

prompt: str

return_token_strs class-attribute instance-attribute

return_token_strs: Optional[bool] = Field(
    default=False,
    description="If true, also return the token strings corresponding to the token ids.",
)

TokenizeResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TokenizeResponse(OpenAIBaseModel):
    count: int
    max_model_len: int
    tokens: list[int]
    token_strs: Optional[list[str]] = None

count instance-attribute

count: int

max_model_len instance-attribute

max_model_len: int

token_strs class-attribute instance-attribute

token_strs: Optional[list[str]] = None

tokens instance-attribute

tokens: list[int]
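
An illustrative request/response pair for plain-prompt tokenization (the token ids and strings are made up):

req = TokenizeCompletionRequest(model="my-model",
                                prompt="Hello world",
                                return_token_strs=True)
resp = TokenizeResponse(count=2,
                        max_model_len=4096,
                        tokens=[15339, 1917],
                        token_strs=["Hello", " world"])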

TokenizerInfoResponse

Bases: OpenAIBaseModel

Response containing tokenizer configuration equivalent to tokenizer_config.json

Source code in vllm/entrypoints/openai/protocol.py
class TokenizerInfoResponse(OpenAIBaseModel):
    """
    Response containing tokenizer configuration
    equivalent to tokenizer_config.json
    """

    model_config = ConfigDict(extra="allow")
    tokenizer_class: str

model_config class-attribute instance-attribute

model_config = ConfigDict(extra='allow')

tokenizer_class instance-attribute

tokenizer_class: str

ToolCall

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class ToolCall(OpenAIBaseModel):
    id: str = Field(default_factory=make_tool_call_id)
    type: Literal["function"] = "function"
    function: FunctionCall

function instance-attribute

function: FunctionCall

id class-attribute instance-attribute

id: str = Field(default_factory=make_tool_call_id)

type class-attribute instance-attribute

type: Literal['function'] = 'function'
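
A sketch of a tool call as it would appear in a chat completion message, assuming FunctionCall carries a name and a JSON-string arguments field as in the OpenAI schema; the values are illustrative:

call = ToolCall(
    function=FunctionCall(name="get_weather",
                          arguments='{"city": "Paris"}'),
)
# id is auto-generated via make_tool_call_id; type is always "function"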

TranscriptionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranscription

    file: UploadFile
    """
    The audio file object (not file name) to transcribe, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: Optional[str] = None
    """ID of the model to use.
    """

    language: Optional[str] = None
    """The language of the input audio.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy and latency.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    ## TODO (varun) : Support if set to 0, certain thresholds are met !!

    timestamp_granularities: list[Literal["word", "segment"]] = Field(
        alias="timestamp_granularities[]", default=[])
    """The timestamp granularities to populate for this transcription.

    `response_format` must be set to `verbose_json` to use timestamp granularities.
    Either or both of these options are supported: `word`, or `segment`. Note:
    There is no additional latency for segment timestamps, but generating word
    timestamps incurs additional latency.
    """

    stream: Optional[bool] = False
    """When set, it will enable output to be streamed in a similar fashion
    as the Chat Completion endpoint.
    """
    # --8<-- [start:transcription-extra-params]
    # Flattened stream option to simplify form data.
    stream_include_usage: Optional[bool] = False
    stream_continuous_usage_stats: Optional[bool] = False

    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
        default=None,
        description=("Additional request parameters with string or "
                     "numeric values, used by custom extensions."),
    )
    # --8<-- [end:transcription-extra-params]

    to_language: Optional[str] = None
    """The language of the output audio we transcribe to.

    Note that this is not currently used by supported models; it is a
    placeholder for future use, matching the translation API.
    """

    # --8<-- [start:transcription-sampling-params]
    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """

    top_p: Optional[float] = None
    """Enables nucleus (top-p) sampling, where tokens are selected from the
    smallest possible set whose cumulative probability exceeds `p`.
    """

    top_k: Optional[int] = None
    """Limits sampling to the `k` most probable tokens at each step."""

    min_p: Optional[float] = None
    """Filters out tokens with a probability lower than `min_p`, ensuring a
    minimum likelihood threshold during sampling.
    """

    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    frequency_penalty: Optional[float] = 0.0
    """The frequency penalty to use for sampling."""

    repetition_penalty: Optional[float] = None
    """The repetition penalty to use for sampling."""

    presence_penalty: Optional[float] = 0.0
    """The presence penalty to use for sampling."""
    # --8<-- [end:transcription-sampling-params]

    # Default sampling parameters for transcription requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_sampling_params(
            self,
            default_max_tokens: int,
            default_sampling_params: Optional[dict] = None) -> SamplingParams:

        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}

        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])

        return SamplingParams.from_optional(temperature=temperature,
                                            max_tokens=max_tokens,
                                            seed=self.seed,
                                            top_p=top_p,
                                            top_k=top_k,
                                            min_p=min_p,
                                            frequency_penalty=self.frequency_penalty,
                                            repetition_penalty=repetition_penalty,
                                            presence_penalty=self.presence_penalty,
                                            output_kind=RequestOutputKind.DELTA
                                            if self.stream \
                                            else RequestOutputKind.FINAL_ONLY,
                                            extra_args=self.vllm_xargs)

    @model_validator(mode="before")
    @classmethod
    def validate_transcription_request(cls, data):
        if isinstance(data.get("file"), str):
            raise HTTPException(
                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
                detail="Expected 'file' to be a file-like object, not 'str'.",
            )

        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {
    "repetition_penalty": 1.0,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 0,
    "min_p": 0.0,
}

file instance-attribute

file: UploadFile

The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

frequency_penalty class-attribute instance-attribute

frequency_penalty: Optional[float] = 0.0

The frequency penalty to use for sampling.

language class-attribute instance-attribute

language: Optional[str] = None

The language of the input audio.

Supplying the input language in ISO-639-1 format will improve accuracy and latency.

min_p class-attribute instance-attribute

min_p: Optional[float] = None

Filters out tokens with a probability lower than min_p, ensuring a minimum likelihood threshold during sampling.

model class-attribute instance-attribute

model: Optional[str] = None

ID of the model to use.

presence_penalty class-attribute instance-attribute

presence_penalty: Optional[float] = 0.0

The presence penalty to use for sampling.

prompt class-attribute instance-attribute

prompt: str = Field(default='')

An optional text to guide the model's style or continue a previous audio segment.

The prompt should match the audio language.

repetition_penalty class-attribute instance-attribute

repetition_penalty: Optional[float] = None

The repetition penalty to use for sampling.

response_format class-attribute instance-attribute

response_format: AudioResponseFormat = Field(default="json")

The format of the output, in one of these options: json, text, srt, verbose_json, or vtt.

seed class-attribute instance-attribute

seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

The seed to use for sampling.

stream class-attribute instance-attribute

stream: Optional[bool] = False

When set, it will enable output to be streamed in a similar fashion as the Chat Completion endpoint.

stream_continuous_usage_stats class-attribute instance-attribute

stream_continuous_usage_stats: Optional[bool] = False

stream_include_usage class-attribute instance-attribute

stream_include_usage: Optional[bool] = False

temperature class-attribute instance-attribute

temperature: float = Field(default=0.0)

The sampling temperature, between 0 and 1.

Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused / deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

timestamp_granularities class-attribute instance-attribute

timestamp_granularities: list[
    Literal["word", "segment"]
] = Field(alias="timestamp_granularities[]", default=[])

The timestamp granularities to populate for this transcription.

response_format must be set to verbose_json to use timestamp granularities. Either or both of these options are supported: word, or segment. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.

to_language class-attribute instance-attribute

to_language: Optional[str] = None

The language of the output audio we transcribe to.

Note that this is not currently used by supported models; it is a placeholder for future use, matching the translation API.

top_k class-attribute instance-attribute

top_k: Optional[int] = None

Limits sampling to the k most probable tokens at each step.

top_p class-attribute instance-attribute

top_p: Optional[float] = None

Enables nucleus (top-p) sampling, where tokens are selected from the smallest possible set whose cumulative probability exceeds p.

vllm_xargs class-attribute instance-attribute

vllm_xargs: Optional[dict[str, Union[str, int, float]]] = (
    Field(
        default=None,
        description="Additional request parameters with string or numeric values, used by custom extensions.",
    )
)

to_sampling_params

to_sampling_params(
    default_max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/protocol.py
def to_sampling_params(
        self,
        default_max_tokens: int,
        default_sampling_params: Optional[dict] = None) -> SamplingParams:

    max_tokens = default_max_tokens

    if default_sampling_params is None:
        default_sampling_params = {}

    # Default parameters
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
    if (top_p := self.top_p) is None:
        top_p = default_sampling_params.get(
            "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
    if (top_k := self.top_k) is None:
        top_k = default_sampling_params.get(
            "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
    if (min_p := self.min_p) is None:
        min_p = default_sampling_params.get(
            "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

    if (repetition_penalty := self.repetition_penalty) is None:
        repetition_penalty = default_sampling_params.get(
            "repetition_penalty",
            self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])

    return SamplingParams.from_optional(temperature=temperature,
                                        max_tokens=max_tokens,
                                        seed=self.seed,
                                        top_p=top_p,
                                        top_k=top_k,
                                        min_p=min_p,
                                        frequency_penalty=self.frequency_penalty,
                                        repetition_penalty=repetition_penalty,
                                        presence_penalty=self.presence_penalty,
                                        output_kind=RequestOutputKind.DELTA
                                        if self.stream \
                                        else RequestOutputKind.FINAL_ONLY,
                                        extra_args=self.vllm_xargs)

validate_transcription_request classmethod

validate_transcription_request(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_transcription_request(cls, data):
    if isinstance(data.get("file"), str):
        raise HTTPException(
            status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
            detail="Expected 'file' to be a file-like object, not 'str'.",
        )

    stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
    stream = data.get("stream", False)
    if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
        raise ValueError(
            "Stream options can only be defined when `stream=True`.")

    return data
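
A sketch of how server-side code might resolve sampling defaults for an already-validated request req (the token budget and server defaults are illustrative): each field uses the request value when set, then the server's default_sampling_params, then the class-level _DEFAULT_SAMPLING_PARAMS fallback.

params = req.to_sampling_params(
    default_max_tokens=448,                   # e.g. remaining context budget
    default_sampling_params={"top_p": 0.95},  # server-configured defaults
)
# params.output_kind is DELTA when req.stream is set, FINAL_ONLY otherwise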

TranscriptionResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionResponse(OpenAIBaseModel):
    text: str
    """The transcribed text."""
    usage: TranscriptionUsageAudio

text instance-attribute

text: str

The transcribed text.

usage instance-attribute

usage: TranscriptionUsageAudio

TranscriptionResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = None

TranscriptionResponseVerbose

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The transcribed text."""

    segments: Optional[list[TranscriptionSegment]] = None
    """Segments of the transcribed text and their corresponding details."""

    words: Optional[list[TranscriptionWord]] = None
    """Extracted words and their corresponding timestamps."""

duration instance-attribute

duration: str

The duration of the input audio.

language instance-attribute

language: str

The language of the input audio.

segments class-attribute instance-attribute

segments: Optional[list[TranscriptionSegment]] = None

Segments of the transcribed text and their corresponding details.

text instance-attribute

text: str

The transcribed text.

words class-attribute instance-attribute

words: Optional[list[TranscriptionWord]] = None

Extracted words and their corresponding timestamps.

TranscriptionSegment

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionSegment(OpenAIBaseModel):
    id: int
    """Unique identifier of the segment."""

    avg_logprob: float
    """Average logprob of the segment.

    If the value is lower than -1, consider the logprobs failed.
    """

    compression_ratio: float
    """Compression ratio of the segment.

    If the value is greater than 2.4, consider the compression failed.
    """

    end: float
    """End time of the segment in seconds."""

    no_speech_prob: float
    """Probability of no speech in the segment.

    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
    this segment silent.
    """

    seek: int
    """Seek offset of the segment."""

    start: float
    """Start time of the segment in seconds."""

    temperature: float
    """Temperature parameter used for generating the segment."""

    text: str
    """Text content of the segment."""

    tokens: list[int]
    """Array of token IDs for the text content."""

avg_logprob instance-attribute

avg_logprob: float

Average logprob of the segment.

If the value is lower than -1, consider the logprobs failed.

compression_ratio instance-attribute

compression_ratio: float

Compression ratio of the segment.

If the value is greater than 2.4, consider the compression failed.

end instance-attribute

end: float

End time of the segment in seconds.

id instance-attribute

id: int

Unique identifier of the segment.

no_speech_prob instance-attribute

no_speech_prob: float

Probability of no speech in the segment.

If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.

seek instance-attribute

seek: int

Seek offset of the segment.

start instance-attribute

start: float

Start time of the segment in seconds.

temperature instance-attribute

temperature: float

Temperature parameter used for generating the segment.

text instance-attribute

text: str

Text content of the segment.

tokens instance-attribute

tokens: list[int]

Array of token IDs for the text content.

TranscriptionStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
    object: Literal["transcription.chunk"] = "transcription.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranscriptionResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)

choices instance-attribute

choices: list[TranscriptionResponseStreamChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"trsc-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal["transcription.chunk"] = (
    "transcription.chunk"
)

usage class-attribute instance-attribute

usage: Optional[UsageInfo] = Field(default=None)
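
An illustrative streaming chunk (values made up), assuming DeltaMessage accepts a content field as elsewhere in this module:

chunk = TranscriptionStreamResponse(
    model="my-whisper-model",
    choices=[
        TranscriptionResponseStreamChoice(delta=DeltaMessage(content="Hello"),
                                          finish_reason=None),
    ],
)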

TranscriptionUsageAudio

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionUsageAudio(OpenAIBaseModel):
    type: Literal["duration"] = "duration"
    seconds: int

seconds instance-attribute

seconds: int

type class-attribute instance-attribute

type: Literal['duration'] = 'duration'

TranscriptionWord

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranscriptionWord(OpenAIBaseModel):
    end: float
    """End time of the word in seconds."""

    start: float
    """Start time of the word in seconds."""

    word: str
    """The text content of the word."""

end instance-attribute

end: float

End time of the word in seconds.

start instance-attribute

start: float

Start time of the word in seconds.

word instance-attribute

word: str

The text content of the word.

TranslationRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranslation

    file: UploadFile
    """
    The audio file object (not file name) to translate, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: Optional[str] = None
    """ID of the model to use.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    # TODO support additional sampling parameters
    # --8<-- [start:translation-sampling-params]
    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """
    # --8<-- [end:translation-sampling-params]

    # --8<-- [start:translation-extra-params]
    language: Optional[str] = None
    """The language of the input audio we translate from.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy.
    """

    to_language: Optional[str] = None
    """The language of the input audio we translate to.

    Please note that this is not supported by all models; refer to the specific
    model documentation for more details.
    For instance, Whisper only supports `to_language=en`.
    """

    stream: Optional[bool] = False
    """Custom field not present in the original OpenAI definition. When set,
    it will enable output to be streamed in a similar fashion as the Chat
    Completion endpoint.
    """
    # Flattened stream option to simplify form data.
    stream_include_usage: Optional[bool] = False
    stream_continuous_usage_stats: Optional[bool] = False
    # --8<-- [end:translation-extra-params]

    # Default sampling parameters for translation requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "temperature": 0,
    }

    def to_sampling_params(
            self,
            default_max_tokens: int,
            default_sampling_params: Optional[dict] = None) -> SamplingParams:

        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}
        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

        return SamplingParams.from_optional(temperature=temperature,
                                            max_tokens=max_tokens,
                                            seed=self.seed,
                                            output_kind=RequestOutputKind.DELTA
                                            if self.stream \
                                            else RequestOutputKind.FINAL_ONLY)

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data

_DEFAULT_SAMPLING_PARAMS class-attribute instance-attribute

_DEFAULT_SAMPLING_PARAMS: dict = {'temperature': 0}

file instance-attribute

file: UploadFile

The audio file object (not file name) to translate, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

language class-attribute instance-attribute

language: Optional[str] = None

The language of the input audio we translate from.

Supplying the input language in ISO-639-1 format will improve accuracy.

model class-attribute instance-attribute

model: Optional[str] = None

ID of the model to use.

prompt class-attribute instance-attribute

prompt: str = Field(default='')

An optional text to guide the model's style or continue a previous audio segment.

The prompt should match the audio language.

response_format class-attribute instance-attribute

response_format: AudioResponseFormat = Field(default="json")

The format of the output, in one of these options: json, text, srt, verbose_json, or vtt.

seed class-attribute instance-attribute

seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)

The seed to use for sampling.

stream class-attribute instance-attribute

stream: Optional[bool] = False

Custom field not present in the original OpenAI definition. When set, it will enable output to be streamed in a similar fashion as the Chat Completion endpoint.

stream_continuous_usage_stats class-attribute instance-attribute

stream_continuous_usage_stats: Optional[bool] = False

stream_include_usage class-attribute instance-attribute

stream_include_usage: Optional[bool] = False

temperature class-attribute instance-attribute

temperature: float = Field(default=0.0)

The sampling temperature, between 0 and 1.

Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused / deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

to_language class-attribute instance-attribute

to_language: Optional[str] = None

The language of the input audio we translate to.

Please note that this is not supported by all models; refer to the specific model documentation for more details. For instance, Whisper only supports to_language=en.

to_sampling_params

to_sampling_params(
    default_max_tokens: int,
    default_sampling_params: Optional[dict] = None,
) -> SamplingParams
Source code in vllm/entrypoints/openai/protocol.py
def to_sampling_params(
        self,
        default_max_tokens: int,
        default_sampling_params: Optional[dict] = None) -> SamplingParams:

    max_tokens = default_max_tokens

    if default_sampling_params is None:
        default_sampling_params = {}
    # Default parameters
    if (temperature := self.temperature) is None:
        temperature = default_sampling_params.get(
            "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

    return SamplingParams.from_optional(temperature=temperature,
                                        max_tokens=max_tokens,
                                        seed=self.seed,
                                        output_kind=RequestOutputKind.DELTA
                                        if self.stream \
                                        else RequestOutputKind.FINAL_ONLY)

validate_stream_options classmethod

validate_stream_options(data)
Source code in vllm/entrypoints/openai/protocol.py
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
    stream = data.get("stream", False)
    if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
        raise ValueError(
            "Stream options can only be defined when `stream=True`.")

    return data
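
A sketch of a translation request expressed as multipart form fields (all values illustrative); per the validator above, the flattened stream_* options are only accepted when stream is also set:

form_fields = {
    "model": "my-whisper-model",  # hypothetical served model name
    "language": "de",             # translate from German
    "to_language": "en",          # e.g. Whisper only supports English output
    "response_format": "json",
    "stream": "true",
    "stream_include_usage": "true",
}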

TranslationResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationResponse(OpenAIBaseModel):
    text: str
    """The translated text."""

text instance-attribute

text: str

The translated text.

TranslationResponseStreamChoice

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = None

delta instance-attribute

delta: DeltaMessage

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

stop_reason class-attribute instance-attribute

stop_reason: Optional[Union[int, str]] = None

TranslationResponseVerbose

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The translated text."""

    segments: Optional[list[TranslationSegment]] = None
    """Segments of the translated text and their corresponding details."""

    words: Optional[list[TranslationWord]] = None
    """Extracted words and their corresponding timestamps."""

duration instance-attribute

duration: str

The duration of the input audio.

language instance-attribute

language: str

The language of the input audio.

segments class-attribute instance-attribute

segments: Optional[list[TranslationSegment]] = None

Segments of the translated text and their corresponding details.

text instance-attribute

text: str

The translated text.

words class-attribute instance-attribute

words: Optional[list[TranslationWord]] = None

Extracted words and their corresponding timestamps.

TranslationSegment

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationSegment(OpenAIBaseModel):
    id: int
    """Unique identifier of the segment."""

    avg_logprob: float
    """Average logprob of the segment.

    If the value is lower than -1, consider the logprobs failed.
    """

    compression_ratio: float
    """Compression ratio of the segment.

    If the value is greater than 2.4, consider the compression failed.
    """

    end: float
    """End time of the segment in seconds."""

    no_speech_prob: float
    """Probability of no speech in the segment.

    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
    this segment silent.
    """

    seek: int
    """Seek offset of the segment."""

    start: float
    """Start time of the segment in seconds."""

    temperature: float
    """Temperature parameter used for generating the segment."""

    text: str
    """Text content of the segment."""

    tokens: list[int]
    """Array of token IDs for the text content."""

avg_logprob instance-attribute

avg_logprob: float

Average logprob of the segment.

If the value is lower than -1, consider the logprobs failed.

compression_ratio instance-attribute

compression_ratio: float

Compression ratio of the segment.

If the value is greater than 2.4, consider the compression failed.

end instance-attribute

end: float

End time of the segment in seconds.

id instance-attribute

id: int

Unique identifier of the segment.

no_speech_prob instance-attribute

no_speech_prob: float

Probability of no speech in the segment.

If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.

seek instance-attribute

seek: int

Seek offset of the segment.

start instance-attribute

start: float

Start time of the segment in seconds.

temperature instance-attribute

temperature: float

Temperature parameter used for generating the segment.

text instance-attribute

text: str

Text content of the segment.

tokens instance-attribute

tokens: list[int]

Array of token IDs for the text content.

TranslationStreamResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
    object: Literal["translation.chunk"] = "translation.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranslationResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)

choices instance-attribute

choices: list[TranslationResponseStreamChoice]

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time()))

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"trsl-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: Literal['translation.chunk'] = 'translation.chunk'

usage class-attribute instance-attribute

usage: Optional[UsageInfo] = Field(default=None)

TranslationWord

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class TranslationWord(OpenAIBaseModel):
    end: float
    """End time of the word in seconds."""

    start: float
    """Start time of the word in seconds."""

    word: str
    """The text content of the word."""

end instance-attribute

end: float

End time of the word in seconds.

start instance-attribute

start: float

Start time of the word in seconds.

word instance-attribute

word: str

The text content of the word.

UnloadLoRAAdapterRequest

Bases: BaseModel

Source code in vllm/entrypoints/openai/protocol.py
class UnloadLoRAAdapterRequest(BaseModel):
    lora_name: str
    lora_int_id: Optional[int] = Field(default=None)

lora_int_id class-attribute instance-attribute

lora_int_id: Optional[int] = Field(default=None)

lora_name instance-attribute

lora_name: str

UsageInfo

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/openai/protocol.py
class UsageInfo(OpenAIBaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0
    prompt_tokens_details: Optional[PromptTokenUsageInfo] = None

completion_tokens class-attribute instance-attribute

completion_tokens: Optional[int] = 0

prompt_tokens class-attribute instance-attribute

prompt_tokens: int = 0

prompt_tokens_details class-attribute instance-attribute

prompt_tokens_details: Optional[PromptTokenUsageInfo] = None

total_tokens class-attribute instance-attribute

total_tokens: int = 0

get_logits_processors

get_logits_processors(
    processors: Optional[LogitsProcessors],
    pattern: Optional[str],
) -> Optional[list[Any]]
Source code in vllm/entrypoints/openai/protocol.py
def get_logits_processors(processors: Optional[LogitsProcessors],
                          pattern: Optional[str]) -> Optional[list[Any]]:
    if processors and pattern:
        logits_processors = []
        for processor in processors:
            qualname = processor if isinstance(processor,
                                               str) else processor.qualname
            if not re.match(pattern, qualname):
                raise ValueError(
                    f"Logits processor '{qualname}' is not allowed by this "
                    "server. See --logits-processor-pattern engine argument "
                    "for more information.")
            try:
                logits_processor = resolve_obj_by_qualname(qualname)
            except Exception as e:
                raise ValueError(
                    f"Logits processor '{qualname}' could not be resolved: {e}"
                ) from e
            if isinstance(processor, LogitsProcessorConstructor):
                logits_processor = logits_processor(*processor.args or [],
                                                    **processor.kwargs or {})
            logits_processors.append(logits_processor)
        return logits_processors
    elif processors:
        raise ValueError(
            "The `logits_processors` argument is not supported by this "
            "server. See --logits-processor-pattern engine argument "
            "for more information.")
    return None
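
A sketch of the pattern gate (the module path below is hypothetical): a request-supplied processor is only accepted if its qualname matches the server's --logits-processor-pattern, and it must also be importable, otherwise a ValueError is raised.

resolved = get_logits_processors(
    processors=["my_plugins.logits.TemperatureClamp"],
    pattern=r"my_plugins\.logits\..*",
)
# Supplying processors while the server has no pattern configured also raises ValueError.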