vllm.entrypoints.openai.tool_parsers ¶

Modules:

Name	Description
`abstract_tool_parser`
`deepseekv31_tool_parser`
`deepseekv3_tool_parser`
`glm4_moe_tool_parser`
`granite_20b_fc_tool_parser`
`granite_tool_parser`
`hermes_tool_parser`
`hunyuan_a13b_tool_parser`
`internlm2_tool_parser`
`jamba_tool_parser`
`kimi_k2_tool_parser`
`llama4_pythonic_tool_parser`
`llama_tool_parser`
`longcat_tool_parser`
`minimax_tool_parser`
`mistral_tool_parser`
`openai_tool_parser`
`phi4mini_tool_parser`
`pythonic_tool_parser`
`qwen3coder_tool_parser`
`qwen3xml_tool_parser`
`seed_oss_tool_parser`
`step3_tool_parser`
`utils`
`xlam_tool_parser`

__all__ `module-attribute` ¶

# Public names of this package: the parser base class, the parser manager
# and the concrete tool parser classes (this is what ``import *`` binds).
__all__ = [
    "ToolParser",
    "ToolParserManager",
    "Granite20bFCToolParser",
    "GraniteToolParser",
    "Hermes2ProToolParser",
    "MistralToolParser",
    "Internlm2ToolParser",
    "Llama3JsonToolParser",
    "JambaToolParser",
    "Llama4PythonicToolParser",
    "LongcatFlashToolParser",
    "PythonicToolParser",
    "Phi4MiniJsonToolParser",
    "DeepSeekV3ToolParser",
    "DeepSeekV31ToolParser",
    "xLAMToolParser",
    "MinimaxToolParser",
    "KimiK2ToolParser",
    "HunyuanA13BToolParser",
    "Glm4MoeModelToolParser",
    "Qwen3CoderToolParser",
    "Qwen3XMLToolParser",
    "SeedOssToolParser",
    "Step3ToolParser",
    "OpenAIToolParser",
]

DeepSeekV31ToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py

@ToolParserManager.register_module("deepseek_v31")
class DeepSeekV31ToolParser(ToolParser):
    """Tool-call parser registered under the name ``deepseek_v31``.

    A single call is expected in the form
    ``<｜tool▁call▁begin｜>NAME<｜tool▁sep｜>ARGUMENTS<｜tool▁call▁end｜>``
    and calls are only looked for once the ``<｜tool▁calls▁begin｜>`` marker
    has appeared. Complete outputs are handled by ``extract_tool_calls``
    and streamed chunks by ``extract_tool_calls_streaming``.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up streaming state, marker strings, regexes and token ids.

        Args:
            tokenizer: Tokenizer forwarded to the ``ToolParser`` constructor.

        Raises:
            ValueError: If ``self.model_tokenizer`` is falsy after the base
                constructor has run.
            RuntimeError: If the vocabulary has no id for the
                ``<｜tool▁calls▁begin｜>`` or ``<｜tool▁calls▁end｜>`` marker.
        """
        super().__init__(tokenizer)

        # Streaming state, updated by extract_tool_calls_streaming.
        self.current_tool_id: int = -1
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
        # What has been streamed so far, one entry per tool call.
        self.streamed_args_for_tool: list[str] = []

        # Markers around the whole tool-calls section ...
        self.tool_calls_start_token: str = "<｜tool▁calls▁begin｜>"
        self.tool_calls_end_token: str = "<｜tool▁calls▁end｜>"
        # ... and around each individual call.
        self.tool_call_start_token: str = "<｜tool▁call▁begin｜>"
        self.tool_call_end_token: str = "<｜tool▁call▁end｜>"

        # Complete call: begin marker, name, separator, arguments, end marker.
        self.tool_call_regex = re.compile(
            r"<｜tool▁call▁begin｜>(?P<function_name>.*?)<｜tool▁sep｜>(?P<function_arguments>.*?)<｜tool▁call▁end｜>"
        )
        # Partial call while streaming: name, separator, arguments so far.
        self.stream_tool_call_portion_regex = re.compile(
            r"(?P<function_name>.*)<｜tool▁sep｜>(?P<function_arguments>.*)")
        # Partial call while streaming: name and separator only.
        self.stream_tool_call_name_regex = re.compile(
            r"(?P<function_name>.*)<｜tool▁sep｜>")

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction.")

        self.tool_calls_start_token_id = self.vocab.get(
            self.tool_calls_start_token)
        self.tool_calls_end_token_id = self.vocab.get(
            self.tool_calls_end_token)
        self.tool_call_start_token_id = self.vocab.get(
            self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)

        # Only the section markers are mandatory here; the per-call marker
        # ids are looked up above but not validated.
        section_ids_found = (self.tool_calls_start_token_id is not None
                             and self.tool_calls_end_token_id is not None)
        if not section_ids_found:
            raise RuntimeError(
                "DeepSeek-V3.1 Tool parser could not locate tool call "
                "start/end tokens in the tokenizer!")

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract every complete tool call from a finished model output.

        Args:
            model_output: Full generated text.
            request: The originating request; not used by this method.

        Returns:
            ``tools_called=False`` with the untouched text as content when
            the ``<｜tool▁calls▁begin｜>`` marker is absent or an exception
            was logged. Otherwise ``tools_called=True`` with one ``ToolCall``
            per regex match and, as content, the text in front of the marker
            (``None`` when that text is empty).
        """
        section_marker = self.tool_calls_start_token

        # sanity check; avoid unnecessary processing
        if section_marker not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        try:
            # findall yields one (function_name, function_arguments) tuple
            # per complete call. NOTE: the pattern is compiled without
            # re.DOTALL, so a call containing a newline is not matched.
            tool_calls = [
                ToolCall(
                    type="function",
                    function=FunctionCall(name=function_name,
                                          arguments=function_args),
                ) for function_name, function_args in
                self.tool_call_regex.findall(model_output)
            ]

            # Everything in front of the section marker is plain content.
            content = model_output.partition(section_marker)[0]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content or None,
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Turn one streamed chunk into a content or tool-call delta.

        The position inside the tool-calls section is derived by counting
        the per-call begin/end marker ids in ``previous_token_ids`` and
        ``current_token_ids``. For each call the name is emitted once
        (together with an id from ``make_tool_call_id``); afterwards only
        the part of the arguments text that extends the previously seen
        arguments is emitted.

        Args:
            previous_text: Not used by this method.
            current_text: Text generated so far.
            delta_text: Text of the current chunk.
            previous_token_ids: Token ids before the current chunk.
            current_token_ids: Token ids generated so far.
            delta_token_ids: Token ids of the current chunk.
            request: The originating request; not used by this method.

        Returns:
            A ``DeltaMessage`` with plain content or with a single
            ``DeltaToolCall``, or ``None`` when nothing is emitted for this
            chunk (also after an exception has been logged).
        """
        logger.debug("delta_text: %s", delta_text)
        logger.debug("delta_token_ids: %s", delta_token_ids)
        # check to see if we should be streaming a tool call - has the
        # tool-calls section marker been generated yet?
        if self.tool_calls_start_token_id not in current_token_ids:
            logger.debug("No tool call tokens found!")
            return DeltaMessage(content=delta_text)
        # Drop the section markers from the text of this chunk.
        delta_text = delta_text.replace(self.tool_calls_start_token,
                                        "").replace(self.tool_calls_end_token,
                                                    "")
        try:

            # figure out where we are in the parsing by counting tool call
            # start & end tags
            prev_tool_start_count = previous_token_ids.count(
                self.tool_call_start_token_id)
            prev_tool_end_count = previous_token_ids.count(
                self.tool_call_end_token_id)
            cur_tool_start_count = current_token_ids.count(
                self.tool_call_start_token_id)
            cur_tool_end_count = current_token_ids.count(
                self.tool_call_end_token_id)
            tool_call_portion = None
            text_portion = None

            # case: if we're generating text, OR rounding out a tool call
            if (cur_tool_start_count == cur_tool_end_count
                    and prev_tool_end_count == cur_tool_end_count
                    and self.tool_call_end_token not in delta_text):
                logger.debug("Generating text content! skipping tool parsing.")
                return DeltaMessage(content=delta_text)

            if self.tool_call_end_token in delta_text:
                logger.debug("tool_call_end_token in delta_text")
                # Only the text between the last begin marker and the first
                # end marker after it is kept.
                # NOTE(review): current_text presumably already contains
                # delta_text; the extra concatenation does not change the
                # kept span in that case - confirm against the caller.
                full_text = current_text + delta_text
                tool_call_portion = full_text.split(
                    self.tool_call_start_token)[-1].split(
                        self.tool_call_end_token)[0].rstrip()
                delta_text = delta_text.split(
                    self.tool_call_end_token)[0].rstrip()
                # delta_text was truncated at the end marker just above, so
                # this yields that truncated text with leading whitespace
                # removed.
                text_portion = delta_text.split(
                    self.tool_call_end_token)[-1].lstrip()

            # case -- we're starting a new tool call
            if (cur_tool_start_count > cur_tool_end_count
                    and cur_tool_start_count > prev_tool_start_count):
                if len(delta_token_ids) > 1:
                    tool_call_portion = current_text.split(
                        self.tool_call_start_token)[-1]
                else:
                    tool_call_portion = None

                text_portion = None

                # set cursors and state appropriately
                self.current_tool_id += 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("Starting on a new tool %s", self.current_tool_id)

            # case -- we're updating an existing tool call
            elif (cur_tool_start_count > cur_tool_end_count
                  and cur_tool_start_count == prev_tool_start_count):

                # get the portion of the text that's the tool call
                tool_call_portion = current_text.split(
                    self.tool_call_start_token)[-1]
                text_portion = None

            # case -- the current tool call is being closed.
            elif (cur_tool_start_count == cur_tool_end_count
                  and cur_tool_end_count >= prev_tool_end_count):
                if not self.prev_tool_call_arr:
                    logger.debug(
                        "attempting to close tool call, but no tool call")
                    return None
                prev_args = self.prev_tool_call_arr[self.current_tool_id].get(
                    "arguments")
                if prev_args:
                    # The closing chunk is only emitted when it contains the
                    # '"}' tail; everything up to and including its last
                    # occurrence is sent as the final arguments delta.
                    # (The former `diff is str` unicode-escape step compared
                    # against the type object, never ran, and its result was
                    # overwritten here anyway, so it has been removed.)
                    if '"}' not in delta_text:
                        return None
                    end_loc = delta_text.rindex('"}')
                    diff = delta_text[:end_loc] + '"}'
                    logger.debug(
                        "Finishing tool and found diff that had not "
                        "been streamed yet: %s",
                        diff,
                    )
                    self.streamed_args_for_tool[self.current_tool_id] += diff
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            function=DeltaFunctionCall(
                                arguments=diff).model_dump(exclude_none=True),
                        )
                    ])

            # case -- otherwise we're just generating text
            else:
                text = delta_text.replace(self.tool_call_start_token, "")
                text = text.replace(self.tool_call_end_token, "")
                return DeltaMessage(tool_calls=[], content=text)

            # Split the partial call into name and arguments-so-far.
            current_tool_call = dict()
            if tool_call_portion:
                current_tool_call_matches = (
                    self.stream_tool_call_portion_regex.match(
                        tool_call_portion))
                if current_tool_call_matches:
                    tool_name, tool_args = current_tool_call_matches.groups()
                    current_tool_call["name"] = tool_name
                    current_tool_call["arguments"] = tool_args
                else:
                    current_tool_call_name_matches = (
                        self.stream_tool_call_name_regex.match(
                            tool_call_portion))
                    if current_tool_call_name_matches:
                        # Take the captured string; groups() would store a
                        # one-element tuple as the tool name.
                        current_tool_call["name"] = (
                            current_tool_call_name_matches.group(
                                "function_name"))
                        current_tool_call["arguments"] = ""
                    else:
                        logger.debug("Not enough token")
                        return None

            # case - we haven't sent the tool name yet. If it's available, send
            #   it. otherwise, wait until it's available.
            if not self.current_tool_name_sent:
                function_name: Union[str, None] = current_tool_call.get("name")
                if not function_name:
                    return None
                self.current_tool_name_sent = True
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        type="function",
                        id=make_tool_call_id(),
                        function=DeltaFunctionCall(
                            name=function_name).model_dump(
                                exclude_none=True),
                    )
                ])

            # case -- otherwise, send the tool call delta

            # if the tool call portion is None, send the delta as text
            if tool_call_portion is None:
                # if there's text but not tool calls, send that -
                # otherwise None to skip chunk
                if text_portion is not None:
                    return DeltaMessage(content=delta_text)
                return None

            # now, the nitty-gritty of tool calls
            # now we have the portion to parse as tool call.

            logger.debug("Trying to parse current tool call with ID %s",
                         self.current_tool_id)

            # if we're starting a new tool call, push an empty object in as
            #   a placeholder for the arguments
            if len(self.prev_tool_call_arr) <= self.current_tool_id:
                self.prev_tool_call_arr.append({})

            # main logic for tool parsing here - compare the arguments text
            #   seen on the previous chunk with the arguments text seen now
            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments")
            cur_arguments = current_tool_call.get("arguments")

            logger.debug("diffing old arguments: %s", prev_arguments)
            logger.debug("against new ones: %s", cur_arguments)

            # case -- no arguments have been created yet. skip sending a delta.
            if not cur_arguments and not prev_arguments:
                logger.debug("Skipping text %s - no arguments", delta_text)
                delta = None

            # case -- prev arguments are defined, but none are now.
            #   probably impossible, but not a fatal error - just keep going
            elif not cur_arguments and prev_arguments:
                logger.error("should be impossible to have arguments reset "
                             "mid-call. skipping streaming anything.")
                delta = None

            # case -- first arguments text for this call: send all of it
            elif cur_arguments and not prev_arguments:

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=cur_arguments).model_dump(
                                exclude_none=True),
                    )
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] = cur_arguments

            # last case -- we have an update to existing arguments.
            elif cur_arguments and prev_arguments:
                # Only a strict extension of the previous text is streamed;
                # the new suffix is the delta.
                if (isinstance(delta_text, str)
                        and cur_arguments != prev_arguments
                        and len(cur_arguments) > len(prev_arguments)
                        and cur_arguments.startswith(prev_arguments)):
                    delta_arguments = cur_arguments[len(prev_arguments):]
                    logger.debug("got diff %s", delta_text)

                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            function=DeltaFunctionCall(
                                arguments=delta_arguments).model_dump(
                                    exclude_none=True),
                        )
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] = cur_arguments
                else:
                    delta = None

            # handle saving the state for the current tool into
            # the "prev" list for use in diffing for the next iteration
            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
                self.prev_tool_call_arr[
                    self.current_tool_id] = current_tool_call
            else:
                self.prev_tool_call_arr.append(current_tool_call)

            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None  # do not stream a delta. skip this token ID.

current_tool_id `instance-attribute` ¶

current_tool_id: int = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

stream_tool_call_name_regex `instance-attribute` ¶

stream_tool_call_name_regex = compile(
    "(?P<function_name>.*)<｜tool▁sep｜>"
)

stream_tool_call_portion_regex `instance-attribute` ¶

stream_tool_call_portion_regex = compile(
    "(?P<function_name>.*)<｜tool▁sep｜>(?P<function_arguments>.*)"
)

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

tool_call_end_token `instance-attribute` ¶

tool_call_end_token: str = '<｜tool▁call▁end｜>'

tool_call_end_token_id `instance-attribute` ¶

tool_call_end_token_id = get(tool_call_end_token)

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile(
    "<｜tool▁call▁begin｜>(?P<function_name>.*?)<｜tool▁sep｜>(?P<function_arguments>.*?)<｜tool▁call▁end｜>"
)

tool_call_start_token `instance-attribute` ¶

tool_call_start_token: str = '<｜tool▁call▁begin｜>'

tool_call_start_token_id `instance-attribute` ¶

tool_call_start_token_id = get(tool_call_start_token)

tool_calls_end_token `instance-attribute` ¶

tool_calls_end_token: str = '<｜tool▁calls▁end｜>'

tool_calls_end_token_id `instance-attribute` ¶

tool_calls_end_token_id = get(tool_calls_end_token)

tool_calls_start_token `instance-attribute` ¶

tool_calls_start_token: str = '<｜tool▁calls▁begin｜>'

tool_calls_start_token_id `instance-attribute` ¶

tool_calls_start_token_id = get(tool_calls_start_token)

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Set up streaming state, marker strings, regexes and token ids.

    Args:
        tokenizer: Tokenizer forwarded to the ``ToolParser`` constructor.

    Raises:
        ValueError: If ``self.model_tokenizer`` is falsy after the base
            constructor has run.
        RuntimeError: If the vocabulary has no id for the
            ``<｜tool▁calls▁begin｜>`` or ``<｜tool▁calls▁end｜>`` marker.
    """
    super().__init__(tokenizer)

    # Streaming state, updated by extract_tool_calls_streaming.
    self.current_tool_id: int = -1
    self.current_tool_name_sent: bool = False
    self.prev_tool_call_arr: list[dict] = []
    # What has been streamed so far, one entry per tool call.
    self.streamed_args_for_tool: list[str] = []

    # Markers around the whole tool-calls section ...
    self.tool_calls_start_token: str = "<｜tool▁calls▁begin｜>"
    self.tool_calls_end_token: str = "<｜tool▁calls▁end｜>"
    # ... and around each individual call.
    self.tool_call_start_token: str = "<｜tool▁call▁begin｜>"
    self.tool_call_end_token: str = "<｜tool▁call▁end｜>"

    # Complete call: begin marker, name, separator, arguments, end marker.
    self.tool_call_regex = re.compile(
        r"<｜tool▁call▁begin｜>(?P<function_name>.*?)<｜tool▁sep｜>(?P<function_arguments>.*?)<｜tool▁call▁end｜>"
    )
    # Partial call while streaming: name, separator, arguments so far.
    self.stream_tool_call_portion_regex = re.compile(
        r"(?P<function_name>.*)<｜tool▁sep｜>(?P<function_arguments>.*)")
    # Partial call while streaming: name and separator only.
    self.stream_tool_call_name_regex = re.compile(
        r"(?P<function_name>.*)<｜tool▁sep｜>")

    if not self.model_tokenizer:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction.")

    self.tool_calls_start_token_id = self.vocab.get(
        self.tool_calls_start_token)
    self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token)
    self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
    self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)

    # Only the section markers are mandatory here; the per-call marker ids
    # are looked up above but not validated.
    section_ids_found = (self.tool_calls_start_token_id is not None
                         and self.tool_calls_end_token_id is not None)
    if not section_ids_found:
        raise RuntimeError(
            "DeepSeek-V3.1 Tool parser could not locate tool call "
            "start/end tokens in the tokenizer!")

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Extract every complete tool call from a finished model output.

    Args:
        model_output: Full generated text.
        request: The originating request; not used by this method.

    Returns:
        ``tools_called=False`` with the untouched text as content when the
        ``<｜tool▁calls▁begin｜>`` marker is absent or an exception was
        logged. Otherwise ``tools_called=True`` with one ``ToolCall`` per
        regex match and, as content, the text in front of the marker
        (``None`` when that text is empty).
    """
    section_marker = self.tool_calls_start_token

    # sanity check; avoid unnecessary processing
    if section_marker not in model_output:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        # findall yields one (function_name, function_arguments) tuple per
        # complete call. NOTE: the pattern is compiled without re.DOTALL,
        # so a call containing a newline is not matched.
        parsed_calls = [
            ToolCall(
                type="function",
                function=FunctionCall(name=name, arguments=arguments),
            ) for name, arguments in self.tool_call_regex.findall(
                model_output)
        ]

        # Everything in front of the section marker is plain content.
        leading_text = model_output.partition(section_marker)[0]
        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=parsed_calls,
            content=leading_text or None,
        )

    except Exception:
        logger.exception("Error in extracting tool call from response.")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Turn one streamed chunk into a content or tool-call delta.

    The position inside the tool-calls section is derived by counting the
    per-call begin/end marker ids in ``previous_token_ids`` and
    ``current_token_ids``. For each call the name is emitted once (together
    with an id from ``make_tool_call_id``); afterwards only the part of the
    arguments text that extends the previously seen arguments is emitted.

    Args:
        previous_text: Not used by this method.
        current_text: Text generated so far.
        delta_text: Text of the current chunk.
        previous_token_ids: Token ids before the current chunk.
        current_token_ids: Token ids generated so far.
        delta_token_ids: Token ids of the current chunk.
        request: The originating request; not used by this method.

    Returns:
        A ``DeltaMessage`` with plain content or with a single
        ``DeltaToolCall``, or ``None`` when nothing is emitted for this
        chunk (also after an exception has been logged).
    """
    logger.debug("delta_text: %s", delta_text)
    logger.debug("delta_token_ids: %s", delta_token_ids)
    # check to see if we should be streaming a tool call - has the
    # tool-calls section marker been generated yet?
    if self.tool_calls_start_token_id not in current_token_ids:
        logger.debug("No tool call tokens found!")
        return DeltaMessage(content=delta_text)
    # Drop the section markers from the text of this chunk.
    delta_text = delta_text.replace(self.tool_calls_start_token,
                                    "").replace(self.tool_calls_end_token,
                                                "")
    try:

        # figure out where we are in the parsing by counting tool call
        # start & end tags
        prev_tool_start_count = previous_token_ids.count(
            self.tool_call_start_token_id)
        prev_tool_end_count = previous_token_ids.count(
            self.tool_call_end_token_id)
        cur_tool_start_count = current_token_ids.count(
            self.tool_call_start_token_id)
        cur_tool_end_count = current_token_ids.count(
            self.tool_call_end_token_id)
        tool_call_portion = None
        text_portion = None

        # case: if we're generating text, OR rounding out a tool call
        if (cur_tool_start_count == cur_tool_end_count
                and prev_tool_end_count == cur_tool_end_count
                and self.tool_call_end_token not in delta_text):
            logger.debug("Generating text content! skipping tool parsing.")
            return DeltaMessage(content=delta_text)

        if self.tool_call_end_token in delta_text:
            logger.debug("tool_call_end_token in delta_text")
            # Only the text between the last begin marker and the first end
            # marker after it is kept.
            # NOTE(review): current_text presumably already contains
            # delta_text; the extra concatenation does not change the kept
            # span in that case - confirm against the caller.
            full_text = current_text + delta_text
            tool_call_portion = full_text.split(
                self.tool_call_start_token)[-1].split(
                    self.tool_call_end_token)[0].rstrip()
            delta_text = delta_text.split(
                self.tool_call_end_token)[0].rstrip()
            # delta_text was truncated at the end marker just above, so this
            # yields that truncated text with leading whitespace removed.
            text_portion = delta_text.split(
                self.tool_call_end_token)[-1].lstrip()

        # case -- we're starting a new tool call
        if (cur_tool_start_count > cur_tool_end_count
                and cur_tool_start_count > prev_tool_start_count):
            if len(delta_token_ids) > 1:
                tool_call_portion = current_text.split(
                    self.tool_call_start_token)[-1]
            else:
                tool_call_portion = None

            text_portion = None

            # set cursors and state appropriately
            self.current_tool_id += 1
            self.current_tool_name_sent = False
            self.streamed_args_for_tool.append("")
            logger.debug("Starting on a new tool %s", self.current_tool_id)

        # case -- we're updating an existing tool call
        elif (cur_tool_start_count > cur_tool_end_count
              and cur_tool_start_count == prev_tool_start_count):

            # get the portion of the text that's the tool call
            tool_call_portion = current_text.split(
                self.tool_call_start_token)[-1]
            text_portion = None

        # case -- the current tool call is being closed.
        elif (cur_tool_start_count == cur_tool_end_count
              and cur_tool_end_count >= prev_tool_end_count):
            if not self.prev_tool_call_arr:
                logger.debug(
                    "attempting to close tool call, but no tool call")
                return None
            prev_args = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments")
            if prev_args:
                # The closing chunk is only emitted when it contains the
                # '"}' tail; everything up to and including its last
                # occurrence is sent as the final arguments delta.
                # (The former `diff is str` unicode-escape step compared
                # against the type object, never ran, and its result was
                # overwritten here anyway, so it has been removed.)
                if '"}' not in delta_text:
                    return None
                end_loc = delta_text.rindex('"}')
                diff = delta_text[:end_loc] + '"}'
                logger.debug(
                    "Finishing tool and found diff that had not "
                    "been streamed yet: %s",
                    diff,
                )
                self.streamed_args_for_tool[self.current_tool_id] += diff
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=diff).model_dump(exclude_none=True),
                    )
                ])

        # case -- otherwise we're just generating text
        else:
            text = delta_text.replace(self.tool_call_start_token, "")
            text = text.replace(self.tool_call_end_token, "")
            return DeltaMessage(tool_calls=[], content=text)

        # Split the partial call into name and arguments-so-far.
        current_tool_call = dict()
        if tool_call_portion:
            current_tool_call_matches = (
                self.stream_tool_call_portion_regex.match(
                    tool_call_portion))
            if current_tool_call_matches:
                tool_name, tool_args = current_tool_call_matches.groups()
                current_tool_call["name"] = tool_name
                current_tool_call["arguments"] = tool_args
            else:
                current_tool_call_name_matches = (
                    self.stream_tool_call_name_regex.match(
                        tool_call_portion))
                if current_tool_call_name_matches:
                    # Take the captured string; groups() would store a
                    # one-element tuple as the tool name.
                    current_tool_call["name"] = (
                        current_tool_call_name_matches.group(
                            "function_name"))
                    current_tool_call["arguments"] = ""
                else:
                    logger.debug("Not enough token")
                    return None

        # case - we haven't sent the tool name yet. If it's available, send
        #   it. otherwise, wait until it's available.
        if not self.current_tool_name_sent:
            function_name: Union[str, None] = current_tool_call.get("name")
            if not function_name:
                return None
            self.current_tool_name_sent = True
            return DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    type="function",
                    id=make_tool_call_id(),
                    function=DeltaFunctionCall(
                        name=function_name).model_dump(
                            exclude_none=True),
                )
            ])

        # case -- otherwise, send the tool call delta

        # if the tool call portion is None, send the delta as text
        if tool_call_portion is None:
            # if there's text but not tool calls, send that -
            # otherwise None to skip chunk
            if text_portion is not None:
                return DeltaMessage(content=delta_text)
            return None

        # now, the nitty-gritty of tool calls
        # now we have the portion to parse as tool call.

        logger.debug("Trying to parse current tool call with ID %s",
                     self.current_tool_id)

        # if we're starting a new tool call, push an empty object in as
        #   a placeholder for the arguments
        if len(self.prev_tool_call_arr) <= self.current_tool_id:
            self.prev_tool_call_arr.append({})

        # main logic for tool parsing here - compare the arguments text
        #   seen on the previous chunk with the arguments text seen now
        prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
            "arguments")
        cur_arguments = current_tool_call.get("arguments")

        logger.debug("diffing old arguments: %s", prev_arguments)
        logger.debug("against new ones: %s", cur_arguments)

        # case -- no arguments have been created yet. skip sending a delta.
        if not cur_arguments and not prev_arguments:
            logger.debug("Skipping text %s - no arguments", delta_text)
            delta = None

        # case -- prev arguments are defined, but none are now.
        #   probably impossible, but not a fatal error - just keep going
        elif not cur_arguments and prev_arguments:
            logger.error("should be impossible to have arguments reset "
                         "mid-call. skipping streaming anything.")
            delta = None

        # case -- first arguments text for this call: send all of it
        elif cur_arguments and not prev_arguments:

            delta = DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    function=DeltaFunctionCall(
                        arguments=cur_arguments).model_dump(
                            exclude_none=True),
                )
            ])
            self.streamed_args_for_tool[
                self.current_tool_id] = cur_arguments

        # last case -- we have an update to existing arguments.
        elif cur_arguments and prev_arguments:
            # Only a strict extension of the previous text is streamed; the
            # new suffix is the delta.
            if (isinstance(delta_text, str)
                    and cur_arguments != prev_arguments
                    and len(cur_arguments) > len(prev_arguments)
                    and cur_arguments.startswith(prev_arguments)):
                delta_arguments = cur_arguments[len(prev_arguments):]
                logger.debug("got diff %s", delta_text)

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=delta_arguments).model_dump(
                                exclude_none=True),
                    )
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] = cur_arguments
            else:
                delta = None

        # handle saving the state for the current tool into
        # the "prev" list for use in diffing for the next iteration
        if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
            self.prev_tool_call_arr[
                self.current_tool_id] = current_tool_call
        else:
            self.prev_tool_call_arr.append(current_tool_call)

        return delta

    except Exception:
        logger.exception("Error trying to handle streaming tool call.")
        return None  # do not stream a delta. skip this token ID.

DeepSeekV3ToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py

@ToolParserManager.register_module("deepseek_v3")
class DeepSeekV3ToolParser(ToolParser):
    """Tool-call parser registered under the name ``deepseek_v3``.

    The parser looks for a section opened by ``tool_calls_start_token`` that
    contains one or more calls. Each call is delimited by
    ``tool_call_start_token`` / ``tool_call_end_token`` and carries a type,
    a function name and a fenced ``json`` block with the arguments (see
    ``tool_call_regex`` for the exact layout).

    Both a one-shot extraction (``extract_tool_calls``) and an incremental
    one (``extract_tool_calls_streaming``) are provided. The streaming
    variant keeps its cursor state on the instance.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up streaming state, marker tokens, regexes and token ids.

        Raises:
            ValueError: if no model tokenizer is available after the base
                class initialisation.
            RuntimeError: if the tool-calls start or end token is missing
                from the tokenizer vocabulary.
        """
        super().__init__(tokenizer)

        # --- streaming state ---
        # whether the name of the current tool has already been emitted
        self.current_tool_name_sent: bool = False
        # last parsed {"name", "arguments"} dict per tool, used for diffing
        self.prev_tool_call_arr: list[dict] = []
        # index of the tool call currently being streamed (-1: none yet)
        self.current_tool_id: int = -1
        self.streamed_args_for_tool: list[str] = (
            [])  # map what has been streamed for each tool so far to a list

        # markers around the whole tool-call section
        self.tool_calls_start_token: str = "<｜tool▁calls▁begin｜>"
        self.tool_calls_end_token: str = "<｜tool▁calls▁end｜>"

        # markers around a single tool call
        self.tool_call_start_token: str = "<｜tool▁call▁begin｜>"
        self.tool_call_end_token: str = "<｜tool▁call▁end｜>"

        # one complete call: type, function name and fenced JSON arguments
        self.tool_call_regex = re.compile(
            r"<｜tool▁call▁begin｜>(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*)\n```<｜tool▁call▁end｜>"
        )

        # a call still being generated: the arguments group must end on a
        # character that is neither a newline nor a backtick
        self.stream_tool_call_portion_regex = re.compile(
            r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*[^\n`])"
        )

        # a call for which only the "type<sep>name" line is complete
        self.stream_tool_call_name_regex = re.compile(
            r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n")

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction.")
        self.tool_calls_start_token_id = self.vocab.get(
            self.tool_calls_start_token)
        self.tool_calls_end_token_id = self.vocab.get(
            self.tool_calls_end_token)

        self.tool_call_start_token_id = self.vocab.get(
            self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)

        # only the outer section markers are required to exist in the vocab
        if (self.tool_calls_start_token_id is None
                or self.tool_calls_end_token_id is None):
            raise RuntimeError(
                "DeepSeek-V3 Tool parser could not locate tool call start/end "
                "tokens in the tokenizer!")

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract every complete tool call from a finished model output.

        Args:
            model_output: full generated text.
            request: the originating request (not consulted here).

        Returns:
            ``tools_called=False`` with the untouched output as content when
            the tool-calls start token is absent or parsing raises.
            Otherwise ``tools_called=True`` with one ``ToolCall`` per regex
            match and, as content, the text preceding the start token
            (``None`` when that text is empty).
        """
        # sanity check; avoid unnecessary processing
        if self.tool_calls_start_token not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        try:
            # the pattern has three groups, so findall yields one
            # (type, function_name, function_arguments) tuple per call
            function_call_tuples = self.tool_call_regex.findall(model_output)

            tool_calls = []
            for match in function_call_tuples:
                tool_type, function_name, function_args = match
                tool_calls.append(
                    ToolCall(
                        type=tool_type,
                        function=FunctionCall(name=function_name,
                                              arguments=function_args),
                    ))

            # everything before the tool-call section is regular content
            content = model_output[:model_output.
                                   find(self.tool_calls_start_token)]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if content else None,
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Turn one streamed chunk into a content or tool-call delta.

        The position inside the output is derived by counting the per-call
        start/end token ids in the previous and current token sequences.

        Args:
            previous_text: text before this chunk (not consulted here).
            current_text: text generated so far.
            delta_text: text of this chunk.
            previous_token_ids: token ids before this chunk.
            current_token_ids: token ids generated so far.
            delta_token_ids: token ids of this chunk.
            request: the originating request (not consulted here).

        Returns:
            A ``DeltaMessage`` carrying plain content, a tool name or an
            arguments fragment, or ``None`` when nothing should be emitted
            for this chunk (including when an exception was logged).
        """
        logger.debug("delta_text: %s", delta_text)
        logger.debug("delta_token_ids: %s", delta_token_ids)
        # check whether we should be streaming a tool call at all: until the
        # tool-calls start token has been generated, everything is content
        if self.tool_calls_start_token_id not in current_token_ids:
            logger.debug("No tool call tokens found!")
            return DeltaMessage(content=delta_text)
        # the outer section markers are never forwarded
        delta_text = delta_text.replace(self.tool_calls_start_token,
                                        "").replace(self.tool_calls_end_token,
                                                    "")
        try:

            # figure out where we are in the parsing by counting tool call
            # start & end tags
            prev_tool_start_count = previous_token_ids.count(
                self.tool_call_start_token_id)
            prev_tool_end_count = previous_token_ids.count(
                self.tool_call_end_token_id)
            cur_tool_start_count = current_token_ids.count(
                self.tool_call_start_token_id)
            cur_tool_end_count = current_token_ids.count(
                self.tool_call_end_token_id)
            tool_call_portion = None
            text_portion = None

            # case: if we're generating text, OR rounding out a tool call
            if (cur_tool_start_count == cur_tool_end_count
                    and prev_tool_end_count == cur_tool_end_count
                    and self.tool_call_end_token not in delta_text):
                logger.debug("Generating text content! skipping tool parsing.")
                return DeltaMessage(content=delta_text)

            if self.tool_call_end_token in delta_text:
                logger.debug("tool_call_end_token in delta_text")
                # NOTE(review): current_text appears to already include
                # delta_text; only the first segment after the last start
                # token is used below - confirm against the caller.
                full_text = current_text + delta_text
                tool_call_portion = full_text.split(
                    self.tool_call_start_token)[-1].split(
                        self.tool_call_end_token)[0].rstrip()
                delta_text = delta_text.split(
                    self.tool_call_end_token)[0].rstrip()
                # NOTE(review): delta_text was truncated just above, so this
                # is the text *before* the end token; further down it only
                # serves as a "not None" flag.
                text_portion = delta_text.split(
                    self.tool_call_end_token)[-1].lstrip()

            # case -- we're starting a new tool call
            if (cur_tool_start_count > cur_tool_end_count
                    and cur_tool_start_count > prev_tool_start_count):
                if len(delta_token_ids) > 1:
                    tool_call_portion = current_text.split(
                        self.tool_call_start_token)[-1]
                else:
                    tool_call_portion = None

                text_portion = None

                # set cursors and state appropriately
                self.current_tool_id += 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("Starting on a new tool %s", self.current_tool_id)

            # case -- we're updating an existing tool call
            elif (cur_tool_start_count > cur_tool_end_count
                  and cur_tool_start_count == prev_tool_start_count):

                # get the portion of the text that's the tool call
                tool_call_portion = current_text.split(
                    self.tool_call_start_token)[-1]
                text_portion = None

            # case -- the current tool call is being closed.
            elif (cur_tool_start_count == cur_tool_end_count
                  and cur_tool_end_count >= prev_tool_end_count):
                if not self.prev_tool_call_arr:
                    logger.debug(
                        "attempting to close tool call, but no tool call")
                    return None
                # previously parsed arguments only gate this branch; the
                # fragment that gets sent is rebuilt from delta_text
                prev_args = self.prev_tool_call_arr[
                    self.current_tool_id].get("arguments")
                if prev_args:
                    if '"}' not in delta_text:
                        return None
                    end_loc = delta_text.rindex('"}')
                    diff = delta_text[:end_loc] + '"}'
                    logger.debug(
                        "Finishing tool and found diff that had not "
                        "been streamed yet: %s",
                        diff,
                    )
                    self.streamed_args_for_tool[self.current_tool_id] += diff
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            function=DeltaFunctionCall(
                                arguments=diff).model_dump(exclude_none=True),
                        )
                    ])

            # case -- otherwise we're just generating text
            else:
                text = delta_text.replace(self.tool_call_start_token, "")
                text = text.replace(self.tool_call_end_token, "")
                delta = DeltaMessage(tool_calls=[], content=text)
                return delta

            # parse whatever is available of the current call: first try
            # name + arguments, then fall back to the name line alone
            current_tool_call = dict()
            if tool_call_portion:
                portion_match = self.stream_tool_call_portion_regex.match(
                    tool_call_portion)
                if portion_match:
                    current_tool_call["name"] = portion_match.group(
                        "function_name")
                    current_tool_call["arguments"] = portion_match.group(
                        "function_arguments")
                else:
                    name_match = self.stream_tool_call_name_regex.match(
                        tool_call_portion)
                    if name_match:
                        current_tool_call["name"] = name_match.group(
                            "function_name")
                        current_tool_call["arguments"] = ""
                    else:
                        logger.debug("Not enough token")
                        return None

            # case - we haven't sent the tool name yet. If it's available, send
            #   it. otherwise, wait until it's available.
            if not self.current_tool_name_sent:
                function_name: Union[str, None] = current_tool_call.get("name")
                if not function_name:
                    return None
                self.current_tool_name_sent = True
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        type="function",
                        id=make_tool_call_id(),
                        function=DeltaFunctionCall(
                            name=function_name).model_dump(exclude_none=True),
                    )
                ])

            # case -- otherwise, send the tool call delta

            # if the tool call portion is None, send the delta as text
            if tool_call_portion is None:
                # if there's text but not tool calls, send that -
                # otherwise None to skip chunk
                delta = (DeltaMessage(
                    content=delta_text) if text_portion is not None else None)
                return delta

            # now, the nitty-gritty of tool calls
            # now we have the portion to parse as tool call.

            logger.debug("Trying to parse current tool call with ID %s",
                         self.current_tool_id)

            # if we're starting a new tool call, push an empty object in as
            #   a placeholder for the arguments
            if len(self.prev_tool_call_arr) <= self.current_tool_id:
                self.prev_tool_call_arr.append({})

            # main logic for tool parsing here - compare the previously
            #   captured arguments text to the currently captured one
            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments")
            cur_arguments = current_tool_call.get("arguments")

            logger.debug("diffing old arguments: %s", prev_arguments)
            logger.debug("against new ones: %s", cur_arguments)

            # case -- no arguments have been created yet. skip sending a delta.
            if not cur_arguments and not prev_arguments:
                logger.debug("Skipping text %s - no arguments", delta_text)
                delta = None

            # case -- prev arguments are defined, but none are now.
            #   probably impossible, but not a fatal error - just keep going
            elif not cur_arguments and prev_arguments:
                logger.error("should be impossible to have arguments reset "
                             "mid-call. skipping streaming anything.")
                delta = None

            # case -- this is the first arguments text available for the call
            elif cur_arguments and not prev_arguments:

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=cur_arguments).model_dump(
                                exclude_none=True),
                    )
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] = cur_arguments

            # last case -- both are set: stream only the newly added suffix,
            #   and only when the new text strictly extends the old one
            elif (len(cur_arguments) > len(prev_arguments)
                  and cur_arguments.startswith(prev_arguments)):
                delta_arguments = cur_arguments[len(prev_arguments):]
                logger.debug("got diff %s", delta_text)

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=delta_arguments).model_dump(
                                exclude_none=True),
                    )
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] = cur_arguments
            else:
                delta = None

            # handle saving the state for the current tool into
            # the "prev" list for use in diffing for the next iteration
            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
                self.prev_tool_call_arr[
                    self.current_tool_id] = current_tool_call
            else:
                self.prev_tool_call_arr.append(current_tool_call)

            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None  # do not stream a delta. skip this token ID.

current_tool_id `instance-attribute` ¶

current_tool_id: int = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

stream_tool_call_name_regex `instance-attribute` ¶

stream_tool_call_name_regex = compile(
    "(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\\n"
)

stream_tool_call_portion_regex `instance-attribute` ¶

stream_tool_call_portion_regex = compile(
    "(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\\n```json\\n(?P<function_arguments>.*[^\\n`])"
)

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

tool_call_end_token `instance-attribute` ¶

tool_call_end_token: str = '<｜tool▁call▁end｜>'

tool_call_end_token_id `instance-attribute` ¶

tool_call_end_token_id = get(tool_call_end_token)

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile(
    "<｜tool▁call▁begin｜>(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\\n```json\\n(?P<function_arguments>.*)\\n```<｜tool▁call▁end｜>"
)

tool_call_start_token `instance-attribute` ¶

tool_call_start_token: str = '<｜tool▁call▁begin｜>'

tool_call_start_token_id `instance-attribute` ¶

tool_call_start_token_id = get(tool_call_start_token)

tool_calls_end_token `instance-attribute` ¶

tool_calls_end_token: str = '<｜tool▁calls▁end｜>'

tool_calls_end_token_id `instance-attribute` ¶

tool_calls_end_token_id = get(tool_calls_end_token)

tool_calls_start_token `instance-attribute` ¶

tool_calls_start_token: str = '<｜tool▁calls▁begin｜>'

tool_calls_start_token_id `instance-attribute` ¶

tool_calls_start_token_id = get(tool_calls_start_token)

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Prepare streaming state, marker tokens, regexes and token ids.

    Raises:
        ValueError: if no model tokenizer is available after the base
            class initialisation.
        RuntimeError: if the tool-calls start or end token cannot be found
            in the tokenizer vocabulary.
    """
    super().__init__(tokenizer)

    # Streaming cursor state.
    self.current_tool_name_sent: bool = False
    self.prev_tool_call_arr: list[dict] = []
    self.current_tool_id: int = -1
    # What has been streamed so far for each tool, indexed by tool id.
    self.streamed_args_for_tool: list[str] = []

    # Markers around the whole tool-call section.
    self.tool_calls_start_token: str = "<｜tool▁calls▁begin｜>"
    self.tool_calls_end_token: str = "<｜tool▁calls▁end｜>"

    # Markers around a single tool call.
    self.tool_call_start_token: str = "<｜tool▁call▁begin｜>"
    self.tool_call_end_token: str = "<｜tool▁call▁end｜>"

    # A complete call: type, function name and fenced JSON arguments.
    self.tool_call_regex = re.compile(
        r"<｜tool▁call▁begin｜>(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*)\n```<｜tool▁call▁end｜>"
    )

    # A call that is still being generated (arguments may be partial).
    self.stream_tool_call_portion_regex = re.compile(
        r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*[^\n`])"
    )

    # A call for which only the "type<sep>name" line is complete.
    self.stream_tool_call_name_regex = re.compile(
        r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n")

    if not self.model_tokenizer:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction.")

    # Resolve each marker to its vocabulary id (None when absent).
    token_id_of = self.vocab.get
    self.tool_calls_start_token_id = token_id_of(self.tool_calls_start_token)
    self.tool_calls_end_token_id = token_id_of(self.tool_calls_end_token)
    self.tool_call_start_token_id = token_id_of(self.tool_call_start_token)
    self.tool_call_end_token_id = token_id_of(self.tool_call_end_token)

    # Only the outer section markers are mandatory.
    section_ids = (self.tool_calls_start_token_id,
                   self.tool_calls_end_token_id)
    if any(token_id is None for token_id in section_ids):
        raise RuntimeError(
            "DeepSeek-V3 Tool parser could not locate tool call start/end "
            "tokens in the tokenizer!")

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Extract every complete tool call from a finished model output.

    Args:
        model_output: full generated text.
        request: the originating request (not consulted here).

    Returns:
        ``tools_called=False`` with the untouched output as content when the
        tool-calls start token is absent or parsing raises. Otherwise
        ``tools_called=True`` with one ``ToolCall`` per regex match and, as
        content, the text preceding the start token (``None`` if empty).
    """
    section_marker = self.tool_calls_start_token

    # Cheap pre-check: without the section marker there is nothing to parse.
    if section_marker not in model_output:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        # The pattern has three groups, so findall yields one
        # (type, function_name, function_arguments) tuple per call.
        parsed_calls = [
            ToolCall(
                type=call_type,
                function=FunctionCall(name=call_name, arguments=call_args),
            ) for call_type, call_name, call_args in
            self.tool_call_regex.findall(model_output)
        ]

        # Whatever precedes the tool-call section is regular content.
        leading_text = model_output[:model_output.find(section_marker)]
        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=parsed_calls,
            content=leading_text or None,
        )
    except Exception:
        logger.exception("Error in extracting tool call from response.")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Turn one streamed chunk into a content or tool-call delta.

    The position inside the output is derived by counting the per-call
    start/end token ids in the previous and current token sequences.

    Args:
        previous_text: text before this chunk (not consulted here).
        current_text: text generated so far.
        delta_text: text of this chunk.
        previous_token_ids: token ids before this chunk.
        current_token_ids: token ids generated so far.
        delta_token_ids: token ids of this chunk.
        request: the originating request (not consulted here).

    Returns:
        A ``DeltaMessage`` carrying plain content, a tool name or an
        arguments fragment, or ``None`` when nothing should be emitted for
        this chunk (including when an exception was logged).
    """
    logger.debug("delta_text: %s", delta_text)
    logger.debug("delta_token_ids: %s", delta_token_ids)
    # check whether we should be streaming a tool call at all: until the
    # tool-calls start token has been generated, everything is content
    if self.tool_calls_start_token_id not in current_token_ids:
        logger.debug("No tool call tokens found!")
        return DeltaMessage(content=delta_text)
    # the outer section markers are never forwarded
    delta_text = delta_text.replace(self.tool_calls_start_token,
                                    "").replace(self.tool_calls_end_token,
                                                "")
    try:

        # figure out where we are in the parsing by counting tool call
        # start & end tags
        prev_tool_start_count = previous_token_ids.count(
            self.tool_call_start_token_id)
        prev_tool_end_count = previous_token_ids.count(
            self.tool_call_end_token_id)
        cur_tool_start_count = current_token_ids.count(
            self.tool_call_start_token_id)
        cur_tool_end_count = current_token_ids.count(
            self.tool_call_end_token_id)
        tool_call_portion = None
        text_portion = None

        # case: if we're generating text, OR rounding out a tool call
        if (cur_tool_start_count == cur_tool_end_count
                and prev_tool_end_count == cur_tool_end_count
                and self.tool_call_end_token not in delta_text):
            logger.debug("Generating text content! skipping tool parsing.")
            return DeltaMessage(content=delta_text)

        if self.tool_call_end_token in delta_text:
            logger.debug("tool_call_end_token in delta_text")
            # NOTE(review): current_text appears to already include
            # delta_text; only the first segment after the last start token
            # is used below - confirm against the caller.
            full_text = current_text + delta_text
            tool_call_portion = full_text.split(
                self.tool_call_start_token)[-1].split(
                    self.tool_call_end_token)[0].rstrip()
            delta_text = delta_text.split(
                self.tool_call_end_token)[0].rstrip()
            # NOTE(review): delta_text was truncated just above, so this is
            # the text *before* the end token; further down it only serves
            # as a "not None" flag.
            text_portion = delta_text.split(
                self.tool_call_end_token)[-1].lstrip()

        # case -- we're starting a new tool call
        if (cur_tool_start_count > cur_tool_end_count
                and cur_tool_start_count > prev_tool_start_count):
            if len(delta_token_ids) > 1:
                tool_call_portion = current_text.split(
                    self.tool_call_start_token)[-1]
            else:
                tool_call_portion = None

            text_portion = None

            # set cursors and state appropriately
            self.current_tool_id += 1
            self.current_tool_name_sent = False
            self.streamed_args_for_tool.append("")
            logger.debug("Starting on a new tool %s", self.current_tool_id)

        # case -- we're updating an existing tool call
        elif (cur_tool_start_count > cur_tool_end_count
              and cur_tool_start_count == prev_tool_start_count):

            # get the portion of the text that's the tool call
            tool_call_portion = current_text.split(
                self.tool_call_start_token)[-1]
            text_portion = None

        # case -- the current tool call is being closed.
        elif (cur_tool_start_count == cur_tool_end_count
              and cur_tool_end_count >= prev_tool_end_count):
            if not self.prev_tool_call_arr:
                logger.debug(
                    "attempting to close tool call, but no tool call")
                return None
            # previously parsed arguments only gate this branch; the
            # fragment that gets sent is rebuilt from delta_text
            prev_args = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments")
            if prev_args:
                if '"}' not in delta_text:
                    return None
                end_loc = delta_text.rindex('"}')
                diff = delta_text[:end_loc] + '"}'
                logger.debug(
                    "Finishing tool and found diff that had not "
                    "been streamed yet: %s",
                    diff,
                )
                self.streamed_args_for_tool[self.current_tool_id] += diff
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=diff).model_dump(exclude_none=True),
                    )
                ])

        # case -- otherwise we're just generating text
        else:
            text = delta_text.replace(self.tool_call_start_token, "")
            text = text.replace(self.tool_call_end_token, "")
            delta = DeltaMessage(tool_calls=[], content=text)
            return delta

        # parse whatever is available of the current call: first try
        # name + arguments, then fall back to the name line alone
        current_tool_call = dict()
        if tool_call_portion:
            portion_match = self.stream_tool_call_portion_regex.match(
                tool_call_portion)
            if portion_match:
                current_tool_call["name"] = portion_match.group(
                    "function_name")
                current_tool_call["arguments"] = portion_match.group(
                    "function_arguments")
            else:
                name_match = self.stream_tool_call_name_regex.match(
                    tool_call_portion)
                if name_match:
                    current_tool_call["name"] = name_match.group(
                        "function_name")
                    current_tool_call["arguments"] = ""
                else:
                    logger.debug("Not enough token")
                    return None

        # case - we haven't sent the tool name yet. If it's available, send
        #   it. otherwise, wait until it's available.
        if not self.current_tool_name_sent:
            function_name: Union[str, None] = current_tool_call.get("name")
            if not function_name:
                return None
            self.current_tool_name_sent = True
            return DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    type="function",
                    id=make_tool_call_id(),
                    function=DeltaFunctionCall(
                        name=function_name).model_dump(exclude_none=True),
                )
            ])

        # case -- otherwise, send the tool call delta

        # if the tool call portion is None, send the delta as text
        if tool_call_portion is None:
            # if there's text but not tool calls, send that -
            # otherwise None to skip chunk
            delta = (DeltaMessage(
                content=delta_text) if text_portion is not None else None)
            return delta

        # now, the nitty-gritty of tool calls
        # now we have the portion to parse as tool call.

        logger.debug("Trying to parse current tool call with ID %s",
                     self.current_tool_id)

        # if we're starting a new tool call, push an empty object in as
        #   a placeholder for the arguments
        if len(self.prev_tool_call_arr) <= self.current_tool_id:
            self.prev_tool_call_arr.append({})

        # main logic for tool parsing here - compare the previously captured
        #   arguments text to the currently captured one
        prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
            "arguments")
        cur_arguments = current_tool_call.get("arguments")

        logger.debug("diffing old arguments: %s", prev_arguments)
        logger.debug("against new ones: %s", cur_arguments)

        # case -- no arguments have been created yet. skip sending a delta.
        if not cur_arguments and not prev_arguments:
            logger.debug("Skipping text %s - no arguments", delta_text)
            delta = None

        # case -- prev arguments are defined, but none are now.
        #   probably impossible, but not a fatal error - just keep going
        elif not cur_arguments and prev_arguments:
            logger.error("should be impossible to have arguments reset "
                         "mid-call. skipping streaming anything.")
            delta = None

        # case -- this is the first arguments text available for the call
        elif cur_arguments and not prev_arguments:

            delta = DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    function=DeltaFunctionCall(
                        arguments=cur_arguments).model_dump(
                            exclude_none=True),
                )
            ])
            self.streamed_args_for_tool[
                self.current_tool_id] = cur_arguments

        # last case -- both are set: stream only the newly added suffix,
        #   and only when the new text strictly extends the old one
        elif (len(cur_arguments) > len(prev_arguments)
              and cur_arguments.startswith(prev_arguments)):
            delta_arguments = cur_arguments[len(prev_arguments):]
            logger.debug("got diff %s", delta_text)

            delta = DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    function=DeltaFunctionCall(
                        arguments=delta_arguments).model_dump(
                            exclude_none=True),
                )
            ])
            self.streamed_args_for_tool[
                self.current_tool_id] = cur_arguments
        else:
            delta = None

        # handle saving the state for the current tool into
        # the "prev" list for use in diffing for the next iteration
        if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
            self.prev_tool_call_arr[
                self.current_tool_id] = current_tool_call
        else:
            self.prev_tool_call_arr.append(current_tool_call)

        return delta

    except Exception:
        logger.exception("Error trying to handle streaming tool call.")
        return None  # do not stream a delta. skip this token ID.

Glm4MoeModelToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py

@ToolParserManager.register_module("glm45")
class Glm4MoeModelToolParser(ToolParser):
    """Tool-call parser registered under the name ``glm45``.

    A tool call is recognised as ``<tool_call>NAME`` followed by a newline,
    any number of ``<arg_key>K</arg_key><arg_value>V</arg_value>`` pairs and
    a closing ``</tool_call>``.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up streaming state, the tag strings and the parsing regexes.

        Raises:
            ValueError: if ``self.model_tokenizer`` is falsy after the base
                class initialiser has run.
        """
        super().__init__(tokenizer)
        self.current_tool_name_sent = False
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id = -1
        self.streamed_args_for_tool: list[str] = []
        self.tool_call_start_token = "<tool_call>"
        self.tool_call_end_token = "</tool_call>"

        self.tool_calls_start_token = self.tool_call_start_token

        # one complete <tool_call>...</tool_call> span (non-greedy)
        self.func_call_regex = re.compile(r"<tool_call>.*?</tool_call>",
                                          re.DOTALL)
        # group 1: everything up to the first newline (the tool name),
        # group 2: the remainder up to the closing tag (the argument pairs)
        self.func_detail_regex = re.compile(
            r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
        # one <arg_key>/<arg_value> pair
        self.func_arg_regex = re.compile(
            r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>",
            re.DOTALL)
        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction.")

        # ``dict.get`` semantics: these are None when the tag is not a
        # vocabulary entry
        self.tool_call_start_token_id = self.vocab.get(
            self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
        # text received by the streaming path that has not been emitted yet
        self._buffer = ""

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract every complete tool call from ``model_output``.

        Args:
            model_output: the full generated text.
            request: the originating request; ``request.tools`` is consulted
                to decide whether an argument is declared as a string.

        Returns:
            ``tools_called=True`` with the parsed calls and the text that
            precedes the first ``<tool_call>`` as content when at least one
            call was parsed.  Otherwise ``tools_called=False`` with the
            untouched ``model_output`` as content; this is also the result
            when any exception is raised while parsing.
        """

        def _is_string_type(
                tool_name: str, arg_name: str,
                tools: Optional[list[ChatCompletionToolsParam]]) -> bool:
            # True only when the named tool declares ``arg_name`` with
            # ``"type": "string"`` in its parameter properties.
            if tools is None:
                return False
            for tool in tools:
                if tool.function.name == tool_name:
                    if tool.function.parameters is None:
                        return False
                    arg_type = tool.function.parameters.get(
                        "properties", {}).get(arg_name, {}).get("type", None)
                    return arg_type == "string"
            logger.warning("No tool named '%s'.", tool_name)
            return False

        def _deserialize(value: str) -> Any:
            # Best effort: JSON first, then a Python literal, and finally
            # the raw string when neither parser accepts the value.
            try:
                return json.loads(value)
            except Exception:
                pass

            try:
                return ast.literal_eval(value)
            except Exception:
                pass
            return value

        matched_tool_calls = self.func_call_regex.findall(model_output)
        logger.debug("model_output: %s", model_output)
        try:
            tool_calls = []
            for match in matched_tool_calls:
                # search() returns None when no newline follows the name;
                # the resulting AttributeError is handled by the except
                # clause below.
                tc_detail = self.func_detail_regex.search(match)
                tc_name = tc_detail.group(1)
                tc_args = tc_detail.group(2)
                pairs = self.func_arg_regex.findall(tc_args)
                arg_dct = {}
                for key, value in pairs:
                    arg_key = key.strip()
                    arg_val = value.strip()
                    # values declared as strings are kept verbatim
                    if not _is_string_type(tc_name, arg_key, request.tools):
                        arg_val = _deserialize(arg_val)
                    logger.debug("arg_key = %s, arg_val = %s", arg_key,
                                 arg_val)
                    arg_dct[arg_key] = arg_val
                # ensure_ascii=False keeps non-ASCII argument text unescaped,
                # the same serialisation Granite20bFCToolParser uses.
                tool_calls.append(
                    ToolCall(type="function",
                             function=FunctionCall(
                                 name=tc_name,
                                 arguments=json.dumps(arg_dct,
                                                      ensure_ascii=False))))
        except Exception:
            logger.exception("Failed to extract tool call spec")
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)
        else:
            if len(tool_calls) > 0:
                content = model_output[:model_output.
                                       find(self.tool_calls_start_token)]
                return ExtractedToolCallInformation(tools_called=True,
                                                    tool_calls=tool_calls,
                                                    content=content)
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Consume one streamed chunk and emit content or a whole tool call.

        Only ``delta_text`` and ``request`` are read; the remaining
        parameters are unused by this implementation.  Text is accumulated
        in ``self._buffer`` and a tool call is emitted in a single delta
        (name and complete arguments) once its closing tag has arrived.

        Returns:
            A ``DeltaMessage`` carrying plain content, or one carrying a
            complete tool call.  A closed ``<tool_call>`` span that cannot be
            parsed is handed back as plain content.
        """
        self._buffer += delta_text
        cur_text = self._buffer
        start_idx = cur_text.find(self.tool_call_start_token)
        if start_idx == -1:
            # no tool call pending: flush the buffer as content, except that
            # text is dropped once current_tool_id is positive, i.e. after a
            # tool call has been emitted
            self._buffer = ""
            if self.current_tool_id > 0:
                cur_text = ""
            return DeltaMessage(content=cur_text)
        logger.debug("cur_text = %s", cur_text)
        end_idx = cur_text.find(self.tool_call_end_token)
        if end_idx != -1:
            consumed = end_idx + len(self.tool_call_end_token)
            extracted_tool_calls = self.extract_tool_calls(
                cur_text[:consumed], request)

            if len(extracted_tool_calls.tool_calls) == 0:
                logger.warning("Failed to extract any tool calls.")
                # Consume the unparseable segment.  Leaving it buffered would
                # make every later chunk hit the same closing tag and fail
                # again, stalling the stream.  It is emitted as content,
                # which is what extract_tool_calls reports for it as well.
                self._buffer = cur_text[consumed:]
                return DeltaMessage(content=cur_text[:consumed])

            # per-call state is created only after the call has parsed, so a
            # failed parse leaves no placeholder entries behind
            if self.current_tool_id == -1:
                self.current_tool_id = 0
                self.prev_tool_call_arr = []
                self.streamed_args_for_tool = []
            while len(self.prev_tool_call_arr) <= self.current_tool_id:
                self.prev_tool_call_arr.append({})
            while len(self.streamed_args_for_tool) <= self.current_tool_id:
                self.streamed_args_for_tool.append("")

            tool_call = extracted_tool_calls.tool_calls[0]
            self.prev_tool_call_arr[self.current_tool_id] = {
                "name": tool_call.function.name,
                "arguments": json.loads(tool_call.function.arguments)
            }
            self.streamed_args_for_tool[
                self.current_tool_id] = tool_call.function.arguments
            delta = DeltaMessage(
                content=extracted_tool_calls.content,
                tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  id=tool_call.id,
                                  type=tool_call.type,
                                  function=DeltaFunctionCall(
                                      name=tool_call.function.name,
                                      arguments=tool_call.function.arguments))
                ])
            self.current_tool_id += 1
            # keep whatever follows the closing tag for the next chunk
            self._buffer = cur_text[consumed:]
            return delta

        # an opening tag without its closing tag: emit the text in front of
        # it and keep buffering from the tag onwards
        self._buffer = cur_text[start_idx:]
        return DeltaMessage(content=cur_text[:start_idx])

_buffer `instance-attribute` ¶

_buffer = ''

current_tool_id `instance-attribute` ¶

current_tool_id = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent = False

func_arg_regex `instance-attribute` ¶

func_arg_regex = compile(
    "<arg_key>(.*?)</arg_key>\\s*<arg_value>(.*?)</arg_value>",
    DOTALL,
)

func_call_regex `instance-attribute` ¶

func_call_regex = compile(
    "<tool_call>.*?</tool_call>", DOTALL
)

func_detail_regex `instance-attribute` ¶

func_detail_regex = compile(
    "<tool_call>([^\\n]*)\\n(.*)</tool_call>", DOTALL
)

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

tool_call_end_token `instance-attribute` ¶

tool_call_end_token = '</tool_call>'

tool_call_end_token_id `instance-attribute` ¶

tool_call_end_token_id = get(tool_call_end_token)

tool_call_start_token `instance-attribute` ¶

tool_call_start_token = '<tool_call>'

tool_call_start_token_id `instance-attribute` ¶

tool_call_start_token_id = get(tool_call_start_token)

tool_calls_start_token `instance-attribute` ¶

tool_calls_start_token = tool_call_start_token

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Prepare streaming bookkeeping, tag strings and parsing regexes.

    Raises:
        ValueError: if ``self.model_tokenizer`` is falsy once the base
            class initialiser has run.
    """
    super().__init__(tokenizer)

    # --- streaming bookkeeping ---
    self.current_tool_name_sent = False
    self.prev_tool_call_arr: list[dict] = []
    self.current_tool_id = -1
    self.streamed_args_for_tool: list[str] = []

    # --- tag strings ---
    opening_tag = "<tool_call>"
    closing_tag = "</tool_call>"
    self.tool_call_start_token = opening_tag
    self.tool_call_end_token = closing_tag
    self.tool_calls_start_token = opening_tag

    # --- regexes; "." must also match newlines in all of them ---
    dot_matches_newline = re.DOTALL
    # a single complete call, matched non-greedily
    self.func_call_regex = re.compile(r"<tool_call>.*?</tool_call>",
                                      dot_matches_newline)
    # group 1 is the first line (tool name), group 2 the rest of the call
    self.func_detail_regex = re.compile(
        r"<tool_call>([^\n]*)\n(.*)</tool_call>", dot_matches_newline)
    # a single key/value argument pair
    self.func_arg_regex = re.compile(
        r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>",
        dot_matches_newline)

    if not self.model_tokenizer:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction.")

    # ids are None when a tag is not present in the vocabulary mapping
    self.tool_call_start_token_id = self.vocab.get(opening_tag)
    self.tool_call_end_token_id = self.vocab.get(closing_tag)

    # text received while streaming that has not been emitted yet
    self._buffer = ""

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Extract every complete tool call from ``model_output``.

    Args:
        model_output: the full generated text.
        request: the originating request; ``request.tools`` is consulted to
            decide whether an argument is declared as a string.

    Returns:
        ``tools_called=True`` with the parsed calls and the text that
        precedes the first ``<tool_call>`` as content when at least one call
        was parsed.  Otherwise ``tools_called=False`` with the untouched
        ``model_output`` as content; this is also the result when any
        exception is raised while parsing.
    """

    def _is_string_type(
            tool_name: str, arg_name: str,
            tools: Optional[list[ChatCompletionToolsParam]]) -> bool:
        # True only when the named tool declares ``arg_name`` with
        # ``"type": "string"`` in its parameter properties.
        if tools is None:
            return False
        for tool in tools:
            if tool.function.name == tool_name:
                if tool.function.parameters is None:
                    return False
                arg_type = tool.function.parameters.get(
                    "properties", {}).get(arg_name, {}).get("type", None)
                return arg_type == "string"
        logger.warning("No tool named '%s'.", tool_name)
        return False

    def _deserialize(value: str) -> Any:
        # Best effort: JSON first, then a Python literal, and finally the
        # raw string when neither parser accepts the value.
        try:
            return json.loads(value)
        except Exception:
            pass

        try:
            return ast.literal_eval(value)
        except Exception:
            pass
        return value

    matched_tool_calls = self.func_call_regex.findall(model_output)
    logger.debug("model_output: %s", model_output)
    try:
        tool_calls = []
        for match in matched_tool_calls:
            # search() returns None when no newline follows the name; the
            # resulting AttributeError is handled by the except clause below.
            tc_detail = self.func_detail_regex.search(match)
            tc_name = tc_detail.group(1)
            tc_args = tc_detail.group(2)
            pairs = self.func_arg_regex.findall(tc_args)
            arg_dct = {}
            for key, value in pairs:
                arg_key = key.strip()
                arg_val = value.strip()
                # values declared as strings are kept verbatim
                if not _is_string_type(tc_name, arg_key, request.tools):
                    arg_val = _deserialize(arg_val)
                logger.debug("arg_key = %s, arg_val = %s", arg_key,
                             arg_val)
                arg_dct[arg_key] = arg_val
            # ensure_ascii=False keeps non-ASCII argument text unescaped
            # instead of turning it into \uXXXX sequences.
            tool_calls.append(
                ToolCall(type="function",
                         function=FunctionCall(
                             name=tc_name,
                             arguments=json.dumps(arg_dct,
                                                  ensure_ascii=False))))
    except Exception:
        logger.exception("Failed to extract tool call spec")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)
    else:
        if len(tool_calls) > 0:
            content = model_output[:model_output.
                                   find(self.tool_calls_start_token)]
            return ExtractedToolCallInformation(tools_called=True,
                                                tool_calls=tool_calls,
                                                content=content)
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Consume one streamed chunk and emit content or a whole tool call.

    Only ``delta_text`` and ``request`` are read; the remaining parameters
    are unused by this implementation.  Text is accumulated in
    ``self._buffer`` and a tool call is emitted in a single delta (name and
    complete arguments) once its closing tag has arrived.

    Returns:
        A ``DeltaMessage`` carrying plain content, or one carrying a complete
        tool call.  A closed ``<tool_call>`` span that cannot be parsed is
        handed back as plain content.
    """
    self._buffer += delta_text
    cur_text = self._buffer
    start_idx = cur_text.find(self.tool_call_start_token)
    if start_idx == -1:
        # no tool call pending: flush the buffer as content, except that
        # text is dropped once current_tool_id is positive, i.e. after a
        # tool call has been emitted
        self._buffer = ""
        if self.current_tool_id > 0:
            cur_text = ""
        return DeltaMessage(content=cur_text)
    logger.debug("cur_text = %s", cur_text)
    end_idx = cur_text.find(self.tool_call_end_token)
    if end_idx != -1:
        consumed = end_idx + len(self.tool_call_end_token)
        extracted_tool_calls = self.extract_tool_calls(
            cur_text[:consumed], request)

        if len(extracted_tool_calls.tool_calls) == 0:
            logger.warning("Failed to extract any tool calls.")
            # Consume the unparseable segment.  Leaving it buffered would
            # make every later chunk hit the same closing tag and fail
            # again, stalling the stream.  It is emitted as plain content.
            self._buffer = cur_text[consumed:]
            return DeltaMessage(content=cur_text[:consumed])

        # per-call state is created only after the call has parsed, so a
        # failed parse leaves no placeholder entries behind
        if self.current_tool_id == -1:
            self.current_tool_id = 0
            self.prev_tool_call_arr = []
            self.streamed_args_for_tool = []
        while len(self.prev_tool_call_arr) <= self.current_tool_id:
            self.prev_tool_call_arr.append({})
        while len(self.streamed_args_for_tool) <= self.current_tool_id:
            self.streamed_args_for_tool.append("")

        tool_call = extracted_tool_calls.tool_calls[0]
        self.prev_tool_call_arr[self.current_tool_id] = {
            "name": tool_call.function.name,
            "arguments": json.loads(tool_call.function.arguments)
        }
        self.streamed_args_for_tool[
            self.current_tool_id] = tool_call.function.arguments
        delta = DeltaMessage(
            content=extracted_tool_calls.content,
            tool_calls=[
                DeltaToolCall(index=self.current_tool_id,
                              id=tool_call.id,
                              type=tool_call.type,
                              function=DeltaFunctionCall(
                                  name=tool_call.function.name,
                                  arguments=tool_call.function.arguments))
            ])
        self.current_tool_id += 1
        # keep whatever follows the closing tag for the next chunk
        self._buffer = cur_text[consumed:]
        return delta

    # an opening tag without its closing tag: emit the text in front of it
    # and keep buffering from the tag onwards
    self._buffer = cur_text[start_idx:]
    return DeltaMessage(content=cur_text[:start_idx])

Granite20bFCToolParser ¶

Bases: ToolParser

Tool call parser for the granite-20b-functioncalling model intended for use with the examples/tool_chat_template_granite20b_fc.jinja template.

Used when --enable-auto-tool-choice --tool-call-parser granite-20b-fc are both set

Source code in vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py

@ToolParserManager.register_module("granite-20b-fc")
class Granite20bFCToolParser(ToolParser):
    """
    Tool call parser for the granite-20b-functioncalling model intended
    for use with the examples/tool_chat_template_granite20b_fc.jinja
    template.

    Used when --enable-auto-tool-choice --tool-call-parser granite-20b-fc
    are both set

    Each tool call is expected as a ``<function_call>`` tag followed by a
    JSON object with ``name`` and ``arguments`` keys.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Store the ``<function_call>`` marker and the regex matching it."""
        super().__init__(tokenizer)

        # marker that introduces every tool call
        self.bot_token = "<function_call>"
        self.tool_start_token = self.bot_token
        # the marker plus any whitespace that follows it
        self.tool_call_regex = re.compile(r"<function_call>\s*")

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """Extract all tool calls from a complete model response.

        Args:
            model_output: the full generated text.
            request: the originating request (not read by this method).

        Returns:
            ``tools_called=True`` with the parsed calls and any text in
            front of the first marker (``None`` when there is none).
            ``tools_called=False`` with the untouched output when the marker
            is absent or any exception occurs while parsing.
        """
        if self.tool_start_token not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        dec = JSONDecoder()
        try:
            matches = list(self.tool_call_regex.finditer(model_output))
            logger.debug("Found %d tool call matches", len(matches))

            raw_function_calls = []

            for i, match in enumerate(matches):
                # position after the <function_call> tag
                start_of_json = match.end()
                # end_index == the start of the next function call
                # (if exists)
                next_function_call_start = (matches[i + 1].start() if i +
                                            1 < len(matches) else None)

                # raw_decode parses one JSON value from the start of the
                # slice and tolerates trailing text; only the decoded value
                # (element 0) is kept
                raw_function_calls.append(
                    dec.raw_decode(
                        model_output[start_of_json:next_function_call_start])
                    [0])

            logger.debug("Extracted %d tool calls", len(raw_function_calls))
            tool_calls = [
                ToolCall(
                    type="function",
                    function=FunctionCall(
                        name=function_call["name"],
                        # function call args are JSON but as a string
                        arguments=json.dumps(function_call["arguments"],
                                             ensure_ascii=False),
                    ),
                ) for function_call in raw_function_calls
            ]

            # text generated in front of the first marker
            content = model_output[:model_output.find(self.bot_token)]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if content else None,
            )

        except Exception as e:
            logger.error("Error in extracting tool call from response %s", e)
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Turn the text generated so far into the next streaming delta.

        ``current_text`` is re-parsed on every call and compared with the
        state kept on the instance (``current_tool_id``,
        ``current_tool_name_sent``, ``streamed_args_for_tool`` and
        ``prev_tool_call_arr``).  These attributes are not assigned in this
        class's ``__init__``; presumably the ``ToolParser`` base class
        initialises them -- TODO confirm.

        Returns:
            A content delta when the text does not start with the marker, a
            tool-call delta carrying either the function name or a fragment
            of the arguments, or ``None`` when there is nothing to send yet
            (including when an error occurs).
        """

        # the text so far could still turn out to be the marker: wait
        if len(current_text) < len(
                self.bot_token) and self.bot_token.startswith(current_text):
            return None

        # plain text response: pass the chunk through unchanged
        if not current_text.startswith(self.bot_token):
            return DeltaMessage(content=delta_text)

        # bit mask flags for partial JSON parsing. If the name hasn't been
        # sent yet, don't allow sending
        # an incomplete string since OpenAI only ever (as far as I have
        # seen) allows sending the entire tool/ function name at once.
        flags = Allow.ALL if self.current_tool_name_sent \
            else Allow.ALL & ~Allow.STR
        try:
            # objects parsed so far, and for each whether its JSON text is
            # already complete
            tool_call_arr = []
            is_complete = []
            try:
                # NOTE(review): consume_space presumably returns the index of
                # the next non-whitespace character -- confirm in utils
                start_idx = len(self.bot_token)
                start_idx = consume_space(start_idx, current_text)

                while start_idx < len(current_text):
                    (obj,
                     end_idx) = partial_json_loads(current_text[start_idx:],
                                                   flags)
                    is_complete.append(
                        is_complete_json(current_text[start_idx:start_idx +
                                                      end_idx]))
                    start_idx += end_idx
                    start_idx = consume_space(start_idx, current_text)
                    # step over the marker of the next call; the text at
                    # this position is not compared with the marker
                    start_idx += len(self.bot_token)
                    start_idx = consume_space(start_idx, current_text)
                    tool_call_arr.append(obj)
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug('not enough tokens to parse into JSON yet')
                return None

            # select as the current tool call the one we're on the state at
            # (a current_tool_id of -1 selects the last parsed call)
            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
                if len(tool_call_arr) > 0 else {}

            # case -- if no tokens have been streamed for the tool, e.g.
            #   only the array brackets, stream nothing
            if len(tool_call_arr) == 0:
                return None

            # case: we are starting a new tool in the array
            #   -> array has > 0 length AND length has moved past cursor
            elif (len(tool_call_arr) > 0
                  and len(tool_call_arr) > self.current_tool_id + 1):

                # if we're moving on to a new call, first make sure we
                # haven't missed anything in the previous one that was
                # auto-generated due to JSON completions, but wasn't
                # streamed to the client yet.
                if self.current_tool_id >= 0:
                    cur_arguments = current_tool_call.get("arguments")
                    if cur_arguments:
                        cur_args_json = json.dumps(cur_arguments,
                                                   ensure_ascii=False)
                        # number of argument characters already streamed
                        sent = len(
                            self.streamed_args_for_tool[self.current_tool_id])
                        argument_diff = cur_args_json[sent:]

                        logger.debug("got arguments diff: %s", argument_diff)
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(index=self.current_tool_id,
                                          function=DeltaFunctionCall(
                                              arguments=argument_diff).
                                          model_dump(exclude_none=True))
                        ])
                        self.streamed_args_for_tool[
                            self.current_tool_id] += argument_diff
                    else:
                        delta = None
                else:
                    delta = None
                # re-set stuff pertaining to progress in the current tool
                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("starting on new tool %d", self.current_tool_id)
                # early return: prev_tool_call_arr is not updated on this
                # path
                return delta

            # if the current tool name hasn't been sent, send if available
            # - otherwise send nothing
            elif not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
                if function_name:

                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))
                    ])
                    self.current_tool_name_sent = True
                else:
                    delta = None

            # now we know we're on the same tool call and we're streaming
            # arguments
            else:
                cur_arguments = current_tool_call.get("arguments")
                delta = None

                if cur_arguments:
                    # number of argument characters already streamed
                    sent = len(
                        self.streamed_args_for_tool[self.current_tool_id])
                    cur_args_json = json.dumps(cur_arguments,
                                               ensure_ascii=False)
                    prev_arguments = self.prev_tool_call_arr[
                        self.current_tool_id].get("arguments")

                    argument_diff = None
                    if is_complete[self.current_tool_id]:
                        # the JSON text is complete: send everything that is
                        # still outstanding
                        argument_diff = cur_args_json[sent:]
                    elif prev_arguments:
                        prev_args_json = json.dumps(prev_arguments,
                                                    ensure_ascii=False)
                        if cur_args_json != prev_args_json:

                            # send only the unsent part of what the previous
                            # and current serialisations have in common
                            prefix = find_common_prefix(
                                prev_args_json, cur_args_json)
                            argument_diff = prefix[sent:]

                    if argument_diff is not None:
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(index=self.current_tool_id,
                                          function=DeltaFunctionCall(
                                              arguments=argument_diff).
                                          model_dump(exclude_none=True))
                        ])
                        self.streamed_args_for_tool[
                            self.current_tool_id] += argument_diff

            # remember this parse for diffing on the next call
            self.prev_tool_call_arr = tool_call_arr
            return delta

        except Exception as e:
            logger.error("Error trying to handle streaming tool call: %s", e)
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction "
                "error")
            return None

bot_token `instance-attribute` ¶

bot_token = '<function_call>'

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile('<function_call>\\s*')

tool_start_token `instance-attribute` ¶

tool_start_token = bot_token

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Record the ``<function_call>`` marker and the regex that finds it."""
    super().__init__(tokenizer)

    # the same string object serves as both token attributes
    marker = "<function_call>"
    self.bot_token = marker
    self.tool_start_token = marker

    # the marker together with any whitespace that follows it
    marker_pattern = r"<function_call>\s*"
    self.tool_call_regex = re.compile(marker_pattern)

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """Parse every ``<function_call>`` JSON payload in ``model_output``.

    Args:
        model_output: the full generated text.
        request: the originating request (not read by this method).

    Returns:
        ``tools_called=True`` with the parsed calls and any text in front of
        the first marker (``None`` when there is none).
        ``tools_called=False`` with the untouched output when the marker is
        absent or any exception occurs while parsing.
    """

    def _as_plain_text() -> ExtractedToolCallInformation:
        # result reported whenever no tool call can be returned
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    if self.tool_start_token not in model_output:
        return _as_plain_text()

    decoder = JSONDecoder()
    try:
        tag_hits = list(self.tool_call_regex.finditer(model_output))
        logger.debug("Found %d tool call matches", len(tag_hits))

        # each payload spans from the end of its own tag to the start of the
        # following tag; the final payload runs to the end of the output
        stops = [hit.start() for hit in tag_hits[1:]]
        stops.append(None)

        payloads = []
        for hit, stop in zip(tag_hits, stops):
            # raw_decode tolerates trailing text after the JSON value
            parsed, _ = decoder.raw_decode(model_output[hit.end():stop])
            payloads.append(parsed)

        logger.debug("Extracted %d tool calls", len(payloads))

        tool_calls = []
        for payload in payloads:
            fn_name = payload["name"]
            # function call args are JSON but as a string
            fn_args = json.dumps(payload["arguments"], ensure_ascii=False)
            tool_calls.append(
                ToolCall(
                    type="function",
                    function=FunctionCall(name=fn_name, arguments=fn_args),
                ))

        # text generated in front of the first marker, if any
        leading_text = model_output[:model_output.find(self.bot_token)]
        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=leading_text or None,
        )

    except Exception as e:
        logger.error("Error in extracting tool call from response %s", e)
        return _as_plain_text()

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Turn the text streamed so far into the next incremental delta.

    ``current_text`` is re-parsed from scratch on every call and compared
    with the parser state (``current_tool_id``, ``current_tool_name_sent``,
    ``streamed_args_for_tool`` and ``prev_tool_call_arr``) to decide what
    still has to be sent to the client.

    Args:
        previous_text: Text accumulated before this chunk.
        current_text: Text accumulated including this chunk.
        delta_text: Text of this chunk only.
        previous_token_ids: Not used by this parser.
        current_token_ids: Not used by this parser.
        delta_token_ids: Not used by this parser.
        request: Not used by this parser.

    Returns:
        ``None`` when nothing can be emitted yet (text is still a prefix of
        ``self.bot_token``, the JSON is not parseable yet, there is nothing
        new, or an error occurred); a ``DeltaMessage`` carrying ``content``
        when the output is not a tool call; otherwise a ``DeltaMessage``
        carrying a single ``DeltaToolCall`` with either the function name
        or the next slice of the JSON-encoded arguments.
    """

    # The text so far is a strict prefix of the tool-call marker: hold it
    # back until we can tell a tool call from ordinary content.
    if len(current_text) < len(
            self.bot_token) and self.bot_token.startswith(current_text):
        return None

    if not current_text.startswith(self.bot_token):
        # Ordinary content. If the previous call held its text back because
        # it looked like the start of the marker, that text was never sent;
        # flush the whole accumulated text instead of only this chunk so
        # nothing is dropped.
        # NOTE: assumes the previous call saw `previous_text` as its
        # `current_text` (i.e. current_text == previous_text + delta_text).
        held_back = (bool(previous_text)
                     and len(previous_text) < len(self.bot_token)
                     and self.bot_token.startswith(previous_text))
        return DeltaMessage(
            content=current_text if held_back else delta_text)

    # bit mask flags for partial JSON parsing. If the name hasn't been
    # sent yet, don't allow sending
    # an incomplete string since OpenAI only ever (as far as I have
    # seen) allows sending the entire tool/ function name at once.
    flags = Allow.ALL if self.current_tool_name_sent \
        else Allow.ALL & ~Allow.STR
    try:
        # parsed (possibly partial) tool calls, and for each one whether
        # its JSON text is already complete
        tool_call_arr = []
        is_complete = []
        try:
            start_idx = len(self.bot_token)
            start_idx = consume_space(start_idx, current_text)

            # walk over every "<marker> {json}" segment present so far
            while start_idx < len(current_text):
                (obj,
                 end_idx) = partial_json_loads(current_text[start_idx:],
                                               flags)
                is_complete.append(
                    is_complete_json(current_text[start_idx:start_idx +
                                                  end_idx]))
                start_idx += end_idx
                start_idx = consume_space(start_idx, current_text)
                # step over the marker expected in front of the next call
                # (its presence is not verified here)
                start_idx += len(self.bot_token)
                start_idx = consume_space(start_idx, current_text)
                tool_call_arr.append(obj)
        except partial_json_parser.core.exceptions.MalformedJSON:
            logger.debug('not enough tokens to parse into JSON yet')
            return None

        # select as the current tool call the one we're on the state at
        # (with current_tool_id == -1 this is the last parsed call)
        current_tool_call: dict = tool_call_arr[self.current_tool_id] \
            if len(tool_call_arr) > 0 else {}

        # case -- if no tokens have been streamed for the tool, e.g.
        #   only the array brackets, stream nothing
        if len(tool_call_arr) == 0:
            return None

        # case: we are starting a new tool in the array
        #   -> array has > 0 length AND length has moved past cursor
        elif (len(tool_call_arr) > 0
              and len(tool_call_arr) > self.current_tool_id + 1):

            # if we're moving on to a new call, first make sure we
            # haven't missed anything in the previous one that was
            # auto-generated due to JSON completions, but wasn't
            # streamed to the client yet.
            if self.current_tool_id >= 0:
                cur_arguments = current_tool_call.get("arguments")
                if cur_arguments:
                    cur_args_json = json.dumps(cur_arguments,
                                               ensure_ascii=False)
                    # number of argument characters already streamed
                    sent = len(
                        self.streamed_args_for_tool[self.current_tool_id])
                    argument_diff = cur_args_json[sent:]

                    logger.debug("got arguments diff: %s", argument_diff)
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=argument_diff).
                                      model_dump(exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += argument_diff
                else:
                    delta = None
            else:
                delta = None
            # re-set stuff pertaining to progress in the current tool
            self.current_tool_id = len(tool_call_arr) - 1
            self.current_tool_name_sent = False
            self.streamed_args_for_tool.append("")
            logger.debug("starting on new tool %d", self.current_tool_id)
            # returns before prev_tool_call_arr is refreshed below
            return delta

        # if the current tool name hasn't been sent, send if available
        # - otherwise send nothing
        elif not self.current_tool_name_sent:
            function_name = current_tool_call.get("name")
            if function_name:

                # first delta for this call: carries id, type and name
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  type="function",
                                  id=make_tool_call_id(),
                                  function=DeltaFunctionCall(
                                      name=function_name).model_dump(
                                          exclude_none=True))
                ])
                self.current_tool_name_sent = True
            else:
                delta = None

        # now we know we're on the same tool call and we're streaming
        # arguments
        else:
            cur_arguments = current_tool_call.get("arguments")
            delta = None

            if cur_arguments:
                sent = len(
                    self.streamed_args_for_tool[self.current_tool_id])
                cur_args_json = json.dumps(cur_arguments,
                                           ensure_ascii=False)
                prev_arguments = self.prev_tool_call_arr[
                    self.current_tool_id].get("arguments")

                argument_diff = None
                if is_complete[self.current_tool_id]:
                    # the call's JSON is finished: send everything left
                    argument_diff = cur_args_json[sent:]
                elif prev_arguments:
                    prev_args_json = json.dumps(prev_arguments,
                                                ensure_ascii=False)
                    if cur_args_json != prev_args_json:

                        # only the part shared by the previous and the
                        # current serialization is sent while the JSON
                        # is still incomplete
                        prefix = find_common_prefix(
                            prev_args_json, cur_args_json)
                        argument_diff = prefix[sent:]

                if argument_diff is not None:
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=argument_diff).
                                      model_dump(exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += argument_diff

        # remember this parse so the next call can diff against it
        self.prev_tool_call_arr = tool_call_arr
        return delta

    except Exception as e:
        # best effort: a failure on one chunk must not break the stream
        logger.error("Error trying to handle streaming tool call: %s", e)
        logger.debug(
            "Skipping chunk as a result of tool streaming extraction "
            "error")
        return None

GraniteToolParser ¶

Bases: ToolParser

Tool call parser for the granite 3.0 models. Intended for use with the examples/tool_chat_template_granite.jinja template.

Used when both `--enable-auto-tool-choice` and `--tool-call-parser granite` are set.

Source code in vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py

@ToolParserManager.register_module("granite")
class GraniteToolParser(ToolParser):
    """
    Tool call parser for the granite 3.0 models (the granite 3.1
    `<tool_call>` string prefix is accepted as well). Intended
    for use with the examples/tool_chat_template_granite.jinja
    template.

    Used when both --enable-auto-tool-choice and
    --tool-call-parser granite are set
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up the parser and the two markers that may precede a call."""
        super().__init__(tokenizer)
        # for granite 3.0, the token `<|tool_call|>`
        self.bot_token = "<|tool_call|>"
        # for granite 3.1, the string `<tool_call>`
        self.bot_string = "<tool_call>"

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """Parse a complete model response into tool calls.

        After surrounding whitespace and an optional leading marker
        (``bot_token`` and/or ``bot_string``) are removed, the text must be
        a non-empty JSON array of objects with ``name`` and ``arguments``
        keys.

        Args:
            model_output: The full text generated by the model.
            request: Not used by this parser.

        Returns:
            Tool-call information with ``tools_called=True`` and
            ``content=None`` when the array parsed successfully; otherwise
            ``tools_called=False`` with the untouched ``model_output`` as
            content (no array, empty array, or any parsing error).
        """
        stripped = model_output.strip()\
                    .removeprefix(self.bot_token)\
                    .removeprefix(self.bot_string)\
                    .lstrip()
        if not stripped or stripped[0] != '[':
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)
        try:
            raw_function_calls = json.loads(stripped)
            # defensive: text starting with '[' should only decode to a list
            if not isinstance(raw_function_calls, list):
                raise Exception(
                    f"Expected list, got {type(raw_function_calls)}")

            # An empty array holds no calls; reporting it as a tool call
            # would discard the model output, so hand it back as content.
            if not raw_function_calls:
                return ExtractedToolCallInformation(tools_called=False,
                                                    tool_calls=[],
                                                    content=model_output)

            logger.debug("Extracted %d tool calls", len(raw_function_calls))
            tool_calls = [
                ToolCall(
                    type="function",
                    function=FunctionCall(
                        name=function_call["name"],
                        # function call args are JSON but as a string
                        arguments=json.dumps(function_call["arguments"],
                                             ensure_ascii=False),
                    ),
                ) for function_call in raw_function_calls
            ]

            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=None,
            )

        except Exception as e:
            # invalid JSON, non-object entries or missing keys all end here:
            # fall back to treating the whole output as plain content
            logger.error("Error in extracting tool call from response %s", e)
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Turn the text streamed so far into the next incremental delta.

        ``current_text`` is re-parsed on every call and compared with the
        parser state (``current_tool_id``, ``current_tool_name_sent``,
        ``streamed_args_for_tool`` and ``prev_tool_call_arr``) to decide
        what still has to be sent.

        Only ``current_text`` and ``delta_text`` are read; the remaining
        parameters are not used by this parser.

        Returns:
            A ``DeltaMessage`` with ``content=delta_text`` when the text
            (after optional markers) does not start a JSON array; ``None``
            when nothing can be emitted yet or an error occurred; otherwise
            a ``DeltaMessage`` with one ``DeltaToolCall`` carrying either
            the function name or the next slice of JSON-encoded arguments.
        """

        # Locate where the JSON array should start, stepping over an
        # optional marker in token and/or string form (consume_space is
        # assumed to advance past whitespace).
        start_idx = consume_space(0, current_text)
        if current_text[start_idx:].startswith(self.bot_token):
            start_idx = consume_space(start_idx + len(self.bot_token),
                                      current_text)
        if current_text[start_idx:].startswith(self.bot_string):
            start_idx = consume_space(start_idx + len(self.bot_string),
                                      current_text)
        # Anything that is not (yet) followed by '[' is forwarded as
        # content, including text that so far consists only of a marker.
        if not current_text or start_idx >= len(current_text)\
            or current_text[start_idx] != '[':
            return DeltaMessage(content=delta_text)

        # bit mask flags for partial JSON parsing. If the name hasn't been
        # sent yet, don't allow sending
        # an incomplete string since OpenAI only ever (as far as I have
        # seen) allows sending the entire tool/ function name at once.
        flags = Allow.ALL if self.current_tool_name_sent \
            else Allow.ALL & ~Allow.STR
        try:
            tool_call_arr = None
            is_complete = None
            try:
                tool_calls, end_idx = partial_json_loads(
                    current_text[start_idx:], flags)
                if isinstance(tool_calls, list):
                    tool_call_arr = tool_calls
                else:
                    return DeltaMessage(content=delta_text)

                # Every entry but the last must already be complete. The
                # emptiness guard matters: with no parsed entries (e.g.
                # only "[" seen) indexing [-1] would raise IndexError and
                # be logged as an error by the handler below.
                is_complete = [True] * len(tool_calls)
                if is_complete and not is_complete_json(
                        current_text[start_idx:start_idx + end_idx]):
                    is_complete[-1] = False
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug('not enough tokens to parse into JSON yet')
                return None

            # case -- if no tokens have been streamed for the tool, e.g.
            #   only the array brackets, stream nothing
            if not tool_call_arr:
                return None

            # select as the current tool call the one we're on the state at
            # (with current_tool_id == -1 this is the last parsed call)
            current_tool_call: dict = tool_call_arr[self.current_tool_id]

            delta = None
            # case: we are starting a new tool in the array
            #   -> array has > 0 length AND length has moved past cursor
            if len(tool_call_arr) > self.current_tool_id + 1:

                # if we're moving on to a new call, first make sure we
                # haven't missed anything in the previous one that was
                # auto-generated due to JSON completions, but wasn't
                # streamed to the client yet.
                if self.current_tool_id >= 0:
                    cur_arguments = current_tool_call.get("arguments")
                    if cur_arguments:
                        cur_args_json = json.dumps(cur_arguments,
                                                   ensure_ascii=False)
                        # number of argument characters already streamed
                        sent = len(
                            self.streamed_args_for_tool[self.current_tool_id])
                        argument_diff = cur_args_json[sent:]

                        logger.debug("got arguments diff: %s", argument_diff)
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(index=self.current_tool_id,
                                          function=DeltaFunctionCall(
                                              arguments=argument_diff).
                                          model_dump(exclude_none=True))
                        ])
                        self.streamed_args_for_tool[
                            self.current_tool_id] += argument_diff

                # re-set stuff pertaining to progress in the current tool
                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("starting on new tool %d", self.current_tool_id)
                # returns before prev_tool_call_arr is refreshed below
                return delta

            # if the current tool name hasn't been sent, send if available
            # - otherwise send nothing
            elif not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
                if function_name:

                    # first delta for this call: carries id, type and name
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))
                    ])
                    self.current_tool_name_sent = True

            # now we know we're on the same tool call and we're streaming
            # arguments
            else:
                cur_arguments = current_tool_call.get("arguments")

                if cur_arguments:
                    sent = len(
                        self.streamed_args_for_tool[self.current_tool_id])
                    cur_args_json = json.dumps(cur_arguments,
                                               ensure_ascii=False)
                    prev_arguments = self.prev_tool_call_arr[
                        self.current_tool_id].get("arguments")

                    argument_diff = None
                    if is_complete[self.current_tool_id]:
                        # the call's JSON is finished: send everything left
                        argument_diff = cur_args_json[sent:]
                    elif prev_arguments:
                        prev_args_json = json.dumps(prev_arguments,
                                                    ensure_ascii=False)
                        if cur_args_json != prev_args_json:
                            # only the part shared by the previous and the
                            # current serialization is sent while the JSON
                            # is still incomplete
                            prefix = find_common_prefix(
                                prev_args_json, cur_args_json)
                            argument_diff = prefix[sent:]

                    if argument_diff is not None:
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(index=self.current_tool_id,
                                          function=DeltaFunctionCall(
                                              arguments=argument_diff).
                                          model_dump(exclude_none=True))
                        ])
                        self.streamed_args_for_tool[
                            self.current_tool_id] += argument_diff

            # remember this parse so the next call can diff against it
            self.prev_tool_call_arr = tool_call_arr
            return delta

        except Exception as e:
            # best effort: a failure on one chunk must not break the stream
            logger.error("Error trying to handle streaming tool call: %s", e)
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction "
                "error")
            return None

bot_string `instance-attribute` ¶

bot_string = '<tool_call>'

bot_token `instance-attribute` ¶

bot_token = '<|tool_call|>'

`__init__` ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Set up the parser and the two markers that may precede a tool call."""
    super().__init__(tokenizer)
    # for granite 3.0, the token `<|tool_call|>`
    self.bot_token = "<|tool_call|>"
    # for granite 3.1, the string `<tool_call>`
    self.bot_string = "<tool_call>"

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """Parse a complete model response into tool calls.

    After surrounding whitespace and an optional leading marker
    (``bot_token`` and/or ``bot_string``) are removed, the text must be a
    non-empty JSON array of objects with ``name`` and ``arguments`` keys.

    Args:
        model_output: The full text generated by the model.
        request: Not used by this parser.

    Returns:
        Tool-call information with ``tools_called=True`` and
        ``content=None`` when the array parsed successfully; otherwise
        ``tools_called=False`` with the untouched ``model_output`` as
        content (no array, empty array, or any parsing error).
    """
    stripped = model_output.strip()\
                .removeprefix(self.bot_token)\
                .removeprefix(self.bot_string)\
                .lstrip()
    if not stripped or stripped[0] != '[':
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)
    try:
        raw_function_calls = json.loads(stripped)
        # defensive: text starting with '[' should only decode to a list
        if not isinstance(raw_function_calls, list):
            raise Exception(
                f"Expected list, got {type(raw_function_calls)}")

        # An empty array holds no calls; reporting it as a tool call would
        # discard the model output, so hand it back as content instead.
        if not raw_function_calls:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        logger.debug("Extracted %d tool calls", len(raw_function_calls))
        tool_calls = [
            ToolCall(
                type="function",
                function=FunctionCall(
                    name=function_call["name"],
                    # function call args are JSON but as a string
                    arguments=json.dumps(function_call["arguments"],
                                         ensure_ascii=False),
                ),
            ) for function_call in raw_function_calls
        ]

        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=None,
        )

    except Exception as e:
        # invalid JSON, non-object entries or missing keys all end here:
        # fall back to treating the whole output as plain content
        logger.error("Error in extracting tool call from response %s", e)
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Turn the text streamed so far into the next incremental delta.

    ``current_text`` is re-parsed on every call and compared with the
    parser state (``current_tool_id``, ``current_tool_name_sent``,
    ``streamed_args_for_tool`` and ``prev_tool_call_arr``) to decide what
    still has to be sent.

    Only ``current_text`` and ``delta_text`` are read; the remaining
    parameters are not used by this parser.

    Returns:
        A ``DeltaMessage`` with ``content=delta_text`` when the text (after
        optional markers) does not start a JSON array; ``None`` when
        nothing can be emitted yet or an error occurred; otherwise a
        ``DeltaMessage`` with one ``DeltaToolCall`` carrying either the
        function name or the next slice of JSON-encoded arguments.
    """

    # Locate where the JSON array should start, stepping over an optional
    # marker in token and/or string form (consume_space is assumed to
    # advance past whitespace).
    start_idx = consume_space(0, current_text)
    if current_text[start_idx:].startswith(self.bot_token):
        start_idx = consume_space(start_idx + len(self.bot_token),
                                  current_text)
    if current_text[start_idx:].startswith(self.bot_string):
        start_idx = consume_space(start_idx + len(self.bot_string),
                                  current_text)
    # Anything that is not (yet) followed by '[' is forwarded as content,
    # including text that so far consists only of a marker.
    if not current_text or start_idx >= len(current_text)\
        or current_text[start_idx] != '[':
        return DeltaMessage(content=delta_text)

    # bit mask flags for partial JSON parsing. If the name hasn't been
    # sent yet, don't allow sending
    # an incomplete string since OpenAI only ever (as far as I have
    # seen) allows sending the entire tool/ function name at once.
    flags = Allow.ALL if self.current_tool_name_sent \
        else Allow.ALL & ~Allow.STR
    try:
        tool_call_arr = None
        is_complete = None
        try:
            tool_calls, end_idx = partial_json_loads(
                current_text[start_idx:], flags)
            if isinstance(tool_calls, list):
                tool_call_arr = tool_calls
            else:
                return DeltaMessage(content=delta_text)

            # Every entry but the last must already be complete. The
            # emptiness guard matters: with no parsed entries (e.g. only
            # "[" seen) indexing [-1] would raise IndexError and be logged
            # as an error by the handler below.
            is_complete = [True] * len(tool_calls)
            if is_complete and not is_complete_json(
                    current_text[start_idx:start_idx + end_idx]):
                is_complete[-1] = False
        except partial_json_parser.core.exceptions.MalformedJSON:
            logger.debug('not enough tokens to parse into JSON yet')
            return None

        # case -- if no tokens have been streamed for the tool, e.g.
        #   only the array brackets, stream nothing
        if not tool_call_arr:
            return None

        # select as the current tool call the one we're on the state at
        # (with current_tool_id == -1 this is the last parsed call)
        current_tool_call: dict = tool_call_arr[self.current_tool_id]

        delta = None
        # case: we are starting a new tool in the array
        #   -> array has > 0 length AND length has moved past cursor
        if len(tool_call_arr) > self.current_tool_id + 1:

            # if we're moving on to a new call, first make sure we
            # haven't missed anything in the previous one that was
            # auto-generated due to JSON completions, but wasn't
            # streamed to the client yet.
            if self.current_tool_id >= 0:
                cur_arguments = current_tool_call.get("arguments")
                if cur_arguments:
                    cur_args_json = json.dumps(cur_arguments,
                                               ensure_ascii=False)
                    # number of argument characters already streamed
                    sent = len(
                        self.streamed_args_for_tool[self.current_tool_id])
                    argument_diff = cur_args_json[sent:]

                    logger.debug("got arguments diff: %s", argument_diff)
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=argument_diff).
                                      model_dump(exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += argument_diff

            # re-set stuff pertaining to progress in the current tool
            self.current_tool_id = len(tool_call_arr) - 1
            self.current_tool_name_sent = False
            self.streamed_args_for_tool.append("")
            logger.debug("starting on new tool %d", self.current_tool_id)
            # returns before prev_tool_call_arr is refreshed below
            return delta

        # if the current tool name hasn't been sent, send if available
        # - otherwise send nothing
        elif not self.current_tool_name_sent:
            function_name = current_tool_call.get("name")
            if function_name:

                # first delta for this call: carries id, type and name
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  type="function",
                                  id=make_tool_call_id(),
                                  function=DeltaFunctionCall(
                                      name=function_name).model_dump(
                                          exclude_none=True))
                ])
                self.current_tool_name_sent = True

        # now we know we're on the same tool call and we're streaming
        # arguments
        else:
            cur_arguments = current_tool_call.get("arguments")

            if cur_arguments:
                sent = len(
                    self.streamed_args_for_tool[self.current_tool_id])
                cur_args_json = json.dumps(cur_arguments,
                                           ensure_ascii=False)
                prev_arguments = self.prev_tool_call_arr[
                    self.current_tool_id].get("arguments")

                argument_diff = None
                if is_complete[self.current_tool_id]:
                    # the call's JSON is finished: send everything left
                    argument_diff = cur_args_json[sent:]
                elif prev_arguments:
                    prev_args_json = json.dumps(prev_arguments,
                                                ensure_ascii=False)
                    if cur_args_json != prev_args_json:
                        # only the part shared by the previous and the
                        # current serialization is sent while the JSON is
                        # still incomplete
                        prefix = find_common_prefix(
                            prev_args_json, cur_args_json)
                        argument_diff = prefix[sent:]

                if argument_diff is not None:
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=argument_diff).
                                      model_dump(exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += argument_diff

        # remember this parse so the next call can diff against it
        self.prev_tool_call_arr = tool_call_arr
        return delta

    except Exception as e:
        # best effort: a failure on one chunk must not break the stream
        logger.error("Error trying to handle streaming tool call: %s", e)
        logger.debug(
            "Skipping chunk as a result of tool streaming extraction "
            "error")
        return None

Hermes2ProToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py

@ToolParserManager.register_module("hermes")
class Hermes2ProToolParser(ToolParser):

    def __init__(self, tokenizer: AnyTokenizer):
        """Prepare streaming state and the ``<tool_call>`` tag lookups.

        Args:
            tokenizer: Tokenizer forwarded to the base ``ToolParser``
                constructor. ``self.model_tokenizer`` is read afterwards
                (presumably populated by the base class) to encode and
                decode the tool-call tags.

        Raises:
            ValueError: If ``self.model_tokenizer`` is falsy once the base
                constructor has run.
        """
        super().__init__(tokenizer)

        # A Mistral tokenizer wrapper is swapped for its ``.tokenizer``
        # attribute; the mismatch is logged.
        if isinstance(self.model_tokenizer, MistralTokenizer):
            logger.error(
                "Detected Mistral tokenizer when using a Hermes model")
            self.model_tokenizer = self.model_tokenizer.tokenizer

        # Cursors tracking progress through a streamed response.
        self.current_tool_id: int = -1
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
        # argument text already streamed, one entry per tool call
        self.streamed_args_for_tool: list[str] = []
        # tag fragments held back by tool_call_delta_buffer
        self.buffered_delta_text = ""

        # Literal tags and the patterns built around them.
        self.tool_call_start_token: str = "<tool_call>"
        self.tool_call_end_token: str = "</tool_call>"
        self.tool_call_regex = re.compile(
            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL)
        self.scratch_pad_regex = re.compile(
            r"<scratch_pad>(.*?)</scratch_pad>", re.DOTALL)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction.")

        tok = self.model_tokenizer

        # Token ids making up each tag ...
        self.tool_call_start_token_ids = tok.encode(
            self.tool_call_start_token, add_special_tokens=False)
        self.tool_call_end_token_ids = tok.encode(
            self.tool_call_end_token, add_special_tokens=False)

        # ... and the text of every one of those tokens decoded on its own,
        # which is what tool_call_delta_buffer compares deltas against.
        self.tool_call_start_token_array = [
            tok.decode([piece_id])
            for piece_id in self.tool_call_start_token_ids
        ]
        self.tool_call_end_token_array = [
            tok.decode([piece_id])
            for piece_id in self.tool_call_end_token_ids
        ]

    # Very simple idea: when encountering tokens like <, tool, _call, >,
    # <, /, tool, _call, >, store them in a buffer.
    # When the last token is encountered, empty the buffer and return it.
    # If a token appears in an incorrect sequence while storing in the buffer,
    # return the preceding buffer along with the token.
    def tool_call_delta_buffer(self, delta_text: str):
        """Hold back tag fragments until a tag can be emitted in one piece.

        A delta that equals one of the decoded tag pieces, but not the last
        piece of either tag, is appended to ``self.buffered_delta_text`` and
        ``""`` is returned. Any other delta flushes the buffer: the buffered
        text followed by ``delta_text`` is returned and the buffer is
        cleared.

        Args:
            delta_text: Newly generated text for this step.

        Returns:
            The text that should be parsed for this step (possibly ``""``).
        """
        start_pieces = self.tool_call_start_token_array
        end_pieces = self.tool_call_end_token_array

        looks_like_tag_piece = (delta_text in start_pieces
                                or delta_text in end_pieces)

        # Only a non-final tag piece keeps accumulating; the final piece of
        # either tag falls through to the flush below.
        if looks_like_tag_piece and not (delta_text == start_pieces[-1]
                                         or delta_text == end_pieces[-1]):
            self.buffered_delta_text += delta_text
            return ""

        # Flush: either a tag was completed or the sequence was interrupted
        # by unrelated text.
        pending = self.buffered_delta_text
        self.buffered_delta_text = ""
        return pending + delta_text

    def adjust_request(
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """Keep special tokens in the output when tools may be called.

        Args:
            request: The incoming chat completion request; modified in place.

        Returns:
            The same ``request`` object.
        """
        # Nothing to adjust when no tools are offered or tool use is off.
        if not request.tools or request.tool_choice == 'none':
            return request

        # Some models mark the tool_call tags as "special" tokens. If they
        # were stripped before reaching the tool parser, tool calling would
        # break, so special tokens must not be skipped.
        request.skip_special_tokens = False
        return request

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Pull every ``<tool_call>`` JSON payload out of a full response.

        Args:
            model_output: The complete generated text.
            request: The originating request (not read by this method).

        Returns:
            ``tools_called=True`` with the parsed calls and any text that
            preceded the first opening tag (``None`` if there was none).
            When no opening tag is present, or when parsing raises, the
            whole ``model_output`` is returned as content with
            ``tools_called=False``.
        """
        opening_tag = self.tool_call_start_token

        # Cheap pre-check so tag-free output skips the regex entirely.
        if opening_tag not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        try:
            # The pattern has two alternatives -- a closed
            # <tool_call>...</tool_call> pair, or an opening tag running to
            # end-of-string -- so findall yields 2-tuples in which the
            # alternative that did not match is an empty string.
            payloads = [
                json.loads(closed_body or dangling_body)
                for closed_body, dangling_body in
                self.tool_call_regex.findall(model_output)
            ]

            # Build the API objects only after every payload has parsed.
            tool_calls = []
            for payload in payloads:
                function = FunctionCall(
                    name=payload["name"],
                    # arguments travel as a JSON-encoded string
                    arguments=json.dumps(payload["arguments"],
                                         ensure_ascii=False))
                tool_calls.append(ToolCall(type="function",
                                           function=function))

            # Whatever came before the first opening tag is plain content.
            leading_text, _, _ = model_output.partition(opening_tag)
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=leading_text or None)

        except Exception:
            logger.exception(
                "Error in extracting tool call from response.")
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Turn one streaming step into a content or tool-call delta.

        The step's text is first passed through ``tool_call_delta_buffer``.
        The method then counts ``<tool_call>`` / ``</tool_call>`` tags in the
        previous and current text to decide whether this step is plain text,
        the start of a tool call, a continuation of one, or its end. Tool
        call JSON is parsed with ``partial_json_parser`` and compared with
        the state saved from the previous step.

        Mutates ``self.current_tool_id``, ``self.current_tool_name_sent``,
        ``self.streamed_args_for_tool``, ``self.prev_tool_call_arr`` and
        (through ``tool_call_delta_buffer``) ``self.buffered_delta_text``.

        Args:
            previous_text: Text generated before this step.
            current_text: Text generated up to and including this step.
            delta_text: Text generated in this step.
            previous_token_ids: Not read by this method.
            current_token_ids: Not read by this method.
            delta_token_ids: Token ids of this step; only their count and a
                debug log line use them.
            request: Not read by this method.

        Returns:
            A ``DeltaMessage`` carrying either ``content`` or a single
            ``DeltaToolCall`` (function name or an argument fragment), or
            ``None`` when nothing should be emitted for this step -- which
            includes any exception raised while parsing.
        """
        # 1. All tokens are parsed based on _text, not token_ids.
        # 2. All incoming text data is processed by the tool_call_delta_buffer
        #    function for buffering before being used for parsing.

        delta_text = self.tool_call_delta_buffer(delta_text)
        # If the last characters of previous_text
        # match self.buffered_delta_text, remove only the matching part.
        # NOTE(review): when the buffer is empty, ``-len(...)`` is ``-0`` and
        # the slice is the whole string, so the test below only succeeds if
        # previous_text is empty too.
        if (len(previous_text) >= len(self.buffered_delta_text)
                and previous_text[-len(self.buffered_delta_text):]
                == self.buffered_delta_text):
            previous_text = previous_text[:-len(self.buffered_delta_text)]
            current_text = previous_text + delta_text

        logger.debug("delta_text: %s", delta_text)
        logger.debug("delta_token_ids: %s", delta_token_ids)
        # check to see if we should be streaming a tool call - is there a
        # tool call start tag anywhere in the text so far? If not, this
        # delta is ordinary content.
        if self.tool_call_start_token not in current_text:
            logger.debug("No tool call tokens found!")
            return DeltaMessage(content=delta_text)

        try:

            # figure out where we are in the parsing by counting tool call
            # start & end tags
            prev_tool_start_count = previous_text.count(
                self.tool_call_start_token)
            prev_tool_end_count = previous_text.count(self.tool_call_end_token)
            cur_tool_start_count = current_text.count(
                self.tool_call_start_token)
            cur_tool_end_count = current_text.count(self.tool_call_end_token)
            tool_call_portion = None
            text_portion = None

            # case: every opened tool call is closed and no end tag arrived
            # in this step, so this delta is plain text after a tool call
            if (cur_tool_start_count == cur_tool_end_count
                    and prev_tool_end_count == cur_tool_end_count
                    and self.tool_call_end_token not in delta_text):
                logger.debug("Generating text content! skipping tool parsing.")
                return DeltaMessage(content=delta_text)

            # An end tag arrived in this step: isolate the JSON between the
            # last start tag and the end tag, and cut the delta at the tag.
            if self.tool_call_end_token in delta_text:
                logger.debug("tool_call_end_token in delta_text")
                # NOTE(review): current_text presumably already ends with
                # delta_text, so this appends it a second time; only the
                # part after the last start tag and before the first end
                # tag that follows it is kept.
                full_text = current_text + delta_text
                tool_call_portion = full_text.split(
                    self.tool_call_start_token)[-1].split(
                        self.tool_call_end_token)[0].rstrip()
                delta_text = delta_text.split(
                    self.tool_call_end_token)[0].rstrip()
                # NOTE(review): delta_text was just truncated at the end tag,
                # so this split finds no tag and text_portion is simply the
                # left-stripped delta_text, not the text after the tag.
                text_portion = delta_text.split(
                    self.tool_call_end_token)[-1].lstrip()

            # From here on we are in the implicit "else" of the text-only
            # early return above: this delta has something to do with a
            # tool call.
            # Flags for partial JSON parsing; the constants exported from
            # "Allow" are combined as a bit mask. Until the function name
            # has been sent, partial strings are not allowed (presumably so
            # that a half-generated name is never parsed -- confirm against
            # partial_json_parser).
            flags = Allow.ALL if self.current_tool_name_sent \
                else Allow.ALL & ~Allow.STR

            # case -- we're starting a new tool call
            if (cur_tool_start_count > cur_tool_end_count
                    and cur_tool_start_count > prev_tool_start_count):
                # a multi-token delta may already contain JSON after the tag
                if len(delta_token_ids) > 1:
                    tool_call_portion = current_text.split(
                        self.tool_call_start_token)[-1]
                else:
                    tool_call_portion = None
                    delta = None

                text_portion = None

                # set cursors and state appropriately
                self.current_tool_id += 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("Starting on a new tool %s", self.current_tool_id)

            # case -- we're updating an existing tool call
            elif (cur_tool_start_count > cur_tool_end_count
                  and cur_tool_start_count == prev_tool_start_count):

                # get the portion of the text that's the tool call
                tool_call_portion = current_text.split(
                    self.tool_call_start_token)[-1]
                text_portion = None

            # case -- the current tool call is being closed.
            elif (cur_tool_start_count == cur_tool_end_count
                  and cur_tool_end_count >= prev_tool_end_count):
                if (self.prev_tool_call_arr is None
                        or len(self.prev_tool_call_arr) == 0):
                    logger.debug(
                        "attempting to close tool call, but no tool call")
                    return None
                diff = self.prev_tool_call_arr[self.current_tool_id].get(
                    "arguments")
                if diff:
                    # NOTE(review): ``diff is str`` compares the value with
                    # the ``str`` type object and is therefore False for any
                    # string value, so the unicode_escape round trip never
                    # runs; ``diff`` is reassigned a few lines below anyway.
                    diff = diff.encode('utf-8').decode(
                        'unicode_escape') if diff is str else diff
                    # without a closing '"}' in this delta there is nothing
                    # left to flush
                    if ('"}' not in delta_text):
                        return None
                    # emit the delta up to and including its last '"}'
                    end_loc = delta_text.rindex('"}')
                    diff = delta_text[:end_loc] + '"}'
                    logger.debug(
                        "Finishing tool and found diff that had not "
                        "been streamed yet: %s", diff)
                    self.streamed_args_for_tool[self.current_tool_id] \
                        += diff
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=diff).model_dump(
                                              exclude_none=True))
                    ])

            # case -- otherwise we're just generating text
            else:
                # strip any tags so they never leak into the content
                text = delta_text.replace(self.tool_call_start_token, "")
                text = text.replace(self.tool_call_end_token, "")
                delta = DeltaMessage(tool_calls=[], content=text)
                return delta

            try:

                # parse whatever JSON is available so far; an empty or
                # missing portion yields None instead of a parsed object
                current_tool_call = partial_json_parser.loads(
                    tool_call_portion or "{}",
                    flags) if tool_call_portion else None
                logger.debug("Parsed tool call %s", current_tool_call)
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug('not enough tokens to parse into JSON yet')
                return None
            except json.decoder.JSONDecodeError:
                logger.debug("unable to parse JSON")
                return None

            # case - we haven't sent the tool name yet. If it's available, send
            #   it. otherwise, wait until it's available.
            if not self.current_tool_name_sent:
                if (current_tool_call is None):
                    return None
                function_name: Union[str, None] = current_tool_call.get("name")
                if function_name:
                    self.current_tool_name_sent = True
                    # the first delta of a tool call carries its id, type
                    # and function name
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))
                    ])
                else:
                    return None
            # case -- otherwise, send the tool call delta

            # if the tool call portion is None, send the delta as text
            if tool_call_portion is None:
                # if there's text but not tool calls, send that -
                # otherwise None to skip chunk
                delta = DeltaMessage(content=delta_text) \
                    if text_portion is not None else None
                return delta

            # now, the nitty-gritty of tool calls
            # now we have the portion to parse as tool call.

            logger.debug("Trying to parse current tool call with ID %s",
                         self.current_tool_id)

            # if we're starting a new tool call, push an empty object in as
            #   a placeholder for the arguments
            if len(self.prev_tool_call_arr) <= self.current_tool_id:
                self.prev_tool_call_arr.append({})

            # main logic for tool parsing here - compare prev. partially-parsed
            #   JSON to the current partially-parsed JSON
            prev_arguments = (
                self.prev_tool_call_arr[self.current_tool_id].get("arguments"))
            cur_arguments = current_tool_call.get("arguments")

            logger.debug("diffing old arguments: %s", prev_arguments)
            logger.debug("against new ones: %s", cur_arguments)

            # case -- no arguments have been created yet. skip sending a delta.
            if not cur_arguments and not prev_arguments:
                logger.debug("Skipping text %s - no arguments", delta_text)
                delta = None

            # case -- prev arguments are defined, but none are now.
            #   probably impossible, but not a fatal error - just keep going
            elif not cur_arguments and prev_arguments:
                logger.error("should be impossible to have arguments reset "
                             "mid-call. skipping streaming anything.")
                delta = None

            # case -- we now have the first info about arguments available from
            #   autocompleting the JSON
            elif cur_arguments and not prev_arguments:
                # extract the content after {"name": ..., "arguments":
                #   directly from tool_call_portion as cur_arguments_json,
                #   since cur_arguments may differ from the original text
                #   due to partial JSON parsing
                #   for example, tool_call_portion =
                #     {"name": "search", "arguments": {"search_request": {"
                #   but cur_arguments =
                #     {"search_request": {}}
                function_name = current_tool_call.get("name")
                match = re.search(
                    r'\{"name":\s*"' +
                    re.escape(function_name) + r'"\s*,\s*"arguments":\s*(.*)',
                    tool_call_portion.strip(), re.DOTALL)
                if match:
                    cur_arguments_json = match.group(1)
                else:
                    # fall back to re-serialising the parsed arguments when
                    # the raw text does not have the expected key order
                    cur_arguments_json = json.dumps(cur_arguments,
                                                    ensure_ascii=False)

                logger.debug("finding %s in %s", delta_text,
                             cur_arguments_json)

                # get the location where previous args differ from current.
                if (delta_text not in cur_arguments_json):
                    return None
                args_delta_start_loc = cur_arguments_json. \
                                           rindex(delta_text) + \
                                           len(delta_text)

                # use that to find the actual delta: everything up to the end
                # of the last occurrence of delta_text
                arguments_delta = cur_arguments_json[:args_delta_start_loc]
                logger.debug("First tokens in arguments received: %s",
                             arguments_delta)

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  function=DeltaFunctionCall(
                                      arguments=arguments_delta).model_dump(
                                          exclude_none=True))
                ])
                self.streamed_args_for_tool[self.current_tool_id] \
                    += arguments_delta

            # last case -- we have an update to existing arguments.
            elif cur_arguments and prev_arguments:
                # judge whether the tool_call_portion is a complete JSON
                try:
                    json.loads(tool_call_portion)
                    is_complete_json = True
                except Exception:
                    is_complete_json = False

                # if the delta_text ends with a '}' and tool_call_portion is a
                #   complete JSON, then the last '}' does not belong to the
                #   arguments, so we should trim it off
                if isinstance(delta_text, str) \
                    and len(delta_text.rstrip()) >= 1 \
                    and delta_text.rstrip()[-1] == '}' \
                    and is_complete_json:
                    delta_text = delta_text.rstrip()[:-1]

                logger.debug("got diff %s", delta_text)

                # the raw delta text is forwarded as the argument fragment
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  function=DeltaFunctionCall(
                                      arguments=delta_text).model_dump(
                                          exclude_none=True))
                ])
                self.streamed_args_for_tool[self.current_tool_id] \
                    += delta_text

            # handle saving the state for the current tool into
            # the "prev" list for use in diffing for the next iteration
            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
                self.prev_tool_call_arr[self.current_tool_id] = \
                    current_tool_call
            else:
                self.prev_tool_call_arr.append(current_tool_call)

            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None  # do not stream a delta. skip this token ID.

buffered_delta_text `instance-attribute` ¶

buffered_delta_text = ''

current_tool_id `instance-attribute` ¶

current_tool_id: int = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

model_tokenizer `instance-attribute` ¶

model_tokenizer = tokenizer

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

scratch_pad_regex `instance-attribute` ¶

scratch_pad_regex = compile(
    "<scratch_pad>(.*?)</scratch_pad>", DOTALL
)

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

tool_call_end_token `instance-attribute` ¶

tool_call_end_token: str = '</tool_call>'

tool_call_end_token_array `instance-attribute` ¶

tool_call_end_token_array = [
    (decode([token_id]))
    for token_id in (tool_call_end_token_ids)
]

tool_call_end_token_ids `instance-attribute` ¶

tool_call_end_token_ids = encode(
    tool_call_end_token, add_special_tokens=False
)

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile(
    "<tool_call>(.*?)</tool_call>|<tool_call>(.*)", DOTALL
)

tool_call_start_token `instance-attribute` ¶

tool_call_start_token: str = '<tool_call>'

tool_call_start_token_array `instance-attribute` ¶

tool_call_start_token_array = [
    (decode([token_id]))
    for token_id in (tool_call_start_token_ids)
]

tool_call_start_token_ids `instance-attribute` ¶

tool_call_start_token_ids = encode(
    tool_call_start_token, add_special_tokens=False
)

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Prepare streaming state and the ``<tool_call>`` tag lookups.

    Args:
        tokenizer: Tokenizer forwarded to the base ``ToolParser``
            constructor. ``self.model_tokenizer`` is read afterwards
            (presumably populated by the base class) to encode and
            decode the tool-call tags.

    Raises:
        ValueError: If ``self.model_tokenizer`` is falsy once the base
            constructor has run.
    """
    super().__init__(tokenizer)

    # A Mistral tokenizer wrapper is swapped for its ``.tokenizer``
    # attribute; the mismatch is logged.
    if isinstance(self.model_tokenizer, MistralTokenizer):
        logger.error(
            "Detected Mistral tokenizer when using a Hermes model")
        self.model_tokenizer = self.model_tokenizer.tokenizer

    # Cursors tracking progress through a streamed response.
    self.current_tool_id: int = -1
    self.current_tool_name_sent: bool = False
    self.prev_tool_call_arr: list[dict] = []
    # argument text already streamed, one entry per tool call
    self.streamed_args_for_tool: list[str] = []
    # tag fragments held back by tool_call_delta_buffer
    self.buffered_delta_text = ""

    # Literal tags and the patterns built around them.
    self.tool_call_start_token: str = "<tool_call>"
    self.tool_call_end_token: str = "</tool_call>"
    self.tool_call_regex = re.compile(
        r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL)
    self.scratch_pad_regex = re.compile(
        r"<scratch_pad>(.*?)</scratch_pad>", re.DOTALL)

    if not self.model_tokenizer:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction.")

    tok = self.model_tokenizer

    # Token ids making up each tag ...
    self.tool_call_start_token_ids = tok.encode(
        self.tool_call_start_token, add_special_tokens=False)
    self.tool_call_end_token_ids = tok.encode(
        self.tool_call_end_token, add_special_tokens=False)

    # ... and the text of every one of those tokens decoded on its own,
    # which is what tool_call_delta_buffer compares deltas against.
    self.tool_call_start_token_array = [
        tok.decode([piece_id])
        for piece_id in self.tool_call_start_token_ids
    ]
    self.tool_call_end_token_array = [
        tok.decode([piece_id])
        for piece_id in self.tool_call_end_token_ids
    ]

adjust_request ¶

adjust_request(
    request: ChatCompletionRequest,
) -> ChatCompletionRequest

Source code in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py

def adjust_request(
        self, request: ChatCompletionRequest) -> ChatCompletionRequest:
    """Keep special tokens in the output when tools may be called.

    Args:
        request: The incoming chat completion request; modified in place.

    Returns:
        The same ``request`` object.
    """
    # Nothing to adjust when no tools are offered or tool use is off.
    if not request.tools or request.tool_choice == 'none':
        return request

    # Some models mark the tool_call tags as "special" tokens. If they
    # were stripped before reaching the tool parser, tool calling would
    # break, so special tokens must not be skipped.
    request.skip_special_tokens = False
    return request

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Pull every ``<tool_call>`` JSON payload out of a full response.

    Args:
        model_output: The complete generated text.
        request: The originating request (not read by this method).

    Returns:
        ``tools_called=True`` with the parsed calls and any text that
        preceded the first opening tag (``None`` if there was none).
        When no opening tag is present, or when parsing raises, the
        whole ``model_output`` is returned as content with
        ``tools_called=False``.
    """
    opening_tag = self.tool_call_start_token

    # Cheap pre-check so tag-free output skips the regex entirely.
    if opening_tag not in model_output:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        # The pattern has two alternatives -- a closed
        # <tool_call>...</tool_call> pair, or an opening tag running to
        # end-of-string -- so findall yields 2-tuples in which the
        # alternative that did not match is an empty string.
        payloads = [
            json.loads(closed_body or dangling_body)
            for closed_body, dangling_body in
            self.tool_call_regex.findall(model_output)
        ]

        # Build the API objects only after every payload has parsed.
        tool_calls = []
        for payload in payloads:
            function = FunctionCall(
                name=payload["name"],
                # arguments travel as a JSON-encoded string
                arguments=json.dumps(payload["arguments"],
                                     ensure_ascii=False))
            tool_calls.append(ToolCall(type="function",
                                       function=function))

        # Whatever came before the first opening tag is plain content.
        leading_text, _, _ = model_output.partition(opening_tag)
        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=leading_text or None)

    except Exception:
        logger.exception(
            "Error in extracting tool call from response.")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """
    Turn one streamed chunk into the next delta to send to the client.

    The text is classified by counting tool-call start/end tokens in
    ``previous_text`` and ``current_text``; the open tool call (if any) is
    parsed as partial JSON and compared against the state saved from the
    previous chunk.

    Args:
        previous_text: Text generated before this chunk.
        current_text: Text generated so far, including this chunk.
        delta_text: Text of this chunk; it is first passed through
            ``tool_call_delta_buffer``.
        previous_token_ids: Not read by this function.
        current_token_ids: Not read by this function.
        delta_token_ids: Token ids of this chunk; only their count and
            their debug representation are used.
        request: Not read by this function.

    Returns:
        A ``DeltaMessage`` carrying plain content or a tool-call fragment
        (name or arguments), or ``None`` when nothing should be emitted
        for this chunk. Any exception raised while parsing is logged and
        also results in ``None``.
    """
    # 1. All tokens are parsed based on _text, not token_ids.
    # 2. All incoming text data is processed by the tool_call_delta_buffer
    #    function for buffering before being used for parsing.

    delta_text = self.tool_call_delta_buffer(delta_text)
    # If the last characters of previous_text
    # match self.buffered_delta_text, remove only the matching part.
    if (len(previous_text) >= len(self.buffered_delta_text)
            and previous_text[-len(self.buffered_delta_text):]
            == self.buffered_delta_text):
        previous_text = previous_text[:-len(self.buffered_delta_text)]
        current_text = previous_text + delta_text

    logger.debug("delta_text: %s", delta_text)
    logger.debug("delta_token_ids: %s", delta_token_ids)
    # check to see if we should be streaming a tool call - is there a
    # tool call start token anywhere in the text so far?
    if self.tool_call_start_token not in current_text:
        logger.debug("No tool call tokens found!")
        return DeltaMessage(content=delta_text)

    try:

        # figure out where we are in the parsing by counting tool call
        # start & end tags
        prev_tool_start_count = previous_text.count(
            self.tool_call_start_token)
        prev_tool_end_count = previous_text.count(self.tool_call_end_token)
        cur_tool_start_count = current_text.count(
            self.tool_call_start_token)
        cur_tool_end_count = current_text.count(self.tool_call_end_token)
        tool_call_portion = None
        text_portion = None

        # case: if we're generating text, OR rounding out a tool call
        if (cur_tool_start_count == cur_tool_end_count
                and prev_tool_end_count == cur_tool_end_count
                and self.tool_call_end_token not in delta_text):
            logger.debug("Generating text content! skipping tool parsing.")
            return DeltaMessage(content=delta_text)

        if self.tool_call_end_token in delta_text:
            logger.debug("tool_call_end_token in delta_text")
            # NOTE(review): current_text presumably already ends with
            # delta_text, so the delta appears twice in full_text. Only the
            # text between the last start token and the first end token
            # after it is kept - confirm the duplication is intentional.
            full_text = current_text + delta_text
            tool_call_portion = full_text.split(
                self.tool_call_start_token)[-1].split(
                    self.tool_call_end_token)[0].rstrip()
            # keep only the part of the delta that precedes the end token
            delta_text = delta_text.split(
                self.tool_call_end_token)[0].rstrip()
            # NOTE(review): delta_text no longer contains the end token at
            # this point, so this split yields delta_text itself (minus
            # leading whitespace), not the text that followed the end token.
            text_portion = delta_text.split(
                self.tool_call_end_token)[-1].lstrip()

        # case: if tool open & close tag counts don't match, we're doing
        # something with tools with this diff
        # (imaginary "else" block here).
        # flags for partial JSON parsing. exported constants from
        # "Allow" are handled via BIT MASK
        # Until the tool name has been sent, partial strings are not
        # allowed (Allow.STR is masked out).
        flags = Allow.ALL if self.current_tool_name_sent \
            else Allow.ALL & ~Allow.STR

        # case -- we're starting a new tool call
        if (cur_tool_start_count > cur_tool_end_count
                and cur_tool_start_count > prev_tool_start_count):
            if len(delta_token_ids) > 1:
                # the chunk carries more than one token, so take whatever
                # follows the last start token as the tool call text
                tool_call_portion = current_text.split(
                    self.tool_call_start_token)[-1]
            else:
                tool_call_portion = None
                delta = None

            text_portion = None

            # set cursors and state appropriately
            self.current_tool_id += 1
            self.current_tool_name_sent = False
            self.streamed_args_for_tool.append("")
            logger.debug("Starting on a new tool %s", self.current_tool_id)

        # case -- we're updating an existing tool call
        elif (cur_tool_start_count > cur_tool_end_count
              and cur_tool_start_count == prev_tool_start_count):

            # get the portion of the text that's the tool call
            tool_call_portion = current_text.split(
                self.tool_call_start_token)[-1]
            text_portion = None

        # case -- the current tool call is being closed.
        elif (cur_tool_start_count == cur_tool_end_count
              and cur_tool_end_count >= prev_tool_end_count):
            if (self.prev_tool_call_arr is None
                    or len(self.prev_tool_call_arr) == 0):
                logger.debug(
                    "attempting to close tool call, but no tool call")
                return None
            diff = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments")
            if diff:
                # NOTE(review): `diff is str` compares the value with the
                # `str` type object itself, so it is False for every string
                # value and the decode is never applied; `isinstance` was
                # probably intended. diff is overwritten a few lines below.
                diff = diff.encode('utf-8').decode(
                    'unicode_escape') if diff is str else diff
                if ('"}' not in delta_text):
                    return None
                # emit the delta up to and including its last '"}'
                end_loc = delta_text.rindex('"}')
                diff = delta_text[:end_loc] + '"}'
                logger.debug(
                    "Finishing tool and found diff that had not "
                    "been streamed yet: %s", diff)
                self.streamed_args_for_tool[self.current_tool_id] \
                    += diff
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  function=DeltaFunctionCall(
                                      arguments=diff).model_dump(
                                          exclude_none=True))
                ])

        # case -- otherwise we're just generating text
        else:
            # strip the tool call markers and send the rest as content
            text = delta_text.replace(self.tool_call_start_token, "")
            text = text.replace(self.tool_call_end_token, "")
            delta = DeltaMessage(tool_calls=[], content=text)
            return delta

        try:

            # parse whatever part of the tool call JSON is available;
            # None when there is no tool call text for this chunk
            current_tool_call = partial_json_parser.loads(
                tool_call_portion or "{}",
                flags) if tool_call_portion else None
            logger.debug("Parsed tool call %s", current_tool_call)
        except partial_json_parser.core.exceptions.MalformedJSON:
            logger.debug('not enough tokens to parse into JSON yet')
            return None
        except json.decoder.JSONDecodeError:
            logger.debug("unable to parse JSON")
            return None

        # case - we haven't sent the tool name yet. If it's available, send
        #   it. otherwise, wait until it's available.
        if not self.current_tool_name_sent:
            if (current_tool_call is None):
                return None
            function_name: Union[str, None] = current_tool_call.get("name")
            if function_name:
                self.current_tool_name_sent = True
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  type="function",
                                  id=make_tool_call_id(),
                                  function=DeltaFunctionCall(
                                      name=function_name).model_dump(
                                          exclude_none=True))
                ])
            else:
                return None
        # case -- otherwise, send the tool call delta

        # if the tool call portion is None, send the delta as text
        if tool_call_portion is None:
            # if there's text but not tool calls, send that -
            # otherwise None to skip chunk
            delta = DeltaMessage(content=delta_text) \
                if text_portion is not None else None
            return delta

        # now, the nitty-gritty of tool calls
        # now we have the portion to parse as tool call.

        logger.debug("Trying to parse current tool call with ID %s",
                     self.current_tool_id)

        # if we're starting a new tool call, push an empty object in as
        #   a placeholder for the arguments
        if len(self.prev_tool_call_arr) <= self.current_tool_id:
            self.prev_tool_call_arr.append({})

        # main logic for tool parsing here - compare prev. partially-parsed
        #   JSON to the current partially-parsed JSON
        prev_arguments = (
            self.prev_tool_call_arr[self.current_tool_id].get("arguments"))
        cur_arguments = current_tool_call.get("arguments")

        logger.debug("diffing old arguments: %s", prev_arguments)
        logger.debug("against new ones: %s", cur_arguments)

        # case -- no arguments have been created yet. skip sending a delta.
        if not cur_arguments and not prev_arguments:
            logger.debug("Skipping text %s - no arguments", delta_text)
            delta = None

        # case -- prev arguments are defined, but none are now.
        #   probably impossible, but not a fatal error - just keep going
        elif not cur_arguments and prev_arguments:
            logger.error("should be impossible to have arguments reset "
                         "mid-call. skipping streaming anything.")
            delta = None

        # case -- we now have the first info about arguments available from
        #   autocompleting the JSON
        elif cur_arguments and not prev_arguments:
            # extract the content after {"name": ..., "arguments":
            #   directly from tool_call_portion as cur_arguments_json,
            #   since cur_arguments may differ from the original text
            #   due to partial JSON parsing
            #   for example, tool_call_portion =
            #     {"name": "search", "arguments": {"search_request": {"
            #   but cur_arguments =
            #     {"search_request": {}}
            function_name = current_tool_call.get("name")
            match = re.search(
                r'\{"name":\s*"' +
                re.escape(function_name) + r'"\s*,\s*"arguments":\s*(.*)',
                tool_call_portion.strip(), re.DOTALL)
            if match:
                cur_arguments_json = match.group(1)
            else:
                # fall back to re-serializing the parsed arguments
                cur_arguments_json = json.dumps(cur_arguments,
                                                ensure_ascii=False)

            logger.debug("finding %s in %s", delta_text,
                         cur_arguments_json)

            # get the location where previous args differ from current.
            if (delta_text not in cur_arguments_json):
                return None
            args_delta_start_loc = cur_arguments_json. \
                                       rindex(delta_text) + \
                                       len(delta_text)

            # use that to find the actual delta
            arguments_delta = cur_arguments_json[:args_delta_start_loc]
            logger.debug("First tokens in arguments received: %s",
                         arguments_delta)

            delta = DeltaMessage(tool_calls=[
                DeltaToolCall(index=self.current_tool_id,
                              function=DeltaFunctionCall(
                                  arguments=arguments_delta).model_dump(
                                      exclude_none=True))
            ])
            self.streamed_args_for_tool[self.current_tool_id] \
                += arguments_delta

        # last case -- we have an update to existing arguments.
        elif cur_arguments and prev_arguments:
            # judge whether the tool_call_portion is a complete JSON
            try:
                json.loads(tool_call_portion)
                is_complete_json = True
            except Exception:
                is_complete_json = False

            # if the delta_text ends with a '}' and tool_call_portion is a
            #   complete JSON, then the last '}' does not belong to the
            #   arguments, so we should trim it off
            if isinstance(delta_text, str) \
                and len(delta_text.rstrip()) >= 1 \
                and delta_text.rstrip()[-1] == '}' \
                and is_complete_json:
                delta_text = delta_text.rstrip()[:-1]

            logger.debug("got diff %s", delta_text)

            delta = DeltaMessage(tool_calls=[
                DeltaToolCall(index=self.current_tool_id,
                              function=DeltaFunctionCall(
                                  arguments=delta_text).model_dump(
                                      exclude_none=True))
            ])
            self.streamed_args_for_tool[self.current_tool_id] \
                += delta_text

        # handle saving the state for the current tool into
        # the "prev" list for use in diffing for the next iteration
        if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
            self.prev_tool_call_arr[self.current_tool_id] = \
                current_tool_call
        else:
            self.prev_tool_call_arr.append(current_tool_call)

        return delta

    except Exception:
        logger.exception("Error trying to handle streaming tool call.")
        return None  # do not stream a delta. skip this token ID.

tool_call_delta_buffer ¶

tool_call_delta_buffer(delta_text: str)

Source code in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py

def tool_call_delta_buffer(self, delta_text: str):
    """
    Hold back pieces of a tool-call start/end token until it is complete.

    A delta that is one of the entries of ``tool_call_start_token_array``
    or ``tool_call_end_token_array`` but not the final entry of either is
    appended to ``buffered_delta_text`` and ``""`` is returned. Any other
    delta flushes the buffer: the buffered text followed by the delta is
    returned and the buffer is reset.
    """
    start_pieces = self.tool_call_start_token_array
    end_pieces = self.tool_call_end_token_array

    if delta_text in start_pieces or delta_text in end_pieces:
        finishes_token = (delta_text == start_pieces[-1]
                          or delta_text == end_pieces[-1])
        if not finishes_token:
            # Still in the middle of a marker: keep collecting.
            self.buffered_delta_text = self.buffered_delta_text + delta_text
            return ""
    elif not self.buffered_delta_text:
        # Ordinary text and nothing pending: pass it straight through.
        return delta_text

    # Either the marker just completed or ordinary text arrived while
    # something was pending: release everything at once.
    pending = self.buffered_delta_text
    self.buffered_delta_text = ""
    return pending + delta_text

HunyuanA13BToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

@ToolParserManager.register_module("hunyuan_a13b")
class HunyuanA13BToolParser(ToolParser):

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up the regex patterns and the streaming bookkeeping."""
        super().__init__(tokenizer)

        # Marker that opens the tool-call section of the model output.
        self.bot_string = "<tool_calls>"

        # --- Regex patterns -------------------------------------------
        # Whole "<tool_calls> ... </tool_calls>" section; group 1 is the
        # payload between the tags.
        self.answer_tool_calls_pattern = re.compile(
            r"<tool_calls>([\s\S]*?)</tool_calls>", re.DOTALL)
        # "name": "<tool name>"; group 1 is the tool name.
        self.tool_name_reg = re.compile(r'"name"\s*:\s*"([^"]+)"')
        # A tool entry whose arguments object is empty ("{}").
        self.tool_empty_arg_reg = re.compile(
            r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}')
        # A tool entry with a closed arguments object (group 1).
        # TODO: only one level of nested JSON object inside the
        # arguments is matched by this pattern.
        self.tool_non_empty_arg_reg = re.compile(
            r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*(\{(?:[^{}]|(?:\{[^{}]*\}))*\})'
        )

        # --- Streaming state ------------------------------------------
        self.current_tool_id = -1
        self.current_tool_name_sent = False
        self.prev_tool_calls: list[dict] = []
        # Argument text already sent, one entry per tool.
        self.streamed_args: list[str] = []
        # Kept for backward compatibility with tests.
        self.current_tools_sent: list[bool] = []
        # Kept for backward compatibility with serving code.
        self.prev_tool_call_arr = []
        # Per-request progress: active tool index, generated call ids and
        # what has been sent for each tool.
        self.streaming_state: dict[str, Any] = {
            "current_tool_index": -1,
            "tool_ids": [],
            "sent_tools": [],
        }

    def preprocess_model_output(
            self, model_output: str) -> tuple[Optional[str], Optional[str]]:
        """
        Split the model output into leading content and a tool-call payload.

        The first ``<tool_calls>...</tool_calls>`` section that is not
        strictly inside a ``<think>...</think>`` span and whose payload is
        valid JSON is selected.

        Args:
            model_output: Complete text produced by the model.

        Returns:
            ``(content, payload)`` where ``content`` is the text before the
            selected section and ``payload`` is the stripped text between
            the tags, or ``(model_output, None)`` when no section qualifies.
        """
        # The <think> spans do not depend on the candidate being examined,
        # so locate them once instead of once per <tool_calls> match.
        think_regions = [(m.start(), m.end()) for m in re.finditer(
            r"<think>(.*?)</think>", model_output, flags=re.DOTALL)]

        for match in self.answer_tool_calls_pattern.finditer(model_output):
            start, end = match.span()
            # Ignore tool calls the model only wrote while "thinking".
            if any(t_start < start and end < t_end
                   for t_start, t_end in think_regions):
                continue

            tool_calls_content = match.group(1).strip()
            try:
                # Validation only; the caller parses the payload again.
                json.loads(tool_calls_content)
            except Exception:
                # Not valid JSON: try the next candidate section.
                continue
            return model_output[:start], tool_calls_content

        return model_output, None

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """
        Extract tool calls from a complete model output.

        Args:
            model_output: Complete text produced by the model.
            request: Not read by this method.

        Returns:
            An ``ExtractedToolCallInformation``. ``tools_called`` is True
            only when at least one entry of the tool-call array is a dict
            with both ``"name"`` and ``"arguments"``. If anything raises
            while extracting, the error is logged and the whole model
            output is returned as plain content.
        """
        try:
            # Preprocess the model output
            content, potential_tool_calls = self.preprocess_model_output(
                model_output)

            if not potential_tool_calls:
                # some text should be filtered out for no function call
                # this text is in a13b's chat template.
                if content:
                    content = content.replace("助手：", "", 1)
                return ExtractedToolCallInformation(tools_called=False,
                                                    tool_calls=[],
                                                    content=content)

            # Parse the potential tool calls as JSON
            tool_calls_data = json.loads(potential_tool_calls)

            # Ensure it's an array
            if not isinstance(tool_calls_data, list):
                logger.debug("Tool calls data is not an array")
                return ExtractedToolCallInformation(
                    tools_called=False,
                    tool_calls=[],
                    content=content or model_output,
                )

            tool_calls: list[ToolCall] = []

            for call in tool_calls_data:
                # Skip entries that are not well-formed tool calls.
                if (not isinstance(call, dict) or "name" not in call
                        or "arguments" not in call):
                    continue

                arguments = call["arguments"]
                if isinstance(arguments, dict):
                    # function call args are JSON but as a string; keep
                    # non-ASCII characters readable instead of \uXXXX
                    arguments = json.dumps(arguments, ensure_ascii=False)

                tool_calls.append(
                    ToolCall(
                        id=f"call_{random_uuid()}",
                        type="function",
                        function=FunctionCall(
                            name=call["name"],
                            arguments=arguments,
                        ),
                    ))

            if not content or not content.strip():
                # clear the whitespace content.
                content = None

            return ExtractedToolCallInformation(
                tools_called=len(tool_calls) > 0,
                tool_calls=tool_calls,
                content=content,
            )

        except Exception:
            # Best effort: report the failure and fall back to plain text.
            logger.exception(
                "Error in extracting tool call from response.")
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Produce the next streaming delta from the text generated so far.

        When the text (after optional whitespace and an optional
        ``<tool_calls>`` opener) does not begin with ``[``, the chunk is
        passed through as plain content. Otherwise the helpers are tried in
        order - test-compatibility shim, tool name, tool arguments - and
        the first delta produced is returned; ``None`` means nothing to
        send for this chunk.
        """
        # Skip leading whitespace and, if present, the opening marker.
        cursor = consume_space(0, current_text)
        if current_text[cursor:].startswith(self.bot_string):
            cursor = consume_space(cursor + len(self.bot_string),
                                   current_text)

        starts_tool_array = (current_text and cursor < len(current_text)
                             and current_text[cursor] == '[')
        if not starts_tool_array:
            return DeltaMessage(content=delta_text)

        # Refresh prev_tool_call_arr if the array is already complete JSON.
        self._try_parse_json_tools(current_text[cursor:])

        compat_delta = self._handle_test_compatibility(current_text)
        if compat_delta:
            return compat_delta

        found_names = list(self.tool_name_reg.finditer(current_text))
        if not found_names:
            return None
        n_tools = len(found_names)

        self._ensure_state_arrays(n_tools)
        active_idx = self.streaming_state["current_tool_index"]

        name_delta = self._handle_tool_name_streaming(
            active_idx, n_tools, found_names)
        if name_delta:
            return name_delta

        args_delta = self._handle_tool_args_streaming(
            current_text, active_idx, n_tools)
        return args_delta if args_delta else None

    def _try_parse_json_tools(self, current_text: str):
        try:
            parsed_tools = json.loads(current_text)
            if isinstance(parsed_tools, list):
                self.prev_tool_call_arr = parsed_tools
        except json.JSONDecodeError:
            pass

    def _handle_test_compatibility(self, current_text: str):
        if len(self.current_tools_sent) > 0:
            if (len(self.current_tools_sent) == 1
                    and self.current_tools_sent[0] is False):
                name_match = self.tool_name_reg.search(current_text)
                if name_match:
                    function_name = name_match.group(1)
                    tool_id = f"chatcmpl-tool-{random_uuid()}"
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=0,
                            type="function",
                            id=tool_id,
                            function=DeltaFunctionCall(
                                name=function_name).model_dump(
                                    exclude_none=True),
                        )
                    ])
                    self.current_tools_sent = [True]
                    self.current_tool_id = 0
                    self.streaming_state["current_tool_index"] = 0
                    if len(self.streaming_state["sent_tools"]) == 0:
                        self.streaming_state["sent_tools"].append({
                            "sent_name":
                            True,
                            "sent_arguments_prefix":
                            False,
                            "sent_arguments":
                            "",
                        })
                    else:
                        self.streaming_state["sent_tools"][0][
                            "sent_name"] = True
                    self.current_tool_name_sent = True
                    return delta
        return None

    def _ensure_state_arrays(self, tool_count: int):
        while len(self.streaming_state["sent_tools"]) < tool_count:
            self.streaming_state["sent_tools"].append({
                "sent_name": False,
                "sent_arguments_prefix": False,
                "sent_arguments": "",
            })
        while len(self.streaming_state["tool_ids"]) < tool_count:
            self.streaming_state["tool_ids"].append(None)

    def _handle_tool_name_streaming(self, current_idx: int, tool_count: int,
                                    name_matches):
        if current_idx == -1 or current_idx < tool_count - 1:
            next_idx = current_idx + 1
            if (next_idx < tool_count
                    and not self.streaming_state["sent_tools"][next_idx]
                ["sent_name"]):
                self.streaming_state["current_tool_index"] = next_idx
                self.current_tool_id = next_idx
                current_idx = next_idx
                tool_name = name_matches[current_idx].group(1)
                tool_id = f"call_{current_idx}_{random_uuid()}"
                self.streaming_state["tool_ids"][current_idx] = tool_id
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=current_idx,
                        type="function",
                        id=tool_id,
                        function=DeltaFunctionCall(name=tool_name).model_dump(
                            exclude_none=True),
                    )
                ])
                self.streaming_state["sent_tools"][current_idx][
                    "sent_name"] = True
                self.current_tool_name_sent = True
                while len(self.streamed_args) <= current_idx:
                    self.streamed_args.append("")
                return delta
        return None

    def _handle_tool_args_streaming(self, current_text: str, current_idx: int,
                                    tool_count: int):
        """
        Emit the next piece of the arguments for the tool at ``current_idx``.

        Args:
            current_text: Text generated so far.
            current_idx: Index of the tool whose arguments are streamed.
            tool_count: Number of tool names found in ``current_text``.

        Returns:
            A ``DeltaMessage`` with an arguments fragment (``"{}"``, the
            opening ``"{"``, or the not-yet-sent tail of the arguments
            text), or ``None`` when there is nothing new to send. State in
            ``streaming_state`` and ``streamed_args`` is updated to match
            what was emitted.
        """

        if current_idx >= 0 and current_idx < tool_count:
            # NOTE(review): this search covers the whole text and is not
            # tied to current_idx, so the match may belong to another tool
            # in the array - confirm.
            empty_args_match = self.tool_empty_arg_reg.search(current_text)
            if empty_args_match and empty_args_match.start() > 0:
                # Only the iteration with i == current_idx does anything.
                for i in range(tool_count):
                    if i == current_idx:
                        if not self.streaming_state["sent_tools"][current_idx][
                                "sent_arguments_prefix"]:
                            # Nothing sent for this tool yet: send the
                            # whole empty object in one delta.
                            self.streaming_state["sent_tools"][current_idx][
                                "sent_arguments_prefix"] = True
                            self.streaming_state["sent_tools"][current_idx][
                                "sent_arguments"] = "{}"
                            while len(self.streamed_args) <= current_idx:
                                self.streamed_args.append("")
                            self.streamed_args[current_idx] += "{}"
                            delta = DeltaMessage(tool_calls=[
                                DeltaToolCall(
                                    index=current_idx,
                                    function=DeltaFunctionCall(
                                        arguments="{}").model_dump(
                                            exclude_none=True),
                                )
                            ])
                            # Move on to the next tool if one exists.
                            if current_idx < tool_count - 1:
                                self.streaming_state["current_tool_index"] += 1
                                self.current_tool_id = self.streaming_state[
                                    "current_tool_index"]
                            return delta

            # Matches only exist for tools whose arguments object is
            # already closed in current_text.
            args_matches = list(
                self.tool_non_empty_arg_reg.finditer(current_text))
            if current_idx < len(args_matches):
                args_text = args_matches[current_idx].group(1)
                is_last_tool = current_idx == tool_count - 1
                if not is_last_tool:
                    # Re-derive the arguments text from the raw text up to
                    # the "},{" that separates this tool from the next.
                    next_tool_pos = current_text.find(
                        "},{", args_matches[current_idx].start())
                    if next_tool_pos != -1:
                        args_end_pos = (next_tool_pos + 1)
                        args_text = (
                            current_text[args_matches[current_idx].start(
                            ):args_end_pos].split('"arguments":')[1].strip())
                sent_args = self.streaming_state["sent_tools"][current_idx][
                    "sent_arguments"]
                # First arguments delta for this tool: send just "{".
                if not self.streaming_state["sent_tools"][current_idx][
                        "sent_arguments_prefix"] and args_text.startswith("{"):
                    self.streaming_state["sent_tools"][current_idx][
                        "sent_arguments_prefix"] = True
                    self.streaming_state["sent_tools"][current_idx][
                        "sent_arguments"] = "{"
                    while len(self.streamed_args) <= current_idx:
                        self.streamed_args.append("")
                    self.streamed_args[current_idx] += "{"
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=current_idx,
                            function=DeltaFunctionCall(
                                arguments="{").model_dump(exclude_none=True),
                        )
                    ])
                    return delta

                # Send whatever extends the text that was already sent.
                if args_text.startswith(sent_args):
                    args_diff = args_text[len(sent_args):]
                    if args_diff:
                        self.streaming_state["sent_tools"][current_idx][
                            "sent_arguments"] = args_text
                        while len(self.streamed_args) <= current_idx:
                            self.streamed_args.append("")
                        self.streamed_args[current_idx] += args_diff
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=current_idx,
                                function=DeltaFunctionCall(
                                    arguments=args_diff).model_dump(
                                        exclude_none=True),
                            )
                        ])
                        return delta

                # Everything for this tool has been sent and the object is
                # closed: advance to the next tool, if any, without
                # emitting a delta.
                if args_text.endswith("}") and args_text == sent_args:
                    if current_idx < tool_count - 1:
                        self.streaming_state["current_tool_index"] += 1
                        self.current_tool_id = self.streaming_state[
                            "current_tool_index"]
        return None

answer_tool_calls_pattern `instance-attribute` ¶

answer_tool_calls_pattern = compile(
    "<tool_calls>([\\s\\S]*?)</tool_calls>", DOTALL
)

bot_string `instance-attribute` ¶

bot_string = '<tool_calls>'

current_tool_id `instance-attribute` ¶

current_tool_id = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent = False

current_tools_sent `instance-attribute` ¶

current_tools_sent: list[bool] = []

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr = []

prev_tool_calls `instance-attribute` ¶

prev_tool_calls: list[dict] = []

streamed_args `instance-attribute` ¶

streamed_args: list[str] = []

streaming_state `instance-attribute` ¶

streaming_state: dict[str, Any] = {
    "current_tool_index": -1,
    "tool_ids": [],
    "sent_tools": [],
}

tool_empty_arg_reg `instance-attribute` ¶

tool_empty_arg_reg = compile(
    '"name"\\s*:\\s*"[^"]+"\\s*,\\s*"arguments"\\s*:\\s*\\{\\s*\\}'
)

tool_name_reg `instance-attribute` ¶

tool_name_reg = compile('"name"\\s*:\\s*"([^"]+)"')

tool_non_empty_arg_reg `instance-attribute` ¶

tool_non_empty_arg_reg = compile(
    '"name"\\s*:\\s*"[^"]+"\\s*,\\s*"arguments"\\s*:\\s*(\\{(?:[^{}]|(?:\\{[^{}]*\\}))*\\})'
)

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Set up the regex patterns and the streaming bookkeeping."""
    super().__init__(tokenizer)

    # Marker that opens the tool-call section of the model output.
    self.bot_string = "<tool_calls>"

    # --- Regex patterns -----------------------------------------------
    # Whole "<tool_calls> ... </tool_calls>" section; group 1 is the
    # payload between the tags.
    self.answer_tool_calls_pattern = re.compile(
        r"<tool_calls>([\s\S]*?)</tool_calls>", re.DOTALL)
    # "name": "<tool name>"; group 1 is the tool name.
    self.tool_name_reg = re.compile(r'"name"\s*:\s*"([^"]+)"')
    # A tool entry whose arguments object is empty ("{}").
    self.tool_empty_arg_reg = re.compile(
        r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}')
    # A tool entry with a closed arguments object (group 1).
    # TODO: only one level of nested JSON object inside the
    # arguments is matched by this pattern.
    self.tool_non_empty_arg_reg = re.compile(
        r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*(\{(?:[^{}]|(?:\{[^{}]*\}))*\})'
    )

    # --- Streaming state ----------------------------------------------
    self.current_tool_id = -1
    self.current_tool_name_sent = False
    self.prev_tool_calls: list[dict] = []
    # Argument text already sent, one entry per tool.
    self.streamed_args: list[str] = []
    # Kept for backward compatibility with tests.
    self.current_tools_sent: list[bool] = []
    # Kept for backward compatibility with serving code.
    self.prev_tool_call_arr = []
    # Per-request progress: active tool index, generated call ids and
    # what has been sent for each tool.
    self.streaming_state: dict[str, Any] = {
        "current_tool_index": -1,
        "tool_ids": [],
        "sent_tools": [],
    }

_ensure_state_arrays ¶

_ensure_state_arrays(tool_count: int)

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

def _ensure_state_arrays(self, tool_count: int):
    """Grow the per-tool streaming lists to hold at least ``tool_count`` slots.

    Existing entries are left untouched; new ``sent_tools`` slots start
    with nothing sent and new ``tool_ids`` slots start as ``None``.
    """
    sent_tools = self.streaming_state["sent_tools"]
    sent_tools.extend(
        {
            "sent_name": False,
            "sent_arguments_prefix": False,
            "sent_arguments": "",
        } for _ in range(tool_count - len(sent_tools)))

    tool_ids = self.streaming_state["tool_ids"]
    tool_ids.extend([None] * (tool_count - len(tool_ids)))

_handle_test_compatibility ¶

_handle_test_compatibility(current_text: str)

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

def _handle_test_compatibility(self, current_text: str):
    """Emit the first tool name when the legacy ``current_tools_sent`` flag asks for it.

    The path is only active when ``current_tools_sent`` is exactly
    ``[False]`` and a tool name can already be found in ``current_text``.
    In that case a name-only delta for tool index 0 is returned and the
    streaming state is updated to record that the name was sent.
    Otherwise ``None`` is returned and nothing is modified.
    """
    legacy_flags = self.current_tools_sent
    if len(legacy_flags) != 1 or legacy_flags[0] is not False:
        return None

    name_match = self.tool_name_reg.search(current_text)
    if not name_match:
        return None

    function_name = name_match.group(1)
    tool_id = f"chatcmpl-tool-{random_uuid()}"
    name_payload = DeltaFunctionCall(name=function_name).model_dump(
        exclude_none=True)
    delta = DeltaMessage(tool_calls=[
        DeltaToolCall(
            index=0,
            type="function",
            id=tool_id,
            function=name_payload,
        )
    ])

    # Mirror the "name sent" fact into every piece of tracked state.
    self.current_tools_sent = [True]
    self.current_tool_id = 0
    self.streaming_state["current_tool_index"] = 0
    sent_tools = self.streaming_state["sent_tools"]
    if sent_tools:
        sent_tools[0]["sent_name"] = True
    else:
        sent_tools.append({
            "sent_name": True,
            "sent_arguments_prefix": False,
            "sent_arguments": "",
        })
    self.current_tool_name_sent = True
    return delta

_handle_tool_args_streaming ¶

_handle_tool_args_streaming(
    current_text: str, current_idx: int, tool_count: int
)

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

def _handle_tool_args_streaming(self, current_text: str, current_idx: int,
                                tool_count: int):
    """Return the next arguments delta for the tool at ``current_idx``, if any.

    Args:
        current_text: All text generated so far.
        current_idx: Index of the tool call currently being streamed.
        tool_count: Number of tool names found in ``current_text``.

    Returns:
        A ``DeltaMessage`` carrying new argument text, or ``None`` when
        there is nothing new to send. When the current tool's arguments
        are fully sent and a later tool exists, the current tool index is
        advanced as a side effect.
    """
    if not (0 <= current_idx < tool_count):
        return None

    has_later_tool = current_idx < tool_count - 1

    def record_streamed(fragment: str) -> None:
        # Pad the per-tool list up to current_idx, then append the text.
        shortfall = current_idx + 1 - len(self.streamed_args)
        self.streamed_args.extend([""] * shortfall)
        self.streamed_args[current_idx] += fragment

    def arguments_delta(fragment: str):
        return DeltaMessage(tool_calls=[
            DeltaToolCall(
                index=current_idx,
                function=DeltaFunctionCall(
                    arguments=fragment).model_dump(exclude_none=True),
            )
        ])

    def advance_to_next_tool() -> None:
        self.streaming_state["current_tool_index"] += 1
        self.current_tool_id = self.streaming_state["current_tool_index"]

    # Case 1: an empty `"arguments": {}` object appears in the text and
    # nothing has been sent for this tool yet -> send "{}" in one go.
    empty_hit = self.tool_empty_arg_reg.search(current_text)
    if empty_hit and empty_hit.start() > 0:
        tool_state = self.streaming_state["sent_tools"][current_idx]
        if not tool_state["sent_arguments_prefix"]:
            tool_state["sent_arguments_prefix"] = True
            tool_state["sent_arguments"] = "{}"
            record_streamed("{}")
            delta = arguments_delta("{}")
            if has_later_tool:
                advance_to_next_tool()
            return delta

    # Case 2: a non-empty arguments object for this tool.
    arg_hits = list(self.tool_non_empty_arg_reg.finditer(current_text))
    if current_idx >= len(arg_hits):
        return None

    hit = arg_hits[current_idx]
    args_text = hit.group(1)
    if has_later_tool:
        # Cut at the "},{" separator, if present, and take what follows
        # the "arguments" key in that slice.
        separator_pos = current_text.find("},{", hit.start())
        if separator_pos != -1:
            this_tool_text = current_text[hit.start():separator_pos + 1]
            args_text = this_tool_text.split('"arguments":')[1].strip()

    tool_state = self.streaming_state["sent_tools"][current_idx]
    sent_args = tool_state["sent_arguments"]

    # First contact with the arguments: emit only the opening brace.
    if not tool_state["sent_arguments_prefix"] and args_text.startswith(
            "{"):
        tool_state["sent_arguments_prefix"] = True
        tool_state["sent_arguments"] = "{"
        record_streamed("{")
        return arguments_delta("{")

    # Otherwise emit whatever extends the text already sent.
    if args_text.startswith(sent_args):
        args_diff = args_text[len(sent_args):]
        if args_diff:
            tool_state["sent_arguments"] = args_text
            record_streamed(args_diff)
            return arguments_delta(args_diff)

    # Everything for this tool is out; move on if another tool follows.
    if (args_text.endswith("}") and args_text == sent_args
            and has_later_tool):
        advance_to_next_tool()
    return None

_handle_tool_name_streaming ¶

_handle_tool_name_streaming(
    current_idx: int, tool_count: int, name_matches
)

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

def _handle_tool_name_streaming(self, current_idx: int, tool_count: int,
                                name_matches):
    """Send the name of the next tool call if it has not been sent yet.

    Args:
        current_idx: Index of the tool currently being streamed, or -1
            when no tool has been started.
        tool_count: Number of tool names found so far.
        name_matches: Regex matches whose group 1 is a tool name, indexed
            by tool position.

    Returns:
        A ``DeltaMessage`` announcing the next tool (index, generated id
        and name), or ``None`` when there is no unsent next tool.
    """
    # Only step forward from "not started" or from a non-final tool.
    if current_idx != -1 and current_idx >= tool_count - 1:
        return None

    next_idx = current_idx + 1
    if next_idx >= tool_count:
        return None
    if self.streaming_state["sent_tools"][next_idx]["sent_name"]:
        return None

    self.streaming_state["current_tool_index"] = next_idx
    self.current_tool_id = next_idx

    tool_name = name_matches[next_idx].group(1)
    tool_id = f"call_{next_idx}_{random_uuid()}"
    self.streaming_state["tool_ids"][next_idx] = tool_id

    name_payload = DeltaFunctionCall(name=tool_name).model_dump(
        exclude_none=True)
    delta = DeltaMessage(tool_calls=[
        DeltaToolCall(
            index=next_idx,
            type="function",
            id=tool_id,
            function=name_payload,
        )
    ])

    self.streaming_state["sent_tools"][next_idx]["sent_name"] = True
    self.current_tool_name_sent = True
    # Make sure an argument buffer exists for the tool just announced.
    while len(self.streamed_args) <= next_idx:
        self.streamed_args.append("")
    return delta

_try_parse_json_tools ¶

_try_parse_json_tools(current_text: str)

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

def _try_parse_json_tools(self, current_text: str):
    """Refresh ``prev_tool_call_arr`` when ``current_text`` is a complete JSON list.

    Text that is not yet valid JSON, or that decodes to something other
    than a list, leaves ``prev_tool_call_arr`` unchanged.
    """
    try:
        decoded = json.loads(current_text)
    except json.JSONDecodeError:
        # Still-incomplete JSON is expected while streaming.
        return
    else:
        if isinstance(decoded, list):
            self.prev_tool_call_arr = decoded

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extract tool calls from a complete model output.

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """
    Extract tool calls from a complete model output.

    Args:
        model_output: The full text generated by the model.
        request: The originating chat completion request (not consulted
            by this implementation).

    Returns:
        An ``ExtractedToolCallInformation``. ``tools_called`` is true only
        when at least one well-formed call (a dict holding both ``name``
        and ``arguments``) was found; on any error the raw
        ``model_output`` is returned as plain content.
    """
    try:
        # Preprocess the model output
        content, potential_tool_calls = self.preprocess_model_output(
            model_output)

        if not potential_tool_calls:
            # some text should be filtered out for no function call
            # this text is in a13b's chat template.
            if content:
                content = content.replace("助手：", "", 1)
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=content)

        # Parse the potential tool calls as JSON
        tool_calls_data = json.loads(potential_tool_calls)

        # Ensure it's an array
        if not isinstance(tool_calls_data, list):
            logger.debug("Tool calls data is not an array")
            return ExtractedToolCallInformation(
                tools_called=False,
                tool_calls=[],
                content=content or model_output,
            )

        tool_calls: list[ToolCall] = []

        for call in tool_calls_data:
            # Silently skip entries that do not look like a tool call.
            if (not isinstance(call, dict) or "name" not in call
                    or "arguments" not in call):
                continue

            arguments = call["arguments"]
            tool_calls.append(
                ToolCall(
                    id=f"call_{random_uuid()}",
                    type="function",
                    function=FunctionCall(
                        name=call["name"],
                        # Dict arguments are serialised; anything else is
                        # passed through unchanged.
                        arguments=(json.dumps(arguments) if isinstance(
                            arguments, dict) else arguments),
                    ),
                ))

        if not content or len(content.strip()) == 0:
            # clear the whitespace content.
            content = None

        return ExtractedToolCallInformation(
            tools_called=len(tool_calls) > 0,
            tool_calls=tool_calls,
            content=content,
        )

    except Exception:
        # Best effort: fall back to plain content, but leave a trace so
        # that parsing failures are not invisible.
        logger.exception("Error in extracting tool call from response.")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Extract tool calls for streaming mode.

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """
    Extract tool calls for streaming mode.

    Text that does not start (after optional whitespace and an optional
    ``<tool_calls>`` marker) with ``[`` is passed through as plain
    content. Otherwise at most one delta is produced per call: first a
    tool name, then successive pieces of that tool's arguments. ``None``
    means there is nothing to send for this chunk.
    """
    cursor = consume_space(0, current_text)
    if current_text[cursor:].startswith(self.bot_string):
        cursor = consume_space(cursor + len(self.bot_string), current_text)

    starts_tool_array = (current_text and cursor < len(current_text)
                         and current_text[cursor] == '[')
    if not starts_tool_array:
        return DeltaMessage(content=delta_text)

    # Keep prev_tool_call_arr up to date whenever the array is complete.
    self._try_parse_json_tools(current_text[cursor:])

    compat_delta = self._handle_test_compatibility(current_text)
    if compat_delta:
        return compat_delta

    name_matches = list(self.tool_name_reg.finditer(current_text))
    if not name_matches:
        return None
    tool_count = len(name_matches)

    self._ensure_state_arrays(tool_count)
    current_idx = self.streaming_state["current_tool_index"]

    # A pending tool name takes priority over argument text.
    name_delta = self._handle_tool_name_streaming(current_idx, tool_count,
                                                  name_matches)
    if name_delta:
        return name_delta

    return self._handle_tool_args_streaming(current_text, current_idx,
                                            tool_count) or None

preprocess_model_output ¶

preprocess_model_output(
    model_output: str,
) -> tuple[Optional[str], Optional[str]]

Source code in vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py

def preprocess_model_output(
        self, model_output: str) -> tuple[Optional[str], Optional[str]]:
    """Split model output into leading content and a tool-call JSON string.

    The first ``<tool_calls>...</tool_calls>`` section that is not
    strictly inside a ``<think>...</think>`` region and whose body is
    valid JSON is selected.

    Args:
        model_output: The full text generated by the model.

    Returns:
        ``(content, tool_calls_json)`` where ``content`` is the text that
        precedes the selected section and ``tool_calls_json`` is its
        stripped body. When no section qualifies, the result is
        ``(model_output, None)``.
    """
    candidates = list(self.answer_tool_calls_pattern.finditer(model_output))
    if not candidates:
        return model_output, None

    # The <think> regions do not depend on the candidate being examined,
    # so they are located once instead of once per candidate.
    think_regions = [
        m.span() for m in re.finditer(
            r"<think>(.*?)</think>", model_output, flags=re.DOTALL)
    ]

    for match in candidates:
        start, end = match.span()
        # Skip tool-call sections that sit inside a reasoning block.
        if any(start > t_start and end < t_end
               for t_start, t_end in think_regions):
            continue

        tool_calls_content = match.group(1).strip()
        try:
            # Validation only; the caller parses the string again.
            json.loads(tool_calls_content)
        except Exception:
            continue
        return model_output[:start], tool_calls_content

    return model_output, None

Internlm2ToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py

@ToolParserManager.register_module(["internlm"])
class Internlm2ToolParser(ToolParser):
    """Tool parser for output that wraps a tool call in action markers.

    A tool call is a single JSON object placed after the
    ``<|action_start|><|plugin|>`` marker and terminated by
    ``<|action_end|>``. Parallel tool calls are not supported.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Initialise the parser and reset the streamed-content cursor."""
        super().__init__(tokenizer)
        # Offset into the accumulated text; the streaming extractor only
        # looks at ``current_text[self.position:]``.
        self.position = 0

    def adjust_request(
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """Keep special tokens in the output when tools may be called."""
        if request.tools and request.tool_choice != 'none':
            # do not skip special tokens because internlm use the special
            # tokens to indicate the start and end of the tool calls
            # information.
            request.skip_special_tokens = False
        return request

    def get_arguments(self, obj):
        """Return the call arguments stored under "parameters" or "arguments".

        "parameters" wins when both keys are present; ``None`` is returned
        when neither key exists.
        """
        if "parameters" in obj:
            return obj.get("parameters")
        elif "arguments" in obj:
            return obj.get("arguments")
        return None

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Turn the newest chunk of generated text into a streaming delta.

        Returns a content delta while no action marker has been seen, a
        tool-call delta (name first, then argument text) once the plugin
        marker is present, and ``None`` when nothing can be sent yet or
        an error occurred while handling the chunk.
        """
        if '<|action_start|>' not in current_text:
            # Plain content so far: remember how much has been emitted.
            self.position = len(current_text)
            return DeltaMessage(content=delta_text)
        # if the tool call is sent, return an empty delta message
        # to make sure the finish_reason will be sent correctly.
        if self.current_tool_id > 0:
            return DeltaMessage(content='')

        last_pos = self.position
        if '<|action_start|><|plugin|>' not in current_text[last_pos:]:
            # The marker is still being generated; wait for more text.
            return None

        new_delta = current_text[last_pos:]
        text, action = new_delta.split('<|action_start|><|plugin|>')

        if len(text) > 0:
            # Flush content that precedes the marker before any tool data.
            self.position = self.position + len(text)
            return DeltaMessage(content=text)

        action = action.strip()
        action = action.split('<|action_end|>'.strip())[0]

        # bit mask flags for partial JSON parsing. If the name hasn't been
        # sent yet, don't allow sending
        # an incomplete string since OpenAI only ever (as far as I have
        # seen) allows sending the entire tool/ function name at once.
        flags = Allow.ALL if self.current_tool_name_sent \
            else Allow.ALL & ~Allow.STR

        try:
            parsable_arr = action

            # tool calls are generated in an object in internlm2
            # it's not support parallel tool calls
            try:
                tool_call_arr: dict = partial_json_parser.loads(
                    parsable_arr, flags)
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug('not enough tokens to parse into JSON yet')
                return None

            # if the current tool name hasn't been sent, send if available
            # - otherwise send nothing
            if not self.current_tool_name_sent:
                function_name = tool_call_arr.get("name")
                if function_name:
                    self.current_tool_id = self.current_tool_id + 1
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))
                    ])
                    self.current_tool_name_sent = True
                    self.streamed_args_for_tool.append("")
                else:
                    delta = None
            # now we know we're on the same tool call and we're streaming
            # arguments
            else:
                prev_arguments = self.get_arguments(
                    self.prev_tool_call_arr[self.current_tool_id])
                cur_arguments = self.get_arguments(tool_call_arr)

                # not arguments generated
                if not cur_arguments and not prev_arguments:
                    delta = None
                # will never happen
                elif not cur_arguments and prev_arguments:
                    logger.error(
                        "INVARIANT - impossible to have arguments reset "
                        "mid-arguments")
                    delta = None
                # first time to get parameters
                elif cur_arguments and not prev_arguments:
                    cur_arguments_json = json.dumps(cur_arguments,
                                                    ensure_ascii=False)

                    # Send the serialised arguments up to and including
                    # the first occurrence of the newest chunk.
                    arguments_delta = cur_arguments_json[:cur_arguments_json.
                                                         index(delta_text) +
                                                         len(delta_text)]
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=arguments_delta).
                                      model_dump(exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += arguments_delta
                # both prev and cur parameters, send the increase parameters
                elif cur_arguments and prev_arguments:
                    cur_args_json = json.dumps(cur_arguments,
                                               ensure_ascii=False)
                    prev_args_json = json.dumps(prev_arguments,
                                                ensure_ascii=False)

                    argument_diff = extract_intermediate_diff(
                        cur_args_json, prev_args_json)

                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=argument_diff).model_dump(
                                              exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += argument_diff

            # Remember this parse, with its arguments stored under the
            # "arguments" key, so the next chunk can be compared to it.
            tool_call_arr["arguments"] = self.get_arguments(tool_call_arr)
            self.prev_tool_call_arr = [tool_call_arr]
            return delta
        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction "
                "error")
            return None

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract the tool call, if any, from a complete model output.

        Args:
            model_output: The full text generated by the model.
            request: The originating request; its ``tools`` list decides
                which function names are accepted.

        Returns:
            An ``ExtractedToolCallInformation``. When the plugin marker is
            absent, or the called name is not among the request's tools,
            ``tools_called`` is false and only content is returned.
        """
        text = model_output
        tools = request.tools
        if '<|action_start|><|plugin|>' in text:
            text, action = text.split('<|action_start|><|plugin|>')
            action = action.split('<|action_end|>')[0]
            # Drop anything that precedes the JSON object itself.
            action = action[action.find('{'):]
            action_dict = json.loads(action)
            name, parameters = action_dict['name'], json.dumps(
                action_dict.get('parameters', action_dict.get('arguments',
                                                              {})),
                ensure_ascii=False)

            if not tools or name not in [t.function.name for t in tools]:
                # Unknown tool (or no tools offered): report no tool call.
                # The result must be returned, otherwise this guard has
                # no effect and the call is reported anyway.
                return ExtractedToolCallInformation(tools_called=False,
                                                    tool_calls=[],
                                                    content=text)

            tool_calls = [
                ToolCall(
                    function=FunctionCall(name=name, arguments=parameters))
            ]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=text if len(text) > 0 else None)

        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=text)

position `instance-attribute` ¶

position = 0

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Initialise the parser and reset the streamed-content cursor."""
    super().__init__(tokenizer)
    # Offset into the accumulated text; the streaming extractor only looks
    # at ``current_text[self.position:]``.
    self.position = 0

adjust_request ¶

adjust_request(
    request: ChatCompletionRequest,
) -> ChatCompletionRequest

Source code in vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py

def adjust_request(
        self, request: ChatCompletionRequest) -> ChatCompletionRequest:
    """Keep special tokens in the output when tools may be called.

    The request is modified in place and returned. It is left untouched
    when it has no tools or when ``tool_choice`` is ``'none'``.
    """
    if not request.tools:
        return request

    if request.tool_choice != 'none':
        # The start and end of the tool-call information are signalled
        # with special tokens, so they must survive detokenisation.
        request.skip_special_tokens = False
    return request

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Extract the tool call, if any, from a complete model output.

    Args:
        model_output: The full text generated by the model.
        request: The originating request; its ``tools`` list decides
            which function names are accepted.

    Returns:
        An ``ExtractedToolCallInformation``. When the plugin marker is
        absent, or the called name is not among the request's tools,
        ``tools_called`` is false and only content is returned.
    """
    text = model_output
    tools = request.tools
    if '<|action_start|><|plugin|>' in text:
        text, action = text.split('<|action_start|><|plugin|>')
        action = action.split('<|action_end|>')[0]
        # Drop anything that precedes the JSON object itself.
        action = action[action.find('{'):]
        action_dict = json.loads(action)
        name, parameters = action_dict['name'], json.dumps(
            action_dict.get('parameters', action_dict.get('arguments',
                                                          {})),
            ensure_ascii=False)

        if not tools or name not in [t.function.name for t in tools]:
            # Unknown tool (or no tools offered): report no tool call.
            # The result must be returned, otherwise this guard has no
            # effect and the call is reported anyway.
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=text)

        tool_calls = [
            ToolCall(
                function=FunctionCall(name=name, arguments=parameters))
        ]
        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=text if len(text) > 0 else None)

    return ExtractedToolCallInformation(tools_called=False,
                                        tool_calls=[],
                                        content=text)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Turn the newest chunk of generated text into a streaming delta.

    Returns a content delta while no action marker has been seen, a
    tool-call delta (name first, then argument text) once the plugin
    marker is present, and ``None`` when nothing can be sent yet or an
    error occurred while handling the chunk.
    """
    if '<|action_start|>' not in current_text:
        # Plain content so far: remember how much has been emitted.
        self.position = len(current_text)
        return DeltaMessage(content=delta_text)
    # if the tool call is sent, return an empty delta message
    # to make sure the finish_reason will be sent correctly.
    # NOTE(review): the id is incremented once per tool name sent below;
    # if it starts at -1 (base-class default, TODO confirm) it is 0 after
    # the first call and this `> 0` test does not fire.
    if self.current_tool_id > 0:
        return DeltaMessage(content='')

    last_pos = self.position
    if '<|action_start|><|plugin|>' not in current_text[last_pos:]:
        # The marker is still being generated; wait for more text.
        return None

    new_delta = current_text[last_pos:]
    # The two-name unpacking assumes the marker occurs exactly once in
    # the unsent text.
    text, action = new_delta.split('<|action_start|><|plugin|>')

    if len(text) > 0:
        # Flush content that precedes the marker before any tool data.
        self.position = self.position + len(text)
        return DeltaMessage(content=text)

    action = action.strip()
    action = action.split('<|action_end|>'.strip())[0]

    # bit mask flags for partial JSON parsing. If the name hasn't been
    # sent yet, don't allow sending
    # an incomplete string since OpenAI only ever (as far as I have
    # seen) allows sending the entire tool/ function name at once.
    flags = Allow.ALL if self.current_tool_name_sent \
        else Allow.ALL & ~Allow.STR

    try:
        parsable_arr = action

        # tool calls are generated in an object in internlm2
        # it's not support parallel tool calls
        try:
            tool_call_arr: dict = partial_json_parser.loads(
                parsable_arr, flags)
        except partial_json_parser.core.exceptions.MalformedJSON:
            logger.debug('not enough tokens to parse into JSON yet')
            return None

        # if the current tool name hasn't been sent, send if available
        # - otherwise send nothing
        if not self.current_tool_name_sent:
            function_name = tool_call_arr.get("name")
            if function_name:
                self.current_tool_id = self.current_tool_id + 1
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  type="function",
                                  id=make_tool_call_id(),
                                  function=DeltaFunctionCall(
                                      name=function_name).model_dump(
                                          exclude_none=True))
                ])
                self.current_tool_name_sent = True
                # Start an empty argument buffer for this tool.
                self.streamed_args_for_tool.append("")
            else:
                delta = None
        # now we know we're on the same tool call and we're streaming
        # arguments
        else:
            prev_arguments = self.get_arguments(
                self.prev_tool_call_arr[self.current_tool_id])
            cur_arguments = self.get_arguments(tool_call_arr)

            # not arguments generated
            if not cur_arguments and not prev_arguments:
                delta = None
            # will never happen
            elif not cur_arguments and prev_arguments:
                logger.error(
                    "INVARIANT - impossible to have arguments reset "
                    "mid-arguments")
                delta = None
            # first time to get parameters
            elif cur_arguments and not prev_arguments:
                cur_arguments_json = json.dumps(cur_arguments,
                                                ensure_ascii=False)

                # Send the serialised arguments up to and including the
                # first occurrence of the newest chunk; `index` raises
                # ValueError (caught below) if the chunk is not found.
                arguments_delta = cur_arguments_json[:cur_arguments_json.
                                                     index(delta_text) +
                                                     len(delta_text)]
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  function=DeltaFunctionCall(
                                      arguments=arguments_delta).
                                  model_dump(exclude_none=True))
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] += arguments_delta
            # both prev and cur parameters, send the increase parameters
            elif cur_arguments and prev_arguments:
                cur_args_json = json.dumps(cur_arguments,
                                           ensure_ascii=False)
                prev_args_json = json.dumps(prev_arguments,
                                            ensure_ascii=False)

                # presumably yields the text by which the current JSON
                # extends the previous one; verify against the helper
                argument_diff = extract_intermediate_diff(
                    cur_args_json, prev_args_json)

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  function=DeltaFunctionCall(
                                      arguments=argument_diff).model_dump(
                                          exclude_none=True))
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] += argument_diff

        # Remember this parse, with its arguments stored under the
        # "arguments" key, so the next chunk can be compared to it.
        tool_call_arr["arguments"] = self.get_arguments(tool_call_arr)
        self.prev_tool_call_arr = [tool_call_arr]
        return delta
    except Exception:
        logger.exception("Error trying to handle streaming tool call.")
        logger.debug(
            "Skipping chunk as a result of tool streaming extraction "
            "error")
        return None

get_arguments ¶

get_arguments(obj)

Source code in vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py

def get_arguments(self, obj):
    """Return the call arguments stored under "parameters" or "arguments".

    "parameters" takes precedence when both keys are present; ``None`` is
    returned when neither key exists.
    """
    for key in ("parameters", "arguments"):
        if key in obj:
            return obj.get(key)
    return None

JambaToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py

@ToolParserManager.register_module("jamba")
class JambaToolParser(ToolParser):
    """Tool-call parser registered under the name ``"jamba"``.

    Tool calls are read from the text enclosed by ``<tool_calls>`` and
    ``</tool_calls>``. That text is parsed as a JSON array whose elements
    carry ``"name"`` and ``"arguments"`` keys. Complete outputs are handled
    by ``extract_tool_calls`` and incremental output by
    ``extract_tool_calls_streaming``.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Initialise streaming state and resolve the delimiter token ids.

        Raises:
            ValueError: if ``self.model_tokenizer`` is a ``MistralTokenizer``
                or is falsy.
            RuntimeError: if the start or end delimiter is not found in
                ``self.vocab``.
        """
        super().__init__(tokenizer)

        if isinstance(self.model_tokenizer, MistralTokenizer):
            raise ValueError(
                "Detected a MistralTokenizer tokenizer when using a Jamba model"
            )

        # --- streaming state, mutated by extract_tool_calls_streaming ---
        # whether the name of the current tool call was already emitted
        self.current_tool_name_sent: bool = False
        # tool-call array parsed from the previous streaming chunk
        self.prev_tool_call_arr: list[dict] = []
        # index of the tool call being streamed; -1 means "none yet"
        self.current_tool_id: int = -1
        self.streamed_args_for_tool: list[str] = [
        ]  # map what has been streamed for each tool so far to a list

        # textual delimiters that wrap the JSON array of tool calls
        self.tool_calls_start_token: str = "<tool_calls>"
        self.tool_calls_end_token: str = "</tool_calls>"

        # non-greedy capture of everything between the two delimiters;
        # DOTALL lets the capture span newlines
        self.tool_calls_regex = re.compile(
            rf"{self.tool_calls_start_token}(.*?){self.tool_calls_end_token}",
            re.DOTALL)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction.")
        # look the delimiters up in the vocabulary; .get yields None when
        # a delimiter is absent, which is rejected just below
        self.tool_calls_start_token_id = self.vocab.get(
            self.tool_calls_start_token)
        self.tool_calls_end_token_id = self.vocab.get(
            self.tool_calls_end_token)
        if (self.tool_calls_start_token_id is None
                or self.tool_calls_end_token_id is None):
            raise RuntimeError(
                "Jamba Tool parser could not locate tool calls start/end "
                "tokens in the tokenizer!")

    def adjust_request(
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """Disable special-token skipping when tools may be called.

        The request is modified in place and returned. It is left untouched
        when it has no tools or when ``tool_choice`` equals ``'none'``.
        """
        if request.tools and request.tool_choice != 'none':
            # do not skip special tokens because jamba use the special
            # tokens to indicate the start and end of the tool calls
            # information.
            request.skip_special_tokens = False
        return request

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """Extract tool calls from a complete model output.

        Returns an ``ExtractedToolCallInformation`` with
        ``tools_called=False`` and the unmodified output as content when the
        start delimiter is absent or when any step of the extraction raises.
        Otherwise the result carries one ``ToolCall`` per element of the
        parsed JSON array, plus the text preceding the start delimiter as
        content (``None`` if that text is empty or a single space).

        ``request`` is not read by this method.
        """

        # sanity check; avoid unnecessary processing
        if self.tool_calls_start_token not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        else:

            try:
                # use a regex to find the tool call between the tags
                # NOTE: only the first delimited section is used; when the
                # end delimiter is missing, findall returns [] and the [0]
                # raises IndexError, which the except below turns into a
                # "no tools called" result.
                function_calls = self.tool_calls_regex.findall(model_output)[0]

                # load the JSON, and then use it to build the Function and
                # Tool Call
                raw_function_calls = json.loads(function_calls)
                tool_calls = [
                    ToolCall(
                        type="function",
                        function=FunctionCall(
                            name=function_call["name"],
                            # function call args are JSON but as a string
                            arguments=json.dumps(function_call["arguments"],
                                                 ensure_ascii=False),
                        )) for function_call in raw_function_calls
                ]

                # text generated before the start delimiter is kept as
                # regular message content
                content = model_output[:model_output.
                                       find(self.tool_calls_start_token)]
                return ExtractedToolCallInformation(
                    tools_called=True,
                    tool_calls=tool_calls,
                    content=content if
                    (len(content) > 0 and content != " ") else None)

            except Exception:
                # best-effort: any failure falls back to treating the whole
                # output as plain content
                logger.exception(
                    "Error in extracting tool call from response.")
                return ExtractedToolCallInformation(tools_called=False,
                                                    tool_calls=[],
                                                    content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Produce the next streaming delta for the text generated so far.

        Only ``current_text``, ``delta_text`` and ``delta_token_ids`` are
        read here; the remaining parameters are unused in this body.

        Returns:
            ``DeltaMessage(content=delta_text)`` while the start delimiter
            has not appeared in ``current_text``; a ``DeltaMessage`` carrying
            a single ``DeltaToolCall`` (the tool name, or an increment of the
            JSON-encoded arguments) when there is something new to send;
            ``None`` when nothing should be emitted for this chunk, including
            when an exception is raised while processing it.

        Side effects:
            Updates ``current_tool_id``, ``current_tool_name_sent``,
            ``streamed_args_for_tool`` and ``prev_tool_call_arr``.
        """

        # if the tool call token is not in the tokens generated so far, append
        # output to contents since it's not a tool
        if self.tool_calls_start_token not in current_text:
            return DeltaMessage(content=delta_text)

        # if the tool call token ID IS in the tokens generated so far, that
        # means we're parsing as tool calls now

        # handle if we detected the start of tool calls token which means
        # the start of tool calling
        if (self.tool_calls_start_token_id in delta_token_ids
                and len(delta_token_ids) == 1):
            # if it's the only token, return None, so we don't send a chat
            # completion and don't send a control token
            return None

        # bit mask flags for partial JSON parsing. If the name hasn't been
        # sent yet, don't allow sending
        # an incomplete string since OpenAI only ever (as far as I have
        # seen) allows sending the entire tool/ function name at once.
        flags = Allow.ALL if self.current_tool_name_sent \
            else Allow.ALL & ~Allow.STR
        try:

            # Extract the tool calls between the special tool call tokens
            # (text after the last start delimiter, up to the first end
            # delimiter that follows it, if any)
            parsable_arr = current_text.split(
                self.tool_calls_start_token)[-1].split(
                    self.tool_calls_end_token)[0]

            # tool calls are generated in an array, so do partial JSON
            # parsing on the entire array
            try:
                tool_call_arr: list[dict] = partial_json_parser.loads(
                    parsable_arr, flags)
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug('not enough tokens to parse into JSON yet')
                return None

            # select the tool call the parser state currently points at.
            # NOTE: while current_tool_id is still -1 this negative index
            # picks the last element of the array.

            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
                if len(tool_call_arr) > 0 else {}

            # case -- if no tokens have been streamed for the tool, e.g.
            #   only the array brackets, stream nothing
            if len(tool_call_arr) == 0:
                return None

            # case: we are starting a new tool in the array
            #   -> array has > 0 length AND length has moved past cursor
            elif (len(tool_call_arr) > 0
                  and len(tool_call_arr) > self.current_tool_id + 1):

                # if we're moving on to a new call, first make sure we
                # haven't missed anything in the previous one that was
                # auto-generated due to JSON completions, but wasn't
                # streamed to the client yet.
                if self.current_tool_id >= 0:
                    # current_tool_call still refers to the previous tool
                    # here because current_tool_id is only advanced below
                    diff: Union[str, None] = current_tool_call.get("arguments")

                    if diff:
                        # NOTE(review): str.replace removes every occurrence
                        # of the already-streamed text, not only a leading
                        # prefix -- confirm this is the intended diff.
                        diff = json.dumps(diff, ensure_ascii=False).replace(
                            self.streamed_args_for_tool[self.current_tool_id],
                            "")
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(index=self.current_tool_id,
                                          function=DeltaFunctionCall(
                                              arguments=diff).model_dump(
                                                  exclude_none=True))
                        ])
                        self.streamed_args_for_tool[
                            self.current_tool_id] += diff
                    else:
                        delta = None
                else:
                    delta = None
                # re-set stuff pertaining to progress in the current tool
                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("starting on new tool %d", self.current_tool_id)
                # this early return skips the prev_tool_call_arr update at
                # the bottom of the method
                return delta

            # case: update an existing tool - this is handled below

            # if the current tool name hasn't been sent, send if available
            # - otherwise send nothing
            if not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
                if function_name:

                    # the first delta for a tool carries its id, type and
                    # complete name
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))
                    ])
                    self.current_tool_name_sent = True
                else:
                    delta = None

            # now we know we're on the same tool call and we're streaming
            # arguments
            else:

                prev_arguments = self.prev_tool_call_arr[
                    self.current_tool_id].get("arguments")
                cur_arguments = current_tool_call.get("arguments")

                # single quotes in the new text are mapped to double quotes
                # before it is searched for in the json.dumps output below
                new_text = delta_text.replace("\'", "\"")

                if not cur_arguments and not prev_arguments:

                    # no arguments seen yet: nothing to send
                    delta = None
                elif not cur_arguments and prev_arguments:
                    logger.error(
                        "INVARIANT - impossible to have arguments reset "
                        "mid-arguments")
                    delta = None
                elif cur_arguments and not prev_arguments:
                    # first chunk that contains arguments: send the encoded
                    # arguments up to and including the newly generated text
                    cur_arguments_json = json.dumps(cur_arguments,
                                                    ensure_ascii=False)
                    logger.debug("finding %s in %s", new_text,
                                 cur_arguments_json)

                    # NOTE: .index raises ValueError when new_text is not
                    # found; the outer except then drops this chunk
                    arguments_delta = cur_arguments_json[:cur_arguments_json.
                                                         index(new_text) +
                                                         len(new_text)]
                    logger.debug("First tokens in arguments received: %s",
                                 arguments_delta)
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=arguments_delta).
                                      model_dump(exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += arguments_delta

                elif cur_arguments and prev_arguments:
                    # arguments grew since the previous chunk: send what
                    # extract_intermediate_diff reports between the two
                    # JSON encodings
                    cur_args_json = json.dumps(cur_arguments,
                                               ensure_ascii=False)
                    prev_args_json = json.dumps(prev_arguments,
                                                ensure_ascii=False)
                    logger.debug("Searching for diff between \n%s\n%s",
                                 cur_args_json, prev_args_json)

                    argument_diff = extract_intermediate_diff(
                        cur_args_json, prev_args_json)
                    logger.debug("got arguments diff: %s", argument_diff)
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=argument_diff).model_dump(
                                              exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += argument_diff
                else:
                    # NOTE: the four branches above cover every truthiness
                    # combination of cur_arguments / prev_arguments, so this
                    # branch is not expected to execute.
                    delta = None

            # remember the array parsed from this chunk so the next call can
            # diff its arguments against it, then hand back whatever delta
            # (possibly None) was built above
            self.prev_tool_call_arr = tool_call_arr
            return delta

        except Exception:
            # best-effort streaming: log the failure and emit nothing for
            # this chunk
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction "
                "error")
            return None

current_tool_id `instance-attribute` ¶

current_tool_id: int = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

tool_calls_end_token `instance-attribute` ¶

tool_calls_end_token: str = '</tool_calls>'

tool_calls_end_token_id `instance-attribute` ¶

tool_calls_end_token_id = get(tool_calls_end_token)

tool_calls_regex `instance-attribute` ¶

tool_calls_regex = compile(
    f"{tool_calls_start_token}(.*?){tool_calls_end_token}",
    DOTALL,
)

tool_calls_start_token `instance-attribute` ¶

tool_calls_start_token: str = '<tool_calls>'

tool_calls_start_token_id `instance-attribute` ¶

tool_calls_start_token_id = get(tool_calls_start_token)

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Initialise streaming state and resolve the delimiter token ids.

    Raises:
        ValueError: if ``self.model_tokenizer`` is a ``MistralTokenizer``
            or is falsy.
        RuntimeError: if the start or end delimiter is not found in
            ``self.vocab``.
    """
    super().__init__(tokenizer)

    if isinstance(self.model_tokenizer, MistralTokenizer):
        raise ValueError(
            "Detected a MistralTokenizer tokenizer when using a Jamba model"
        )

    # Per-instance streaming state.
    self.current_tool_name_sent: bool = False
    self.prev_tool_call_arr: list[dict] = []
    self.current_tool_id: int = -1
    # streamed_args_for_tool[i] holds the argument text already streamed
    # for tool i
    self.streamed_args_for_tool: list[str] = []

    # Delimiters that wrap the JSON array of tool calls.
    start_token = "<tool_calls>"
    end_token = "</tool_calls>"
    self.tool_calls_start_token: str = start_token
    self.tool_calls_end_token: str = end_token

    # Non-greedy capture between the delimiters; DOTALL spans newlines.
    self.tool_calls_regex = re.compile(rf"{start_token}(.*?){end_token}",
                                       re.DOTALL)

    if not self.model_tokenizer:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction.")

    # Both ids are stored before validation; .get yields None for a
    # delimiter that is absent from the vocabulary.
    start_token_id = self.vocab.get(start_token)
    self.tool_calls_start_token_id = start_token_id
    end_token_id = self.vocab.get(end_token)
    self.tool_calls_end_token_id = end_token_id

    if start_token_id is None or end_token_id is None:
        raise RuntimeError(
            "Jamba Tool parser could not locate tool calls start/end "
            "tokens in the tokenizer!")

adjust_request ¶

adjust_request(
    request: ChatCompletionRequest,
) -> ChatCompletionRequest

Source code in vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py

def adjust_request(
        self, request: ChatCompletionRequest) -> ChatCompletionRequest:
    """Disable special-token skipping when tools may be called.

    The request is modified in place and returned. It is returned untouched
    when it has no tools or when ``tool_choice`` equals ``'none'``.
    """
    # Nothing to adjust unless tool calling is actually possible.
    if not request.tools or request.tool_choice == 'none':
        return request

    # Jamba marks the start and end of the tool-call section with special
    # tokens, so they must survive detokenization.
    request.skip_special_tokens = False
    return request

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """Extract tool calls from a complete model output.

    Returns an ``ExtractedToolCallInformation`` with ``tools_called=False``
    and the unmodified output as content when the start delimiter is absent
    or when any step of the extraction raises. Otherwise the result carries
    one ``ToolCall`` per element of the parsed JSON array, plus the text
    preceding the start delimiter as content (``None`` if that text is empty
    or a single space).

    ``request`` is not read by this method.
    """
    start_token = self.tool_calls_start_token

    # Fast path: without the opening delimiter there is nothing to parse.
    if start_token not in model_output:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        # Only the first delimited section is considered. A missing closing
        # delimiter makes the [0] raise, which is handled below.
        payload = self.tool_calls_regex.findall(model_output)[0]

        tool_calls = []
        for entry in json.loads(payload):
            function = FunctionCall(
                name=entry["name"],
                # arguments are handed on as a JSON-encoded string
                arguments=json.dumps(entry["arguments"],
                                     ensure_ascii=False),
            )
            tool_calls.append(ToolCall(type="function", function=function))

        # Text generated before the opening delimiter is regular content;
        # an empty or single-space prefix is reported as None.
        leading_text = model_output.partition(start_token)[0]
        if leading_text in ("", " "):
            leading_text = None

        return ExtractedToolCallInformation(tools_called=True,
                                            tool_calls=tool_calls,
                                            content=leading_text)

    except Exception:
        # Best-effort: any failure falls back to plain content.
        logger.exception("Error in extracting tool call from response.")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Produce the next streaming delta for the text generated so far.

    Only ``current_text``, ``delta_text`` and ``delta_token_ids`` are read
    here; the remaining parameters are unused in this body.

    Returns:
        ``DeltaMessage(content=delta_text)`` while the start delimiter has
        not appeared in ``current_text``; a ``DeltaMessage`` carrying a
        single ``DeltaToolCall`` (the tool name, or an increment of the
        JSON-encoded arguments) when there is something new to send;
        ``None`` when nothing should be emitted for this chunk, including
        when an exception is raised while processing it.

    Side effects:
        Updates ``current_tool_id``, ``current_tool_name_sent``,
        ``streamed_args_for_tool`` and ``prev_tool_call_arr``.
    """

    # if the tool call token is not in the tokens generated so far, append
    # output to contents since it's not a tool
    if self.tool_calls_start_token not in current_text:
        return DeltaMessage(content=delta_text)

    # if the tool call token ID IS in the tokens generated so far, that
    # means we're parsing as tool calls now

    # handle if we detected the start of tool calls token which means
    # the start of tool calling
    if (self.tool_calls_start_token_id in delta_token_ids
            and len(delta_token_ids) == 1):
        # if it's the only token, return None, so we don't send a chat
        # completion and don't send a control token
        return None

    # bit mask flags for partial JSON parsing. If the name hasn't been
    # sent yet, don't allow sending
    # an incomplete string since OpenAI only ever (as far as I have
    # seen) allows sending the entire tool/ function name at once.
    flags = Allow.ALL if self.current_tool_name_sent \
        else Allow.ALL & ~Allow.STR
    try:

        # Extract the tool calls between the special tool call tokens
        # (text after the last start delimiter, up to the first end
        # delimiter that follows it, if any)
        parsable_arr = current_text.split(
            self.tool_calls_start_token)[-1].split(
                self.tool_calls_end_token)[0]

        # tool calls are generated in an array, so do partial JSON
        # parsing on the entire array
        try:
            tool_call_arr: list[dict] = partial_json_parser.loads(
                parsable_arr, flags)
        except partial_json_parser.core.exceptions.MalformedJSON:
            logger.debug('not enough tokens to parse into JSON yet')
            return None

        # select the tool call the parser state currently points at.
        # NOTE: while current_tool_id is still -1 this negative index
        # picks the last element of the array.

        current_tool_call: dict = tool_call_arr[self.current_tool_id] \
            if len(tool_call_arr) > 0 else {}

        # case -- if no tokens have been streamed for the tool, e.g.
        #   only the array brackets, stream nothing
        if len(tool_call_arr) == 0:
            return None

        # case: we are starting a new tool in the array
        #   -> array has > 0 length AND length has moved past cursor
        elif (len(tool_call_arr) > 0
              and len(tool_call_arr) > self.current_tool_id + 1):

            # if we're moving on to a new call, first make sure we
            # haven't missed anything in the previous one that was
            # auto-generated due to JSON completions, but wasn't
            # streamed to the client yet.
            if self.current_tool_id >= 0:
                # current_tool_call still refers to the previous tool here
                # because current_tool_id is only advanced below
                diff: Union[str, None] = current_tool_call.get("arguments")

                if diff:
                    # NOTE(review): str.replace removes every occurrence of
                    # the already-streamed text, not only a leading prefix
                    # -- confirm this is the intended diff.
                    diff = json.dumps(diff, ensure_ascii=False).replace(
                        self.streamed_args_for_tool[self.current_tool_id],
                        "")
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=diff).model_dump(
                                              exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += diff
                else:
                    delta = None
            else:
                delta = None
            # re-set stuff pertaining to progress in the current tool
            self.current_tool_id = len(tool_call_arr) - 1
            self.current_tool_name_sent = False
            self.streamed_args_for_tool.append("")
            logger.debug("starting on new tool %d", self.current_tool_id)
            # this early return skips the prev_tool_call_arr update at the
            # bottom of the function
            return delta

        # case: update an existing tool - this is handled below

        # if the current tool name hasn't been sent, send if available
        # - otherwise send nothing
        if not self.current_tool_name_sent:
            function_name = current_tool_call.get("name")
            if function_name:

                # the first delta for a tool carries its id, type and
                # complete name
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  type="function",
                                  id=make_tool_call_id(),
                                  function=DeltaFunctionCall(
                                      name=function_name).model_dump(
                                          exclude_none=True))
                ])
                self.current_tool_name_sent = True
            else:
                delta = None

        # now we know we're on the same tool call and we're streaming
        # arguments
        else:

            prev_arguments = self.prev_tool_call_arr[
                self.current_tool_id].get("arguments")
            cur_arguments = current_tool_call.get("arguments")

            # single quotes in the new text are mapped to double quotes
            # before it is searched for in the json.dumps output below
            new_text = delta_text.replace("\'", "\"")

            if not cur_arguments and not prev_arguments:

                # no arguments seen yet: nothing to send
                delta = None
            elif not cur_arguments and prev_arguments:
                logger.error(
                    "INVARIANT - impossible to have arguments reset "
                    "mid-arguments")
                delta = None
            elif cur_arguments and not prev_arguments:
                # first chunk that contains arguments: send the encoded
                # arguments up to and including the newly generated text
                cur_arguments_json = json.dumps(cur_arguments,
                                                ensure_ascii=False)
                logger.debug("finding %s in %s", new_text,
                             cur_arguments_json)

                # NOTE: .index raises ValueError when new_text is not
                # found; the outer except then drops this chunk
                arguments_delta = cur_arguments_json[:cur_arguments_json.
                                                     index(new_text) +
                                                     len(new_text)]
                logger.debug("First tokens in arguments received: %s",
                             arguments_delta)
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  function=DeltaFunctionCall(
                                      arguments=arguments_delta).
                                  model_dump(exclude_none=True))
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] += arguments_delta

            elif cur_arguments and prev_arguments:
                # arguments grew since the previous chunk: send what
                # extract_intermediate_diff reports between the two JSON
                # encodings
                cur_args_json = json.dumps(cur_arguments,
                                           ensure_ascii=False)
                prev_args_json = json.dumps(prev_arguments,
                                            ensure_ascii=False)
                logger.debug("Searching for diff between \n%s\n%s",
                             cur_args_json, prev_args_json)

                argument_diff = extract_intermediate_diff(
                    cur_args_json, prev_args_json)
                logger.debug("got arguments diff: %s", argument_diff)
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  function=DeltaFunctionCall(
                                      arguments=argument_diff).model_dump(
                                          exclude_none=True))
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] += argument_diff
            else:
                # NOTE: the four branches above cover every truthiness
                # combination of cur_arguments / prev_arguments, so this
                # branch is not expected to execute.
                delta = None

        # remember the array parsed from this chunk so the next call can
        # diff its arguments against it, then hand back whatever delta
        # (possibly None) was built above
        self.prev_tool_call_arr = tool_call_arr
        return delta

    except Exception:
        # best-effort streaming: log the failure and emit nothing for this
        # chunk
        logger.exception("Error trying to handle streaming tool call.")
        logger.debug(
            "Skipping chunk as a result of tool streaming extraction "
            "error")
        return None

KimiK2ToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py

@ToolParserManager.register_module(["kimi_k2"])
class KimiK2ToolParser(ToolParser):
    """Tool parser registered under the name ``kimi_k2``.

    A tool call is recognised when the output contains, after a
    ``<|tool_calls_section_begin|>`` marker, a span of the form::

        <|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{...}<|tool_call_end|>

    ``extract_tool_calls`` handles a complete response;
    ``extract_tool_calls_streaming`` handles incremental output and keeps
    its progress in the instance attributes initialised in ``__init__``.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up streaming state, marker strings, regexes and token ids.

        Args:
            tokenizer: Tokenizer handed to the ``ToolParser`` base class.

        Raises:
            ValueError: If ``self.model_tokenizer`` is falsy after the base
                class initialiser has run.
            RuntimeError: If the section begin/end markers are not found in
                ``self.vocab``.
        """
        super().__init__(tokenizer)
        # --- state used only by extract_tool_calls_streaming ---
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id: int = -1
        self.streamed_args_for_tool: list[str] = (
            [])  # map what has been streamed for each tool so far to a list

        # markers wrapping the whole tool-calls section
        self.tool_calls_start_token: str = "<|tool_calls_section_begin|>"
        self.tool_calls_end_token: str = "<|tool_calls_section_end|>"

        # markers wrapping one individual tool call
        self.tool_call_start_token: str = "<|tool_call_begin|>"
        self.tool_call_end_token: str = "<|tool_call_end|>"

        # NOTE(review): in the patterns below `.+` is greedy and `.` does
        # not match newlines (no re.DOTALL), so several calls on one line
        # or arguments spanning multiple lines may not be captured as
        # intended - confirm against real model output.

        # one complete call: captures (tool_call_id, function_arguments)
        self.tool_call_regex = re.compile(
            r"<\|tool_call_begin\|>\s*(?P<tool_call_id>.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>.*?)\s*<\|tool_call_end\|>"
        )

        # a call still being generated: id plus whatever arguments exist
        self.stream_tool_call_portion_regex = re.compile(
            r"(?P<tool_call_id>.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>.*)"
        )

        # a call for which only the id has been generated so far
        self.stream_tool_call_name_regex = re.compile(
            r"(?P<tool_call_id>.+:\d+)\s*")

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction.")
        self.tool_calls_start_token_id = self.vocab.get(
            self.tool_calls_start_token)
        self.tool_calls_end_token_id = self.vocab.get(
            self.tool_calls_end_token)

        # NOTE(review): unlike the section ids, these two per-call ids are
        # not validated below although the streaming parser counts them.
        self.tool_call_start_token_id = self.vocab.get(
            self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)

        if (self.tool_calls_start_token_id is None
                or self.tool_calls_end_token_id is None):
            raise RuntimeError(
                "Kimi-K2 Tool parser could not locate tool call start/end "
                "tokens in the tokenizer!")

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract all complete tool calls from a finished model response.

        Args:
            model_output: Full text generated by the model.
            request: The originating request; not used by this method.

        Returns:
            ``tools_called=False`` with the untouched output as content when
            the section start marker is absent or when extraction raises.
            Otherwise ``tools_called=True`` with one ``ToolCall`` per regex
            match and, as content, the text preceding the section start
            marker (``None`` if that text is empty).
        """

        # sanity check; avoid unnecessary processing
        if self.tool_calls_start_token not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        try:
            # the pattern has two groups, so findall yields one
            # (tool_call_id, function_arguments) tuple per complete call
            function_call_tuples = self.tool_call_regex.findall(
                model_output)

            logger.debug("function_call_tuples: %s", function_call_tuples)

            tool_calls = []
            for function_id, function_args in function_call_tuples:
                # function_id: functions.get_weather:0 or get_weather:0
                # Drop the ":<index>" suffix first, then any dotted
                # prefix, so an id without a "." no longer raises
                # IndexError (which discarded every tool call).
                function_name = function_id.split(':')[0].split('.')[-1]
                tool_calls.append(
                    ToolCall(
                        id=function_id,
                        type='function',
                        function=FunctionCall(name=function_name,
                                              arguments=function_args),
                    ))

            # anything before the tool-calls section is regular content
            content = model_output[:model_output.
                                   find(self.tool_calls_start_token)]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if content else None,
            )

        except Exception:
            # best effort: fall back to returning the raw output as text
            logger.exception(
                "Error in extracting tool call from response.")
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Turn one streamed chunk into a delta message.

        The position within the output is derived by counting per-call
        begin/end token ids in the previous and current token sequences.
        Progress is kept in ``current_tool_id``, ``current_tool_name_sent``,
        ``streamed_args_for_tool`` and ``prev_tool_call_arr``.

        Args:
            previous_text: Not used by this implementation.
            current_text: Text generated so far.
            delta_text: Text of the newest chunk.
            previous_token_ids: Token ids before this chunk.
            current_token_ids: Token ids generated so far.
            delta_token_ids: Token ids of the newest chunk.
            request: Not used by this implementation.

        Returns:
            A ``DeltaMessage`` carrying plain content, the tool id/name, or
            an argument fragment; ``None`` when the chunk should be skipped
            (including when any exception is raised while parsing).
        """

        logger.debug("delta_text: %s", delta_text)
        logger.debug("delta_token_ids: %s", delta_token_ids)
        # check to see if we should be streaming a tool call - is there a
        # tool-calls section start token in the output so far?
        if self.tool_calls_start_token_id not in current_token_ids:
            logger.debug("No tool call tokens found!")
            return DeltaMessage(content=delta_text)
        # the section markers themselves are never forwarded
        delta_text = delta_text.replace(self.tool_calls_start_token,
                                        "").replace(self.tool_calls_end_token,
                                                    "")
        try:

            # figure out where we are in the parsing by counting tool call
            # start & end tags
            prev_tool_start_count = previous_token_ids.count(
                self.tool_call_start_token_id)
            prev_tool_end_count = previous_token_ids.count(
                self.tool_call_end_token_id)
            cur_tool_start_count = current_token_ids.count(
                self.tool_call_start_token_id)
            cur_tool_end_count = current_token_ids.count(
                self.tool_call_end_token_id)
            tool_call_portion = None
            text_portion = None

            # case: if we're generating text, OR rounding out a tool call
            if (cur_tool_start_count == cur_tool_end_count
                    and prev_tool_end_count == cur_tool_end_count
                    and self.tool_call_end_token not in delta_text):
                logger.debug("Generating text content! skipping tool parsing.")
                return DeltaMessage(content=delta_text)

            if self.tool_call_end_token in delta_text:
                logger.debug("tool_call_end_token in delta_text")
                full_text = current_text + delta_text
                # text between the last begin marker and the first end
                # marker that follows it
                tool_call_portion = full_text.split(
                    self.tool_call_start_token)[-1].split(
                        self.tool_call_end_token)[0].rstrip()
                delta_text = delta_text.split(
                    self.tool_call_end_token)[0].rstrip()
                # NOTE(review): delta_text was just cut at the end marker,
                # so this split yields delta_text itself (left-stripped),
                # not the text that followed the marker.
                text_portion = delta_text.split(
                    self.tool_call_end_token)[-1].lstrip()

            # case -- we're starting a new tool call
            if (cur_tool_start_count > cur_tool_end_count
                    and cur_tool_start_count > prev_tool_start_count):
                if len(delta_token_ids) > 1:
                    tool_call_portion = current_text.split(
                        self.tool_call_start_token)[-1]
                else:
                    tool_call_portion = None
                    delta = None

                text_portion = None

                # set cursors and state appropriately
                self.current_tool_id += 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("Starting on a new tool %s", self.current_tool_id)

            # case -- we're updating an existing tool call
            elif (cur_tool_start_count > cur_tool_end_count
                  and cur_tool_start_count == prev_tool_start_count):

                # get the portion of the text that's the tool call
                tool_call_portion = current_text.split(
                    self.tool_call_start_token)[-1]
                text_portion = None

            # case -- the current tool call is being closed.
            elif (cur_tool_start_count == cur_tool_end_count
                  and cur_tool_end_count >= prev_tool_end_count):
                if self.prev_tool_call_arr is None or len(
                        self.prev_tool_call_arr) == 0:
                    logger.debug(
                        "attempting to close tool call, but no tool call")
                    return None
                diff = self.prev_tool_call_arr[self.current_tool_id].get(
                    "arguments")
                # the recorded arguments only gate this branch; the text
                # that is emitted comes from delta_text
                if diff:
                    if '"}' not in delta_text:
                        return None
                    # emit everything up to and including the last '"}'
                    end_loc = delta_text.rindex('"}')
                    diff = delta_text[:end_loc] + '"}'
                    logger.debug(
                        "Finishing tool and found diff that had not "
                        "been streamed yet: %s",
                        diff,
                    )
                    self.streamed_args_for_tool[self.current_tool_id] += diff
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            function=DeltaFunctionCall(
                                arguments=diff).model_dump(exclude_none=True),
                        )
                    ])

            # case -- otherwise we're just generating text
            else:
                text = delta_text.replace(self.tool_call_start_token, "")
                text = text.replace(self.tool_call_end_token, "")
                delta = DeltaMessage(tool_calls=[], content=text)
                return delta

            # parse id / name / arguments out of the in-progress call text
            current_tool_call = dict()
            if tool_call_portion:
                current_tool_call_matches = (
                    self.stream_tool_call_portion_regex.match(
                        tool_call_portion))
                if current_tool_call_matches:
                    tool_id, tool_args = (current_tool_call_matches.groups())
                    # tolerate ids without a dotted prefix (no IndexError)
                    tool_name = tool_id.split(':')[0].split('.')[-1]
                    current_tool_call['id'] = tool_id
                    current_tool_call["name"] = tool_name
                    current_tool_call["arguments"] = tool_args
                else:
                    # argument marker not generated yet: try the id alone
                    current_tool_call_name_matches = (
                        self.stream_tool_call_name_regex.match(
                            tool_call_portion))
                    if current_tool_call_name_matches:
                        tool_id_str, = current_tool_call_name_matches.groups()
                        # tolerate ids without a dotted prefix
                        tool_name = tool_id_str.split(':')[0].split('.')[-1]
                        current_tool_call['id'] = tool_id_str
                        current_tool_call["name"] = tool_name
                        current_tool_call["arguments"] = ""
                    else:
                        logger.debug("Not enough token")
                        return None

            # case - we haven't sent the tool name yet. If it's available, send
            #   it. otherwise, wait until it's available.
            if not self.current_tool_name_sent:
                if current_tool_call is None:
                    return None
                function_name: Union[str, None] = current_tool_call.get("name")
                tool_id = current_tool_call.get("id")
                if function_name:
                    self.current_tool_name_sent = True
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            type="function",
                            id=tool_id,
                            function=DeltaFunctionCall(
                                name=function_name).model_dump(
                                    exclude_none=True),
                        )
                    ])
                else:
                    return None

            # case -- otherwise, send the tool call delta

            # if the tool call portion is None, send the delta as text
            if tool_call_portion is None:
                # if there's text but not tool calls, send that -
                # otherwise None to skip chunk
                delta = (DeltaMessage(
                    content=delta_text) if text_portion is not None else None)
                return delta

            # now, the nitty-gritty of tool calls
            # now we have the portion to parse as tool call.

            logger.debug("Trying to parse current tool call with ID %s",
                         self.current_tool_id)

            # if we're starting a new tool call, push an empty object in as
            #   a placeholder for the arguments
            if len(self.prev_tool_call_arr) <= self.current_tool_id:
                self.prev_tool_call_arr.append({})

            # main logic for tool parsing here - compare the previously seen
            #   argument text to the current argument text
            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments")
            cur_arguments = current_tool_call.get("arguments")

            logger.debug("diffing old arguments: %s", prev_arguments)
            logger.debug("against new ones: %s", cur_arguments)

            # case -- no arguments have been created yet. skip sending a delta.
            if not cur_arguments and not prev_arguments:
                logger.debug("Skipping text %s - no arguments", delta_text)
                delta = None

            # case -- prev arguments are defined, but none are now.
            #   probably impossible, but not a fatal error - just keep going
            elif not cur_arguments and prev_arguments:
                logger.error("should be impossible to have arguments reset "
                             "mid-call. skipping streaming anything.")
                delta = None

            # case -- the first argument text is available: send all of it
            elif cur_arguments and not prev_arguments:

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=cur_arguments).model_dump(
                                exclude_none=True),
                    )
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] = cur_arguments

            # last case -- we have an update to existing arguments.
            elif cur_arguments and prev_arguments:
                # only a pure extension of the previous text is streamed;
                # the new suffix is the delta
                if (isinstance(delta_text, str)
                        and cur_arguments != prev_arguments
                        and len(cur_arguments) > len(prev_arguments)
                        and cur_arguments.startswith(prev_arguments)):
                    delta_arguments = cur_arguments[len(prev_arguments):]
                    logger.debug("got diff %s", delta_text)

                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            function=DeltaFunctionCall(
                                arguments=delta_arguments).model_dump(
                                    exclude_none=True),
                        )
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] = cur_arguments
                else:
                    delta = None

            # handle saving the state for the current tool into
            # the "prev" list for use in diffing for the next iteration
            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
                self.prev_tool_call_arr[
                    self.current_tool_id] = current_tool_call
            else:
                self.prev_tool_call_arr.append(current_tool_call)

            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None  # do not stream a delta. skip this token ID.

current_tool_id `instance-attribute` ¶

current_tool_id: int = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

stream_tool_call_name_regex `instance-attribute` ¶

stream_tool_call_name_regex = compile(
    "(?P<tool_call_id>.+:\\d+)\\s*"
)

stream_tool_call_portion_regex `instance-attribute` ¶

stream_tool_call_portion_regex = compile(
    "(?P<tool_call_id>.+:\\d+)\\s*<\\|tool_call_argument_begin\\|>\\s*(?P<function_arguments>.*)"
)

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

tool_call_end_token `instance-attribute` ¶

tool_call_end_token: str = '<|tool_call_end|>'

tool_call_end_token_id `instance-attribute` ¶

tool_call_end_token_id = get(tool_call_end_token)

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile(
    "<\\|tool_call_begin\\|>\\s*(?P<tool_call_id>.+:\\d+)\\s*<\\|tool_call_argument_begin\\|>\\s*(?P<function_arguments>.*?)\\s*<\\|tool_call_end\\|>"
)

tool_call_start_token `instance-attribute` ¶

tool_call_start_token: str = '<|tool_call_begin|>'

tool_call_start_token_id `instance-attribute` ¶

tool_call_start_token_id = get(tool_call_start_token)

tool_calls_end_token `instance-attribute` ¶

tool_calls_end_token: str = '<|tool_calls_section_end|>'

tool_calls_end_token_id `instance-attribute` ¶

tool_calls_end_token_id = get(tool_calls_end_token)

tool_calls_start_token `instance-attribute` ¶

tool_calls_start_token: str = "<|tool_calls_section_begin|>"

tool_calls_start_token_id `instance-attribute` ¶

tool_calls_start_token_id = get(tool_calls_start_token)

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Set up streaming state, marker strings, regexes and token ids.

    Raises ``ValueError`` when ``self.model_tokenizer`` is falsy and
    ``RuntimeError`` when either section marker is missing from
    ``self.vocab``.
    """
    super().__init__(tokenizer)

    # Mutable progress tracked across streaming calls.
    self.current_tool_name_sent: bool = False
    self.prev_tool_call_arr: list[dict] = []
    self.current_tool_id: int = -1
    # What has been streamed so far for each tool, one entry per tool.
    self.streamed_args_for_tool: list[str] = []

    # Markers delimiting the whole tool-calls section.
    self.tool_calls_start_token: str = "<|tool_calls_section_begin|>"
    self.tool_calls_end_token: str = "<|tool_calls_section_end|>"

    # Markers delimiting a single tool call.
    self.tool_call_start_token: str = "<|tool_call_begin|>"
    self.tool_call_end_token: str = "<|tool_call_end|>"

    # NOTE(review): `.+` is greedy and `.` does not match newlines (no
    # re.DOTALL) in the patterns below - confirm that several calls on
    # one line and multi-line arguments are captured as intended.

    # Complete call -> (tool_call_id, function_arguments).
    self.tool_call_regex = re.compile(
        r"<\|tool_call_begin\|>\s*(?P<tool_call_id>.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>.*?)\s*<\|tool_call_end\|>"
    )
    # Call in progress -> id plus the arguments generated so far.
    self.stream_tool_call_portion_regex = re.compile(
        r"(?P<tool_call_id>.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>.*)"
    )
    # Call in progress for which only the id exists.
    self.stream_tool_call_name_regex = re.compile(
        r"(?P<tool_call_id>.+:\d+)\s*")

    if not self.model_tokenizer:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction.")

    # Resolve every marker to its vocabulary id (None when absent).
    self.tool_calls_start_token_id = self.vocab.get(
        self.tool_calls_start_token)
    self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token)
    self.tool_call_start_token_id = self.vocab.get(
        self.tool_call_start_token)
    self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)

    # Only the two section markers are mandatory.
    section_token_ids = (self.tool_calls_start_token_id,
                         self.tool_calls_end_token_id)
    if any(token_id is None for token_id in section_token_ids):
        raise RuntimeError(
            "Kimi-K2 Tool parser could not locate tool call start/end "
            "tokens in the tokenizer!")

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Extract all complete tool calls from a finished model response.

    Args:
        model_output: Full text generated by the model.
        request: The originating request; not used by this method.

    Returns:
        ``tools_called=False`` with the untouched output as content when
        the section start marker is absent or when extraction raises.
        Otherwise ``tools_called=True`` with one ``ToolCall`` per regex
        match and, as content, the text preceding the section start
        marker (``None`` if that text is empty).
    """

    # sanity check; avoid unnecessary processing
    if self.tool_calls_start_token not in model_output:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        # the pattern has two groups, so findall yields one
        # (tool_call_id, function_arguments) tuple per complete call
        function_call_tuples = self.tool_call_regex.findall(model_output)

        logger.debug("function_call_tuples: %s", function_call_tuples)

        tool_calls = []
        for function_id, function_args in function_call_tuples:
            # function_id: functions.get_weather:0 or get_weather:0
            # Drop the ":<index>" suffix first, then any dotted prefix,
            # so an id without a "." no longer raises IndexError (which
            # discarded every tool call in the response).
            function_name = function_id.split(':')[0].split('.')[-1]
            tool_calls.append(
                ToolCall(
                    id=function_id,
                    type='function',
                    function=FunctionCall(name=function_name,
                                          arguments=function_args),
                ))

        # anything before the tool-calls section is regular content
        content = model_output[:model_output.
                               find(self.tool_calls_start_token)]
        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=content if content else None,
        )

    except Exception:
        # best effort: fall back to returning the raw output as text
        logger.exception(
            "Error in extracting tool call from response.")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Turn one streamed chunk into a delta message.

    The position within the output is derived by counting per-call
    begin/end token ids in the previous and current token sequences.
    Progress is kept in ``current_tool_id``, ``current_tool_name_sent``,
    ``streamed_args_for_tool`` and ``prev_tool_call_arr``.

    Args:
        previous_text: Not used by this implementation.
        current_text: Text generated so far.
        delta_text: Text of the newest chunk.
        previous_token_ids: Token ids before this chunk.
        current_token_ids: Token ids generated so far.
        delta_token_ids: Token ids of the newest chunk.
        request: Not used by this implementation.

    Returns:
        A ``DeltaMessage`` carrying plain content, the tool id/name, or an
        argument fragment; ``None`` when the chunk should be skipped
        (including when any exception is raised while parsing).
    """

    logger.debug("delta_text: %s", delta_text)
    logger.debug("delta_token_ids: %s", delta_token_ids)
    # check to see if we should be streaming a tool call - is there a
    # tool-calls section start token in the output so far?
    if self.tool_calls_start_token_id not in current_token_ids:
        logger.debug("No tool call tokens found!")
        return DeltaMessage(content=delta_text)
    # the section markers themselves are never forwarded
    delta_text = delta_text.replace(self.tool_calls_start_token,
                                    "").replace(self.tool_calls_end_token,
                                                "")
    try:

        # figure out where we are in the parsing by counting tool call
        # start & end tags
        prev_tool_start_count = previous_token_ids.count(
            self.tool_call_start_token_id)
        prev_tool_end_count = previous_token_ids.count(
            self.tool_call_end_token_id)
        cur_tool_start_count = current_token_ids.count(
            self.tool_call_start_token_id)
        cur_tool_end_count = current_token_ids.count(
            self.tool_call_end_token_id)
        tool_call_portion = None
        text_portion = None

        # case: if we're generating text, OR rounding out a tool call
        if (cur_tool_start_count == cur_tool_end_count
                and prev_tool_end_count == cur_tool_end_count
                and self.tool_call_end_token not in delta_text):
            logger.debug("Generating text content! skipping tool parsing.")
            return DeltaMessage(content=delta_text)

        if self.tool_call_end_token in delta_text:
            logger.debug("tool_call_end_token in delta_text")
            full_text = current_text + delta_text
            # text between the last begin marker and the first end marker
            # that follows it
            tool_call_portion = full_text.split(
                self.tool_call_start_token)[-1].split(
                    self.tool_call_end_token)[0].rstrip()
            delta_text = delta_text.split(
                self.tool_call_end_token)[0].rstrip()
            # NOTE(review): delta_text was just cut at the end marker, so
            # this split yields delta_text itself (left-stripped), not the
            # text that followed the marker.
            text_portion = delta_text.split(
                self.tool_call_end_token)[-1].lstrip()

        # case -- we're starting a new tool call
        if (cur_tool_start_count > cur_tool_end_count
                and cur_tool_start_count > prev_tool_start_count):
            if len(delta_token_ids) > 1:
                tool_call_portion = current_text.split(
                    self.tool_call_start_token)[-1]
            else:
                tool_call_portion = None
                delta = None

            text_portion = None

            # set cursors and state appropriately
            self.current_tool_id += 1
            self.current_tool_name_sent = False
            self.streamed_args_for_tool.append("")
            logger.debug("Starting on a new tool %s", self.current_tool_id)

        # case -- we're updating an existing tool call
        elif (cur_tool_start_count > cur_tool_end_count
              and cur_tool_start_count == prev_tool_start_count):

            # get the portion of the text that's the tool call
            tool_call_portion = current_text.split(
                self.tool_call_start_token)[-1]
            text_portion = None

        # case -- the current tool call is being closed.
        elif (cur_tool_start_count == cur_tool_end_count
              and cur_tool_end_count >= prev_tool_end_count):
            if self.prev_tool_call_arr is None or len(
                    self.prev_tool_call_arr) == 0:
                logger.debug(
                    "attempting to close tool call, but no tool call")
                return None
            diff = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments")
            # the recorded arguments only gate this branch; the text that
            # is emitted comes from delta_text
            if diff:
                if '"}' not in delta_text:
                    return None
                # emit everything up to and including the last '"}'
                end_loc = delta_text.rindex('"}')
                diff = delta_text[:end_loc] + '"}'
                logger.debug(
                    "Finishing tool and found diff that had not "
                    "been streamed yet: %s",
                    diff,
                )
                self.streamed_args_for_tool[self.current_tool_id] += diff
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=diff).model_dump(exclude_none=True),
                    )
                ])

        # case -- otherwise we're just generating text
        else:
            text = delta_text.replace(self.tool_call_start_token, "")
            text = text.replace(self.tool_call_end_token, "")
            delta = DeltaMessage(tool_calls=[], content=text)
            return delta

        # parse id / name / arguments out of the in-progress call text
        current_tool_call = dict()
        if tool_call_portion:
            current_tool_call_matches = (
                self.stream_tool_call_portion_regex.match(
                    tool_call_portion))
            if current_tool_call_matches:
                tool_id, tool_args = (current_tool_call_matches.groups())
                # drop ":<index>" then any dotted prefix; an id without a
                # "." no longer raises IndexError
                tool_name = tool_id.split(':')[0].split('.')[-1]
                current_tool_call['id'] = tool_id
                current_tool_call["name"] = tool_name
                current_tool_call["arguments"] = tool_args
            else:
                # argument marker not generated yet: try the id alone
                current_tool_call_name_matches = (
                    self.stream_tool_call_name_regex.match(
                        tool_call_portion))
                if current_tool_call_name_matches:
                    tool_id_str, = current_tool_call_name_matches.groups()
                    # tolerate ids without a dotted prefix
                    tool_name = tool_id_str.split(':')[0].split('.')[-1]
                    current_tool_call['id'] = tool_id_str
                    current_tool_call["name"] = tool_name
                    current_tool_call["arguments"] = ""
                else:
                    logger.debug("Not enough token")
                    return None

        # case - we haven't sent the tool name yet. If it's available, send
        #   it. otherwise, wait until it's available.
        if not self.current_tool_name_sent:
            if current_tool_call is None:
                return None
            function_name: Union[str, None] = current_tool_call.get("name")
            tool_id = current_tool_call.get("id")
            if function_name:
                self.current_tool_name_sent = True
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        type="function",
                        id=tool_id,
                        function=DeltaFunctionCall(
                            name=function_name).model_dump(
                                exclude_none=True),
                    )
                ])
            else:
                return None

        # case -- otherwise, send the tool call delta

        # if the tool call portion is None, send the delta as text
        if tool_call_portion is None:
            # if there's text but not tool calls, send that -
            # otherwise None to skip chunk
            delta = (DeltaMessage(
                content=delta_text) if text_portion is not None else None)
            return delta

        # now, the nitty-gritty of tool calls
        # now we have the portion to parse as tool call.

        logger.debug("Trying to parse current tool call with ID %s",
                     self.current_tool_id)

        # if we're starting a new tool call, push an empty object in as
        #   a placeholder for the arguments
        if len(self.prev_tool_call_arr) <= self.current_tool_id:
            self.prev_tool_call_arr.append({})

        # main logic for tool parsing here - compare the previously seen
        #   argument text to the current argument text
        prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
            "arguments")
        cur_arguments = current_tool_call.get("arguments")

        logger.debug("diffing old arguments: %s", prev_arguments)
        logger.debug("against new ones: %s", cur_arguments)

        # case -- no arguments have been created yet. skip sending a delta.
        if not cur_arguments and not prev_arguments:
            logger.debug("Skipping text %s - no arguments", delta_text)
            delta = None

        # case -- prev arguments are defined, but none are now.
        #   probably impossible, but not a fatal error - just keep going
        elif not cur_arguments and prev_arguments:
            logger.error("should be impossible to have arguments reset "
                         "mid-call. skipping streaming anything.")
            delta = None

        # case -- the first argument text is available: send all of it
        elif cur_arguments and not prev_arguments:

            delta = DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    function=DeltaFunctionCall(
                        arguments=cur_arguments).model_dump(
                            exclude_none=True),
                )
            ])
            self.streamed_args_for_tool[
                self.current_tool_id] = cur_arguments

        # last case -- we have an update to existing arguments.
        elif cur_arguments and prev_arguments:
            # only a pure extension of the previous text is streamed; the
            # new suffix is the delta
            if (isinstance(delta_text, str)
                    and cur_arguments != prev_arguments
                    and len(cur_arguments) > len(prev_arguments)
                    and cur_arguments.startswith(prev_arguments)):
                delta_arguments = cur_arguments[len(prev_arguments):]
                logger.debug("got diff %s", delta_text)

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_id,
                        function=DeltaFunctionCall(
                            arguments=delta_arguments).model_dump(
                                exclude_none=True),
                    )
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] = cur_arguments
            else:
                delta = None

        # handle saving the state for the current tool into
        # the "prev" list for use in diffing for the next iteration
        if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
            self.prev_tool_call_arr[
                self.current_tool_id] = current_tool_call
        else:
            self.prev_tool_call_arr.append(current_tool_call)

        return delta

    except Exception:
        logger.exception("Error trying to handle streaming tool call.")
        return None  # do not stream a delta. skip this token ID.

Llama3JsonToolParser ¶

Bases: ToolParser

Tool call parser for Llama 3.x and 4 models intended for use with the examples/tool_chat_template_llama.jinja template.

Used when --enable-auto-tool-choice --tool-call-parser llama3_json or llama4_json are set.

Source code in vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py

@ToolParserManager.register_module("llama3_json")
@ToolParserManager.register_module("llama4_json")
class Llama3JsonToolParser(ToolParser):
    """
    Tool call parser for Llama 3.x and 4 models intended for use with the
    examples/tool_chat_template_llama.jinja template.

    Used when --enable-auto-tool-choice --tool-call-parser llama3_json or
    llama4_json are set.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase):
        """Initialise streaming state, the bot token and the tool-call regex.

        Args:
            tokenizer: tokenizer used to look up the id of ``<|python_tag|>``.
        """
        super().__init__(tokenizer)

        # initialize properties used for state when parsing tool calls in
        # streaming mode
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id: int = -1
        self.current_tool_name_sent: bool = False
        self.streamed_args_for_tool: list[str] = [
        ]  # map what has been streamed for each tool so far to a list
        self.bot_token = "<|python_tag|>"
        self.bot_token_id = tokenizer.encode(self.bot_token,
                                             add_special_tokens=False)[0]
        # Matches one JSON-like object, or several separated by semicolons.
        # Each object may contain braces nested one level deep.
        self.tool_call_regex = re.compile(
            r'{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*',
            re.DOTALL)

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.
        Only extracts JSON content and ignores any surrounding plain text.
        Supports both single JSON and multiple JSONs separated by semicolons.

        Args:
            model_output: the full text generated by the model.
            request: the originating request (not used by this parser).

        Returns:
            ``tools_called=True`` with the parsed calls and ``content=None``
            when every matched object decodes and carries a ``name`` plus
            ``arguments`` or ``parameters``; otherwise ``tools_called=False``
            with the untouched ``model_output`` as content.
        """
        # Quick check before running regex
        if not (self.bot_token in model_output or '{' in model_output):
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        # Find JSON object(s) in the text using regex
        match = self.tool_call_regex.search(model_output)
        if not match:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        try:
            json_str = match.group(0)
            decoder = json.JSONDecoder()

            tool_calls: list[ToolCall] = []
            pos = 0
            # Decode the objects one after another instead of splitting the
            # text on ';': a semicolon inside a string value (e.g. a SQL
            # statement passed as an argument) must not cut an object in two.
            while pos < len(json_str):
                # only whitespace and semicolons separate the objects
                if json_str[pos] == ';' or json_str[pos].isspace():
                    pos += 1
                    continue
                # raw_decode returns the object and the index just past it
                obj, pos = decoder.raw_decode(json_str, pos)
                tool_calls.append(
                    ToolCall(
                        type="function",
                        function=FunctionCall(
                            name=obj["name"],
                            # function call args are JSON but as a string
                            arguments=json.dumps(
                                obj["arguments"]
                                if "arguments" in obj else obj["parameters"],
                                ensure_ascii=False))))

            return ExtractedToolCallInformation(tools_called=True,
                                                tool_calls=tool_calls,
                                                content=None)

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            # return information to just treat the tool call as regular JSON
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Produce the next streaming delta for a partially generated response.

        The whole of ``current_text`` is re-parsed on every call and compared
        with the state kept on the instance (``current_tool_id``,
        ``current_tool_name_sent``, ``streamed_args_for_tool``,
        ``prev_tool_call_arr``) to decide what still has to be sent.

        Returns:
            A content delta when the text does not start with the bot token
            or ``{``; a tool-call delta carrying either a function name or an
            argument fragment; or ``None`` when there is nothing to send yet
            or an error occurred (errors are logged, not raised).
        """

        if not (current_text.startswith(self.bot_token)
                or current_text.startswith('{')):
            return DeltaMessage(content=delta_text)

        # bit mask flags for partial JSON parsing. If the name hasn't been
        # sent yet, don't allow sending
        # an incomplete string since OpenAI only ever (as far as I have
        # seen) allows sending the entire tool/ function name at once.
        flags = Allow.ALL if self.current_tool_name_sent \
            else Allow.ALL & ~Allow.STR
        try:
            tool_call_arr = []
            is_complete = []
            try:
                # depending on the prompt format the Llama model may or may not
                # prefix the output with the <|python_tag|> token
                start_idx = len(self.bot_token) if current_text.startswith(
                    self.bot_token) else 0
                while start_idx < len(current_text):
                    (obj,
                     end_idx) = partial_json_loads(current_text[start_idx:],
                                                   flags)
                    is_complete.append(
                        is_complete_json(current_text[start_idx:start_idx +
                                                      end_idx]))
                    # NOTE(review): assumes consecutive calls are separated
                    # by exactly '; ' -- confirm against the chat template.
                    start_idx += end_idx + len('; ')
                    # depending on the prompt Llama can use
                    # either arguments or parameters
                    if "parameters" in obj:
                        assert "arguments" not in obj, \
                            "model generated both parameters and arguments"
                        obj["arguments"] = obj["parameters"]
                    tool_call_arr.append(obj)
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug('not enough tokens to parse into JSON yet')
                return None

            # select as the current tool call the one we're on the state at
            # (index -1, the initial state, selects the last parsed call)
            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
                if len(tool_call_arr) > 0 else {}

            # case -- nothing could be parsed yet: stream nothing
            if len(tool_call_arr) == 0:
                return None

            # case: we are starting a new tool in the array
            #   -> array has > 0 length AND length has moved past cursor
            elif (len(tool_call_arr) > 0
                  and len(tool_call_arr) > self.current_tool_id + 1):

                # if we're moving on to a new call, first make sure we
                # haven't missed anything in the previous one that was
                # auto-generated due to JSON completions, but wasn't
                # streamed to the client yet.
                if self.current_tool_id >= 0:
                    cur_arguments = current_tool_call.get("arguments")
                    if cur_arguments:
                        cur_args_json = json.dumps(cur_arguments,
                                                   ensure_ascii=False)
                        sent = len(
                            self.streamed_args_for_tool[self.current_tool_id])
                        argument_diff = cur_args_json[sent:]

                        logger.debug("got arguments diff: %s", argument_diff)
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(index=self.current_tool_id,
                                          function=DeltaFunctionCall(
                                              arguments=argument_diff).
                                          model_dump(exclude_none=True))
                        ])
                        self.streamed_args_for_tool[
                            self.current_tool_id] += argument_diff
                    else:
                        delta = None
                else:
                    delta = None
                # re-set stuff pertaining to progress in the current tool
                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("starting on new tool %d", self.current_tool_id)
                # this branch returns without refreshing prev_tool_call_arr
                return delta

            # if the current tool name hasn't been sent, send if available
            # - otherwise send nothing
            elif not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
                if function_name:

                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
                                      id=make_tool_call_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))
                    ])
                    self.current_tool_name_sent = True
                else:
                    delta = None

            # now we know we're on the same tool call and we're streaming
            # arguments
            else:
                cur_arguments = current_tool_call.get("arguments")
                delta = None

                if cur_arguments:
                    sent = len(
                        self.streamed_args_for_tool[self.current_tool_id])
                    cur_args_json = json.dumps(cur_arguments,
                                               ensure_ascii=False)
                    prev_arguments = self.prev_tool_call_arr[
                        self.current_tool_id].get("arguments")

                    argument_diff = None
                    if is_complete[self.current_tool_id]:
                        # the call is complete: send whatever is left
                        argument_diff = cur_args_json[sent:]
                    elif prev_arguments:
                        prev_args_json = json.dumps(prev_arguments,
                                                    ensure_ascii=False)
                        if cur_args_json != prev_args_json:

                            # only the part shared by the previous and the
                            # current serialisation is sent
                            prefix = find_common_prefix(
                                prev_args_json, cur_args_json)
                            argument_diff = prefix[sent:]

                    if argument_diff is not None:
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(index=self.current_tool_id,
                                          function=DeltaFunctionCall(
                                              arguments=argument_diff).
                                          model_dump(exclude_none=True))
                        ])
                        self.streamed_args_for_tool[
                            self.current_tool_id] += argument_diff

            self.prev_tool_call_arr = tool_call_arr
            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction "
                "error")
            return None

bot_token `instance-attribute` ¶

bot_token = '<|python_tag|>'

bot_token_id `instance-attribute` ¶

bot_token_id = encode(bot_token, add_special_tokens=False)[
    0
]

current_tool_id `instance-attribute` ¶

current_tool_id: int = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile(
    "{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\\s*;\\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*",
    DOTALL,
)

init ¶

__init__(tokenizer: PreTrainedTokenizerBase)

Source code in vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py

def __init__(self, tokenizer: PreTrainedTokenizerBase):
    """Prepare streaming state, the bot token id and the tool-call regex."""
    super().__init__(tokenizer)

    # Bookkeeping used while tool calls are parsed in streaming mode.
    self.current_tool_name_sent: bool = False
    self.current_tool_id: int = -1
    self.prev_tool_call_arr: list[dict] = []
    # one entry per tool: the argument text already streamed for it
    self.streamed_args_for_tool: list[str] = []

    # Marker the model may emit in front of a tool call; keep the first id
    # the tokenizer produces for it.
    self.bot_token = "<|python_tag|>"
    bot_token_ids = tokenizer.encode(self.bot_token,
                                     add_special_tokens=False)
    self.bot_token_id = bot_token_ids[0]

    # One JSON-like object, or several separated by semicolons; each object
    # may contain braces nested one level deep.
    self.tool_call_regex = re.compile(
        r'{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*',
        flags=re.DOTALL)

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extract the tool calls from a complete model response. Only extracts JSON content and ignores any surrounding plain text. Supports both single JSON and multiple JSONs separated by semicolons.

Source code in vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """
    Extract the tool calls from a complete model response.
    Only extracts JSON content and ignores any surrounding plain text.
    Supports both single JSON and multiple JSONs separated by semicolons.

    Args:
        model_output: the full text generated by the model.
        request: the originating request (not used by this parser).

    Returns:
        ``tools_called=True`` with the parsed calls and ``content=None``
        when every matched object decodes and carries a ``name`` plus
        ``arguments`` or ``parameters``; otherwise ``tools_called=False``
        with the untouched ``model_output`` as content.
    """
    # Quick check before running regex
    if not (self.bot_token in model_output or '{' in model_output):
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    # Find JSON object(s) in the text using regex
    match = self.tool_call_regex.search(model_output)
    if not match:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        json_str = match.group(0)
        decoder = json.JSONDecoder()

        tool_calls: list[ToolCall] = []
        pos = 0
        # Decode the objects one after another instead of splitting the
        # text on ';': a semicolon inside a string value (e.g. a SQL
        # statement passed as an argument) must not cut an object in two.
        while pos < len(json_str):
            # only whitespace and semicolons separate the objects
            if json_str[pos] == ';' or json_str[pos].isspace():
                pos += 1
                continue
            # raw_decode returns the object and the index just past it
            obj, pos = decoder.raw_decode(json_str, pos)
            tool_calls.append(
                ToolCall(
                    type="function",
                    function=FunctionCall(
                        name=obj["name"],
                        # function call args are JSON but as a string
                        arguments=json.dumps(
                            obj["arguments"]
                            if "arguments" in obj else obj["parameters"],
                            ensure_ascii=False))))

        return ExtractedToolCallInformation(tools_called=True,
                                            tool_calls=tool_calls,
                                            content=None)

    except Exception:
        logger.exception("Error in extracting tool call from response.")
        # return information to just treat the tool call as regular JSON
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """
    Produce the next streaming delta for a partially generated response.

    The whole of ``current_text`` is re-parsed on every call and compared
    with the state kept on the instance (``current_tool_id``,
    ``current_tool_name_sent``, ``streamed_args_for_tool``,
    ``prev_tool_call_arr``) to decide what still has to be sent.

    Returns:
        A content delta when the text does not start with the bot token or
        ``{``; a tool-call delta carrying either a function name or an
        argument fragment; or ``None`` when there is nothing to send yet or
        an error occurred (errors are logged, not raised).
    """

    # anything that does not look like a tool call is passed through as text
    if not (current_text.startswith(self.bot_token)
            or current_text.startswith('{')):
        return DeltaMessage(content=delta_text)

    # bit mask flags for partial JSON parsing. If the name hasn't been
    # sent yet, don't allow sending
    # an incomplete string since OpenAI only ever (as far as I have
    # seen) allows sending the entire tool/ function name at once.
    flags = Allow.ALL if self.current_tool_name_sent \
        else Allow.ALL & ~Allow.STR
    try:
        # parsed calls and, per call, whether its source text is complete JSON
        tool_call_arr = []
        is_complete = []
        try:
            # depending on the prompt format the Llama model may or may not
            # prefix the output with the <|python_tag|> token
            start_idx = len(self.bot_token) if current_text.startswith(
                self.bot_token) else 0
            while start_idx < len(current_text):
                # end_idx is presumably the number of characters consumed
                # by the parsed object -- confirm in partial_json_loads
                (obj,
                 end_idx) = partial_json_loads(current_text[start_idx:],
                                               flags)
                is_complete.append(
                    is_complete_json(current_text[start_idx:start_idx +
                                                  end_idx]))
                # NOTE(review): assumes consecutive calls are separated by
                # exactly '; ' -- confirm against the chat template.
                start_idx += end_idx + len('; ')
                # depending on the prompt Llama can use
                # either arguments or parameters
                if "parameters" in obj:
                    assert "arguments" not in obj, \
                        "model generated both parameters and arguments"
                    obj["arguments"] = obj["parameters"]
                tool_call_arr.append(obj)
        except partial_json_parser.core.exceptions.MalformedJSON:
            logger.debug('not enough tokens to parse into JSON yet')
            return None

        # select as the current tool call the one we're on the state at
        # (index -1, the initial state, selects the last parsed call)
        current_tool_call: dict = tool_call_arr[self.current_tool_id] \
            if len(tool_call_arr) > 0 else {}

        # case -- nothing could be parsed yet: stream nothing
        if len(tool_call_arr) == 0:
            return None

        # case: we are starting a new tool in the array
        #   -> array has > 0 length AND length has moved past cursor
        elif (len(tool_call_arr) > 0
              and len(tool_call_arr) > self.current_tool_id + 1):

            # if we're moving on to a new call, first make sure we
            # haven't missed anything in the previous one that was
            # auto-generated due to JSON completions, but wasn't
            # streamed to the client yet.
            if self.current_tool_id >= 0:
                cur_arguments = current_tool_call.get("arguments")
                if cur_arguments:
                    cur_args_json = json.dumps(cur_arguments,
                                               ensure_ascii=False)
                    sent = len(
                        self.streamed_args_for_tool[self.current_tool_id])
                    argument_diff = cur_args_json[sent:]

                    logger.debug("got arguments diff: %s", argument_diff)
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=argument_diff).
                                      model_dump(exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += argument_diff
                else:
                    delta = None
            else:
                delta = None
            # re-set stuff pertaining to progress in the current tool
            self.current_tool_id = len(tool_call_arr) - 1
            self.current_tool_name_sent = False
            self.streamed_args_for_tool.append("")
            logger.debug("starting on new tool %d", self.current_tool_id)
            # this branch returns without refreshing prev_tool_call_arr
            return delta

        # if the current tool name hasn't been sent, send if available
        # - otherwise send nothing
        elif not self.current_tool_name_sent:
            function_name = current_tool_call.get("name")
            if function_name:

                # first delta of a call: carries the id, type and full name
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  type="function",
                                  id=make_tool_call_id(),
                                  function=DeltaFunctionCall(
                                      name=function_name).model_dump(
                                          exclude_none=True))
                ])
                self.current_tool_name_sent = True
            else:
                delta = None

        # now we know we're on the same tool call and we're streaming
        # arguments
        else:
            cur_arguments = current_tool_call.get("arguments")
            delta = None

            if cur_arguments:
                # number of argument characters already streamed
                sent = len(
                    self.streamed_args_for_tool[self.current_tool_id])
                cur_args_json = json.dumps(cur_arguments,
                                           ensure_ascii=False)
                prev_arguments = self.prev_tool_call_arr[
                    self.current_tool_id].get("arguments")

                argument_diff = None
                if is_complete[self.current_tool_id]:
                    # the call is complete: send whatever is left
                    argument_diff = cur_args_json[sent:]
                elif prev_arguments:
                    prev_args_json = json.dumps(prev_arguments,
                                                ensure_ascii=False)
                    if cur_args_json != prev_args_json:

                        # only the part shared by the previous and the
                        # current serialisation is sent
                        prefix = find_common_prefix(
                            prev_args_json, cur_args_json)
                        argument_diff = prefix[sent:]

                if argument_diff is not None:
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=argument_diff).
                                      model_dump(exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += argument_diff

        # remember this parse so the next call can diff against it
        self.prev_tool_call_arr = tool_call_arr
        return delta

    except Exception:
        logger.exception("Error trying to handle streaming tool call.")
        logger.debug(
            "Skipping chunk as a result of tool streaming extraction "
            "error")
        return None

Llama4PythonicToolParser ¶

Bases: ToolParser

Tool call parser for Llama 4 models that produce tool calls in a pythonic style. Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic

Source code in vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py

@ToolParserManager.register_module("llama4_pythonic")
class Llama4PythonicToolParser(ToolParser):
    """
    Tool call parser for Llama 4 models that produce tool calls in a
    pythonic style, i.e. a Python list of function calls.
    Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic
    """
    # TODO(mdepinet): Possible future improvements:
    #   1. Support text + tools separated by either <|python_tag|> or \n\n
    #   2. Support tools outside of a list (or separated by a semicolon).
    #      This depends on item 1 for consistent streaming.
    # Neither of these are necessary for e.g. ToolACE, but both would help make
    # Llama3.2 models more reliable.

    # Cheap pre-filter: a bracketed list of `name(key=value, ...)` calls.
    # Matching is done with a timeout (see extract_tool_calls).
    TOOL_CALL_REGEX = re.compile(
        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
        re.DOTALL)

    def __init__(self, tokenizer: PreTrainedTokenizerBase):
        """Delegate construction to the base ``ToolParser``."""
        super().__init__(tokenizer)

    # Rename for readability. This is NOT a tool id.
    @property
    def current_tool_index(self) -> int:
        """Alias for ``current_tool_id``."""
        return self.current_tool_id

    @current_tool_index.setter
    def current_tool_index(self, value: int) -> None:
        """Write through to ``current_tool_id``."""
        self.current_tool_id = value

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        Returns ``tools_called=True`` with ``content=None`` when the output
        matches ``TOOL_CALL_REGEX`` and parses as a Python list whose
        elements are all calls. In every other case (no match, regex
        timeout, parse or conversion error) the text is returned as plain
        content with ``tools_called=False``.
        """

        # remove <|python_start|> and <|python_end|>
        # as Llama 4 model sometime will output those tokens
        # (the end tag is only removed when the start tag was present)
        if model_output.startswith("<|python_start|>"):
            model_output = model_output[len("<|python_start|>"):]
            model_output = model_output.replace("<|python_end|>", "")

        is_tool_call_pattern = False
        try:
            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
                model_output,
                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
        except TimeoutError:
            # a timeout leaves the flag False: output is treated as text
            logger.warning(
                "Regex timeout occurred when matching tool call pattern.")
            logger.debug("Regex timeout occurred when matching user input: %s",
                         model_output)

        if not is_tool_call_pattern:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        try:
            module = ast.parse(model_output)
            # value of the first statement; None if it has no `value`
            parsed = getattr(module.body[0], "value", None)
            if isinstance(parsed, ast.List) and all(
                    isinstance(e, ast.Call) for e in parsed.elts):
                return ExtractedToolCallInformation(
                    tools_called=True,
                    tool_calls=[
                        _handle_single_tool(e)  # type: ignore
                        for e in parsed.elts
                    ],
                    content=None)
            else:
                raise _UnexpectedAstError(
                    "Tool output must be a list of function calls")
        except Exception:
            logger.exception("Error in extracting tool call from response.")
            # Treat as regular text
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Produce the next streaming delta for a partially generated response.

        ``current_text`` is completed into parseable Python, parsed, and each
        call at or after ``current_tool_index`` is diffed against what
        ``streamed_args_for_tool`` records as already sent.

        Returns:
            A content delta when the text starts with neither ``[`` nor
            ``<|python_start|>``; a message carrying tool-call deltas; an
            empty content delta once nothing had to be added and
            ``current_tool_id`` is positive; otherwise ``None`` (also on
            errors, which are logged rather than raised).
        """

        if not current_text.startswith("[") and not current_text.startswith(
                "<|python_start|>"):
            return DeltaMessage(content=delta_text)

        try:
            # remove <|python_start|> and <|python_end|>
            if current_text.startswith("<|python_start|>"):
                current_text = current_text[len("<|python_start|>"):]
            if current_text.endswith("<|python_end|>"):
                current_text = current_text[:current_text.
                                            rfind("<|python_end|>")]
            # presumably returns the completed text plus the suffix that was
            # appended to complete it, or None when it cannot be completed
            # yet -- confirm in _make_valid_python
            valid_and_added_text = _make_valid_python(current_text)
            if valid_and_added_text is None:
                return None
            valid_text, added_text = valid_and_added_text

            module = ast.parse(valid_text)
            parsed = getattr(module.body[0], "value", None)
            if not isinstance(parsed, ast.List) or not all(
                    isinstance(e, ast.Call) for e in parsed.elts):
                raise _UnexpectedAstError(
                    "Tool output must be a list of function calls")
            tool_calls = [
                _handle_single_tool(e)  # type: ignore
                for e in parsed.elts
            ]

            tool_deltas = []
            for index, new_call in enumerate(tool_calls):
                # calls before the cursor are not revisited
                if index < self.current_tool_index:
                    continue

                self.current_tool_index = index
                if len(self.streamed_args_for_tool) == index:
                    self.streamed_args_for_tool.append("")

                # complete when it is not the last parsed call, or when the
                # added text does not contain ")]"
                new_call_complete = index < len(
                    tool_calls) - 1 or ")]" not in added_text
                if new_call_complete:
                    self.current_tool_index += 1

                withheld_suffix = (added_text[:-2]
                                   if not new_call_complete else "")
                if not new_call_complete and added_text[-2] == ")":
                    # Function call is incomplete. Withhold the closing bracket.
                    withheld_suffix = withheld_suffix + "}"
                # Strings get single quotes in the model-produced string.
                # JSON requires double quotes.
                withheld_suffix = withheld_suffix.replace("'", '"')
                delta = _compute_tool_delta(self.streamed_args_for_tool[index],
                                            new_call, index, withheld_suffix)

                if delta is not None:
                    tool_deltas.append(delta)
                    # record the argument text sent so far for this call
                    if (delta.function is not None
                            and delta.function.arguments is not None):
                        self.streamed_args_for_tool[
                            index] += delta.function.arguments

            # HACK: serving_chat.py inspects the internal state of tool parsers
            # when determining its final streaming delta, automatically
            # adding autocompleted JSON.
            # These two lines avoid that nonsense while ensuring finish_reason
            # is set to tool_calls when at least one tool is called.
            if tool_deltas and not self.prev_tool_call_arr:
                self.prev_tool_call_arr = [{"arguments": {}}]

            if tool_deltas:
                return DeltaMessage(tool_calls=tool_deltas)
            elif not added_text and self.current_tool_id > 0:
                # Return an empty DeltaMessage once the tool calls are all done
                # so that finish_reason gets set.
                return DeltaMessage(content='')
            else:
                return None
        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction "
                "error")
            return None

TOOL_CALL_REGEX `class-attribute` `instance-attribute` ¶

TOOL_CALL_REGEX = compile(
    "\\[([a-zA-Z]+\\w*\\(([a-zA-Z]+\\w*=.*,\\s*)*([a-zA-Z]+\\w*=.*\\s)?\\),\\s*)*([a-zA-Z]+\\w*\\(([a-zA-Z]+\\w*=.*,\\s*)*([a-zA-Z]+\\w*=.*\\s*)?\\)\\s*)+\\]",
    DOTALL,
)

current_tool_index `property` `writable` ¶

current_tool_index: int

init ¶

__init__(tokenizer: PreTrainedTokenizerBase)

Source code in vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py

def __init__(self, tokenizer: PreTrainedTokenizerBase):
    """Delegate construction to the base class with the given tokenizer."""
    super().__init__(tokenizer)

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extract the tool calls from a complete model response.

Source code in vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """
    Extract the tool calls from a complete model response.

    The output is first screened with ``TOOL_CALL_REGEX`` (bounded by a
    timeout) and then parsed with ``ast``. Anything that is not a list of
    call expressions is returned unchanged as plain content.
    """
    # Llama 4 sometimes emits <|python_start|> / <|python_end|>; the end
    # marker is only stripped when the start marker was present.
    start_marker = "<|python_start|>"
    if model_output.startswith(start_marker):
        model_output = model_output[len(start_marker):].replace(
            "<|python_end|>", "")

    def _as_plain_text():
        # Shared fallback: report the (marker-stripped) text as content.
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        looks_like_tool_call = self.TOOL_CALL_REGEX.match(
            model_output,
            timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
    except TimeoutError:
        # A timed-out match is treated the same as "no match".
        looks_like_tool_call = False
        logger.warning(
            "Regex timeout occurred when matching tool call pattern.")
        logger.debug("Regex timeout occurred when matching user input: %s",
                     model_output)

    if not looks_like_tool_call:
        return _as_plain_text()

    try:
        expression = getattr(
            ast.parse(model_output).body[0], "value", None)
        well_formed = isinstance(expression, ast.List) and all(
            isinstance(element, ast.Call) for element in expression.elts)
        if not well_formed:
            raise _UnexpectedAstError(
                "Tool output must be a list of function calls")

        calls = [
            _handle_single_tool(element)  # type: ignore
            for element in expression.elts
        ]
        return ExtractedToolCallInformation(tools_called=True,
                                            tool_calls=calls,
                                            content=None)
    except Exception:
        logger.exception("Error in extracting tool call from response.")
        # Treat as regular text
        return _as_plain_text()

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """
    Incrementally extract tool calls from the text generated so far.

    Only ``current_text`` and ``delta_text`` are read here; the other
    parameters are accepted but not used in this body.

    Returns:
        A DeltaMessage carrying plain content (when the text does not look
        like a tool-call list), a DeltaMessage carrying tool-call deltas,
        an empty-content DeltaMessage, or None when nothing is emitted for
        this chunk (including when an exception was caught and logged).
    """

    # Text that starts with neither "[" nor the python-start marker is
    # passed through as ordinary content.
    if not current_text.startswith("[") and not current_text.startswith(
            "<|python_start|>"):
        return DeltaMessage(content=delta_text)

    try:
        # remove <|python_start|> and <|python_end|>
        if current_text.startswith("<|python_start|>"):
            current_text = current_text[len("<|python_start|>"):]
        if current_text.endswith("<|python_end|>"):
            current_text = current_text[:current_text.
                                        rfind("<|python_end|>")]
        # The helper returns None or a (valid_text, added_text) pair;
        # None means nothing is emitted for this chunk.
        valid_and_added_text = _make_valid_python(current_text)
        if valid_and_added_text is None:
            return None
        valid_text, added_text = valid_and_added_text

        module = ast.parse(valid_text)
        parsed = getattr(module.body[0], "value", None)
        if not isinstance(parsed, ast.List) or not all(
                isinstance(e, ast.Call) for e in parsed.elts):
            raise _UnexpectedAstError(
                "Tool output must be a list of function calls")
        tool_calls = [
            _handle_single_tool(e)  # type: ignore
            for e in parsed.elts
        ]

        tool_deltas = []
        for index, new_call in enumerate(tool_calls):
            # Calls before the current index were already handled.
            if index < self.current_tool_index:
                continue

            self.current_tool_index = index
            # First time this call is seen: start an empty record of what
            # has been streamed for it.
            if len(self.streamed_args_for_tool) == index:
                self.streamed_args_for_tool.append("")

            # A call counts as complete when another call follows it, or
            # when added_text does not contain ")]".
            new_call_complete = index < len(
                tool_calls) - 1 or ")]" not in added_text
            if new_call_complete:
                self.current_tool_index += 1

            # For an incomplete call, everything in added_text except its
            # last two characters is withheld from the streamed arguments.
            withheld_suffix = (added_text[:-2]
                               if not new_call_complete else "")
            if not new_call_complete and added_text[-2] == ")":
                # Function call is incomplete. Withhold the closing bracket.
                withheld_suffix = withheld_suffix + "}"
            # Strings get single quotes in the model-produced string.
            # JSON requires double quotes.
            withheld_suffix = withheld_suffix.replace("'", '"')
            delta = _compute_tool_delta(self.streamed_args_for_tool[index],
                                        new_call, index, withheld_suffix)

            if delta is not None:
                tool_deltas.append(delta)
                # Record the streamed arguments so the next delta is
                # computed against them.
                if (delta.function is not None
                        and delta.function.arguments is not None):
                    self.streamed_args_for_tool[
                        index] += delta.function.arguments

        # HACK: serving_chat.py inspects the internal state of tool parsers
        # when determining its final streaming delta, automatically
        # adding autocompleted JSON.
        # These two lines avoid that nonsense while ensuring finish_reason
        # is set to tool_calls when at least one tool is called.
        if tool_deltas and not self.prev_tool_call_arr:
            self.prev_tool_call_arr = [{"arguments": {}}]

        if tool_deltas:
            return DeltaMessage(tool_calls=tool_deltas)
        # NOTE(review): this branch reads current_tool_id while the loop
        # above advances current_tool_index; presumably the property is an
        # alias for the same counter - confirm against the class definition.
        elif not added_text and self.current_tool_id > 0:
            # Return an empty DeltaMessage once the tool calls are all done
            # so that finish_reason gets set.
            return DeltaMessage(content='')
        else:
            return None
    except Exception:
        # Any failure is logged and the chunk is skipped, not propagated.
        logger.exception("Error trying to handle streaming tool call.")
        logger.debug(
            "Skipping chunk as a result of tool streaming extraction "
            "error")
        return None

LongcatFlashToolParser ¶

Bases: Hermes2ProToolParser

Source code in vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py

@ToolParserManager.register_module("longcat")
class LongcatFlashToolParser(Hermes2ProToolParser):
    """Hermes2ProToolParser variant using ``<longcat_tool_call>`` tags."""

    def __init__(self, tokenizer: AnyTokenizer):
        """Override the inherited delimiters, regex and token-id tables."""
        super().__init__(tokenizer)

        start_tag = "<longcat_tool_call>"
        end_tag = "</longcat_tool_call>"
        self.tool_call_start_token: str = start_tag
        self.tool_call_end_token: str = end_tag

        # First alternative: a closed call; second: an opened call that
        # runs to the end of the text.
        self.tool_call_regex = re.compile(
            r"<longcat_tool_call>(.*?)</longcat_tool_call>|<longcat_tool_call>(.*)",
            re.DOTALL)

        tok = self.model_tokenizer
        self.tool_call_start_token_ids = tok.encode(
            start_tag, add_special_tokens=False)
        self.tool_call_end_token_ids = tok.encode(
            end_tag, add_special_tokens=False)

        # Each token id decoded on its own, giving the per-token text
        # pieces of the start and end tags.
        self.tool_call_start_token_array = [
            tok.decode([tid]) for tid in self.tool_call_start_token_ids
        ]
        self.tool_call_end_token_array = [
            tok.decode([tid]) for tid in self.tool_call_end_token_ids
        ]

tool_call_end_token `instance-attribute` ¶

tool_call_end_token: str = '</longcat_tool_call>'

tool_call_end_token_array `instance-attribute` ¶

tool_call_end_token_array = [
    (decode([token_id]))
    for token_id in (tool_call_end_token_ids)
]

tool_call_end_token_ids `instance-attribute` ¶

tool_call_end_token_ids = encode(
    tool_call_end_token, add_special_tokens=False
)

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile(
    "<longcat_tool_call>(.*?)</longcat_tool_call>|<longcat_tool_call>(.*)",
    DOTALL,
)

tool_call_start_token `instance-attribute` ¶

tool_call_start_token: str = '<longcat_tool_call>'

tool_call_start_token_array `instance-attribute` ¶

tool_call_start_token_array = [
    (decode([token_id]))
    for token_id in (tool_call_start_token_ids)
]

tool_call_start_token_ids `instance-attribute` ¶

tool_call_start_token_ids = encode(
    tool_call_start_token, add_special_tokens=False
)

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Override the inherited delimiters, regex and token-id tables."""
    super().__init__(tokenizer)

    start_tag = "<longcat_tool_call>"
    end_tag = "</longcat_tool_call>"
    self.tool_call_start_token: str = start_tag
    self.tool_call_end_token: str = end_tag

    # First alternative: a closed call; second: an opened call that runs
    # to the end of the text.
    self.tool_call_regex = re.compile(
        r"<longcat_tool_call>(.*?)</longcat_tool_call>|<longcat_tool_call>(.*)",
        re.DOTALL)

    tok = self.model_tokenizer
    self.tool_call_start_token_ids = tok.encode(start_tag,
                                                add_special_tokens=False)
    self.tool_call_end_token_ids = tok.encode(end_tag,
                                              add_special_tokens=False)

    # Each token id decoded on its own, giving the per-token text pieces
    # of the start and end tags.
    self.tool_call_start_token_array = [
        tok.decode([tid]) for tid in self.tool_call_start_token_ids
    ]
    self.tool_call_end_token_array = [
        tok.decode([tid]) for tid in self.tool_call_end_token_ids
    ]

MinimaxToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

@ToolParserManager.register_module("minimax")
class MinimaxToolParser(ToolParser):

    def __init__(self, tokenizer: AnyTokenizer):
        """
        Initialize streaming state, tag patterns and tag token ids.

        Args:
            tokenizer: Tokenizer forwarded to the base-class constructor

        Raises:
            ValueError: If ``self.model_tokenizer`` is falsy after the
                base-class constructor has run
        """
        super().__init__(tokenizer)

        # Initialize streaming state for tracking tool call progress
        self.streaming_state: dict[str, Any] = {
            "current_tool_index": -1,  # Index of current tool being processed
            "tool_ids": [],  # List of tool call IDs
            "sent_tools": [],  # List of tools that have been sent
        }

        # Define tool call tokens and patterns
        self.tool_call_start_token = "<tool_calls>"
        self.tool_call_end_token = "</tool_calls>"
        # Matches a closed <tool_calls> block, or an opened block that runs
        # to the end of the text.
        self.tool_call_regex = re.compile(
            r"<tool_calls>(.*?)</tool_calls>|<tool_calls>(.*)", re.DOTALL)
        # Kept as a plain string; callers pass it to re functions together
        # with flags=re.DOTALL.
        self.thinking_tag_pattern = r"<think>(.*?)</think>"
        self.tool_name_pattern = re.compile(r'"name":\s*"([^"]+)"')
        self.tool_args_pattern = re.compile(r'"arguments":\s*')

        # Buffer for handling partial tool calls during streaming
        self.pending_buffer = ""
        self.in_thinking_tag = False

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction.")

        # Get token IDs for tool call start/end tokens
        self.tool_call_start_token_id = self.vocab.get(
            self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)

        # Missing ids are not fatal: only a warning is logged here.
        if (self.tool_call_start_token_id is None
                or self.tool_call_end_token_id is None):
            logger.warning(
                "Minimax Tool parser could not locate tool call start/end "
                "tokens in the tokenizer. Falling back to string matching.")

    def preprocess_model_output(self, model_output: str) -> str:
        """
        Preprocess model output by removing tool calls from thinking tags.

        Args:
            model_output: Raw model output string

        Returns:
            Preprocessed model output with tool calls removed from thinking tags
        """

        def remove_tool_calls_from_think(match):
            think_content = match.group(1)
            cleaned_content = re.sub(r"<tool_calls>.*?</tool_calls>",
                                     "",
                                     think_content,
                                     flags=re.DOTALL)
            return f"<think>{cleaned_content}</think>"

        return re.sub(self.thinking_tag_pattern,
                      remove_tool_calls_from_think,
                      model_output,
                      flags=re.DOTALL)

    def _clean_duplicate_braces(self, args_text: str) -> str:
        """
        Clean duplicate closing braces from arguments text.

        Args:
            args_text: Raw arguments text

        Returns:
            Cleaned arguments text with proper JSON formatting
        """
        args_text = args_text.strip()
        if not args_text:
            return args_text

        try:
            json.loads(args_text)
            return args_text
        except json.JSONDecodeError:
            pass

        while args_text.endswith('}}'):
            candidate = args_text[:-1]
            try:
                json.loads(candidate)
                return candidate
            except json.JSONDecodeError:
                args_text = candidate

        return args_text

    def _clean_delta_braces(self, delta_text: str) -> str:
        """
        Clean delta text by removing excessive closing braces.

        Args:
            delta_text: Delta text to clean

        Returns:
            Cleaned delta text
        """
        if not delta_text:
            return delta_text

        delta_stripped = delta_text.strip()

        if delta_stripped and all(c in '}\n\r\t ' for c in delta_stripped):
            brace_count = delta_stripped.count('}')
            if brace_count > 1:
                return '}\n' if delta_text.endswith('\n') else '}'

        return delta_text

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract tool calls from model output for non-streaming mode.

        Tool calls that sit inside ``<think>`` regions are removed by
        ``preprocess_model_output`` before matching. Within each matched
        ``<tool_calls>`` block, every line that starts with ``{`` and ends
        with ``}`` is parsed as one JSON tool call.

        Args:
            model_output: Complete model output
            request: Chat completion request (not read in this method)

        Returns:
            ExtractedToolCallInformation containing tool calls and content.
            If extraction raises, the error is logged and the original
            output is returned as content with no tool calls.
        """
        processed_output = self.preprocess_model_output(model_output)

        # No tool-call block outside thinking regions: plain content.
        if self.tool_call_start_token not in processed_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        try:
            function_call_tuples = self.tool_call_regex.findall(
                processed_output)

            raw_function_calls = []
            for match in function_call_tuples:
                # Group 0 is a closed block; group 1 is an unterminated one.
                tool_call_content = match[0] if match[0] else match[1]
                if tool_call_content.strip():
                    lines = tool_call_content.strip().split('\n')
                    for line in lines:
                        line = line.strip()
                        if line and line.startswith('{') and line.endswith(
                                '}'):
                            try:
                                parsed_call = json.loads(line)
                                raw_function_calls.append(parsed_call)
                            except json.JSONDecodeError:
                                # Lines that are not valid JSON are skipped.
                                continue

            # Keep only objects carrying both "name" and "arguments".
            tool_calls = []
            for function_call in raw_function_calls:
                if "name" in function_call and "arguments" in function_call:
                    tool_calls.append(
                        ToolCall(type="function",
                                 function=FunctionCall(
                                     name=function_call["name"],
                                     arguments=json.dumps(
                                         function_call["arguments"],
                                         ensure_ascii=False))))

            # Recover the content that precedes the first tool-call block.
            # The position comes from the preprocessed text, so it is mapped
            # back onto model_output through the last non-empty line before
            # the block.
            processed_pos = processed_output.find(self.tool_call_start_token)
            if processed_pos != -1:
                processed_content = processed_output[:processed_pos].strip()

                if processed_content:
                    lines = processed_content.split('\n')
                    for line in reversed(lines):
                        line = line.strip()
                        if line:
                            # find() returns the first occurrence of that
                            # line in the original output.
                            pos = model_output.find(line)
                            if pos != -1:
                                content = model_output[:pos + len(line)]
                                break
                    else:
                        # for/else: no line could be located in model_output.
                        content = ""
                else:
                    content = ""
            else:
                content = model_output

            return ExtractedToolCallInformation(
                tools_called=len(tool_calls) > 0,
                tool_calls=tool_calls,
                content=content.strip() if content.strip() else None)

        except Exception:
            logger.exception(
                "An unexpected error occurred during tool call extraction.")
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def _update_thinking_state(self, text: str) -> None:
        """
        Update the thinking tag state based on text content.

        Args:
            text: Text to analyze for thinking tags
        """
        open_count = text.count("<think>")
        close_count = text.count("</think>")
        self.in_thinking_tag = open_count > close_count or (
            open_count == close_count and text.endswith("</think>"))

    def _is_potential_tag_start(self, text: str) -> bool:
        """
        Check if text might be the start of a tool call tag.

        Args:
            text: Text to check

        Returns:
            True if text could be the start of a tool call tag
        """
        for tag in [self.tool_call_start_token, self.tool_call_end_token]:
            if any(
                    tag.startswith(text[-i:])
                    for i in range(1, min(len(text) + 1, len(tag)))):
                return True
        return False

    def _should_buffer_content(self, delta_text: str) -> bool:
        """
        Determine if content should be buffered for later processing.

        Args:
            delta_text: Delta text to check

        Returns:
            True if content should be buffered
        """
        if self.in_thinking_tag:
            return False
        return bool(self.pending_buffer
                    or self.tool_call_start_token in delta_text
                    or self.tool_call_end_token in delta_text
                    or delta_text.startswith('<'))

    def _split_content_for_buffering(self, delta_text: str) -> tuple[str, str]:
        """
        Split delta text into safe content and potential tag content.

        Args:
            delta_text: Delta text to split

        Returns:
            Tuple of (safe_content, potential_tag_content)
        """
        if self.in_thinking_tag:
            return delta_text, ""

        for tag in [self.tool_call_start_token, self.tool_call_end_token]:
            for i in range(1, len(tag)):
                tag_prefix = tag[:i]
                pos = delta_text.rfind(tag_prefix)
                if pos != -1 and tag.startswith(delta_text[pos:]):
                    return delta_text[:pos], delta_text[pos:]
        return delta_text, ""

    def _process_buffer(self, new_content: str) -> str:
        """
        Process buffered content and return output content.

        Args:
            new_content: New content to add to buffer

        Returns:
            Processed output content
        """
        self.pending_buffer += new_content
        output_content = ""

        if self.in_thinking_tag:
            output_content = self.pending_buffer
            self.pending_buffer = ""
            return output_content

        while self.pending_buffer:
            start_pos = self.pending_buffer.find(self.tool_call_start_token)
            end_pos = self.pending_buffer.find(self.tool_call_end_token)

            if start_pos != -1 and (end_pos == -1 or start_pos < end_pos):
                tag_pos, tag_len = start_pos, len(self.tool_call_start_token)
            elif end_pos != -1:
                tag_pos, tag_len = end_pos, len(self.tool_call_end_token)
            else:
                if self._is_potential_tag_start(self.pending_buffer):
                    break
                output_content += self.pending_buffer
                self.pending_buffer = ""
                break

            output_content += self.pending_buffer[:tag_pos]
            self.pending_buffer = self.pending_buffer[tag_pos + tag_len:]

        return output_content

    def _reset_streaming_state(self) -> None:
        """Reset the streaming state to initial values."""
        self.streaming_state = {
            "current_tool_index": -1,
            "tool_ids": [],
            "sent_tools": [],
        }

    def _advance_to_next_tool(self) -> None:
        """Advance to the next tool in the streaming sequence."""
        self.streaming_state["current_tool_index"] = int(
            self.streaming_state["current_tool_index"]) + 1

    def _set_current_tool_index(self, index: int) -> None:
        """
        Set the current tool index.

        Args:
            index: Tool index to set
        """
        self.streaming_state["current_tool_index"] = index

    def _get_current_tool_index(self) -> int:
        """
        Get the current tool index.

        Returns:
            Current tool index
        """
        return int(self.streaming_state["current_tool_index"])

    def _get_next_unsent_tool_index(self, tool_count: int) -> int:
        """
        Get the index of the next unsent tool.

        Args:
            tool_count: Total number of tools

        Returns:
            Index of next unsent tool, or -1 if all tools sent
        """
        sent_tools = list(self.streaming_state["sent_tools"])
        for i in range(tool_count):
            if i < len(sent_tools):
                if not sent_tools[i]["sent_name"]:
                    return i
            else:
                return i
        return -1

    def _ensure_state_arrays(self, tool_count: int) -> None:
        """
        Ensure state arrays have sufficient capacity for tool_count tools.

        Args:
            tool_count: Number of tools to prepare for
        """
        sent_tools = list(self.streaming_state["sent_tools"])
        tool_ids = list(self.streaming_state["tool_ids"])

        while len(sent_tools) < tool_count:
            sent_tools.append({
                "sent_name": False,
                "sent_arguments": "",
                "id": make_tool_call_id(),
            })

        while len(tool_ids) < tool_count:
            tool_ids.append(None)

        self.streaming_state["sent_tools"] = sent_tools
        self.streaming_state["tool_ids"] = tool_ids

    def _detect_tools_in_text(self, text: str) -> int:
        """
        Detect the number of tools in text by counting name patterns.

        Args:
            text: Text to analyze

        Returns:
            Number of tools detected
        """
        matches = self.tool_name_pattern.findall(text)
        return len(matches)

    def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]:
        """
        Find the boundaries of tool calls in text.

        Args:
            text: Text to analyze

        Returns:
            List of (start, end) positions for tool calls
        """
        boundaries = []
        i = 0
        while i < len(text):
            if text[i] == '{':
                start = i
                depth = 0
                has_name = False
                has_arguments = False

                while i < len(text):
                    if text[i] == '{':
                        depth += 1
                    elif text[i] == '}':
                        depth -= 1
                        if depth == 0:
                            end = i + 1
                            segment = text[start:end]
                            if '"name"' in segment and '"arguments"' in segment:
                                boundaries.append((start, end))
                            break

                    if not has_name and '"name"' in text[start:i + 1]:
                        has_name = True
                    if not has_arguments and '"arguments"' in text[start:i +
                                                                   1]:
                        has_arguments = True

                    i += 1

                if depth > 0 and has_name:
                    boundaries.append((start, i))
            else:
                i += 1
        return boundaries

    def _extract_tool_args(self, tool_content: str,
                           args_match: re.Match[str]) -> str:
        """
        Extract tool arguments from tool content.

        Args:
            tool_content: Tool call content
            args_match: Regex match for arguments pattern

        Returns:
            Extracted arguments as string
        """
        args_start_pos = args_match.end()
        remaining_content = tool_content[args_start_pos:]

        if remaining_content.strip().startswith('{'):
            depth = 0
            for i, char in enumerate(remaining_content):
                if char == '{':
                    depth += 1
                elif char == '}':
                    depth -= 1
                    if depth == 0:
                        return remaining_content[:i + 1]
        else:
            args_end = remaining_content.find('}')
            if args_end > 0:
                return remaining_content[:args_end].strip()

        return remaining_content.rstrip('}').strip()

    def _get_current_tool_content(
            self, text: str,
            tool_index: int) -> tuple[Optional[str], Optional[str]]:
        """
        Get the content of a specific tool by index.

        Args:
            text: Text containing tool calls
            tool_index: Index of tool to extract

        Returns:
            Tuple of (tool_name, tool_arguments) or (None, None) if not found
        """
        boundaries = self._find_tool_boundaries(text)

        if tool_index >= len(boundaries):
            return None, None

        start, end = boundaries[tool_index]
        tool_content = text[start:end]

        name_match = self.tool_name_pattern.search(tool_content)
        name = name_match.group(1) if name_match else None

        args_match = self.tool_args_pattern.search(tool_content)
        if args_match:
            try:
                args_text = self._extract_tool_args(tool_content, args_match)
                return name, args_text
            except Exception:
                remaining_content = tool_content[args_match.end():]
                args_text = remaining_content.rstrip('}').strip()
                return name, args_text

        return name, None

    def _handle_tool_name_streaming(
            self, tool_content: str,
            tool_count: int) -> Union[DeltaMessage, None]:
        """
        Handle streaming of tool names.

        Args:
            tool_content: Content containing tool calls
            tool_count: Total number of tools

        Returns:
            DeltaMessage with tool name or None if no tool to stream
        """
        next_idx = self._get_next_unsent_tool_index(tool_count)

        if next_idx == -1:
            return None

        boundaries = self._find_tool_boundaries(tool_content)
        if next_idx >= len(boundaries):
            return None

        tool_name, _ = self._get_current_tool_content(tool_content, next_idx)
        if not tool_name:
            return None

        self._set_current_tool_index(next_idx)
        sent_tools = list(self.streaming_state["sent_tools"])
        tool_ids = list(self.streaming_state["tool_ids"])

        tool_id = sent_tools[next_idx]["id"]
        tool_ids[next_idx] = tool_id
        sent_tools[next_idx]["sent_name"] = True

        self.streaming_state["sent_tools"] = sent_tools
        self.streaming_state["tool_ids"] = tool_ids

        return DeltaMessage(tool_calls=[
            DeltaToolCall(index=next_idx,
                          type="function",
                          id=tool_id,
                          function=DeltaFunctionCall(
                              name=tool_name).model_dump(exclude_none=True))
        ])

    def _handle_tool_args_streaming(
            self, tool_content: str,
            tool_count: int) -> Union[DeltaMessage, None]:
        """
        Handle streaming of tool arguments.

        Compares the current tool's cleaned arguments text with what was
        already sent and emits only the new part. When the cleaned
        arguments end with ``}``, the current tool index is advanced.

        Args:
            tool_content: Content containing tool calls
            tool_count: Total number of tools

        Returns:
            DeltaMessage with tool arguments or None if no arguments to stream
        """
        current_idx = self._get_current_tool_index()

        if current_idx < 0 or current_idx >= tool_count:
            return None

        tool_name, tool_args = self._get_current_tool_content(
            tool_content, current_idx)
        if not tool_name or tool_args is None:
            return None

        sent_tools = list(self.streaming_state["sent_tools"])

        # Arguments are only streamed after the tool's name has been sent.
        if not sent_tools[current_idx]["sent_name"]:
            return None

        clean_args = self._clean_duplicate_braces(tool_args)
        sent_args = sent_tools[current_idx]["sent_arguments"]

        if clean_args != sent_args:
            if sent_args and clean_args.startswith(sent_args):
                # Something was sent before and the new text extends it:
                # stream just the difference.
                args_delta = extract_intermediate_diff(clean_args, sent_args)
                if args_delta:
                    args_delta = self._clean_delta_braces(args_delta)
                    # The record keeps the full cleaned text, not the
                    # brace-collapsed delta.
                    sent_tools[current_idx]["sent_arguments"] = clean_args
                    self.streaming_state["sent_tools"] = sent_tools

                    if clean_args.endswith('}'):
                        self._advance_to_next_tool()

                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(index=current_idx,
                                      function=DeltaFunctionCall(
                                          arguments=args_delta).model_dump(
                                              exclude_none=True))
                    ])
            elif not sent_args and clean_args:
                # Nothing sent yet: stream everything available so far.
                clean_args_delta = self._clean_delta_braces(clean_args)
                sent_tools[current_idx]["sent_arguments"] = clean_args
                self.streaming_state["sent_tools"] = sent_tools

                if clean_args.endswith('}'):
                    self._advance_to_next_tool()

                return DeltaMessage(tool_calls=[
                    DeltaToolCall(index=current_idx,
                                  function=DeltaFunctionCall(
                                      arguments=clean_args_delta).model_dump(
                                          exclude_none=True))
                ])

        # Unchanged arguments, or new text that does not extend what was
        # already sent: nothing is emitted.
        return None

    def _is_end_tool_calls(self, current_text: str) -> bool:
        if self.tool_call_end_token not in current_text:
            return False

        end_token_positions = []
        search_start = 0
        while True:
            pos = current_text.find(self.tool_call_end_token, search_start)
            if pos == -1:
                break
            end_token_positions.append(pos)
            search_start = pos + 1

        think_regions = []
        for match in re.finditer(self.thinking_tag_pattern,
                                 current_text,
                                 flags=re.DOTALL):
            think_regions.append((match.start(), match.end()))

        for pos in end_token_positions:
            in_think = any(pos >= t_start and pos < t_end
                           for t_start, t_end in think_regions)
            if not in_think:
                return True

        return False

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Extract tool calls from model output for streaming mode.

        Args:
            previous_text: Not read by this method
            current_text: Accumulated text; scanned for thinking tags and
                tool-call tokens
            delta_text: Newly produced text for this step
            previous_token_ids: Not read by this method
            current_token_ids: Not read by this method
            delta_token_ids: Token ids of the delta; used to detect a delta
                that is only the tool-call start token
            request: Not read by this method

        Returns:
            DeltaMessage carrying plain content or a tool-call delta, or
            None when nothing is emitted for this step
        """
        self._update_thinking_state(current_text)

        # Inside thinking tags everything is passed through as content.
        if self.in_thinking_tag:
            return DeltaMessage(content=delta_text)

        if self._should_buffer_content(delta_text):
            buffered_output = self._process_buffer(delta_text)
            return DeltaMessage(
                content=buffered_output) if buffered_output else None

        if self._is_end_tool_calls(current_text):
            return DeltaMessage(content=delta_text)

        # Hold back a trailing fragment that could be the start of a tag.
        safe_content, potential_tag = self._split_content_for_buffering(
            delta_text)
        if potential_tag:
            self.pending_buffer += potential_tag
            return DeltaMessage(content=safe_content) if safe_content else None

        processed_current_text = self.preprocess_model_output(current_text)

        if self.tool_call_start_token not in processed_current_text:
            if (self.tool_call_end_token in delta_text
                    and self.tool_call_start_token in current_text):
                return None
            if delta_text.strip(
            ) == '' and self.tool_call_start_token in current_text:
                return None
            if (self._get_current_tool_index() != -1
                    and self.tool_call_end_token in current_text):
                self._reset_streaming_state()
            return DeltaMessage(content=delta_text)

        # A delta consisting solely of the start token carries no payload.
        if (self.tool_call_start_token_id is not None
                and self.tool_call_start_token_id in delta_token_ids
                and len(delta_token_ids) == 1):
            return None

        original_tool_start = self._find_tool_start_outside_thinking(
            current_text)
        if original_tool_start is None:
            return None

        content_before_tools = self._extract_content_before_tools(
            current_text, delta_text, original_tool_start)
        if content_before_tools:
            return DeltaMessage(content=content_before_tools)

        try:
            tool_content = self._extract_tool_content(current_text,
                                                      original_tool_start)
            current_tools_count = self._detect_tools_in_text(tool_content)

            if current_tools_count == 0:
                return None

            if self._get_current_tool_index() == -1:
                self._reset_streaming_state()

            self._ensure_state_arrays(current_tools_count)

            # Tool names take priority; arguments are streamed otherwise.
            return (self._handle_tool_name_streaming(tool_content,
                                                     current_tools_count)
                    or self._handle_tool_args_streaming(
                        tool_content, current_tools_count))

        except Exception:
            # The message is a single string built by implicit concatenation.
            # Passing the second literal as a separate positional argument
            # would make logging treat it as a %-format argument and fail to
            # render the record.
            logger.exception("An unexpected error occurred "
                             "during streaming tool call handling.")
            return None

    def _find_tool_start_outside_thinking(self,
                                          current_text: str) -> Optional[int]:
        """
        Find the start position of tool calls outside of thinking tags.

        Args:
            current_text: Current text to search

        Returns:
            Position of tool call start or None if not found
        """
        search_start = 0
        while True:
            pos = current_text.find(self.tool_call_start_token, search_start)
            if pos == -1:
                return None

            think_regions = [(m.start(), m.end()) for m in re.finditer(
                r"<think>(.*?)</think>", current_text, flags=re.DOTALL)]
            in_think = any(pos >= t_start and pos < t_end
                           for t_start, t_end in think_regions)

            if not in_think:
                return pos

            search_start = pos + 1

    def _extract_content_before_tools(self, current_text: str, delta_text: str,
                                      tool_start: int) -> Optional[str]:
        """
        Extract content that appears before tool calls.

        Args:
            current_text: Current text
            delta_text: Delta text
            tool_start: Start position of tools

        Returns:
            Content before tools or None
        """
        if tool_start > 0:
            delta_start_pos = len(current_text) - len(delta_text)
            if delta_start_pos < tool_start:
                content_part = delta_text
                if delta_start_pos + len(delta_text) > tool_start:
                    content_part = delta_text[:tool_start - delta_start_pos]
                return content_part if content_part else None
        return None

    def _extract_tool_content(self, current_text: str, tool_start: int) -> str:
        """
        Extract tool content from current text starting at tool_start.

        Args:
            current_text: Current text
            tool_start: Start position of tool calls

        Returns:
            Extracted tool content
        """
        tool_content_start = tool_start + len(self.tool_call_start_token)
        tool_content = current_text[tool_content_start:]

        end_pos = tool_content.find(self.tool_call_end_token)
        if end_pos != -1:
            tool_content = tool_content[:end_pos]

        return tool_content

in_thinking_tag `instance-attribute` ¶

in_thinking_tag = False

pending_buffer `instance-attribute` ¶

pending_buffer = ''

streaming_state `instance-attribute` ¶

streaming_state: dict[str, Any] = {
    "current_tool_index": -1,
    "tool_ids": [],
    "sent_tools": [],
}

thinking_tag_pattern `instance-attribute` ¶

thinking_tag_pattern = '<think>(.*?)</think>'

tool_args_pattern `instance-attribute` ¶

tool_args_pattern = compile('"arguments":\\s*')

tool_call_end_token `instance-attribute` ¶

tool_call_end_token = '</tool_calls>'

tool_call_end_token_id `instance-attribute` ¶

tool_call_end_token_id = get(tool_call_end_token)

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile(
    "<tool_calls>(.*?)</tool_calls>|<tool_calls>(.*)",
    DOTALL,
)

tool_call_start_token `instance-attribute` ¶

tool_call_start_token = '<tool_calls>'

tool_call_start_token_id `instance-attribute` ¶

tool_call_start_token_id = get(tool_call_start_token)

tool_name_pattern `instance-attribute` ¶

tool_name_pattern = compile('"name":\\s*"([^"]+)"')

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """
    Set up streaming state, tag tokens, patterns and token ids.

    Args:
        tokenizer: Tokenizer forwarded to the base class constructor

    Raises:
        ValueError: If self.model_tokenizer is falsy after the base class
            constructor has run
    """
    super().__init__(tokenizer)

    # Progress tracking for tool calls emitted while streaming.
    self.streaming_state: dict[str, Any] = {
        "current_tool_index": -1,  # index of the tool being processed
        "tool_ids": [],  # tool call ids
        "sent_tools": [],  # per-tool record of what has been sent
    }

    # Literal tags that delimit the tool-call section.
    start_token = "<tool_calls>"
    end_token = "</tool_calls>"
    self.tool_call_start_token = start_token
    self.tool_call_end_token = end_token

    # Patterns used to locate tool calls, thinking regions, names and
    # the beginning of an arguments value.
    self.tool_call_regex = re.compile(
        r"<tool_calls>(.*?)</tool_calls>|<tool_calls>(.*)", re.DOTALL)
    self.thinking_tag_pattern = r"<think>(.*?)</think>"
    self.tool_name_pattern = re.compile(r'"name":\s*"([^"]+)"')
    self.tool_args_pattern = re.compile(r'"arguments":\s*')

    # Text held back while a tag may still be incomplete, plus the
    # thinking-tag flag.
    self.pending_buffer = ""
    self.in_thinking_tag = False

    if not self.model_tokenizer:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction.")

    # Vocabulary ids of the two tags; either is None when the tag is not
    # a vocabulary entry.
    start_id = self.vocab.get(start_token)
    self.tool_call_start_token_id = start_id
    end_id = self.vocab.get(end_token)
    self.tool_call_end_token_id = end_id

    if start_id is None or end_id is None:
        logger.warning(
            "Minimax Tool parser could not locate tool call start/end "
            "tokens in the tokenizer. Falling back to string matching.")

_advance_to_next_tool ¶

_advance_to_next_tool() -> None

Advance to the next tool in the streaming sequence.

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _advance_to_next_tool(self) -> None:
    """Advance to the next tool in the streaming sequence."""
    self.streaming_state["current_tool_index"] = int(
        self.streaming_state["current_tool_index"]) + 1

_clean_delta_braces ¶

_clean_delta_braces(delta_text: str) -> str

Clean delta text by removing excessive closing braces.

Parameters:

Name	Type	Description	Default
`delta_text`	`str`	Delta text to clean	required

Returns:

Type	Description
`str`	Cleaned delta text

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _clean_delta_braces(self, delta_text: str) -> str:
    """
    Clean delta text by removing excessive closing braces.

    Args:
        delta_text: Delta text to clean

    Returns:
        Cleaned delta text
    """
    if not delta_text:
        return delta_text

    delta_stripped = delta_text.strip()

    if delta_stripped and all(c in '}\n\r\t ' for c in delta_stripped):
        brace_count = delta_stripped.count('}')
        if brace_count > 1:
            return '}\n' if delta_text.endswith('\n') else '}'

    return delta_text

_clean_duplicate_braces ¶

_clean_duplicate_braces(args_text: str) -> str

Clean duplicate closing braces from arguments text.

Parameters:

Name	Type	Description	Default
`args_text`	`str`	Raw arguments text	required

Returns:

Type	Description
`str`	Cleaned arguments text with proper JSON formatting

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _clean_duplicate_braces(self, args_text: str) -> str:
    """
    Clean duplicate closing braces from arguments text.

    Args:
        args_text: Raw arguments text

    Returns:
        Cleaned arguments text with proper JSON formatting
    """
    args_text = args_text.strip()
    if not args_text:
        return args_text

    try:
        json.loads(args_text)
        return args_text
    except json.JSONDecodeError:
        pass

    while args_text.endswith('}}'):
        candidate = args_text[:-1]
        try:
            json.loads(candidate)
            return candidate
        except json.JSONDecodeError:
            args_text = candidate

    return args_text

_detect_tools_in_text ¶

_detect_tools_in_text(text: str) -> int

Detect the number of tools in text by counting name patterns.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to analyze	required

Returns:

Type	Description
`int`	Number of tools detected

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _detect_tools_in_text(self, text: str) -> int:
    """
    Detect the number of tools in text by counting name patterns.

    Args:
        text: Text to analyze

    Returns:
        Number of tools detected
    """
    matches = self.tool_name_pattern.findall(text)
    return len(matches)

_ensure_state_arrays ¶

_ensure_state_arrays(tool_count: int) -> None

Ensure state arrays have sufficient capacity for tool_count tools.

Parameters:

Name	Type	Description	Default
`tool_count`	`int`	Number of tools to prepare for	required

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _ensure_state_arrays(self, tool_count: int) -> None:
    """
    Ensure state arrays have sufficient capacity for tool_count tools.

    Args:
        tool_count: Number of tools to prepare for
    """
    sent_tools = list(self.streaming_state["sent_tools"])
    tool_ids = list(self.streaming_state["tool_ids"])

    while len(sent_tools) < tool_count:
        sent_tools.append({
            "sent_name": False,
            "sent_arguments": "",
            "id": make_tool_call_id(),
        })

    while len(tool_ids) < tool_count:
        tool_ids.append(None)

    self.streaming_state["sent_tools"] = sent_tools
    self.streaming_state["tool_ids"] = tool_ids

_extract_content_before_tools ¶

_extract_content_before_tools(
    current_text: str, delta_text: str, tool_start: int
) -> Optional[str]

Extract content that appears before tool calls.

Parameters:

Name	Type	Description	Default
`current_text`	`str`	Current text	required
`delta_text`	`str`	Delta text	required
`tool_start`	`int`	Start position of tools	required

Returns:

Type	Description
`Optional[str]`	Content before tools or None

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _extract_content_before_tools(self, current_text: str, delta_text: str,
                                  tool_start: int) -> Optional[str]:
    """
    Extract content that appears before tool calls.

    Args:
        current_text: Current text
        delta_text: Delta text
        tool_start: Start position of tools

    Returns:
        Content before tools or None
    """
    if tool_start > 0:
        delta_start_pos = len(current_text) - len(delta_text)
        if delta_start_pos < tool_start:
            content_part = delta_text
            if delta_start_pos + len(delta_text) > tool_start:
                content_part = delta_text[:tool_start - delta_start_pos]
            return content_part if content_part else None
    return None

_extract_tool_args ¶

_extract_tool_args(
    tool_content: str, args_match: Match[str]
) -> str

Extract tool arguments from tool content.

Parameters:

Name	Type	Description	Default
`tool_content`	`str`	Tool call content	required
`args_match`	`Match[str]`	Regex match for arguments pattern	required

Returns:

Type	Description
`str`	Extracted arguments as string

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _extract_tool_args(self, tool_content: str,
                       args_match: re.Match[str]) -> str:
    """
    Extract tool arguments from tool content.

    Args:
        tool_content: Tool call content
        args_match: Regex match for arguments pattern

    Returns:
        Extracted arguments as string
    """
    args_start_pos = args_match.end()
    remaining_content = tool_content[args_start_pos:]

    if remaining_content.strip().startswith('{'):
        depth = 0
        for i, char in enumerate(remaining_content):
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return remaining_content[:i + 1]
    else:
        args_end = remaining_content.find('}')
        if args_end > 0:
            return remaining_content[:args_end].strip()

    return remaining_content.rstrip('}').strip()

_extract_tool_content ¶

_extract_tool_content(
    current_text: str, tool_start: int
) -> str

Extract tool content from current text starting at tool_start.

Parameters:

Name	Type	Description	Default
`current_text`	`str`	Current text	required
`tool_start`	`int`	Start position of tool calls	required

Returns:

Type	Description
`str`	Extracted tool content

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _extract_tool_content(self, current_text: str, tool_start: int) -> str:
    """
    Extract tool content from current text starting at tool_start.

    Args:
        current_text: Current text
        tool_start: Start position of tool calls

    Returns:
        Extracted tool content
    """
    tool_content_start = tool_start + len(self.tool_call_start_token)
    tool_content = current_text[tool_content_start:]

    end_pos = tool_content.find(self.tool_call_end_token)
    if end_pos != -1:
        tool_content = tool_content[:end_pos]

    return tool_content

_find_tool_boundaries ¶

_find_tool_boundaries(text: str) -> list[tuple[int, int]]

Find the boundaries of tool calls in text.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to analyze	required

Returns:

Type	Description
`list[tuple[int, int]]`	List of (start, end) positions for tool calls

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]:
    """
    Find the boundaries of tool calls in text.

    Args:
        text: Text to analyze

    Returns:
        List of (start, end) positions for tool calls
    """
    boundaries = []
    i = 0
    while i < len(text):
        if text[i] == '{':
            start = i
            depth = 0
            has_name = False
            has_arguments = False

            while i < len(text):
                if text[i] == '{':
                    depth += 1
                elif text[i] == '}':
                    depth -= 1
                    if depth == 0:
                        end = i + 1
                        segment = text[start:end]
                        if '"name"' in segment and '"arguments"' in segment:
                            boundaries.append((start, end))
                        break

                if not has_name and '"name"' in text[start:i + 1]:
                    has_name = True
                if not has_arguments and '"arguments"' in text[start:i +
                                                               1]:
                    has_arguments = True

                i += 1

            if depth > 0 and has_name:
                boundaries.append((start, i))
        else:
            i += 1
    return boundaries

_find_tool_start_outside_thinking ¶

_find_tool_start_outside_thinking(
    current_text: str,
) -> Optional[int]

Find the start position of tool calls outside of thinking tags.

Parameters:

Name	Type	Description	Default
`current_text`	`str`	Current text to search	required

Returns:

Type	Description
`Optional[int]`	Position of tool call start or None if not found

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _find_tool_start_outside_thinking(self,
                                      current_text: str) -> Optional[int]:
    """
    Find the start position of tool calls outside of thinking tags.

    Args:
        current_text: Current text to search

    Returns:
        Position of tool call start or None if not found
    """
    search_start = 0
    while True:
        pos = current_text.find(self.tool_call_start_token, search_start)
        if pos == -1:
            return None

        think_regions = [(m.start(), m.end()) for m in re.finditer(
            r"<think>(.*?)</think>", current_text, flags=re.DOTALL)]
        in_think = any(pos >= t_start and pos < t_end
                       for t_start, t_end in think_regions)

        if not in_think:
            return pos

        search_start = pos + 1

_get_current_tool_content ¶

_get_current_tool_content(
    text: str, tool_index: int
) -> tuple[Optional[str], Optional[str]]

Get the content of a specific tool by index.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text containing tool calls	required
`tool_index`	`int`	Index of tool to extract	required

Returns:

Type	Description
`tuple[Optional[str], Optional[str]]`	Tuple of (tool_name, tool_arguments) or (None, None) if not found

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _get_current_tool_content(
        self, text: str,
        tool_index: int) -> tuple[Optional[str], Optional[str]]:
    """
    Get the content of a specific tool by index.

    Args:
        text: Text containing tool calls
        tool_index: Index of tool to extract

    Returns:
        Tuple of (tool_name, tool_arguments) or (None, None) if not found
    """
    boundaries = self._find_tool_boundaries(text)

    if tool_index >= len(boundaries):
        return None, None

    start, end = boundaries[tool_index]
    tool_content = text[start:end]

    name_match = self.tool_name_pattern.search(tool_content)
    name = name_match.group(1) if name_match else None

    args_match = self.tool_args_pattern.search(tool_content)
    if args_match:
        try:
            args_text = self._extract_tool_args(tool_content, args_match)
            return name, args_text
        except Exception:
            remaining_content = tool_content[args_match.end():]
            args_text = remaining_content.rstrip('}').strip()
            return name, args_text

    return name, None

_get_current_tool_index ¶

_get_current_tool_index() -> int

Get the current tool index.

Returns:

Type	Description
`int`	Current tool index

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _get_current_tool_index(self) -> int:
    """
    Get the current tool index.

    Returns:
        Current tool index
    """
    return int(self.streaming_state["current_tool_index"])

_get_next_unsent_tool_index ¶

_get_next_unsent_tool_index(tool_count: int) -> int

Get the index of the next unsent tool.

Parameters:

Name	Type	Description	Default
`tool_count`	`int`	Total number of tools	required

Returns:

Type	Description
`int`	Index of next unsent tool, or -1 if all tools sent

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _get_next_unsent_tool_index(self, tool_count: int) -> int:
    """
    Get the index of the next unsent tool.

    Args:
        tool_count: Total number of tools

    Returns:
        Index of next unsent tool, or -1 if all tools sent
    """
    sent_tools = list(self.streaming_state["sent_tools"])
    for i in range(tool_count):
        if i < len(sent_tools):
            if not sent_tools[i]["sent_name"]:
                return i
        else:
            return i
    return -1

_handle_tool_args_streaming ¶

_handle_tool_args_streaming(
    tool_content: str, tool_count: int
) -> Union[DeltaMessage, None]

Handle streaming of tool arguments.

Parameters:

Name	Type	Description	Default
`tool_content`	`str`	Content containing tool calls	required
`tool_count`	`int`	Total number of tools	required

Returns:

Type	Description
`Union[DeltaMessage, None]`	DeltaMessage with tool arguments or None if no arguments to stream

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _handle_tool_args_streaming(
        self, tool_content: str,
        tool_count: int) -> Union[DeltaMessage, None]:
    """
    Emit the not-yet-sent part of the current tool's arguments.

    Args:
        tool_content: Content containing tool calls
        tool_count: Total number of tools

    Returns:
        DeltaMessage with an arguments delta for the current tool, or None
        if there is nothing to stream
    """
    idx = self._get_current_tool_index()
    if not 0 <= idx < tool_count:
        return None

    tool_name, tool_args = self._get_current_tool_content(tool_content, idx)
    if tool_args is None or not tool_name:
        return None

    sent_tools = list(self.streaming_state["sent_tools"])
    record = sent_tools[idx]

    # Arguments are only streamed after the tool name has gone out.
    if not record["sent_name"]:
        return None

    clean_args = self._clean_duplicate_braces(tool_args)
    sent_args = record["sent_arguments"]
    if clean_args == sent_args:
        return None

    if sent_args:
        # Only a pure extension of what was already sent can be streamed.
        if not clean_args.startswith(sent_args):
            return None
        pending = extract_intermediate_diff(clean_args, sent_args)
        if not pending:
            return None
    elif clean_args:
        # Nothing sent so far: the whole cleaned text is the first delta.
        pending = clean_args
    else:
        return None

    args_delta = self._clean_delta_braces(pending)
    record["sent_arguments"] = clean_args
    self.streaming_state["sent_tools"] = sent_tools

    # A trailing '}' is taken as the end of this tool's arguments.
    if clean_args.endswith('}'):
        self._advance_to_next_tool()

    function_payload = DeltaFunctionCall(arguments=args_delta).model_dump(
        exclude_none=True)
    return DeltaMessage(
        tool_calls=[DeltaToolCall(index=idx, function=function_payload)])

_handle_tool_name_streaming ¶

_handle_tool_name_streaming(
    tool_content: str, tool_count: int
) -> Union[DeltaMessage, None]

Handle streaming of tool names.

Parameters:

Name	Type	Description	Default
`tool_content`	`str`	Content containing tool calls	required
`tool_count`	`int`	Total number of tools	required

Returns:

Type	Description
`Union[DeltaMessage, None]`	DeltaMessage with tool name or None if no tool to stream

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _handle_tool_name_streaming(
        self, tool_content: str,
        tool_count: int) -> Union[DeltaMessage, None]:
    """
    Emit the name of the next tool whose name has not been sent.

    Args:
        tool_content: Content containing tool calls
        tool_count: Total number of tools

    Returns:
        DeltaMessage announcing the tool (index, id, type and name), or
        None if no tool name is ready to be streamed
    """
    idx = self._get_next_unsent_tool_index(tool_count)
    if idx == -1:
        return None

    # The tool must already have a detected boundary in the content.
    if idx >= len(self._find_tool_boundaries(tool_content)):
        return None

    name, _unused_args = self._get_current_tool_content(tool_content, idx)
    if not name:
        return None

    self._set_current_tool_index(idx)

    state = self.streaming_state
    sent_tools = list(state["sent_tools"])
    tool_ids = list(state["tool_ids"])

    # Record the id and mark the name as sent for this tool.
    call_id = sent_tools[idx]["id"]
    tool_ids[idx] = call_id
    sent_tools[idx]["sent_name"] = True

    state["sent_tools"] = sent_tools
    state["tool_ids"] = tool_ids

    function_payload = DeltaFunctionCall(name=name).model_dump(
        exclude_none=True)
    announcement = DeltaToolCall(index=idx,
                                 type="function",
                                 id=call_id,
                                 function=function_payload)
    return DeltaMessage(tool_calls=[announcement])

_is_end_tool_calls ¶

_is_end_tool_calls(current_text: str) -> bool

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _is_end_tool_calls(self, current_text: str) -> bool:
    if self.tool_call_end_token not in current_text:
        return False

    end_token_positions = []
    search_start = 0
    while True:
        pos = current_text.find(self.tool_call_end_token, search_start)
        if pos == -1:
            break
        end_token_positions.append(pos)
        search_start = pos + 1

    think_regions = []
    for match in re.finditer(self.thinking_tag_pattern,
                             current_text,
                             flags=re.DOTALL):
        think_regions.append((match.start(), match.end()))

    for pos in end_token_positions:
        in_think = any(pos >= t_start and pos < t_end
                       for t_start, t_end in think_regions)
        if not in_think:
            return True

    return False

_is_potential_tag_start ¶

_is_potential_tag_start(text: str) -> bool

Check if text might be the start of a tool call tag.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to check	required

Returns:

Type	Description
`bool`	True if text could be the start of a tool call tag

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _is_potential_tag_start(self, text: str) -> bool:
    """
    Check if text might be the start of a tool call tag.

    Args:
        text: Text to check

    Returns:
        True if text could be the start of a tool call tag
    """
    for tag in [self.tool_call_start_token, self.tool_call_end_token]:
        if any(
                tag.startswith(text[-i:])
                for i in range(1, min(len(text) + 1, len(tag)))):
            return True
    return False

_process_buffer ¶

_process_buffer(new_content: str) -> str

Process buffered content and return output content.

Parameters:

Name	Type	Description	Default
`new_content`	`str`	New content to add to buffer	required

Returns:

Type	Description
`str`	Processed output content

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _process_buffer(self, new_content: str) -> str:
    """
    Process buffered content and return output content.

    Args:
        new_content: New content to add to buffer

    Returns:
        Processed output content
    """
    self.pending_buffer += new_content
    output_content = ""

    if self.in_thinking_tag:
        output_content = self.pending_buffer
        self.pending_buffer = ""
        return output_content

    while self.pending_buffer:
        start_pos = self.pending_buffer.find(self.tool_call_start_token)
        end_pos = self.pending_buffer.find(self.tool_call_end_token)

        if start_pos != -1 and (end_pos == -1 or start_pos < end_pos):
            tag_pos, tag_len = start_pos, len(self.tool_call_start_token)
        elif end_pos != -1:
            tag_pos, tag_len = end_pos, len(self.tool_call_end_token)
        else:
            if self._is_potential_tag_start(self.pending_buffer):
                break
            output_content += self.pending_buffer
            self.pending_buffer = ""
            break

        output_content += self.pending_buffer[:tag_pos]
        self.pending_buffer = self.pending_buffer[tag_pos + tag_len:]

    return output_content

_reset_streaming_state ¶

_reset_streaming_state() -> None

Reset the streaming state to initial values.

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _reset_streaming_state(self) -> None:
    """Reset the streaming state to initial values."""
    self.streaming_state = {
        "current_tool_index": -1,
        "tool_ids": [],
        "sent_tools": [],
    }

_set_current_tool_index ¶

_set_current_tool_index(index: int) -> None

Set the current tool index.

Parameters:

Name	Type	Description	Default
`index`	`int`	Tool index to set	required

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _set_current_tool_index(self, index: int) -> None:
    """
    Set the current tool index.

    Args:
        index: Tool index to set
    """
    self.streaming_state["current_tool_index"] = index

_should_buffer_content ¶

_should_buffer_content(delta_text: str) -> bool

Determine if content should be buffered for later processing.

Parameters:

Name	Type	Description	Default
`delta_text`	`str`	Delta text to check	required

Returns:

Type	Description
`bool`	True if content should be buffered

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _should_buffer_content(self, delta_text: str) -> bool:
    """
    Determine if content should be buffered for later processing.

    Args:
        delta_text: Delta text to check

    Returns:
        True if content should be buffered
    """
    if self.in_thinking_tag:
        return False
    return bool(self.pending_buffer
                or self.tool_call_start_token in delta_text
                or self.tool_call_end_token in delta_text
                or delta_text.startswith('<'))

_split_content_for_buffering ¶

_split_content_for_buffering(
    delta_text: str,
) -> tuple[str, str]

Split delta text into safe content and potential tag content.

Parameters:

Name	Type	Description	Default
`delta_text`	`str`	Delta text to split	required

Returns:

Type	Description
`tuple[str, str]`	Tuple of (safe_content, potential_tag_content)

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _split_content_for_buffering(self, delta_text: str) -> tuple[str, str]:
    """
    Separate a delta into text that is safe to emit and a trailing piece
    that may be the beginning of a tool-call tag.

    Args:
        delta_text: Newly generated text fragment

    Returns:
        (safe_content, potential_tag_content). The second element is ""
        when no tail of the delta is a prefix of a tool-call start or
        end token, or while inside a thinking tag.
    """
    if self.in_thinking_tag:
        return delta_text, ""

    for tag in (self.tool_call_start_token, self.tool_call_end_token):
        # Try each proper prefix of the tag, shortest first.
        for prefix_len in range(1, len(tag)):
            cut = delta_text.rfind(tag[:prefix_len])
            if cut == -1:
                continue
            tail = delta_text[cut:]
            # Everything from the cut to the end must itself be a prefix
            # of the tag for it to count as a possible partial tag.
            if tag.startswith(tail):
                return delta_text[:cut], tail

    return delta_text, ""

_update_thinking_state ¶

_update_thinking_state(text: str) -> None

Update the thinking tag state based on text content.

Parameters:

Name	Type	Description	Default
`text`	`str`	Text to analyze for thinking tags	required

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def _update_thinking_state(self, text: str) -> None:
    """
    Recompute `in_thinking_tag` from the text seen so far.

    The flag is set when there are more "<think>" than "</think>"
    occurrences, and also when the counts match but the text ends
    exactly with "</think>".

    Args:
        text: Text to scan for thinking tags
    """
    opened = text.count("<think>")
    closed = text.count("</think>")

    if opened > closed:
        self.in_thinking_tag = True
    else:
        # Balanced tags still count as "thinking" when the closing tag is
        # the very last thing in the text.
        self.in_thinking_tag = (opened == closed
                                and text.endswith("</think>"))

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extract tool calls from model output for non-streaming mode.

Parameters:

Name	Type	Description	Default
`model_output`	`str`	Complete model output	required
`request`	`ChatCompletionRequest`	Chat completion request	required

Returns:

Type	Description
`ExtractedToolCallInformation`	ExtractedToolCallInformation containing tool calls and content

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """
    Pull tool calls out of a complete (non-streaming) model response.

    Args:
        model_output: Complete model output
        request: Chat completion request

    Returns:
        ExtractedToolCallInformation with the parsed tool calls and the
        text preceding them. When no tool-call start token survives
        preprocessing, or when anything goes wrong, the raw output is
        returned as plain content with no tool calls.
    """
    processed_output = self.preprocess_model_output(model_output)

    if self.tool_call_start_token not in processed_output:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        # Each regex match carries two groups; use the first non-empty one.
        parsed_calls = []
        for groups in self.tool_call_regex.findall(processed_output):
            block_text = groups[0] or groups[1]
            # One JSON object per line; an empty block yields a single
            # empty line, which the brace check below rejects.
            for raw_line in block_text.strip().split('\n'):
                candidate = raw_line.strip()
                if not (candidate.startswith('{')
                        and candidate.endswith('}')):
                    continue
                try:
                    parsed_calls.append(json.loads(candidate))
                except json.JSONDecodeError:
                    continue

        # Only objects carrying both keys become tool calls.
        tool_calls = [
            ToolCall(type="function",
                     function=FunctionCall(
                         name=call["name"],
                         arguments=json.dumps(call["arguments"],
                                              ensure_ascii=False)))
            for call in parsed_calls
            if "name" in call and "arguments" in call
        ]

        start_idx = processed_output.find(self.tool_call_start_token)
        if start_idx == -1:
            content = model_output
        else:
            # Map the text before the tool calls back onto the raw
            # output: find the last non-blank processed line in the raw
            # output and keep everything up to the end of that line.
            content = ""
            leading_text = processed_output[:start_idx].strip()
            for raw_line in reversed(leading_text.split('\n')):
                anchor = raw_line.strip()
                if not anchor:
                    continue
                anchor_pos = model_output.find(anchor)
                if anchor_pos != -1:
                    content = model_output[:anchor_pos + len(anchor)]
                    break

        return ExtractedToolCallInformation(
            tools_called=bool(tool_calls),
            tool_calls=tool_calls,
            content=content.strip() or None)

    except Exception:
        logger.exception(
            "An unexpected error occurred during tool call extraction.")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """
    Incrementally extract tool calls while the model output is streaming.

    Args:
        previous_text: Text generated before this delta (unused here)
        current_text: All text generated so far, including this delta
        delta_text: Text added by this step
        previous_token_ids: Token ids before this delta (unused here)
        current_token_ids: All token ids so far (unused here)
        delta_token_ids: Token ids added by this step
        request: Chat completion request (unused here)

    Returns:
        A DeltaMessage carrying either plain content or a tool-call
        delta, or None when nothing should be emitted for this step.
    """
    self._update_thinking_state(current_text)

    # Inside a thinking block the text is passed through untouched.
    if self.in_thinking_tag:
        return DeltaMessage(content=delta_text)

    # The delta may contain (part of) a tool-call tag: route it through
    # the pending buffer and emit whatever that yields, if anything.
    if self._should_buffer_content(delta_text):
        buffered_output = self._process_buffer(delta_text)
        return DeltaMessage(
            content=buffered_output) if buffered_output else None

    if self._is_end_tool_calls(current_text):
        return DeltaMessage(content=delta_text)

    # Hold back a trailing fragment that could be the start of a tag and
    # emit only the part that is certainly ordinary content.
    safe_content, potential_tag = self._split_content_for_buffering(
        delta_text)
    if potential_tag:
        self.pending_buffer += potential_tag
        return DeltaMessage(content=safe_content) if safe_content else None

    processed_current_text = self.preprocess_model_output(current_text)

    # No tool-call start token left after preprocessing.
    if self.tool_call_start_token not in processed_current_text:
        if (self.tool_call_end_token in delta_text
                and self.tool_call_start_token in current_text):
            return None
        if delta_text.strip(
        ) == '' and self.tool_call_start_token in current_text:
            return None
        if (self._get_current_tool_index() != -1
                and self.tool_call_end_token in current_text):
            self._reset_streaming_state()
        return DeltaMessage(content=delta_text)

    # A delta consisting solely of the start token carries nothing to emit.
    if (self.tool_call_start_token_id is not None
            and self.tool_call_start_token_id in delta_token_ids
            and len(delta_token_ids) == 1):
        return None

    original_tool_start = self._find_tool_start_outside_thinking(
        current_text)
    if original_tool_start is None:
        return None

    content_before_tools = self._extract_content_before_tools(
        current_text, delta_text, original_tool_start)
    if content_before_tools:
        return DeltaMessage(content=content_before_tools)

    try:
        tool_content = self._extract_tool_content(current_text,
                                                  original_tool_start)
        current_tools_count = self._detect_tools_in_text(tool_content)

        if current_tools_count == 0:
            return None

        if self._get_current_tool_index() == -1:
            self._reset_streaming_state()

        self._ensure_state_arrays(current_tools_count)

        # Try the name handler first; fall back to the arguments handler
        # when it returns nothing.
        return (self._handle_tool_name_streaming(tool_content,
                                                 current_tools_count)
                or self._handle_tool_args_streaming(
                    tool_content, current_tools_count))

    except Exception:
        # The two literals are joined by implicit concatenation. Passing
        # the second one as a separate argument would make logging treat
        # it as a %-format argument for a message with no placeholder,
        # and the message would be lost to a formatting error.
        logger.exception("An unexpected error occurred "
                         "during streaming tool call handling.")
        return None

preprocess_model_output ¶

preprocess_model_output(model_output: str) -> str

Preprocess model output by removing tool calls from thinking tags.

Parameters:

Name	Type	Description	Default
`model_output`	`str`	Raw model output string	required

Returns:

Type	Description
`str`	Preprocessed model output with tool calls removed from thinking tags

Source code in vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

def preprocess_model_output(self, model_output: str) -> str:
    """
    Strip tool-call blocks that appear inside thinking tags.

    Args:
        model_output: Raw model output string

    Returns:
        The output with every <tool_calls>...</tool_calls> span removed
        from within each span matched by `self.thinking_tag_pattern`;
        text outside those spans is left untouched.
    """

    def _strip_tool_calls(think_match):
        # Group 1 is the text between the thinking tags.
        inner = re.sub(r"<tool_calls>.*?</tool_calls>",
                       "",
                       think_match.group(1),
                       flags=re.DOTALL)
        return "<think>" + inner + "</think>"

    return re.sub(self.thinking_tag_pattern,
                  _strip_tool_calls,
                  model_output,
                  flags=re.DOTALL)

MistralToolParser ¶

Bases: ToolParser

Tool call parser for Mistral 7B Instruct v0.3, intended for use with:

- mistral_common
- the examples/tool_chat_template_mistral.jinja template

Used when --enable-auto-tool-choice and --tool-call-parser mistral are both set.

Source code in vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py

@ToolParserManager.register_module("mistral")
class MistralToolParser(ToolParser):
    """
    Tool call parser for Mistral 7B Instruct v0.3, intended for use with
    - [`mistral_common`](https://github.com/mistralai/mistral-common/)
    - the examples/tool_chat_template_mistral.jinja template.

    Used when --enable-auto-tool-choice --tool-call-parser mistral are both
    set.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """
        Set up streaming state, the tool-call marker token and the regexes.

        Raises:
            RuntimeError: if the "[TOOL_CALLS]" token is not present in the
                vocabulary.
        """
        super().__init__(tokenizer)

        if not isinstance(self.model_tokenizer, MistralTokenizer):
            logger.info("Non-Mistral tokenizer detected when using a Mistral "
                        "model...")

        # initialize properties used for state when parsing tool calls in
        # streaming mode
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id: int = -1
        self.current_tool_name_sent: bool = False
        self.streamed_args_for_tool: list[str] = [
        ]  # map what has been streamed for each tool so far to a list
        # marker that precedes the tool calls in the model output
        self.bot_token = "[TOOL_CALLS]"
        self.bot_token_id = self.vocab.get(self.bot_token)
        # fallback pattern: a JSON array of objects anywhere in the text
        self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
        if _is_fn_name_regex_support(self.model_tokenizer):
            # group 1: function name, group 2: JSON object with arguments
            self.fn_name_regex = re.compile(
                r'([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)', re.DOTALL)
        else:
            self.fn_name_regex = None

        if self.bot_token_id is None:
            raise RuntimeError(
                "Mistral Tool Parser could not locate the tool call token in "
                "the tokenizer!")

    def adjust_request(
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """
        Disable special-token skipping when tools are in play and the
        tokenizer is not a MistralTokenizer; return the (same) request.
        """
        if not isinstance(
                self.model_tokenizer, MistralTokenizer
        ) and request.tools and request.tool_choice != 'none':
            # Do not skip special tokens when using chat template
            # with Mistral parser as TOOL_CALL token is needed
            # for tool detection.
            # Note: we don't want skip_special_tokens=False
            # with MistralTokenizer as it is incompatible
            request.skip_special_tokens = False
        return request

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        If the tool call token is absent, the output is returned as plain
        content. Otherwise the token is removed and the remainder is parsed
        either with `fn_name_regex` (name followed by a JSON arguments
        object) or as JSON directly, with a regex-based fallback on a JSON
        decode error. Any other failure is logged and the stripped text is
        returned as plain content.

        NOTE(review): earlier documentation said this requires
        find-and-replacing single quotes with double quotes; no such
        replacement is performed in this method.
        """

        # case -- if a tool call token is not present, return a text response
        if self.bot_token not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        # first remove the BOT token
        tool_content = model_output.replace(self.bot_token, "").strip()

        try:
            # we first try to directly load the json as parsing very nested
            # jsons is difficult
            try:
                if self.fn_name_regex:
                    matches = self.fn_name_regex.findall(tool_content)

                    function_call_arr = []
                    for match in matches:
                        fn_name = match[0]
                        args = match[1]

                        # fn_name is encoded outside serialized json dump
                        # only arguments are serialized
                        function_call_arr.append({
                            "name": fn_name,
                            "arguments": json.loads(args)
                        })
                else:
                    function_call_arr = json.loads(tool_content)
            except json.JSONDecodeError:
                # use a regex to find the part corresponding to the tool call.
                # NOTE: This use case should not happen if the model is trained
                # correctly. It's an easy possible fix so it's included, but
                # can be brittle for very complex / highly nested tool calls
                # (an empty findall result raises IndexError here, which the
                # outer `except Exception` handles)
                raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
                function_call_arr = json.loads(raw_tool_call)

            # Tool Call
            tool_calls: list[MistralToolCall] = [
                MistralToolCall(
                    type="function",
                    function=FunctionCall(
                        name=raw_function_call["name"],
                        # function call args are JSON but as a string
                        arguments=json.dumps(raw_function_call["arguments"],
                                             ensure_ascii=False)))
                for raw_function_call in function_call_arr
            ]

            # get any content before the tool call
            content = model_output.split(self.bot_token)[0]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if len(content) > 0 else None)

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            # return information to just treat the tool call as regular JSON
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=tool_content)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Produce the next streaming delta for the text generated so far.

        Returns a DeltaMessage with plain content while no tool call token
        has been seen, a DeltaMessage with a tool-call name or arguments
        fragment once one has, or None when there is nothing to send for
        this step (including on any error, which is logged).
        """

        # if the tool call token is not in the tokens generated so far, append
        # output to contents since it's not a tool
        if self.bot_token not in current_text:
            return DeltaMessage(content=delta_text)

        # if the tool call token ID IS in the tokens generated so far, that
        # means we're parsing as tool calls now

        # handle if we detected the BOT token which means the start of tool
        # calling
        if (self.bot_token_id in delta_token_ids
                and len(delta_token_ids) == 1):
            # if it's the only token, return None, so we don't send a chat
            # completion and don't send a control token
            return None

        # bit mask flags for partial JSON parsing. If the name hasn't been
        # sent yet, don't allow sending
        # an incomplete string since OpenAI only ever (as far as I have
        # seen) allows sending the entire tool/ function name at once.
        flags = Allow.ALL if self.current_tool_name_sent \
            else Allow.ALL & ~Allow.STR
        try:

            # keep only the text that follows the last BOT token; this is
            # the (possibly still incomplete) tool call payload
            parsable_arr = current_text.split(self.bot_token)[-1]

            # tool calls are generated in an array, so do partial JSON
            # parsing on the entire array
            try:
                tool_call_arr: list[dict] = partial_json_parser.loads(
                    parsable_arr, flags)
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug('not enough tokens to parse into JSON yet')
                return None

            # select as the current tool call the one the cursor is on
            # (with current_tool_id == -1 this picks the last element)

            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
                if len(tool_call_arr) > 0 else {}

            # case -- if no tokens have been streamed for the tool, e.g.
            #   only the array brackets, stream nothing
            if len(tool_call_arr) == 0:
                return None

            # case: we are starting a new tool in the array
            #   -> array has > 0 length AND length has moved past cursor
            elif (len(tool_call_arr) > 0
                  and len(tool_call_arr) > self.current_tool_id + 1):

                # if we're moving on to a new call, first make sure we
                # haven't missed anything in the previous one that was
                # auto-generated due to JSON completions, but wasn't
                # streamed to the client yet.
                if self.current_tool_id >= 0:
                    diff: Union[str, None] = current_tool_call.get("arguments")

                    if diff:
                        # serialize the full arguments and drop the part
                        # that was already streamed for this tool
                        diff = json.dumps(diff, ensure_ascii=False).replace(
                            self.streamed_args_for_tool[self.current_tool_id],
                            "")
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(index=self.current_tool_id,
                                          function=DeltaFunctionCall(
                                              arguments=diff).model_dump(
                                                  exclude_none=True))
                        ])
                        self.streamed_args_for_tool[
                            self.current_tool_id] += diff
                    else:
                        delta = None
                else:
                    delta = None
                # re-set stuff pertaining to progress in the current tool
                # NOTE: this path returns without updating
                # prev_tool_call_arr
                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("starting on new tool %d", self.current_tool_id)
                return delta

            # case: update an existing tool - this is handled below

            # if the current tool name hasn't been sent, send if available
            # - otherwise send nothing
            if not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
                if function_name:

                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
                                      id=MistralToolCall.generate_random_id(),
                                      function=DeltaFunctionCall(
                                          name=function_name).model_dump(
                                              exclude_none=True))
                    ])
                    self.current_tool_name_sent = True
                else:
                    delta = None

            # now we know we're on the same tool call and we're streaming
            # arguments
            else:

                prev_arguments = self.prev_tool_call_arr[
                    self.current_tool_id].get("arguments")
                cur_arguments = current_tool_call.get("arguments")

                # normalize quotes in the delta and cut it at the last '"}'
                # so it can be located inside the serialized arguments
                new_text = delta_text.replace("\'", "\"")
                if ('"}' in new_text):
                    new_text = new_text[:new_text.rindex('"}')]

                if not cur_arguments and not prev_arguments:

                    delta = None
                elif not cur_arguments and prev_arguments:
                    logger.error(
                        "INVARIANT - impossible to have arguments reset "
                        "mid-arguments")
                    delta = None
                elif cur_arguments and not prev_arguments:
                    # first arguments seen for this tool; the last two
                    # characters of the serialized form are dropped before
                    # searching for the delta text
                    cur_arguments_json = json.dumps(cur_arguments,
                                                    ensure_ascii=False)[:-2]
                    logger.debug("finding %s in %s", new_text,
                                 cur_arguments_json)

                    if (new_text not in cur_arguments_json):
                        return None
                    # send everything up to and including the last
                    # occurrence of the delta text
                    arguments_delta = cur_arguments_json[:cur_arguments_json.
                                                         rindex(new_text) +
                                                         len(new_text)]
                    logger.debug("First tokens in arguments received: %s",
                                 arguments_delta)
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=arguments_delta).
                                      model_dump(exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += arguments_delta

                elif cur_arguments and prev_arguments:
                    cur_args_json = json.dumps(cur_arguments,
                                               ensure_ascii=False)
                    prev_args_json = json.dumps(prev_arguments,
                                                ensure_ascii=False)
                    logger.debug("Searching for diff between \n%s\n%s",
                                 cur_args_json, prev_args_json)

                    argument_diff = extract_intermediate_diff(
                        cur_args_json, prev_args_json)
                    logger.debug("got arguments diff: %s", argument_diff)
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=argument_diff).model_dump(
                                              exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += argument_diff
                else:
                    # NOTE(review): the four branches above already cover
                    # every truthiness combination of cur_arguments and
                    # prev_arguments, so this branch appears unreachable.
                    delta = None

            # remember the parsed array so the next call can compare
            # against it, then return whatever delta was built (possibly
            # None)
            self.prev_tool_call_arr = tool_call_arr
            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction "
                "error")
            return None

bot_token `instance-attribute` ¶

bot_token = '[TOOL_CALLS]'

bot_token_id `instance-attribute` ¶

bot_token_id = get(bot_token)

current_tool_id `instance-attribute` ¶

current_tool_id: int = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

fn_name_regex `instance-attribute` ¶

fn_name_regex = compile(
    "([a-zA-Z0-9_-]+)(\\{[\\s\\S]*?\\})(?=\\s*$|,|\\s)",
    DOTALL,
)

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile('\\[{.*}\\]', DOTALL)

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """
    Prepare streaming state, the tool-call marker and the parsing regexes.

    Raises:
        RuntimeError: if the "[TOOL_CALLS]" token cannot be found in the
            vocabulary.
    """
    super().__init__(tokenizer)

    uses_mistral_tokenizer = isinstance(self.model_tokenizer,
                                        MistralTokenizer)
    if not uses_mistral_tokenizer:
        logger.info(
            "Non-Mistral tokenizer detected when using a Mistral model...")

    # State carried between calls while parsing tool calls in streaming
    # mode.
    self.prev_tool_call_arr: list[dict] = []
    self.current_tool_id: int = -1
    self.current_tool_name_sent: bool = False
    # What has been streamed so far, one entry per tool.
    self.streamed_args_for_tool: list[str] = []

    self.bot_token = "[TOOL_CALLS]"
    self.bot_token_id = self.vocab.get(self.bot_token)
    self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)

    # The name-plus-arguments pattern is only built when the helper says
    # the tokenizer supports it.
    supports_fn_name = _is_fn_name_regex_support(self.model_tokenizer)
    self.fn_name_regex = (re.compile(
        r'([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)', re.DOTALL)
                          if supports_fn_name else None)

    if self.bot_token_id is None:
        raise RuntimeError(
            "Mistral Tool Parser could not locate the tool call token in "
            "the tokenizer!")

adjust_request ¶

adjust_request(
    request: ChatCompletionRequest,
) -> ChatCompletionRequest

Source code in vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py

def adjust_request(
        self, request: ChatCompletionRequest) -> ChatCompletionRequest:
    """
    Keep special tokens in the output when they are needed to detect
    tool calls, and hand the same request object back.
    """
    # With a MistralTokenizer the request is left alone:
    # skip_special_tokens=False is incompatible with it.
    if isinstance(self.model_tokenizer, MistralTokenizer):
        return request

    # With a chat template the TOOL_CALL token has to survive decoding so
    # the parser can spot tool calls.
    if request.tools and request.tool_choice != 'none':
        request.skip_special_tokens = False
    return request

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extract the tool calls from a complete model response. Requires find-and-replacing single quotes with double quotes for JSON parsing, so make sure your tool call arguments never include quotes!

Source code in vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """
    Extract the tool calls from a complete model response.

    Without a tool call token the output is returned as plain content.
    Otherwise the token is removed and the rest is parsed, either through
    `fn_name_regex` (function name followed by a JSON arguments object)
    or as JSON directly, falling back to `tool_call_regex` on a JSON
    decode error. Any other failure is logged and the stripped text is
    returned as plain content.
    """
    # No tool call token at all: this is an ordinary text response.
    if self.bot_token not in model_output:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    # Drop the BOT token before parsing.
    tool_content = model_output.replace(self.bot_token, "").strip()

    try:
        # Direct parsing comes first, since very nested JSON is hard to
        # pick apart with a regex.
        try:
            if self.fn_name_regex:
                # The name sits outside the serialized JSON; only the
                # arguments are JSON-encoded.
                function_call_arr = [{
                    "name": fn_name,
                    "arguments": json.loads(raw_args)
                } for fn_name, raw_args in self.fn_name_regex.findall(
                    tool_content)]
            else:
                function_call_arr = json.loads(tool_content)
        except json.JSONDecodeError:
            # Regex fallback. This should not be needed for a correctly
            # trained model and can be brittle for complex or deeply
            # nested tool calls.
            raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
            function_call_arr = json.loads(raw_tool_call)

        tool_calls: list[MistralToolCall] = []
        for raw_function_call in function_call_arr:
            call_name = raw_function_call["name"]
            # Arguments travel as a JSON-encoded string.
            call_args = json.dumps(raw_function_call["arguments"],
                                   ensure_ascii=False)
            tool_calls.append(
                MistralToolCall(type="function",
                                function=FunctionCall(name=call_name,
                                                      arguments=call_args)))

        # Whatever preceded the first BOT token is regular content.
        content = model_output.split(self.bot_token)[0]
        return ExtractedToolCallInformation(tools_called=True,
                                            tool_calls=tool_calls,
                                            content=content or None)

    except Exception:
        logger.exception("Error in extracting tool call from response.")
        # Treat the would-be tool call as regular text.
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=tool_content)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """
    Incrementally turn streamed model output into tool-call deltas.

    On every call the text following the last ``self.bot_token`` in
    ``current_text`` is parsed as a (possibly incomplete) JSON array of
    tool calls. Progress is kept on the instance in ``current_tool_id``,
    ``current_tool_name_sent``, ``streamed_args_for_tool`` and
    ``prev_tool_call_arr``, so the method must be called once per chunk,
    in order.

    Only ``current_text``, ``delta_text`` and ``delta_token_ids`` are
    read by this implementation; the remaining parameters are unused
    here.

    Returns:
        * ``DeltaMessage(content=delta_text)`` while ``self.bot_token``
          has not appeared in ``current_text``.
        * A ``DeltaMessage`` carrying a single ``DeltaToolCall`` with
          either the function name or a fragment of the arguments.
        * ``None`` when there is nothing to send for this chunk, which
          includes the case where any exception was raised while
          processing (the exception is logged, not propagated).
    """

    # if the tool call token is not in the text generated so far, pass
    # the delta through as regular content since it's not a tool call
    if self.bot_token not in current_text:
        return DeltaMessage(content=delta_text)

    # from here on the tool call token IS in the text generated so far,
    # which means we're parsing as tool calls now

    # handle if we detected the BOT token which means the start of tool
    # calling
    if (self.bot_token_id in delta_token_ids
            and len(delta_token_ids) == 1):
        # if it's the only token, return None, so we don't send a chat
        # completion and don't send a control token
        return None

    # bit mask flags for partial JSON parsing. If the name hasn't been
    # sent yet, don't allow sending
    # an incomplete string since OpenAI only ever (as far as I have
    # seen) allows sending the entire tool/ function name at once.
    flags = Allow.ALL if self.current_tool_name_sent \
        else Allow.ALL & ~Allow.STR
    try:

        # keep only the text after the last BOT token; that is the part
        # expected to hold the JSON array of tool calls
        parsable_arr = current_text.split(self.bot_token)[-1]

        # tool calls are generated in an array, so do partial JSON
        # parsing on the entire array
        try:
            tool_call_arr: list[dict] = partial_json_parser.loads(
                parsable_arr, flags)
        except partial_json_parser.core.exceptions.MalformedJSON:
            logger.debug('not enough tokens to parse into JSON yet')
            return None

        # select the tool call the cursor currently points at. While the
        # cursor is still -1 this indexes the last element of the array.

        current_tool_call: dict = tool_call_arr[self.current_tool_id] \
            if len(tool_call_arr) > 0 else {}

        # case -- if no tokens have been streamed for the tool, e.g.
        #   only the array brackets, stream nothing
        if len(tool_call_arr) == 0:
            return None

        # case: we are starting a new tool in the array
        #   -> array has > 0 length AND length has moved past cursor
        elif (len(tool_call_arr) > 0
              and len(tool_call_arr) > self.current_tool_id + 1):

            # if we're moving on to a new call, first make sure we
            # haven't missed anything in the previous one that was
            # auto-generated due to JSON completions, but wasn't
            # streamed to the client yet.
            if self.current_tool_id >= 0:
                diff: Union[str, None] = current_tool_call.get("arguments")

                if diff:
                    # serialize the full arguments and strip the part
                    # already streamed; what is left is the unsent tail
                    diff = json.dumps(diff, ensure_ascii=False).replace(
                        self.streamed_args_for_tool[self.current_tool_id],
                        "")
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=diff).model_dump(
                                              exclude_none=True))
                    ])
                    self.streamed_args_for_tool[
                        self.current_tool_id] += diff
                else:
                    delta = None
            else:
                delta = None
            # re-set stuff pertaining to progress in the current tool
            self.current_tool_id = len(tool_call_arr) - 1
            self.current_tool_name_sent = False
            self.streamed_args_for_tool.append("")
            logger.debug("starting on new tool %d", self.current_tool_id)
            # NOTE: this early return skips the prev_tool_call_arr update
            # performed at the end of the method
            return delta

        # case: update an existing tool - this is handled below

        # if the current tool name hasn't been sent, send if available
        # - otherwise send nothing
        if not self.current_tool_name_sent:
            function_name = current_tool_call.get("name")
            if function_name:

                # the name delta is also where the tool call id and type
                # are sent
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  type="function",
                                  id=MistralToolCall.generate_random_id(),
                                  function=DeltaFunctionCall(
                                      name=function_name).model_dump(
                                          exclude_none=True))
                ])
                self.current_tool_name_sent = True
            else:
                delta = None

        # now we know we're on the same tool call and we're streaming
        # arguments
        else:

            prev_arguments = self.prev_tool_call_arr[
                self.current_tool_id].get("arguments")
            cur_arguments = current_tool_call.get("arguments")

            # normalise single quotes in the raw delta to double quotes
            # and cut it off at the last '"}' if one is present
            new_text = delta_text.replace("\'", "\"")
            if ('"}' in new_text):
                new_text = new_text[:new_text.rindex('"}')]

            if not cur_arguments and not prev_arguments:

                # no arguments seen yet, nothing to stream
                delta = None
            elif not cur_arguments and prev_arguments:
                logger.error(
                    "INVARIANT - impossible to have arguments reset "
                    "mid-arguments")
                delta = None
            elif cur_arguments and not prev_arguments:
                # first time arguments show up for this tool. The last
                # two characters of the serialized form are dropped
                # (presumably the closing characters added by the partial
                # parser's auto-completion -- TODO confirm).
                cur_arguments_json = json.dumps(cur_arguments,
                                                ensure_ascii=False)[:-2]
                logger.debug("finding %s in %s", new_text,
                             cur_arguments_json)

                # wait for a later chunk if the raw delta cannot be
                # located in the serialized arguments
                if (new_text not in cur_arguments_json):
                    return None
                # send everything up to and including the last
                # occurrence of the raw delta text
                arguments_delta = cur_arguments_json[:cur_arguments_json.
                                                     rindex(new_text) +
                                                     len(new_text)]
                logger.debug("First tokens in arguments received: %s",
                             arguments_delta)
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  function=DeltaFunctionCall(
                                      arguments=arguments_delta).
                                  model_dump(exclude_none=True))
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] += arguments_delta

            elif cur_arguments and prev_arguments:
                # arguments were already being streamed: send only the
                # difference between the previous and current snapshot
                cur_args_json = json.dumps(cur_arguments,
                                           ensure_ascii=False)
                prev_args_json = json.dumps(prev_arguments,
                                            ensure_ascii=False)
                logger.debug("Searching for diff between \n%s\n%s",
                             cur_args_json, prev_args_json)

                argument_diff = extract_intermediate_diff(
                    cur_args_json, prev_args_json)
                logger.debug("got arguments diff: %s", argument_diff)
                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  function=DeltaFunctionCall(
                                      arguments=argument_diff).model_dump(
                                          exclude_none=True))
                ])
                self.streamed_args_for_tool[
                    self.current_tool_id] += argument_diff
            else:
                # the four branches above cover every combination of
                # cur_arguments / prev_arguments truthiness, so this
                # branch is a defensive fallback
                delta = None

        # remember the parsed array so the next call can diff against it,
        # then return whatever delta (possibly None) was built above
        self.prev_tool_call_arr = tool_call_arr
        return delta

    except Exception:
        # best-effort: a failure on one chunk is logged and the chunk is
        # skipped rather than aborting the stream
        logger.exception("Error trying to handle streaming tool call.")
        logger.debug(
            "Skipping chunk as a result of tool streaming extraction "
            "error")
        return None

OpenAIToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py

@ToolParserManager.register_module("openai")
class OpenAIToolParser(ToolParser):
    """
    Tool call parser registered under the name ``openai``.

    Tool calls are recovered from the messages that
    ``parse_output_into_messages`` produces for the generated token IDs,
    not from the decoded text. Streaming extraction is not implemented by
    this class.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
        token_ids: Sequence[int] | None = None,
    ) -> ExtractedToolCallInformation:
        """
        Extract tool calls and final content from a complete response.

        Args:
            model_output: Decoded text of the response (not used here).
            request: The originating chat completion request (not used
                here).
            token_ids: Generated token IDs; required.

        Returns:
            An ``ExtractedToolCallInformation`` holding one ``ToolCall``
            per message addressed to a ``functions.*`` recipient, and the
            text of the last message on the ``final`` channel (if any) as
            ``content``.

        Raises:
            NotImplementedError: If ``token_ids`` is ``None``.
        """
        if token_ids is None:
            raise NotImplementedError(
                "OpenAIToolParser requires token IDs and does not support text-based extraction."  # noqa: E501
            )

        parser = parse_output_into_messages(token_ids)
        tool_calls = []
        final_content = None

        for msg in parser.messages:
            # messages without any content carry nothing to extract
            if len(msg.content) < 1:
                continue
            msg_text = msg.content[0].text
            if msg.recipient and msg.recipient.startswith("functions."):
                # If no content-type is given assume JSON, as that's the
                # most common case with gpt-oss models.
                if not msg.content_type or "json" in msg.content_type:
                    # load and dump the JSON text to check validity and
                    # remove any extra newlines or other odd formatting
                    try:
                        tool_args = json.dumps(json.loads(msg_text))
                    except json.JSONDecodeError:
                        logger.exception(
                            "Error decoding JSON tool call from response.")
                        # fall back to the raw text so the call is not
                        # lost
                        tool_args = msg_text
                else:
                    tool_args = msg_text
                tool_calls.append(
                    ToolCall(
                        type="function",
                        function=FunctionCall(
                            # Strip only the leading namespace. Splitting
                            # on "functions." would truncate a tool name
                            # that itself contains that substring.
                            name=msg.recipient.removeprefix("functions."),
                            arguments=tool_args,
                        ),
                    ))
            elif msg.channel == "final":
                final_content = msg_text

        return ExtractedToolCallInformation(
            tools_called=len(tool_calls) > 0,
            tool_calls=tool_calls,
            content=final_content,
        )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Not implemented for this parser.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Not being used, manual parsing in serving_chat.py"  # noqa: E501
        )

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Initialize the parser by forwarding ``tokenizer`` to the parent."""
    super().__init__(tokenizer)

extract_tool_calls ¶

extract_tool_calls(
    model_output: str,
    request: ChatCompletionRequest,
    token_ids: Sequence[int] | None = None,
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
    token_ids: Sequence[int] | None = None,
) -> ExtractedToolCallInformation:
    """
    Extract tool calls and final content from a complete response.

    Args:
        model_output: Decoded text of the response (not used here).
        request: The originating chat completion request (not used here).
        token_ids: Generated token IDs; required.

    Returns:
        An ``ExtractedToolCallInformation`` holding one ``ToolCall`` per
        message addressed to a ``functions.*`` recipient, and the text of
        the last message on the ``final`` channel (if any) as
        ``content``.

    Raises:
        NotImplementedError: If ``token_ids`` is ``None``.
    """
    if token_ids is None:
        raise NotImplementedError(
            "OpenAIToolParser requires token IDs and does not support text-based extraction."  # noqa: E501
        )

    parser = parse_output_into_messages(token_ids)
    tool_calls = []
    final_content = None

    for msg in parser.messages:
        # messages without any content carry nothing to extract
        if len(msg.content) < 1:
            continue
        msg_text = msg.content[0].text
        if msg.recipient and msg.recipient.startswith("functions."):
            # If no content-type is given assume JSON, as that's the
            # most common case with gpt-oss models.
            if not msg.content_type or "json" in msg.content_type:
                # load and dump the JSON text to check validity and
                # remove any extra newlines or other odd formatting
                try:
                    tool_args = json.dumps(json.loads(msg_text))
                except json.JSONDecodeError:
                    logger.exception(
                        "Error decoding JSON tool call from response.")
                    # fall back to the raw text so the call is not lost
                    tool_args = msg_text
            else:
                tool_args = msg_text
            tool_calls.append(
                ToolCall(
                    type="function",
                    function=FunctionCall(
                        # Strip only the leading namespace. Splitting on
                        # "functions." would truncate a tool name that
                        # itself contains that substring.
                        name=msg.recipient.removeprefix("functions."),
                        arguments=tool_args,
                    ),
                ))
        elif msg.channel == "final":
            final_content = msg_text

    return ExtractedToolCallInformation(
        tools_called=len(tool_calls) > 0,
        tool_calls=tool_calls,
        content=final_content,
    )

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> DeltaMessage | None

Source code in vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> DeltaMessage | None:
    """
    Not implemented for this parser.

    None of the arguments are inspected.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        "Not being used, manual parsing in serving_chat.py"  # noqa: E501
    )

Phi4MiniJsonToolParser ¶

Bases: ToolParser

Tool call parser for phi-4-mini models intended for use with the examples/tool_chat_template_llama.jinja template.

Used when --enable-auto-tool-choice --tool-call-parser phi4_mini_json
are all set

Source code in vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py

@ToolParserManager.register_module("phi4_mini_json")
class Phi4MiniJsonToolParser(ToolParser):
    """
    Tool call parser for phi-4-mini models intended for use with the
    examples/tool_chat_template_llama.jinja template.

    Used when --enable-auto-tool-choice --tool-call-parser phi4_mini_json
    are all set
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None:
        super().__init__(tokenizer)

        # initialize properties used for state when parsing tool calls in
        # streaming mode
        self.prev_tool_call_arr: list[dict[str, Any]] = []
        self.current_tool_id: int = -1
        self.current_tool_name_sent: bool = False
        self.streamed_args_for_tool: list[str] = [
        ]  # map what has been streamed for each tool so far to a list
        self.bot_token: str = "functools"

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        The response is expected to contain ``functools[...]`` where the
        bracketed part is a JSON array of objects with a ``name`` and
        either ``arguments`` or ``parameters``.

        Args:
            model_output: Full decoded text of the model response.
            request: The originating chat completion request (not used
                here).

        Returns:
            ``tools_called=True`` with the parsed calls and
            ``content=None`` when at least one call was extracted.
            Otherwise ``tools_called=False`` with the unmodified
            ``model_output`` as ``content``; this covers a missing
            marker, an unparsable or empty array, and malformed entries.
        """
        logger.debug("Model output: %s", model_output)

        # single fallback used by every "treat as plain text" exit
        plain_text = ExtractedToolCallInformation(tools_called=False,
                                                  tool_calls=[],
                                                  content=model_output)

        start = model_output.find(self.bot_token + "[")
        if start == -1:
            logger.debug("No function calls found")
            return plain_text

        try:
            # Decode the JSON array starting at the opening bracket. A
            # non-greedy "up to the first ]" match would cut the array
            # short whenever an argument contains a list or a "]"
            # character; raw_decode honours JSON nesting and ignores
            # trailing text.
            function_call_arr, _ = json.JSONDecoder().raw_decode(
                model_output, start + len(self.bot_token))
        except json.JSONDecodeError as e:
            logger.error(
                "Failed to parse function calls from model output. "
                "Error: %s", str(e))
            # do not claim tools were called when nothing could be parsed
            return plain_text

        if not function_call_arr:
            logger.debug("No function calls found")
            return plain_text

        logger.debug("Successfully extracted %d function calls",
                     len(function_call_arr))

        try:
            tool_calls: list[ToolCall] = [
                ToolCall(
                    id=make_tool_call_id(),
                    type="function",
                    function=FunctionCall(
                        name=raw_function_call["name"],
                        # function call args are JSON but as a string
                        arguments=json.dumps(
                            raw_function_call["arguments"]
                            if "arguments" in raw_function_call else
                            raw_function_call["parameters"],
                            ensure_ascii=False),
                    )) for raw_function_call in function_call_arr
            ]
        except Exception:
            # e.g. an entry that is not an object or lacks "name"
            logger.exception("Error in extracting tool call from response.")
            return plain_text

        # text surrounding the tool call array is not returned as content
        return ExtractedToolCallInformation(tools_called=True,
                                            tool_calls=tool_calls,
                                            content=None)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Optional[DeltaMessage]:
        """Streaming extraction is not implemented; always returns None."""

        return None

bot_token `instance-attribute` ¶

bot_token: str = 'functools'

current_tool_id `instance-attribute` ¶

current_tool_id: int = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict[str, Any]] = []

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

__init__ ¶

__init__(tokenizer: PreTrainedTokenizerBase) -> None

Source code in vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py

def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None:
    """Set up the parser and reset all streaming-mode bookkeeping."""
    super().__init__(tokenizer)

    # literal that precedes the tool call array in the model output
    self.bot_token: str = "functools"

    # state used while parsing tool calls in streaming mode
    self.current_tool_name_sent: bool = False
    self.current_tool_id: int = -1
    self.prev_tool_call_arr: list[dict[str, Any]] = []
    # one entry per tool: what has been streamed for it so far
    self.streamed_args_for_tool: list[str] = []

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extract the tool calls from a complete model response.

Source code in vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """
    Extract the tool calls from a complete model response.

    The response is expected to contain ``functools[...]`` where the
    bracketed part is a JSON array of objects with a ``name`` and either
    ``arguments`` or ``parameters``.

    Args:
        model_output: Full decoded text of the model response.
        request: The originating chat completion request (not used here).

    Returns:
        ``tools_called=True`` with the parsed calls and ``content=None``
        when at least one call was extracted. Otherwise
        ``tools_called=False`` with the unmodified ``model_output`` as
        ``content``; this covers a missing marker, an unparsable or empty
        array, and malformed entries.
    """
    logger.debug("Model output: %s", model_output)

    # single fallback used by every "treat as plain text" exit
    plain_text = ExtractedToolCallInformation(tools_called=False,
                                              tool_calls=[],
                                              content=model_output)

    start = model_output.find(self.bot_token + "[")
    if start == -1:
        logger.debug("No function calls found")
        return plain_text

    try:
        # Decode the JSON array starting at the opening bracket. A
        # non-greedy "up to the first ]" match would cut the array short
        # whenever an argument contains a list or a "]" character;
        # raw_decode honours JSON nesting and ignores trailing text.
        function_call_arr, _ = json.JSONDecoder().raw_decode(
            model_output, start + len(self.bot_token))
    except json.JSONDecodeError as e:
        logger.error(
            "Failed to parse function calls from model output. "
            "Error: %s", str(e))
        # do not claim tools were called when nothing could be parsed
        return plain_text

    if not function_call_arr:
        logger.debug("No function calls found")
        return plain_text

    logger.debug("Successfully extracted %d function calls",
                 len(function_call_arr))

    try:
        tool_calls: list[ToolCall] = [
            ToolCall(
                id=make_tool_call_id(),
                type="function",
                function=FunctionCall(
                    name=raw_function_call["name"],
                    # function call args are JSON but as a string
                    arguments=json.dumps(
                        raw_function_call["arguments"]
                        if "arguments" in raw_function_call else
                        raw_function_call["parameters"],
                        ensure_ascii=False),
                )) for raw_function_call in function_call_arr
        ]
    except Exception:
        # e.g. an entry that is not an object or lacks "name"
        logger.exception("Error in extracting tool call from response.")
        return plain_text

    # text surrounding the tool call array is not returned as content
    return ExtractedToolCallInformation(tools_called=True,
                                        tool_calls=tool_calls,
                                        content=None)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Optional[DeltaMessage]

Source code in vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Optional[DeltaMessage]:
    """
    Streaming extraction is not implemented for this parser.

    None of the arguments are inspected.

    Returns:
        Always ``None``.
    """

    return None

PythonicToolParser ¶

Bases: ToolParser

Tool call parser for models that produce tool calls in a pythonic style, such as Llama 3.2 and Llama 4 models.

Used when --enable-auto-tool-choice --tool-call-parser pythonic are all set

Source code in vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py

@ToolParserManager.register_module("pythonic")
class PythonicToolParser(ToolParser):
    """
    Tool call parser for models that produce tool calls in a pythonic style,
    such as Llama 3.2 and Llama 4 models.

    Used when --enable-auto-tool-choice --tool-call-parser pythonic are all set
    """
    # TODO(mdepinet): Possible future improvements:
    #   1. Support text + tools separated by either <|python_tag|> or \n\n
    #   2. Support tools outside of a list (or separated by a semicolon).
    #      This depends on item 1 for consistent streaming.
    # Neither of these are necessary for e.g. ToolACE, but both would help make
    # Llama3.2 models more reliable.

    # Coarse pre-filter for output shaped like a bracketed list of calls,
    # e.g. "[name(arg=value, ...), other(arg=value)]". The authoritative
    # check is the ast parse done in extract_tool_calls.
    TOOL_CALL_REGEX = re.compile(
        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
        re.DOTALL)

    def __init__(self, tokenizer: PreTrainedTokenizerBase):
        """Initialize the parser by forwarding ``tokenizer`` to the parent."""
        super().__init__(tokenizer)

    # Rename for readability. This is NOT a tool id.
    @property
    def current_tool_index(self) -> int:
        """Alias for ``self.current_tool_id``."""
        return self.current_tool_id

    @current_tool_index.setter
    def current_tool_index(self, value: int) -> None:
        self.current_tool_id = value

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        The output is first matched against ``TOOL_CALL_REGEX`` and then
        parsed with ``ast``. Whenever the output does not match, the regex
        times out, or parsing/conversion fails, the whole ``model_output``
        is returned as plain content with ``tools_called=False``.
        ``request`` is not used here.
        """
        is_tool_call_pattern = False
        try:
            # the match is bounded by a configurable timeout
            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
                model_output,
                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
        except TimeoutError:
            # a timeout leaves is_tool_call_pattern False, i.e. the output
            # is treated as regular text below
            logger.warning(
                "Regex timeout occurred when matching tool call pattern.")
            logger.debug("Regex timeout occurred when matching user input: %s",
                         model_output)

        if not is_tool_call_pattern:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        try:
            module = ast.parse(model_output)
            # only the first statement is inspected; it must be an
            # expression whose value is a list made up solely of calls
            parsed = getattr(module.body[0], "value", None)
            if isinstance(parsed, ast.List) and all(
                    isinstance(e, ast.Call) for e in parsed.elts):
                return ExtractedToolCallInformation(
                    tools_called=True,
                    tool_calls=[
                        _handle_single_tool(e)  # type: ignore
                        for e in parsed.elts
                    ],
                    content=None)
            else:
                raise _UnexpectedAstError(
                    "Tool output must be a list of function calls")
        except Exception:
            logger.exception("Error in extracting tool call from response.")
            # Treat as regular text
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Incrementally turn streamed model output into tool-call deltas.

        Only ``current_text`` and ``delta_text`` are read. Progress is
        kept on the instance in ``current_tool_index`` (an alias of
        ``current_tool_id``), ``streamed_args_for_tool`` and
        ``prev_tool_call_arr``.

        Returns:
            * ``DeltaMessage(content=delta_text)`` when ``current_text``
              does not start with ``"["``.
            * ``DeltaMessage(tool_calls=...)`` when at least one tool delta
              was produced for this chunk.
            * ``DeltaMessage(content='')`` when no delta was produced, no
              text had to be added to make the output parseable, and
              ``current_tool_id > 0``.
            * ``None`` otherwise, including when any exception is raised
              (it is logged, not propagated).
        """

        # anything that does not begin a list is plain content
        if not current_text.startswith("["):
            return DeltaMessage(content=delta_text)

        try:
            # _make_valid_python yields None or a (valid_text, added_text)
            # pair; presumably added_text is what was appended to
            # current_text to make it parse -- see that helper to confirm
            valid_and_added_text = _make_valid_python(current_text)
            if valid_and_added_text is None:
                return None
            valid_text, added_text = valid_and_added_text

            module = ast.parse(valid_text)
            parsed = getattr(module.body[0], "value", None)
            if not isinstance(parsed, ast.List) or not all(
                    isinstance(e, ast.Call) for e in parsed.elts):
                raise _UnexpectedAstError(
                    "Tool output must be a list of function calls")
            tool_calls = [
                _handle_single_tool(e)  # type: ignore
                for e in parsed.elts
            ]

            tool_deltas = []
            for index, new_call in enumerate(tool_calls):
                # skip calls positioned before the current cursor
                if index < self.current_tool_index:
                    continue

                self.current_tool_index = index
                # first time this position is seen: start its record of
                # streamed arguments
                if len(self.streamed_args_for_tool) == index:
                    self.streamed_args_for_tool.append("")

                # a call is complete if another call follows it, or if
                # the added text does not contain ")]"
                new_call_complete = index < len(
                    tool_calls) - 1 or ")]" not in added_text
                if new_call_complete:
                    self.current_tool_index += 1

                # for an incomplete call, hold back the added text minus
                # its last two characters so it is not streamed yet
                withheld_suffix = (added_text[:-2]
                                   if not new_call_complete else "")
                if not new_call_complete and added_text[-2] == ")":
                    # Function call is incomplete. Withhold the closing bracket.
                    withheld_suffix = withheld_suffix + "}"
                # Strings get single quotes in the model-produced string.
                # JSON requires double quotes.
                withheld_suffix = withheld_suffix.replace("'", '"')
                delta = _compute_tool_delta(self.streamed_args_for_tool[index],
                                            new_call, index, withheld_suffix)

                if delta is not None:
                    tool_deltas.append(delta)
                    # record what has now been streamed for this call
                    if (delta.function is not None
                            and delta.function.arguments is not None):
                        self.streamed_args_for_tool[
                            index] += delta.function.arguments

            # HACK: serving_chat.py inspects the internal state of tool parsers
            # when determining its final streaming delta, automatically
            # adding autocompleted JSON.
            # These two lines avoid that nonsense while ensuring finish_reason
            # is set to tool_calls when at least one tool is called.
            if tool_deltas and not self.prev_tool_call_arr:
                self.prev_tool_call_arr = [{"arguments": {}}]

            if tool_deltas:
                return DeltaMessage(tool_calls=tool_deltas)
            elif not added_text and self.current_tool_id > 0:
                # Return an empty DeltaMessage once the tool calls are all done
                # so that finish_reason gets set.
                return DeltaMessage(content='')
            else:
                return None
        except Exception:
            # best-effort: a failure on one chunk is logged and the chunk
            # is skipped rather than aborting the stream
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction "
                "error")
            return None

TOOL_CALL_REGEX `class-attribute` `instance-attribute` ¶

TOOL_CALL_REGEX = compile(
    "\\[([a-zA-Z]+\\w*\\(([a-zA-Z]+\\w*=.*,\\s*)*([a-zA-Z]+\\w*=.*\\s)?\\),\\s*)*([a-zA-Z]+\\w*\\(([a-zA-Z]+\\w*=.*,\\s*)*([a-zA-Z]+\\w*=.*\\s*)?\\)\\s*)+\\]",
    DOTALL,
)

current_tool_index `property` `writable` ¶

current_tool_index: int

__init__ ¶

__init__(tokenizer: PreTrainedTokenizerBase)

Source code in vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py

def __init__(self, tokenizer: PreTrainedTokenizerBase):
    """Initialize the parser by forwarding ``tokenizer`` to the parent."""
    super().__init__(tokenizer)

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extract the tool calls from a complete model response.

Source code in vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """
    Parse a finished model response into tool calls.

    The response counts as a tool call only if it matches
    ``TOOL_CALL_REGEX`` (within the configured timeout) and its first
    statement parses to a list consisting solely of function calls. In
    every other case the response is handed back unchanged as content
    with ``tools_called=False``. ``request`` is not used here.
    """

    def _plain_text() -> ExtractedToolCallInformation:
        # shared result for every "this is not a tool call" exit
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        regex_match = self.TOOL_CALL_REGEX.match(
            model_output,
            timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS)
    except TimeoutError:
        logger.warning(
            "Regex timeout occurred when matching tool call pattern.")
        logger.debug("Regex timeout occurred when matching user input: %s",
                     model_output)
        # a timeout is handled exactly like a failed match
        regex_match = None

    if regex_match is None:
        return _plain_text()

    try:
        tree = ast.parse(model_output)
        call_list = getattr(tree.body[0], "value", None)
        looks_like_calls = isinstance(call_list, ast.List) and all(
            isinstance(node, ast.Call) for node in call_list.elts)
        if not looks_like_calls:
            raise _UnexpectedAstError(
                "Tool output must be a list of function calls")

        converted = [
            _handle_single_tool(node)  # type: ignore
            for node in call_list.elts
        ]
        return ExtractedToolCallInformation(tools_called=True,
                                            tool_calls=converted,
                                            content=None)
    except Exception:
        logger.exception("Error in extracting tool call from response.")
        # fall back to regular text
        return _plain_text()

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Produce the next streaming delta for a pythonic tool-call response.

    Output that does not begin with ``[`` is forwarded unchanged as content.
    Otherwise the accumulated text is passed to ``_make_valid_python``, the
    result is parsed as a list of call expressions, and for every call that
    has not been finished yet a delta is computed against what was already
    streamed for it.

    Returns:
        A ``DeltaMessage`` with content or tool-call deltas, or ``None``
        when there is nothing to emit for this chunk. Any exception is
        logged and also results in ``None``.
    """
    if not current_text.startswith("["):
        return DeltaMessage(content=delta_text)

    try:
        completion = _make_valid_python(current_text)
        if completion is None:
            return None
        valid_text, added_text = completion

        root = getattr(ast.parse(valid_text).body[0], "value", None)
        is_call_list = isinstance(root, ast.List) and all(
            isinstance(elt, ast.Call) for elt in root.elts)
        if not is_call_list:
            raise _UnexpectedAstError(
                "Tool output must be a list of function calls")
        tool_calls = [
            _handle_single_tool(elt)  # type: ignore
            for elt in root.elts
        ]

        last_index = len(tool_calls) - 1
        # True when the text returned alongside the valid Python contains
        # ")]", which this method treats as "the last call is still open".
        closers_were_added = ")]" in added_text

        tool_deltas = []
        for index, call in enumerate(tool_calls):
            # Calls before the current index were already fully handled.
            if index < self.current_tool_index:
                continue

            self.current_tool_index = index
            if len(self.streamed_args_for_tool) == index:
                self.streamed_args_for_tool.append("")

            call_complete = index < last_index or not closers_were_added
            if call_complete:
                self.current_tool_index += 1
                held_back = ""
            else:
                # Everything in added_text except its final two characters
                # is withheld from the streamed arguments.
                held_back = added_text[:-2]
                if added_text[-2] == ")":
                    held_back += "}"
            # Strings get single quotes in the model-produced string.
            # JSON requires double quotes.
            held_back = held_back.replace("'", '"')

            delta = _compute_tool_delta(self.streamed_args_for_tool[index],
                                        call, index, held_back)
            if delta is None:
                continue

            tool_deltas.append(delta)
            if (delta.function is not None
                    and delta.function.arguments is not None):
                self.streamed_args_for_tool[
                    index] += delta.function.arguments

        if tool_deltas:
            # HACK: serving_chat.py inspects the internal state of tool
            # parsers when determining its final streaming delta,
            # automatically adding autocompleted JSON. Seeding
            # prev_tool_call_arr avoids that while ensuring finish_reason
            # is set to tool_calls when at least one tool is called.
            if not self.prev_tool_call_arr:
                self.prev_tool_call_arr = [{"arguments": {}}]
            return DeltaMessage(tool_calls=tool_deltas)

        if not added_text and self.current_tool_id > 0:
            # Return an empty DeltaMessage once the tool calls are all done
            # so that finish_reason gets set.
            return DeltaMessage(content='')

        return None
    except Exception:
        logger.exception("Error trying to handle streaming tool call.")
        logger.debug(
            "Skipping chunk as a result of tool streaming extraction "
            "error")
        return None

Qwen3CoderToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

@ToolParserManager.register_module("qwen3_coder")
class Qwen3CoderToolParser(ToolParser):

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up sentinel tokens, regexes and streaming state.

        Raises:
            ValueError: If ``self.model_tokenizer`` is falsy after the base
                class has been initialised.
            RuntimeError: If the tool call start or end token is not found
                in ``self.vocab``.
        """
        super().__init__(tokenizer)

        # Sentinel tokens for streaming mode
        self.tool_call_start_token: str = "<tool_call>"
        self.tool_call_end_token: str = "</tool_call>"
        self.tool_call_prefix: str = "<function="
        self.function_end_token: str = "</function>"
        self.parameter_prefix: str = "<parameter="
        self.parameter_end_token: str = "</parameter>"

        # Patterns used to slice tool calls, functions and parameters out
        # of the model output.
        self.tool_call_complete_regex = re.compile(
            r"<tool_call>(.*?)</tool_call>", re.DOTALL)
        self.tool_call_regex = re.compile(
            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL)
        self.tool_call_function_regex = re.compile(
            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL)
        self.tool_call_parameter_regex = re.compile(
            r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
            re.DOTALL)

        # Bookkeeping shared with the serving layer.
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
        # Override base class type - we use string IDs for tool calls
        self.current_tool_id: Optional[str] = None  # type: ignore
        self.streamed_args_for_tool: list[str] = []
        self.is_tool_call_started: bool = False
        self.failed_count: int = 0

        # Enhanced streaming state - reset for each new message
        self._reset_streaming_state()

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction.")

        # Resolve the ids of the start/end sentinels; both must exist.
        start_id = self.vocab.get(self.tool_call_start_token)
        end_id = self.vocab.get(self.tool_call_end_token)
        self.tool_call_start_token_id = start_id
        self.tool_call_end_token_id = end_id
        if start_id is None or end_id is None:
            raise RuntimeError(
                "Qwen3 XML Tool parser could not locate tool call start/end "
                "tokens in the tokenizer!")

        logger.info("vLLM Successfully import tool parser %s !",
                    self.__class__.__name__)

    def _generate_tool_call_id(self) -> str:
        """Generate a unique tool call ID."""
        return f"call_{uuid.uuid4().hex[:24]}"

    def _reset_streaming_state(self):
        """Reset all streaming state."""
        self.current_tool_index = 0
        self.is_tool_call_started = False
        self.header_sent = False
        self.current_tool_id = None
        self.current_function_name = None
        self.current_param_name = None
        self.current_param_value = ""
        self.param_count = 0
        self.in_param = False
        self.in_function = False
        self.accumulated_text = ""
        self.json_started = False
        self.json_closed = False
        # Store accumulated parameters for type conversion
        self.accumulated_params = {}
        self.streaming_request = None

    def _get_arguments_config(
            self, func_name: str,
            tools: Optional[list[ChatCompletionToolsParam]]) -> dict:
        """Extract argument configuration for a function."""
        if tools is None:
            return {}
        for config in tools:
            if not hasattr(config, "type") or not (hasattr(
                    config, "function") and hasattr(config.function, "name")):
                continue
            if config.type == "function" and config.function.name == func_name:
                if not hasattr(config.function, "parameters"):
                    return {}
                params = config.function.parameters
                if isinstance(params, dict) and "properties" in params:
                    return params["properties"]
                elif isinstance(params, dict):
                    return params
                else:
                    return {}
        logger.warning("Tool '%s' is not defined in the tools list.",
                       func_name)
        return {}

    def _convert_param_value(self, param_value: str, param_name: str,
                             param_config: dict, func_name: str) -> Any:
        """Convert parameter value based on its type in the schema."""
        # Handle null value for any type
        if param_value.lower() == "null":
            return None

        if param_name not in param_config:
            if param_config != {}:
                logger.warning(
                    "Parsed parameter '%s' is not defined in the tool "
                    "parameters for tool '%s', directly returning the "
                    "string value.", param_name, func_name)
            return param_value

        if isinstance(param_config[param_name],
                      dict) and "type" in param_config[param_name]:
            param_type = str(param_config[param_name]["type"]).strip().lower()
        else:
            param_type = "string"
        if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
            return param_value
        elif param_type.startswith("int") or param_type.startswith(
                "uint") or param_type.startswith(
                    "long") or param_type.startswith(
                        "short") or param_type.startswith("unsigned"):
            try:
                return int(param_value)
            except (ValueError, TypeError):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not an "
                    "integer in tool '%s', degenerating to string.",
                    param_value, param_name, func_name)
                return param_value
        elif param_type.startswith("num") or param_type.startswith("float"):
            try:
                float_param_value = float(param_value)
                return float_param_value if float_param_value - int(
                    float_param_value) != 0 else int(float_param_value)
            except (ValueError, TypeError):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not a float "
                    "in tool '%s', degenerating to string.", param_value,
                    param_name, func_name)
                return param_value
        elif param_type in ["boolean", "bool", "binary"]:
            param_value = param_value.lower()
            if param_value not in ["true", "false"]:
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not a boolean "
                    "(`true` or `false`) in tool '%s', degenerating to "
                    "false.", param_value, param_name, func_name)
            return param_value == "true"
        else:
            if param_type in ["object", "array", "arr"
                              ] or param_type.startswith(
                                  "dict") or param_type.startswith("list"):
                try:
                    param_value = json.loads(param_value)
                    return param_value
                except (json.JSONDecodeError, TypeError, ValueError):
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' cannot be "
                        "parsed with json.loads in tool '%s', will try "
                        "other methods to parse it.", param_value, param_name,
                        func_name)
            try:
                param_value = ast.literal_eval(param_value)  # safer
            except (ValueError, SyntaxError, TypeError):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' cannot be "
                    "converted via Python `ast.literal_eval()` in tool "
                    "'%s', degenerating to string.", param_value, param_name,
                    func_name)
            return param_value

    def _parse_xml_function_call(
            self, function_call_str: str,
            tools: Optional[list[ChatCompletionToolsParam]]
    ) -> Optional[ToolCall]:
        """Build a ``ToolCall`` from the text following ``<function=``.

        The text up to the first ``>`` is the function name; the remainder
        is scanned with ``tool_call_parameter_regex`` and each parameter is
        converted according to the tool's schema. A missing ``>`` in the
        function header or in a parameter raises ``ValueError``.
        """
        # Split "<name>>...body..." at the first '>'.
        header_end = function_call_str.index(">")
        function_name = function_call_str[:header_end]
        schema = self._get_arguments_config(function_name, tools)
        body = function_call_str[header_end + 1:]

        arguments = {}
        for raw_param in self.tool_call_parameter_regex.findall(body):
            name_end = raw_param.index(">")
            key = raw_param[:name_end]
            # Drop at most one leading and one trailing newline.
            raw_value = str(raw_param[name_end + 1:])
            raw_value = raw_value.removeprefix("\n").removesuffix("\n")
            arguments[key] = self._convert_param_value(
                raw_value, key, schema, function_name)

        serialized = json.dumps(arguments, ensure_ascii=False)
        return ToolCall(
            type="function",
            function=FunctionCall(name=function_name, arguments=serialized),
        )

    def _get_function_calls(self, model_output: str) -> list[str]:
        # Find all tool calls
        matched_ranges = self.tool_call_regex.findall(model_output)
        raw_tool_calls = [
            match[0] if match[0] else match[1] for match in matched_ranges
        ]

        # Back-off strategy if no tool_call tags found
        if len(raw_tool_calls) == 0:
            raw_tool_calls = [model_output]

        raw_function_calls = []
        for tool_call in raw_tool_calls:
            raw_function_calls.extend(
                self.tool_call_function_regex.findall(tool_call))

        function_calls = [
            match[0] if match[0] else match[1] for match in raw_function_calls
        ]
        return function_calls

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract all tool calls from a complete model response.

        When the output contains no ``<function=`` marker, yields no
        function calls, or parsing raises, the whole output is returned as
        plain content with ``tools_called=False``. Otherwise the parsed
        calls are returned together with the text preceding the first tool
        call (``None`` if that text is empty).
        """

        def as_plain_text() -> ExtractedToolCallInformation:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        # Quick check to avoid unnecessary processing
        if self.tool_call_prefix not in model_output:
            return as_plain_text()

        try:
            function_calls = self._get_function_calls(model_output)
            if not function_calls:
                return as_plain_text()

            tool_calls = [
                self._parse_xml_function_call(raw_call, request.tools)
                for raw_call in function_calls
            ]

            # Populate prev_tool_call_arr for serving layer to set
            # finish_reason; earlier entries are discarded first.
            self.prev_tool_call_arr.clear()
            self.prev_tool_call_arr.extend({
                "name": call.function.name,
                "arguments": call.function.arguments,
            } for call in tool_calls if call)

            # Content is whatever precedes the first <tool_call> tag, or
            # the first <function= marker when no tag is present.
            cut = model_output.find(self.tool_call_start_token)
            if cut < 0:
                cut = model_output.find(self.tool_call_prefix)
            leading_text = model_output[:cut]

            return ExtractedToolCallInformation(
                tools_called=bool(tool_calls),
                tool_calls=tool_calls,
                content=leading_text or None,
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return as_plain_text()

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        # Store request for type conversion
        if not previous_text:
            self._reset_streaming_state()
            self.streaming_request = request

        # If no delta text, return None unless it's an EOS token after tools
        if not delta_text:
            # Check if this is an EOS token after all tool calls are complete
            # Check for tool calls in text even if is_tool_call_started
            # is False (might have been reset after processing all tools)
            if (delta_token_ids
                    and self.tool_call_end_token_id not in delta_token_ids):
                # Count complete tool calls
                complete_calls = len(
                    self.tool_call_complete_regex.findall(current_text))

                # If we have completed tool calls and populated
                # prev_tool_call_arr
                if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
                    # Check if all tool calls are closed
                    open_calls = current_text.count(
                        self.tool_call_start_token) - current_text.count(
                            self.tool_call_end_token)
                    if open_calls == 0:
                        # Return empty delta for finish_reason processing
                        return DeltaMessage(content="")
                elif not self.is_tool_call_started and current_text:
                    # This is a regular content response that's now complete
                    return DeltaMessage(content="")
            return None

        # Update accumulated text
        self.accumulated_text = current_text

        # Check if we need to advance to next tool
        if self.json_closed and not self.in_function:
            # Check if this tool call has ended
            tool_ends = current_text.count(self.tool_call_end_token)
            if tool_ends > self.current_tool_index:
                # This tool has ended, advance to next
                self.current_tool_index += 1
                self.header_sent = False
                self.param_count = 0
                self.json_started = False
                self.json_closed = False
                self.accumulated_params = {}

                # Check if there are more tool calls
                tool_starts = current_text.count(self.tool_call_start_token)
                if self.current_tool_index >= tool_starts:
                    # No more tool calls
                    self.is_tool_call_started = False
                # Continue processing next tool
                return None

        # Handle normal content before tool calls
        if not self.is_tool_call_started:
            # Check if tool call is starting
            if (self.tool_call_start_token_id in delta_token_ids
                    or self.tool_call_start_token in delta_text):
                self.is_tool_call_started = True
                # Return any content before the tool call
                if self.tool_call_start_token in delta_text:
                    content_before = delta_text[:delta_text.index(
                        self.tool_call_start_token)]
                    if content_before:
                        return DeltaMessage(content=content_before)
                return None
            else:
                # Check if we're between tool calls - skip whitespace
                if (current_text.rstrip().endswith(self.tool_call_end_token)
                        and delta_text.strip() == ""):
                    # We just ended a tool call, skip whitespace
                    return None
                # Normal content, no tool call
                return DeltaMessage(content=delta_text)

        # Check if we're between tool calls (waiting for next one)
        # Count tool calls we've seen vs processed
        tool_starts_count = current_text.count(self.tool_call_start_token)
        if self.current_tool_index >= tool_starts_count:
            # We're past all tool calls, shouldn't be here
            return None

        # We're in a tool call, find the current tool call portion
        # Need to find the correct tool call based on current_tool_index
        tool_start_positions: list[int] = []
        idx = 0
        while True:
            idx = current_text.find(self.tool_call_start_token, idx)
            if idx == -1:
                break
            tool_start_positions.append(idx)
            idx += len(self.tool_call_start_token)

        if self.current_tool_index >= len(tool_start_positions):
            # No more tool calls to process yet
            return None

        tool_start_idx = tool_start_positions[self.current_tool_index]
        # Find where this tool call ends (or current position if not ended yet)
        tool_end_idx = current_text.find(self.tool_call_end_token,
                                         tool_start_idx)
        if tool_end_idx == -1:
            tool_text = current_text[tool_start_idx:]
        else:
            tool_text = current_text[tool_start_idx:tool_end_idx +
                                     len(self.tool_call_end_token)]

        # Looking for function header
        if not self.header_sent:
            if self.tool_call_prefix in tool_text:
                func_start = tool_text.find(self.tool_call_prefix) + len(
                    self.tool_call_prefix)
                func_end = tool_text.find(">", func_start)

                if func_end != -1:
                    # Found complete function name
                    self.current_function_name = tool_text[func_start:func_end]
                    self.current_tool_id = self._generate_tool_call_id()
                    self.header_sent = True
                    self.in_function = True

                    # IMPORTANT: Add to prev_tool_call_arr immediately when
                    # we detect a tool call. This ensures
                    # finish_reason="tool_calls" even if parsing isn't complete
                    already_added = any(
                        tool.get("name") == self.current_function_name
                        for tool in self.prev_tool_call_arr)
                    if not already_added:
                        self.prev_tool_call_arr.append({
                            "name": self.current_function_name,
                            "arguments":
                            "{}",  # Placeholder, will be updated later
                        })

                    # Send header with function info
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_index,
                            id=self.current_tool_id,
                            function=DeltaFunctionCall(
                                name=self.current_function_name, arguments=""),
                            type="function",
                        )
                    ])
            return None

        # We've sent header, now handle function body
        if self.in_function:
            # Send opening brace if not sent yet
            if (not self.json_started
                    and self.parameter_prefix not in delta_text):
                self.json_started = True
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        function=DeltaFunctionCall(arguments="{"),
                    )
                ])

            # Make sure json_started is set if we're processing parameters
            if not self.json_started:
                self.json_started = True

            # Check for function end in accumulated text
            if not self.json_closed and self.function_end_token in tool_text:
                # Close JSON
                self.json_closed = True

                # Extract complete tool call to update
                # prev_tool_call_arr with final arguments
                # Find the function content
                func_start = tool_text.find(self.tool_call_prefix) + len(
                    self.tool_call_prefix)
                func_content_end = tool_text.find(self.function_end_token,
                                                  func_start)
                if func_content_end != -1:
                    func_content = tool_text[func_start:func_content_end]
                    # Parse to get the complete arguments
                    try:
                        parsed_tool = self._parse_xml_function_call(
                            func_content, self.streaming_request.tools
                            if self.streaming_request else None)
                        if parsed_tool:
                            # Update existing entry in
                            # prev_tool_call_arr with complete args
                            for i, tool in enumerate(self.prev_tool_call_arr):
                                if tool.get(
                                        "name") == parsed_tool.function.name:
                                    args = parsed_tool.function.arguments
                                    self.prev_tool_call_arr[i][
                                        "arguments"] = args
                                    break
                    except Exception:
                        pass  # Ignore parsing errors during streaming

                result = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        function=DeltaFunctionCall(arguments="}"),
                    )
                ])

                # Reset state for next tool
                self.in_function = False
                self.json_closed = True
                self.accumulated_params = {}

                return result

            # Look for parameters
            # Find all parameter starts
            param_starts = []
            idx = 0
            while True:
                idx = tool_text.find(self.parameter_prefix, idx)
                if idx == -1:
                    break
                param_starts.append(idx)
                idx += len(self.parameter_prefix)

            # Check if we should start a new parameter
            if (not self.in_param and self.param_count < len(param_starts)
                    and len(param_starts) > self.param_count):
                # Process the next parameter
                param_idx = param_starts[self.param_count]
                param_start = param_idx + len(self.parameter_prefix)
                remaining = tool_text[param_start:]

                if ">" in remaining:
                    # We have the complete parameter name
                    name_end = remaining.find(">")
                    self.current_param_name = remaining[:name_end]

                    # Find the parameter value
                    value_start = param_start + name_end + 1
                    value_text = tool_text[value_start:]
                    if value_text.startswith("\n"):
                        value_text = value_text[1:]

                    # Find where this parameter ends
                    param_end_idx = value_text.find(self.parameter_end_token)
                    if param_end_idx == -1:
                        # No closing tag, look for next parameter or
                        # function end
                        next_param_idx = value_text.find(self.parameter_prefix)
                        func_end_idx = value_text.find(self.function_end_token)

                        if next_param_idx != -1 and (func_end_idx == -1
                                                     or next_param_idx
                                                     < func_end_idx):
                            param_end_idx = next_param_idx
                        elif func_end_idx != -1:
                            param_end_idx = func_end_idx
                        else:
                            # Neither found, check if tool call is complete
                            if self.tool_call_end_token in tool_text:
                                # Tool call is complete, so parameter
                                # must be complete too. Use all
                                # remaining text before function end
                                param_end_idx = len(value_text)
                            else:
                                # Still streaming, wait for more content
                                return None

                    if param_end_idx != -1:
                        # Complete parameter found
                        param_value = value_text[:param_end_idx]
                        if param_value.endswith("\n"):
                            param_value = param_value[:-1]

                        # Store raw value for later processing
                        self.accumulated_params[
                            self.current_param_name] = param_value

                        # Get parameter configuration for type conversion
                        param_config = self._get_arguments_config(
                            self.current_function_name or "",
                            self.streaming_request.tools
                            if self.streaming_request else None)

                        # Convert param value to appropriate type
                        converted_value = self._convert_param_value(
                            param_value, self.current_param_name, param_config,
                            self.current_function_name or "")

                        # Build JSON fragment based on the converted type
                        # Use json.dumps to properly serialize the value
                        serialized_value = json.dumps(converted_value,
                                                      ensure_ascii=False)

                        if self.param_count == 0:
                            json_fragment = (f'"{self.current_param_name}": '
                                             f'{serialized_value}')
                        else:
                            json_fragment = (f', "{self.current_param_name}": '
                                             f'{serialized_value}')

                        self.param_count += 1

                        return DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_index,
                                function=DeltaFunctionCall(
                                    arguments=json_fragment),
                            )
                        ])

            # Continue parameter value - Not used in the current implementation
            # since we process complete parameters above
            if self.in_param:
                if self.parameter_end_token in delta_text:
                    # End of parameter
                    end_idx = delta_text.find(self.parameter_end_token)
                    value_chunk = delta_text[:end_idx]

                    # Skip past > if at start
                    if not self.current_param_value and ">" in value_chunk:
                        gt_idx = value_chunk.find(">")
                        value_chunk = value_chunk[gt_idx + 1:]

                    if not self.current_param_value and value_chunk.startswith(
                            "\n"):
                        value_chunk = value_chunk[1:]

                    # Store complete value
                    full_value = self.current_param_value + value_chunk
                    self.accumulated_params[
                        self.current_param_name] = full_value

                    # Get parameter configuration for type conversion
                    param_config = self._get_arguments_config(
                        self.current_function_name or "",
                        self.streaming_request.tools
                        if self.streaming_request else None)

                    # Convert the parameter value to the appropriate type
                    converted_value = self._convert_param_value(
                        full_value, self.current_param_name or "",
                        param_config, self.current_function_name or "")

                    # Serialize the converted value
                    serialized_value = json.dumps(converted_value,
                                                  ensure_ascii=False)

                    # Since we've been streaming the quoted version,
                    # we need to close it properly
                    # This is complex - for now just complete the value
                    self.in_param = False
                    self.current_param_value = ""

                    # Just close the current parameter string
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_index,
                            function=DeltaFunctionCall(
                                arguments='"'),  # Close the string quote
                        )
                    ])
                else:
                    # Continue accumulating value
                    value_chunk = delta_text

                    # Handle first chunk after param name
                    if not self.current_param_value and ">" in value_chunk:
                        gt_idx = value_chunk.find(">")
                        value_chunk = value_chunk[gt_idx + 1:]

                    if not self.current_param_value and value_chunk.startswith(
                            "\n"):
                        value_chunk = value_chunk[1:]

                    if value_chunk:
                        # Stream the escaped delta
                        prev_escaped = json.dumps(
                            self.current_param_value, ensure_ascii=False
                        )[1:-1] if self.current_param_value else ""
                        self.current_param_value += value_chunk
                        full_escaped = json.dumps(self.current_param_value,
                                                  ensure_ascii=False)[1:-1]
                        delta_escaped = full_escaped[len(prev_escaped):]

                        if delta_escaped:
                            return DeltaMessage(tool_calls=[
                                DeltaToolCall(
                                    index=self.current_tool_index,
                                    function=DeltaFunctionCall(
                                        arguments=delta_escaped),
                                )
                            ])

        return None

current_tool_id `instance-attribute` ¶

current_tool_id: Optional[str] = None

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

failed_count `instance-attribute` ¶

failed_count: int = 0

function_end_token `instance-attribute` ¶

function_end_token: str = '</function>'

is_tool_call_started `instance-attribute` ¶

is_tool_call_started: bool = False

parameter_end_token `instance-attribute` ¶

parameter_end_token: str = '</parameter>'

parameter_prefix `instance-attribute` ¶

parameter_prefix: str = '<parameter='

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

tool_call_complete_regex `instance-attribute` ¶

tool_call_complete_regex = compile(
    "<tool_call>(.*?)</tool_call>", DOTALL
)

tool_call_end_token `instance-attribute` ¶

tool_call_end_token: str = '</tool_call>'

tool_call_end_token_id `instance-attribute` ¶

tool_call_end_token_id = get(tool_call_end_token)

tool_call_function_regex `instance-attribute` ¶

tool_call_function_regex = compile(
    "<function=(.*?)</function>|<function=(.*)$", DOTALL
)

tool_call_parameter_regex `instance-attribute` ¶

tool_call_parameter_regex = compile(
    "<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
    DOTALL,
)

tool_call_prefix `instance-attribute` ¶

tool_call_prefix: str = '<function='

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile(
    "<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", DOTALL
)

tool_call_start_token `instance-attribute` ¶

tool_call_start_token: str = '<tool_call>'

tool_call_start_token_id `instance-attribute` ¶

tool_call_start_token_id = get(tool_call_start_token)

`__init__` ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Set up sentinel tokens, regex patterns and streaming state.

    Raises:
        ValueError: if ``self.model_tokenizer`` is falsy after the base
            class initializer has run.
        RuntimeError: if ``<tool_call>`` or ``</tool_call>`` cannot be found
            in ``self.vocab``.
    """
    super().__init__(tokenizer)

    # Bookkeeping lists/flags that the rest of this parser reads and writes.
    self.current_tool_name_sent: bool = False
    self.prev_tool_call_arr: list[dict] = []
    # The base class types this differently; here tool-call IDs are strings.
    self.current_tool_id: Optional[str] = None  # type: ignore
    self.streamed_args_for_tool: list[str] = []

    # Literal markers of the XML-like tool-call format.
    self.tool_call_start_token: str = "<tool_call>"
    self.tool_call_end_token: str = "</tool_call>"
    self.tool_call_prefix: str = "<function="
    self.function_end_token: str = "</function>"
    self.parameter_prefix: str = "<parameter="
    self.parameter_end_token: str = "</parameter>"
    self.is_tool_call_started: bool = False
    self.failed_count: int = 0

    # Per-message streaming fields start from their reset values.
    self._reset_streaming_state()

    # Each pattern lets "." span newlines; the "|...$" alternatives also
    # accept an element whose closing tag has not been produced.
    dotall = re.DOTALL
    self.tool_call_complete_regex = re.compile(
        r"<tool_call>(.*?)</tool_call>", dotall)
    self.tool_call_regex = re.compile(
        r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", dotall)
    self.tool_call_function_regex = re.compile(
        r"<function=(.*?)</function>|<function=(.*)$", dotall)
    self.tool_call_parameter_regex = re.compile(
        r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
        dotall)

    if not self.model_tokenizer:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction.")

    # Resolve the token IDs of the start/end markers from the vocabulary.
    start_token_id = self.vocab.get(self.tool_call_start_token)
    self.tool_call_start_token_id = start_token_id
    end_token_id = self.vocab.get(self.tool_call_end_token)
    self.tool_call_end_token_id = end_token_id

    if start_token_id is None or end_token_id is None:
        raise RuntimeError(
            "Qwen3 XML Tool parser could not locate tool call start/end "
            "tokens in the tokenizer!")

    logger.info("vLLM Successfully import tool parser %s !",
                self.__class__.__name__)

_convert_param_value ¶

_convert_param_value(
    param_value: str,
    param_name: str,
    param_config: dict,
    func_name: str,
) -> Any

Convert parameter value based on its type in the schema.

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

def _convert_param_value(self, param_value: str, param_name: str,
                         param_config: dict, func_name: str) -> Any:
    """Convert a raw parameter string to the type declared in the schema.

    Args:
        param_value: Raw text of the parameter value.
        param_name: Name of the parameter being converted.
        param_config: Mapping of parameter name to its schema entry; an
            empty mapping disables the "unknown parameter" warning.
        func_name: Name of the tool, used only in log messages.

    Returns:
        ``None`` for a (case-insensitive) literal ``null``. Otherwise the
        value converted according to the declared ``type``. When conversion
        fails the original string is returned, except for booleans, where
        anything other than ``true`` yields ``False``.
    """
    # Handle null value for any type
    if param_value.lower() == "null":
        return None

    if param_name not in param_config:
        if param_config != {}:
            logger.warning(
                "Parsed parameter '%s' is not defined in the tool "
                "parameters for tool '%s', directly returning the "
                "string value.", param_name, func_name)
        return param_value

    # A schema entry without a usable "type" is treated as a string.
    schema = param_config[param_name]
    if isinstance(schema, dict) and "type" in schema:
        param_type = str(schema["type"]).strip().lower()
    else:
        param_type = "string"

    if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
        return param_value
    elif param_type.startswith(
            ("int", "uint", "long", "short", "unsigned")):
        try:
            return int(param_value)
        except (ValueError, TypeError):
            logger.warning(
                "Parsed value '%s' of parameter '%s' is not an "
                "integer in tool '%s', degenerating to string.",
                param_value, param_name, func_name)
            return param_value
    elif param_type.startswith(("num", "float")):
        try:
            float_param_value = float(param_value)
            # int() raises ValueError for NaN and OverflowError for
            # +/-infinity. Both must fall back to the string form: neither
            # can be represented as a JSON number.
            int_param_value = int(float_param_value)
        except (ValueError, TypeError, OverflowError):
            logger.warning(
                "Parsed value '%s' of parameter '%s' is not a float "
                "in tool '%s', degenerating to string.", param_value,
                param_name, func_name)
            return param_value
        # Whole numbers are returned as int so they serialize without ".0".
        if float_param_value != int_param_value:
            return float_param_value
        return int_param_value
    elif param_type in ["boolean", "bool", "binary"]:
        param_value = param_value.lower()
        if param_value not in ["true", "false"]:
            logger.warning(
                "Parsed value '%s' of parameter '%s' is not a boolean "
                "(`true` or `false`) in tool '%s', degenerating to "
                "false.", param_value, param_name, func_name)
        return param_value == "true"
    else:
        if param_type in ["object", "array", "arr"
                          ] or param_type.startswith(("dict", "list")):
            try:
                return json.loads(param_value)
            except (json.JSONDecodeError, TypeError, ValueError,
                    RecursionError):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' cannot be "
                    "parsed with json.loads in tool '%s', will try "
                    "other methods to parse it.", param_value, param_name,
                    func_name)
        try:
            # literal_eval only accepts Python literals (no arbitrary code),
            # but is documented to raise MemoryError/RecursionError on
            # pathological input in addition to the usual errors.
            param_value = ast.literal_eval(param_value)  # safer
        except (ValueError, SyntaxError, TypeError, MemoryError,
                RecursionError):
            logger.warning(
                "Parsed value '%s' of parameter '%s' cannot be "
                "converted via Python `ast.literal_eval()` in tool "
                "'%s', degenerating to string.", param_value, param_name,
                func_name)
        return param_value

_generate_tool_call_id ¶

_generate_tool_call_id() -> str

Generate a unique tool call ID.

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

def _generate_tool_call_id(self) -> str:
    """Return ``call_`` followed by the first 24 hex digits of a UUID4."""
    random_hex = uuid.uuid4().hex
    return "call_" + random_hex[:24]

_get_arguments_config ¶

_get_arguments_config(
    func_name: str,
    tools: Optional[list[ChatCompletionToolsParam]],
) -> dict

Extract argument configuration for a function.

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

def _get_arguments_config(
        self, func_name: str,
        tools: Optional[list[ChatCompletionToolsParam]]) -> dict:
    """Look up the parameter schema of the tool named ``func_name``.

    Returns the ``properties`` mapping of the tool's ``parameters`` when
    present, the ``parameters`` dict itself when it has no ``properties``
    key, and ``{}`` in every other case (no tools, tool not found,
    missing or non-dict parameters).
    """
    if tools is None:
        return {}

    for tool in tools:
        # Skip entries that do not expose type / function / function.name.
        if not (hasattr(tool, "type") and hasattr(tool, "function")):
            continue
        function = tool.function
        if not hasattr(function, "name"):
            continue
        if not (tool.type == "function" and function.name == func_name):
            continue

        # First matching tool decides the result.
        if not hasattr(function, "parameters"):
            return {}
        params = function.parameters
        if not isinstance(params, dict):
            return {}
        return params["properties"] if "properties" in params else params

    logger.warning("Tool '%s' is not defined in the tools list.",
                   func_name)
    return {}

_get_function_calls ¶

_get_function_calls(model_output: str) -> list[str]

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

def _get_function_calls(self, model_output: str) -> list[str]:
    """Return the text following each ``<function=`` in ``model_output``.

    Both closed and still-open ``<tool_call>`` / ``<function=`` elements are
    accepted. When no ``<tool_call>`` tag is present at all, the whole
    output is searched for functions instead.
    """
    # Each findall() hit is a (closed, unclosed) pair; only one is non-empty.
    tool_call_bodies = [
        closed or unclosed
        for closed, unclosed in self.tool_call_regex.findall(model_output)
    ]

    # Back-off strategy if no tool_call tags found
    if not tool_call_bodies:
        tool_call_bodies = [model_output]

    function_calls = []
    for body in tool_call_bodies:
        for closed, unclosed in self.tool_call_function_regex.findall(body):
            function_calls.append(closed or unclosed)
    return function_calls

_parse_xml_function_call ¶

_parse_xml_function_call(
    function_call_str: str,
    tools: Optional[list[ChatCompletionToolsParam]],
) -> Optional[ToolCall]

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

def _parse_xml_function_call(
        self, function_call_str: str,
        tools: Optional[list[ChatCompletionToolsParam]]
) -> Optional[ToolCall]:
    """Build a ``ToolCall`` from the text that follows ``<function=``.

    ``function_call_str`` is expected to look like ``name>...parameters...``.
    ``str.index`` raises ``ValueError`` when the ``>`` that terminates the
    function name (or a parameter name) is missing.
    """
    # Everything up to the first ">" is the function name.
    name_end = function_call_str.index(">")
    function_name = function_call_str[:name_end]
    param_config = self._get_arguments_config(function_name, tools)
    body = function_call_str[name_end + 1:]

    arguments = {}
    for raw_param in self.tool_call_parameter_regex.findall(body):
        separator = raw_param.index(">")
        param_name = raw_param[:separator]
        # Drop at most one newline at each end of the value.
        raw_value = raw_param[separator + 1:]
        raw_value = raw_value.removeprefix("\n").removesuffix("\n")

        arguments[param_name] = self._convert_param_value(
            raw_value, param_name, param_config, function_name)

    serialized_arguments = json.dumps(arguments, ensure_ascii=False)
    return ToolCall(
        type="function",
        function=FunctionCall(name=function_name,
                              arguments=serialized_arguments),
    )

_reset_streaming_state ¶

_reset_streaming_state()

Reset all streaming state.

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

def _reset_streaming_state(self):
    """Put every per-message streaming field back to its initial value."""
    # Counters.
    self.current_tool_index = 0
    self.param_count = 0

    # Progress flags for the tool call / function / parameter being parsed.
    self.is_tool_call_started = self.header_sent = False
    self.in_function = self.in_param = False
    self.json_started = self.json_closed = False

    # Identity of the current tool call and parameter.
    self.current_tool_id = None
    self.current_function_name = None
    self.current_param_name = None

    # Text buffers.
    self.current_param_value = ""
    self.accumulated_text = ""

    # Raw parameter values kept for type conversion; a new dict every time
    # so earlier references are not affected.
    self.accumulated_params = {}
    self.streaming_request = None

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Parse a complete model response into tool calls plus leading content.

    When the output contains no ``<function=`` prefix, no function can be
    extracted, or any exception occurs while parsing, the whole output is
    returned as plain content with ``tools_called=False``.
    """
    # Cheap substring test so ordinary replies skip the regex work.
    if self.tool_call_prefix not in model_output:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    try:
        function_calls = self._get_function_calls(model_output)
        if not function_calls:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        tool_calls = []
        for raw_call in function_calls:
            tool_calls.append(
                self._parse_xml_function_call(raw_call, request.tools))

        # Populate prev_tool_call_arr for serving layer to set finish_reason
        self.prev_tool_call_arr.clear()
        self.prev_tool_call_arr.extend({
            "name": call.function.name,
            "arguments": call.function.arguments,
        } for call in tool_calls if call)

        # Content is whatever precedes the first "<tool_call>", or the
        # first "<function=" when the wrapper tag is absent.
        cut = model_output.find(self.tool_call_start_token)
        if cut < 0:
            cut = model_output.find(self.tool_call_prefix)
        leading_text = model_output[:cut]

        return ExtractedToolCallInformation(
            tools_called=bool(tool_calls),
            tool_calls=tool_calls,
            content=leading_text or None,
        )

    except Exception:
        logger.exception("Error in extracting tool call from response.")
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Turn one streamed chunk of model text into at most one delta.

    Depending on the parser state, a call emits plain content, a tool-call
    header (id and function name), the opening ``{`` of the arguments, one
    complete ``"name": value`` fragment, or the closing ``}``. Parameters
    are located in ``current_text`` (the full text so far), so a parameter
    is only emitted once its end can be determined.

    Args:
        previous_text: Text before this chunk; an empty value marks the
            start of a new message and resets all streaming state.
        current_text: Full text including this chunk.
        delta_text: Text added by this chunk.
        previous_token_ids: Not used by this implementation.
        current_token_ids: Not used by this implementation.
        delta_token_ids: Token IDs added by this chunk.
        request: The request; its ``tools`` drive parameter type conversion.

    Returns:
        A ``DeltaMessage`` to forward, or ``None`` when this chunk produces
        nothing to send.
    """
    # Store request for type conversion
    if not previous_text:
        self._reset_streaming_state()
        self.streaming_request = request

    # If no delta text, return None unless it's an EOS token after tools
    if not delta_text:
        # Check if this is an EOS token after all tool calls are complete
        # Check for tool calls in text even if is_tool_call_started
        # is False (might have been reset after processing all tools)
        if (delta_token_ids
                and self.tool_call_end_token_id not in delta_token_ids):
            # Count complete tool calls
            complete_calls = len(
                self.tool_call_complete_regex.findall(current_text))

            # If we have completed tool calls and populated
            # prev_tool_call_arr
            if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
                # Check if all tool calls are closed
                open_calls = current_text.count(
                    self.tool_call_start_token) - current_text.count(
                        self.tool_call_end_token)
                if open_calls == 0:
                    # Return empty delta for finish_reason processing
                    return DeltaMessage(content="")
            elif not self.is_tool_call_started and current_text:
                # This is a regular content response that's now complete
                return DeltaMessage(content="")
        return None

    # Update accumulated text
    self.accumulated_text = current_text

    # Check if we need to advance to next tool
    if self.json_closed and not self.in_function:
        # Check if this tool call has ended
        tool_ends = current_text.count(self.tool_call_end_token)
        if tool_ends > self.current_tool_index:
            # This tool has ended, advance to next
            self.current_tool_index += 1
            self.header_sent = False
            self.param_count = 0
            self.json_started = False
            self.json_closed = False
            self.accumulated_params = {}

            # Check if there are more tool calls
            tool_starts = current_text.count(self.tool_call_start_token)
            if self.current_tool_index >= tool_starts:
                # No more tool calls
                self.is_tool_call_started = False
            # Continue processing next tool
            return None

    # Handle normal content before tool calls
    if not self.is_tool_call_started:
        # Check if tool call is starting
        if (self.tool_call_start_token_id in delta_token_ids
                or self.tool_call_start_token in delta_text):
            self.is_tool_call_started = True
            # Return any content before the tool call
            if self.tool_call_start_token in delta_text:
                content_before = delta_text[:delta_text.index(
                    self.tool_call_start_token)]
                if content_before:
                    return DeltaMessage(content=content_before)
            return None
        else:
            # Check if we're between tool calls - skip whitespace
            if (current_text.rstrip().endswith(self.tool_call_end_token)
                    and delta_text.strip() == ""):
                # We just ended a tool call, skip whitespace
                return None
            # Normal content, no tool call
            return DeltaMessage(content=delta_text)

    # Check if we're between tool calls (waiting for next one)
    # Count tool calls we've seen vs processed
    tool_starts_count = current_text.count(self.tool_call_start_token)
    if self.current_tool_index >= tool_starts_count:
        # We're past all tool calls, shouldn't be here
        return None

    # We're in a tool call, find the current tool call portion
    # Need to find the correct tool call based on current_tool_index
    tool_start_positions: list[int] = []
    idx = 0
    while True:
        idx = current_text.find(self.tool_call_start_token, idx)
        if idx == -1:
            break
        tool_start_positions.append(idx)
        idx += len(self.tool_call_start_token)

    if self.current_tool_index >= len(tool_start_positions):
        # No more tool calls to process yet
        return None

    tool_start_idx = tool_start_positions[self.current_tool_index]
    # Find where this tool call ends (or current position if not ended yet)
    tool_end_idx = current_text.find(self.tool_call_end_token,
                                     tool_start_idx)
    if tool_end_idx == -1:
        tool_text = current_text[tool_start_idx:]
    else:
        tool_text = current_text[tool_start_idx:tool_end_idx +
                                 len(self.tool_call_end_token)]

    # Looking for function header
    if not self.header_sent:
        if self.tool_call_prefix in tool_text:
            func_start = tool_text.find(self.tool_call_prefix) + len(
                self.tool_call_prefix)
            func_end = tool_text.find(">", func_start)

            if func_end != -1:
                # Found complete function name
                self.current_function_name = tool_text[func_start:func_end]
                self.current_tool_id = self._generate_tool_call_id()
                self.header_sent = True
                self.in_function = True

                # IMPORTANT: Add to prev_tool_call_arr immediately when
                # we detect a tool call. This ensures
                # finish_reason="tool_calls" even if parsing isn't complete
                # NOTE(review): entries are matched by function name, so a
                # second call to the same function reuses the first entry.
                already_added = any(
                    tool.get("name") == self.current_function_name
                    for tool in self.prev_tool_call_arr)
                if not already_added:
                    self.prev_tool_call_arr.append({
                        "name": self.current_function_name,
                        "arguments":
                        "{}",  # Placeholder, will be updated later
                    })

                # Send header with function info
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        id=self.current_tool_id,
                        function=DeltaFunctionCall(
                            name=self.current_function_name, arguments=""),
                        type="function",
                    )
                ])
        return None

    # We've sent header, now handle function body
    if self.in_function:
        # Send opening brace if not sent yet. When this delta already
        # carries a parameter start, the brace is not sent on its own;
        # json_started stays False so that whichever fragment is emitted
        # next (a parameter or the closing brace) carries the "{" with it.
        if (not self.json_started
                and self.parameter_prefix not in delta_text):
            self.json_started = True
            return DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_index,
                    function=DeltaFunctionCall(arguments="{"),
                )
            ])

        # Check for function end in accumulated text
        # NOTE(review): this runs before the parameter scan, so a parameter
        # that only becomes complete in the same delta as "</function>" is
        # not streamed as a fragment - confirm whether that is acceptable.
        if not self.json_closed and self.function_end_token in tool_text:
            # Close JSON
            self.json_closed = True

            # Extract complete tool call to update
            # prev_tool_call_arr with final arguments
            # Find the function content
            func_start = tool_text.find(self.tool_call_prefix) + len(
                self.tool_call_prefix)
            func_content_end = tool_text.find(self.function_end_token,
                                              func_start)
            if func_content_end != -1:
                func_content = tool_text[func_start:func_content_end]
                # Parse to get the complete arguments
                try:
                    parsed_tool = self._parse_xml_function_call(
                        func_content, self.streaming_request.tools
                        if self.streaming_request else None)
                    if parsed_tool:
                        # Update existing entry in
                        # prev_tool_call_arr with complete args
                        for i, tool in enumerate(self.prev_tool_call_arr):
                            if tool.get(
                                    "name") == parsed_tool.function.name:
                                args = parsed_tool.function.arguments
                                self.prev_tool_call_arr[i][
                                    "arguments"] = args
                                break
                except Exception:
                    # Best effort: a parse failure must not break the
                    # stream, but leave a trace for debugging.
                    logger.debug(
                        "Could not parse completed tool call while "
                        "streaming; keeping placeholder arguments.",
                        exc_info=True)

            # If the opening brace was never sent, send both braces now so
            # the streamed arguments still form valid JSON.
            closing = "}" if self.json_started else "{}"
            self.json_started = True
            result = DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_index,
                    function=DeltaFunctionCall(arguments=closing),
                )
            ])

            # Reset state for next tool
            self.in_function = False
            self.json_closed = True
            self.accumulated_params = {}

            return result

        # Look for parameters
        # Find all parameter starts
        param_starts = []
        idx = 0
        while True:
            idx = tool_text.find(self.parameter_prefix, idx)
            if idx == -1:
                break
            param_starts.append(idx)
            idx += len(self.parameter_prefix)

        # Check if we should start a new parameter
        if not self.in_param and self.param_count < len(param_starts):
            # Process the next parameter
            param_idx = param_starts[self.param_count]
            param_start = param_idx + len(self.parameter_prefix)
            remaining = tool_text[param_start:]

            if ">" in remaining:
                # We have the complete parameter name
                name_end = remaining.find(">")
                self.current_param_name = remaining[:name_end]

                # Find the parameter value
                value_start = param_start + name_end + 1
                value_text = tool_text[value_start:]
                if value_text.startswith("\n"):
                    value_text = value_text[1:]

                # Find where this parameter ends
                param_end_idx = value_text.find(self.parameter_end_token)
                if param_end_idx == -1:
                    # No closing tag, look for next parameter or
                    # function end
                    next_param_idx = value_text.find(self.parameter_prefix)
                    func_end_idx = value_text.find(self.function_end_token)

                    if next_param_idx != -1 and (func_end_idx == -1
                                                 or next_param_idx
                                                 < func_end_idx):
                        param_end_idx = next_param_idx
                    elif func_end_idx != -1:
                        param_end_idx = func_end_idx
                    else:
                        # Neither found, check if tool call is complete
                        if self.tool_call_end_token in tool_text:
                            # Tool call is complete, so parameter
                            # must be complete too. Use all
                            # remaining text before function end
                            param_end_idx = len(value_text)
                        else:
                            # Still streaming, wait for more content
                            return None

                if param_end_idx != -1:
                    # Complete parameter found
                    param_value = value_text[:param_end_idx]
                    if param_value.endswith("\n"):
                        param_value = param_value[:-1]

                    # Store raw value for later processing
                    self.accumulated_params[
                        self.current_param_name] = param_value

                    # Get parameter configuration for type conversion
                    param_config = self._get_arguments_config(
                        self.current_function_name or "",
                        self.streaming_request.tools
                        if self.streaming_request else None)

                    # Convert param value to appropriate type
                    converted_value = self._convert_param_value(
                        param_value, self.current_param_name, param_config,
                        self.current_function_name or "")

                    # Build JSON fragment based on the converted type
                    # Use json.dumps to properly serialize the value
                    serialized_value = json.dumps(converted_value,
                                                  ensure_ascii=False)
                    # The key goes through json.dumps as well so quotes or
                    # backslashes in a parameter name cannot break the JSON.
                    serialized_key = json.dumps(self.current_param_name,
                                                ensure_ascii=False)

                    json_fragment = f'{serialized_key}: {serialized_value}'
                    if self.param_count > 0:
                        json_fragment = ', ' + json_fragment
                    if not self.json_started:
                        # Opening brace was skipped above; send it with the
                        # first fragment instead.
                        json_fragment = '{' + json_fragment
                        self.json_started = True

                    self.param_count += 1

                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_index,
                            function=DeltaFunctionCall(
                                arguments=json_fragment),
                        )
                    ])

        # Continue parameter value - Not used in the current implementation
        # since we process complete parameters above
        if self.in_param:
            if self.parameter_end_token in delta_text:
                # End of parameter
                end_idx = delta_text.find(self.parameter_end_token)
                value_chunk = delta_text[:end_idx]

                # Skip past > if at start
                if not self.current_param_value and ">" in value_chunk:
                    gt_idx = value_chunk.find(">")
                    value_chunk = value_chunk[gt_idx + 1:]

                if not self.current_param_value and value_chunk.startswith(
                        "\n"):
                    value_chunk = value_chunk[1:]

                # Store complete value
                full_value = self.current_param_value + value_chunk
                self.accumulated_params[
                    self.current_param_name] = full_value

                # Get parameter configuration for type conversion
                param_config = self._get_arguments_config(
                    self.current_function_name or "",
                    self.streaming_request.tools
                    if self.streaming_request else None)

                # Convert the parameter value to the appropriate type
                converted_value = self._convert_param_value(
                    full_value, self.current_param_name or "",
                    param_config, self.current_function_name or "")

                # Serialize the converted value
                serialized_value = json.dumps(converted_value,
                                              ensure_ascii=False)

                # Since we've been streaming the quoted version,
                # we need to close it properly
                # This is complex - for now just complete the value
                self.in_param = False
                self.current_param_value = ""

                # Just close the current parameter string
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        function=DeltaFunctionCall(
                            arguments='"'),  # Close the string quote
                    )
                ])
            else:
                # Continue accumulating value
                value_chunk = delta_text

                # Handle first chunk after param name
                if not self.current_param_value and ">" in value_chunk:
                    gt_idx = value_chunk.find(">")
                    value_chunk = value_chunk[gt_idx + 1:]

                if not self.current_param_value and value_chunk.startswith(
                        "\n"):
                    value_chunk = value_chunk[1:]

                if value_chunk:
                    # Stream the escaped delta
                    prev_escaped = json.dumps(
                        self.current_param_value, ensure_ascii=False
                    )[1:-1] if self.current_param_value else ""
                    self.current_param_value += value_chunk
                    full_escaped = json.dumps(self.current_param_value,
                                              ensure_ascii=False)[1:-1]
                    delta_escaped = full_escaped[len(prev_escaped):]

                    if delta_escaped:
                        return DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_index,
                                function=DeltaFunctionCall(
                                    arguments=delta_escaped),
                            )
                        ])

    return None

Qwen3XMLToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py

@ToolParserManager.register_module("qwen3_xml")
class Qwen3XMLToolParser(ToolParser):
    """Tool parser registered under the name ``qwen3_xml``.

    All parsing is delegated to a ``StreamingXMLToolCallParser`` held in
    ``self.parser``; this class only adapts its results to the
    ``ToolParser`` interface.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Initialise the base parser and create the streaming XML parser."""
        super().__init__(tokenizer)
        self.parser = StreamingXMLToolCallParser()

        parser_name = self.__class__.__name__
        logger.info("vLLM Successfully import tool parser %s !", parser_name)

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract tool calls from a complete model output.

        The streaming parser is reset and fed the whole output as a single
        chunk. Only parsed calls that carry a function with a non-empty name
        are kept; ``tools_called`` is true when at least one call is kept.
        """
        xml_parser = self.parser
        xml_parser.reset_streaming_state()
        if request:
            xml_parser.set_tools(request.tools)
        parsed = xml_parser.parse_single_streaming_chunks(model_output)

        # A missing/empty tool_calls list yields an empty result list, which
        # is the same outcome as the explicit "no tool calls" case.
        extracted = [
            ToolCall(
                id=call.id,
                type=call.type,
                function=FunctionCall(
                    name=call.function.name,
                    arguments=call.function.arguments,
                ),
            ) for call in (parsed.tool_calls or [])
            if call.function and call.function.name
        ]
        return ExtractedToolCallInformation(
            tool_calls=extracted,
            tools_called=bool(extracted),
            content=parsed.content,
        )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """Process one streamed delta and return the message to emit.

        On the first delta (empty ``previous_text``) the streaming parser is
        reset and given the request's tools. Otherwise the delta text is
        forwarded to the streaming parser, except for the empty-text case
        described in the body.
        """
        xml_parser = self.parser

        if not previous_text:
            xml_parser.reset_streaming_state()
            if request:
                xml_parser.set_tools(request.tools)

        # Tokens can arrive without any decoded text. When that happens
        # after at least one tool call and every tool call in the text is
        # closed, emit an empty tool-call delta so the outer streaming layer
        # still outputs the tool_call field.
        if not delta_text and delta_token_ids:
            opened = current_text.count(xml_parser.tool_call_start_token)
            closed = current_text.count(xml_parser.tool_call_end_token)
            if opened - closed == 0 and xml_parser.tool_call_index > 0:
                # Fall back to the last completed id when no call is active.
                call_id = (xml_parser.current_call_id
                           or xml_parser.last_completed_call_id)
                empty_call = DeltaToolCall(
                    index=xml_parser.tool_call_index - 1,
                    id=call_id,
                    function=DeltaFunctionCall(arguments=""),
                    type="function",
                )
                return DeltaMessage(tool_calls=[empty_call])

        return xml_parser.parse_single_streaming_chunks(delta_text)

parser `instance-attribute` ¶

parser = StreamingXMLToolCallParser()

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Initialise the base parser and create the streaming XML parser."""
    super().__init__(tokenizer)
    self.parser = StreamingXMLToolCallParser()

    parser_name = self.__class__.__name__
    logger.info("vLLM Successfully import tool parser %s !", parser_name)

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Extract tool calls from a complete model output.

    The streaming parser is reset and fed the whole output as a single
    chunk. Only parsed calls that carry a function with a non-empty name
    are kept; ``tools_called`` is true when at least one call is kept.
    """
    xml_parser = self.parser
    xml_parser.reset_streaming_state()
    if request:
        xml_parser.set_tools(request.tools)
    parsed = xml_parser.parse_single_streaming_chunks(model_output)

    # A missing/empty tool_calls list yields an empty result list, which
    # is the same outcome as the explicit "no tool calls" case.
    extracted = [
        ToolCall(
            id=call.id,
            type=call.type,
            function=FunctionCall(
                name=call.function.name,
                arguments=call.function.arguments,
            ),
        ) for call in (parsed.tool_calls or [])
        if call.function and call.function.name
    ]
    return ExtractedToolCallInformation(
        tool_calls=extracted,
        tools_called=bool(extracted),
        content=parsed.content,
    )

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Process one streamed delta and return the message to emit.

    On the first delta (empty ``previous_text``) the streaming parser is
    reset and given the request's tools. Otherwise the delta text is
    forwarded to the streaming parser, except for the empty-text case
    described in the body.
    """
    xml_parser = self.parser

    if not previous_text:
        xml_parser.reset_streaming_state()
        if request:
            xml_parser.set_tools(request.tools)

    # Tokens can arrive without any decoded text. When that happens after
    # at least one tool call and every tool call in the text is closed,
    # emit an empty tool-call delta so the outer streaming layer still
    # outputs the tool_call field.
    if not delta_text and delta_token_ids:
        opened = current_text.count(xml_parser.tool_call_start_token)
        closed = current_text.count(xml_parser.tool_call_end_token)
        if opened - closed == 0 and xml_parser.tool_call_index > 0:
            # Fall back to the last completed id when no call is active.
            call_id = (xml_parser.current_call_id
                       or xml_parser.last_completed_call_id)
            empty_call = DeltaToolCall(
                index=xml_parser.tool_call_index - 1,
                id=call_id,
                function=DeltaFunctionCall(arguments=""),
                type="function",
            )
            return DeltaMessage(tool_calls=[empty_call])

    return xml_parser.parse_single_streaming_chunks(delta_text)

SeedOssToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py

@ToolParserManager.register_module("seed_oss")
class SeedOssToolParser(ToolParser):
    TOOL_CALL_START = "<seed:tool_call>"
    TOOL_CALL_END = "</seed:tool_call>"

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up sentinel tokens, token ids, regexes and streaming state.

        Raises:
            RuntimeError: if ``self.vocab`` has no entry for the tool-call
                start token or the tool-call end token.
        """
        super().__init__(tokenizer)

        self.prev_tool_call_arr: list[dict] = []

        self.tool_call_start_token: str = self.TOOL_CALL_START
        self.tool_call_end_token: str = self.TOOL_CALL_END
        # Sentinel tokens for streaming mode
        self.tool_call_prefix: str = "<function="
        self.function_end_token: str = "</function>"
        self.parameter_prefix: str = "<parameter="
        self.parameter_end_token: str = "</parameter>"
        self.think_start_token: str = "<seed:think>"
        self.think_end_token: str = "</seed:think>"
        self.is_thinking_end: bool = False
        self.failed_count: int = 0

        # --- streaming state ---
        # One reset is enough: it initialises every per-stream field,
        # including is_tool_call_started, and nothing reads that state
        # before this point.
        self._reset_streaming_state()

        self.tool_call_start_token_id = self.vocab.get(
            self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
        # May be None when the vocabulary has no think-end token; only the
        # tool-call tokens are mandatory.
        self.think_end_token_id = self.vocab.get(self.think_end_token)

        if (self.tool_call_start_token_id is None
                or self.tool_call_end_token_id is None):
            raise RuntimeError(
                "Seed_Oss XML parser: tokenizer did not include "
                "<seed:tool_call> or its closing tag.")

        tool_start_re = re.escape(self.tool_call_start_token)
        tool_end_re = re.escape(self.tool_call_end_token)

        # Matches only tool calls that have both opening and closing tags.
        self.tool_call_complete_regex = re.compile(
            rf"{tool_start_re}(.*?){tool_end_re}", re.DOTALL)
        # Each of the next three patterns has two groups: group 1 for a
        # closed element, group 2 for an unterminated one that runs to the
        # end of the text.
        self.tool_call_regex = re.compile(
            rf"{tool_start_re}(.*?){tool_end_re}|{tool_start_re}(.*?)$",
            re.DOTALL)

        self.tool_call_function_regex = re.compile(
            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL)
        self.tool_call_parameter_regex = re.compile(
            r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL)

        logger.info("vLLM Seed-Oss XML tool parser loaded (%s).",
                    self.__class__.__name__)

    def _generate_tool_call_id(self) -> str:
        """Generate a unique tool call ID."""
        return f"call_{uuid.uuid4().hex[:24]}"

    def _reset_streaming_state(self):
        """Reset all streaming state."""
        self.current_tool_index = 0
        self.is_tool_call_started = False
        self.header_sent = False
        self.current_tool_id = -1
        self.current_function_name = None
        self.current_param_name = None
        self.current_param_value = ""
        self.param_count = 0
        self.in_param = False
        self.in_function = False
        self.accumulated_text = ""
        self.json_started = False
        self.json_closed = False

    def _parse_xml_function_call(
            self, function_call_str: str,
            tools: Optional[list[ChatCompletionToolsParam]]
    ) -> Optional[ToolCall]:
        """Convert the text after ``<function=`` into a ``ToolCall``.

        Args:
            function_call_str: text of the form ``name>...parameters...``,
                i.e. a function element without its ``<function=`` prefix
                and ``</function>`` suffix.
            tools: tool definitions from the request, used to look up the
                declared type of each parameter; may be ``None``.

        Returns:
            A ``ToolCall`` whose arguments are the JSON-encoded parameters.

        Raises:
            ValueError: if ``function_call_str`` or a parameter body has no
                ``>`` terminating the name.
        """

        def get_arguments_config(func_name: str) -> dict:
            """Return the parameter schema of ``func_name``, or ``{}``."""
            if tools is None:
                return {}
            for config in tools:
                if not hasattr(config, "type") or not (
                        hasattr(config, "function")
                        and hasattr(config.function, "name")):
                    continue
                if (config.type == "function"
                        and config.function.name == func_name):
                    if not hasattr(config.function, "parameters"):
                        return {}
                    params = config.function.parameters
                    if isinstance(params, dict) and "properties" in params:
                        return params["properties"]
                    elif isinstance(params, dict):
                        return params
                    else:
                        return {}
            logger.warning("Tool '%s' is not defined in the tools list.",
                           func_name)
            return {}

        def convert_param_value(param_value: str, param_name: str,
                                param_config: dict, func_name: str) -> Any:
            """Coerce a raw string value to the parameter's declared type.

            Values that cannot be converted are returned as the original
            string (booleans degrade to ``False``), after a warning.
            """
            # Handle null value for any type
            if param_value.lower() == "null":
                return None

            if param_name not in param_config:
                if param_config != {}:
                    logger.warning(
                        "Parsed parameter '%s' is not defined in "
                        "the tool parameters for tool '%s', "
                        "directly returning the string value.", param_name,
                        func_name)
                return param_value

            if (isinstance(param_config[param_name], dict)
                    and "type" in param_config[param_name]):
                param_type = str(
                    param_config[param_name]["type"]).strip().lower()
            else:
                param_type = "string"
            if param_type in [
                    "string", "str", "text", "varchar", "char", "enum"
            ]:
                return param_value
            elif param_type.startswith(
                ("int", "uint", "long", "short", "unsigned")):
                try:
                    param_value = int(param_value)  # type: ignore
                except (ValueError, TypeError):
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' is not an integer in tool "
                        "'%s', degenerating to string.", param_value,
                        param_name, func_name)
                return param_value
            elif param_type.startswith(("num", "float")):
                try:
                    float_param_value = float(param_value)
                    # Whole numbers are emitted as ints. int() raises
                    # ValueError for NaN and OverflowError for +/-inf
                    # (e.g. "1e999"); both degrade to the raw string.
                    param_value = float_param_value if float_param_value - int(
                        float_param_value) != 0 else int(
                            float_param_value)  # type: ignore
                except (ValueError, TypeError, OverflowError):
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' is not a float in tool "
                        "'%s', degenerating to string.", param_value,
                        param_name, func_name)
                return param_value
            elif param_type in ["boolean", "bool", "binary"]:
                param_value = param_value.lower()
                if param_value not in ["true", "false"]:
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' is not a boolean "
                        "(`true` of `false`) in tool '%s', degenerating to false.",
                        param_value, param_name, func_name)
                return param_value == "true"
            else:
                if param_type == "object" or param_type.startswith("dict"):
                    try:
                        param_value = json.loads(param_value)
                        return param_value
                    except (ValueError, TypeError, json.JSONDecodeError):
                        logger.warning(
                            "Parsed value '%s' of parameter '%s' is not a valid JSON "
                            "object in tool '%s', will try other methods to parse it.",
                            param_value, param_name, func_name)
                try:
                    # literal_eval only builds Python literals, but on
                    # malformed input it may also raise TypeError (e.g. an
                    # unhashable dict key), MemoryError or RecursionError;
                    # all of them fall back to the raw string.
                    param_value = ast.literal_eval(param_value)
                except (ValueError, SyntaxError, TypeError, MemoryError,
                        RecursionError):
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' cannot be converted via "
                        "Python `ast.literal_eval()` in tool '%s', degenerating to string.",
                        param_value, param_name, func_name)
                return param_value

        # Extract function name
        end_index = function_call_str.index(">")
        function_name = function_call_str[:end_index]
        param_config = get_arguments_config(function_name)
        parameters = function_call_str[end_index + 1:]
        param_dict = {}
        for match in self.tool_call_parameter_regex.findall(parameters):
            # Group 1 holds a closed parameter, group 2 an unterminated one.
            match_text = match[0] if match[0] else match[1]
            idx = match_text.index(">")
            param_name = match_text[:idx]
            param_value = str(match_text[idx + 1:])
            # Remove prefix and trailing \n
            if param_value.startswith("\n"):
                param_value = param_value[1:]
            if param_value.endswith("\n"):
                param_value = param_value[:-1]

            param_dict[param_name] = convert_param_value(
                param_value, param_name, param_config, function_name)
        return ToolCall(
            type="function",
            function=FunctionCall(name=function_name,
                                  arguments=json.dumps(param_dict,
                                                       ensure_ascii=False)),
        )

    def _get_function_calls(self, model_output: str) -> list[str]:
        # Find all tool calls
        matched_ranges = self.tool_call_regex.findall(model_output)
        raw_tool_calls = [
            match[0] if match[0] else match[1] for match in matched_ranges
        ]

        # Back-off strategy if no tool_call tags found
        if len(raw_tool_calls) == 0:
            raw_tool_calls = [model_output]

        raw_function_calls = []
        for tool_call in raw_tool_calls:
            raw_function_calls.extend(
                self.tool_call_function_regex.findall(tool_call))

        function_calls = [
            match[0] if match[0] else match[1] for match in raw_function_calls
        ]
        return function_calls

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract tool calls from a complete model output.

        When both think tokens are present, only the text after the first
        think-end token is searched for tool calls. The returned content is
        the thinking text plus whatever precedes the first tool call, or
        ``None`` if that is empty. If no function call is found, or any
        error occurs, the whole output is returned as content with no tool
        calls.
        """

        def as_plain_content():
            # Fallback: the entire output is ordinary content.
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        # Quick check to avoid unnecessary processing
        if self.tool_call_prefix not in model_output:
            return as_plain_content()

        has_think_block = (self.think_start_token in model_output
                           and self.think_end_token in model_output)
        if has_think_block:
            # Split right after the first think-end token.
            before, end_tag, after = model_output.partition(
                self.think_end_token)
            thinking_content = before + end_tag
            result_content = after
        else:
            thinking_content, result_content = "", model_output

        try:
            function_calls = self._get_function_calls(result_content)
            if not function_calls:
                return as_plain_content()

            tool_calls = [
                self._parse_xml_function_call(raw_call, request.tools)
                for raw_call in function_calls
            ]

            # Refresh prev_tool_call_arr so the serving layer can set
            # finish_reason.
            self.prev_tool_call_arr.clear()
            self.prev_tool_call_arr.extend({
                "name": parsed.function.name,
                "arguments": parsed.function.arguments,
            } for parsed in tool_calls if parsed)

            # Content ends at the tool-call tag, or at the function prefix
            # when the tag is absent.
            cut = result_content.find(self.tool_call_start_token)
            if cut < 0:
                cut = result_content.find(self.tool_call_prefix)
            content = thinking_content + result_content[:cut]

            return ExtractedToolCallInformation(
                tools_called=bool(tool_calls),
                tool_calls=tool_calls,
                content=content or None,
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return as_plain_content()

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        # If no delta text, return None unless
        # it's an EOS token after tool calls
        if not delta_text:
            # Check if this is an EOS token after all tool calls are complete
            # We check for tool calls in the text even if is_tool_call_started
            # is False because it might have been reset after processing all tools
            if (delta_token_ids
                    and self.tool_call_end_token_id not in delta_token_ids):
                # Count complete tool calls
                complete_calls = len(
                    self.tool_call_complete_regex.findall(current_text))

                # If we have completed tool calls and populated prev_tool_call_arr
                if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
                    # Check if all tool calls are closed
                    open_calls = current_text.count(
                        self.tool_call_start_token) - current_text.count(
                            self.tool_call_end_token)
                    if open_calls == 0:
                        # Return empty delta message to allow finish_reason processing
                        return DeltaMessage(content="")
                elif not self.is_tool_call_started and current_text:
                    # This is a regular content response that's now complete
                    return DeltaMessage(content="")
            return None

        # Check if this is the first call (reset state if needed)
        if not previous_text:
            self._reset_streaming_state()

        # Update accumulated text
        self.accumulated_text = current_text

        # Check if we need to advance to next tool
        if self.json_closed and not self.in_function:
            # Check if this tool call has ended
            tool_ends = current_text.count(self.tool_call_end_token)
            if tool_ends > self.current_tool_index:
                # This tool has ended, advance to next
                self.current_tool_index += 1
                self.header_sent = False
                self.param_count = 0
                self.json_started = False
                self.json_closed = False

                # Check if there are more tool calls
                if self.current_tool_index >= current_text.count(
                        self.tool_call_start_token):
                    # No more tool calls
                    self.is_tool_call_started = False
                # Continue processing next tool
                return None

        # Check if end thinking
        if (not self.is_thinking_end
                and (self.think_end_token_id in delta_token_ids
                     or self.think_end_token in delta_text)):
            self.is_thinking_end = True

        # If thinking hasn't ended yet, don't process any tool calls
        if not self.is_thinking_end:
            return DeltaMessage(content=delta_text)

        # Handle normal content before tool calls
        if not self.is_tool_call_started:
            # Check if tool call is starting
            if (self.tool_call_start_token_id in delta_token_ids
                    or self.tool_call_start_token in delta_text):
                self.is_tool_call_started = True
                # Return any content before the tool call
                if self.tool_call_start_token in delta_text:
                    content_before = delta_text[:delta_text.index(
                        self.tool_call_start_token)]
                    if content_before:
                        return DeltaMessage(content=content_before)
                return None
            else:
                # Check if we're between tool calls - skip whitespace
                if (current_text.rstrip().endswith(self.tool_call_end_token)
                        and delta_text.strip() == ""):
                    # We just ended a tool call, skip whitespace
                    return None
                # Normal content, no tool call
                return DeltaMessage(content=delta_text)

        # Check if we're between tool calls (waiting for next one)
        # Count tool calls we've seen vs processed
        tool_starts_count = current_text.count(self.tool_call_start_token)
        if self.current_tool_index >= tool_starts_count:
            # We're past all tool calls, shouldn't be here
            return None

        # We're in a tool call, find the current tool call portion
        # Need to find the correct tool call based on current_tool_index
        # Only process tool calls after think_end_token
        think_end_index = current_text.find(self.think_end_token) + len(
            self.think_end_token
        ) if self.think_end_token in current_text else 0
        tool_starts: list[int] = []
        idx = think_end_index
        while True:
            idx = current_text.find(self.tool_call_start_token, idx)
            if idx == -1:
                break
            tool_starts.append(idx)
            idx += len(self.tool_call_start_token)

        if self.current_tool_index >= len(tool_starts):
            # No more tool calls to process yet
            return None

        tool_start_idx = tool_starts[self.current_tool_index]
        # Find where this tool call ends (or current position if not ended yet)
        tool_end_idx = current_text.find(self.tool_call_end_token,
                                         tool_start_idx)
        if tool_end_idx == -1:
            tool_text = current_text[tool_start_idx:]
        else:
            tool_text = current_text[tool_start_idx:tool_end_idx +
                                     len(self.tool_call_end_token)]

        # Looking for function header
        if not self.header_sent:
            if self.tool_call_prefix in tool_text:
                func_start = tool_text.find(self.tool_call_prefix) + len(
                    self.tool_call_prefix)
                func_end = tool_text.find(">", func_start)

                if func_end != -1:
                    # Found complete function name
                    self.current_function_name = tool_text[func_start:func_end]
                    self.current_tool_id = self._generate_tool_call_id(
                    )  # type: ignore
                    self.header_sent = True
                    self.in_function = True

                    # IMPORTANT: Add to prev_tool_call_arr immediately when we detect a tool call
                    # This ensures finish_reason="tool_calls" even if parsing isn't complete
                    already_added = any(
                        tool.get("name") == self.current_function_name
                        for tool in self.prev_tool_call_arr)
                    if not already_added:
                        self.prev_tool_call_arr.append({
                            "name": self.current_function_name,
                            "arguments":
                            "{}",  # Placeholder, will be updated later
                        })

                    # Send header with function info
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_index,
                            id=self.current_tool_id,
                            function=DeltaFunctionCall(
                                name=self.current_function_name, arguments=""),
                            type="function",
                        )
                    ])
            return None

        # We've sent header, now handle function body
        if self.in_function:
            # Send opening brace if not sent yet
            if (not self.json_started
                    and self.parameter_prefix not in delta_text):
                self.json_started = True
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        function=DeltaFunctionCall(arguments="{"),
                    )
                ])

            # Make sure json_started is set if we're processing parameters
            if not self.json_started:
                self.json_started = True

            # Check for function end in accumulated text
            if not self.json_closed and self.function_end_token in tool_text:
                # Close JSON
                self.json_closed = True

                # Extract the complete tool call to update prev_tool_call_arr with final arguments
                # Find the function content
                func_start = tool_text.find(self.tool_call_prefix) + len(
                    self.tool_call_prefix)
                func_content_end = tool_text.find(self.function_end_token,
                                                  func_start)
                if func_content_end != -1:
                    func_content = tool_text[func_start:func_content_end]
                    # Parse to get the complete arguments
                    try:
                        parsed_tool = self._parse_xml_function_call(
                            func_content, request.tools if request else None)
                        if parsed_tool:
                            # Update existing entry in prev_tool_call_arr with complete arguments
                            for i, tool in enumerate(self.prev_tool_call_arr):
                                if tool.get(
                                        "name") == parsed_tool.function.name:
                                    self.prev_tool_call_arr[i]["arguments"] = (
                                        parsed_tool.function.arguments)
                                    break
                    except Exception:
                        logger.warning(
                            "Failed to parse tool arguments during streaming.",
                            exc_info=True)

                result = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        function=DeltaFunctionCall(arguments="}"),
                    )
                ])

                # Reset state for next tool
                self.in_function = False
                self.json_closed = True

                return result

            # Look for parameters
            # Count how many complete parameters we have processed
            complete_params = tool_text.count(self.parameter_end_token)

            # Check if we should start a new parameter
            if not self.in_param and self.param_count < complete_params:
                # Find the unprocessed parameter
                # Count parameter starts
                param_starts = []
                idx = 0
                while True:
                    idx = tool_text.find(self.parameter_prefix, idx)
                    if idx == -1:
                        break
                    param_starts.append(idx)
                    idx += len(self.parameter_prefix)

                if len(param_starts) > self.param_count:
                    # Process the next parameter
                    param_idx = param_starts[self.param_count]
                    param_start = param_idx + len(self.parameter_prefix)
                    remaining = tool_text[param_start:]

                    if ">" in remaining:
                        # We have the complete parameter name
                        name_end = remaining.find(">")
                        self.current_param_name = remaining[:name_end]

                        # Find the parameter value
                        value_start = param_start + name_end + 1
                        value_text = tool_text[value_start:]
                        if value_text.startswith("\n"):
                            value_text = value_text[1:]

                        # Find where this parameter ends
                        param_end_idx = value_text.find(
                            self.parameter_end_token)
                        if param_end_idx != -1:
                            # Complete parameter found
                            param_value = value_text[:param_end_idx]
                            if param_value.endswith("\n"):
                                param_value = param_value[:-1]

                            # Build complete JSON fragment for this parameter
                            if self.param_count == 0:
                                json_fragment = (
                                    '"' + self.current_param_name + '": "' +
                                    json.dumps(param_value)[1:-1] + '"')
                            else:
                                json_fragment = (
                                    ', "' + self.current_param_name + '": "' +
                                    json.dumps(param_value)[1:-1] + '"')

                            self.param_count += 1

                            return DeltaMessage(tool_calls=[
                                DeltaToolCall(
                                    index=self.current_tool_index,
                                    function=DeltaFunctionCall(
                                        arguments=json_fragment),
                                )
                            ])

            # Continue parameter value
            if self.in_param:
                if self.parameter_end_token in delta_text:
                    # End of parameter
                    end_idx = delta_text.find(self.parameter_end_token)
                    value_chunk = delta_text[:end_idx]

                    # Skip past > if at start
                    if not self.current_param_value and ">" in value_chunk:
                        gt_idx = value_chunk.find(">")
                        value_chunk = value_chunk[gt_idx + 1:]

                    if not self.current_param_value and value_chunk.startswith(
                            "\n"):
                        value_chunk = value_chunk[1:]

                    # Calculate incremental JSON
                    full_value = self.current_param_value + value_chunk
                    prev_escaped = (json.dumps(self.current_param_value)[1:-1]
                                    if self.current_param_value else "")
                    full_escaped = json.dumps(full_value)[1:-1]
                    delta_escaped = full_escaped[len(prev_escaped):]

                    self.in_param = False
                    self.current_param_value = ""

                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_index,
                            function=DeltaFunctionCall(
                                arguments=delta_escaped + '"'),
                        )
                    ])
                else:
                    # Continue accumulating value
                    value_chunk = delta_text

                    # Handle first chunk after param name
                    if not self.current_param_value and ">" in value_chunk:
                        gt_idx = value_chunk.find(">")
                        value_chunk = value_chunk[gt_idx + 1:]

                    if not self.current_param_value and value_chunk.startswith(
                            "\n"):
                        value_chunk = value_chunk[1:]

                    if value_chunk:
                        # Stream the escaped delta
                        prev_escaped = (json.dumps(
                            self.current_param_value)[1:-1]
                                        if self.current_param_value else "")
                        self.current_param_value += value_chunk
                        full_escaped = json.dumps(
                            self.current_param_value)[1:-1]
                        delta_escaped = full_escaped[len(prev_escaped):]

                        if delta_escaped:
                            return DeltaMessage(tool_calls=[
                                DeltaToolCall(
                                    index=self.current_tool_index,
                                    function=DeltaFunctionCall(
                                        arguments=delta_escaped),
                                )
                            ])

        return None

TOOL_CALL_END `class-attribute` `instance-attribute` ¶

TOOL_CALL_END = '</seed:tool_call>'

TOOL_CALL_START `class-attribute` `instance-attribute` ¶

TOOL_CALL_START = '<seed:tool_call>'

failed_count `instance-attribute` ¶

failed_count: int = 0

function_end_token `instance-attribute` ¶

function_end_token: str = '</function>'

is_thinking_end `instance-attribute` ¶

is_thinking_end: bool = False

is_tool_call_started `instance-attribute` ¶

is_tool_call_started: bool = False

parameter_end_token `instance-attribute` ¶

parameter_end_token: str = '</parameter>'

parameter_prefix `instance-attribute` ¶

parameter_prefix: str = '<parameter='

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

think_end_token `instance-attribute` ¶

think_end_token: str = '</seed:think>'

think_end_token_id `instance-attribute` ¶

think_end_token_id = get(think_end_token)

think_start_token `instance-attribute` ¶

think_start_token: str = '<seed:think>'

tool_call_complete_regex `instance-attribute` ¶

tool_call_complete_regex = compile(
    f"{tool_start_re}(.*?){tool_end_re}", DOTALL
)

tool_call_end_token `instance-attribute` ¶

tool_call_end_token: str = TOOL_CALL_END

tool_call_end_token_id `instance-attribute` ¶

tool_call_end_token_id = get(tool_call_end_token)

tool_call_function_regex `instance-attribute` ¶

tool_call_function_regex = compile(
    "<function=(.*?)</function>|<function=(.*)$", DOTALL
)

tool_call_parameter_regex `instance-attribute` ¶

tool_call_parameter_regex = compile(
    "<parameter=(.*?)</parameter>|<parameter=(.*?)$", DOTALL
)

tool_call_prefix `instance-attribute` ¶

tool_call_prefix: str = '<function='

tool_call_regex `instance-attribute` ¶

tool_call_regex = compile(
    f"{tool_start_re}(.*?){tool_end_re}|{tool_start_re}(.*?)$",
    DOTALL,
)

tool_call_start_token `instance-attribute` ¶

tool_call_start_token: str = TOOL_CALL_START

tool_call_start_token_id `instance-attribute` ¶

tool_call_start_token_id = get(tool_call_start_token)

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Set up sentinel tokens, token ids, regexes and streaming state.

    Args:
        tokenizer: Tokenizer forwarded to the base-class initializer.

    Raises:
        RuntimeError: If ``self.vocab`` has no id for the
            ``<seed:tool_call>`` start token or for its closing tag.
    """
    super().__init__(tokenizer)

    # Kept outside _reset_streaming_state(): that method does not touch it
    self.prev_tool_call_arr: list[dict] = []

    self.tool_call_start_token: str = self.TOOL_CALL_START
    self.tool_call_end_token: str = self.TOOL_CALL_END
    # Sentinel tokens for streaming mode
    self.tool_call_prefix: str = "<function="
    self.function_end_token: str = "</function>"
    self.parameter_prefix: str = "<parameter="
    self.parameter_end_token: str = "</parameter>"
    self.think_start_token: str = "<seed:think>"
    self.think_end_token: str = "</seed:think>"
    self.is_tool_call_started: bool = False
    self.is_thinking_end: bool = False
    self.failed_count: int = 0

    # --- streaming state ---
    # A single reset is enough: nothing reads the streaming fields before
    # this point, so an additional earlier call would be redundant.
    self._reset_streaming_state()

    self.tool_call_start_token_id = self.vocab.get(
        self.tool_call_start_token)
    self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
    # May be None: unlike the tool-call ids, this one is not validated below
    self.think_end_token_id = self.vocab.get(self.think_end_token)

    if (self.tool_call_start_token_id is None
            or self.tool_call_end_token_id is None):
        raise RuntimeError(
            "Seed_Oss XML parser: tokenizer did not include "
            "<seed:tool_call> or its closing tag.")

    tool_start_re = re.escape(self.tool_call_start_token)
    tool_end_re = re.escape(self.tool_call_end_token)

    # Matches only fully closed <seed:tool_call>...</seed:tool_call> spans
    self.tool_call_complete_regex = re.compile(
        rf"{tool_start_re}(.*?){tool_end_re}", re.DOTALL)
    # Same, but the second alternative also accepts an unterminated span
    # running to the end of the text
    self.tool_call_regex = re.compile(
        rf"{tool_start_re}(.*?){tool_end_re}|{tool_start_re}(.*?)$",
        re.DOTALL)

    self.tool_call_function_regex = re.compile(
        r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL)
    self.tool_call_parameter_regex = re.compile(
        r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL)

    logger.info("vLLM Seed-Oss XML tool parser loaded (%s).",
                self.__class__.__name__)

_generate_tool_call_id ¶

_generate_tool_call_id() -> str

Generate a unique tool call ID.

Source code in vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py

def _generate_tool_call_id(self) -> str:
    """Build a fresh ``call_``-prefixed id from a random UUID4.

    Returns:
        ``"call_"`` followed by the first 24 hex digits of a new UUID4.
    """
    random_hex = uuid.uuid4().hex
    return "call_" + random_hex[:24]

_get_function_calls ¶

_get_function_calls(model_output: str) -> list[str]

Source code in vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py

def _get_function_calls(self, model_output: str) -> list[str]:
    """Collect the raw text of every ``<function=`` body in *model_output*.

    Args:
        model_output: Text to scan for tool-call wrappers and functions.

    Returns:
        One string per function found: the text following ``<function=``
        up to ``</function>``, or up to the end of the text when the
        closing tag is missing.
    """
    # Both regexes have two alternatives, so each findall hit is a pair
    # (closed match, unterminated match) with exactly one side non-empty.
    tool_call_bodies = [
        closed or dangling
        for closed, dangling in self.tool_call_regex.findall(model_output)
    ]

    # Without any tool-call wrapper, scan the whole output instead
    if not tool_call_bodies:
        tool_call_bodies = [model_output]

    return [
        closed or dangling for body in tool_call_bodies
        for closed, dangling in self.tool_call_function_regex.findall(body)
    ]

_parse_xml_function_call ¶

_parse_xml_function_call(
    function_call_str: str,
    tools: Optional[list[ChatCompletionToolsParam]],
) -> Optional[ToolCall]

Source code in vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py

def _parse_xml_function_call(
        self, function_call_str: str,
        tools: Optional[list[ChatCompletionToolsParam]]
) -> Optional[ToolCall]:
    """Turn one raw function body into a ``ToolCall``.

    Args:
        function_call_str: Text that followed ``<function=``, i.e.
            ``name>`` and then zero or more ``<parameter=key>value``
            sections.
        tools: Tool definitions of the request, used to look up the declared
            type of each parameter. ``None`` disables type conversion.

    Returns:
        A ``ToolCall`` whose arguments are the JSON-encoded parameter dict.

    Raises:
        ValueError: If *function_call_str*, or one of the matched parameter
            sections, contains no ``>`` terminating its name.
    """

    def get_arguments_config(func_name: str) -> dict:
        # Return the parameter schema (the "properties" mapping when there
        # is one) of the tool called func_name, or {} when it is unknown.
        if tools is None:
            return {}
        for config in tools:
            if not hasattr(config, "type") or not (
                    hasattr(config, "function")
                    and hasattr(config.function, "name")):
                continue
            if (config.type == "function"
                    and config.function.name == func_name):
                if not hasattr(config.function, "parameters"):
                    return {}
                params = config.function.parameters
                if isinstance(params, dict) and "properties" in params:
                    return params["properties"]
                elif isinstance(params, dict):
                    return params
                else:
                    return {}
        logger.warning("Tool '%s' is not defined in the tools list.",
                       func_name)
        return {}

    def convert_param_value(param_value: str, param_name: str,
                            param_config: dict, func_name: str) -> Any:
        # Coerce the raw string to the declared schema type; every failed
        # conversion logs a warning and falls back instead of raising.

        # Handle null value for any type
        if param_value.lower() == "null":
            return None

        if param_name not in param_config:
            if param_config != {}:
                logger.warning(
                    "Parsed parameter '%s' is not defined in "
                    "the tool parameters for tool '%s', "
                    "directly returning the string value.", param_name,
                    func_name)
            return param_value

        if (isinstance(param_config[param_name], dict)
                and "type" in param_config[param_name]):
            param_type = str(
                param_config[param_name]["type"]).strip().lower()
        else:
            param_type = "string"

        if param_type in [
                "string", "str", "text", "varchar", "char", "enum"
        ]:
            return param_value
        elif param_type.startswith(
                ("int", "uint", "long", "short", "unsigned")):
            try:
                param_value = int(param_value)  # type: ignore
            except (ValueError, TypeError):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not an integer in tool "
                    "'%s', degenerating to string.", param_value,
                    param_name, func_name)
            return param_value
        elif param_type.startswith(("num", "float")):
            try:
                float_param_value = float(param_value)
                # Whole numbers are emitted as int so they serialize
                # without a trailing ".0"
                param_value = float_param_value if float_param_value - int(
                    float_param_value) != 0 else int(
                        float_param_value)  # type: ignore
            except (ValueError, TypeError, OverflowError):
                # OverflowError: int() of +/-inf. Treated like any other
                # non-numeric text (e.g. "nan") rather than aborting the
                # whole tool-call extraction.
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not a float in tool "
                    "'%s', degenerating to string.", param_value,
                    param_name, func_name)
            return param_value
        elif param_type in ["boolean", "bool", "binary"]:
            param_value = param_value.lower()
            if param_value not in ["true", "false"]:
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not a boolean "
                    "(`true` or `false`) in tool '%s', degenerating to false.",
                    param_value, param_name, func_name)
            return param_value == "true"
        else:
            if param_type == "object" or param_type.startswith("dict"):
                try:
                    param_value = json.loads(param_value)
                    return param_value
                except (ValueError, TypeError, json.JSONDecodeError):
                    logger.warning(
                        "Parsed value '%s' of parameter '%s' is not a valid JSON "
                        "object in tool '%s', will try other methods to parse it.",
                        param_value, param_name, func_name)
            try:
                param_value = ast.literal_eval(param_value)
            except (ValueError, SyntaxError, TypeError, MemoryError,
                    RecursionError):
                # literal_eval is documented to raise any of these on
                # malformed input; all of them mean "keep the raw string".
                logger.warning(
                    "Parsed value '%s' of parameter '%s' cannot be converted via "
                    "Python `ast.literal_eval()` in tool '%s', degenerating to string.",
                    param_value, param_name, func_name)
            return param_value

    # Extract function name
    end_index = function_call_str.index(">")
    function_name = function_call_str[:end_index]
    param_config = get_arguments_config(function_name)
    parameters = function_call_str[end_index + 1:]
    param_dict = {}
    for match in self.tool_call_parameter_regex.findall(parameters):
        # Group 0 is a closed parameter, group 1 an unterminated one
        match_text = match[0] if match[0] else match[1]
        idx = match_text.index(">")
        param_name = match_text[:idx]
        param_value = str(match_text[idx + 1:])
        # Remove prefix and trailing \n
        if param_value.startswith("\n"):
            param_value = param_value[1:]
        if param_value.endswith("\n"):
            param_value = param_value[:-1]

        param_dict[param_name] = convert_param_value(
            param_value, param_name, param_config, function_name)
    return ToolCall(
        type="function",
        function=FunctionCall(name=function_name,
                              arguments=json.dumps(param_dict,
                                                   ensure_ascii=False)),
    )

_reset_streaming_state ¶

_reset_streaming_state()

Reset all streaming state.

Source code in vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py

def _reset_streaming_state(self):
    """Restore every per-stream parsing field to its initial value."""
    # Which tool call is being processed
    self.current_tool_index = 0
    self.current_tool_id = -1
    self.current_function_name = self.current_param_name = None

    # Text buffers and counters
    self.accumulated_text = self.current_param_value = ""
    self.param_count = 0

    # Progress flags
    self.is_tool_call_started = self.header_sent = False
    self.in_param = self.in_function = False
    self.json_started = self.json_closed = False

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Parse a complete (non-streaming) response for tool calls.

    Args:
        model_output: Full text produced by the model.
        request: The originating request; its ``tools`` are passed to the
            per-function parser.

    Returns:
        The parsed tool calls plus the text preceding the first one (the
        thinking section included). When nothing is found or parsing
        raises, a result with ``tools_called=False`` and the untouched
        *model_output* as content.
    """

    def plain_response():
        # Fallback: hand the output back verbatim as ordinary content
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    # Cheap pre-filter: no function prefix means nothing to parse
    if self.tool_call_prefix not in model_output:
        return plain_response()

    # Only text after the first closing think tag is searched, and only
    # when the opening tag is present as well
    has_think_block = (self.think_start_token in model_output
                       and self.think_end_token in model_output)
    if has_think_block:
        before, closing_tag, result_content = model_output.partition(
            self.think_end_token)
        thinking_content = before + closing_tag
    else:
        thinking_content, result_content = "", model_output

    try:
        function_calls = self._get_function_calls(result_content)
        if not function_calls:
            return plain_response()

        tool_calls = [
            self._parse_xml_function_call(raw_call, request.tools)
            for raw_call in function_calls
        ]

        # Populate prev_tool_call_arr for serving layer to set finish_reason
        self.prev_tool_call_arr.clear()
        self.prev_tool_call_arr.extend({
            "name": parsed.function.name,
            "arguments": parsed.function.arguments,
        } for parsed in tool_calls if parsed)

        # Content ends at the first wrapper tag, or at the first bare
        # function prefix when no wrapper tag exists
        cut = result_content.find(self.tool_call_start_token)
        if cut < 0:
            cut = result_content.find(self.tool_call_prefix)
        content = thinking_content + result_content[:cut]

        return ExtractedToolCallInformation(
            tools_called=bool(tool_calls),
            tool_calls=tool_calls,
            content=content or None,
        )

    except Exception:
        logger.exception("Error in extracting tool call from response.")
        return plain_response()

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """Translate one streamed chunk into a content or tool-call delta.

    The method is a state machine driven by instance fields
    (``header_sent``, ``in_function``, ``json_started``, ``json_closed``,
    ``param_count``, ...). On each call it re-locates the tool call
    numbered ``current_tool_index`` inside ``current_text`` and emits at
    most one delta: the function header, an opening ``{``, one complete
    parameter as a JSON fragment, or the closing ``}``.

    Args:
        previous_text: Text before this chunk; only tested for emptiness
            to detect the first call of a stream.
        current_text: All text generated so far, this chunk included.
        delta_text: Text of this chunk.
        previous_token_ids: Not used by this method.
        current_token_ids: Not used by this method.
        delta_token_ids: Token ids of this chunk, checked for the think-end
            and tool-call sentinel ids.
        request: Originating request; ``request.tools`` is passed to
            ``_parse_xml_function_call`` once a function is complete.

    Returns:
        A ``DeltaMessage`` carrying either ``content`` or ``tool_calls``,
        or ``None`` when this chunk produces nothing to send.
    """
    # If no delta text, return None unless
    # it's an EOS token after tool calls
    if not delta_text:
        # Check if this is an EOS token after all tool calls are complete
        # We check for tool calls in the text even if is_tool_call_started
        # is False because it might have been reset after processing all tools
        if (delta_token_ids
                and self.tool_call_end_token_id not in delta_token_ids):
            # Count complete tool calls
            complete_calls = len(
                self.tool_call_complete_regex.findall(current_text))

            # If we have completed tool calls and populated prev_tool_call_arr
            if complete_calls > 0 and len(self.prev_tool_call_arr) > 0:
                # Check if all tool calls are closed
                open_calls = current_text.count(
                    self.tool_call_start_token) - current_text.count(
                        self.tool_call_end_token)
                if open_calls == 0:
                    # Return empty delta message to allow finish_reason processing
                    return DeltaMessage(content="")
            elif not self.is_tool_call_started and current_text:
                # This is a regular content response that's now complete
                return DeltaMessage(content="")
        return None

    # Check if this is the first call (reset state if needed)
    # NOTE(review): _reset_streaming_state() does not touch is_thinking_end,
    # so that flag carries over if one parser instance serves several
    # streams - confirm instances are per-request.
    if not previous_text:
        self._reset_streaming_state()

    # Update accumulated text
    self.accumulated_text = current_text

    # Check if we need to advance to next tool
    if self.json_closed and not self.in_function:
        # Check if this tool call has ended
        tool_ends = current_text.count(self.tool_call_end_token)
        if tool_ends > self.current_tool_index:
            # This tool has ended, advance to next
            self.current_tool_index += 1
            self.header_sent = False
            self.param_count = 0
            self.json_started = False
            self.json_closed = False

            # Check if there are more tool calls
            if self.current_tool_index >= current_text.count(
                    self.tool_call_start_token):
                # No more tool calls
                self.is_tool_call_started = False
            # Continue processing next tool
            # (the chunk that triggered the advance is itself not emitted)
            return None

    # Check if end thinking
    if (not self.is_thinking_end
            and (self.think_end_token_id in delta_token_ids
                 or self.think_end_token in delta_text)):
        self.is_thinking_end = True

    # If thinking hasn't ended yet, don't process any tool calls
    # Until the think-end token has been seen every chunk is passed through
    # as content, so a stream that never emits it yields no tool-call deltas.
    if not self.is_thinking_end:
        return DeltaMessage(content=delta_text)

    # Handle normal content before tool calls
    if not self.is_tool_call_started:
        # Check if tool call is starting
        if (self.tool_call_start_token_id in delta_token_ids
                or self.tool_call_start_token in delta_text):
            self.is_tool_call_started = True
            # Return any content before the tool call
            if self.tool_call_start_token in delta_text:
                content_before = delta_text[:delta_text.index(
                    self.tool_call_start_token)]
                if content_before:
                    return DeltaMessage(content=content_before)
            return None
        else:
            # Check if we're between tool calls - skip whitespace
            if (current_text.rstrip().endswith(self.tool_call_end_token)
                    and delta_text.strip() == ""):
                # We just ended a tool call, skip whitespace
                return None
            # Normal content, no tool call
            return DeltaMessage(content=delta_text)

    # Check if we're between tool calls (waiting for next one)
    # Count tool calls we've seen vs processed
    tool_starts_count = current_text.count(self.tool_call_start_token)
    if self.current_tool_index >= tool_starts_count:
        # We're past all tool calls, shouldn't be here
        return None

    # We're in a tool call, find the current tool call portion
    # Need to find the correct tool call based on current_tool_index
    # Only process tool calls after think_end_token
    think_end_index = current_text.find(self.think_end_token) + len(
        self.think_end_token
    ) if self.think_end_token in current_text else 0
    # Offsets of every tool-call start token located after the thinking part
    tool_starts: list[int] = []
    idx = think_end_index
    while True:
        idx = current_text.find(self.tool_call_start_token, idx)
        if idx == -1:
            break
        tool_starts.append(idx)
        idx += len(self.tool_call_start_token)

    if self.current_tool_index >= len(tool_starts):
        # No more tool calls to process yet
        return None

    tool_start_idx = tool_starts[self.current_tool_index]
    # Find where this tool call ends (or current position if not ended yet)
    tool_end_idx = current_text.find(self.tool_call_end_token,
                                     tool_start_idx)
    if tool_end_idx == -1:
        tool_text = current_text[tool_start_idx:]
    else:
        tool_text = current_text[tool_start_idx:tool_end_idx +
                                 len(self.tool_call_end_token)]

    # Looking for function header
    if not self.header_sent:
        if self.tool_call_prefix in tool_text:
            func_start = tool_text.find(self.tool_call_prefix) + len(
                self.tool_call_prefix)
            func_end = tool_text.find(">", func_start)

            if func_end != -1:
                # Found complete function name
                self.current_function_name = tool_text[func_start:func_end]
                self.current_tool_id = self._generate_tool_call_id(
                )  # type: ignore
                self.header_sent = True
                self.in_function = True

                # IMPORTANT: Add to prev_tool_call_arr immediately when we detect a tool call
                # This ensures finish_reason="tool_calls" even if parsing isn't complete
                # NOTE(review): the duplicate check is by function name, so
                # a second call to the same function adds no new entry.
                already_added = any(
                    tool.get("name") == self.current_function_name
                    for tool in self.prev_tool_call_arr)
                if not already_added:
                    self.prev_tool_call_arr.append({
                        "name": self.current_function_name,
                        "arguments":
                        "{}",  # Placeholder, will be updated later
                    })

                # Send header with function info
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        id=self.current_tool_id,
                        function=DeltaFunctionCall(
                            name=self.current_function_name, arguments=""),
                        type="function",
                    )
                ])
        # Function name not complete yet; wait for more text
        return None

    # We've sent header, now handle function body
    if self.in_function:
        # Send opening brace if not sent yet
        if (not self.json_started
                and self.parameter_prefix not in delta_text):
            self.json_started = True
            return DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_index,
                    function=DeltaFunctionCall(arguments="{"),
                )
            ])

        # Make sure json_started is set if we're processing parameters
        # NOTE(review): reached with json_started False only when this chunk
        # contains the parameter prefix; the flag is then set without a "{"
        # delta having been emitted - confirm this is intended.
        if not self.json_started:
            self.json_started = True

        # Check for function end in accumulated text
        if not self.json_closed and self.function_end_token in tool_text:
            # Close JSON
            self.json_closed = True

            # Extract the complete tool call to update prev_tool_call_arr with final arguments
            # Find the function content
            func_start = tool_text.find(self.tool_call_prefix) + len(
                self.tool_call_prefix)
            func_content_end = tool_text.find(self.function_end_token,
                                              func_start)
            if func_content_end != -1:
                func_content = tool_text[func_start:func_content_end]
                # Parse to get the complete arguments
                try:
                    parsed_tool = self._parse_xml_function_call(
                        func_content, request.tools if request else None)
                    if parsed_tool:
                        # Update existing entry in prev_tool_call_arr with complete arguments
                        # (first entry whose name matches)
                        for i, tool in enumerate(self.prev_tool_call_arr):
                            if tool.get(
                                    "name") == parsed_tool.function.name:
                                self.prev_tool_call_arr[i]["arguments"] = (
                                    parsed_tool.function.arguments)
                                break
                except Exception:
                    logger.warning(
                        "Failed to parse tool arguments during streaming.",
                        exc_info=True)

            result = DeltaMessage(tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_index,
                    function=DeltaFunctionCall(arguments="}"),
                )
            ])

            # Reset state for next tool
            self.in_function = False
            self.json_closed = True

            return result

        # Look for parameters
        # Count how many complete parameters we have processed
        complete_params = tool_text.count(self.parameter_end_token)

        # Check if we should start a new parameter
        if not self.in_param and self.param_count < complete_params:
            # Find the unprocessed parameter
            # Count parameter starts
            param_starts = []
            idx = 0
            while True:
                idx = tool_text.find(self.parameter_prefix, idx)
                if idx == -1:
                    break
                param_starts.append(idx)
                idx += len(self.parameter_prefix)

            if len(param_starts) > self.param_count:
                # Process the next parameter
                param_idx = param_starts[self.param_count]
                param_start = param_idx + len(self.parameter_prefix)
                remaining = tool_text[param_start:]

                if ">" in remaining:
                    # We have the complete parameter name
                    name_end = remaining.find(">")
                    self.current_param_name = remaining[:name_end]

                    # Find the parameter value
                    value_start = param_start + name_end + 1
                    value_text = tool_text[value_start:]
                    if value_text.startswith("\n"):
                        value_text = value_text[1:]

                    # Find where this parameter ends
                    param_end_idx = value_text.find(
                        self.parameter_end_token)
                    if param_end_idx != -1:
                        # Complete parameter found
                        param_value = value_text[:param_end_idx]
                        if param_value.endswith("\n"):
                            param_value = param_value[:-1]

                        # Build complete JSON fragment for this parameter
                        # The value is always emitted as a JSON string here;
                        # no schema-based type conversion is applied, unlike
                        # in _parse_xml_function_call.
                        if self.param_count == 0:
                            json_fragment = (
                                '"' + self.current_param_name + '": "' +
                                json.dumps(param_value)[1:-1] + '"')
                        else:
                            json_fragment = (
                                ', "' + self.current_param_name + '": "' +
                                json.dumps(param_value)[1:-1] + '"')

                        self.param_count += 1

                        return DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_index,
                                function=DeltaFunctionCall(
                                    arguments=json_fragment),
                            )
                        ])

        # Continue parameter value
        # NOTE(review): nothing in this method or in _reset_streaming_state
        # sets in_param to True, so this branch looks unreachable unless the
        # flag is set elsewhere - confirm.
        if self.in_param:
            if self.parameter_end_token in delta_text:
                # End of parameter
                end_idx = delta_text.find(self.parameter_end_token)
                value_chunk = delta_text[:end_idx]

                # Skip past > if at start
                if not self.current_param_value and ">" in value_chunk:
                    gt_idx = value_chunk.find(">")
                    value_chunk = value_chunk[gt_idx + 1:]

                if not self.current_param_value and value_chunk.startswith(
                        "\n"):
                    value_chunk = value_chunk[1:]

                # Calculate incremental JSON
                # (escape old and new value, emit only the new suffix)
                full_value = self.current_param_value + value_chunk
                prev_escaped = (json.dumps(self.current_param_value)[1:-1]
                                if self.current_param_value else "")
                full_escaped = json.dumps(full_value)[1:-1]
                delta_escaped = full_escaped[len(prev_escaped):]

                self.in_param = False
                self.current_param_value = ""

                return DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=self.current_tool_index,
                        function=DeltaFunctionCall(
                            arguments=delta_escaped + '"'),
                    )
                ])
            else:
                # Continue accumulating value
                value_chunk = delta_text

                # Handle first chunk after param name
                if not self.current_param_value and ">" in value_chunk:
                    gt_idx = value_chunk.find(">")
                    value_chunk = value_chunk[gt_idx + 1:]

                if not self.current_param_value and value_chunk.startswith(
                        "\n"):
                    value_chunk = value_chunk[1:]

                if value_chunk:
                    # Stream the escaped delta
                    prev_escaped = (json.dumps(
                        self.current_param_value)[1:-1]
                                    if self.current_param_value else "")
                    self.current_param_value += value_chunk
                    full_escaped = json.dumps(
                        self.current_param_value)[1:-1]
                    delta_escaped = full_escaped[len(prev_escaped):]

                    if delta_escaped:
                        return DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_index,
                                function=DeltaFunctionCall(
                                    arguments=delta_escaped),
                            )
                        ])

    return None

Step3ToolParser ¶

Bases: ToolParser

Tool parser for a model that uses a specific XML-like format for tool calls. This version uses a robust, stateful, cursor-based streaming parser and consolidates tool arguments into a single message.

Source code in vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py

@ToolParserManager.register_module(["step3"])
class Step3ToolParser(ToolParser):
    """
    Tool parser for a model that uses a specific XML-like format for tool calls.
    This version uses a robust, stateful, cursor-based streaming parser and
    consolidates tool arguments into a single message.

    The tool section is delimited by ``TOOL_CALLS_BEGIN``/``TOOL_CALLS_END``.
    Each call inside it is delimited by ``TOOL_CALL_BEGIN``/``TOOL_CALL_END``
    and carries a ``<steptml:invoke name="...">`` tag plus zero or more
    ``<steptml:parameter name="...">value</steptml:parameter>`` elements.
    """

    TOOL_CALLS_BEGIN = "<｜tool_calls_begin｜>"
    TOOL_CALLS_END = "<｜tool_calls_end｜>"
    TOOL_CALL_BEGIN = "<｜tool_call_begin｜>"
    TOOL_CALL_END = "<｜tool_call_end｜>"
    TOOL_SEP = "<｜tool_sep｜>"
    SPECIAL_TOKENS = [
        TOOL_CALLS_BEGIN, TOOL_CALLS_END, TOOL_CALL_BEGIN, TOOL_CALL_END
    ]

    def __init__(self, tokenizer: AnyTokenizer):
        """Initialise the base parser state and the streaming cursor."""
        super().__init__(tokenizer)
        # Index into the accumulated text up to which streaming output has
        # already been consumed.
        self.position = 0
        # Explicit state flags for robust streaming
        self.tool_block_started = False
        self.tool_block_finished = False

    def adjust_request(
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """
        Turn off ``skip_special_tokens`` when tools may be called.

        The request is modified in place and returned. It is left untouched
        when it has no tools or when ``tool_choice`` is ``'none'``.
        """
        if request.tools and request.tool_choice != 'none':
            # The parser matches the marker strings in the decoded text, so
            # presumably they must not be stripped during detokenization.
            request.skip_special_tokens = False
        return request

    @staticmethod
    def _parse_steptml_invoke(
            action_text: str
    ) -> tuple[Optional[str], Optional[dict[str, str]]]:
        """
        Extract the function name and raw parameters from a steptml invoke.

        Args:
            action_text: Text expected to contain a
                ``<steptml:invoke name="...">`` tag. It may be incomplete
                while streaming.

        Returns:
            ``(None, None)`` when no invoke tag is found. Otherwise the
            function name and a dict mapping each parameter name to its
            whitespace-stripped string value. Only parameters whose closing
            tag is present are included; a repeated name keeps its last value.
        """
        func_name_match = re.search(r'<steptml:invoke name="([^"]+)">',
                                    action_text)
        if not func_name_match:
            return None, None
        func_name = func_name_match.group(1)

        # Match the value lazily up to the first closing tag (DOTALL so it may
        # span lines). A "[^<]*" value pattern would silently drop every
        # parameter whose value contains "<", e.g. code or markup.
        param_matches = re.findall(
            r'<steptml:parameter name="([^"]+)">(.*?)</steptml:parameter>',
            action_text, re.DOTALL)
        params: dict[str, str] = {
            name: value.strip()
            for name, value in param_matches
        }
        return func_name, params

    def _cast_arguments(
        self,
        func_name: str,
        params: dict[str, Any],
        request: ChatCompletionRequest,
    ) -> dict[str, Any]:
        """
        Coerce string parameter values to the types declared by the tool.

        The first tool in ``request.tools`` whose function name equals
        ``func_name`` supplies the schema. ``params`` is updated in place and
        also returned. Non-string values, parameters without a handled
        ``type`` and text that fails to convert are left unchanged.
        """
        for tool in request.tools or []:
            if tool.function.name == func_name:
                schema = tool.function.parameters or {}
                properties = schema.get("properties", {})
                # Only existing keys are reassigned, so mutating the dict
                # while iterating over it is safe.
                for key, value in params.items():
                    if not isinstance(value, str):
                        continue
                    prop = properties.get(key, {})
                    typ = prop.get("type")
                    if typ == "string":
                        params[key] = value.strip()
                    elif typ == "integer":
                        # Keep the original text if it is not an int literal.
                        with contextlib.suppress(ValueError):
                            params[key] = int(value)
                    elif typ == "number":
                        with contextlib.suppress(ValueError):
                            params[key] = float(value)
                    elif typ == "boolean":
                        # Only "true"/"false" (any case) are converted.
                        lower_val = value.lower()
                        params[key] = lower_val == "true" if lower_val in (
                            "true", "false") else value
                    elif typ == "null":
                        params[key] = None if value.lower(
                        ) == "null" else value
                break
        return params

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Produce the next streaming delta from the accumulated model output.

        Only ``current_text`` and ``request`` are read; progress is tracked
        with ``self.position``, an index into ``current_text``. Each call
        returns at most one ``DeltaMessage`` (plain content, a tool name, or
        the complete JSON arguments of a finished tool call), or ``None``
        when more text is needed.
        """

        # The main loop processes the stream from the last known position.
        # Every iteration either returns or consumes a marker and re-loops.
        while True:
            if self.position >= len(current_text):
                return None  # We've processed the entire stream.

            unprocessed_text = current_text[self.position:]

            # STATE: After all tools are done, all subsequent text is content.
            if self.tool_block_finished:
                self.position = len(current_text)
                return DeltaMessage(content=unprocessed_text)

            # STATE: Before the tool block has started.
            if not self.tool_block_started:
                if unprocessed_text.startswith(self.TOOL_CALLS_BEGIN):
                    self.position += len(self.TOOL_CALLS_BEGIN)
                    self.tool_block_started = True
                    continue  # Token consumed, re-loop.

                start_pos = unprocessed_text.find(self.TOOL_CALLS_BEGIN)
                if start_pos == -1:
                    # Hold back text that could be the start of the opening
                    # marker (whitespace-only text also matches, since the
                    # empty string is a prefix of the marker).
                    if self.TOOL_CALLS_BEGIN.startswith(
                            unprocessed_text.strip()) and unprocessed_text:
                        return None  # It's a prefix, wait.
                    self.position = len(current_text)
                    return DeltaMessage(content=unprocessed_text)
                else:
                    # Emit the text before the marker; the marker itself is
                    # handled on a later invocation.
                    content = unprocessed_text[:start_pos]
                    self.position += len(content)
                    return DeltaMessage(content=content)

            # STATE: Inside the main tool block.
            # Skip leading whitespace and advance the cursor past it.
            offset = len(unprocessed_text) - len(unprocessed_text.lstrip())
            unprocessed_text = unprocessed_text.lstrip()
            self.position += offset

            if unprocessed_text.startswith(self.TOOL_CALLS_END):
                self.position += len(self.TOOL_CALLS_END)
                self.tool_block_finished = True
                self.current_tool_id = -1
                continue

            # Check if we are between tool calls.
            tool_finished = (
                self.current_tool_id != -1 and
                self.prev_tool_call_arr[self.current_tool_id].get("finished"))
            if self.current_tool_id == -1 or tool_finished:
                if unprocessed_text.startswith(self.TOOL_CALL_BEGIN):
                    self.position += len(self.TOOL_CALL_BEGIN)
                    if self.current_tool_id == -1:
                        self.current_tool_id = 0
                    else:
                        self.current_tool_id += 1
                    self.current_tool_name_sent = False
                    # Make sure a state slot exists for the new tool call.
                    while len(self.prev_tool_call_arr) <= self.current_tool_id:
                        self.prev_tool_call_arr.append({})
                    self.prev_tool_call_arr[
                        self.current_tool_id]["finished"] = False
                    continue

                # Possibly a partially received TOOL_CALL_BEGIN marker: wait.
                if self.TOOL_CALL_BEGIN.startswith(unprocessed_text):
                    return None

            # STATE: Parsing an active tool call.
            if self.current_tool_id != -1 and not self.prev_tool_call_arr[
                    self.current_tool_id].get("finished", False):
                end_tool_pos = unprocessed_text.find(self.TOOL_CALL_END)
                if end_tool_pos == -1:
                    tool_body = unprocessed_text
                else:
                    tool_body = unprocessed_text[:end_tool_pos]

                # The body so far may be the beginning of the end marker.
                if end_tool_pos == -1 and self.TOOL_CALL_END.startswith(
                        tool_body):
                    return None

                function_name, arguments = self._parse_steptml_invoke(
                    tool_body)
                # NOTE(review): this also returns when the call is already
                # terminated but has no invoke tag; the cursor then never
                # moves past that call - confirm this is intended.
                if not function_name:
                    return None

                tool_call_arr = {
                    "name": function_name,
                    "parameters": arguments or {}
                }

                # Send the function name as soon as it's parsed.
                # The cursor is not advanced here, so the same body is parsed
                # again on the next invocation to deliver the arguments.
                if not self.current_tool_name_sent:
                    self.current_tool_name_sent = True
                    self.prev_tool_call_arr[self.current_tool_id].update(
                        tool_call_arr)
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      type="function",
                                      id=f"chatcmpl-tool-{random_uuid()}",
                                      function=DeltaFunctionCall(
                                          name=function_name))
                    ])

                # Update our internal state with the latest parsed arguments.
                self.prev_tool_call_arr[
                    self.current_tool_id].update(  # noqa: E501
                        tool_call_arr)

                # Only send arguments when the tool call is complete.
                if end_tool_pos != -1:
                    self.position += end_tool_pos + len(self.TOOL_CALL_END)
                    self.prev_tool_call_arr[
                        self.current_tool_id]["finished"] = True

                    final_args = self._cast_arguments(
                        function_name,
                        tool_call_arr.get("parameters", {}),  # type: ignore
                        request)
                    # An empty parameter dict produces no arguments delta.
                    if final_args:
                        final_args_json = json.dumps(final_args,
                                                     ensure_ascii=False)
                        return DeltaMessage(tool_calls=[
                            DeltaToolCall(index=self.current_tool_id,
                                          function=DeltaFunctionCall(
                                              arguments=final_args_json))
                        ])

                # If tool is not finished, return None to wait for more tokens.
                return None

            return None

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract all tool calls from a complete model output.

        Returns the output unchanged as content (``tools_called=False``) when
        the opening or closing tool-block marker is missing or when no call
        could be parsed. Otherwise returns the parsed calls, with the text
        around the tool block (stripped, ``None`` if empty) as content.
        """
        if self.TOOL_CALLS_BEGIN not in model_output:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        pre_text, rest = model_output.split(self.TOOL_CALLS_BEGIN, 1)
        if self.TOOL_CALLS_END not in rest:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

        tool_block, post_text = rest.split(self.TOOL_CALLS_END, 1)
        content = (pre_text + post_text).strip()

        tool_calls: list[ToolCall] = []
        call_parts = tool_block.split(self.TOOL_CALL_BEGIN)

        for part in call_parts:
            # Skip the text before the first call and unterminated calls.
            if not part or self.TOOL_CALL_END not in part:
                continue

            call_content = part.split(self.TOOL_CALL_END, 1)[0]
            if self.TOOL_SEP not in call_content:
                continue

            # The text before the separator is the call type; only
            # "function" calls are handled.
            type_part, invoke_part = call_content.split(self.TOOL_SEP, 1)
            if type_part.strip() != "function":
                continue

            function_name, params_dict = self._parse_steptml_invoke(
                invoke_part)

            if function_name and params_dict is not None:
                params_dict = self._cast_arguments(function_name, params_dict,
                                                   request)
                params_str = json.dumps(params_dict, ensure_ascii=False)
                tool_calls.append(
                    ToolCall(function=FunctionCall(name=function_name,
                                                   arguments=params_str)))
        if tool_calls:
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if content else None)
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

SPECIAL_TOKENS `class-attribute` `instance-attribute` ¶

SPECIAL_TOKENS = [
    TOOL_CALLS_BEGIN,
    TOOL_CALLS_END,
    TOOL_CALL_BEGIN,
    TOOL_CALL_END,
]

TOOL_CALLS_BEGIN `class-attribute` `instance-attribute` ¶

TOOL_CALLS_BEGIN = '<｜tool_calls_begin｜>'

TOOL_CALLS_END `class-attribute` `instance-attribute` ¶

TOOL_CALLS_END = '<｜tool_calls_end｜>'

TOOL_CALL_BEGIN `class-attribute` `instance-attribute` ¶

TOOL_CALL_BEGIN = '<｜tool_call_begin｜>'

TOOL_CALL_END `class-attribute` `instance-attribute` ¶

TOOL_CALL_END = '<｜tool_call_end｜>'

TOOL_SEP `class-attribute` `instance-attribute` ¶

TOOL_SEP = '<｜tool_sep｜>'

position `instance-attribute` ¶

position = 0

tool_block_finished `instance-attribute` ¶

tool_block_finished = False

tool_block_started `instance-attribute` ¶

tool_block_started = False

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Initialise the base parser, then reset the streaming state."""
    super().__init__(tokenizer)
    # Explicit state flags for robust streaming: neither the opening nor the
    # closing tool-block marker has been consumed yet.
    self.tool_block_started = self.tool_block_finished = False
    # Cursor into the accumulated output; nothing has been consumed so far.
    self.position = 0

_cast_arguments ¶

_cast_arguments(
    func_name: str,
    params: dict[str, Any],
    request: ChatCompletionRequest,
) -> dict[str, Any]

Source code in vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py

def _cast_arguments(
    self,
    func_name: str,
    params: dict[str, Any],
    request: ChatCompletionRequest,
) -> dict[str, Any]:
    for tool in request.tools or []:
        if tool.function.name == func_name:
            schema = tool.function.parameters or {}
            properties = schema.get("properties", {})
            for key, value in params.items():
                if not isinstance(value, str):
                    continue
                prop = properties.get(key, {})
                typ = prop.get("type")
                if typ == "string":
                    params[key] = value.strip()
                elif typ == "integer":
                    with contextlib.suppress(ValueError):
                        params[key] = int(value)
                elif typ == "number":
                    with contextlib.suppress(ValueError):
                        params[key] = float(value)
                elif typ == "boolean":
                    lower_val = value.lower()
                    params[key] = lower_val == "true" if lower_val in (
                        "true", "false") else value
                elif typ == "null":
                    params[key] = None if value.lower(
                    ) == "null" else value
            break
    return params

_parse_steptml_invoke `staticmethod` ¶

_parse_steptml_invoke(
    action_text: str,
) -> tuple[Optional[str], Optional[dict[str, str]]]

Source code in vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py

@staticmethod
def _parse_steptml_invoke(
        action_text: str
) -> tuple[Optional[str], Optional[dict[str, str]]]:
    """
    Extract the function name and raw parameters from a steptml invoke.

    Args:
        action_text: Text expected to contain a
            ``<steptml:invoke name="...">`` tag. It may be incomplete, in
            which case only fully closed parameters are reported.

    Returns:
        ``(None, None)`` when no invoke tag is found. Otherwise the function
        name and a dict mapping each parameter name to its
        whitespace-stripped string value. A repeated name keeps its last
        value.
    """
    func_name_match = re.search(r'<steptml:invoke name="([^"]+)">',
                                action_text)
    if not func_name_match:
        return None, None
    func_name = func_name_match.group(1)

    # Match the value lazily up to the first closing tag (DOTALL so it may
    # span lines). A "[^<]*" value pattern would silently drop every
    # parameter whose value contains "<", e.g. code or markup.
    param_matches = re.findall(
        r'<steptml:parameter name="([^"]+)">(.*?)</steptml:parameter>',
        action_text, re.DOTALL)
    params: dict[str, str] = {
        name: value.strip()
        for name, value in param_matches
    }
    return func_name, params

adjust_request ¶

adjust_request(
    request: ChatCompletionRequest,
) -> ChatCompletionRequest

Source code in vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py

def adjust_request(
        self, request: ChatCompletionRequest) -> ChatCompletionRequest:
    """
    Turn off ``skip_special_tokens`` when the request may call tools.

    The request is modified in place and returned. It is left untouched when
    it has no tools or when ``tool_choice`` is ``'none'``.
    """
    tools_enabled = bool(request.tools) and request.tool_choice != 'none'
    if tools_enabled:
        # Presumably needed so the tool-call marker tokens remain visible in
        # the decoded text - confirm against the tokenizer configuration.
        request.skip_special_tokens = False
    return request

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Source code in vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py

def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """
    Extract all tool calls from a complete model output.

    The output is returned unchanged as content (``tools_called=False``)
    when the opening or closing tool-block marker is missing, or when no
    call inside the block could be parsed. Otherwise the parsed calls are
    returned, with the stripped text surrounding the tool block as content
    (``None`` when that text is empty).
    """

    def _plain_text() -> ExtractedToolCallInformation:
        # Shared result for every "no usable tool call" outcome.
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    pre_text, begin_marker, remainder = model_output.partition(
        self.TOOL_CALLS_BEGIN)
    if not begin_marker:
        return _plain_text()

    tool_block, end_marker, post_text = remainder.partition(
        self.TOOL_CALLS_END)
    if not end_marker:
        return _plain_text()

    parsed_calls: list[ToolCall] = []
    for segment in tool_block.split(self.TOOL_CALL_BEGIN):
        # Empty segments and unterminated calls have no end marker.
        call_body, call_end, _ = segment.partition(self.TOOL_CALL_END)
        if not call_end:
            continue

        # The text before the separator is the call type; only "function"
        # calls are handled.
        call_type, separator, invoke_text = call_body.partition(
            self.TOOL_SEP)
        if not separator or call_type.strip() != "function":
            continue

        name, raw_params = self._parse_steptml_invoke(invoke_text)
        if not name or raw_params is None:
            continue

        typed_params = self._cast_arguments(name, raw_params, request)
        encoded = json.dumps(typed_params, ensure_ascii=False)
        parsed_calls.append(
            ToolCall(function=FunctionCall(name=name, arguments=encoded)))

    if not parsed_calls:
        return _plain_text()

    surrounding = (pre_text + post_text).strip()
    return ExtractedToolCallInformation(tools_called=True,
                                        tool_calls=parsed_calls,
                                        content=surrounding or None)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Source code in vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """
    Produce the next streaming delta from the accumulated model output.

    Only ``current_text`` and ``request`` are read; progress is tracked with
    ``self.position``, an index into ``current_text``. Each call returns at
    most one ``DeltaMessage`` (plain content, a tool name, or the complete
    JSON arguments of a finished tool call), or ``None`` when more text is
    needed.
    """

    # The main loop processes the stream from the last known position.
    # Every iteration either returns or consumes a marker and re-loops.
    while True:
        if self.position >= len(current_text):
            return None  # We've processed the entire stream.

        unprocessed_text = current_text[self.position:]

        # STATE: After all tools are done, all subsequent text is content.
        if self.tool_block_finished:
            self.position = len(current_text)
            return DeltaMessage(content=unprocessed_text)

        # STATE: Before the tool block has started.
        if not self.tool_block_started:
            if unprocessed_text.startswith(self.TOOL_CALLS_BEGIN):
                self.position += len(self.TOOL_CALLS_BEGIN)
                self.tool_block_started = True
                continue  # Token consumed, re-loop.

            start_pos = unprocessed_text.find(self.TOOL_CALLS_BEGIN)
            if start_pos == -1:
                # Hold back text that could be the start of the opening
                # marker (whitespace-only text also matches, since the empty
                # string is a prefix of the marker).
                if self.TOOL_CALLS_BEGIN.startswith(
                        unprocessed_text.strip()) and unprocessed_text:
                    return None  # It's a prefix, wait.
                self.position = len(current_text)
                return DeltaMessage(content=unprocessed_text)
            else:
                # Emit the text before the marker; the marker itself is
                # handled on a later invocation.
                content = unprocessed_text[:start_pos]
                self.position += len(content)
                return DeltaMessage(content=content)

        # STATE: Inside the main tool block.
        # Skip leading whitespace and advance the cursor past it.
        offset = len(unprocessed_text) - len(unprocessed_text.lstrip())
        unprocessed_text = unprocessed_text.lstrip()
        self.position += offset

        if unprocessed_text.startswith(self.TOOL_CALLS_END):
            self.position += len(self.TOOL_CALLS_END)
            self.tool_block_finished = True
            self.current_tool_id = -1
            continue

        # Check if we are between tool calls.
        tool_finished = (
            self.current_tool_id != -1 and
            self.prev_tool_call_arr[self.current_tool_id].get("finished"))
        if self.current_tool_id == -1 or tool_finished:
            if unprocessed_text.startswith(self.TOOL_CALL_BEGIN):
                self.position += len(self.TOOL_CALL_BEGIN)
                if self.current_tool_id == -1:
                    self.current_tool_id = 0
                else:
                    self.current_tool_id += 1
                self.current_tool_name_sent = False
                # Make sure a state slot exists for the new tool call.
                while len(self.prev_tool_call_arr) <= self.current_tool_id:
                    self.prev_tool_call_arr.append({})
                self.prev_tool_call_arr[
                    self.current_tool_id]["finished"] = False
                continue

            # Possibly a partially received TOOL_CALL_BEGIN marker: wait.
            if self.TOOL_CALL_BEGIN.startswith(unprocessed_text):
                return None

        # STATE: Parsing an active tool call.
        if self.current_tool_id != -1 and not self.prev_tool_call_arr[
                self.current_tool_id].get("finished", False):
            end_tool_pos = unprocessed_text.find(self.TOOL_CALL_END)
            if end_tool_pos == -1:
                tool_body = unprocessed_text
            else:
                tool_body = unprocessed_text[:end_tool_pos]

            # The body so far may be the beginning of the end marker.
            if end_tool_pos == -1 and self.TOOL_CALL_END.startswith(
                    tool_body):
                return None

            function_name, arguments = self._parse_steptml_invoke(
                tool_body)
            # NOTE(review): this also returns when the call is already
            # terminated but has no invoke tag; the cursor then never moves
            # past that call - confirm this is intended.
            if not function_name:
                return None

            tool_call_arr = {
                "name": function_name,
                "parameters": arguments or {}
            }

            # Send the function name as soon as it's parsed.
            # The cursor is not advanced here, so the same body is parsed
            # again on the next invocation to deliver the arguments.
            if not self.current_tool_name_sent:
                self.current_tool_name_sent = True
                self.prev_tool_call_arr[self.current_tool_id].update(
                    tool_call_arr)
                return DeltaMessage(tool_calls=[
                    DeltaToolCall(index=self.current_tool_id,
                                  type="function",
                                  id=f"chatcmpl-tool-{random_uuid()}",
                                  function=DeltaFunctionCall(
                                      name=function_name))
                ])

            # Update our internal state with the latest parsed arguments.
            self.prev_tool_call_arr[
                self.current_tool_id].update(  # noqa: E501
                    tool_call_arr)

            # Only send arguments when the tool call is complete.
            if end_tool_pos != -1:
                self.position += end_tool_pos + len(self.TOOL_CALL_END)
                self.prev_tool_call_arr[
                    self.current_tool_id]["finished"] = True

                final_args = self._cast_arguments(
                    function_name,
                    tool_call_arr.get("parameters", {}),  # type: ignore
                    request)
                # An empty parameter dict produces no arguments delta.
                if final_args:
                    final_args_json = json.dumps(final_args,
                                                 ensure_ascii=False)
                    return DeltaMessage(tool_calls=[
                        DeltaToolCall(index=self.current_tool_id,
                                      function=DeltaFunctionCall(
                                          arguments=final_args_json))
                    ])

            # If tool is not finished, return None to wait for more tokens.
            return None

        return None

ToolParser ¶

Abstract ToolParser class that should not be used directly. Provided properties and methods should be used in derived classes.

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

class ToolParser:
    """
    Base class for tool parsers; it should not be used directly.
    Derived classes are expected to use the state and hooks provided here.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Store the tokenizer and reset the streaming bookkeeping state."""
        self.model_tokenizer = tokenizer

        self.current_tool_name_sent: bool = False
        # the index of the tool call that is currently being parsed
        self.current_tool_id: int = -1
        self.prev_tool_call_arr: list[dict] = []
        self.streamed_args_for_tool: list[str] = []

    @cached_property
    def vocab(self) -> dict[str, int]:
        """Vocabulary of the tokenizer, computed once per parser instance."""
        # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
        # whereas all tokenizers have .get_vocab()
        tokenizer = self.model_tokenizer
        return tokenizer.get_vocab()

    def adjust_request(
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """
        Hook for adjusting the request parameters.

        The base implementation returns ``request`` unchanged.
        """
        return request

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """
        Extract tool calls from a complete model-generated string.

        Should be implemented by derived classes. Used for non-streaming
        responses where we have the entire model response available before
        sending to the client.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        message = ("AbstractToolParser.extract_tool_calls has not been "
                   "implemented!")
        raise NotImplementedError(message)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Extract tool calls from an incomplete response while streaming.

        Should be implemented by derived classes. It has to be an instance
        method because it requires state - the current tokens/diffs, but
        also the information about what has previously been parsed and
        extracted (see constructor).

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        message = ("AbstractToolParser.extract_tool_calls_streaming has not "
                   "been implemented!")
        raise NotImplementedError(message)

current_tool_id `instance-attribute` ¶

current_tool_id: int = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent: bool = False

model_tokenizer `instance-attribute` ¶

model_tokenizer = tokenizer

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr: list[dict] = []

streamed_args_for_tool `instance-attribute` ¶

streamed_args_for_tool: list[str] = []

vocab `cached` `property` ¶

vocab: dict[str, int]

__init__ ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Store the tokenizer and reset the streaming bookkeeping state."""
    self.model_tokenizer = tokenizer

    self.current_tool_name_sent: bool = False
    # the index of the tool call that is currently being parsed
    self.current_tool_id: int = -1
    self.prev_tool_call_arr: list[dict] = []
    self.streamed_args_for_tool: list[str] = []

adjust_request ¶

adjust_request(
    request: ChatCompletionRequest,
) -> ChatCompletionRequest

Hook used to adjust the request parameters. It is an instance method; the base implementation returns the request unchanged.

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

def adjust_request(
        self, request: ChatCompletionRequest) -> ChatCompletionRequest:
    """
    Hook for adjusting the request parameters.

    The base implementation returns ``request`` unchanged.
    """
    return request

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Instance method that should be implemented for extracting tool calls from a complete model-generated string. Used for non-streaming responses where we have the entire model response available before sending to the client. The base implementation raises NotImplementedError.

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """
    Extract tool calls from a complete model-generated string.

    Should be implemented by derived classes. Used for non-streaming
    responses where we have the entire model response available before
    sending to the client.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError(
        "AbstractToolParser.extract_tool_calls has not been implemented!")

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Instance method that should be implemented for extracting tool calls from an incomplete response; for use when handling tool calls and streaming. Has to be an instance method because it requires state - the current tokens/diffs, but also the information about what has previously been parsed and extracted (see constructor)

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """
    Extract tool calls from an incomplete response while streaming.

    Should be implemented by derived classes. It has to be an instance
    method because it requires state - the current tokens/diffs, but also
    the information about what has previously been parsed and extracted
    (see constructor).

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError(
        "AbstractToolParser.extract_tool_calls_streaming has not been "
        "implemented!")

ToolParserManager ¶

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

class ToolParserManager:
    """Registry that maps names to tool parser classes."""

    # Shared, class-level mapping: registered name -> parser class.
    tool_parsers: dict[str, type] = {}

    @classmethod
    def get_tool_parser(cls, name) -> type:
        """
        Look up a tool parser class registered through `register_module`.

        Raises:
            KeyError: if nothing is registered under `name`.
        """
        if name not in cls.tool_parsers:
            raise KeyError(f"tool helper: '{name}' not found in tool_parsers")

        return cls.tool_parsers[name]

    @classmethod
    def _register_module(cls,
                         module: type,
                         module_name: Optional[Union[str, list[str]]] = None,
                         force: bool = True) -> None:
        """
        Store `module` in the registry under one or several names.

        When `module_name` is None the class's own `__name__` is used.

        Raises:
            TypeError: if `module` is not a subclass of ToolParser.
            KeyError: if `force` is False and a name is already taken.
        """
        if not issubclass(module, ToolParser):
            raise TypeError(
                f'module must be subclass of ToolParser, but got {type(module)}'
            )

        aliases = module.__name__ if module_name is None else module_name
        if isinstance(aliases, str):
            aliases = [aliases]

        registry = cls.tool_parsers
        for alias in aliases:
            # Only refuse to overwrite when the caller asked us not to.
            if not force and alias in registry:
                raise KeyError(f'{alias} is already registered '
                               f'at {registry[alias].__module__}')
            registry[alias] = module

    @classmethod
    def register_module(
            cls,
            name: Optional[Union[str, list[str]]] = None,
            force: bool = True,
            module: Union[type, None] = None) -> Union[type, Callable]:
        """
        Register a module under the given name or list of names.

        Usable as a decorator (leave `module` as None) or as a normal
        function call (pass the class as `module`).

        Raises:
            TypeError: if `force` is not a bool, or `name` is neither None,
                a str, nor a list of str.
        """
        if not isinstance(force, bool):
            raise TypeError(f'force must be a boolean, but got {type(force)}')

        # Validate the name eagerly so the decorator form fails at
        # decoration time rather than when the class is finally registered.
        name_is_valid = (name is None or isinstance(name, str)
                         or is_list_of(name, str))
        if not name_is_valid:
            raise TypeError(
                'name must be None, an instance of str, or a sequence of str, '
                f'but got {type(name)}')

        def _register(target):
            cls._register_module(module=target, module_name=name, force=force)
            return target

        # Decorator form: @x.register_module()
        if module is None:
            return _register

        # Direct form: x.register_module(module=SomeClass)
        return _register(module)

    @classmethod
    def import_tool_parser(cls, plugin_path: str) -> None:
        """
        Import a user-defined tool parser from the file at `plugin_path`.

        The module name is the file's base name without its extension.
        Import failures are logged and swallowed.
        """
        file_name = os.path.basename(plugin_path)
        module_name, _extension = os.path.splitext(file_name)

        try:
            import_from_path(module_name, plugin_path)
        except Exception:
            logger.exception("Failed to load module '%s' from %s.",
                             module_name, plugin_path)

tool_parsers `class-attribute` `instance-attribute` ¶

tool_parsers: dict[str, type] = {}

_register_module `classmethod` ¶

_register_module(
    module: type,
    module_name: Optional[Union[str, list[str]]] = None,
    force: bool = True,
) -> None

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

@classmethod
def _register_module(cls,
                     module: type,
                     module_name: Optional[Union[str, list[str]]] = None,
                     force: bool = True) -> None:
    """
    Store `module` in the registry under one or several names.

    When `module_name` is None the class's own `__name__` is used.

    Raises:
        TypeError: if `module` is not a subclass of ToolParser.
        KeyError: if `force` is False and a name is already taken.
    """
    if not issubclass(module, ToolParser):
        raise TypeError(
            f'module must be subclass of ToolParser, but got {type(module)}'
        )

    aliases = module.__name__ if module_name is None else module_name
    if isinstance(aliases, str):
        aliases = [aliases]

    registry = cls.tool_parsers
    for alias in aliases:
        # Only refuse to overwrite when the caller asked us not to.
        if not force and alias in registry:
            raise KeyError(f'{alias} is already registered '
                           f'at {registry[alias].__module__}')
        registry[alias] = module

get_tool_parser `classmethod` ¶

get_tool_parser(name) -> type

Get tool parser by name which is registered by register_module.

Raise a KeyError exception if the name is not registered.

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

@classmethod
def get_tool_parser(cls, name) -> type:
    """
    Look up a tool parser class registered through `register_module`.

    Raises:
        KeyError: if nothing is registered under `name`.
    """
    if name not in cls.tool_parsers:
        raise KeyError(f"tool helper: '{name}' not found in tool_parsers")

    return cls.tool_parsers[name]

import_tool_parser `classmethod` ¶

import_tool_parser(plugin_path: str) -> None

Import a user-defined tool parser from the path of the file that defines it.

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

@classmethod
def import_tool_parser(cls, plugin_path: str) -> None:
    """
    Import a user-defined tool parser from the file at `plugin_path`.

    The module name is the file's base name without its extension.
    Import failures are logged and swallowed.
    """
    file_name = os.path.basename(plugin_path)
    module_name, _extension = os.path.splitext(file_name)

    try:
        import_from_path(module_name, plugin_path)
    except Exception:
        logger.exception("Failed to load module '%s' from %s.",
                         module_name, plugin_path)

register_module `classmethod` ¶

register_module(
    name: Optional[Union[str, list[str]]] = None,
    force: bool = True,
    module: Union[type, None] = None,
) -> Union[type, Callable]

Register a module with the given name or list of names. It can be used as a decorator (with module left as None) or as a normal function (with module set to the class to register).

Source code in vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py

@classmethod
def register_module(
        cls,
        name: Optional[Union[str, list[str]]] = None,
        force: bool = True,
        module: Union[type, None] = None) -> Union[type, Callable]:
    """
    Register a module under the given name or list of names.

    Usable as a decorator (leave `module` as None) or as a normal
    function call (pass the class as `module`).

    Raises:
        TypeError: if `force` is not a bool, or `name` is neither None,
            a str, nor a list of str.
    """
    if not isinstance(force, bool):
        raise TypeError(f'force must be a boolean, but got {type(force)}')

    # Validate the name eagerly so the decorator form fails at decoration
    # time rather than when the class is finally registered.
    name_is_valid = (name is None or isinstance(name, str)
                     or is_list_of(name, str))
    if not name_is_valid:
        raise TypeError(
            'name must be None, an instance of str, or a sequence of str, '
            f'but got {type(name)}')

    def _register(target):
        cls._register_module(module=target, module_name=name, force=force)
        return target

    # Decorator form: @x.register_module()
    if module is None:
        return _register

    # Direct form: x.register_module(module=SomeClass)
    return _register(module)

xLAMToolParser ¶

Bases: ToolParser

Source code in vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py

@ToolParserManager.register_module("xlam")
class xLAMToolParser(ToolParser):

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up the preprocessing regexes and the streaming bookkeeping."""
        super().__init__(tokenizer)

        # Patterns that may wrap a JSON tool-call payload, tried in order:
        # a fenced code block, a [TOOL_CALLS] marker, a <tool_call> tag.
        fenced_block = r"```(?:json)?\s*([\s\S]*?)```"
        tool_calls_marker = r"\[TOOL_CALLS\]([\s\S]*?)(?=\n|$)"
        tool_call_tag = r"<tool_call>([\s\S]*?)</tool_call>"
        self.json_code_block_patterns = [
            fenced_block,
            tool_calls_marker,
            tool_call_tag,
        ]
        # Captures everything that follows a closing think tag.
        self.thinking_tag_pattern = r"</think>([\s\S]*)"

        # Streaming-mode state.
        self.prev_tool_calls: list[dict] = []
        self.current_tool_id = -1
        self.current_tool_name_sent = False
        # Arguments already sent, one string per tool.
        self.streamed_args: list[str] = []

        # Kept for backward compatibility with tests.
        self.current_tools_sent: list[bool] = []

        # Kept for backward compatibility with serving code.
        self.prev_tool_call_arr = []

        # Per-tool progress used by the streaming extractor.
        self.streaming_state: dict[str, Any] = {
            "current_tool_index": -1,
            "tool_ids": [],
            "sent_tools": [],
        }

    def preprocess_model_output(
            self, model_output: str) -> tuple[Optional[str], Optional[str]]:
        """
        Split raw model output into plain content and a tool-call payload.

        Candidates are checked in this order: the text following a
        ``</think>`` tag, any fenced/marked/tagged block anywhere in the
        output, and finally the raw output itself when it starts with ``[``.

        Returns:
            Tuple of (content, potential_tool_calls_json). The second item
            is None when nothing resembling tool calls was found; the first
            item is then the unmodified output.
        """

        def parses_as_json(candidate: str) -> bool:
            try:
                json.loads(candidate)
            except json.JSONDecodeError:
                return False
            return True

        # Stage 1: payload placed after a thinking section.
        think_match = re.search(self.thinking_tag_pattern, model_output)
        if think_match:
            cut = think_match.start() + len("</think>")
            leading = model_output[:cut].strip()
            trailing = think_match.group(1).strip()

            if parses_as_json(trailing):
                return leading, trailing

            # Not plain JSON: look for a wrapped JSON payload instead.
            for pattern in self.json_code_block_patterns:
                for candidate in re.findall(pattern, trailing):
                    if parses_as_json(candidate):
                        return leading, candidate

        # Stage 2: wrapped payload anywhere in the whole output.
        for pattern in self.json_code_block_patterns:
            for candidate in re.findall(pattern, model_output):
                if parses_as_json(candidate):
                    # Content is what remains once the wrapper is removed.
                    remainder = re.sub(pattern, "", model_output).strip()
                    return remainder, candidate

        # Stage 3: the output itself is (or is becoming) a JSON array.
        if model_output.strip().startswith("["):
            if parses_as_json(model_output):
                return None, model_output
            # Possibly a tool call still in progress: accept it when the
            # expected markers are already present.
            markers = ("{", "name", "arguments")
            if all(marker in model_output for marker in markers):
                return None, model_output

        # Nothing looked like a tool call: everything is content.
        return model_output, None

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        """
        Extract tool calls from a complete model output.

        The output is first split by `preprocess_model_output`; the
        candidate payload must decode to a JSON array whose entries are
        objects with both a "name" and an "arguments" key. Entries that do
        not match are skipped.

        Args:
            model_output: The full text generated by the model.
            request: The originating request (not read by this method).

        Returns:
            ExtractedToolCallInformation with `tools_called` set to True
            only when at least one valid tool call was found. On any
            error the exception is logged and the whole output is returned
            as plain content.
        """
        try:
            # Preprocess the model output
            content, potential_tool_calls = self.preprocess_model_output(
                model_output)

            if not potential_tool_calls:
                return ExtractedToolCallInformation(tools_called=False,
                                                    tool_calls=[],
                                                    content=content)

            # Parse the potential tool calls as JSON
            tool_calls_data = json.loads(potential_tool_calls)

            # Ensure it's an array
            if not isinstance(tool_calls_data, list):
                logger.debug("Tool calls data is not an array")
                return ExtractedToolCallInformation(
                    tools_called=False,
                    tool_calls=[],
                    content=content or model_output,
                )

            tool_calls: list[ToolCall] = []

            for idx, call in enumerate(tool_calls_data):
                if (not isinstance(call, dict) or "name" not in call
                        or "arguments" not in call):
                    logger.debug("Invalid tool call format at index %d", idx)
                    continue

                # OpenAI-style tool calls carry their arguments as a
                # JSON-encoded string. Pass strings through untouched and
                # serialize every other JSON value (not only dicts), so a
                # single entry with list/number/null arguments does not
                # abort extraction of all the other tool calls.
                raw_arguments = call["arguments"]
                if isinstance(raw_arguments, str):
                    arguments = raw_arguments
                else:
                    arguments = json.dumps(raw_arguments)

                tool_call = ToolCall(
                    id=f"call_{idx}_{random_uuid()}",
                    type="function",
                    function=FunctionCall(
                        name=call["name"],
                        arguments=arguments,
                    ),
                )
                tool_calls.append(tool_call)

            return ExtractedToolCallInformation(
                tools_called=len(tool_calls) > 0,
                tool_calls=tool_calls,
                content=content,
            )

        except Exception as e:
            # Best effort: fall back to treating everything as content.
            logger.exception("Error extracting tool calls: %s", str(e))
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Extract tool calls for streaming mode.

        Each call inspects the text accumulated so far and emits at most one
        delta: first a tool's name (with a freshly generated id), then the
        opening "{" of its arguments, then whatever part of the arguments
        has not been sent yet. Progress is tracked in
        `self.streaming_state` and mirrored into `current_tool_id`,
        `current_tool_name_sent`, `streamed_args` and `prev_tool_call_arr`.

        Only `current_text` and `delta_text` are read; the remaining
        parameters are not used by this implementation.

        Returns:
            A DeltaMessage carrying `delta_text` as plain content when the
            accumulated text does not look like a tool-call block (or when
            an exception occurs), a DeltaMessage carrying one tool-call
            delta when there is something new to send, and None otherwise.
        """
        # First, check for a definitive start of a tool call block.
        # This prevents premature parsing of incomplete output.
        stripped_text = current_text.strip()
        # NOTE(review): preprocessed_content is never read below.
        preprocessed_content, preprocessed_tool_calls = (
            self.preprocess_model_output(current_text))

        # For JSON code blocks, we need to detect them earlier, even if incomplete
        has_potential_json_block = ("```json" in current_text
                                    or "```\n[" in current_text
                                    or "[TOOL_CALLS]" in current_text
                                    or "<tool_call>" in current_text)

        is_tool_call_block = (
            stripped_text.startswith("[")
            or stripped_text.startswith("<tool_call>")
            or stripped_text.startswith("[TOOL_CALLS]") or
            # Check if we have thinking tags with JSON-like content following
            ("</think>[" in current_text) or
            # Check if the text contains a JSON array after preprocessing
            preprocessed_tool_calls is not None or
            # For JSON code blocks, detect early if we see enough structure
            (has_potential_json_block and '"name"' in current_text
             and '"arguments"' in current_text))

        # Not a tool call: pass the new text straight through as content.
        if not is_tool_call_block:
            return DeltaMessage(content=delta_text)

        try:
            # Initialize streaming state if not exists
            if not hasattr(self, "streaming_state"):
                self.streaming_state = {
                    "current_tool_index": -1,
                    "tool_ids": [],
                    "sent_tools": [],  # Track complete state of each tool
                }

            # Try parsing as JSON to check for complete tool calls
            try:
                # Use preprocessed tool calls if available
                tool_calls_text = (preprocessed_tool_calls if
                                   preprocessed_tool_calls else current_text)
                parsed_tools = json.loads(tool_calls_text)
                if isinstance(parsed_tools, list):
                    # Update our tool array for next time
                    self.prev_tool_call_arr = parsed_tools
            except json.JSONDecodeError:
                # Not complete JSON yet, use regex for partial parsing
                pass

            # Check for test-specific state setup (current_tools_sent)
            # This handles the case where tests manually set current_tools_sent
            if (hasattr(self, "current_tools_sent")  # type: ignore
                    and len(self.current_tools_sent) > 0):
                # If current_tools_sent is set to [False], it means the test wants us to send the name
                if (len(self.current_tools_sent) == 1
                        and self.current_tools_sent[0] is False):
                    # Extract the function name using regex
                    name_pattern = r'"name"\s*:\s*"([^"]+)"'
                    name_match = re.search(name_pattern, current_text)
                    if name_match:
                        function_name = name_match.group(1)

                        # The test expects us to send just the name first
                        tool_id = make_tool_call_id()
                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=0,
                                type="function",
                                id=tool_id,
                                function=DeltaFunctionCall(
                                    name=function_name).model_dump(
                                        exclude_none=True),  # type: ignore
                            )
                        ])
                        # Update state to reflect that we've sent the name
                        self.current_tools_sent = [True]
                        self.current_tool_id = 0
                        self.streaming_state["current_tool_index"] = 0
                        if len(self.streaming_state["sent_tools"]) == 0:
                            self.streaming_state["sent_tools"].append({
                                "sent_name":
                                True,
                                "sent_arguments_prefix":
                                False,
                                "sent_arguments":
                                "",
                            })
                        else:
                            self.streaming_state["sent_tools"][0][
                                "sent_name"] = True
                        self.current_tool_name_sent = True
                        return delta

            # Use regex to identify tool calls in the output
            # Use preprocessed tool calls text for better parsing, but also try to extract from incomplete JSON blocks
            search_text = (preprocessed_tool_calls
                           if preprocessed_tool_calls else current_text)

            # For JSON code blocks that aren't complete yet, try to extract the JSON content
            if not preprocessed_tool_calls and has_potential_json_block:
                # Try to extract the JSON array from within the code block
                json_match = re.search(r"```(?:json)?\s*([\s\S]*?)(?:```|$)",
                                       current_text)
                if json_match:
                    potential_json = json_match.group(1).strip()
                    # Use this as search text even if it's incomplete
                    if potential_json.startswith("[") and (
                            '"name"' in potential_json
                            and '"arguments"' in potential_json):
                        search_text = potential_json

            # Try to find complete tool names first
            name_pattern = r'"name"\s*:\s*"([^"]+)"'
            name_matches = list(re.finditer(name_pattern, search_text))
            # One tool is counted per complete "name" key found so far.
            tool_count = len(name_matches)

            # If no complete tool names found, check for partial tool names
            if tool_count == 0:
                # Check if we're in the middle of parsing a tool name
                partial_name_pattern = r'"name"\s*:\s*"([^"]*)'
                partial_matches = list(
                    re.finditer(partial_name_pattern, search_text))
                # Both branches return None; they differ only in intent.
                if partial_matches:
                    # We have a partial tool name - not ready to emit yet
                    return None
                else:
                    # No tools found at all
                    return None

            # Ensure our state arrays are large enough
            while len(self.streaming_state["sent_tools"]) < tool_count:
                self.streaming_state["sent_tools"].append({
                    "sent_name":
                    False,
                    "sent_arguments_prefix":
                    False,
                    "sent_arguments":
                    "",
                })

            while len(self.streaming_state["tool_ids"]) < tool_count:
                self.streaming_state["tool_ids"].append(None)

            # Determine if we need to move to a new tool
            current_idx = self.streaming_state["current_tool_index"]

            # If we haven't processed any tool yet or current tool is complete, move to next
            if current_idx == -1 or current_idx < tool_count - 1:
                next_idx = current_idx + 1

                # If tool at next_idx has not been sent yet
                if (next_idx < tool_count
                        and not self.streaming_state["sent_tools"][next_idx]
                    ["sent_name"]):
                    # Update indexes
                    self.streaming_state["current_tool_index"] = next_idx
                    self.current_tool_id = (
                        next_idx  # For backward compatibility
                    )
                    current_idx = next_idx

                    # Extract the tool name
                    tool_name = name_matches[current_idx].group(1)

                    # Generate ID and send tool name
                    tool_id = f"call_{current_idx}_{random_uuid()}"
                    self.streaming_state["tool_ids"][current_idx] = tool_id

                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=current_idx,
                            type="function",
                            id=tool_id,
                            function=DeltaFunctionCall(
                                name=tool_name).model_dump(
                                    exclude_none=True),  # type: ignore
                        )
                    ])
                    self.streaming_state["sent_tools"][current_idx][
                        "sent_name"] = True
                    self.current_tool_name_sent = (
                        True  # For backward compatibility
                    )

                    # Keep track of streamed args for backward compatibility
                    while len(self.streamed_args) <= current_idx:
                        self.streamed_args.append("")

                    return delta

            # Process arguments for the current tool
            if current_idx >= 0 and current_idx < tool_count:
                # Support both regular and empty argument objects
                # First, check for the empty arguments case: "arguments": {}
                empty_args_pattern = (
                    r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}')
                # NOTE(review): re.search returns the first empty-arguments
                # occurrence anywhere in search_text; nothing below ties that
                # match to the tool at current_idx - confirm this is intended.
                empty_args_match = re.search(empty_args_pattern, search_text)

                # Check if this tool has empty arguments
                if empty_args_match and empty_args_match.start() > 0:
                    # Find which tool this empty arguments belongs to
                    # NOTE(review): empty_args_tool_idx is assigned but never
                    # read; the loop only acts on the i == current_idx pass.
                    empty_args_tool_idx = 0
                    for i in range(tool_count):
                        if i == current_idx:
                            # If this is our current tool and it has empty arguments
                            if not self.streaming_state["sent_tools"][
                                    current_idx]["sent_arguments_prefix"]:
                                # Send empty object
                                self.streaming_state["sent_tools"][
                                    current_idx][
                                        "sent_arguments_prefix"] = True
                                self.streaming_state["sent_tools"][
                                    current_idx]["sent_arguments"] = "{}"

                                # Update streamed_args for backward compatibility
                                while len(self.streamed_args) <= current_idx:
                                    self.streamed_args.append("")
                                self.streamed_args[current_idx] += "{}"

                                delta = DeltaMessage(tool_calls=[
                                    DeltaToolCall(
                                        index=current_idx,
                                        function=DeltaFunctionCall(
                                            arguments="{}").
                                        model_dump(
                                            exclude_none=True),  # type: ignore
                                    )
                                ])

                                # Move to next tool if available
                                if current_idx < tool_count - 1:
                                    self.streaming_state[
                                        "current_tool_index"] += 1
                                    self.current_tool_id = self.streaming_state[
                                        "current_tool_index"]

                                return delta

                # Extract arguments for current tool using regex for non-empty arguments
                # The pattern only matches argument objects that are already
                # closed and that contain at most one level of nested braces.
                args_pattern = r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*(\{(?:[^{}]|(?:\{[^{}]*\}))*\})'
                args_matches = list(re.finditer(args_pattern, search_text))

                if current_idx < len(args_matches):
                    args_text = args_matches[current_idx].group(1)

                    # Handle transition between tools
                    # NOTE(review): is_last_tool is never read afterwards.
                    is_last_tool = current_idx == tool_count - 1

                    # For multiple tools, extract only the arguments for the current tool
                    if tool_count > 1:
                        # Parse the entire JSON structure to properly extract arguments for each tool
                        try:
                            parsed_tools = json.loads(search_text)
                            if isinstance(
                                    parsed_tools,
                                    list) and current_idx < len(parsed_tools):
                                current_tool = parsed_tools[current_idx]
                                if isinstance(current_tool.get("arguments"),
                                              dict):
                                    args_text = json.dumps(
                                        current_tool["arguments"])
                                else:
                                    args_text = str(
                                        current_tool.get("arguments", "{}"))
                        except (json.JSONDecodeError, KeyError, IndexError):
                            # Fallback to regex-based extraction
                            pass

                    # If arguments haven't been sent yet
                    sent_args = self.streaming_state["sent_tools"][
                        current_idx]["sent_arguments"]

                    # If we haven't sent the opening bracket yet
                    if not self.streaming_state["sent_tools"][current_idx][
                            "sent_arguments_prefix"] and args_text.startswith(
                                "{"):
                        self.streaming_state["sent_tools"][current_idx][
                            "sent_arguments_prefix"] = True
                        self.streaming_state["sent_tools"][current_idx][
                            "sent_arguments"] = "{"

                        # Update streamed_args for backward compatibility
                        while len(self.streamed_args) <= current_idx:
                            self.streamed_args.append("")
                        self.streamed_args[current_idx] += "{"

                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=current_idx,
                                function=DeltaFunctionCall(
                                    arguments="{").model_dump(
                                        exclude_none=True),  # type: ignore
                            )
                        ])
                        return delta

                    # If we need to send more arguments
                    # Only a pure extension of what was already sent can be
                    # streamed as a diff.
                    if args_text.startswith(sent_args):
                        # Calculate what part of arguments we need to send
                        args_diff = args_text[len(sent_args):]

                        if args_diff:
                            # Update our state
                            self.streaming_state["sent_tools"][current_idx][
                                "sent_arguments"] = args_text

                            # Update streamed_args for backward compatibility
                            while len(self.streamed_args) <= current_idx:
                                self.streamed_args.append("")
                            self.streamed_args[current_idx] += args_diff

                            delta = DeltaMessage(tool_calls=[
                                DeltaToolCall(
                                    index=current_idx,
                                    function=DeltaFunctionCall(
                                        arguments=args_diff).model_dump(
                                            exclude_none=True),  # type: ignore
                                )
                            ])
                            return delta

                    # If the tool's arguments are complete, check if we need to move to the next tool
                    if args_text.endswith("}") and args_text == sent_args:
                        # This tool is complete, move to the next one in the next iteration
                        if current_idx < tool_count - 1:
                            self.streaming_state["current_tool_index"] += 1
                            self.current_tool_id = self.streaming_state[
                                "current_tool_index"]  # For compatibility

            # If we got here, we couldn't determine what to stream next
            return None

        except Exception as e:
            logger.exception(f"Error in streaming tool calls: {e}")
            # If we encounter an error, just return the delta text as regular content
            return DeltaMessage(content=delta_text)

current_tool_id `instance-attribute` ¶

current_tool_id = -1

current_tool_name_sent `instance-attribute` ¶

current_tool_name_sent = False

current_tools_sent `instance-attribute` ¶

current_tools_sent: list[bool] = []

json_code_block_patterns `instance-attribute` ¶

json_code_block_patterns = [
    "```(?:json)?\\s*([\\s\\S]*?)```",
    "\\[TOOL_CALLS\\]([\\s\\S]*?)(?=\\n|$)",
    "<tool_call>([\\s\\S]*?)</tool_call>",
]

prev_tool_call_arr `instance-attribute` ¶

prev_tool_call_arr = []

prev_tool_calls `instance-attribute` ¶

prev_tool_calls: list[dict] = []

streamed_args `instance-attribute` ¶

streamed_args: list[str] = []

streaming_state `instance-attribute` ¶

streaming_state: dict[str, Any] = {
    "current_tool_index": -1,
    "tool_ids": [],
    "sent_tools": [],
}

thinking_tag_pattern `instance-attribute` ¶

thinking_tag_pattern = '</think>([\\s\\S]*)'

init ¶

__init__(tokenizer: AnyTokenizer)

Source code in vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py

def __init__(self, tokenizer: AnyTokenizer):
    """Create the parser and reset all per-stream bookkeeping attributes."""
    super().__init__(tokenizer)

    # Patterns applied while preprocessing raw model output
    self.thinking_tag_pattern = r"</think>([\s\S]*)"
    self.json_code_block_patterns = [
        r"```(?:json)?\s*([\s\S]*?)```",
        r"\[TOOL_CALLS\]([\s\S]*?)(?=\n|$)",
        r"<tool_call>([\s\S]*?)</tool_call>",
    ]

    # Streaming-mode bookkeeping
    self.current_tool_name_sent = False
    self.current_tool_id = -1
    self.prev_tool_calls: list[dict] = []
    # Arguments sent so far, one entry per tool
    self.streamed_args: list[str] = []
    # Per-stream progress; the streaming method fills these in as it goes
    self.streaming_state: dict[str, Any] = dict(
        current_tool_index=-1,
        tool_ids=[],
        sent_tools=[],
    )

    # Retained for backward compatibility with tests
    self.current_tools_sent: list[bool] = []

    # Retained for backward compatibility with serving code
    self.prev_tool_call_arr = []

extract_tool_calls ¶

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extract tool calls from a complete model output.

Source code in vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py

def extract_tool_calls(
        self, model_output: str,
        request: ChatCompletionRequest) -> ExtractedToolCallInformation:
    """
    Extract tool calls from a complete model output.

    The output is first split by ``preprocess_model_output`` into plain
    content and a candidate JSON payload. The payload must decode to a
    list; every element that is a dict with a string ``name`` and an
    ``arguments`` key becomes one ``ToolCall``. Other elements are skipped.

    Args:
        model_output: The full text generated by the model.
        request: The originating chat request (not read here).

    Returns:
        An ``ExtractedToolCallInformation``. ``tools_called`` is True only
        when at least one valid call was found. When no call is extracted,
        ``content`` falls back to the raw ``model_output`` if preprocessing
        produced no content, so the model's text is never silently dropped.
    """
    try:
        # Preprocess the model output
        content, potential_tool_calls = self.preprocess_model_output(
            model_output)

        if not potential_tool_calls:
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=content)

        # Parse the potential tool calls as JSON
        tool_calls_data = json.loads(potential_tool_calls)

        # Ensure it's an array
        if not isinstance(tool_calls_data, list):
            logger.debug("Tool calls data is not an array")
            return ExtractedToolCallInformation(
                tools_called=False,
                tool_calls=[],
                content=content or model_output,
            )

        tool_calls: list[ToolCall] = []

        for idx, call in enumerate(tool_calls_data):
            # A malformed entry is skipped on its own so that it cannot
            # invalidate the well-formed calls around it.
            if (not isinstance(call, dict)
                    or not isinstance(call.get("name"), str)
                    or "arguments" not in call):
                logger.debug("Invalid tool call format at index %d", idx)
                continue

            # The function-call payload carries arguments as a JSON string.
            # Strings are passed through untouched; anything else (dict,
            # list, number, null) is serialized instead of being handed
            # over raw.
            arguments = call["arguments"]
            if not isinstance(arguments, str):
                arguments = json.dumps(arguments)

            tool_call = ToolCall(
                id=f"call_{idx}_{random_uuid()}",
                type="function",
                function=FunctionCall(
                    name=call["name"],
                    arguments=arguments,
                ),
            )
            tool_calls.append(tool_call)

        if not tool_calls:
            # Same fallback as the non-array branch above: without any
            # usable tool call, keep the model's text as content.
            return ExtractedToolCallInformation(
                tools_called=False,
                tool_calls=[],
                content=content or model_output,
            )

        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=content,
        )

    except Exception as e:
        # Best-effort boundary: any failure degrades to plain content.
        logger.exception("Error extracting tool calls: %s", str(e))
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

extract_tool_calls_streaming ¶

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]

Extract tool calls for streaming mode.

Source code in vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py

def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
    """
    Extract tool calls for streaming mode.

    Each call re-scans the accumulated ``current_text`` and emits at most
    one delta, advancing the per-tool progress kept in
    ``self.streaming_state``. For a given tool the order of emission is:
    the tool name (with a freshly generated id), then the opening ``{`` of
    its arguments, then whatever part of the arguments has not been sent.

    Args:
        previous_text: Not read by this implementation.
        current_text: All text generated so far; the only text scanned.
        delta_text: The newest text chunk; only echoed back as content.
        previous_token_ids: Not read by this implementation.
        current_token_ids: Not read by this implementation.
        delta_token_ids: Not read by this implementation.
        request: Not read by this implementation.

    Returns:
        ``DeltaMessage(content=delta_text)`` when the text does not look
        like a tool-call block or when any exception is raised while
        parsing; a ``DeltaMessage`` carrying one ``DeltaToolCall`` when a
        name or argument fragment is ready; ``None`` when there is nothing
        to emit yet.
    """
    # First, check for a definitive start of a tool call block.
    # This prevents premature parsing of incomplete output.
    stripped_text = current_text.strip()
    # NOTE(review): preprocessed_content is never read below; only the
    # tool-call half of the tuple is used.
    preprocessed_content, preprocessed_tool_calls = (
        self.preprocess_model_output(current_text))

    # For JSON code blocks, we need to detect them earlier, even if incomplete
    has_potential_json_block = ("```json" in current_text
                                or "```\n[" in current_text
                                or "[TOOL_CALLS]" in current_text
                                or "<tool_call>" in current_text)

    is_tool_call_block = (
        stripped_text.startswith("[")
        or stripped_text.startswith("<tool_call>")
        or stripped_text.startswith("[TOOL_CALLS]") or
        # Check if we have thinking tags with JSON-like content following
        ("</think>[" in current_text) or
        # Check if the text contains a JSON array after preprocessing
        preprocessed_tool_calls is not None or
        # For JSON code blocks, detect early if we see enough structure
        (has_potential_json_block and '"name"' in current_text
         and '"arguments"' in current_text))

    # Anything that does not look like a tool call is passed through as
    # ordinary content.
    if not is_tool_call_block:
        return DeltaMessage(content=delta_text)

    try:
        # Initialize streaming state if not exists
        if not hasattr(self, "streaming_state"):
            self.streaming_state = {
                "current_tool_index": -1,
                "tool_ids": [],
                "sent_tools": [],  # Track complete state of each tool
            }

        # Try parsing as JSON to check for complete tool calls
        try:
            # Use preprocessed tool calls if available
            tool_calls_text = (preprocessed_tool_calls if
                               preprocessed_tool_calls else current_text)
            parsed_tools = json.loads(tool_calls_text)
            if isinstance(parsed_tools, list):
                # Update our tool array for next time
                self.prev_tool_call_arr = parsed_tools
        except json.JSONDecodeError:
            # Not complete JSON yet, use regex for partial parsing
            pass

        # Check for test-specific state setup (current_tools_sent)
        # This handles the case where tests manually set current_tools_sent
        if (hasattr(self, "current_tools_sent")  # type: ignore
                and len(self.current_tools_sent) > 0):
            # If current_tools_sent is set to [False], it means the test wants us to send the name
            if (len(self.current_tools_sent) == 1
                    and self.current_tools_sent[0] is False):
                # Extract the function name using regex
                # (searches the raw current_text, so this is always the
                # first "name" value in the output)
                name_pattern = r'"name"\s*:\s*"([^"]+)"'
                name_match = re.search(name_pattern, current_text)
                if name_match:
                    function_name = name_match.group(1)

                    # The test expects us to send just the name first
                    tool_id = make_tool_call_id()
                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=0,
                            type="function",
                            id=tool_id,
                            function=DeltaFunctionCall(
                                name=function_name).model_dump(
                                    exclude_none=True),  # type: ignore
                        )
                    ])
                    # Update state to reflect that we've sent the name
                    self.current_tools_sent = [True]
                    self.current_tool_id = 0
                    self.streaming_state["current_tool_index"] = 0
                    if len(self.streaming_state["sent_tools"]) == 0:
                        self.streaming_state["sent_tools"].append({
                            "sent_name":
                            True,
                            "sent_arguments_prefix":
                            False,
                            "sent_arguments":
                            "",
                        })
                    else:
                        self.streaming_state["sent_tools"][0][
                            "sent_name"] = True
                    self.current_tool_name_sent = True
                    return delta

        # Use regex to identify tool calls in the output
        # Use preprocessed tool calls text for better parsing, but also try to extract from incomplete JSON blocks
        search_text = (preprocessed_tool_calls
                       if preprocessed_tool_calls else current_text)

        # For JSON code blocks that aren't complete yet, try to extract the JSON content
        if not preprocessed_tool_calls and has_potential_json_block:
            # Try to extract the JSON array from within the code block
            # (the closing fence is optional here, unlike the patterns used
            # during preprocessing)
            json_match = re.search(r"```(?:json)?\s*([\s\S]*?)(?:```|$)",
                                   current_text)
            if json_match:
                potential_json = json_match.group(1).strip()
                # Use this as search text even if it's incomplete
                if potential_json.startswith("[") and (
                        '"name"' in potential_json
                        and '"arguments"' in potential_json):
                    search_text = potential_json

        # Try to find complete tool names first
        # One match per fully quoted "name" value; the match count is used
        # as the number of tools seen so far.
        name_pattern = r'"name"\s*:\s*"([^"]+)"'
        name_matches = list(re.finditer(name_pattern, search_text))
        tool_count = len(name_matches)

        # If no complete tool names found, check for partial tool names
        if tool_count == 0:
            # Check if we're in the middle of parsing a tool name
            partial_name_pattern = r'"name"\s*:\s*"([^"]*)'
            partial_matches = list(
                re.finditer(partial_name_pattern, search_text))
            # NOTE(review): both branches below return None, so the
            # partial-name check currently has no effect on the result.
            if partial_matches:
                # We have a partial tool name - not ready to emit yet
                return None
            else:
                # No tools found at all
                return None

        # Ensure our state arrays are large enough
        while len(self.streaming_state["sent_tools"]) < tool_count:
            self.streaming_state["sent_tools"].append({
                "sent_name":
                False,
                "sent_arguments_prefix":
                False,
                "sent_arguments":
                "",
            })

        while len(self.streaming_state["tool_ids"]) < tool_count:
            self.streaming_state["tool_ids"].append(None)

        # Determine if we need to move to a new tool
        current_idx = self.streaming_state["current_tool_index"]

        # If we haven't processed any tool yet or current tool is complete, move to next
        # NOTE(review): the condition itself only checks that a later tool
        # exists; it does not test whether the current tool's arguments
        # were fully sent - confirm this is intended.
        if current_idx == -1 or current_idx < tool_count - 1:
            next_idx = current_idx + 1

            # If tool at next_idx has not been sent yet
            if (next_idx < tool_count
                    and not self.streaming_state["sent_tools"][next_idx]
                ["sent_name"]):
                # Update indexes
                self.streaming_state["current_tool_index"] = next_idx
                self.current_tool_id = (
                    next_idx  # For backward compatibility
                )
                current_idx = next_idx

                # Extract the tool name
                tool_name = name_matches[current_idx].group(1)

                # Generate ID and send tool name
                tool_id = f"call_{current_idx}_{random_uuid()}"
                self.streaming_state["tool_ids"][current_idx] = tool_id

                delta = DeltaMessage(tool_calls=[
                    DeltaToolCall(
                        index=current_idx,
                        type="function",
                        id=tool_id,
                        function=DeltaFunctionCall(
                            name=tool_name).model_dump(
                                exclude_none=True),  # type: ignore
                    )
                ])
                self.streaming_state["sent_tools"][current_idx][
                    "sent_name"] = True
                self.current_tool_name_sent = (
                    True  # For backward compatibility
                )

                # Keep track of streamed args for backward compatibility
                while len(self.streamed_args) <= current_idx:
                    self.streamed_args.append("")

                return delta

        # Process arguments for the current tool
        if current_idx >= 0 and current_idx < tool_count:
            # Support both regular and empty argument objects
            # First, check for the empty arguments case: "arguments": {}
            empty_args_pattern = (
                r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}')
            empty_args_match = re.search(empty_args_pattern, search_text)

            # Check if this tool has empty arguments
            # NOTE(review): re.search returns the first empty-arguments
            # tool anywhere in search_text, and nothing below ties that
            # match to current_idx. It looks like a tool with non-empty
            # arguments could be sent "{}" when another tool in the same
            # output has empty arguments - confirm.
            if empty_args_match and empty_args_match.start() > 0:
                # Find which tool this empty arguments belongs to
                # NOTE(review): empty_args_tool_idx is never read, and the
                # loop only acts on the i == current_idx iteration.
                empty_args_tool_idx = 0
                for i in range(tool_count):
                    if i == current_idx:
                        # If this is our current tool and it has empty arguments
                        if not self.streaming_state["sent_tools"][
                                current_idx]["sent_arguments_prefix"]:
                            # Send empty object
                            self.streaming_state["sent_tools"][
                                current_idx][
                                    "sent_arguments_prefix"] = True
                            self.streaming_state["sent_tools"][
                                current_idx]["sent_arguments"] = "{}"

                            # Update streamed_args for backward compatibility
                            while len(self.streamed_args) <= current_idx:
                                self.streamed_args.append("")
                            self.streamed_args[current_idx] += "{}"

                            delta = DeltaMessage(tool_calls=[
                                DeltaToolCall(
                                    index=current_idx,
                                    function=DeltaFunctionCall(
                                        arguments="{}").
                                    model_dump(
                                        exclude_none=True),  # type: ignore
                                )
                            ])

                            # Move to next tool if available
                            if current_idx < tool_count - 1:
                                self.streaming_state[
                                    "current_tool_index"] += 1
                                self.current_tool_id = self.streaming_state[
                                    "current_tool_index"]

                            return delta

            # Extract arguments for current tool using regex for non-empty arguments
            # The pattern only matches an arguments object whose closing
            # brace is present, and tolerates a single level of nested
            # braces inside it.
            args_pattern = r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*(\{(?:[^{}]|(?:\{[^{}]*\}))*\})'
            args_matches = list(re.finditer(args_pattern, search_text))

            # NOTE(review): args_matches holds only tools the pattern could
            # match, so indexing it with current_idx presumably assumes
            # every earlier tool matched as well - verify.
            if current_idx < len(args_matches):
                args_text = args_matches[current_idx].group(1)

                # Handle transition between tools
                # NOTE(review): is_last_tool is assigned but never read.
                is_last_tool = current_idx == tool_count - 1

                # For multiple tools, extract only the arguments for the current tool
                if tool_count > 1:
                    # Parse the entire JSON structure to properly extract arguments for each tool
                    try:
                        parsed_tools = json.loads(search_text)
                        if isinstance(
                                parsed_tools,
                                list) and current_idx < len(parsed_tools):
                            current_tool = parsed_tools[current_idx]
                            if isinstance(current_tool.get("arguments"),
                                          dict):
                                # Re-serialized text may differ in spacing
                                # from the raw text the regex captured.
                                args_text = json.dumps(
                                    current_tool["arguments"])
                            else:
                                args_text = str(
                                    current_tool.get("arguments", "{}"))
                    except (json.JSONDecodeError, KeyError, IndexError):
                        # Fallback to regex-based extraction
                        pass

                # If arguments haven't been sent yet
                sent_args = self.streaming_state["sent_tools"][
                    current_idx]["sent_arguments"]

                # If we haven't sent the opening bracket yet
                if not self.streaming_state["sent_tools"][current_idx][
                        "sent_arguments_prefix"] and args_text.startswith(
                            "{"):
                    self.streaming_state["sent_tools"][current_idx][
                        "sent_arguments_prefix"] = True
                    self.streaming_state["sent_tools"][current_idx][
                        "sent_arguments"] = "{"

                    # Update streamed_args for backward compatibility
                    while len(self.streamed_args) <= current_idx:
                        self.streamed_args.append("")
                    self.streamed_args[current_idx] += "{"

                    delta = DeltaMessage(tool_calls=[
                        DeltaToolCall(
                            index=current_idx,
                            function=DeltaFunctionCall(
                                arguments="{").model_dump(
                                    exclude_none=True),  # type: ignore
                        )
                    ])
                    return delta

                # If we need to send more arguments
                # (only when what was already sent is a prefix of the
                # current arguments text; otherwise nothing is emitted)
                if args_text.startswith(sent_args):
                    # Calculate what part of arguments we need to send
                    args_diff = args_text[len(sent_args):]

                    if args_diff:
                        # Update our state
                        self.streaming_state["sent_tools"][current_idx][
                            "sent_arguments"] = args_text

                        # Update streamed_args for backward compatibility
                        while len(self.streamed_args) <= current_idx:
                            self.streamed_args.append("")
                        self.streamed_args[current_idx] += args_diff

                        delta = DeltaMessage(tool_calls=[
                            DeltaToolCall(
                                index=current_idx,
                                function=DeltaFunctionCall(
                                    arguments=args_diff).model_dump(
                                        exclude_none=True),  # type: ignore
                            )
                        ])
                        return delta

                # If the tool's arguments are complete, check if we need to move to the next tool
                if args_text.endswith("}") and args_text == sent_args:
                    # This tool is complete, move to the next one in the next iteration
                    if current_idx < tool_count - 1:
                        self.streaming_state["current_tool_index"] += 1
                        self.current_tool_id = self.streaming_state[
                            "current_tool_index"]  # For compatibility

        # If we got here, we couldn't determine what to stream next
        return None

    except Exception as e:
        logger.exception(f"Error in streaming tool calls: {e}")
        # If we encounter an error, just return the delta text as regular content
        return DeltaMessage(content=delta_text)

preprocess_model_output ¶

preprocess_model_output(
    model_output: str,
) -> tuple[Optional[str], Optional[str]]

Preprocess the model output to extract content and potential tool calls. Returns: Tuple of (content, potential_tool_calls_json)

Source code in vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py

def preprocess_model_output(
        self, model_output: str) -> tuple[Optional[str], Optional[str]]:
    """
    Split raw model output into plain content and a tool-call payload.

    Candidates are tried in order: the text after a ``</think>`` tag (as
    a whole, then any code blocks inside it), code blocks anywhere in the
    output, and finally the whole output when it starts with ``[``.

    Returns:
        Tuple of (content, potential_tool_calls_json). The second item is
        None when no tool-call payload was found; the first is None when
        the entire output is treated as the payload.
    """

    def _decodes(candidate: str) -> bool:
        # True when the candidate is syntactically valid JSON.
        try:
            json.loads(candidate)
        except json.JSONDecodeError:
            return False
        return True

    # 1. Text following a thinking tag
    think_hit = re.search(self.thinking_tag_pattern, model_output)
    if think_hit is not None:
        cut = think_hit.start() + len("</think>")
        leading = model_output[:cut].strip()
        trailing = think_hit.group(1).strip()

        if _decodes(trailing):
            return leading, trailing

        # Not JSON as a whole: look for JSON code blocks inside it
        for block_re in self.json_code_block_patterns:
            for candidate in re.findall(block_re, trailing):
                if _decodes(candidate):
                    return leading, candidate

    # 2. JSON code blocks anywhere in the output
    for block_re in self.json_code_block_patterns:
        for candidate in re.findall(block_re, model_output):
            if _decodes(candidate):
                # Content is whatever remains once the blocks are removed
                remainder = re.sub(block_re, "", model_output).strip()
                return remainder, candidate

    # 3. Output that is, or appears to be turning into, a JSON array
    if model_output.strip().startswith("["):
        if _decodes(model_output):
            return None, model_output
        # Incomplete JSON may still be a tool call in progress
        if all(marker in model_output
               for marker in ("{", "name", "arguments")):
            return None, model_output

    # Nothing resembling a tool call: everything is content
    return model_output, None

vllm.entrypoints.openai.tool_parsers ¶

__all__ module-attribute ¶

DeepSeekV31ToolParser ¶

current_tool_id instance-attribute ¶

current_tool_name_sent instance-attribute ¶

prev_tool_call_arr instance-attribute ¶

stream_tool_call_name_regex instance-attribute ¶

stream_tool_call_portion_regex instance-attribute ¶

streamed_args_for_tool instance-attribute ¶

tool_call_end_token instance-attribute ¶

tool_call_end_token_id instance-attribute ¶

tool_call_regex instance-attribute ¶

tool_call_start_token instance-attribute ¶

tool_call_start_token_id instance-attribute ¶

tool_calls_end_token instance-attribute ¶

tool_calls_end_token_id instance-attribute ¶

tool_calls_start_token instance-attribute ¶

tool_calls_start_token_id instance-attribute ¶

__init__ ¶

extract_tool_calls ¶

extract_tool_calls_streaming ¶

DeepSeekV3ToolParser ¶

current_tool_id instance-attribute ¶

current_tool_name_sent instance-attribute ¶

prev_tool_call_arr instance-attribute ¶

stream_tool_call_name_regex instance-attribute ¶

stream_tool_call_portion_regex instance-attribute ¶

streamed_args_for_tool instance-attribute ¶

tool_call_end_token instance-attribute ¶

tool_call_end_token_id instance-attribute ¶

tool_call_regex instance-attribute ¶

tool_call_start_token instance-attribute ¶

tool_call_start_token_id instance-attribute ¶

tool_calls_end_token instance-attribute ¶

tool_calls_end_token_id instance-attribute ¶

tool_calls_start_token instance-attribute ¶

tool_calls_start_token_id instance-attribute ¶

__init__ ¶

extract_tool_calls ¶

extract_tool_calls_streaming ¶

Glm4MoeModelToolParser ¶

_buffer instance-attribute ¶

current_tool_id instance-attribute ¶

current_tool_name_sent instance-attribute ¶

func_arg_regex instance-attribute ¶

func_call_regex instance-attribute ¶

func_detail_regex instance-attribute ¶

prev_tool_call_arr instance-attribute ¶

streamed_args_for_tool instance-attribute ¶

tool_call_end_token instance-attribute ¶

tool_call_end_token_id instance-attribute ¶

tool_call_start_token instance-attribute ¶

tool_call_start_token_id instance-attribute ¶

tool_calls_start_token instance-attribute ¶

__init__ ¶

extract_tool_calls ¶

extract_tool_calls_streaming ¶

Granite20bFCToolParser ¶

bot_token instance-attribute ¶

tool_call_regex instance-attribute ¶

tool_start_token instance-attribute ¶

__init__ ¶

extract_tool_calls ¶

extract_tool_calls_streaming ¶

GraniteToolParser ¶

bot_string instance-attribute ¶

bot_token instance-attribute ¶

__init__ ¶

extract_tool_calls ¶

extract_tool_calls_streaming ¶

Hermes2ProToolParser ¶

buffered_delta_text instance-attribute ¶

current_tool_id instance-attribute ¶

current_tool_name_sent instance-attribute ¶

model_tokenizer instance-attribute ¶

prev_tool_call_arr instance-attribute ¶

scratch_pad_regex instance-attribute ¶

streamed_args_for_tool instance-attribute ¶

tool_call_end_token instance-attribute ¶

tool_call_end_token_array instance-attribute ¶

__all__ `module-attribute` ¶

current_tool_id `instance-attribute` ¶

current_tool_name_sent `instance-attribute` ¶

prev_tool_call_arr `instance-attribute` ¶

stream_tool_call_name_regex `instance-attribute` ¶

stream_tool_call_portion_regex `instance-attribute` ¶

streamed_args_for_tool `instance-attribute` ¶

tool_call_end_token `instance-attribute` ¶

tool_call_end_token_id `instance-attribute` ¶

tool_call_regex `instance-attribute` ¶

tool_call_start_token `instance-attribute` ¶

tool_call_start_token_id `instance-attribute` ¶

tool_calls_end_token `instance-attribute` ¶

tool_calls_end_token_id `instance-attribute` ¶

tool_calls_start_token `instance-attribute` ¶

tool_calls_start_token_id `instance-attribute` ¶

__init__ ¶

current_tool_id `instance-attribute` ¶

current_tool_name_sent `instance-attribute` ¶

prev_tool_call_arr `instance-attribute` ¶

stream_tool_call_name_regex `instance-attribute` ¶

stream_tool_call_portion_regex `instance-attribute` ¶

streamed_args_for_tool `instance-attribute` ¶

tool_call_end_token `instance-attribute` ¶

tool_call_end_token_id `instance-attribute` ¶

tool_call_regex `instance-attribute` ¶

tool_call_start_token `instance-attribute` ¶

tool_call_start_token_id `instance-attribute` ¶

tool_calls_end_token `instance-attribute` ¶

tool_calls_end_token_id `instance-attribute` ¶

tool_calls_start_token `instance-attribute` ¶

tool_calls_start_token_id `instance-attribute` ¶

__init__ ¶

_buffer `instance-attribute` ¶

current_tool_id `instance-attribute` ¶

current_tool_name_sent `instance-attribute` ¶

func_arg_regex `instance-attribute` ¶

func_call_regex `instance-attribute` ¶

func_detail_regex `instance-attribute` ¶

prev_tool_call_arr `instance-attribute` ¶

streamed_args_for_tool `instance-attribute` ¶

tool_call_end_token `instance-attribute` ¶

tool_call_end_token_id `instance-attribute` ¶

tool_call_start_token `instance-attribute` ¶

tool_call_start_token_id `instance-attribute` ¶

tool_calls_start_token `instance-attribute` ¶

__init__ ¶

bot_token `instance-attribute` ¶

tool_call_regex `instance-attribute` ¶

tool_start_token `instance-attribute` ¶

__init__ ¶

bot_string `instance-attribute` ¶

bot_token `instance-attribute` ¶

__init__ ¶

buffered_delta_text `instance-attribute` ¶

current_tool_id `instance-attribute` ¶

current_tool_name_sent `instance-attribute` ¶

model_tokenizer `instance-attribute` ¶

prev_tool_call_arr `instance-attribute` ¶

scratch_pad_regex `instance-attribute` ¶

streamed_args_for_tool `instance-attribute` ¶

tool_call_end_token `instance-attribute` ¶

tool_call_end_token_array `instance-attribute` ¶

tool_call_end_token_ids `instance-attribute` ¶

tool_call_regex `instance-attribute` ¶

tool_call_start_token `instance-attribute` ¶

tool_call_start_token_array `instance-attribute` ¶

tool_call_start_token_ids `instance-attribute` ¶

__init__ ¶

answer_tool_calls_pattern `instance-attribute` ¶

bot_string `instance-attribute` ¶

current_tool_id `instance-attribute` ¶

current_tool_name_sent `instance-attribute` ¶

current_tools_sent `instance-attribute` ¶

prev_tool_call_arr `instance-attribute` ¶

prev_tool_calls `instance-attribute` ¶

streamed_args `instance-attribute` ¶

streaming_state `instance-attribute` ¶

tool_empty_arg_reg `instance-attribute` ¶

tool_name_reg `instance-attribute` ¶