vllm.model_executor.models.qwen3_vl_moe

Inference-only Qwen3-VL-MoE model compatible with HuggingFace weights.
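
For orientation, the sketch below shows one way to exercise this model through vLLM's offline LLM API. The checkpoint ID and prompt string are placeholders, not taken from this module; in practice the prompt should be built with the checkpoint's chat template (e.g. via the HF processor).

from vllm import LLM, SamplingParams
from PIL import Image

# Placeholder checkpoint ID; substitute a released Qwen3-VL-MoE checkpoint.
llm = LLM(model="Qwen/Qwen3-VL-MoE-placeholder", trust_remote_code=True)

image = Image.open("example.jpg")  # any local image
prompt = "<placeholder prompt containing the model's image tokens>"

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)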

logger module-attribute

logger = init_logger(__name__)

Qwen3MoeLLMForCausalLM

Bases: Qwen3MoeForCausalLM

Source code in vllm/model_executor/models/qwen3_vl_moe.py
class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super(Qwen3MoeForCausalLM, self).__init__()
        self.config = vllm_config.model_config.hf_config.text_config
        self.quant_config = vllm_config.quant_config
        self.model = Qwen3MoeLLMModel(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "model"))
        self.lm_head = ParallelLMHead(self.config.vocab_size,
                                      self.config.hidden_size,
                                      quant_config=self.quant_config)
        if self.config.tie_word_embeddings:
            self.lm_head.weight = self.model.embed_tokens.weight
        self.logits_processor = LogitsProcessor(self.config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

config instance-attribute

config = text_config

lm_head instance-attribute

lm_head = ParallelLMHead(
    vocab_size, hidden_size, quant_config=quant_config
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(vocab_size)

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors
)

model instance-attribute

model = Qwen3MoeLLMModel(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "model"),
)

quant_config instance-attribute

quant_config = quant_config

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/qwen3_vl_moe.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super(Qwen3MoeForCausalLM, self).__init__()
    self.config = vllm_config.model_config.hf_config.text_config
    self.quant_config = vllm_config.quant_config
    self.model = Qwen3MoeLLMModel(vllm_config=vllm_config,
                                  prefix=maybe_prefix(prefix, "model"))
    self.lm_head = ParallelLMHead(self.config.vocab_size,
                                  self.config.hidden_size,
                                  quant_config=self.quant_config)
    if self.config.tie_word_embeddings:
        self.lm_head.weight = self.model.embed_tokens.weight
    self.logits_processor = LogitsProcessor(self.config.vocab_size)
    self.make_empty_intermediate_tensors = (
        self.model.make_empty_intermediate_tensors)

Qwen3MoeLLMModel

Bases: Qwen3MoeModel

Source code in vllm/model_executor/models/qwen3_vl_moe.py
@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
        # otherwise (seq_len, ).
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
        # the same shape as input_embeds
        "deepstack_input_embeds": 0
    })
class Qwen3MoeLLMModel(Qwen3MoeModel):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        if not get_pp_group().is_first_rank:
            assert self.start_layer >= len(
                vllm_config.model_config.hf_config.vision_config.
                deepstack_visual_indexes), (
                    "start_layer should be greater than or equal to "
                    "len(deepstack_visual_indexes)")

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        deepstack_input_embeds: Optional[IntermediateTensors] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        for layer_idx, layer in enumerate(
                self.layers[self.start_layer:self.end_layer]):
            layer_idx = layer_idx + self.start_layer

            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
            )

            if deepstack_input_embeds is not None and \
                    layer_idx in range(0, len(deepstack_input_embeds)):
                hidden_states = hidden_states + deepstack_input_embeds[
                    f"deepstack_input_embeds_{layer_idx}"]

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual
            })
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states

    def load_fused_expert_weights(self, name: str, params_dict: dict,
                                  loaded_weight: torch.Tensor, shard_id: str,
                                  num_experts: int) -> bool:
        param = params_dict[name]
        weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
        loaded_local_expert = False
        for expert_id in range(num_experts):
            curr_expert_weight = loaded_weight[expert_id]
            success = weight_loader(param,
                                    curr_expert_weight,
                                    name,
                                    shard_id,
                                    expert_id,
                                    return_success=True)
            if success:
                loaded_local_expert = True

        return loaded_local_expert

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        # Skip loading extra parameters for GPTQ/modelopt models.
        ignore_suffixes = (".bias", "_bias", ".k_scale", "_k_scale",
                           ".v_scale", "_v_scale", ".weight_scale",
                           "_weight_scale", ".input_scale", "_input_scale")
        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        expert_params_mapping = self.get_expert_mapping()
        is_fused_expert = False
        fused_expert_params_mapping = [
            ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
            ("experts.w2_weight", "experts.down_proj", 0, "w2"),
        ]
        num_experts = self.config.num_experts
        for name, loaded_weight in weights:
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if ("experts.gate_up_proj" in name
                        or "experts.down_proj" in name):
                    is_fused_expert = True
                    expert_params_mapping = fused_expert_params_mapping

                # Skip non-stacked layers and experts (experts handled below).
                if weight_name not in name:
                    continue
                # We have mlp.experts[0].gate_proj in the checkpoint.
                # Since we handle the experts below in expert_params_mapping,
                # we need to skip here BEFORE we update the name, otherwise
                # name will be updated to mlp.experts[0].gate_up_proj, which
                # will then be updated below in expert_params_mapping
                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
                if "mlp.experts" in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra parameters for GPTQ/modelopt models.
                if name.endswith(ignore_suffixes) and name not in params_dict:
                    continue
                # Skip layers on other devices.
                if is_pp_missing_parameter(name, self):
                    continue
                if name.endswith("scale"):
                    # Remapping the name of FP8 kv-scale.
                    name = maybe_remap_kv_scale_name(name, params_dict)
                    if name is None:
                        continue
                if name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                if weight_loader == default_weight_loader:
                    weight_loader(param, loaded_weight)
                else:
                    weight_loader(param, loaded_weight, shard_id)
                break
            else:
                is_expert_weight = False
                for mapping in expert_params_mapping:
                    param_name, weight_name, expert_id, shard_id = mapping
                    if weight_name not in name:
                        continue
                    # Anyway, this is an expert weight and should not be
                    # attempted to load as other weights later
                    is_expert_weight = True
                    name_mapped = name.replace(weight_name, param_name)
                    if is_pp_missing_parameter(name_mapped, self):
                        continue
                    if is_fused_expert:
                        loaded_weight = loaded_weight.transpose(-1,
                                                                -2)  # no bias
                        if "experts.gate_up_proj" in name:
                            loaded_weight = loaded_weight.chunk(2, dim=-2)
                            success_w1 = self.load_fused_expert_weights(
                                name_mapped, params_dict, loaded_weight[0],
                                "w1", num_experts)
                            success_w3 = self.load_fused_expert_weights(
                                name_mapped, params_dict, loaded_weight[1],
                                "w3", num_experts)
                            success = success_w1 and success_w3
                        else:
                            # down_proj
                            success = self.load_fused_expert_weights(
                                name_mapped, params_dict, loaded_weight,
                                shard_id, num_experts)
                    else:
                        # Skip loading extra parameters for GPTQ/modelopt models
                        if name_mapped.endswith(
                                ignore_suffixes
                        ) and name_mapped not in params_dict:
                            continue
                        param = params_dict[name_mapped]
                        # We should ask the weight loader to return success or
                        # not here since otherwise we may skip experts with
                        # other available replicas.
                        weight_loader = typing.cast(Callable[..., bool],
                                                    param.weight_loader)
                        success = weight_loader(param,
                                                loaded_weight,
                                                name_mapped,
                                                shard_id=shard_id,
                                                expert_id=expert_id,
                                                return_success=True)
                    if success:
                        name = name_mapped
                        break
                else:
                    if is_expert_weight:
                        # We've checked that this is an expert weight
                        # However it's not mapped locally to this rank
                        # So we simply skip it
                        continue
                    # Skip loading extra parameters for GPTQ/modelopt models.
                    if name.endswith(
                            ignore_suffixes) and name not in params_dict:
                        continue
                    # Skip layers on other devices.
                    if is_pp_missing_parameter(name, self):
                        continue
                    # Remapping the name of FP8 kv-scale.
                    if name.endswith("kv_scale"):
                        remapped_kv_scale_name = name.replace(
                            ".kv_scale", ".attn.kv_scale")
                        if remapped_kv_scale_name not in params_dict:
                            logger.warning_once(
                                "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.",  # noqa: E501
                                name,
                                remapped_kv_scale_name,
                            )
                            continue
                        else:
                            name = remapped_kv_scale_name
                    param = params_dict[name]
                    weight_loader = getattr(param, "weight_loader",
                                            default_weight_loader)
                    weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/qwen3_vl_moe.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__(vllm_config=vllm_config, prefix=prefix)
    if not get_pp_group().is_first_rank:
        assert self.start_layer >= len(
            vllm_config.model_config.hf_config.vision_config.
            deepstack_visual_indexes), (
                "start_layer should be greater than or equal to "
                "len(deepstack_visual_indexes)")

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
    deepstack_input_embeds: Optional[
        IntermediateTensors
    ] = None,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/qwen3_vl_moe.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    deepstack_input_embeds: Optional[IntermediateTensors] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
    if get_pp_group().is_first_rank:
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.get_input_embeddings(input_ids)
        residual = None
    else:
        assert intermediate_tensors is not None
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]
    for layer_idx, layer in enumerate(
            self.layers[self.start_layer:self.end_layer]):
        layer_idx = layer_idx + self.start_layer

        hidden_states, residual = layer(
            positions,
            hidden_states,
            residual,
        )

        if deepstack_input_embeds is not None and \
                layer_idx in range(0, len(deepstack_input_embeds)):
            hidden_states = hidden_states + deepstack_input_embeds[
                f"deepstack_input_embeds_{layer_idx}"]

    if not get_pp_group().is_last_rank:
        return IntermediateTensors({
            "hidden_states": hidden_states,
            "residual": residual
        })
    hidden_states, _ = self.norm(hidden_states, residual)
    return hidden_states
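
For reference, the snippet below sketches the container format that forward expects for deepstack_input_embeds: an IntermediateTensors keyed as deepstack_input_embeds_{i}, one entry per deepstack level, each entry shaped like inputs_embeds. The sizes are illustrative only.

import torch
from vllm.sequence import IntermediateTensors

num_levels, num_tokens, hidden_size = 3, 16, 2048  # illustrative sizes
deepstack_input_embeds = IntermediateTensors({
    f"deepstack_input_embeds_{i}": torch.zeros(num_tokens, hidden_size)
    for i in range(num_levels)
})
# forward() adds entry i to the hidden states produced by decoder layer i
# for layer indices 0 .. num_levels - 1; later layers are unaffected.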

load_fused_expert_weights

load_fused_expert_weights(
    name: str,
    params_dict: dict,
    loaded_weight: Tensor,
    shard_id: str,
    num_experts: int,
) -> bool
Source code in vllm/model_executor/models/qwen3_vl_moe.py
def load_fused_expert_weights(self, name: str, params_dict: dict,
                              loaded_weight: torch.Tensor, shard_id: str,
                              num_experts: int) -> bool:
    param = params_dict[name]
    weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
    loaded_local_expert = False
    for expert_id in range(num_experts):
        curr_expert_weight = loaded_weight[expert_id]
        success = weight_loader(param,
                                curr_expert_weight,
                                name,
                                shard_id,
                                expert_id,
                                return_success=True)
        if success:
            loaded_local_expert = True

    return loaded_local_expert
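
The return_success=True contract matters under expert parallelism: each rank owns only a slice of the experts, the loader reports per expert whether the shard was placed locally, and this helper returns True as soon as any local expert was loaded. A toy stand-in (not a vLLM API) illustrating that contract:

import torch

def toy_expert_weight_loader(param, weight, name, shard_id, expert_id,
                             return_success=False, local_expert_ids=(0, 1)):
    # Pretend this rank owns experts 0 and 1 out of a larger global set.
    success = expert_id in local_expert_ids
    if success:
        param.data[expert_id].copy_(weight)  # place into the local expert slot
    return success if return_success else None

param = torch.nn.Parameter(torch.zeros(2, 4, 8), requires_grad=False)
loaded_any = False
for expert_id in range(8):  # 8 global experts; only 0 and 1 live on this rank
    if toy_expert_weight_loader(param, torch.ones(4, 8), "experts.w2_weight",
                                "w2", expert_id, return_success=True):
        loaded_any = True
print(loaded_any)  # True: at least one expert shard was placed on this rank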

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/qwen3_vl_moe.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    # Skip loading extra parameters for GPTQ/modelopt models.
    ignore_suffixes = (".bias", "_bias", ".k_scale", "_k_scale",
                       ".v_scale", "_v_scale", ".weight_scale",
                       "_weight_scale", ".input_scale", "_input_scale")
    params_dict = dict(self.named_parameters())
    loaded_params: set[str] = set()
    expert_params_mapping = self.get_expert_mapping()
    is_fused_expert = False
    fused_expert_params_mapping = [
        ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
        ("experts.w2_weight", "experts.down_proj", 0, "w2"),
    ]
    num_experts = self.config.num_experts
    for name, loaded_weight in weights:
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if ("experts.gate_up_proj" in name
                    or "experts.down_proj" in name):
                is_fused_expert = True
                expert_params_mapping = fused_expert_params_mapping

            # Skip non-stacked layers and experts (experts handled below).
            if weight_name not in name:
                continue
            # We have mlp.experts[0].gate_proj in the checkpoint.
            # Since we handle the experts below in expert_params_mapping,
            # we need to skip here BEFORE we update the name, otherwise
            # name will be updated to mlp.experts[0].gate_up_proj, which
            # will then be updated below in expert_params_mapping
            # for mlp.experts[0].gate_gate_up_proj, which breaks load.
            if "mlp.experts" in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra parameters for GPTQ/modelopt models.
            if name.endswith(ignore_suffixes) and name not in params_dict:
                continue
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
            if name.endswith("scale"):
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
            if name not in params_dict:
                continue
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            if weight_loader == default_weight_loader:
                weight_loader(param, loaded_weight)
            else:
                weight_loader(param, loaded_weight, shard_id)
            break
        else:
            is_expert_weight = False
            for mapping in expert_params_mapping:
                param_name, weight_name, expert_id, shard_id = mapping
                if weight_name not in name:
                    continue
                # Anyway, this is an expert weight and should not be
                # attempted to load as other weights later
                is_expert_weight = True
                name_mapped = name.replace(weight_name, param_name)
                if is_pp_missing_parameter(name_mapped, self):
                    continue
                if is_fused_expert:
                    loaded_weight = loaded_weight.transpose(-1,
                                                            -2)  # no bias
                    if "experts.gate_up_proj" in name:
                        loaded_weight = loaded_weight.chunk(2, dim=-2)
                        success_w1 = self.load_fused_expert_weights(
                            name_mapped, params_dict, loaded_weight[0],
                            "w1", num_experts)
                        success_w3 = self.load_fused_expert_weights(
                            name_mapped, params_dict, loaded_weight[1],
                            "w3", num_experts)
                        success = success_w1 and success_w3
                    else:
                        # down_proj
                        success = self.load_fused_expert_weights(
                            name_mapped, params_dict, loaded_weight,
                            shard_id, num_experts)
                else:
                    # Skip loading extra parameters for GPTQ/modelopt models
                    if name_mapped.endswith(
                            ignore_suffixes
                    ) and name_mapped not in params_dict:
                        continue
                    param = params_dict[name_mapped]
                    # We should ask the weight loader to return success or
                    # not here since otherwise we may skip experts with
                    # other available replicas.
                    weight_loader = typing.cast(Callable[..., bool],
                                                param.weight_loader)
                    success = weight_loader(param,
                                            loaded_weight,
                                            name_mapped,
                                            shard_id=shard_id,
                                            expert_id=expert_id,
                                            return_success=True)
                if success:
                    name = name_mapped
                    break
            else:
                if is_expert_weight:
                    # We've checked that this is an expert weight
                    # However it's not mapped locally to this rank
                    # So we simply skip it
                    continue
                # Skip loading extra parameters for GPTQ/modelopt models.
                if name.endswith(
                        ignore_suffixes) and name not in params_dict:
                    continue
                # Skip layers on other devices.
                if is_pp_missing_parameter(name, self):
                    continue
                # Remapping the name of FP8 kv-scale.
                if name.endswith("kv_scale"):
                    remapped_kv_scale_name = name.replace(
                        ".kv_scale", ".attn.kv_scale")
                    if remapped_kv_scale_name not in params_dict:
                        logger.warning_once(
                            "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.",  # noqa: E501
                            name,
                            remapped_kv_scale_name,
                        )
                        continue
                    else:
                        name = remapped_kv_scale_name
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
        loaded_params.add(name)
    return loaded_params
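
A toy example (checkpoint layout assumed, matching the transpose and chunk above) of how a fused experts.gate_up_proj tensor is reshaped into the w1 (gate) and w3 (up) halves that experts.w13_weight expects:

import torch

# Assumed checkpoint layout: (num_experts, hidden_size, 2 * intermediate_size).
num_experts, hidden_size, intermediate_size = 4, 8, 16
gate_up = torch.randn(num_experts, hidden_size, 2 * intermediate_size)

fused = gate_up.transpose(-1, -2)  # (num_experts, 2 * intermediate, hidden)
w1, w3 = fused.chunk(2, dim=-2)    # gate and up halves, each (4, 16, 8)
print(w1.shape, w3.shape)
# Each half is then handed, expert by expert, to the weight loader via
# load_fused_expert_weights, targeting experts.w13_weight.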

Qwen3VLMoeForConditionalGeneration

Bases: Qwen3VLForConditionalGeneration

Source code in vllm/model_executor/models/qwen3_vl_moe.py
@MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor,
                                        info=Qwen3VLMoeProcessingInfo,
                                        dummy_inputs=Qwen3VLDummyInputsBuilder)
class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super(Qwen3VLForConditionalGeneration, self).__init__()
        config: Qwen3VLMoeConfig = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config

        self.config = config
        self.multimodal_config = multimodal_config
        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"

        self.visual = Qwen3_VisionTransformer(
            config.vision_config,
            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
            quant_config=quant_config,
            prefix=maybe_prefix(prefix, "visual"),
            use_data_parallel=self.use_data_parallel,
        )

        self.language_model = Qwen3MoeLLMForCausalLM(vllm_config=vllm_config,
                                                     prefix=maybe_prefix(
                                                         prefix,
                                                         "language_model"))

        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)

        self.use_deepstack = hasattr(config.vision_config,
                                     'deepstack_visual_indexes')
        self.deepstack_num_level = len(
            config.vision_config.deepstack_visual_indexes
        ) if self.use_deepstack else 0
        # register buffer for deepstack
        self.deepstack_input_embeds = [
            torch.zeros(vllm_config.scheduler_config.max_num_batched_tokens,
                        config.text_config.hidden_size)
            for _ in range(self.deepstack_num_level)
        ] if self.use_deepstack else None
        self.visual_dim = config.vision_config.out_hidden_size
        self.multiscale_dim = self.visual_dim * self.deepstack_num_level

config instance-attribute

config = config

deepstack_input_embeds instance-attribute

deepstack_input_embeds = (
    [
        (zeros(max_num_batched_tokens, hidden_size))
        for _ in (range(deepstack_num_level))
    ]
    if use_deepstack
    else None
)

deepstack_num_level instance-attribute

deepstack_num_level = (
    len(deepstack_visual_indexes) if use_deepstack else 0
)

language_model instance-attribute

language_model = Qwen3MoeLLMForCausalLM(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "language_model"),
)

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors
)

multimodal_config instance-attribute

multimodal_config = multimodal_config

multiscale_dim instance-attribute

multiscale_dim = visual_dim * deepstack_num_level

use_data_parallel instance-attribute

use_data_parallel = mm_encoder_tp_mode == 'data'

use_deepstack instance-attribute

use_deepstack = hasattr(
    vision_config, "deepstack_visual_indexes"
)

visual instance-attribute

visual = Qwen3_VisionTransformer(
    vision_config,
    norm_eps=getattr(config, "rms_norm_eps", 1e-06),
    quant_config=quant_config,
    prefix=maybe_prefix(prefix, "visual"),
    use_data_parallel=use_data_parallel,
)

visual_dim instance-attribute

visual_dim = out_hidden_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/qwen3_vl_moe.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super(Qwen3VLForConditionalGeneration, self).__init__()
    config: Qwen3VLMoeConfig = vllm_config.model_config.hf_config
    quant_config = vllm_config.quant_config
    multimodal_config = vllm_config.model_config.multimodal_config

    self.config = config
    self.multimodal_config = multimodal_config
    self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"

    self.visual = Qwen3_VisionTransformer(
        config.vision_config,
        norm_eps=getattr(config, "rms_norm_eps", 1e-6),
        quant_config=quant_config,
        prefix=maybe_prefix(prefix, "visual"),
        use_data_parallel=self.use_data_parallel,
    )

    self.language_model = Qwen3MoeLLMForCausalLM(vllm_config=vllm_config,
                                                 prefix=maybe_prefix(
                                                     prefix,
                                                     "language_model"))

    self.make_empty_intermediate_tensors = (
        self.language_model.make_empty_intermediate_tensors)

    self.use_deepstack = hasattr(config.vision_config,
                                 'deepstack_visual_indexes')
    self.deepstack_num_level = len(
        config.vision_config.deepstack_visual_indexes
    ) if self.use_deepstack else 0
    # register buffer for deepstack
    self.deepstack_input_embeds = [
        torch.zeros(vllm_config.scheduler_config.max_num_batched_tokens,
                    config.text_config.hidden_size)
        for _ in range(self.deepstack_num_level)
    ] if self.use_deepstack else None
    self.visual_dim = config.vision_config.out_hidden_size
    self.multiscale_dim = self.visual_dim * self.deepstack_num_level
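
A hedged sketch of what visual_dim and multiscale_dim suggest about the feature layout: the vision tower is assumed to concatenate the main visual embedding with one extra visual_dim-wide slice per deepstack level along the last dimension, which is then split apart downstream. Sizes and the split itself are illustrative, not lifted from this class.

import torch

visual_dim, deepstack_num_level, num_tokens = 2048, 3, 16  # illustrative
multiscale_dim = visual_dim * deepstack_num_level

# Assumed layout: [main | level 0 | level 1 | level 2] along the feature dim.
fused_visual = torch.randn(num_tokens, visual_dim + multiscale_dim)
main_embeds, deepstack_part = torch.split(
    fused_visual, [visual_dim, multiscale_dim], dim=-1)
per_level = deepstack_part.view(num_tokens, deepstack_num_level, visual_dim)
print(main_embeds.shape, per_level.shape)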

Qwen3VLMoeProcessingInfo

Bases: Qwen3VLProcessingInfo

Source code in vllm/model_executor/models/qwen3_vl_moe.py
class Qwen3VLMoeProcessingInfo(Qwen3VLProcessingInfo):

    def get_hf_config(self):
        return self.ctx.get_hf_config(Qwen3VLMoeConfig)

get_hf_config

get_hf_config()
Source code in vllm/model_executor/models/qwen3_vl_moe.py
def get_hf_config(self):
    return self.ctx.get_hf_config(Qwen3VLMoeConfig)