
vllm.transformers_utils.configs

Model configs may be defined in this directory for the following reasons:

  • There is no configuration file defined on the HF Hub or in the Transformers library.
  • There is a need to override the existing config to support vLLM.

Modules:

  • arctic: Arctic model configuration
  • chatglm
  • deepseek_v3
  • deepseek_vl2
  • dotsocr
  • eagle
  • falcon: Falcon configuration
  • jais: JAIS configuration
  • kimi_vl
  • medusa
  • midashenglm
  • mistral
  • mlp_speculator
  • moonvit
  • nemotron: Nemotron model configuration
  • nemotron_h: NemotronH model configuration
  • nemotron_vl
  • olmo3
  • ovis
  • qwen3_next: Qwen3-Next model configuration
  • radio: Radio vision model configuration
  • speculators
  • step3_vl
  • ultravox

__all__ module-attribute

__all__ = [
    "ChatGLMConfig",
    "DeepseekVLV2Config",
    "DeepseekV3Config",
    "DotsOCRConfig",
    "EAGLEConfig",
    "RWConfig",
    "JAISConfig",
    "MedusaConfig",
    "MiDashengLMConfig",
    "MLPSpeculatorConfig",
    "MoonViTConfig",
    "KimiVLConfig",
    "NemotronConfig",
    "NemotronHConfig",
    "Nemotron_Nano_VL_Config",
    "Olmo3Config",
    "OvisConfig",
    "RadioConfig",
    "SpeculatorsConfig",
    "UltravoxConfig",
    "Step3VLConfig",
    "Step3VisionEncoderConfig",
    "Step3TextConfig",
    "Qwen3NextConfig",
]
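
The names in __all__ can be imported directly from vllm.transformers_utils.configs. A minimal sketch, assuming only that vLLM is installed; no model weights are loaded:

from vllm.transformers_utils.configs import ChatGLMConfig, JAISConfig

# Config objects are plain PretrainedConfig subclasses.
chatglm_cfg = ChatGLMConfig()           # library defaults
jais_cfg = JAISConfig(n_layer=24)       # override a single field
print(chatglm_cfg.model_type, jais_cfg.model_type)  # -> chatglm jais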

ChatGLMConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/chatglm.py
class ChatGLMConfig(PretrainedConfig):
    model_type = "chatglm"
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        # It is to be compatible with long lora.
        self.max_position_embeddings = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)

add_bias_linear instance-attribute

add_bias_linear = add_bias_linear

add_qkv_bias instance-attribute

add_qkv_bias = add_qkv_bias

apply_query_key_layer_scaling instance-attribute

apply_query_key_layer_scaling = (
    apply_query_key_layer_scaling
)

apply_residual_connection_post_layernorm instance-attribute

apply_residual_connection_post_layernorm = (
    apply_residual_connection_post_layernorm
)

attention_dropout instance-attribute

attention_dropout = attention_dropout

attention_softmax_in_fp32 instance-attribute

attention_softmax_in_fp32 = attention_softmax_in_fp32

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_hidden_layers": "num_layers",
    "n_head_kv": "multi_query_group_num",
}

bias_dropout_fusion instance-attribute

bias_dropout_fusion = bias_dropout_fusion

ffn_hidden_size instance-attribute

ffn_hidden_size = ffn_hidden_size

fp32_residual_connection instance-attribute

fp32_residual_connection = fp32_residual_connection

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size

interleaved_qkv instance-attribute

interleaved_qkv = interleaved_qkv

kv_channels instance-attribute

kv_channels = kv_channels

layernorm_epsilon instance-attribute

layernorm_epsilon = layernorm_epsilon

max_position_embeddings instance-attribute

max_position_embeddings = seq_length

model_type class-attribute instance-attribute

model_type = 'chatglm'

multi_query_attention instance-attribute

multi_query_attention = multi_query_attention

multi_query_group_num instance-attribute

multi_query_group_num = multi_query_group_num

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_layers instance-attribute

num_layers = num_layers

padded_vocab_size instance-attribute

padded_vocab_size = padded_vocab_size

post_layer_norm instance-attribute

post_layer_norm = post_layer_norm

pre_seq_len instance-attribute

pre_seq_len = pre_seq_len

prefix_projection instance-attribute

prefix_projection = prefix_projection

quantization_bit instance-attribute

quantization_bit = quantization_bit

rmsnorm instance-attribute

rmsnorm = rmsnorm

seq_length instance-attribute

seq_length = seq_length

vocab_size instance-attribute

vocab_size = padded_vocab_size

__init__

__init__(
    num_layers=28,
    padded_vocab_size=65024,
    hidden_size=4096,
    ffn_hidden_size=13696,
    kv_channels=128,
    num_attention_heads=32,
    seq_length=2048,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    layernorm_epsilon=1e-05,
    rmsnorm=True,
    apply_residual_connection_post_layernorm=False,
    post_layer_norm=True,
    add_bias_linear=False,
    add_qkv_bias=False,
    interleaved_qkv=False,
    bias_dropout_fusion=True,
    multi_query_attention=False,
    multi_query_group_num=1,
    apply_query_key_layer_scaling=True,
    attention_softmax_in_fp32=True,
    fp32_residual_connection=False,
    quantization_bit=0,
    pre_seq_len=None,
    prefix_projection=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/chatglm.py
def __init__(self,
             num_layers=28,
             padded_vocab_size=65024,
             hidden_size=4096,
             ffn_hidden_size=13696,
             kv_channels=128,
             num_attention_heads=32,
             seq_length=2048,
             hidden_dropout=0.0,
             attention_dropout=0.0,
             layernorm_epsilon=1e-5,
             rmsnorm=True,
             apply_residual_connection_post_layernorm=False,
             post_layer_norm=True,
             add_bias_linear=False,
             add_qkv_bias=False,
             interleaved_qkv=False,
             bias_dropout_fusion=True,
             multi_query_attention=False,
             multi_query_group_num=1,
             apply_query_key_layer_scaling=True,
             attention_softmax_in_fp32=True,
             fp32_residual_connection=False,
             quantization_bit=0,
             pre_seq_len=None,
             prefix_projection=False,
             **kwargs):
    self.num_layers = num_layers
    self.vocab_size = padded_vocab_size
    self.padded_vocab_size = padded_vocab_size
    self.hidden_size = hidden_size
    self.ffn_hidden_size = ffn_hidden_size
    self.kv_channels = kv_channels
    self.num_attention_heads = num_attention_heads
    self.seq_length = seq_length
    # It is to be compatible with long lora.
    self.max_position_embeddings = seq_length
    self.hidden_dropout = hidden_dropout
    self.attention_dropout = attention_dropout
    self.layernorm_epsilon = layernorm_epsilon
    self.rmsnorm = rmsnorm
    self.apply_residual_connection_post_layernorm = (
        apply_residual_connection_post_layernorm)
    self.post_layer_norm = post_layer_norm
    self.add_bias_linear = add_bias_linear
    self.add_qkv_bias = add_qkv_bias
    self.bias_dropout_fusion = bias_dropout_fusion
    self.multi_query_attention = multi_query_attention
    self.multi_query_group_num = multi_query_group_num
    self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
    self.attention_softmax_in_fp32 = attention_softmax_in_fp32
    self.fp32_residual_connection = fp32_residual_connection
    self.quantization_bit = quantization_bit
    self.pre_seq_len = pre_seq_len
    self.prefix_projection = prefix_projection
    self.interleaved_qkv = interleaved_qkv
    super().__init__(**kwargs)
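
Because attribute_map aliases the standard Transformers names onto ChatGLM's native fields, generic code can read num_hidden_layers even though the constructor stores num_layers. A small sketch of that behavior, using hypothetical override values:

from vllm.transformers_utils.configs import ChatGLMConfig

cfg = ChatGLMConfig(num_layers=40, multi_query_group_num=2)
# attribute_map redirects the HF-standard names to ChatGLM's own fields.
assert cfg.num_hidden_layers == 40      # alias of num_layers
assert cfg.n_head_kv == 2               # alias of multi_query_group_num
assert cfg.max_position_embeddings == cfg.seq_length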

DeepseekV3Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/deepseek_v3.py
class DeepseekV3Config(PretrainedConfig):

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='noaux_tc',
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func='sigmoid',
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

ep_size instance-attribute

ep_size = ep_size

first_k_dense_replace instance-attribute

first_k_dense_replace = first_k_dense_replace

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

kv_lora_rank instance-attribute

kv_lora_rank = kv_lora_rank

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

model_type class-attribute instance-attribute

model_type = 'deepseek_v3'

moe_intermediate_size instance-attribute

moe_intermediate_size = moe_intermediate_size

moe_layer_freq instance-attribute

moe_layer_freq = moe_layer_freq

n_group instance-attribute

n_group = n_group

n_routed_experts instance-attribute

n_routed_experts = n_routed_experts

n_shared_experts instance-attribute

n_shared_experts = n_shared_experts

norm_topk_prob instance-attribute

norm_topk_prob = norm_topk_prob

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_experts_per_tok instance-attribute

num_experts_per_tok = num_experts_per_tok

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

num_nextn_predict_layers instance-attribute

num_nextn_predict_layers = num_nextn_predict_layers

q_lora_rank instance-attribute

q_lora_rank = q_lora_rank

qk_nope_head_dim instance-attribute

qk_nope_head_dim = qk_nope_head_dim

qk_rope_head_dim instance-attribute

qk_rope_head_dim = qk_rope_head_dim

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

routed_scaling_factor instance-attribute

routed_scaling_factor = routed_scaling_factor

scoring_func instance-attribute

scoring_func = scoring_func

topk_group instance-attribute

topk_group = topk_group

topk_method instance-attribute

topk_method = topk_method

use_cache instance-attribute

use_cache = use_cache

v_head_dim instance-attribute

v_head_dim = v_head_dim

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=129280,
    hidden_size=7168,
    intermediate_size=18432,
    moe_intermediate_size=2048,
    num_hidden_layers=61,
    num_nextn_predict_layers=1,
    num_attention_heads=128,
    num_key_value_heads=128,
    n_shared_experts=1,
    n_routed_experts=256,
    ep_size=1,
    routed_scaling_factor=2.5,
    kv_lora_rank=512,
    q_lora_rank=1536,
    qk_rope_head_dim=64,
    v_head_dim=128,
    qk_nope_head_dim=128,
    topk_method="noaux_tc",
    n_group=8,
    topk_group=4,
    num_experts_per_tok=8,
    moe_layer_freq=1,
    first_k_dense_replace=3,
    norm_topk_prob=True,
    scoring_func="sigmoid",
    hidden_act="silu",
    max_position_embeddings=4096,
    initializer_range=0.02,
    rms_norm_eps=1e-06,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=0,
    eos_token_id=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/deepseek_v3.py
def __init__(
    self,
    vocab_size=129280,
    hidden_size=7168,
    intermediate_size=18432,
    moe_intermediate_size=2048,
    num_hidden_layers=61,
    num_nextn_predict_layers=1,
    num_attention_heads=128,
    num_key_value_heads=128,
    n_shared_experts=1,
    n_routed_experts=256,
    ep_size=1,
    routed_scaling_factor=2.5,
    kv_lora_rank=512,
    q_lora_rank=1536,
    qk_rope_head_dim=64,
    v_head_dim=128,
    qk_nope_head_dim=128,
    topk_method='noaux_tc',
    n_group=8,
    topk_group=4,
    num_experts_per_tok=8,
    moe_layer_freq=1,
    first_k_dense_replace=3,
    norm_topk_prob=True,
    scoring_func='sigmoid',
    hidden_act="silu",
    max_position_embeddings=4096,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=0,
    eos_token_id=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.moe_intermediate_size = moe_intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_nextn_predict_layers = num_nextn_predict_layers
    self.num_attention_heads = num_attention_heads
    self.n_shared_experts = n_shared_experts
    self.n_routed_experts = n_routed_experts
    self.ep_size = ep_size
    self.routed_scaling_factor = routed_scaling_factor
    self.kv_lora_rank = kv_lora_rank
    self.q_lora_rank = q_lora_rank
    self.qk_rope_head_dim = qk_rope_head_dim
    self.v_head_dim = v_head_dim
    self.qk_nope_head_dim = qk_nope_head_dim
    self.topk_method = topk_method
    self.n_group = n_group
    self.topk_group = topk_group
    self.num_experts_per_tok = num_experts_per_tok
    self.moe_layer_freq = moe_layer_freq
    self.first_k_dense_replace = first_k_dense_replace
    self.norm_topk_prob = norm_topk_prob
    self.scoring_func = scoring_func
    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
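
The defaults above mirror the full DeepSeek-V3 architecture; the MoE- and MLA-specific fields can be overridden like any other PretrainedConfig keyword. A minimal sketch with hypothetical small values, not a real checkpoint:

from vllm.transformers_utils.configs import DeepseekV3Config

tiny = DeepseekV3Config(
    hidden_size=1024,
    num_hidden_layers=4,
    n_routed_experts=8,        # size of the MoE routing table
    num_experts_per_tok=2,     # top-k experts selected per token
    kv_lora_rank=128,          # MLA latent KV rank
    q_lora_rank=256,
)
print(tiny.model_type, tiny.first_k_dense_replace)  # -> deepseek_v3 3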

DeepseekVLV2Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/deepseek_vl2.py
class DeepseekVLV2Config(PretrainedConfig):
    model_type = "deepseek_vl_v2"
    vision_config: VisionEncoderConfig
    projector_config: MlpProjectorConfig

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )

    def __init__(self,
                 tile_tag: str = "tile_tag",
                 global_view_pos: str = "head",
                 candidate_resolutions: tuple[tuple[int,
                                                    int]] = ((384, 384), ),
                 **kwargs):
        super().__init__(**kwargs)

        vision_config = kwargs.get("vision_config", {})
        self.vision_config = VisionEncoderConfig(**vision_config)

        projector_config = kwargs.get("projector_config", {})
        self.projector_config = MlpProjectorConfig(**projector_config)

        language_config = kwargs.get("language_config", {})
        self.text_config = DeepseekV2Config(**language_config)

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        self.vocab_size = self.text_config.vocab_size

candidate_resolutions class-attribute instance-attribute

candidate_resolutions: tuple[tuple[int, int]] = (
    candidate_resolutions
)

global_view_pos class-attribute instance-attribute

global_view_pos: str = global_view_pos

model_type class-attribute instance-attribute

model_type = 'deepseek_vl_v2'

projector_config instance-attribute

projector_config: MlpProjectorConfig = MlpProjectorConfig(
    **projector_config
)

text_config instance-attribute

text_config = DeepseekV2Config(**language_config)

tile_tag class-attribute instance-attribute

tile_tag: str = tile_tag

vision_config instance-attribute

vision_config: VisionEncoderConfig = VisionEncoderConfig(
    **vision_config
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    tile_tag: str = "tile_tag",
    global_view_pos: str = "head",
    candidate_resolutions: tuple[tuple[int, int]] = (
        (384, 384),
    ),
    **kwargs,
)
Source code in vllm/transformers_utils/configs/deepseek_vl2.py
def __init__(self,
             tile_tag: str = "tile_tag",
             global_view_pos: str = "head",
             candidate_resolutions: tuple[tuple[int,
                                                int]] = ((384, 384), ),
             **kwargs):
    super().__init__(**kwargs)

    vision_config = kwargs.get("vision_config", {})
    self.vision_config = VisionEncoderConfig(**vision_config)

    projector_config = kwargs.get("projector_config", {})
    self.projector_config = MlpProjectorConfig(**projector_config)

    language_config = kwargs.get("language_config", {})
    self.text_config = DeepseekV2Config(**language_config)

    self.tile_tag = tile_tag
    self.global_view_pos = global_view_pos
    self.candidate_resolutions = candidate_resolutions
    self.vocab_size = self.text_config.vocab_size
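
The nested sections are read out of **kwargs (vision_config, projector_config, language_config), so they are passed as plain dicts. A sketch, assuming the sub-config classes provide defaults when given empty dicts:

from vllm.transformers_utils.configs import DeepseekVLV2Config

cfg = DeepseekVLV2Config(
    tile_tag="2D",
    global_view_pos="head",
    vision_config={},          # -> VisionEncoderConfig() defaults
    projector_config={},       # -> MlpProjectorConfig() defaults
    language_config={},        # -> DeepseekV2Config() defaults
)
# vocab_size is copied from the text (language) config.
print(cfg.vocab_size == cfg.text_config.vocab_size)  # -> True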

DotsOCRConfig

Bases: Qwen2Config

Source code in vllm/transformers_utils/configs/dotsocr.py
class DotsOCRConfig(Qwen2Config):
    model_type = "dots_ocr"

    def __init__(self,
                 image_token_id=151665,
                 video_token_id=151656,
                 vision_config: Optional[dict] = None,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_config = DotsVisionConfig(**(vision_config or {}))

    def save_pretrained(self, save_directory, **kwargs):
        self._auto_class = None
        super().save_pretrained(save_directory, **kwargs)

image_token_id instance-attribute

image_token_id = image_token_id

model_type class-attribute instance-attribute

model_type = 'dots_ocr'

video_token_id instance-attribute

video_token_id = video_token_id

vision_config instance-attribute

vision_config = DotsVisionConfig(**(vision_config or {}))

__init__

__init__(
    image_token_id=151665,
    video_token_id=151656,
    vision_config: Optional[dict] = None,
    *args,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/dotsocr.py
def __init__(self,
             image_token_id=151665,
             video_token_id=151656,
             vision_config: Optional[dict] = None,
             *args,
             **kwargs):
    super().__init__(*args, **kwargs)
    self.image_token_id = image_token_id
    self.video_token_id = video_token_id
    self.vision_config = DotsVisionConfig(**(vision_config or {}))

save_pretrained

save_pretrained(save_directory, **kwargs)
Source code in vllm/transformers_utils/configs/dotsocr.py
def save_pretrained(self, save_directory, **kwargs):
    self._auto_class = None
    super().save_pretrained(save_directory, **kwargs)
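
DotsOCRConfig extends Qwen2Config with multimodal token IDs and a nested vision config, and save_pretrained clears _auto_class so the saved file does not pin a remote-code auto class. A brief usage sketch, assuming DotsVisionConfig defaults and a hypothetical output directory:

from vllm.transformers_utils.configs import DotsOCRConfig

cfg = DotsOCRConfig(vision_config={})          # empty dict -> DotsVisionConfig defaults
print(cfg.model_type)                          # -> dots_ocr
print(cfg.image_token_id, cfg.video_token_id)  # -> 151665 151656
cfg.save_pretrained("/tmp/dots_ocr_cfg")       # writes config.json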

EAGLEConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/eagle.py
class EAGLEConfig(PretrainedConfig):
    model_type = "eagle"

    def __init__(self,
                 model: Union[PretrainedConfig, dict, None] = None,
                 truncated_vocab_size: Optional[int] = None,
                 method: Optional[str] = 'eagle',
                 **kwargs):

        model_config: Union[PretrainedConfig, DeepseekV2Config, None]
        if isinstance(model, dict):
            archs = model.get("architectures", [])
            target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
            if any(target_arch in archs for target_arch in target_archs):
                # AutoConfig does not support DeepSeek MoE models yet
                model_config = DeepseekV2Config(**model)
            else:
                model_config = AutoConfig.for_model(**model)
        else:
            model_config = model

        for k, v in kwargs.items():
            if k != "architectures" and k != "model_type" and hasattr(
                    model_config, k):
                setattr(model_config, k, v)

        self.model = model_config

        if self.model is None:
            self.truncated_vocab_size = None
        else:
            self.truncated_vocab_size = self.model.vocab_size if \
                truncated_vocab_size is None else truncated_vocab_size

        # Eagle model name should follow naming convention of
        # LlamaForCausalLM -> EagleLlamaForCausalLM
        # LlamaForCausalLM -> Eagle3LlamaForCausalLM
        # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
        if method == "eagle":
            assert self.model is not None, \
                "model should not be None when method is eagle"
            kwargs["architectures"] = [
                f"Eagle{arch}" if not arch.startswith("Eagle") \
                    else arch for arch in self.model.architectures
            ]

        elif method == "eagle3":
            assert self.model is not None, \
                "model should not be None when method is eagle3"
            kwargs["architectures"] = [
                arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
                else f"Eagle3{arch}" for arch in self.model.architectures
            ]
        else:
            raise ValueError(f"Invalid method {method}. "
                             "Supported methods are eagle and eagle3.")

        super().__init__(**kwargs)

        if self.model is not None:
            for k, v in self.model.to_dict().items():
                if k not in kwargs:
                    setattr(self, k, v)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "EAGLEConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

model instance-attribute

model = model_config

model_type class-attribute instance-attribute

model_type = 'eagle'

truncated_vocab_size instance-attribute

truncated_vocab_size = None

__init__

__init__(
    model: Union[PretrainedConfig, dict, None] = None,
    truncated_vocab_size: Optional[int] = None,
    method: Optional[str] = "eagle",
    **kwargs,
)
Source code in vllm/transformers_utils/configs/eagle.py
def __init__(self,
             model: Union[PretrainedConfig, dict, None] = None,
             truncated_vocab_size: Optional[int] = None,
             method: Optional[str] = 'eagle',
             **kwargs):

    model_config: Union[PretrainedConfig, DeepseekV2Config, None]
    if isinstance(model, dict):
        archs = model.get("architectures", [])
        target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
        if any(target_arch in archs for target_arch in target_archs):
            # AutoConfig does not support DeepSeek MoE models yet
            model_config = DeepseekV2Config(**model)
        else:
            model_config = AutoConfig.for_model(**model)
    else:
        model_config = model

    for k, v in kwargs.items():
        if k != "architectures" and k != "model_type" and hasattr(
                model_config, k):
            setattr(model_config, k, v)

    self.model = model_config

    if self.model is None:
        self.truncated_vocab_size = None
    else:
        self.truncated_vocab_size = self.model.vocab_size if \
            truncated_vocab_size is None else truncated_vocab_size

    # Eagle model name should follow naming convention of
    # LlamaForCausalLM -> EagleLlamaForCausalLM
    # LlamaForCausalLM -> Eagle3LlamaForCausalLM
    # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
    if method == "eagle":
        assert self.model is not None, \
            "model should not be None when method is eagle"
        kwargs["architectures"] = [
            f"Eagle{arch}" if not arch.startswith("Eagle") \
                else arch for arch in self.model.architectures
        ]

    elif method == "eagle3":
        assert self.model is not None, \
            "model should not be None when method is eagle3"
        kwargs["architectures"] = [
            arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
            else f"Eagle3{arch}" for arch in self.model.architectures
        ]
    else:
        raise ValueError(f"Invalid method {method}. "
                         "Supported methods are eagle and eagle3.")

    super().__init__(**kwargs)

    if self.model is not None:
        for k, v in self.model.to_dict().items():
            if k not in kwargs:
                setattr(self, k, v)

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> EAGLEConfig
Source code in vllm/transformers_utils/configs/eagle.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "EAGLEConfig":
    config_dict, kwargs = cls.get_config_dict(
        pretrained_model_name_or_path, **kwargs)
    return cls.from_dict(config_dict, **kwargs)
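
When model is passed as a dict, the target model's config is rebuilt (via AutoConfig.for_model, or DeepseekV2Config for DeepSeek MoE architectures) and its architecture names are prefixed according to method. A sketch with a hypothetical Llama-style target dict:

from vllm.transformers_utils.configs import EAGLEConfig

target = {"model_type": "llama", "architectures": ["LlamaForCausalLM"]}

cfg = EAGLEConfig(model=target, method="eagle")
print(cfg.architectures)     # -> ['EagleLlamaForCausalLM']

cfg3 = EAGLEConfig(model=target, method="eagle3")
print(cfg3.architectures)    # -> ['Eagle3LlamaForCausalLM']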

JAISConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [JAISModel]. It is used to instantiate a JAIS model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

  • vocab_size (`int`, *optional*, defaults to 50257): Vocabulary size of the JAIS model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [JAISModel].
  • n_positions (`int`, *optional*, defaults to 1024): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512, 1024, or 2048).
  • n_embd (`int`, *optional*, defaults to 768): Dimensionality of the embeddings and hidden states.
  • n_layer (`int`, *optional*, defaults to 12): Number of hidden layers in the Transformer encoder.
  • n_head (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder.
  • n_inner (`int`, *optional*, defaults to None): Dimensionality of the inner feed-forward layers. None sets it to 4 times n_embd.
  • activation_function (`str`, *optional*, defaults to `"gelu_new"`): Activation function, to be selected from ["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"].
  • resid_pdrop (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  • embd_pdrop (`float`, *optional*, defaults to 0.1): The dropout ratio for the embeddings.
  • attn_pdrop (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention.
  • layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): The epsilon to use in the layer normalization layers.
  • initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  • scale_attn_weights (`bool`, *optional*, defaults to `True`): Scale attention weights by dividing by sqrt(hidden_size).
  • use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models).
  • scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): Whether to additionally scale attention weights by 1 / (layer_idx + 1).
  • reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): Whether to scale keys (K) prior to computing attention (dot-product) and upcast the attention dot-product/softmax to float() when training with mixed precision.
  • position_embedding_type (`str`, *optional*, defaults to `"learned"`): Positional embedding can be either "alibi" or "learned".
  • mup_width_scale (`float`, *optional*, defaults to 1.0): muP parameter to scale learning rate and initializers. Calculated as (d_model,0 / d_model), where d_model is the model's width and d_model,0 is the proxy model's width.
  • mup_embeddings_scale (`float`, *optional*, defaults to 1.0): muP parameter to scale token and position embeddings.
  • mup_output_alpha (`float`, *optional*, defaults to 1.0): muP parameter to scale output logits (output_logits_scale = mup_output_alpha * mup_width_scale).
  • mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`): Scale attention weights by dividing by hidden_size instead of sqrt(hidden_size). scale_attn_weights must also be set to True.
  • alibi_scaling (`dict`, *optional*, defaults to None): Dictionary containing the scaling configuration for ALiBi embeddings. Currently only supports the linear scaling strategy. Can specify either the scaling factor (must be a float greater than 1) for fixed scaling, or train_seq_len for dynamic scaling on input samples with sequence length > train_seq_len. The expected formats are {"type": strategy name, "factor": scaling factor} or {"type": strategy name, "train_seq_len": training sequence length}.
  • architectures (`list`, *optional*, defaults to None): Architecture names for JAIS. When left as None, it is resolved to ['JAISLMHeadModel'].

Example:

>>> from transformers import JAISConfig, JAISModel

>>> # Initializing a JAIS configuration
>>> configuration = JAISConfig()

>>> # Initializing a model (with random weights) from the configuration
>>> model = JAISModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/jais.py
class JAISConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a
    [`JAISModel`]. It is used to instantiate a JAIS model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the JAIS model. Defines the number of different
            tokens that can be represented by the
            `inputs_ids` passed when calling [`JAISModel`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used
            with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set
            it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list
            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in
            the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size)..
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
            Whether to additionally scale attention weights 
            by `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention
            (dot-product)
            and upcast attention dot-product/softmax to float() when training
            with mixed precision.
        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
            Positional embedding can be either `"alibi"` or `"learned"`.
        mup_width_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale learning rate and initializers. Calculated
            as (`d_model,0 / d_model`), where
            `d_model` is the model's width and `d_model,0` is the proxy
            model's width.
        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale token and position embeddings.
        mup_output_alpha (`float`, *optional*, defaults to 1.0):
            muP parameter to scale output logits
            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
            Scale attention weights by dividing by hidden_size instead of
            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
            well.
        alibi_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for ALiBi
            embeddings. Currently only supports linear
            scaling strategy. Can specify either the scaling `factor` (must be
            a float greater than 1) for fixed scaling
            or `train_seq_len` for dynamic scaling on input samples with
            sequence length > `train_seq_len`. The expected
            formats are `{"type": strategy name, "factor": scaling factor}` or
            `{"type": strategy name,
            "train_seq_len": training sequence length}`.
        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
            architecture names for Jais.

    Example:

    ```python
    >>> from transformers import JAISConfig, JAISModel

    >>> # Initializing a JAIS configuration
    >>> configuration = JAISConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = JAISModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "jais"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        position_embedding_type="learned",
        mup_width_scale=1.0,
        mup_embeddings_scale=1.0,
        mup_output_alpha=1.0,
        mup_scale_qk_dot_by_d=False,
        alibi_scaling=None,
        architectures=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        self.position_embedding_type = position_embedding_type
        self.mup_width_scale = mup_width_scale
        self.mup_embeddings_scale = mup_embeddings_scale
        self.mup_output_alpha = mup_output_alpha
        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

        self.alibi_scaling = alibi_scaling
        self._alibi_scaling_validation()
        if architectures is None:
            architectures = ["JAISLMHeadModel"]

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            architectures=architectures,
            **kwargs,
        )

    def _alibi_scaling_validation(self):
        """
        Validate the `alibi_scaling` configuration.
        """
        if self.alibi_scaling is None:
            return

        if (not isinstance(self.alibi_scaling, dict)
                or len(self.alibi_scaling) != 2):
            raise ValueError(
                "`alibi_scaling` must be a dictionary with two fields, "
                "`type` and `factor` or `type` and `train_seq_len`, "
                f"got {self.alibi_scaling}")
        alibi_scaling_type = self.alibi_scaling.get("type", None)
        alibi_scaling_factor = self.alibi_scaling.get("factor", None)
        alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
        if alibi_scaling_type is None or alibi_scaling_type != "linear":
            raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                             f"got {alibi_scaling_type}")
        if (alibi_scaling_factor is not None
                and not isinstance(alibi_scaling_factor, float)
                or (alibi_scaling_factor is not None
                    and alibi_scaling_factor <= 1.0)):
            raise ValueError(
                f"`alibi_scaling`'s factor field must be a float > 1.0, "
                f"got {alibi_scaling_factor}")
        if (alibi_dynamic_scaling is not None
                and not isinstance(alibi_dynamic_scaling, int)
                or (alibi_dynamic_scaling is not None
                    and alibi_dynamic_scaling <= 1)):
            raise ValueError(
                f"`alibi_scaling`'s `train_seq_len` field must be an "
                f"integer > 1, got {alibi_dynamic_scaling}")

activation_function instance-attribute

activation_function = activation_function

alibi_scaling instance-attribute

alibi_scaling = alibi_scaling

attn_pdrop instance-attribute

attn_pdrop = attn_pdrop

attribute_map class-attribute instance-attribute

attribute_map = {
    "hidden_size": "n_embd",
    "max_position_embeddings": "n_positions",
    "num_attention_heads": "n_head",
    "num_hidden_layers": "n_layer",
}

bos_token_id instance-attribute

bos_token_id = bos_token_id

embd_pdrop instance-attribute

embd_pdrop = embd_pdrop

eos_token_id instance-attribute

eos_token_id = eos_token_id

initializer_range instance-attribute

initializer_range = initializer_range

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

model_type class-attribute instance-attribute

model_type = 'jais'

mup_embeddings_scale instance-attribute

mup_embeddings_scale = mup_embeddings_scale

mup_output_alpha instance-attribute

mup_output_alpha = mup_output_alpha

mup_scale_qk_dot_by_d instance-attribute

mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

mup_width_scale instance-attribute

mup_width_scale = mup_width_scale

n_embd instance-attribute

n_embd = n_embd

n_head instance-attribute

n_head = n_head

n_inner instance-attribute

n_inner = n_inner

n_layer instance-attribute

n_layer = n_layer

n_positions instance-attribute

n_positions = n_positions

position_embedding_type instance-attribute

position_embedding_type = position_embedding_type

reorder_and_upcast_attn instance-attribute

reorder_and_upcast_attn = reorder_and_upcast_attn

resid_pdrop instance-attribute

resid_pdrop = resid_pdrop

scale_attn_by_inverse_layer_idx instance-attribute

scale_attn_by_inverse_layer_idx = (
    scale_attn_by_inverse_layer_idx
)

scale_attn_weights instance-attribute

scale_attn_weights = scale_attn_weights

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=None,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    scale_attn_weights=True,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
    scale_attn_by_inverse_layer_idx=False,
    reorder_and_upcast_attn=False,
    position_embedding_type="learned",
    mup_width_scale=1.0,
    mup_embeddings_scale=1.0,
    mup_output_alpha=1.0,
    mup_scale_qk_dot_by_d=False,
    alibi_scaling=None,
    architectures=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/jais.py
def __init__(
    self,
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=None,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    scale_attn_weights=True,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
    scale_attn_by_inverse_layer_idx=False,
    reorder_and_upcast_attn=False,
    position_embedding_type="learned",
    mup_width_scale=1.0,
    mup_embeddings_scale=1.0,
    mup_output_alpha=1.0,
    mup_scale_qk_dot_by_d=False,
    alibi_scaling=None,
    architectures=None,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.n_positions = n_positions
    self.n_embd = n_embd
    self.n_layer = n_layer
    self.n_head = n_head
    self.n_inner = n_inner
    self.activation_function = activation_function
    self.resid_pdrop = resid_pdrop
    self.embd_pdrop = embd_pdrop
    self.attn_pdrop = attn_pdrop
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.scale_attn_weights = scale_attn_weights
    self.use_cache = use_cache
    self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
    self.reorder_and_upcast_attn = reorder_and_upcast_attn

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id

    self.position_embedding_type = position_embedding_type
    self.mup_width_scale = mup_width_scale
    self.mup_embeddings_scale = mup_embeddings_scale
    self.mup_output_alpha = mup_output_alpha
    self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

    self.alibi_scaling = alibi_scaling
    self._alibi_scaling_validation()
    if architectures is None:
        architectures = ["JAISLMHeadModel"]

    super().__init__(
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        architectures=architectures,
        **kwargs,
    )

_alibi_scaling_validation

_alibi_scaling_validation()

Validate the alibi_scaling configuration.

Source code in vllm/transformers_utils/configs/jais.py
def _alibi_scaling_validation(self):
    """
    Validate the `alibi_scaling` configuration.
    """
    if self.alibi_scaling is None:
        return

    if (not isinstance(self.alibi_scaling, dict)
            or len(self.alibi_scaling) != 2):
        raise ValueError(
            "`alibi_scaling` must be a dictionary with two fields, "
            "`type` and `factor` or `type` and `train_seq_len`, "
            f"got {self.alibi_scaling}")
    alibi_scaling_type = self.alibi_scaling.get("type", None)
    alibi_scaling_factor = self.alibi_scaling.get("factor", None)
    alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
    if alibi_scaling_type is None or alibi_scaling_type != "linear":
        raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                         f"got {alibi_scaling_type}")
    if (alibi_scaling_factor is not None
            and not isinstance(alibi_scaling_factor, float)
            or (alibi_scaling_factor is not None
                and alibi_scaling_factor <= 1.0)):
        raise ValueError(
            f"`alibi_scaling`'s factor field must be a float > 1.0, "
            f"got {alibi_scaling_factor}")
    if (alibi_dynamic_scaling is not None
            and not isinstance(alibi_dynamic_scaling, int)
            or (alibi_dynamic_scaling is not None
                and alibi_dynamic_scaling <= 1)):
        raise ValueError(
            f"`alibi_scaling`'s `train_seq_len` field must be an "
            f"integer > 1, got {alibi_dynamic_scaling}")

KimiVLConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/kimi_vl.py
class KimiVLConfig(PretrainedConfig):
    model_type = "kimi_vl"

    def __init__(self,
                 vision_config: Optional[Union[dict, MoonViTConfig]] = None,
                 text_config: Optional[Union[dict, DeepseekV2Config]] = None,
                 ignore_index: int = -100,
                 media_placeholder_token_id: int = 163605,
                 pad_token_id: int = 0,
                 **kwargs):
        if vision_config is None:
            vision_config = MoonViTConfig()
        elif isinstance(vision_config, dict):
            vision_config = MoonViTConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = DeepseekV2Config()
        elif isinstance(text_config, dict):
            text_config = DeepseekV2Config(**text_config)
        self.text_config = text_config

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        super().__init__(pad_token_id=pad_token_id, **kwargs)

ignore_index instance-attribute

ignore_index = ignore_index

media_placeholder_token_id instance-attribute

media_placeholder_token_id = media_placeholder_token_id

model_type class-attribute instance-attribute

model_type = 'kimi_vl'

text_config instance-attribute

text_config = text_config

vision_config instance-attribute

vision_config = vision_config

__init__

__init__(
    vision_config: Optional[
        Union[dict, MoonViTConfig]
    ] = None,
    text_config: Optional[
        Union[dict, DeepseekV2Config]
    ] = None,
    ignore_index: int = -100,
    media_placeholder_token_id: int = 163605,
    pad_token_id: int = 0,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/kimi_vl.py
def __init__(self,
             vision_config: Optional[Union[dict, MoonViTConfig]] = None,
             text_config: Optional[Union[dict, DeepseekV2Config]] = None,
             ignore_index: int = -100,
             media_placeholder_token_id: int = 163605,
             pad_token_id: int = 0,
             **kwargs):
    if vision_config is None:
        vision_config = MoonViTConfig()
    elif isinstance(vision_config, dict):
        vision_config = MoonViTConfig(**vision_config)
    self.vision_config = vision_config

    if text_config is None:
        text_config = DeepseekV2Config()
    elif isinstance(text_config, dict):
        text_config = DeepseekV2Config(**text_config)
    self.text_config = text_config

    self.ignore_index = ignore_index
    self.media_placeholder_token_id = media_placeholder_token_id

    super().__init__(pad_token_id=pad_token_id, **kwargs)
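
vision_config and text_config accept either ready-made config objects, plain dicts, or None (which falls back to the sub-config defaults). A minimal sketch, assuming MoonViTConfig and DeepseekV2Config defaults:

from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig

# Dicts are promoted to the corresponding config classes; None uses defaults.
cfg = KimiVLConfig(vision_config={}, text_config=None)
assert isinstance(cfg.vision_config, MoonViTConfig)
print(cfg.media_placeholder_token_id, cfg.pad_token_id)  # -> 163605 0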

MLPSpeculatorConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/mlp_speculator.py
class MLPSpeculatorConfig(PretrainedConfig):
    model_type = "mlp_speculator"

    attribute_map = {
        "hidden_size": "emb_dim",
    }

    def __init__(self,
                 vocab_size: int = 32000,
                 emb_dim: int = 4096,
                 inner_dim: int = 0,
                 n_predict: int = 3,
                 top_k_tokens_per_head: Optional[list[int]] = None,
                 n_candidates: int = 5,
                 tie_weights: bool = False,
                 scale_input: bool = False,
                 **kwargs):
        """
        Initialize an MLPSpeculatorConfig

        Args:
            vocab_size: int
                the model vocab size
            emb_dim: int
                the model embedding dimension
            inner_dim: int
                the inner dimension of the model. If 0, will be the emb_dim.
            n_predict: int
                the number of lookaheads for the speculator
            top_k_tokens_per_head: list[int]
                Number of tokens to consider from each head when forming the
                candidate tree.
                For each candidate branch in the tree, head n produces topk[n]
                additional sub-branches.
                NOTE: This parameter is currently unused.
            n_candidates: int
                number of child candidates to create per sequence
            tie_weights: bool
                If true, use a single set of weights for every model
                head/stage after the first. The initial projection
                from the base model may have a different size, so that
                stays separate.
            scale_input: bool
                if True, will scale the initial hidden states from
                the base model.
        """
        if top_k_tokens_per_head is None:
            top_k_tokens_per_head = [5, 4, 3]
        assert len(top_k_tokens_per_head) == n_predict
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.inner_dim = inner_dim
        self.n_predict = n_predict
        self.top_k_tokens_per_head = top_k_tokens_per_head
        self.n_candidates = n_candidates
        self.num_lookahead_tokens = n_predict
        self.tie_weights = tie_weights
        self.scale_input = scale_input

        super().__init__(**kwargs)

attribute_map class-attribute instance-attribute

attribute_map = {'hidden_size': 'emb_dim'}

emb_dim instance-attribute

emb_dim = emb_dim

inner_dim instance-attribute

inner_dim = inner_dim

model_type class-attribute instance-attribute

model_type = 'mlp_speculator'

n_candidates instance-attribute

n_candidates = n_candidates

n_predict instance-attribute

n_predict = n_predict

num_lookahead_tokens instance-attribute

num_lookahead_tokens = n_predict

scale_input instance-attribute

scale_input = scale_input

tie_weights instance-attribute

tie_weights = tie_weights

top_k_tokens_per_head instance-attribute

top_k_tokens_per_head = top_k_tokens_per_head

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size: int = 32000,
    emb_dim: int = 4096,
    inner_dim: int = 0,
    n_predict: int = 3,
    top_k_tokens_per_head: Optional[list[int]] = None,
    n_candidates: int = 5,
    tie_weights: bool = False,
    scale_input: bool = False,
    **kwargs,
)

Initialize an MLPSpeculatorConfig

Parameters:

Name Type Description Default
vocab_size int

int the model vocab size

32000
emb_dim int

int the model embedding dimension

4096
inner_dim int

int the inner dimension of the model. If 0, will be the emb_dim.

0
n_predict int

int the number of lookaheads for the speculator

3
top_k_tokens_per_head Optional[list[int]]

list[int] Number of tokens to consider from each head when forming the candidate tree. For each candidate branch in the tree, head n produces topk[n] additional sub-branches. NOTE: This parameter is currently unused.

None
n_candidates int

int number of child candidates to create per sequence

5
tie_weights bool

bool If true, use a single set of weights for every model head/stage after the first. The initial projection from the base model may have a different size, so that stays separate.

False
scale_input bool

bool if True, will scale the initial hidden states from the base model.

False
Source code in vllm/transformers_utils/configs/mlp_speculator.py
def __init__(self,
             vocab_size: int = 32000,
             emb_dim: int = 4096,
             inner_dim: int = 0,
             n_predict: int = 3,
             top_k_tokens_per_head: Optional[list[int]] = None,
             n_candidates: int = 5,
             tie_weights: bool = False,
             scale_input: bool = False,
             **kwargs):
    """
    Initialize an MLPSpeculatorConfig

    Args:
        vocab_size: int
            the model vocab size
        emb_dim: int
            the model embedding dimension
        inner_dim: int
            the inner dimension of the model. If 0, will be the emb_dim.
        n_predict: int
            the number of lookaheads for the speculator
        top_k_tokens_per_head: list[int]
            Number of tokens to consider from each head when forming the
            candidate tree.
            For each candidate branch in the tree, head n produces topk[n]
            additional sub-branches.
            NOTE: This parameter is currently unused.
        n_candidates: int
            number of child candidates to create per sequence
        tie_weights: bool
            If true, use a single set of weights for every model
            head/stage after the first. The initial projection
            from the base model may have a different size, so that
            stays separate.
        scale_input: bool
            if True, will scale the initial hidden states from
            the base model.
    """
    if top_k_tokens_per_head is None:
        top_k_tokens_per_head = [5, 4, 3]
    assert len(top_k_tokens_per_head) == n_predict
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.inner_dim = inner_dim
    self.n_predict = n_predict
    self.top_k_tokens_per_head = top_k_tokens_per_head
    self.n_candidates = n_candidates
    self.num_lookahead_tokens = n_predict
    self.tie_weights = tie_weights
    self.scale_input = scale_input

    super().__init__(**kwargs)
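
For reference, here is a minimal construction sketch (assuming vLLM is installed so that `vllm.transformers_utils.configs` is importable). Note that `top_k_tokens_per_head` must contain exactly `n_predict` entries, and `hidden_size` is an alias for `emb_dim` via `attribute_map`:

```python
from vllm.transformers_utils.configs import MLPSpeculatorConfig

# Three lookahead heads, one top-k value per head (lengths must match).
config = MLPSpeculatorConfig(
    vocab_size=32000,
    emb_dim=4096,
    inner_dim=3072,
    n_predict=3,
    top_k_tokens_per_head=[5, 4, 3],
)

assert config.hidden_size == config.emb_dim  # aliased through attribute_map
assert config.num_lookahead_tokens == config.n_predict
```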

MedusaConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/medusa.py
class MedusaConfig(PretrainedConfig):
    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        self.max_seq_len = int(2**20)
        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
            else truncated_vocab_size
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["MedusaModel"]

        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "MedusaConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        for k in list(config_dict.keys()):
            if 'num' in k:
                if 'heads' in k:
                    config_dict["num_heads"] = config_dict.pop(k)
                elif 'layers' in k:
                    config_dict["num_hidden_layers"] = config_dict.pop(k)
        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        return 0

    @property
    def num_lookahead_tokens(self):
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens

hidden_size instance-attribute

hidden_size = hidden_size

max_paths instance-attribute

max_paths = max_paths

max_seq_len instance-attribute

max_seq_len = int(2 ** 20)

model_type class-attribute instance-attribute

model_type = 'medusa'

num_attention_heads property

num_attention_heads

num_heads instance-attribute

num_heads = num_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_lookahead_tokens property writable

num_lookahead_tokens

topk instance-attribute

topk = topk

truncated_vocab_size instance-attribute

truncated_vocab_size = (
    vocab_size
    if truncated_vocab_size is None
    else truncated_vocab_size
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    hidden_size: int = 4096,
    vocab_size: int = 32001,
    num_heads: int = 5,
    num_hidden_layers: int = 1,
    max_paths: int = 64,
    topk: int = 10,
    truncated_vocab_size: Optional[int] = None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/medusa.py
def __init__(self,
             hidden_size: int = 4096,
             vocab_size: int = 32001,
             num_heads: int = 5,
             num_hidden_layers: int = 1,
             max_paths: int = 64,
             topk: int = 10,
             truncated_vocab_size: Optional[int] = None,
             **kwargs):

    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.num_heads = num_heads
    self.num_hidden_layers = num_hidden_layers
    self.max_paths = max_paths
    self.topk = topk
    self.max_seq_len = int(2**20)
    self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
        else truncated_vocab_size
    if "architectures" not in kwargs:
        kwargs["architectures"] = ["MedusaModel"]

    super().__init__(**kwargs)

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> MedusaConfig
Source code in vllm/transformers_utils/configs/medusa.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "MedusaConfig":
    config_dict, kwargs = cls.get_config_dict(
        pretrained_model_name_or_path, **kwargs)
    for k in list(config_dict.keys()):
        if 'num' in k:
            if 'heads' in k:
                config_dict["num_heads"] = config_dict.pop(k)
            elif 'layers' in k:
                config_dict["num_hidden_layers"] = config_dict.pop(k)
    return cls.from_dict(config_dict, **kwargs)
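
A short usage sketch based on the code above: `num_lookahead_tokens` is a writable alias for `num_heads`, and `truncated_vocab_size` falls back to `vocab_size` when left unset. The key remapping performed by `from_pretrained` (any key containing both `num` and `heads`/`layers` is renamed) is shown against an in-memory dict with hypothetical key names:

```python
from vllm.transformers_utils.configs import MedusaConfig

config = MedusaConfig(hidden_size=4096, vocab_size=32001, num_heads=5)
assert config.truncated_vocab_size == config.vocab_size  # fallback when unset
assert config.num_lookahead_tokens == 5

config.num_lookahead_tokens = 4  # writable property, updates num_heads
assert config.num_heads == 4

# The same remapping that from_pretrained applies to on-disk config keys
# (key names here are purely illustrative).
raw = {"medusa_num_heads": 5, "medusa_num_layers": 1}
for k in list(raw.keys()):
    if "num" in k:
        if "heads" in k:
            raw["num_heads"] = raw.pop(k)
        elif "layers" in k:
            raw["num_hidden_layers"] = raw.pop(k)
assert raw == {"num_heads": 5, "num_hidden_layers": 1}
```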

MiDashengLMConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/midashenglm.py
class MiDashengLMConfig(PretrainedConfig):
    model_type = "midashenglm"

    def __init__(
        self,
        audio_encoder_config: Optional[dict] = None,
        subsample_factor: int = 5,
        text_config: Optional[dict] = None,
        audio_token_id: Optional[int] = None,
        **kwargs,
    ):
        self.audio_encoder_config = DashengConfig(
            **(audio_encoder_config or {}))
        self.subsample_factor = subsample_factor
        self.text_config = (Qwen2_5OmniTextConfig(
            **text_config) if text_config else Qwen2_5OmniTextConfig())
        self.text_config.rope_scaling = None  # uses_mrope is false
        self.audio_token_id = audio_token_id
        super().__init__(**kwargs)

audio_encoder_config instance-attribute

audio_encoder_config = DashengConfig(
    **(audio_encoder_config or {})
)

audio_token_id instance-attribute

audio_token_id = audio_token_id

model_type class-attribute instance-attribute

model_type = 'midashenglm'

subsample_factor instance-attribute

subsample_factor = subsample_factor

text_config instance-attribute

text_config = (
    Qwen2_5OmniTextConfig(**text_config)
    if text_config
    else Qwen2_5OmniTextConfig()
)

__init__

__init__(
    audio_encoder_config: Optional[dict] = None,
    subsample_factor: int = 5,
    text_config: Optional[dict] = None,
    audio_token_id: Optional[int] = None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/midashenglm.py
def __init__(
    self,
    audio_encoder_config: Optional[dict] = None,
    subsample_factor: int = 5,
    text_config: Optional[dict] = None,
    audio_token_id: Optional[int] = None,
    **kwargs,
):
    self.audio_encoder_config = DashengConfig(
        **(audio_encoder_config or {}))
    self.subsample_factor = subsample_factor
    self.text_config = (Qwen2_5OmniTextConfig(
        **text_config) if text_config else Qwen2_5OmniTextConfig())
    self.text_config.rope_scaling = None  # uses_mrope is false
    self.audio_token_id = audio_token_id
    super().__init__(**kwargs)
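
A minimal construction sketch, assuming the `DashengConfig` and `Qwen2_5OmniTextConfig` dependencies referenced above are available in the installed environment. With no sub-config dicts, the audio and text configs fall back to their defaults, and `rope_scaling` is cleared on the text config:

```python
from vllm.transformers_utils.configs import MiDashengLMConfig

# audio_token_id value is illustrative only; use the id from the checkpoint.
config = MiDashengLMConfig(audio_token_id=151646)
assert config.subsample_factor == 5
assert config.text_config.rope_scaling is None  # forced off (uses_mrope is false)
```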

MoonViTConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/moonvit.py
class MoonViTConfig(PretrainedConfig):
    model_type = "moonvit"

    def __init__(
            self,
            patch_size: int = 14,
            init_pos_emb_height: int = 64,
            init_pos_emb_width: int = 64,
            num_attention_heads: int = 16,
            num_hidden_layers: int = 27,
            hidden_size: int = 1152,
            intermediate_size: int = 4304,
            merge_kernel_size: tuple[int, int] = (2, 2),
            **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        # Positional embedding config
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        # Transformer config
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        # Patch merger config
        self.merge_kernel_size = merge_kernel_size

hidden_size instance-attribute

hidden_size = hidden_size

init_pos_emb_height instance-attribute

init_pos_emb_height = init_pos_emb_height

init_pos_emb_width instance-attribute

init_pos_emb_width = init_pos_emb_width

intermediate_size instance-attribute

intermediate_size = intermediate_size

merge_kernel_size instance-attribute

merge_kernel_size = merge_kernel_size

model_type class-attribute instance-attribute

model_type = 'moonvit'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

patch_size instance-attribute

patch_size = patch_size

__init__

__init__(
    patch_size: int = 14,
    init_pos_emb_height: int = 64,
    init_pos_emb_width: int = 64,
    num_attention_heads: int = 16,
    num_hidden_layers: int = 27,
    hidden_size: int = 1152,
    intermediate_size: int = 4304,
    merge_kernel_size: tuple[int, int] = (2, 2),
    **kwargs,
)
Source code in vllm/transformers_utils/configs/moonvit.py
def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
):
    super().__init__(**kwargs)
    self.patch_size = patch_size
    # Positional embedding config
    self.init_pos_emb_height = init_pos_emb_height
    self.init_pos_emb_width = init_pos_emb_width
    # Transformer config
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    # Patch merger config
    self.merge_kernel_size = merge_kernel_size

NemotronConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [NemotronModel]. It is used to instantiate a Nemotron model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Nemotron-8B.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

Name Type Description Default
vocab_size `int`, *optional*, defaults to 256000

Vocabulary size of the Nemotron model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [NemotronModel]

256000
hidden_size `int`, *optional*, defaults to 6144

Dimension of the hidden representations.

6144
intermediate_size `int`, *optional*, defaults to 24576

Dimension of the MLP representations.

24576
num_hidden_layers `int`, *optional*, defaults to 32

Number of hidden layers in the Transformer decoder.

32
num_attention_heads `int`, *optional*, defaults to 48

Number of attention heads for each attention layer in the Transformer decoder.

48
head_dim `int`, *optional*

Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if None

None
num_key_value_heads `int`, *optional*

This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group. For more details check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to `num_attention_heads`.

None
hidden_act `str` or `function`, *optional*, defaults to `"relu2"`

The non-linear activation function (function or string) in the decoder.

'relu2'
max_position_embeddings `int`, *optional*, defaults to 4096

The maximum sequence length that this model might ever be used with.

4096
initializer_range `float`, *optional*, defaults to 0.0134

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

0.0134
norm_eps `float`, *optional*, defaults to 1e-05

The epsilon used by the normalization layers.

1e-05
use_cache `bool`, *optional*, defaults to `True`

Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.

True
pad_token_id `int`, *optional*

Padding token id.

None
bos_token_id `int`, *optional*, defaults to 2

Beginning of stream token id.

2
eos_token_id `int`, *optional*, defaults to 3

End of stream token id.

3
tie_word_embeddings `bool`, *optional*, defaults to `False`

Whether to tie weight embeddings

False
rope_theta `float`, *optional*, defaults to 10000.0

The base period of the RoPE embeddings.

10000.0
partial_rotary_factor `float`, *optional*, defaults to 0.5

Percentage of the query and keys which will have rotary embedding.

0.5
attention_bias `bool`, *optional*, defaults to `False`

Whether to use a bias in the query, key, value and output projection layers during self-attention.

False
attention_dropout `float`, *optional*, defaults to 0.0

The dropout ratio for the attention probabilities.

0.0
mlp_bias `bool`, *optional*, defaults to `False`

Whether to use a bias in up_proj and down_proj layers in the MLP layers.

False
>>> from transformers import NemotronModel, NemotronConfig
>>> # Initializing a Nemotron nemotron-15b style configuration
>>> configuration = NemotronConfig()
>>> # Initializing a model from the nemotron-15b style configuration
>>> model = NemotronModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/nemotron.py
class NemotronConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronModel`]. It is used to instantiate a Nemotron model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Nemotron-8B.

    Configuration objects inherit from [`PretrainedConfig`] and can be
    used to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Nemotron model. Defines the number of
            different tokens that can be represented by the
            `inputs_ids` passed when calling [`NemotronModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        head_dim (`int`, *optional*):
            Projection weights dimension in multi-head attention. Set to
            hidden_size // num_attention_heads if None
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention
            (MQA) otherwise GQA is used. When converting a multi-head
            checkpoint to a GQA checkpoint, each group key and value
            head should be constructed by meanpooling all the original
            heads within that group. For more details checkout 
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
            is not specified, will default to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the
            decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        initializer_range (`float`, *optional*, defaults to 0.0134):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 3):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj and down_proj layers in the MLP
            layers.

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> # Initializing a Nemotron nemotron-15b style configuration
    >>> configuration = NemotronConfig()
    >>> # Initializing a model from the nemotron-15b style configuration
    >>> model = NemotronModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        head_dim = head_dim or kwargs.get("kv_channels")
        self.head_dim = head_dim if head_dim is not None else (
            hidden_size // num_attention_heads)

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # for backward compatibility
        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
            "rope_percentage") or partial_rotary_factor
        self.partial_rotary_factor = partial_rotary_factor
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(
                self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {self.rope_scaling}")
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in [
                "linear", "dynamic"
        ]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', "
                f"'dynamic'], got {rope_scaling_type}")
        if rope_scaling_factor is None or not isinstance(
                rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, got "
                f"{rope_scaling_factor}")

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

head_dim instance-attribute

head_dim = (
    head_dim
    if head_dim is not None
    else hidden_size // num_attention_heads
)

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_bias instance-attribute

mlp_bias = mlp_bias

model_type class-attribute instance-attribute

model_type = 'nemotron'

norm_eps instance-attribute

norm_eps = norm_eps

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

partial_rotary_factor instance-attribute

partial_rotary_factor = partial_rotary_factor

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=256000,
    hidden_size=6144,
    intermediate_size=24576,
    num_hidden_layers=32,
    num_attention_heads=48,
    head_dim=None,
    num_key_value_heads=None,
    hidden_act="relu2",
    max_position_embeddings=4096,
    initializer_range=0.0134,
    norm_eps=1e-05,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=2,
    eos_token_id=3,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.5,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron.py
def __init__(
    self,
    vocab_size=256000,
    hidden_size=6144,
    intermediate_size=24576,
    num_hidden_layers=32,
    num_attention_heads=48,
    head_dim=None,
    num_key_value_heads=None,
    hidden_act="relu2",
    max_position_embeddings=4096,
    initializer_range=0.0134,
    norm_eps=1e-5,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=2,
    eos_token_id=3,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.5,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    head_dim = head_dim or kwargs.get("kv_channels")
    self.head_dim = head_dim if head_dim is not None else (
        hidden_size // num_attention_heads)

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.norm_eps = norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    # for backward compatibility
    partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
        "rope_percentage") or partial_rotary_factor
    self.partial_rotary_factor = partial_rotary_factor
    self._rope_scaling_validation()
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.mlp_bias = mlp_bias

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )

_rope_scaling_validation

_rope_scaling_validation()

Validate the rope_scaling configuration.

Source code in vllm/transformers_utils/configs/nemotron.py
def _rope_scaling_validation(self):
    """
    Validate the `rope_scaling` configuration.
    """
    if self.rope_scaling is None:
        return

    if not isinstance(self.rope_scaling, dict) or len(
            self.rope_scaling) != 2:
        raise ValueError(
            "`rope_scaling` must be a dictionary with two fields, "
            f"`type` and `factor`, got {self.rope_scaling}")
    rope_scaling_type = self.rope_scaling.get("type", None)
    rope_scaling_factor = self.rope_scaling.get("factor", None)
    if rope_scaling_type is None or rope_scaling_type not in [
            "linear", "dynamic"
    ]:
        raise ValueError(
            "`rope_scaling`'s type field must be one of ['linear', "
            f"'dynamic'], got {rope_scaling_type}")
    if rope_scaling_factor is None or not isinstance(
            rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
        raise ValueError(
            "`rope_scaling`'s factor field must be a float > 1, got "
            f"{rope_scaling_factor}")

NemotronHConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [NemotronHModel]. It is used to instantiate a NemotronH model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the NemotronH-v0.1 model.

Parameters:

vocab_size (int, optional, defaults to 131072): Vocabulary size of the NemotronH model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [NemotronHModel].
tie_word_embeddings (bool, optional, defaults to False): Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has an output word embedding layer.
hidden_size (int, optional, defaults to 4096): Dimension of the hidden representations.
intermediate_size (int, optional, defaults to 21504): Dimension of the MLP representations.
num_hidden_layers (int, optional, defaults to 52): Number of hidden layers in the Transformer encoder.
hybrid_override_pattern (str, optional, defaults to "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"): The pattern of the hybrid model. The pattern is a string of characters where each character represents M: Mamba2, *: Attention, -: MLP.
num_attention_heads (int, optional, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder.
attention_head_dim (int, optional, defaults to 128): Dimension of each attention head.
num_key_value_heads (int, optional, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used.
mlp_hidden_act (str, optional, defaults to "relu2"): The non-linear activation function in the MLP layers.
attention_bias (bool, optional, defaults to False): Whether to use bias in attention layers.
mlp_bias (bool, optional, defaults to False): Whether to use bias in MLP layers.
use_bias (bool, optional, defaults to False): Whether to use bias in the model.
initializer_range (float, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_epsilon (float, optional, defaults to 1e-5): The epsilon used by the layer normalization layers.
residual_in_fp32 (bool, optional, defaults to False): Whether or not residuals should be in float32. If set to False, residuals will keep the same dtype as the rest of the model.
use_cache (bool, optional, defaults to True): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.
num_logits_to_keep (int or None, optional, defaults to 1): Number of prompt logits to calculate during generation. If None, all logits will be calculated. If an integer value, only the last num_logits_to_keep logits will be calculated.
pad_token_id (int, optional, defaults to 0): The id of the padding token.
bos_token_id (int, optional, defaults to 1): The id of the "beginning-of-sequence" token.
eos_token_id (int, optional, defaults to 2): The id of the "end-of-sequence" token.
sliding_window (int, optional, defaults to None): Sliding window attention window size.
max_position_embeddings (int, optional, defaults to 4096): The maximum sequence length that this model might ever be used with.
attention_dropout (float, optional, defaults to 0.0): The dropout ratio for the attention probabilities.
hidden_dropout (float, optional, defaults to 0.0): The dropout ratio for the hidden states.
use_mamba_kernels (bool, optional, defaults to True): Flag indicating whether or not to use the fast mamba kernels. These are available only if mamba-ssm and causal-conv1d are installed, and the mamba modules are running on a CUDA device.
ssm_state_size (int, optional, defaults to 128): The dimension of the mamba state space latents.
mamba_num_heads (int, optional, defaults to 128): Number of heads in Mamba layers.
mamba_n_groups (int, optional, defaults to 8): Number of groups in Mamba layers.
mamba_head_dim (int, optional, defaults to 64): Dimension of each Mamba head.
mamba_d_conv (int, optional, defaults to 4): The size of the mamba convolution kernel.
mamba_expand (int, optional, defaults to 2): Expanding factor used to determine the mamba intermediate size.
mamba_hidden_act (str, optional, defaults to "silu"): The non-linear activation function in the Mamba layers.
mamba_dt_min (float, optional, defaults to 0.001): Minimum value for the time step in Mamba.
mamba_dt_max (float, optional, defaults to 0.1): Maximum value for the time step in Mamba.
mamba_dt_limit (tuple, optional, defaults to (0.0, float("inf"))): Limits for the time step in Mamba.
mamba_dt_init_floor (float, optional, defaults to 1e-4): Floor value for time step initialization in Mamba.
mamba_conv_bias (bool, optional, defaults to True): Whether to use bias in the convolution layer of the mamba mixer block.
mamba_proj_bias (bool, optional, defaults to False): Whether to use bias in the input and output projections of the mamba mixer block.
mamba_chunk_size (int, optional, defaults to 256): Size of chunks for Mamba processing.
rescale_prenorm_residual (bool, optional, defaults to True): Whether to rescale the pre-normalization residual connections.

Source code in vllm/transformers_utils/configs/nemotron_h.py
class NemotronHConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
    to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to
    that of the NemotronH-v0.1 model.
    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the NemotronH model. Defines the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`NemotronHModel`]
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be
            tied. Note that this is only relevant if the model has an output
            word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 52):
            Number of hidden layers in the Transformer encoder.
        hybrid_override_pattern (`str`, *optional*, defaults to
            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
            The pattern of the hybrid model. The pattern is a string of
            characters where each character represents
            M: Mamba2, *: Attention, -: MLP
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        attention_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA) otherwise GQA is used.
        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
            The non-linear activation function in the MLP layers.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in MLP layers.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
            Whether or not residuals should be in `float32`. If set to `False`
            residuals will keep the same `dtype` as the rest of the model.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`,
            all logits will be calculated. If an integer value, only last
            `num_logits_to_keep` logits will be calculated.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*, defaults to None):
            Sliding window attention window size.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the hidden states.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use the fast mamba kernels.
            These are available only if `mamba-ssm` and `causal-conv1d`
            are installed, and the mamba modules are running on a CUDA device.
        ssm_state_size (`int`, *optional*, defaults to 128):
            The dimension of the mamba state space latents.
        mamba_num_heads (`int`, *optional*, defaults to 128):
            Number of heads in Mamba layers.
        mamba_n_groups (`int`, *optional*, defaults to 8):
            Number of groups in Mamba layers.
        mamba_head_dim (`int`, *optional*, defaults to 64):
            Dimension of each Mamba head.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor used to determine the mamba intermediate size.
        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
            The non-linear activation function in the Mamba layers.
        mamba_dt_min (`float`, *optional*, defaults to 0.001):
            Minimum value for the time step in Mamba.
        mamba_dt_max (`float`, *optional*, defaults to 0.1):
            Maximum value for the time step in Mamba.
        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
            Limits for the time step in Mamba.
        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
            Floor value for time step initialization in Mamba.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the convolution layer of the mamba mixer
            block.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the input and output projections of the
            mamba mixer block.
        mamba_chunk_size (`int`, *optional*, defaults to 256):
            Size of chunks for Mamba processing.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether to rescale the pre-normalization residual connections.
    """

    model_type = "nemotron_h"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=131072,
        tie_word_embeddings=False,
        hidden_size=4096,
        intermediate_size=21504,
        num_hidden_layers=52,
        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
        num_attention_heads=32,
        head_dim=128,
        num_key_value_heads=8,  # nemo: num_query_groups
        mlp_hidden_act="relu2",
        attention_bias=False,
        mlp_bias=False,
        use_bias=False,
        initializer_range=0.02,  # nemo: init_method_std
        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
        residual_in_fp32=False,  #  Megatron Core default value
        use_cache=True,
        num_logits_to_keep=1,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        sliding_window=None,
        max_position_embeddings=4096,
        attention_dropout=0.0,
        hidden_dropout=0.0,  # * ADDED
        use_mamba_kernels=True,
        ssm_state_size=128,  # mamba_state_size
        mamba_num_heads=128,
        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
        mamba_head_dim=64,
        mamba_d_conv=4,
        mamba_expand=2,
        mamba_hidden_act="silu",
        mamba_dt_min=0.001,
        mamba_dt_max=0.1,
        mamba_dt_limit=(0.0, float("inf")),
        mamba_dt_init_floor=1e-4,
        mamba_conv_bias=True,
        mamba_proj_bias=False,
        mamba_chunk_size=256,
        rescale_prenorm_residual=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.tie_word_embeddings = tie_word_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.hybrid_override_pattern = hybrid_override_pattern
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.sliding_window = sliding_window
        self.max_position_embeddings = max_position_embeddings
        self.attention_dropout = attention_dropout
        self.hidden_dropout = hidden_dropout

        # Validate hybrid_override_pattern
        # M: Mamba2, *: Attention, -: MLP
        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
            "hybrid_override_pattern must have same length as "
            "num_hidden_layers")
        assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
            "hybrid_override_pattern must only contain characters "
            "'M', '*', or '-'")

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.mlp_hidden_act = mlp_hidden_act
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias
        self.use_bias = use_bias
        self.initializer_range = initializer_range
        self.layer_norm_epsilon = layer_norm_epsilon
        self.residual_in_fp32 = residual_in_fp32

        self.use_cache = use_cache
        self.num_logits_to_keep = num_logits_to_keep

        self.use_mamba_kernels = use_mamba_kernels
        self.n_groups = mamba_n_groups
        self.mamba_head_dim = mamba_head_dim
        self.ssm_state_size = ssm_state_size
        self.mamba_num_heads = mamba_num_heads
        self.conv_kernel = mamba_d_conv
        self.expand = mamba_expand
        self.mamba_hidden_act = mamba_hidden_act
        self.time_step_min = mamba_dt_min
        self.time_step_max = mamba_dt_max
        self.time_step_limit = mamba_dt_limit
        self.time_step_floor = mamba_dt_init_floor
        self.use_conv_bias = mamba_conv_bias
        self.mamba_proj_bias = mamba_proj_bias
        self.chunk_size = mamba_chunk_size
        self.rescale_prenorm_residual = rescale_prenorm_residual

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def layers_block_type(self):
        return [
            "mamba" if self.hybrid_override_pattern[i] == "M" else
            "attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
            for i in range(self.num_hidden_layers)
        ]

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

chunk_size instance-attribute

chunk_size = mamba_chunk_size

conv_kernel instance-attribute

conv_kernel = mamba_d_conv

expand instance-attribute

expand = mamba_expand

head_dim instance-attribute

head_dim = head_dim

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size

hybrid_override_pattern instance-attribute

hybrid_override_pattern = hybrid_override_pattern

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

layers_block_type property

layers_block_type

mamba_head_dim instance-attribute

mamba_head_dim = mamba_head_dim

mamba_hidden_act instance-attribute

mamba_hidden_act = mamba_hidden_act

mamba_num_heads instance-attribute

mamba_num_heads = mamba_num_heads

mamba_proj_bias instance-attribute

mamba_proj_bias = mamba_proj_bias

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_bias instance-attribute

mlp_bias = mlp_bias

mlp_hidden_act instance-attribute

mlp_hidden_act = mlp_hidden_act

model_type class-attribute instance-attribute

model_type = 'nemotron_h'

n_groups instance-attribute

n_groups = mamba_n_groups

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

num_logits_to_keep instance-attribute

num_logits_to_keep = num_logits_to_keep

rescale_prenorm_residual instance-attribute

rescale_prenorm_residual = rescale_prenorm_residual

residual_in_fp32 instance-attribute

residual_in_fp32 = residual_in_fp32

sliding_window instance-attribute

sliding_window = sliding_window

ssm_state_size instance-attribute

ssm_state_size = ssm_state_size

tie_word_embeddings instance-attribute

tie_word_embeddings = tie_word_embeddings

time_step_floor instance-attribute

time_step_floor = mamba_dt_init_floor

time_step_limit instance-attribute

time_step_limit = mamba_dt_limit

time_step_max instance-attribute

time_step_max = mamba_dt_max

time_step_min instance-attribute

time_step_min = mamba_dt_min

use_bias instance-attribute

use_bias = use_bias

use_cache instance-attribute

use_cache = use_cache

use_conv_bias instance-attribute

use_conv_bias = mamba_conv_bias

use_mamba_kernels instance-attribute

use_mamba_kernels = use_mamba_kernels

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=131072,
    tie_word_embeddings=False,
    hidden_size=4096,
    intermediate_size=21504,
    num_hidden_layers=52,
    hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
    num_attention_heads=32,
    head_dim=128,
    num_key_value_heads=8,
    mlp_hidden_act="relu2",
    attention_bias=False,
    mlp_bias=False,
    use_bias=False,
    initializer_range=0.02,
    layer_norm_epsilon=1e-05,
    residual_in_fp32=False,
    use_cache=True,
    num_logits_to_keep=1,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    sliding_window=None,
    max_position_embeddings=4096,
    attention_dropout=0.0,
    hidden_dropout=0.0,
    use_mamba_kernels=True,
    ssm_state_size=128,
    mamba_num_heads=128,
    mamba_n_groups=8,
    mamba_head_dim=64,
    mamba_d_conv=4,
    mamba_expand=2,
    mamba_hidden_act="silu",
    mamba_dt_min=0.001,
    mamba_dt_max=0.1,
    mamba_dt_limit=(0.0, float("inf")),
    mamba_dt_init_floor=0.0001,
    mamba_conv_bias=True,
    mamba_proj_bias=False,
    mamba_chunk_size=256,
    rescale_prenorm_residual=True,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron_h.py
def __init__(
    self,
    vocab_size=131072,
    tie_word_embeddings=False,
    hidden_size=4096,
    intermediate_size=21504,
    num_hidden_layers=52,
    hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
    num_attention_heads=32,
    head_dim=128,
    num_key_value_heads=8,  # nemo: num_query_groups
    mlp_hidden_act="relu2",
    attention_bias=False,
    mlp_bias=False,
    use_bias=False,
    initializer_range=0.02,  # nemo: init_method_std
    layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
    residual_in_fp32=False,  #  Megatron Core default value
    use_cache=True,
    num_logits_to_keep=1,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    sliding_window=None,
    max_position_embeddings=4096,
    attention_dropout=0.0,
    hidden_dropout=0.0,  # * ADDED
    use_mamba_kernels=True,
    ssm_state_size=128,  # mamba_state_size
    mamba_num_heads=128,
    mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
    mamba_head_dim=64,
    mamba_d_conv=4,
    mamba_expand=2,
    mamba_hidden_act="silu",
    mamba_dt_min=0.001,
    mamba_dt_max=0.1,
    mamba_dt_limit=(0.0, float("inf")),
    mamba_dt_init_floor=1e-4,
    mamba_conv_bias=True,
    mamba_proj_bias=False,
    mamba_chunk_size=256,
    rescale_prenorm_residual=True,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.tie_word_embeddings = tie_word_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.hybrid_override_pattern = hybrid_override_pattern
    self.num_attention_heads = num_attention_heads
    self.head_dim = head_dim
    self.sliding_window = sliding_window
    self.max_position_embeddings = max_position_embeddings
    self.attention_dropout = attention_dropout
    self.hidden_dropout = hidden_dropout

    # Validate hybrid_override_pattern
    # M: Mamba2, *: Attention, -: MLP
    assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
        "hybrid_override_pattern must have same length as "
        "num_hidden_layers")
    assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
        "hybrid_override_pattern must only contain characters "
        "'M', '*', or '-'")

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.mlp_hidden_act = mlp_hidden_act
    self.attention_bias = attention_bias
    self.mlp_bias = mlp_bias
    self.use_bias = use_bias
    self.initializer_range = initializer_range
    self.layer_norm_epsilon = layer_norm_epsilon
    self.residual_in_fp32 = residual_in_fp32

    self.use_cache = use_cache
    self.num_logits_to_keep = num_logits_to_keep

    self.use_mamba_kernels = use_mamba_kernels
    self.n_groups = mamba_n_groups
    self.mamba_head_dim = mamba_head_dim
    self.ssm_state_size = ssm_state_size
    self.mamba_num_heads = mamba_num_heads
    self.conv_kernel = mamba_d_conv
    self.expand = mamba_expand
    self.mamba_hidden_act = mamba_hidden_act
    self.time_step_min = mamba_dt_min
    self.time_step_max = mamba_dt_max
    self.time_step_limit = mamba_dt_limit
    self.time_step_floor = mamba_dt_init_floor
    self.use_conv_bias = mamba_conv_bias
    self.mamba_proj_bias = mamba_proj_bias
    self.chunk_size = mamba_chunk_size
    self.rescale_prenorm_residual = rescale_prenorm_residual

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
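
As a sketch of how `hybrid_override_pattern` drives the layer layout: the pattern must be exactly `num_hidden_layers` characters long, and the `layers_block_type` property decodes each character (`M` = Mamba2, `*` = attention, `-` = MLP) into a per-layer block type:

```python
from vllm.transformers_utils.configs import NemotronHConfig

# Four layers: Mamba2, attention, MLP, Mamba2 (pattern length must match).
config = NemotronHConfig(hybrid_override_pattern="M*-M", num_hidden_layers=4)
assert config.layers_block_type == ["mamba", "attention", "mlp", "mamba"]
```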

Nemotron_Nano_VL_Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/nemotron_vl.py
class Nemotron_Nano_VL_Config(PretrainedConfig):
    model_type = 'Llama_Nemotron_Nano_VL'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        **kwargs
    ):
        super().__init__(**kwargs)

        if vision_config is not None:
            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
            vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
            self.vision_config = vision_auto_config(**vision_config)
        else:
            self.vision_config = PretrainedConfig()

        if llm_config is None:
            self.text_config = LlamaConfig()
        else:
            self.text_config = LlamaConfig(**llm_config)

        # Assign configuration values
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template  # TODO move out of here and into the tokenizer
        self.ps_version = ps_version  # Pixel shuffle version
        self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size

downsample_ratio instance-attribute

downsample_ratio = downsample_ratio

force_image_size instance-attribute

force_image_size = force_image_size

image_tag_type instance-attribute

image_tag_type = image_tag_type

is_composition class-attribute instance-attribute

is_composition = True

model_type class-attribute instance-attribute

model_type = 'Llama_Nemotron_Nano_VL'

projector_hidden_size instance-attribute

projector_hidden_size = projector_hidden_size

ps_version instance-attribute

ps_version = ps_version

template instance-attribute

template = template

text_config instance-attribute

text_config = LlamaConfig()

vision_config instance-attribute

vision_config = vision_auto_config(**vision_config)

vit_hidden_size instance-attribute

vit_hidden_size = vit_hidden_size

__init__

__init__(
    vision_config=None,
    llm_config=None,
    force_image_size=None,
    downsample_ratio=0.5,
    template=None,
    ps_version="v1",
    image_tag_type="internvl",
    projector_hidden_size=4096,
    vit_hidden_size=1280,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron_vl.py
def __init__(
    self,
    vision_config=None,
    llm_config=None,
    force_image_size=None,
    downsample_ratio=0.5,
    template=None,
    ps_version='v1',
    image_tag_type="internvl",
    projector_hidden_size=4096,
    vit_hidden_size=1280,
    **kwargs
):
    super().__init__(**kwargs)

    if vision_config is not None:
        assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
        vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
        self.vision_config = vision_auto_config(**vision_config)
    else:
        self.vision_config = PretrainedConfig()

    if llm_config is None:
        self.text_config = LlamaConfig()
    else:
        self.text_config = LlamaConfig(**llm_config)

    # Assign configuration values
    self.force_image_size = force_image_size
    self.downsample_ratio = downsample_ratio
    self.template = template  # TODO move out of here and into the tokenizer
    self.ps_version = ps_version  # Pixel shuffle version
    self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
    self.projector_hidden_size = projector_hidden_size
    self.vit_hidden_size = vit_hidden_size
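
A short usage sketch of the constructor above, with illustrative values only: when no vision_config with an auto_map entry is supplied, vision_config falls back to a bare PretrainedConfig, and an llm_config dict is turned into a LlamaConfig exposed as text_config.

from transformers import PretrainedConfig

from vllm.transformers_utils.configs import Nemotron_Nano_VL_Config

cfg = Nemotron_Nano_VL_Config(
    llm_config={"hidden_size": 4096, "num_hidden_layers": 32},  # illustrative values
    force_image_size=512,
)
print(type(cfg.text_config).__name__)                   # LlamaConfig
print(isinstance(cfg.vision_config, PretrainedConfig))  # True (fallback, no dynamic vision class)
print(cfg.downsample_ratio, cfg.ps_version)             # 0.5 v1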

Olmo3Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/olmo3.py
class Olmo3Config(PretrainedConfig):

    model_type = "olmo3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50304,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=None,
        eos_token_id=50279,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        rms_norm_eps=1e-5,
        sliding_window=4096,
        layer_types=None,
        **kwargs,
    ):
        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
        # in vLLM.
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["Olmo2ForCausalLM"]
        elif "Olmo3ForCausalLM" in kwargs["architectures"]:
            kwargs["architectures"].remove("Olmo3ForCausalLM")
            kwargs["architectures"].append("Olmo2ForCausalLM")

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        self.rms_norm_eps = rms_norm_eps

        self.sliding_window = sliding_window
        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
                for i in range(self.num_hidden_layers)
            ]

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_types instance-attribute

layer_types = layer_types

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

model_type class-attribute instance-attribute

model_type = 'olmo3'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

sliding_window instance-attribute

sliding_window = sliding_window

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=50304,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    use_cache=True,
    pad_token_id=1,
    bos_token_id=None,
    eos_token_id=50279,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    rms_norm_eps=1e-05,
    sliding_window=4096,
    layer_types=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/olmo3.py
def __init__(
    self,
    vocab_size=50304,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    use_cache=True,
    pad_token_id=1,
    bos_token_id=None,
    eos_token_id=50279,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    rms_norm_eps=1e-5,
    sliding_window=4096,
    layer_types=None,
    **kwargs,
):
    # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
    # in vLLM.
    if "architectures" not in kwargs:
        kwargs["architectures"] = ["Olmo2ForCausalLM"]
    elif "Olmo3ForCausalLM" in kwargs["architectures"]:
        kwargs["architectures"].remove("Olmo3ForCausalLM")
        kwargs["architectures"].append("Olmo2ForCausalLM")

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout

    self.rms_norm_eps = rms_norm_eps

    self.sliding_window = sliding_window
    self.layer_types = layer_types
    if self.layer_types is None:
        self.layer_types = [
            "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
            for i in range(self.num_hidden_layers)
        ]
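
A minimal sketch of the two behaviors implemented above: the architectures entry is rewritten so vLLM reuses its Olmo2ForCausalLM implementation, and when layer_types is unset every fourth layer defaults to full attention while the rest use sliding-window attention.

from vllm.transformers_utils.configs import Olmo3Config

cfg = Olmo3Config(architectures=["Olmo3ForCausalLM"])
print(cfg.architectures)    # ['Olmo2ForCausalLM']

# Default hybrid layout: three sliding-window layers, then one full-attention layer, repeating.
print(cfg.layer_types[:4])  # ['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']
print(cfg.sliding_window)   # 4096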

OvisConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/ovis.py
class OvisConfig(PretrainedConfig):
    model_type = "ovis"

    def __init__(self,
                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
                                                         dict]] = None,
                 multimodal_max_length=8192,
                 hidden_size=None,
                 conversation_formatter_class=None,
                 llm_attn_implementation=None,
                 disable_tie_weight=False,
                 **kwargs):
        super().__init__(**kwargs)
        if llm_config is not None:
            assert isinstance(llm_config, (PretrainedConfig, dict)), \
                f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
            if not isinstance(llm_config, PretrainedConfig):
                model_type = llm_config['model_type']
                llm_config.pop('model_type')
                llm_config = AutoConfig.for_model(model_type, **llm_config)

        # map llm_config to text_config
        self.text_config = llm_config
        if visual_tokenizer_config is not None:
            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
                f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
            if not isinstance(visual_tokenizer_config, PretrainedConfig):
                model_type = visual_tokenizer_config['model_type']
                visual_tokenizer_config.pop('model_type')
                visual_tokenizer_config = AutoConfig.for_model(
                    model_type, **visual_tokenizer_config)

        self.visual_tokenizer_config = visual_tokenizer_config
        self.multimodal_max_length = multimodal_max_length
        self.hidden_size = hidden_size
        self.conversation_formatter_class = conversation_formatter_class
        self.llm_attn_implementation = llm_attn_implementation
        self.disable_tie_weight = disable_tie_weight

conversation_formatter_class instance-attribute

conversation_formatter_class = conversation_formatter_class

disable_tie_weight instance-attribute

disable_tie_weight = disable_tie_weight

hidden_size instance-attribute

hidden_size = hidden_size

llm_attn_implementation instance-attribute

llm_attn_implementation = llm_attn_implementation

model_type class-attribute instance-attribute

model_type = 'ovis'

multimodal_max_length instance-attribute

multimodal_max_length = multimodal_max_length

text_config instance-attribute

text_config = llm_config

visual_tokenizer_config instance-attribute

visual_tokenizer_config = visual_tokenizer_config

__init__

__init__(
    llm_config: Optional[
        Union[PretrainedConfig, dict]
    ] = None,
    visual_tokenizer_config: Optional[
        Union[PretrainedConfig, dict]
    ] = None,
    multimodal_max_length=8192,
    hidden_size=None,
    conversation_formatter_class=None,
    llm_attn_implementation=None,
    disable_tie_weight=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/ovis.py
def __init__(self,
             llm_config: Optional[Union[PretrainedConfig, dict]] = None,
             visual_tokenizer_config: Optional[Union[PretrainedConfig,
                                                     dict]] = None,
             multimodal_max_length=8192,
             hidden_size=None,
             conversation_formatter_class=None,
             llm_attn_implementation=None,
             disable_tie_weight=False,
             **kwargs):
    super().__init__(**kwargs)
    if llm_config is not None:
        assert isinstance(llm_config, (PretrainedConfig, dict)), \
            f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
        if not isinstance(llm_config, PretrainedConfig):
            model_type = llm_config['model_type']
            llm_config.pop('model_type')
            llm_config = AutoConfig.for_model(model_type, **llm_config)

    # map llm_config to text_config
    self.text_config = llm_config
    if visual_tokenizer_config is not None:
        assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
            f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
        if not isinstance(visual_tokenizer_config, PretrainedConfig):
            model_type = visual_tokenizer_config['model_type']
            visual_tokenizer_config.pop('model_type')
            visual_tokenizer_config = AutoConfig.for_model(
                model_type, **visual_tokenizer_config)

    self.visual_tokenizer_config = visual_tokenizer_config
    self.multimodal_max_length = multimodal_max_length
    self.hidden_size = hidden_size
    self.conversation_formatter_class = conversation_formatter_class
    self.llm_attn_implementation = llm_attn_implementation
    self.disable_tie_weight = disable_tie_weight
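
A minimal sketch of the dict-to-config promotion above: a plain llm_config dict has its model_type popped and is rebuilt through AutoConfig.for_model, then exposed as text_config. The values are illustrative only.

from vllm.transformers_utils.configs import OvisConfig

cfg = OvisConfig(
    llm_config={"model_type": "llama", "hidden_size": 2048},  # illustrative values
    multimodal_max_length=4096,
)
print(type(cfg.text_config).__name__)  # LlamaConfig, built via AutoConfig.for_model("llama", ...)
print(cfg.text_config.hidden_size)     # 2048
print(cfg.visual_tokenizer_config)     # None when no visual tokenizer config is supplied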

Qwen3NextConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [Qwen3NextModel]. It is used to instantiate a Qwen3-Next model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of Qwen3-Next-80B-A3B-Instruct (https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

Name Type Description Default
vocab_size `int`, *optional*, defaults to 151936

Vocabulary size of the model. Defines the number of different tokens that can be represented by the inputs_ids.

151936
hidden_size `int`, *optional*, defaults to 2048

Dimension of the hidden representations.

2048
intermediate_size `int`, *optional*, defaults to 5632

Dimension of the MLP representations.

5632
num_hidden_layers `int`, *optional*, defaults to 48

Number of hidden layers in the Transformer encoder.

48
num_attention_heads `int`, *optional*, defaults to 16

Number of attention heads for each attention layer in the Transformer encoder.

16
num_key_value_heads `int`, *optional*, defaults to 2

This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group's key and value head should be constructed by mean-pooling all the original heads within that group. For more details, see https://arxiv.org/pdf/2305.13245.pdf. If it is not specified, it defaults to 2.

2
hidden_act `str`, *optional*, defaults to `"silu"`

The non-linear activation function in the decoder.

'silu'
max_position_embeddings `int`, *optional*, defaults to 32768

The maximum sequence length that this model might ever be used with.

32768
initializer_range `float`, *optional*, defaults to 0.02

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

0.02
rms_norm_eps `float`, *optional*, defaults to 1e-06

The epsilon used by the rms normalization layers.

1e-06
use_cache `bool`, *optional*, defaults to `True`

Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.

True
tie_word_embeddings `bool`, *optional*, defaults to `False`

Whether the model's input and output word embeddings should be tied.

False
rope_theta `float`, *optional*, defaults to 10000.0

The base period of the RoPE embeddings.

10000.0
rope_scaling `Dict`, *optional*

Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type and you expect the model to work on a longer max_position_embeddings, we recommend you update this value accordingly. Expected contents:

  • rope_type (str): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
  • factor (float, optional): Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In most scaling types, a factor of x will enable the model to handle sequences of length x * the original maximum pre-trained length.
  • original_max_position_embeddings (int, optional): Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during pretraining.
  • attention_factor (float, optional): Used with 'yarn' and 'longrope'. The scaling factor to be applied to the attention computation. If unspecified, it defaults to the value recommended by the implementation, using the factor field to infer the suggested value.
  • beta_fast (float, optional): Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear ramp function. If unspecified, it defaults to 32.
  • beta_slow (float, optional): Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear ramp function. If unspecified, it defaults to 1.
  • short_factor (List[float], optional): Only used with 'longrope'. The scaling factor to be applied to short contexts (< original_max_position_embeddings). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2.
  • long_factor (List[float], optional): Only used with 'longrope'. The scaling factor to be applied to long contexts (> original_max_position_embeddings). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2.
  • low_freq_factor (float, optional): Only used with 'llama3'. Scaling factor applied to the low-frequency components of the RoPE.
  • high_freq_factor (float, optional): Only used with 'llama3'. Scaling factor applied to the high-frequency components of the RoPE.

None
partial_rotary_factor `float`, *optional*, defaults to 0.25

Percentage of the query and keys which will have rotary embedding.

0.25
attention_bias `bool`, *optional*, defaults to `False`

Whether to use a bias in the query, key, value and output projection layers during self-attention.

False
attention_dropout `float`, *optional*, defaults to 0.0

The dropout ratio for the attention probabilities.

0.0
head_dim `int`, *optional*, defaults to 256

Projection weights dimension in multi-head attention.

256
linear_conv_kernel_dim `int`, *optional*, defaults to 4

Kernel size of the convolution used in linear attention layers.

4
linear_key_head_dim `int`, *optional*, defaults to 128

Dimension of each key head in linear attention.

128
linear_value_head_dim `int`, *optional*, defaults to 128

Dimension of each value head in linear attention.

128
linear_num_key_heads `int`, *optional*, defaults to 16

Number of key heads used in linear attention layers.

16
linear_num_value_heads `int`, *optional*, defaults to 32

Number of value heads used in linear attention layers.

32
decoder_sparse_step `int`, *optional*, defaults to 1

The frequency of the MoE layer.

1
moe_intermediate_size `int`, *optional*, defaults to 512

Intermediate size of the routed expert.

512
shared_expert_intermediate_size `int`, *optional*, defaults to 512

Intermediate size of the shared expert.

512
num_experts_per_tok `int`, *optional*, defaults to 10

Number of selected experts.

10
num_experts `int`, *optional*, defaults to 512

Number of routed experts.

512
norm_topk_prob `bool`, *optional*, defaults to `True`

Whether to normalize the topk probabilities.

True
output_router_logits `bool`, *optional*, defaults to `False`

Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss, including load balancing loss and router z-loss.

False
router_aux_loss_coef `float`, *optional*, defaults to 0.001

The aux loss factor for the total loss.

0.001
mlp_only_layers `list[int]`, *optional*, defaults to `[]`

Indicates which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock. The list contains layer indices from 0 to num_layers-1 if we have num_layers layers. If mlp_only_layers is empty, decoder_sparse_step is used to determine the sparsity.

None
layer_types `list[str]`, *optional*

Types of each layer (attention or linear).

None
>>> from transformers import Qwen3NextModel, Qwen3NextConfig

>>> # Initializing a Qwen3Next style configuration
>>> configuration =  Qwen3NextConfig()

>>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
>>> model = Qwen3NextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/qwen3_next.py
class Qwen3NextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
    Qwen3-Next model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of
    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids`.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*, defaults to 256):
            Projection weights dimension in multi-head attention.
        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
            Kernel size of the convolution used in linear attention layers.
        linear_key_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each key head in linear attention.
        linear_value_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each value head in linear attention.
        linear_num_key_heads (`int`, *optional*, defaults to 16):
            Number of key heads used in linear attention layers.
        linear_num_value_heads (`int`, *optional*, defaults to 32):
            Number of value heads used in linear attention layers.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the routed expert.
        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the shared expert.
        num_experts_per_tok (`int`, *optional*, defaults to 10):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 512):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
            Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
        layer_types (`list[str]`, *optional*):
            Types of each layer (attention or linear).

    ```python
    >>> from transformers import Qwen3NextModel, Qwen3NextConfig

    >>> # Initializing a Qwen3Next style configuration
    >>> configuration =  Qwen3NextConfig()

    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
    >>> model = Qwen3NextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """  # noqa: E501

    model_type = "qwen3_next"
    keys_to_ignore_at_inference = ["past_key_values"]

    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.*.gate_proj": "colwise",
        "layers.*.mlp.experts.*.up_proj": "colwise",
        "layers.*.mlp.experts.*.down_proj": "rowwise",
        "layers.*.mlp.shared_experts.gate_proj": "colwise",
        "layers.*.mlp.shared_experts.up_proj": "colwise",
        "layers.*.mlp.shared_experts.down_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.25,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        decoder_sparse_step=1,
        moe_intermediate_size=512,
        shared_expert_intermediate_size=512,
        num_experts_per_tok=10,
        num_experts=512,
        norm_topk_prob=True,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        layer_types=None,
        **kwargs,
    ):
        if mlp_only_layers is None:
            mlp_only_layers = []
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
                "linear_attention" if bool((i + 1) % 4) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

        # linear attention part
        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.mlp_only_layers = mlp_only_layers

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

base_model_pp_plan class-attribute instance-attribute

base_model_pp_plan = {
    "embed_tokens": (["input_ids"], ["inputs_embeds"]),
    "layers": (
        ["hidden_states", "attention_mask"],
        ["hidden_states"],
    ),
    "norm": (["hidden_states"], ["hidden_states"]),
}

base_model_tp_plan class-attribute instance-attribute

base_model_tp_plan = {
    "layers.*.self_attn.q_proj": "colwise",
    "layers.*.self_attn.k_proj": "colwise",
    "layers.*.self_attn.v_proj": "colwise",
    "layers.*.self_attn.o_proj": "rowwise",
    "layers.*.mlp.experts.*.gate_proj": "colwise",
    "layers.*.mlp.experts.*.up_proj": "colwise",
    "layers.*.mlp.experts.*.down_proj": "rowwise",
    "layers.*.mlp.shared_experts.gate_proj": "colwise",
    "layers.*.mlp.shared_experts.up_proj": "colwise",
    "layers.*.mlp.shared_experts.down_proj": "rowwise",
    "layers.*.mlp.gate_proj": "colwise",
    "layers.*.mlp.up_proj": "colwise",
    "layers.*.mlp.down_proj": "rowwise",
}

decoder_sparse_step instance-attribute

decoder_sparse_step = decoder_sparse_step

head_dim instance-attribute

head_dim = head_dim

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_types instance-attribute

layer_types = layer_types

linear_conv_kernel_dim instance-attribute

linear_conv_kernel_dim = linear_conv_kernel_dim

linear_key_head_dim instance-attribute

linear_key_head_dim = linear_key_head_dim

linear_num_key_heads instance-attribute

linear_num_key_heads = linear_num_key_heads

linear_num_value_heads instance-attribute

linear_num_value_heads = linear_num_value_heads

linear_value_head_dim instance-attribute

linear_value_head_dim = linear_value_head_dim

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_only_layers instance-attribute

mlp_only_layers = mlp_only_layers

model_type class-attribute instance-attribute

model_type = 'qwen3_next'

moe_intermediate_size instance-attribute

moe_intermediate_size = moe_intermediate_size

norm_topk_prob instance-attribute

norm_topk_prob = norm_topk_prob

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_experts instance-attribute

num_experts = num_experts

num_experts_per_tok instance-attribute

num_experts_per_tok = num_experts_per_tok

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

output_router_logits instance-attribute

output_router_logits = output_router_logits

partial_rotary_factor instance-attribute

partial_rotary_factor = partial_rotary_factor

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

router_aux_loss_coef instance-attribute

router_aux_loss_coef = router_aux_loss_coef

shared_expert_intermediate_size instance-attribute

shared_expert_intermediate_size = (
    shared_expert_intermediate_size
)

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=151936,
    hidden_size=2048,
    intermediate_size=5632,
    num_hidden_layers=48,
    num_attention_heads=16,
    num_key_value_heads=2,
    hidden_act="silu",
    max_position_embeddings=32768,
    initializer_range=0.02,
    rms_norm_eps=1e-06,
    use_cache=True,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.25,
    attention_bias=False,
    attention_dropout=0.0,
    head_dim=256,
    linear_conv_kernel_dim=4,
    linear_key_head_dim=128,
    linear_value_head_dim=128,
    linear_num_key_heads=16,
    linear_num_value_heads=32,
    decoder_sparse_step=1,
    moe_intermediate_size=512,
    shared_expert_intermediate_size=512,
    num_experts_per_tok=10,
    num_experts=512,
    norm_topk_prob=True,
    output_router_logits=False,
    router_aux_loss_coef=0.001,
    mlp_only_layers=None,
    layer_types=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/qwen3_next.py
def __init__(
    self,
    vocab_size=151936,
    hidden_size=2048,
    intermediate_size=5632,
    num_hidden_layers=48,
    num_attention_heads=16,
    num_key_value_heads=2,
    hidden_act="silu",
    max_position_embeddings=32768,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.25,
    attention_bias=False,
    attention_dropout=0.0,
    head_dim=256,
    linear_conv_kernel_dim=4,
    linear_key_head_dim=128,
    linear_value_head_dim=128,
    linear_num_key_heads=16,
    linear_num_value_heads=32,
    decoder_sparse_step=1,
    moe_intermediate_size=512,
    shared_expert_intermediate_size=512,
    num_experts_per_tok=10,
    num_experts=512,
    norm_topk_prob=True,
    output_router_logits=False,
    router_aux_loss_coef=0.001,
    mlp_only_layers=None,
    layer_types=None,
    **kwargs,
):
    if mlp_only_layers is None:
        mlp_only_layers = []
    super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.partial_rotary_factor = partial_rotary_factor
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.head_dim = head_dim
    rope_config_validation(self)

    self.layer_types = layer_types
    if self.layer_types is None:
        self.layer_types = [
            "linear_attention" if bool((i + 1) % 4) else "full_attention"
            for i in range(self.num_hidden_layers)
        ]
    layer_type_validation(self.layer_types)

    # linear attention part
    self.linear_conv_kernel_dim = linear_conv_kernel_dim
    self.linear_key_head_dim = linear_key_head_dim
    self.linear_value_head_dim = linear_value_head_dim
    self.linear_num_key_heads = linear_num_key_heads
    self.linear_num_value_heads = linear_num_value_heads

    # MoE arguments
    self.decoder_sparse_step = decoder_sparse_step
    self.moe_intermediate_size = moe_intermediate_size
    self.shared_expert_intermediate_size = shared_expert_intermediate_size
    self.num_experts_per_tok = num_experts_per_tok
    self.num_experts = num_experts
    self.norm_topk_prob = norm_topk_prob
    self.output_router_logits = output_router_logits
    self.router_aux_loss_coef = router_aux_loss_coef
    self.mlp_only_layers = mlp_only_layers
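
In addition to the docstring example above, a quick sketch of the default hybrid layer layout produced by the constructor: with layer_types unset, every fourth layer is full attention and the remaining layers use linear attention.

from vllm.transformers_utils.configs import Qwen3NextConfig

cfg = Qwen3NextConfig()
print(cfg.layer_types[:4])
# ['linear_attention', 'linear_attention', 'linear_attention', 'full_attention']
print(cfg.layer_types.count("full_attention"))   # 12 of the 48 default layers
print(cfg.num_experts, cfg.num_experts_per_tok)  # 512 10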

RWConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/falcon.py
class RWConfig(PretrainedConfig):
    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        return not self.alibi

alibi instance-attribute

alibi = alibi

attention_dropout instance-attribute

attention_dropout = attention_dropout

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_hidden_layers": "n_layer",
    "num_attention_heads": "n_head",
    "num_kv_heads": "n_head_kv",
}

bias instance-attribute

bias = bias

bos_token_id instance-attribute

bos_token_id = bos_token_id

eos_token_id instance-attribute

eos_token_id = eos_token_id

head_dim property

head_dim

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size if n_embed is None else n_embed

initializer_range instance-attribute

initializer_range = initializer_range

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

model_type class-attribute instance-attribute

model_type = 'falcon'

multi_query instance-attribute

multi_query = multi_query

n_head instance-attribute

n_head = n_head

n_head_kv instance-attribute

n_head_kv = 1 if n_head_kv is None else n_head_kv

n_layer instance-attribute

n_layer = n_layer

new_decoder_architecture instance-attribute

new_decoder_architecture = new_decoder_architecture

parallel_attn instance-attribute

parallel_attn = parallel_attn

rotary property

rotary

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=250880,
    hidden_size=64,
    n_layer=2,
    n_head=8,
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    multi_query=True,
    n_head_kv=None,
    alibi=False,
    bias=False,
    parallel_attn=False,
    new_decoder_architecture=False,
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/falcon.py
def __init__(
    self,
    vocab_size=250880,
    hidden_size=64,
    n_layer=2,
    n_head=8,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    multi_query=True,
    n_head_kv=None,
    alibi=False,
    bias=False,
    parallel_attn=False,
    new_decoder_architecture=False,
    **kwargs,
) -> None:
    self.vocab_size = vocab_size
    # Backward compatibility with n_embed kwarg
    n_embed = kwargs.pop("n_embed", None)
    self.hidden_size = hidden_size if n_embed is None else n_embed
    self.n_layer = n_layer
    self.n_head = n_head
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.use_cache = use_cache
    self.hidden_dropout = hidden_dropout
    self.attention_dropout = attention_dropout

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id
    self.multi_query = multi_query
    self.n_head_kv = 1 if n_head_kv is None else n_head_kv
    self.alibi = alibi
    self.bias = bias
    self.parallel_attn = parallel_attn
    self.new_decoder_architecture = new_decoder_architecture

    if self.hidden_size == 8192:
        # Hack for falcon-40b
        self.new_decoder_architecture = True

    super().__init__(bos_token_id=bos_token_id,
                     eos_token_id=eos_token_id,
                     **kwargs)
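
A small sketch of the attribute_map and the derived properties defined above: the HF-style names resolve to the legacy RW/Falcon fields, and head_dim and rotary are computed from them.

from vllm.transformers_utils.configs import RWConfig

cfg = RWConfig(hidden_size=64, n_layer=2, n_head=8)
print(cfg.num_hidden_layers, cfg.num_attention_heads)  # 2 8 (via attribute_map)
print(cfg.num_kv_heads)  # 1 (n_head_kv defaults to 1 when unspecified)
print(cfg.head_dim)      # 8, i.e. hidden_size // n_head
print(cfg.rotary)        # True while alibi=False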

RadioConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a Radio vision model. It is used to instantiate a Radio model according to the specified arguments, defining the model architecture.

Parameters:

Name Type Description Default
model_name str

Name of the vision transformer model (e.g., "vit_base_patch16_224"). Used to determine architecture dimensions from VIT_TIMM_DIM_BY_NAME.

required
image_size int

The size (resolution) of each image.

224
patch_size int

The size (resolution) of each patch.

16
qkv_bias bool

Whether to add a bias to the queries, keys and values.

True
qk_normalization bool

Whether to apply normalization to queries and keys.

False
norm_type str

The normalization type to use.

'layer_norm'
layer_norm_eps float

The epsilon used by the layer normalization layers.

1e-06
initializer_factor float

A factor for initializing all weight matrices.

1.0
hidden_act str

The non-linear activation function in the encoder.

'gelu'
max_img_size int

Maximum image size for position embeddings.

2048
norm_mean Union[tuple[float, float, float], list]

Mean values for image normalization (RGB channels). Defaults to (0.48145466, 0.4578275, 0.40821073).

OPENAI_CLIP_MEAN
norm_std Union[tuple[float, float, float], list]

Standard deviation values for image normalization (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711).

OPENAI_CLIP_STD
reg_tokens Optional[int]

Number of register tokens to use.

None
Source code in vllm/transformers_utils/configs/radio.py
class RadioConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a Radio
    vision model. It is used to instantiate a Radio model according to the
    specified arguments, defining the model architecture.

    Args:
        model_name: Name of the vision transformer model
            (e.g., "vit_base_patch16_224"). Used to determine architecture
            dimensions from `VIT_TIMM_DIM_BY_NAME`.
        image_size: The size (resolution) of each image.
        patch_size: The size (resolution) of each patch.
        qkv_bias: Whether to add a bias to the queries, keys and values.
        qk_normalization: Whether to apply normalization to queries and keys.
        norm_type: The normalization type to use.
        layer_norm_eps: The epsilon used by the layer normalization layers.
        initializer_factor: A factor for initializing all weight matrices.
        hidden_act: The non-linear activation function in the encoder.
        max_img_size: Maximum image size for position embeddings.
        norm_mean: Mean values for image normalization (RGB channels).
            Defaults to (0.48145466, 0.4578275, 0.40821073)).
        norm_std: Standard deviation values for image normalization
            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
        reg_tokens: Number of register tokens to use.
    """

    model_type = "radio"

    def __init__(
        self,
        model_name: str,
        image_size: int = 224,
        patch_size: int = 16,
        qkv_bias: bool = True,
        qk_normalization: bool = False,
        norm_type: str = "layer_norm",
        layer_norm_eps: float = 1e-6,
        initializer_factor: float = 1.0,
        hidden_act: str = "gelu",
        max_img_size: int = 2048,
        norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
        norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
        reg_tokens: Optional[int] = None,
        **kwargs,
    ):
        self.model_name = model_name
        (
            self.hidden_size,
            self.num_hidden_layers,
            self.num_attention_heads,
            self.intermediate_size,
        ) = VIT_TIMM_DIM_BY_NAME[model_name]
        self.image_size = image_size
        self.patch_size = patch_size
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.norm_type = norm_type
        self.layer_norm_eps = layer_norm_eps
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act
        self.max_img_size = max_img_size
        self.norm_mean = list(norm_mean) if isinstance(norm_mean,
                                                       (tuple,
                                                        list)) else norm_mean
        self.norm_std = list(norm_std) if isinstance(norm_std,
                                                     (tuple,
                                                      list)) else norm_std
        self.reg_tokens = reg_tokens
        super().__init__(**kwargs)

hidden_act instance-attribute

hidden_act = hidden_act

image_size instance-attribute

image_size = image_size

initializer_factor instance-attribute

initializer_factor = initializer_factor

layer_norm_eps instance-attribute

layer_norm_eps = layer_norm_eps

max_img_size instance-attribute

max_img_size = max_img_size

model_name instance-attribute

model_name = model_name

model_type class-attribute instance-attribute

model_type = 'radio'

norm_mean instance-attribute

norm_mean = (
    list(norm_mean)
    if isinstance(norm_mean, (tuple, list))
    else norm_mean
)

norm_std instance-attribute

norm_std = (
    list(norm_std)
    if isinstance(norm_std, (tuple, list))
    else norm_std
)

norm_type instance-attribute

norm_type = norm_type

patch_size instance-attribute

patch_size = patch_size

qk_normalization instance-attribute

qk_normalization = qk_normalization

qkv_bias instance-attribute

qkv_bias = qkv_bias

reg_tokens instance-attribute

reg_tokens = reg_tokens

__init__

__init__(
    model_name: str,
    image_size: int = 224,
    patch_size: int = 16,
    qkv_bias: bool = True,
    qk_normalization: bool = False,
    norm_type: str = "layer_norm",
    layer_norm_eps: float = 1e-06,
    initializer_factor: float = 1.0,
    hidden_act: str = "gelu",
    max_img_size: int = 2048,
    norm_mean: Union[
        tuple[float, float, float], list
    ] = OPENAI_CLIP_MEAN,
    norm_std: Union[
        tuple[float, float, float], list
    ] = OPENAI_CLIP_STD,
    reg_tokens: Optional[int] = None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/radio.py
def __init__(
    self,
    model_name: str,
    image_size: int = 224,
    patch_size: int = 16,
    qkv_bias: bool = True,
    qk_normalization: bool = False,
    norm_type: str = "layer_norm",
    layer_norm_eps: float = 1e-6,
    initializer_factor: float = 1.0,
    hidden_act: str = "gelu",
    max_img_size: int = 2048,
    norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
    norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
    reg_tokens: Optional[int] = None,
    **kwargs,
):
    self.model_name = model_name
    (
        self.hidden_size,
        self.num_hidden_layers,
        self.num_attention_heads,
        self.intermediate_size,
    ) = VIT_TIMM_DIM_BY_NAME[model_name]
    self.image_size = image_size
    self.patch_size = patch_size
    self.qkv_bias = qkv_bias
    self.qk_normalization = qk_normalization
    self.norm_type = norm_type
    self.layer_norm_eps = layer_norm_eps
    self.initializer_factor = initializer_factor
    self.hidden_act = hidden_act
    self.max_img_size = max_img_size
    self.norm_mean = list(norm_mean) if isinstance(norm_mean,
                                                   (tuple,
                                                    list)) else norm_mean
    self.norm_std = list(norm_std) if isinstance(norm_std,
                                                 (tuple,
                                                  list)) else norm_std
    self.reg_tokens = reg_tokens
    super().__init__(**kwargs)
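
A usage sketch for the constructor above. The timm model name follows the docstring's own example and is assumed here to be a key of VIT_TIMM_DIM_BY_NAME; the architecture dimensions are looked up from that table rather than passed explicitly.

from vllm.transformers_utils.configs import RadioConfig

# Assumption: "vit_base_patch16_224" (the docstring's example) is registered in VIT_TIMM_DIM_BY_NAME.
cfg = RadioConfig(model_name="vit_base_patch16_224", image_size=224, patch_size=16)
print(cfg.hidden_size, cfg.num_hidden_layers,
      cfg.num_attention_heads, cfg.intermediate_size)  # looked up from VIT_TIMM_DIM_BY_NAME
print(cfg.norm_mean)  # OPENAI_CLIP_MEAN, stored as a plain list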

SpeculatorsConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/speculators/base.py
class SpeculatorsConfig(PretrainedConfig):
    model_type = "speculators"

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "SpeculatorsConfig":
        """Load speculators Eagle config and convert to vLLM format."""
        config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
                                             **kwargs)

        vllm_config = cls.extract_vllm_speculative_config(config_dict)
        return cls(**vllm_config)

    @classmethod
    def extract_vllm_speculative_config(
            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
        speculators_model_type = config_dict.get("speculators_model_type")
        if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
            raise ValueError(
                f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
                "Please ensure you're loading a speculators-format model.")

        # validate fields
        # TODO: @dsikka - use speculators pydantic model to validate
        cls.validate_speculators_config(config_dict=config_dict)
        # Convert from speculators config -> format that can be ingested by vLLM
        vllm_config = cls.build_vllm_speculative_config(
            config_dict=config_dict)
        # Apply anything specific to the supported algorithm
        algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
        algo_updater(config_dict=config_dict, vllm_config=vllm_config)
        return vllm_config

    @classmethod
    def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
        try:
            spec_config = config_dict["speculators_config"]
            methods = spec_config["proposal_methods"]
            first_method = methods[0]
            _ = first_method["speculative_tokens"]
            _ = spec_config["verifier"]["name_or_path"]
            _ = config_dict["speculators_model_type"]
        except (KeyError, IndexError, TypeError) as e:
            raise ValueError("Invalid speculators config structure") from e

        if "transformer_layer_config" not in config_dict:
            raise ValueError("Must provide transformer_layer_config")

        if not isinstance(config_dict["transformer_layer_config"], dict):
            raise TypeError(
                "'transformer_layer_config' must be a dictionary if provided")

    @classmethod
    def build_vllm_speculative_config(
            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
        """
        Build vLLM-compatible speculative configuration from speculators format.

        This method extracts and transforms speculative configuration from the
        speculators format into the structure expected by vLLM.

        Args:
            config_dict: Configuration dictionary in speculators format

        Returns:
            Dictionary with vLLM-compatible speculative configuration
        """
        # Extract speculators configuration
        spec_config = config_dict["speculators_config"]

        # Currently we only support one proposal method
        proposal_methods = spec_config.get("proposal_methods")
        if not proposal_methods:
            raise ValueError("No proposal methods found in speculators config")

        first_method = proposal_methods[0]
        num_speculative_tokens = first_method.get("speculative_tokens")

        if num_speculative_tokens is None:
            raise ValueError(
                "Missing 'speculative_tokens' in proposal method. "
                f"Got: {first_method}")

        # Build base vLLM speculative configuration
        vllm_config = {
            "method": config_dict.get("speculators_model_type"),
            "num_speculative_tokens": num_speculative_tokens,
            "target_model": spec_config.get("verifier")["name_or_path"]
        }

        # Merge transformer layer configuration if present
        transformer_config = config_dict.get("transformer_layer_config", {})
        vllm_config.update(transformer_config)

        return vllm_config
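
A minimal conversion sketch using build_vllm_speculative_config directly; extract_vllm_speculative_config additionally checks speculators_model_type against SUPPORTED_SPECULATORS_TYPES, validates the dict, and applies the per-algorithm updater. All field values below are illustrative.

from vllm.transformers_utils.configs import SpeculatorsConfig

# Illustrative speculators-format dict; the model type and verifier path are
# placeholders, not a real checkpoint.
speculators_dict = {
    "speculators_model_type": "eagle3",
    "speculators_config": {
        "proposal_methods": [{"speculative_tokens": 5}],
        "verifier": {"name_or_path": "meta-llama/Llama-3.1-8B-Instruct"},
    },
    "transformer_layer_config": {"hidden_size": 4096, "num_hidden_layers": 1},
}

vllm_dict = SpeculatorsConfig.build_vllm_speculative_config(speculators_dict)
# -> {"method": "eagle3", "num_speculative_tokens": 5,
#     "target_model": "meta-llama/Llama-3.1-8B-Instruct",
#     "hidden_size": 4096, "num_hidden_layers": 1}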

model_type class-attribute instance-attribute

model_type = 'speculators'

build_vllm_speculative_config classmethod

build_vllm_speculative_config(
    config_dict: dict[str, Any],
) -> dict[str, Any]

Build vLLM-compatible speculative configuration from speculators format.

This method extracts and transforms speculative configuration from the speculators format into the structure expected by vLLM.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `config_dict` | `dict[str, Any]` | Configuration dictionary in speculators format | *required* |

Returns:

| Type | Description |
| --- | --- |
| `dict[str, Any]` | Dictionary with vLLM-compatible speculative configuration |

Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def build_vllm_speculative_config(
        cls, config_dict: dict[str, Any]) -> dict[str, Any]:
    """
    Build vLLM-compatible speculative configuration from speculators format.

    This method extracts and transforms speculative configuration from the
    speculators format into the structure expected by vLLM.

    Args:
        config_dict: Configuration dictionary in speculators format

    Returns:
        Dictionary with vLLM-compatible speculative configuration
    """
    # Extract speculators configuration
    spec_config = config_dict["speculators_config"]

    # Currently we only support one proposal method
    proposal_methods = spec_config.get("proposal_methods")
    if not proposal_methods:
        raise ValueError("No proposal methods found in speculators config")

    first_method = proposal_methods[0]
    num_speculative_tokens = first_method.get("speculative_tokens")

    if num_speculative_tokens is None:
        raise ValueError(
            "Missing 'speculative_tokens' in proposal method. "
            f"Got: {first_method}")

    # Build base vLLM speculative configuration
    vllm_config = {
        "method": config_dict.get("speculators_model_type"),
        "num_speculative_tokens": num_speculative_tokens,
        "target_model": spec_config.get("verifier")["name_or_path"]
    }

    # Merge transformer layer configuration if present
    transformer_config = config_dict.get("transformer_layer_config", {})
    vllm_config.update(transformer_config)

    return vllm_config

extract_vllm_speculative_config classmethod

extract_vllm_speculative_config(
    config_dict: dict[str, Any],
) -> dict[str, Any]
Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def extract_vllm_speculative_config(
        cls, config_dict: dict[str, Any]) -> dict[str, Any]:
    speculators_model_type = config_dict.get("speculators_model_type")
    if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
        raise ValueError(
            f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
            "Please ensure you're loading a speculators-format model.")

    # validate fields
    # TODO: @dsikka - use speculators pydantic model to validate
    cls.validate_speculators_config(config_dict=config_dict)
    # Convert from speculators config -> format that can be ingested by vLLM
    vllm_config = cls.build_vllm_speculative_config(
        config_dict=config_dict)
    # Apply anything specific to the supported algorithm
    algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
    algo_updater(config_dict=config_dict, vllm_config=vllm_config)
    return vllm_config

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> SpeculatorsConfig

Load speculators Eagle config and convert to vLLM format.

Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "SpeculatorsConfig":
    """Load speculators Eagle config and convert to vLLM format."""
    config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
                                         **kwargs)

    vllm_config = cls.extract_vllm_speculative_config(config_dict)
    return cls(**vllm_config)

validate_speculators_config classmethod

validate_speculators_config(
    config_dict: dict[str, Any],
) -> None
Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
    try:
        spec_config = config_dict["speculators_config"]
        methods = spec_config["proposal_methods"]
        first_method = methods[0]
        _ = first_method["speculative_tokens"]
        _ = spec_config["verifier"]["name_or_path"]
        _ = config_dict["speculators_model_type"]
    except (KeyError, IndexError, TypeError) as e:
        raise ValueError("Invalid speculators config structure") from e

    if "transformer_layer_config" not in config_dict:
        raise ValueError("Must provide transformer_layer_config")

    if not isinstance(config_dict["transformer_layer_config"], dict):
        raise TypeError(
            "'transformer_layer_config' must be a dictionary if provided")

Step3TextConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/step3_vl.py
class Step3TextConfig(PretrainedConfig):
    model_type = "step3_text"
    architectures = ["Step3TextForCausalLM"]

    def __init__(
        self,
        hidden_size: int = 7168,
        intermediate_size: int = 18432,
        num_attention_heads: int = 64,
        num_attention_groups: int = 1,
        num_hidden_layers: int = 61,
        max_seq_len: int = 65536,
        vocab_size: int = 128815,
        rms_norm_eps: float = 1e-5,
        moe_intermediate_size: int = 5120,
        moe_num_experts: int = 48,
        moe_top_k: int = 3,
        rope_theta: float = 500000,
        rope_scaling: Optional[dict[str, Any]] = None,
        max_position_embedding: int = 65536,
        share_expert_dim: int = 5120,
        share_q_dim: int = 2048,
        head_dim: int = 256,
        norm_expert_weight: bool = False,
        moe_layers_enum: tuple[int,
                               ...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                       15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                       25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                                       35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                                       45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
                                       55, 56, 57, 58, 59),
        **kwargs,
    ) -> None:
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_hidden_layers = num_hidden_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.max_position_embedding = max_position_embedding
        self.share_expert_dim = share_expert_dim
        self.share_q_dim = share_q_dim
        self.head_dim = head_dim
        self.norm_expert_weight = norm_expert_weight
        self.moe_layers_enum = moe_layers_enum

        super().__init__(**kwargs)
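
A scaled-down instantiation sketch; only the overridden fields differ from the defaults documented above.

from vllm.transformers_utils.configs import Step3TextConfig

cfg = Step3TextConfig(num_hidden_layers=4, moe_layers_enum=(2, 3))
print(cfg.moe_num_experts, cfg.moe_top_k)             # 48 3
print(cfg.max_seq_len == cfg.max_position_embedding)  # True (both 65536)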

architectures class-attribute instance-attribute

architectures = ['Step3TextForCausalLM']

head_dim instance-attribute

head_dim = head_dim

hidden_size instance-attribute

hidden_size = hidden_size

intermediate_size instance-attribute

intermediate_size = intermediate_size

max_position_embedding instance-attribute

max_position_embedding = max_position_embedding

max_seq_len instance-attribute

max_seq_len = max_seq_len

model_type class-attribute instance-attribute

model_type = 'step3_text'

moe_intermediate_size instance-attribute

moe_intermediate_size = moe_intermediate_size

moe_layers_enum instance-attribute

moe_layers_enum = moe_layers_enum

moe_num_experts instance-attribute

moe_num_experts = moe_num_experts

moe_top_k instance-attribute

moe_top_k = moe_top_k

norm_expert_weight instance-attribute

norm_expert_weight = norm_expert_weight

num_attention_groups instance-attribute

num_attention_groups = num_attention_groups

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

share_expert_dim instance-attribute

share_expert_dim = share_expert_dim

share_q_dim instance-attribute

share_q_dim = share_q_dim

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    hidden_size: int = 7168,
    intermediate_size: int = 18432,
    num_attention_heads: int = 64,
    num_attention_groups: int = 1,
    num_hidden_layers: int = 61,
    max_seq_len: int = 65536,
    vocab_size: int = 128815,
    rms_norm_eps: float = 1e-05,
    moe_intermediate_size: int = 5120,
    moe_num_experts: int = 48,
    moe_top_k: int = 3,
    rope_theta: float = 500000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embedding: int = 65536,
    share_expert_dim: int = 5120,
    share_q_dim: int = 2048,
    head_dim: int = 256,
    norm_expert_weight: bool = False,
    moe_layers_enum: tuple[int, ...] = (
        4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
        53, 54, 55, 56, 57, 58, 59,
    ),
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/step3_vl.py
def __init__(
    self,
    hidden_size: int = 7168,
    intermediate_size: int = 18432,
    num_attention_heads: int = 64,
    num_attention_groups: int = 1,
    num_hidden_layers: int = 61,
    max_seq_len: int = 65536,
    vocab_size: int = 128815,
    rms_norm_eps: float = 1e-5,
    moe_intermediate_size: int = 5120,
    moe_num_experts: int = 48,
    moe_top_k: int = 3,
    rope_theta: float = 500000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embedding: int = 65536,
    share_expert_dim: int = 5120,
    share_q_dim: int = 2048,
    head_dim: int = 256,
    norm_expert_weight: bool = False,
    moe_layers_enum: tuple[int,
                           ...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                   15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                   25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                                   35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                                   45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
                                   55, 56, 57, 58, 59),
    **kwargs,
) -> None:
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_attention_heads = num_attention_heads
    self.num_attention_groups = num_attention_groups
    self.num_hidden_layers = num_hidden_layers
    self.max_seq_len = max_seq_len
    self.vocab_size = vocab_size
    self.rms_norm_eps = rms_norm_eps
    self.moe_intermediate_size = moe_intermediate_size
    self.moe_num_experts = moe_num_experts
    self.moe_top_k = moe_top_k
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.max_position_embedding = max_position_embedding
    self.share_expert_dim = share_expert_dim
    self.share_q_dim = share_q_dim
    self.head_dim = head_dim
    self.norm_expert_weight = norm_expert_weight
    self.moe_layers_enum = moe_layers_enum

    super().__init__(**kwargs)

Step3VLConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/step3_vl.py
class Step3VLConfig(PretrainedConfig):
    model_type = "step3_vl"

    def __init__(
        self,
        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
        text_config: Optional[Union[dict, Step3TextConfig]] = None,
        understand_projector_stride: int = 1,
        projector_bias: bool = True,
        image_token_id: int = 128001,
        **kwargs,
    ) -> None:
        if vision_config is None:
            vision_config = Step3VisionEncoderConfig()
        elif isinstance(vision_config, dict):
            vision_config = Step3VisionEncoderConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = Step3TextConfig()
        elif isinstance(text_config, dict):
            text_config = Step3TextConfig(**text_config)
        self.text_config = text_config

        self.understand_projector_stride = understand_projector_stride
        self.projector_bias = projector_bias
        self.hidden_size = text_config.hidden_size
        self.image_token_id = image_token_id

        super().__init__(**kwargs)
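
A minimal sketch of the wrapping behaviour: dict sub-configs are promoted to config objects, and hidden_size is mirrored from the resulting text config.

from vllm.transformers_utils.configs import Step3TextConfig, Step3VLConfig

cfg = Step3VLConfig(text_config={"hidden_size": 1024}, vision_config={})
assert isinstance(cfg.text_config, Step3TextConfig)
print(cfg.hidden_size)  # 1024, taken from the text config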

hidden_size instance-attribute

hidden_size = hidden_size

image_token_id instance-attribute

image_token_id = image_token_id

model_type class-attribute instance-attribute

model_type = 'step3_vl'

projector_bias instance-attribute

projector_bias = projector_bias

text_config instance-attribute

text_config = text_config

understand_projector_stride instance-attribute

understand_projector_stride = understand_projector_stride

vision_config instance-attribute

vision_config = vision_config

__init__

__init__(
    vision_config: Optional[
        Union[dict, Step3VisionEncoderConfig]
    ] = None,
    text_config: Optional[
        Union[dict, Step3TextConfig]
    ] = None,
    understand_projector_stride: int = 1,
    projector_bias: bool = True,
    image_token_id: int = 128001,
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/step3_vl.py
def __init__(
    self,
    vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
    text_config: Optional[Union[dict, Step3TextConfig]] = None,
    understand_projector_stride: int = 1,
    projector_bias: bool = True,
    image_token_id: int = 128001,
    **kwargs,
) -> None:
    if vision_config is None:
        vision_config = Step3VisionEncoderConfig()
    elif isinstance(vision_config, dict):
        vision_config = Step3VisionEncoderConfig(**vision_config)
    self.vision_config = vision_config

    if text_config is None:
        text_config = Step3TextConfig()
    elif isinstance(text_config, dict):
        text_config = Step3TextConfig(**text_config)
    self.text_config = text_config

    self.understand_projector_stride = understand_projector_stride
    self.projector_bias = projector_bias
    self.hidden_size = text_config.hidden_size
    self.image_token_id = image_token_id

    super().__init__(**kwargs)

Step3VisionEncoderConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/step3_vl.py
class Step3VisionEncoderConfig(PretrainedConfig):
    model_type = "step3_vision_encoder"

    def __init__(
        self,
        hidden_size=1792,
        intermediate_size=3072,
        output_hidden_size=4096,
        num_hidden_layers=63,
        num_attention_heads=16,
        num_channels=3,
        image_size=728,
        patch_size=14,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.output_hidden_size = output_hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        super().__init__(**kwargs)
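
A small sketch of what the defaults imply, assuming the encoder uses standard non-overlapping ViT patchification (an assumption about the embedding layer, not stated in this config):

from vllm.transformers_utils.configs import Step3VisionEncoderConfig

cfg = Step3VisionEncoderConfig()
patches_per_side = cfg.image_size // cfg.patch_size  # 728 // 14 = 52
print(patches_per_side * patches_per_side)           # 2704 patches per image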

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

image_size instance-attribute

image_size = image_size

intermediate_size instance-attribute

intermediate_size = intermediate_size

layer_norm_eps instance-attribute

layer_norm_eps = layer_norm_eps

model_type class-attribute instance-attribute

model_type = 'step3_vision_encoder'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_channels instance-attribute

num_channels = num_channels

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

output_hidden_size instance-attribute

output_hidden_size = output_hidden_size

patch_size instance-attribute

patch_size = patch_size

__init__

__init__(
    hidden_size=1792,
    intermediate_size=3072,
    output_hidden_size=4096,
    num_hidden_layers=63,
    num_attention_heads=16,
    num_channels=3,
    image_size=728,
    patch_size=14,
    hidden_act="quick_gelu",
    layer_norm_eps=1e-05,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/step3_vl.py
def __init__(
    self,
    hidden_size=1792,
    intermediate_size=3072,
    output_hidden_size=4096,
    num_hidden_layers=63,
    num_attention_heads=16,
    num_channels=3,
    image_size=728,
    patch_size=14,
    hidden_act="quick_gelu",
    layer_norm_eps=1e-5,
    **kwargs,
):
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.output_hidden_size = output_hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.num_channels = num_channels
    self.patch_size = patch_size
    self.image_size = image_size
    self.layer_norm_eps = layer_norm_eps
    self.hidden_act = hidden_act
    super().__init__(**kwargs)

UltravoxConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [UltravoxForConditionalGeneration]. It is used to instantiate an Ultravox model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `audio_config` | `Union[AutoConfig, dict]`, *optional* | Custom audio config or dict. | `None` |
| `text_config` | `Union[AutoConfig, dict]`, *optional* | The config object of the text backbone. | `None` |
| `audio_model_id` | `str`, *optional* | The model ID of the audio backbone. | `None` |
| `text_model_id` | `str`, *optional* | The model ID of the text backbone. | `None` |
| `ignore_index` | `int`, *optional* | The ignore index for the loss function. | `-100` |
| `audio_token_index` | `int`, *optional* | The audio token index to encode the audio prompt. | `32000` |
| `stack_factor` | `int`, *optional* | Audio downsampling factor for the multimodal projector. | `8` |
| `norm_init` | `float`, *optional* | The initialization value for the layer normalization. | `0.4` |
| `projector_act` | `str`, *optional* | The activation function used by the multimodal projector. | `'swiglu'` |
| `projector_ln_mid` | `bool`, *optional* | Whether to apply layer normalization at the middle of the projector or at the end. Versions v0.4.1 and below use `False`, but v0.5 and above use `True`. | `False` |
Source code in vllm/transformers_utils/configs/ultravox.py
class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom audio config or dict.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone.
        audio_model_id (`str`, *optional*):
            The model ID of the audio backbone.
        text_model_id (`str`, *optional*):
            The model ID of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        projector_ln_mid (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization at the middle of the
            projector or at the end. Versions v0.4.1 and below
            use `False`, but v0.5 and above use `True`.
    """
    wrapped_model_config: transformers.PretrainedConfig
    model_type = "ultravox"
    audio_token = "<|audio|>"
    is_composition = False

    def __init__(
        self,
        audio_config: Optional[dict[str, Any]] = None,
        text_config: Optional[dict[str, Any]] = None,
        audio_model_id: Optional[str] = None,
        text_model_id: Optional[str] = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        projector_ln_mid: bool = False,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.audio_token_index = audio_token_index

        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act
        self.projector_ln_mid = projector_ln_mid

        # N.B. May set the wrapped_model_config below.
        self.text_model_id = text_model_id
        if text_model_id is None:
            text_config = text_config or {}
            self.wrapped_model_config = transformers.CONFIG_MAPPING[
                text_config.get("model_type", "llama")](**text_config)

        # N.B. May set the audio_config below.
        self.audio_model_id = audio_model_id
        if audio_model_id is None:
            self.audio_model_id = None
            audio_config = audio_config or {}
            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
                "model_type", "whisper")](**audio_config)

        super().__init__(**kwargs)

    def __setattr__(self, key, value):
        # Since --hf-overrides are applied _after_ the UltravoxConfig is
        # instantiated, load the configs implicitly when assigning text_model_id
        # or audio_model_id. This allows:
        #
        #   --hf-overrides.text_model_id=<quantized variant>
        #
        # to behave as intended.
        if key == "text_model_id" and value is not None:
            from vllm.transformers_utils.config import get_config

            self.wrapped_model_config = get_config(value,
                                                   trust_remote_code=False)
        elif key == "audio_model_id" and value is not None:
            from vllm.transformers_utils.config import get_config

            self.audio_config = get_config(value, trust_remote_code=False)

        return super().__setattr__(key, value)

    @property
    def text_config(self) -> transformers.PretrainedConfig:
        # When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
        # the full model, but the text config is the text config of the inner
        # model.
        return self.wrapped_model_config.get_text_config()
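
A minimal construction sketch: with no model IDs given, the wrapped text config falls back to a LLaMA-style config and the audio config to a Whisper-style config via transformers.CONFIG_MAPPING, and the text_config property returns the wrapped model's text config.

from vllm.transformers_utils.configs import UltravoxConfig

cfg = UltravoxConfig(text_config={"model_type": "llama", "hidden_size": 2048})
print(type(cfg.wrapped_model_config).__name__)  # LlamaConfig
print(type(cfg.audio_config).__name__)          # WhisperConfig
print(cfg.text_config.hidden_size)              # 2048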

audio_config instance-attribute

audio_config = CONFIG_MAPPING[get("model_type", "whisper")](
    **audio_config
)

audio_model_id instance-attribute

audio_model_id = audio_model_id

audio_token class-attribute instance-attribute

audio_token = '<|audio|>'

audio_token_index instance-attribute

audio_token_index = audio_token_index

hidden_size instance-attribute

hidden_size = hidden_size

ignore_index instance-attribute

ignore_index = ignore_index

is_composition class-attribute instance-attribute

is_composition = False

model_type class-attribute instance-attribute

model_type = 'ultravox'

norm_init instance-attribute

norm_init = norm_init

projector_act instance-attribute

projector_act = projector_act

projector_ln_mid instance-attribute

projector_ln_mid = projector_ln_mid

stack_factor instance-attribute

stack_factor = stack_factor

text_config property

text_config: PretrainedConfig

text_model_id instance-attribute

text_model_id = text_model_id

wrapped_model_config instance-attribute

wrapped_model_config: PretrainedConfig

__init__

__init__(
    audio_config: Optional[dict[str, Any]] = None,
    text_config: Optional[dict[str, Any]] = None,
    audio_model_id: Optional[str] = None,
    text_model_id: Optional[str] = None,
    ignore_index: int = -100,
    audio_token_index: int = 32000,
    hidden_size: int = 4096,
    stack_factor: int = 8,
    norm_init: float = 0.4,
    projector_act: str = "swiglu",
    projector_ln_mid: bool = False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/ultravox.py
def __init__(
    self,
    audio_config: Optional[dict[str, Any]] = None,
    text_config: Optional[dict[str, Any]] = None,
    audio_model_id: Optional[str] = None,
    text_model_id: Optional[str] = None,
    ignore_index: int = -100,
    audio_token_index: int = 32000,
    hidden_size: int = 4096,
    stack_factor: int = 8,
    norm_init: float = 0.4,
    projector_act: str = "swiglu",
    projector_ln_mid: bool = False,
    **kwargs,
):
    self.ignore_index = ignore_index
    self.audio_token_index = audio_token_index

    self.hidden_size = hidden_size
    self.stack_factor = stack_factor
    self.norm_init = norm_init
    self.projector_act = projector_act
    self.projector_ln_mid = projector_ln_mid

    # N.B. May set the wrapped_model_config below.
    self.text_model_id = text_model_id
    if text_model_id is None:
        text_config = text_config or {}
        self.wrapped_model_config = transformers.CONFIG_MAPPING[
            text_config.get("model_type", "llama")](**text_config)

    # N.B. May set the audio_config below.
    self.audio_model_id = audio_model_id
    if audio_model_id is None:
        self.audio_model_id = None
        audio_config = audio_config or {}
        self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
            "model_type", "whisper")](**audio_config)

    super().__init__(**kwargs)

__setattr__

__setattr__(key, value)
Source code in vllm/transformers_utils/configs/ultravox.py
def __setattr__(self, key, value):
    # Since --hf-overrides are applied _after_ the UltravoxConfig is
    # instantiated, load the configs implicitly when assigning text_model_id
    # or audio_model_id. This allows:
    #
    #   --hf-overrides.text_model_id=<quantized variant>
    #
    # to behave as intended.
    if key == "text_model_id" and value is not None:
        from vllm.transformers_utils.config import get_config

        self.wrapped_model_config = get_config(value,
                                               trust_remote_code=False)
    elif key == "audio_model_id" and value is not None:
        from vllm.transformers_utils.config import get_config

        self.audio_config = get_config(value, trust_remote_code=False)

    return super().__setattr__(key, value)
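
A sketch of the override behaviour documented above: assigning text_model_id (or audio_model_id) after construction implicitly reloads the corresponding config via get_config. The model ID below is illustrative, and the call fetches config.json from the Hub, so it needs network access.

from vllm.transformers_utils.configs import UltravoxConfig

cfg = UltravoxConfig()
cfg.text_model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative checkpoint
print(type(cfg.wrapped_model_config).__name__)    # e.g. Qwen2Config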