
vllm.transformers_utils.configs

Model configs may be defined in this directory for the following reasons:

  • There is no configuration file defined on the HF Hub or in the Transformers library.
  • There is a need to override the existing config to support vLLM.

Modules:

  • arctic: Arctic model configuration
  • chatglm
  • deepseek_v3
  • deepseek_vl2
  • dotsocr
  • eagle
  • falcon: Falcon configuration
  • jais: JAIS configuration
  • kimi_vl
  • medusa
  • midashenglm
  • mistral
  • mlp_speculator
  • moonvit
  • nemotron: Nemotron model configuration
  • nemotron_h: NemotronH model configuration
  • nemotron_vl
  • olmo3
  • ovis
  • qwen3_next: Qwen3-Next model configuration
  • radio: Radio vision model configuration
  • speculators
  • step3_vl
  • ultravox

__all__ module-attribute

__all__ = [
    "ChatGLMConfig",
    "DeepseekVLV2Config",
    "DeepseekV3Config",
    "DotsOCRConfig",
    "EAGLEConfig",
    "RWConfig",
    "JAISConfig",
    "MedusaConfig",
    "MiDashengLMConfig",
    "MLPSpeculatorConfig",
    "MoonViTConfig",
    "KimiVLConfig",
    "NemotronConfig",
    "NemotronHConfig",
    "Nemotron_Nano_VL_Config",
    "Olmo3Config",
    "OvisConfig",
    "RadioConfig",
    "SpeculatorsConfig",
    "UltravoxConfig",
    "Step3VLConfig",
    "Step3VisionEncoderConfig",
    "Step3TextConfig",
    "Qwen3NextConfig",
]
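
The names in __all__ can be imported directly from vllm.transformers_utils.configs. A minimal sketch, assuming only that vLLM is installed; no model weights are loaded:

from vllm.transformers_utils.configs import ChatGLMConfig, JAISConfig

# Config objects are plain PretrainedConfig subclasses.
chatglm_cfg = ChatGLMConfig()           # library defaults
jais_cfg = JAISConfig(n_layer=24)       # override a single field
print(chatglm_cfg.model_type, jais_cfg.model_type)  # -> chatglm jais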

ChatGLMConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/chatglm.py
class ChatGLMConfig(PretrainedConfig):
    model_type = "chatglm"
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        # It is to be compatible with long lora.
        self.max_position_embeddings = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)

add_bias_linear instance-attribute

add_bias_linear = add_bias_linear

add_qkv_bias instance-attribute

add_qkv_bias = add_qkv_bias

apply_query_key_layer_scaling instance-attribute

apply_query_key_layer_scaling = (
    apply_query_key_layer_scaling
)

apply_residual_connection_post_layernorm instance-attribute

apply_residual_connection_post_layernorm = (
    apply_residual_connection_post_layernorm
)

attention_dropout instance-attribute

attention_dropout = attention_dropout

attention_softmax_in_fp32 instance-attribute

attention_softmax_in_fp32 = attention_softmax_in_fp32

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_hidden_layers": "num_layers",
    "n_head_kv": "multi_query_group_num",
}

bias_dropout_fusion instance-attribute

bias_dropout_fusion = bias_dropout_fusion

ffn_hidden_size instance-attribute

ffn_hidden_size = ffn_hidden_size

fp32_residual_connection instance-attribute

fp32_residual_connection = fp32_residual_connection

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size

interleaved_qkv instance-attribute

interleaved_qkv = interleaved_qkv

kv_channels instance-attribute

kv_channels = kv_channels

layernorm_epsilon instance-attribute

layernorm_epsilon = layernorm_epsilon

max_position_embeddings instance-attribute

max_position_embeddings = seq_length

model_type class-attribute instance-attribute

model_type = 'chatglm'

multi_query_attention instance-attribute

multi_query_attention = multi_query_attention

multi_query_group_num instance-attribute

multi_query_group_num = multi_query_group_num

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_layers instance-attribute

num_layers = num_layers

padded_vocab_size instance-attribute

padded_vocab_size = padded_vocab_size

post_layer_norm instance-attribute

post_layer_norm = post_layer_norm

pre_seq_len instance-attribute

pre_seq_len = pre_seq_len

prefix_projection instance-attribute

prefix_projection = prefix_projection

quantization_bit instance-attribute

quantization_bit = quantization_bit

rmsnorm instance-attribute

rmsnorm = rmsnorm

seq_length instance-attribute

seq_length = seq_length

vocab_size instance-attribute

vocab_size = padded_vocab_size

__init__

__init__(
    num_layers=28,
    padded_vocab_size=65024,
    hidden_size=4096,
    ffn_hidden_size=13696,
    kv_channels=128,
    num_attention_heads=32,
    seq_length=2048,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    layernorm_epsilon=1e-05,
    rmsnorm=True,
    apply_residual_connection_post_layernorm=False,
    post_layer_norm=True,
    add_bias_linear=False,
    add_qkv_bias=False,
    interleaved_qkv=False,
    bias_dropout_fusion=True,
    multi_query_attention=False,
    multi_query_group_num=1,
    apply_query_key_layer_scaling=True,
    attention_softmax_in_fp32=True,
    fp32_residual_connection=False,
    quantization_bit=0,
    pre_seq_len=None,
    prefix_projection=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/chatglm.py
def __init__(self,
             num_layers=28,
             padded_vocab_size=65024,
             hidden_size=4096,
             ffn_hidden_size=13696,
             kv_channels=128,
             num_attention_heads=32,
             seq_length=2048,
             hidden_dropout=0.0,
             attention_dropout=0.0,
             layernorm_epsilon=1e-5,
             rmsnorm=True,
             apply_residual_connection_post_layernorm=False,
             post_layer_norm=True,
             add_bias_linear=False,
             add_qkv_bias=False,
             interleaved_qkv=False,
             bias_dropout_fusion=True,
             multi_query_attention=False,
             multi_query_group_num=1,
             apply_query_key_layer_scaling=True,
             attention_softmax_in_fp32=True,
             fp32_residual_connection=False,
             quantization_bit=0,
             pre_seq_len=None,
             prefix_projection=False,
             **kwargs):
    self.num_layers = num_layers
    self.vocab_size = padded_vocab_size
    self.padded_vocab_size = padded_vocab_size
    self.hidden_size = hidden_size
    self.ffn_hidden_size = ffn_hidden_size
    self.kv_channels = kv_channels
    self.num_attention_heads = num_attention_heads
    self.seq_length = seq_length
    # It is to be compatible with long lora.
    self.max_position_embeddings = seq_length
    self.hidden_dropout = hidden_dropout
    self.attention_dropout = attention_dropout
    self.layernorm_epsilon = layernorm_epsilon
    self.rmsnorm = rmsnorm
    self.apply_residual_connection_post_layernorm = (
        apply_residual_connection_post_layernorm)
    self.post_layer_norm = post_layer_norm
    self.add_bias_linear = add_bias_linear
    self.add_qkv_bias = add_qkv_bias
    self.bias_dropout_fusion = bias_dropout_fusion
    self.multi_query_attention = multi_query_attention
    self.multi_query_group_num = multi_query_group_num
    self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
    self.attention_softmax_in_fp32 = attention_softmax_in_fp32
    self.fp32_residual_connection = fp32_residual_connection
    self.quantization_bit = quantization_bit
    self.pre_seq_len = pre_seq_len
    self.prefix_projection = prefix_projection
    self.interleaved_qkv = interleaved_qkv
    super().__init__(**kwargs)
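
Because attribute_map aliases the standard Transformers names onto ChatGLM's native fields, generic code can read num_hidden_layers even though the constructor stores num_layers. A small sketch of that behavior, using hypothetical override values:

from vllm.transformers_utils.configs import ChatGLMConfig

cfg = ChatGLMConfig(num_layers=40, multi_query_group_num=2)
# attribute_map redirects the HF-standard names to ChatGLM's own fields.
assert cfg.num_hidden_layers == 40      # alias of num_layers
assert cfg.n_head_kv == 2               # alias of multi_query_group_num
assert cfg.max_position_embeddings == cfg.seq_length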

DeepseekV3Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/deepseek_v3.py
class DeepseekV3Config(PretrainedConfig):

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='noaux_tc',
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func='sigmoid',
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

ep_size instance-attribute

ep_size = ep_size

first_k_dense_replace instance-attribute

first_k_dense_replace = first_k_dense_replace

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

kv_lora_rank instance-attribute

kv_lora_rank = kv_lora_rank

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

model_type class-attribute instance-attribute

model_type = 'deepseek_v3'

moe_intermediate_size instance-attribute

moe_intermediate_size = moe_intermediate_size

moe_layer_freq instance-attribute

moe_layer_freq = moe_layer_freq

n_group instance-attribute

n_group = n_group

n_routed_experts instance-attribute

n_routed_experts = n_routed_experts

n_shared_experts instance-attribute

n_shared_experts = n_shared_experts

norm_topk_prob instance-attribute

norm_topk_prob = norm_topk_prob

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_experts_per_tok instance-attribute

num_experts_per_tok = num_experts_per_tok

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

num_nextn_predict_layers instance-attribute

num_nextn_predict_layers = num_nextn_predict_layers

q_lora_rank instance-attribute

q_lora_rank = q_lora_rank

qk_nope_head_dim instance-attribute

qk_nope_head_dim = qk_nope_head_dim

qk_rope_head_dim instance-attribute

qk_rope_head_dim = qk_rope_head_dim

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

routed_scaling_factor instance-attribute

routed_scaling_factor = routed_scaling_factor

scoring_func instance-attribute

scoring_func = scoring_func

topk_group instance-attribute

topk_group = topk_group

topk_method instance-attribute

topk_method = topk_method

use_cache instance-attribute

use_cache = use_cache

v_head_dim instance-attribute

v_head_dim = v_head_dim

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=129280,
    hidden_size=7168,
    intermediate_size=18432,
    moe_intermediate_size=2048,
    num_hidden_layers=61,
    num_nextn_predict_layers=1,
    num_attention_heads=128,
    num_key_value_heads=128,
    n_shared_experts=1,
    n_routed_experts=256,
    ep_size=1,
    routed_scaling_factor=2.5,
    kv_lora_rank=512,
    q_lora_rank=1536,
    qk_rope_head_dim=64,
    v_head_dim=128,
    qk_nope_head_dim=128,
    topk_method="noaux_tc",
    n_group=8,
    topk_group=4,
    num_experts_per_tok=8,
    moe_layer_freq=1,
    first_k_dense_replace=3,
    norm_topk_prob=True,
    scoring_func="sigmoid",
    hidden_act="silu",
    max_position_embeddings=4096,
    initializer_range=0.02,
    rms_norm_eps=1e-06,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=0,
    eos_token_id=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/deepseek_v3.py
def __init__(
    self,
    vocab_size=129280,
    hidden_size=7168,
    intermediate_size=18432,
    moe_intermediate_size=2048,
    num_hidden_layers=61,
    num_nextn_predict_layers=1,
    num_attention_heads=128,
    num_key_value_heads=128,
    n_shared_experts=1,
    n_routed_experts=256,
    ep_size=1,
    routed_scaling_factor=2.5,
    kv_lora_rank=512,
    q_lora_rank=1536,
    qk_rope_head_dim=64,
    v_head_dim=128,
    qk_nope_head_dim=128,
    topk_method='noaux_tc',
    n_group=8,
    topk_group=4,
    num_experts_per_tok=8,
    moe_layer_freq=1,
    first_k_dense_replace=3,
    norm_topk_prob=True,
    scoring_func='sigmoid',
    hidden_act="silu",
    max_position_embeddings=4096,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=0,
    eos_token_id=1,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.moe_intermediate_size = moe_intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_nextn_predict_layers = num_nextn_predict_layers
    self.num_attention_heads = num_attention_heads
    self.n_shared_experts = n_shared_experts
    self.n_routed_experts = n_routed_experts
    self.ep_size = ep_size
    self.routed_scaling_factor = routed_scaling_factor
    self.kv_lora_rank = kv_lora_rank
    self.q_lora_rank = q_lora_rank
    self.qk_rope_head_dim = qk_rope_head_dim
    self.v_head_dim = v_head_dim
    self.qk_nope_head_dim = qk_nope_head_dim
    self.topk_method = topk_method
    self.n_group = n_group
    self.topk_group = topk_group
    self.num_experts_per_tok = num_experts_per_tok
    self.moe_layer_freq = moe_layer_freq
    self.first_k_dense_replace = first_k_dense_replace
    self.norm_topk_prob = norm_topk_prob
    self.scoring_func = scoring_func
    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
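
The defaults above mirror the full DeepSeek-V3 architecture; the MoE- and MLA-specific fields can be overridden like any other PretrainedConfig keyword. A minimal sketch with hypothetical small values, not a real checkpoint:

from vllm.transformers_utils.configs import DeepseekV3Config

tiny = DeepseekV3Config(
    hidden_size=1024,
    num_hidden_layers=4,
    n_routed_experts=8,        # size of the MoE routing table
    num_experts_per_tok=2,     # top-k experts selected per token
    kv_lora_rank=128,          # MLA latent KV rank
    q_lora_rank=256,
)
print(tiny.model_type, tiny.first_k_dense_replace)  # -> deepseek_v3 3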

DeepseekVLV2Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/deepseek_vl2.py
class DeepseekVLV2Config(PretrainedConfig):
    model_type = "deepseek_vl_v2"
    vision_config: VisionEncoderConfig
    projector_config: MlpProjectorConfig

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )

    def __init__(self,
                 tile_tag: str = "tile_tag",
                 global_view_pos: str = "head",
                 candidate_resolutions: tuple[tuple[int,
                                                    int]] = ((384, 384), ),
                 **kwargs):
        super().__init__(**kwargs)

        vision_config = kwargs.get("vision_config", {})
        self.vision_config = VisionEncoderConfig(**vision_config)

        projector_config = kwargs.get("projector_config", {})
        self.projector_config = MlpProjectorConfig(**projector_config)

        language_config = kwargs.get("language_config", {})
        self.text_config = DeepseekV2Config(**language_config)

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        self.vocab_size = self.text_config.vocab_size

candidate_resolutions class-attribute instance-attribute

candidate_resolutions: tuple[tuple[int, int]] = (
    candidate_resolutions
)

global_view_pos class-attribute instance-attribute

global_view_pos: str = global_view_pos

model_type class-attribute instance-attribute

model_type = 'deepseek_vl_v2'

projector_config instance-attribute

projector_config: MlpProjectorConfig = MlpProjectorConfig(
    **projector_config
)

text_config instance-attribute

text_config = DeepseekV2Config(**language_config)

tile_tag class-attribute instance-attribute

tile_tag: str = tile_tag

vision_config instance-attribute

vision_config: VisionEncoderConfig = VisionEncoderConfig(
    **vision_config
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    tile_tag: str = "tile_tag",
    global_view_pos: str = "head",
    candidate_resolutions: tuple[tuple[int, int]] = (
        (384, 384),
    ),
    **kwargs,
)
Source code in vllm/transformers_utils/configs/deepseek_vl2.py
def __init__(self,
             tile_tag: str = "tile_tag",
             global_view_pos: str = "head",
             candidate_resolutions: tuple[tuple[int,
                                                int]] = ((384, 384), ),
             **kwargs):
    super().__init__(**kwargs)

    vision_config = kwargs.get("vision_config", {})
    self.vision_config = VisionEncoderConfig(**vision_config)

    projector_config = kwargs.get("projector_config", {})
    self.projector_config = MlpProjectorConfig(**projector_config)

    language_config = kwargs.get("language_config", {})
    self.text_config = DeepseekV2Config(**language_config)

    self.tile_tag = tile_tag
    self.global_view_pos = global_view_pos
    self.candidate_resolutions = candidate_resolutions
    self.vocab_size = self.text_config.vocab_size
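
The nested sections are read out of **kwargs (vision_config, projector_config, language_config), so they are passed as plain dicts. A sketch, assuming the sub-config classes provide defaults when given empty dicts:

from vllm.transformers_utils.configs import DeepseekVLV2Config

cfg = DeepseekVLV2Config(
    tile_tag="2D",
    global_view_pos="head",
    vision_config={},          # -> VisionEncoderConfig() defaults
    projector_config={},       # -> MlpProjectorConfig() defaults
    language_config={},        # -> DeepseekV2Config() defaults
)
# vocab_size is copied from the text (language) config.
print(cfg.vocab_size == cfg.text_config.vocab_size)  # -> True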

DotsOCRConfig

Bases: Qwen2Config

Source code in vllm/transformers_utils/configs/dotsocr.py
class DotsOCRConfig(Qwen2Config):
    model_type = "dots_ocr"

    def __init__(self,
                 image_token_id=151665,
                 video_token_id=151656,
                 vision_config: Optional[dict] = None,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_config = DotsVisionConfig(**(vision_config or {}))

    def save_pretrained(self, save_directory, **kwargs):
        self._auto_class = None
        super().save_pretrained(save_directory, **kwargs)

image_token_id instance-attribute

image_token_id = image_token_id

model_type class-attribute instance-attribute

model_type = 'dots_ocr'

video_token_id instance-attribute

video_token_id = video_token_id

vision_config instance-attribute

vision_config = DotsVisionConfig(**(vision_config or {}))

__init__

__init__(
    image_token_id=151665,
    video_token_id=151656,
    vision_config: Optional[dict] = None,
    *args,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/dotsocr.py
def __init__(self,
             image_token_id=151665,
             video_token_id=151656,
             vision_config: Optional[dict] = None,
             *args,
             **kwargs):
    super().__init__(*args, **kwargs)
    self.image_token_id = image_token_id
    self.video_token_id = video_token_id
    self.vision_config = DotsVisionConfig(**(vision_config or {}))

save_pretrained

save_pretrained(save_directory, **kwargs)
Source code in vllm/transformers_utils/configs/dotsocr.py
def save_pretrained(self, save_directory, **kwargs):
    self._auto_class = None
    super().save_pretrained(save_directory, **kwargs)
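
DotsOCRConfig extends Qwen2Config with multimodal token IDs and a nested vision config, and save_pretrained clears _auto_class so the saved file does not pin a remote-code auto class. A brief usage sketch, assuming DotsVisionConfig defaults and a hypothetical output directory:

from vllm.transformers_utils.configs import DotsOCRConfig

cfg = DotsOCRConfig(vision_config={})          # empty dict -> DotsVisionConfig defaults
print(cfg.model_type)                          # -> dots_ocr
print(cfg.image_token_id, cfg.video_token_id)  # -> 151665 151656
cfg.save_pretrained("/tmp/dots_ocr_cfg")       # writes config.json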

EAGLEConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/eagle.py
class EAGLEConfig(PretrainedConfig):
    model_type = "eagle"

    def __init__(self,
                 model: Union[PretrainedConfig, dict, None] = None,
                 truncated_vocab_size: Optional[int] = None,
                 method: Optional[str] = 'eagle',
                 **kwargs):

        model_config: Union[PretrainedConfig, DeepseekV2Config, None]
        if isinstance(model, dict):
            archs = model.get("architectures", [])
            target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
            if any(target_arch in archs for target_arch in target_archs):
                # AutoConfig does not support DeepSeek MoE models yet
                model_config = DeepseekV2Config(**model)
            else:
                model_config = AutoConfig.for_model(**model)
        else:
            model_config = model

        for k, v in kwargs.items():
            if k != "architectures" and k != "model_type" and hasattr(
                    model_config, k):
                setattr(model_config, k, v)

        self.model = model_config

        if self.model is None:
            self.truncated_vocab_size = None
        else:
            self.truncated_vocab_size = self.model.vocab_size if \
                truncated_vocab_size is None else truncated_vocab_size

        # Eagle model name should follow naming convention of
        # LlamaForCausalLM -> EagleLlamaForCausalLM
        # LlamaForCausalLM -> Eagle3LlamaForCausalLM
        # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
        if method == "eagle":
            assert self.model is not None, \
                "model should not be None when method is eagle"
            kwargs["architectures"] = [
                f"Eagle{arch}" if not arch.startswith("Eagle") \
                    else arch for arch in self.model.architectures
            ]

        elif method == "eagle3":
            assert self.model is not None, \
                "model should not be None when method is eagle3"
            kwargs["architectures"] = [
                arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
                else f"Eagle3{arch}" for arch in self.model.architectures
            ]
        else:
            raise ValueError(f"Invalid method {method}. "
                             "Supported methods are eagle and eagle3.")

        super().__init__(**kwargs)

        if self.model is not None:
            for k, v in self.model.to_dict().items():
                if k not in kwargs:
                    setattr(self, k, v)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "EAGLEConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

model instance-attribute

model = model_config

model_type class-attribute instance-attribute

model_type = 'eagle'

truncated_vocab_size instance-attribute

truncated_vocab_size = None

__init__

__init__(
    model: Union[PretrainedConfig, dict, None] = None,
    truncated_vocab_size: Optional[int] = None,
    method: Optional[str] = "eagle",
    **kwargs,
)
Source code in vllm/transformers_utils/configs/eagle.py
def __init__(self,
             model: Union[PretrainedConfig, dict, None] = None,
             truncated_vocab_size: Optional[int] = None,
             method: Optional[str] = 'eagle',
             **kwargs):

    model_config: Union[PretrainedConfig, DeepseekV2Config, None]
    if isinstance(model, dict):
        archs = model.get("architectures", [])
        target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
        if any(target_arch in archs for target_arch in target_archs):
            # AutoConfig does not support DeepSeek MoE models yet
            model_config = DeepseekV2Config(**model)
        else:
            model_config = AutoConfig.for_model(**model)
    else:
        model_config = model

    for k, v in kwargs.items():
        if k != "architectures" and k != "model_type" and hasattr(
                model_config, k):
            setattr(model_config, k, v)

    self.model = model_config

    if self.model is None:
        self.truncated_vocab_size = None
    else:
        self.truncated_vocab_size = self.model.vocab_size if \
            truncated_vocab_size is None else truncated_vocab_size

    # Eagle model name should follow naming convention of
    # LlamaForCausalLM -> EagleLlamaForCausalLM
    # LlamaForCausalLM -> Eagle3LlamaForCausalLM
    # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
    if method == "eagle":
        assert self.model is not None, \
            "model should not be None when method is eagle"
        kwargs["architectures"] = [
            f"Eagle{arch}" if not arch.startswith("Eagle") \
                else arch for arch in self.model.architectures
        ]

    elif method == "eagle3":
        assert self.model is not None, \
            "model should not be None when method is eagle3"
        kwargs["architectures"] = [
            arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
            else f"Eagle3{arch}" for arch in self.model.architectures
        ]
    else:
        raise ValueError(f"Invalid method {method}. "
                         "Supported methods are eagle and eagle3.")

    super().__init__(**kwargs)

    if self.model is not None:
        for k, v in self.model.to_dict().items():
            if k not in kwargs:
                setattr(self, k, v)

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> EAGLEConfig
Source code in vllm/transformers_utils/configs/eagle.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "EAGLEConfig":
    config_dict, kwargs = cls.get_config_dict(
        pretrained_model_name_or_path, **kwargs)
    return cls.from_dict(config_dict, **kwargs)
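
When model is passed as a dict, the target model's config is rebuilt (via AutoConfig.for_model, or DeepseekV2Config for DeepSeek MoE architectures) and its architecture names are prefixed according to method. A sketch with a hypothetical Llama-style target dict:

from vllm.transformers_utils.configs import EAGLEConfig

target = {"model_type": "llama", "architectures": ["LlamaForCausalLM"]}

cfg = EAGLEConfig(model=target, method="eagle")
print(cfg.architectures)     # -> ['EagleLlamaForCausalLM']

cfg3 = EAGLEConfig(model=target, method="eagle3")
print(cfg3.architectures)    # -> ['Eagle3LlamaForCausalLM']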

JAISConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [JAISModel]. It is used to instantiate a JAIS model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

  • vocab_size (`int`, *optional*, defaults to 50257): Vocabulary size of the JAIS model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [JAISModel].
  • n_positions (`int`, *optional*, defaults to 1024): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512, 1024, or 2048).
  • n_embd (`int`, *optional*, defaults to 768): Dimensionality of the embeddings and hidden states.
  • n_layer (`int`, *optional*, defaults to 12): Number of hidden layers in the Transformer encoder.
  • n_head (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder.
  • n_inner (`int`, *optional*, defaults to None): Dimensionality of the inner feed-forward layers. None sets it to 4 times n_embd.
  • activation_function (`str`, *optional*, defaults to `"gelu_new"`): Activation function, to be selected from ["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"].
  • resid_pdrop (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
  • embd_pdrop (`float`, *optional*, defaults to 0.1): The dropout ratio for the embeddings.
  • attn_pdrop (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention.
  • layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): The epsilon to use in the layer normalization layers.
  • initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
  • scale_attn_weights (`bool`, *optional*, defaults to `True`): Scale attention weights by dividing by sqrt(hidden_size).
  • use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models).
  • scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): Whether to additionally scale attention weights by 1 / (layer_idx + 1).
  • reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): Whether to scale keys (K) prior to computing attention (dot-product) and upcast the attention dot-product/softmax to float() when training with mixed precision.
  • position_embedding_type (`str`, *optional*, defaults to `"learned"`): Positional embedding can be either "alibi" or "learned".
  • mup_width_scale (`float`, *optional*, defaults to 1.0): muP parameter to scale learning rate and initializers. Calculated as (d_model,0 / d_model), where d_model is the model's width and d_model,0 is the proxy model's width.
  • mup_embeddings_scale (`float`, *optional*, defaults to 1.0): muP parameter to scale token and position embeddings.
  • mup_output_alpha (`float`, *optional*, defaults to 1.0): muP parameter to scale output logits (output_logits_scale = mup_output_alpha * mup_width_scale).
  • mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`): Scale attention weights by dividing by hidden_size instead of sqrt(hidden_size). scale_attn_weights must also be set to True.
  • alibi_scaling (`dict`, *optional*, defaults to None): Dictionary containing the scaling configuration for ALiBi embeddings. Currently only supports the linear scaling strategy. Can specify either the scaling factor (must be a float greater than 1) for fixed scaling, or train_seq_len for dynamic scaling on input samples with sequence length > train_seq_len. The expected formats are {"type": strategy name, "factor": scaling factor} or {"type": strategy name, "train_seq_len": training sequence length}.
  • architectures (`list`, *optional*, defaults to None): Architecture names for JAIS. When left as None, it is resolved to ['JAISLMHeadModel'].

Example:

>>> from transformers import JAISConfig, JAISModel

>>> # Initializing a JAIS configuration
>>> configuration = JAISConfig()

>>> # Initializing a model (with random weights) from the configuration
>>> model = JAISModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/jais.py
class JAISConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a
    [`JAISModel`]. It is used to instantiate a JAIS model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the JAIS model. Defines the number of different
            tokens that can be represented by the
            `inputs_ids` passed when calling [`JAISModel`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used
            with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set
            it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list
            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in
            the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size)..
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
            Whether to additionally scale attention weights 
            by `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention
            (dot-product)
            and upcast attention dot-product/softmax to float() when training
            with mixed precision.
        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
            Positional embedding can be either `"alibi"` or `"learned"`.
        mup_width_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale learning rate and initializers. Calculated
            as (`d_model,0 / d_model`), where
            `d_model` is the model's width and `d_model,0` is the proxy
            model's width.
        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale token and position embeddings.
        mup_output_alpha (`float`, *optional*, defaults to 1.0):
            muP parameter to scale output logits
            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
            Scale attention weights by dividing by hidden_size instead of
            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
            well.
        alibi_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for ALiBi
            embeddings. Currently only supports linear
            scaling strategy. Can specify either the scaling `factor` (must be
            a float greater than 1) for fixed scaling
            or `train_seq_len` for dynamic scaling on input samples with
            sequence length > `train_seq_len`. The expected
            formats are `{"type": strategy name, "factor": scaling factor}` or
            `{"type": strategy name,
            "train_seq_len": training sequence length}`.
        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
            architecture names for Jais.

    Example:

    ```python
    >>> from transformers import JAISConfig, JAISModel

    >>> # Initializing a JAIS configuration
    >>> configuration = JAISConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = JAISModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "jais"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        position_embedding_type="learned",
        mup_width_scale=1.0,
        mup_embeddings_scale=1.0,
        mup_output_alpha=1.0,
        mup_scale_qk_dot_by_d=False,
        alibi_scaling=None,
        architectures=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        self.position_embedding_type = position_embedding_type
        self.mup_width_scale = mup_width_scale
        self.mup_embeddings_scale = mup_embeddings_scale
        self.mup_output_alpha = mup_output_alpha
        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

        self.alibi_scaling = alibi_scaling
        self._alibi_scaling_validation()
        if architectures is None:
            architectures = ["JAISLMHeadModel"]

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            architectures=architectures,
            **kwargs,
        )

    def _alibi_scaling_validation(self):
        """
        Validate the `alibi_scaling` configuration.
        """
        if self.alibi_scaling is None:
            return

        if (not isinstance(self.alibi_scaling, dict)
                or len(self.alibi_scaling) != 2):
            raise ValueError(
                "`alibi_scaling` must be a dictionary with two fields, "
                "`type` and `factor` or `type` and `train_seq_len`, "
                f"got {self.alibi_scaling}")
        alibi_scaling_type = self.alibi_scaling.get("type", None)
        alibi_scaling_factor = self.alibi_scaling.get("factor", None)
        alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
        if alibi_scaling_type is None or alibi_scaling_type != "linear":
            raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                             f"got {alibi_scaling_type}")
        if (alibi_scaling_factor is not None
                and not isinstance(alibi_scaling_factor, float)
                or (alibi_scaling_factor is not None
                    and alibi_scaling_factor <= 1.0)):
            raise ValueError(
                f"`alibi_scaling`'s factor field must be a float > 1.0, "
                f"got {alibi_scaling_factor}")
        if (alibi_dynamic_scaling is not None
                and not isinstance(alibi_dynamic_scaling, int)
                or (alibi_dynamic_scaling is not None
                    and alibi_dynamic_scaling <= 1)):
            raise ValueError(
                f"`alibi_scaling`'s `train_seq_len` field must be an "
                f"integer > 1, got {alibi_dynamic_scaling}")

activation_function instance-attribute

activation_function = activation_function

alibi_scaling instance-attribute

alibi_scaling = alibi_scaling

attn_pdrop instance-attribute

attn_pdrop = attn_pdrop

attribute_map class-attribute instance-attribute

attribute_map = {
    "hidden_size": "n_embd",
    "max_position_embeddings": "n_positions",
    "num_attention_heads": "n_head",
    "num_hidden_layers": "n_layer",
}

bos_token_id instance-attribute

bos_token_id = bos_token_id

embd_pdrop instance-attribute

embd_pdrop = embd_pdrop

eos_token_id instance-attribute

eos_token_id = eos_token_id

initializer_range instance-attribute

initializer_range = initializer_range

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

model_type class-attribute instance-attribute

model_type = 'jais'

mup_embeddings_scale instance-attribute

mup_embeddings_scale = mup_embeddings_scale

mup_output_alpha instance-attribute

mup_output_alpha = mup_output_alpha

mup_scale_qk_dot_by_d instance-attribute

mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

mup_width_scale instance-attribute

mup_width_scale = mup_width_scale

n_embd instance-attribute

n_embd = n_embd

n_head instance-attribute

n_head = n_head

n_inner instance-attribute

n_inner = n_inner

n_layer instance-attribute

n_layer = n_layer

n_positions instance-attribute

n_positions = n_positions

position_embedding_type instance-attribute

position_embedding_type = position_embedding_type

reorder_and_upcast_attn instance-attribute

reorder_and_upcast_attn = reorder_and_upcast_attn

resid_pdrop instance-attribute

resid_pdrop = resid_pdrop

scale_attn_by_inverse_layer_idx instance-attribute

scale_attn_by_inverse_layer_idx = (
    scale_attn_by_inverse_layer_idx
)

scale_attn_weights instance-attribute

scale_attn_weights = scale_attn_weights

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=None,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    scale_attn_weights=True,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
    scale_attn_by_inverse_layer_idx=False,
    reorder_and_upcast_attn=False,
    position_embedding_type="learned",
    mup_width_scale=1.0,
    mup_embeddings_scale=1.0,
    mup_output_alpha=1.0,
    mup_scale_qk_dot_by_d=False,
    alibi_scaling=None,
    architectures=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/jais.py
def __init__(
    self,
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=None,
    activation_function="gelu_new",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    scale_attn_weights=True,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
    scale_attn_by_inverse_layer_idx=False,
    reorder_and_upcast_attn=False,
    position_embedding_type="learned",
    mup_width_scale=1.0,
    mup_embeddings_scale=1.0,
    mup_output_alpha=1.0,
    mup_scale_qk_dot_by_d=False,
    alibi_scaling=None,
    architectures=None,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.n_positions = n_positions
    self.n_embd = n_embd
    self.n_layer = n_layer
    self.n_head = n_head
    self.n_inner = n_inner
    self.activation_function = activation_function
    self.resid_pdrop = resid_pdrop
    self.embd_pdrop = embd_pdrop
    self.attn_pdrop = attn_pdrop
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.scale_attn_weights = scale_attn_weights
    self.use_cache = use_cache
    self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
    self.reorder_and_upcast_attn = reorder_and_upcast_attn

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id

    self.position_embedding_type = position_embedding_type
    self.mup_width_scale = mup_width_scale
    self.mup_embeddings_scale = mup_embeddings_scale
    self.mup_output_alpha = mup_output_alpha
    self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

    self.alibi_scaling = alibi_scaling
    self._alibi_scaling_validation()
    if architectures is None:
        architectures = ["JAISLMHeadModel"]

    super().__init__(
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        architectures=architectures,
        **kwargs,
    )

_alibi_scaling_validation

_alibi_scaling_validation()

Validate the alibi_scaling configuration.

Source code in vllm/transformers_utils/configs/jais.py
def _alibi_scaling_validation(self):
    """
    Validate the `alibi_scaling` configuration.
    """
    if self.alibi_scaling is None:
        return

    if (not isinstance(self.alibi_scaling, dict)
            or len(self.alibi_scaling) != 2):
        raise ValueError(
            "`alibi_scaling` must be a dictionary with two fields, "
            "`type` and `factor` or `type` and `train_seq_len`, "
            f"got {self.alibi_scaling}")
    alibi_scaling_type = self.alibi_scaling.get("type", None)
    alibi_scaling_factor = self.alibi_scaling.get("factor", None)
    alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
    if alibi_scaling_type is None or alibi_scaling_type != "linear":
        raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                         f"got {alibi_scaling_type}")
    if (alibi_scaling_factor is not None
            and not isinstance(alibi_scaling_factor, float)
            or (alibi_scaling_factor is not None
                and alibi_scaling_factor <= 1.0)):
        raise ValueError(
            f"`alibi_scaling`'s factor field must be a float > 1.0, "
            f"got {alibi_scaling_factor}")
    if (alibi_dynamic_scaling is not None
            and not isinstance(alibi_dynamic_scaling, int)
            or (alibi_dynamic_scaling is not None
                and alibi_dynamic_scaling <= 1)):
        raise ValueError(
            f"`alibi_scaling`'s `train_seq_len` field must be an "
            f"integer > 1, got {alibi_dynamic_scaling}")

KimiVLConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/kimi_vl.py
class KimiVLConfig(PretrainedConfig):
    model_type = "kimi_vl"

    def __init__(self,
                 vision_config: Optional[Union[dict, MoonViTConfig]] = None,
                 text_config: Optional[Union[dict, DeepseekV2Config]] = None,
                 ignore_index: int = -100,
                 media_placeholder_token_id: int = 163605,
                 pad_token_id: int = 0,
                 **kwargs):
        if vision_config is None:
            vision_config = MoonViTConfig()
        elif isinstance(vision_config, dict):
            vision_config = MoonViTConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = DeepseekV2Config()
        elif isinstance(text_config, dict):
            text_config = DeepseekV2Config(**text_config)
        self.text_config = text_config

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        super().__init__(pad_token_id=pad_token_id, **kwargs)

ignore_index instance-attribute

ignore_index = ignore_index

media_placeholder_token_id instance-attribute

media_placeholder_token_id = media_placeholder_token_id

model_type class-attribute instance-attribute

model_type = 'kimi_vl'

text_config instance-attribute

text_config = text_config

vision_config instance-attribute

vision_config = vision_config

__init__

__init__(
    vision_config: Optional[
        Union[dict, MoonViTConfig]
    ] = None,
    text_config: Optional[
        Union[dict, DeepseekV2Config]
    ] = None,
    ignore_index: int = -100,
    media_placeholder_token_id: int = 163605,
    pad_token_id: int = 0,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/kimi_vl.py
def __init__(self,
             vision_config: Optional[Union[dict, MoonViTConfig]] = None,
             text_config: Optional[Union[dict, DeepseekV2Config]] = None,
             ignore_index: int = -100,
             media_placeholder_token_id: int = 163605,
             pad_token_id: int = 0,
             **kwargs):
    if vision_config is None:
        vision_config = MoonViTConfig()
    elif isinstance(vision_config, dict):
        vision_config = MoonViTConfig(**vision_config)
    self.vision_config = vision_config

    if text_config is None:
        text_config = DeepseekV2Config()
    elif isinstance(text_config, dict):
        text_config = DeepseekV2Config(**text_config)
    self.text_config = text_config

    self.ignore_index = ignore_index
    self.media_placeholder_token_id = media_placeholder_token_id

    super().__init__(pad_token_id=pad_token_id, **kwargs)
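
vision_config and text_config accept either ready-made config objects, plain dicts, or None (which falls back to the sub-config defaults). A minimal sketch, assuming MoonViTConfig and DeepseekV2Config defaults:

from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig

# Dicts are promoted to the corresponding config classes; None uses defaults.
cfg = KimiVLConfig(vision_config={}, text_config=None)
assert isinstance(cfg.vision_config, MoonViTConfig)
print(cfg.media_placeholder_token_id, cfg.pad_token_id)  # -> 163605 0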

MLPSpeculatorConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/mlp_speculator.py
class MLPSpeculatorConfig(PretrainedConfig):
    model_type = "mlp_speculator"

    attribute_map = {
        "hidden_size": "emb_dim",
    }

    def __init__(self,
                 vocab_size: int = 32000,
                 emb_dim: int = 4096,
                 inner_dim: int = 0,
                 n_predict: int = 3,
                 top_k_tokens_per_head: Optional[list[int]] = None,
                 n_candidates: int = 5,
                 tie_weights: bool = False,
                 scale_input: bool = False,
                 **kwargs):
        """
        Initialize an MLPSpeculatorConfig

        Args:
            vocab_size: int
                the model vocab size
            emb_dim: int
                the model embedding dimension
            inner_dim: int
                the inner dimension of the model. If 0, will be the emb_dim.
            n_predict: int
                the number of lookaheads for the speculator
            top_k_tokens_per_head: list[int]
                Number of tokens to consider from each head when forming the
                candidate tree.
                For each candidate branch in the tree, head n produces topk[n]
                additional sub-branches.
                NOTE: This parameter is currently unused.
            n_candidates: int
                number of child candidates to create per sequence
            tie_weights: bool
                If true, use a single set of weights for every model
                head/stage after the first. The initial projection
                from the base model may have a different size, so that
                stays separate.
            scale_input: bool
                if True, will scale the initial hidden states from
                the base model.
        """
        if top_k_tokens_per_head is None:
            top_k_tokens_per_head = [5, 4, 3]
        assert len(top_k_tokens_per_head) == n_predict
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.inner_dim = inner_dim
        self.n_predict = n_predict
        self.top_k_tokens_per_head = top_k_tokens_per_head
        self.n_candidates = n_candidates
        self.num_lookahead_tokens = n_predict
        self.tie_weights = tie_weights
        self.scale_input = scale_input

        super().__init__(**kwargs)

attribute_map class-attribute instance-attribute

attribute_map = {'hidden_size': 'emb_dim'}

emb_dim instance-attribute

emb_dim = emb_dim

inner_dim instance-attribute

inner_dim = inner_dim

model_type class-attribute instance-attribute

model_type = 'mlp_speculator'

n_candidates instance-attribute

n_candidates = n_candidates

n_predict instance-attribute

n_predict = n_predict

num_lookahead_tokens instance-attribute

num_lookahead_tokens = n_predict

scale_input instance-attribute

scale_input = scale_input

tie_weights instance-attribute

tie_weights = tie_weights

top_k_tokens_per_head instance-attribute

top_k_tokens_per_head = top_k_tokens_per_head

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size: int = 32000,
    emb_dim: int = 4096,
    inner_dim: int = 0,
    n_predict: int = 3,
    top_k_tokens_per_head: Optional[list[int]] = None,
    n_candidates: int = 5,
    tie_weights: bool = False,
    scale_input: bool = False,
    **kwargs,
)

Initialize an MLPSpeculatorConfig

Parameters:

Name Type Description Default
vocab_size int

int the model vocab size

32000
emb_dim int

int the model embedding dimension

4096
inner_dim int

int the inner dimension of the model. If 0, will be the emb_dim.

0
n_predict int

int the number of lookaheads for the speculator

3
top_k_tokens_per_head Optional[list[int]]

list[int] Number of tokens to consider from each head when forming the candidate tree. For each candidate branch in the tree, head n produces topk[n] additional sub-branches. NOTE: This parameter is currently unused.

None
n_candidates int

int number of child candidates to create per sequence

5
tie_weights bool

bool If true, use a single set of weights for every model head/stage after the first. The initial projection from the base model may have a different size, so that stays separate.

False
scale_input bool

bool if True, will scale the initial hidden states from the base model.

False
Source code in vllm/transformers_utils/configs/mlp_speculator.py
def __init__(self,
             vocab_size: int = 32000,
             emb_dim: int = 4096,
             inner_dim: int = 0,
             n_predict: int = 3,
             top_k_tokens_per_head: Optional[list[int]] = None,
             n_candidates: int = 5,
             tie_weights: bool = False,
             scale_input: bool = False,
             **kwargs):
    """
    Initialize an MLPSpeculatorConfig

    Args:
        vocab_size: int
            the model vocab size
        emb_dim: int
            the model embedding dimension
        inner_dim: int
            the inner dimension of the model. If 0, will be the emb_dim.
        n_predict: int
            the number of lookaheads for the speculator
        top_k_tokens_per_head: list[int]
            Number of tokens to consider from each head when forming the
            candidate tree.
            For each candidate branch in the tree, head n produces topk[n]
            additional sub-branches.
            NOTE: This parameter is currently unused.
        n_candidates: int
            number of child candidates to create per sequence
        tie_weights: bool
            If true, use a single set of weights for every model
            head/stage after the first. The initial projection
            from the base model may have a different size, so that
            stays separate.
        scale_input: bool
            if True, will scale the initial hidden states from
            the base model.
    """
    if top_k_tokens_per_head is None:
        top_k_tokens_per_head = [5, 4, 3]
    assert len(top_k_tokens_per_head) == n_predict
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.inner_dim = inner_dim
    self.n_predict = n_predict
    self.top_k_tokens_per_head = top_k_tokens_per_head
    self.n_candidates = n_candidates
    self.num_lookahead_tokens = n_predict
    self.tie_weights = tie_weights
    self.scale_input = scale_input

    super().__init__(**kwargs)
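
For reference, here is a minimal construction sketch (assuming vLLM is installed so that `vllm.transformers_utils.configs` is importable). Note that `top_k_tokens_per_head` must contain exactly `n_predict` entries, and `hidden_size` is an alias for `emb_dim` via `attribute_map`:

```python
from vllm.transformers_utils.configs import MLPSpeculatorConfig

# Three lookahead heads, one top-k value per head (lengths must match).
config = MLPSpeculatorConfig(
    vocab_size=32000,
    emb_dim=4096,
    inner_dim=3072,
    n_predict=3,
    top_k_tokens_per_head=[5, 4, 3],
)

assert config.hidden_size == config.emb_dim  # aliased through attribute_map
assert config.num_lookahead_tokens == config.n_predict
```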

MedusaConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/medusa.py
class MedusaConfig(PretrainedConfig):
    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        self.max_seq_len = int(2**20)
        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
            else truncated_vocab_size
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["MedusaModel"]

        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "MedusaConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        for k in list(config_dict.keys()):
            if 'num' in k:
                if 'heads' in k:
                    config_dict["num_heads"] = config_dict.pop(k)
                elif 'layers' in k:
                    config_dict["num_hidden_layers"] = config_dict.pop(k)
        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        return 0

    @property
    def num_lookahead_tokens(self):
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens

hidden_size instance-attribute

hidden_size = hidden_size

max_paths instance-attribute

max_paths = max_paths

max_seq_len instance-attribute

max_seq_len = int(2 ** 20)

model_type class-attribute instance-attribute

model_type = 'medusa'

num_attention_heads property

num_attention_heads

num_heads instance-attribute

num_heads = num_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_lookahead_tokens property writable

num_lookahead_tokens

topk instance-attribute

topk = topk

truncated_vocab_size instance-attribute

truncated_vocab_size = (
    vocab_size
    if truncated_vocab_size is None
    else truncated_vocab_size
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    hidden_size: int = 4096,
    vocab_size: int = 32001,
    num_heads: int = 5,
    num_hidden_layers: int = 1,
    max_paths: int = 64,
    topk: int = 10,
    truncated_vocab_size: Optional[int] = None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/medusa.py
def __init__(self,
             hidden_size: int = 4096,
             vocab_size: int = 32001,
             num_heads: int = 5,
             num_hidden_layers: int = 1,
             max_paths: int = 64,
             topk: int = 10,
             truncated_vocab_size: Optional[int] = None,
             **kwargs):

    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.num_heads = num_heads
    self.num_hidden_layers = num_hidden_layers
    self.max_paths = max_paths
    self.topk = topk
    self.max_seq_len = int(2**20)
    self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
        else truncated_vocab_size
    if "architectures" not in kwargs:
        kwargs["architectures"] = ["MedusaModel"]

    super().__init__(**kwargs)

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> MedusaConfig
Source code in vllm/transformers_utils/configs/medusa.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "MedusaConfig":
    config_dict, kwargs = cls.get_config_dict(
        pretrained_model_name_or_path, **kwargs)
    for k in list(config_dict.keys()):
        if 'num' in k:
            if 'heads' in k:
                config_dict["num_heads"] = config_dict.pop(k)
            elif 'layers' in k:
                config_dict["num_hidden_layers"] = config_dict.pop(k)
    return cls.from_dict(config_dict, **kwargs)
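
A short usage sketch based on the code above: `num_lookahead_tokens` is a writable alias for `num_heads`, and `truncated_vocab_size` falls back to `vocab_size` when left unset. The key remapping performed by `from_pretrained` (any key containing both `num` and `heads`/`layers` is renamed) is shown against an in-memory dict with hypothetical key names:

```python
from vllm.transformers_utils.configs import MedusaConfig

config = MedusaConfig(hidden_size=4096, vocab_size=32001, num_heads=5)
assert config.truncated_vocab_size == config.vocab_size  # fallback when unset
assert config.num_lookahead_tokens == 5

config.num_lookahead_tokens = 4  # writable property, updates num_heads
assert config.num_heads == 4

# The same remapping that from_pretrained applies to on-disk config keys
# (key names here are purely illustrative).
raw = {"medusa_num_heads": 5, "medusa_num_layers": 1}
for k in list(raw.keys()):
    if "num" in k:
        if "heads" in k:
            raw["num_heads"] = raw.pop(k)
        elif "layers" in k:
            raw["num_hidden_layers"] = raw.pop(k)
assert raw == {"num_heads": 5, "num_hidden_layers": 1}
```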

MiDashengLMConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/midashenglm.py
class MiDashengLMConfig(PretrainedConfig):
    model_type = "midashenglm"

    def __init__(
        self,
        audio_encoder_config: Optional[dict] = None,
        subsample_factor: int = 5,
        text_config: Optional[dict] = None,
        audio_token_id: Optional[int] = None,
        **kwargs,
    ):
        self.audio_encoder_config = DashengConfig(
            **(audio_encoder_config or {}))
        self.subsample_factor = subsample_factor
        self.text_config = (Qwen2_5OmniTextConfig(
            **text_config) if text_config else Qwen2_5OmniTextConfig())
        self.text_config.rope_scaling = None  # uses_mrope is false
        self.audio_token_id = audio_token_id
        super().__init__(**kwargs)

audio_encoder_config instance-attribute

audio_encoder_config = DashengConfig(
    **(audio_encoder_config or {})
)

audio_token_id instance-attribute

audio_token_id = audio_token_id

model_type class-attribute instance-attribute

model_type = 'midashenglm'

subsample_factor instance-attribute

subsample_factor = subsample_factor

text_config instance-attribute

text_config = (
    Qwen2_5OmniTextConfig(**text_config)
    if text_config
    else Qwen2_5OmniTextConfig()
)

__init__

__init__(
    audio_encoder_config: Optional[dict] = None,
    subsample_factor: int = 5,
    text_config: Optional[dict] = None,
    audio_token_id: Optional[int] = None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/midashenglm.py
def __init__(
    self,
    audio_encoder_config: Optional[dict] = None,
    subsample_factor: int = 5,
    text_config: Optional[dict] = None,
    audio_token_id: Optional[int] = None,
    **kwargs,
):
    self.audio_encoder_config = DashengConfig(
        **(audio_encoder_config or {}))
    self.subsample_factor = subsample_factor
    self.text_config = (Qwen2_5OmniTextConfig(
        **text_config) if text_config else Qwen2_5OmniTextConfig())
    self.text_config.rope_scaling = None  # uses_mrope is false
    self.audio_token_id = audio_token_id
    super().__init__(**kwargs)
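
A minimal construction sketch, assuming the `DashengConfig` and `Qwen2_5OmniTextConfig` dependencies referenced above are available in the installed environment. With no sub-config dicts, the audio and text configs fall back to their defaults, and `rope_scaling` is cleared on the text config:

```python
from vllm.transformers_utils.configs import MiDashengLMConfig

# audio_token_id value is illustrative only; use the id from the checkpoint.
config = MiDashengLMConfig(audio_token_id=151646)
assert config.subsample_factor == 5
assert config.text_config.rope_scaling is None  # forced off (uses_mrope is false)
```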

MoonViTConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/moonvit.py
class MoonViTConfig(PretrainedConfig):
    model_type = "moonvit"

    def __init__(
            self,
            patch_size: int = 14,
            init_pos_emb_height: int = 64,
            init_pos_emb_width: int = 64,
            num_attention_heads: int = 16,
            num_hidden_layers: int = 27,
            hidden_size: int = 1152,
            intermediate_size: int = 4304,
            merge_kernel_size: tuple[int, int] = (2, 2),
            **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        # Positional embedding config
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        # Transformer config
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        # Patch merger config
        self.merge_kernel_size = merge_kernel_size

hidden_size instance-attribute

hidden_size = hidden_size

init_pos_emb_height instance-attribute

init_pos_emb_height = init_pos_emb_height

init_pos_emb_width instance-attribute

init_pos_emb_width = init_pos_emb_width

intermediate_size instance-attribute

intermediate_size = intermediate_size

merge_kernel_size instance-attribute

merge_kernel_size = merge_kernel_size

model_type class-attribute instance-attribute

model_type = 'moonvit'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

patch_size instance-attribute

patch_size = patch_size

__init__

__init__(
    patch_size: int = 14,
    init_pos_emb_height: int = 64,
    init_pos_emb_width: int = 64,
    num_attention_heads: int = 16,
    num_hidden_layers: int = 27,
    hidden_size: int = 1152,
    intermediate_size: int = 4304,
    merge_kernel_size: tuple[int, int] = (2, 2),
    **kwargs,
)
Source code in vllm/transformers_utils/configs/moonvit.py
def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
):
    super().__init__(**kwargs)
    self.patch_size = patch_size
    # Positional embedding config
    self.init_pos_emb_height = init_pos_emb_height
    self.init_pos_emb_width = init_pos_emb_width
    # Transformer config
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    # Patch merger config
    self.merge_kernel_size = merge_kernel_size

NemotronConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [NemotronModel]. It is used to instantiate a Nemotron model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Nemotron-8B.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

Name Type Description Default
vocab_size `int`, *optional*, defaults to 256000

Vocabulary size of the Nemotron model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [NemotronModel]

256000
hidden_size `int`, *optional*, defaults to 6144

Dimension of the hidden representations.

6144
intermediate_size `int`, *optional*, defaults to 24576

Dimension of the MLP representations.

24576
num_hidden_layers `int`, *optional*, defaults to 32

Number of hidden layers in the Transformer decoder.

32
num_attention_heads `int`, *optional*, defaults to 48

Number of attention heads for each attention layer in the Transformer decoder.

48
head_dim `int`, *optional*

Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if None

None
num_key_value_heads `int`, *optional*

This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group. For more details check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to `num_attention_heads`.

None
hidden_act `str` or `function`, *optional*, defaults to `"relu2"`

The non-linear activation function (function or string) in the decoder.

'relu2'
max_position_embeddings `int`, *optional*, defaults to 4096

The maximum sequence length that this model might ever be used with.

4096
initializer_range `float`, *optional*, defaults to 0.0134

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

0.0134
norm_eps `float`, *optional*, defaults to 1e-05

The epsilon used by the normalization layers.

1e-05
use_cache `bool`, *optional*, defaults to `True`

Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.

True
pad_token_id `int`, *optional*

Padding token id.

None
bos_token_id `int`, *optional*, defaults to 2

Beginning of stream token id.

2
eos_token_id `int`, *optional*, defaults to 3

End of stream token id.

3
tie_word_embeddings `bool`, *optional*, defaults to `False`

Whether to tie weight embeddings

False
rope_theta `float`, *optional*, defaults to 10000.0

The base period of the RoPE embeddings.

10000.0
partial_rotary_factor `float`, *optional*, defaults to 0.5

Percentage of the query and keys which will have rotary embedding.

0.5
attention_bias `bool`, *optional*, defaults to `False`

Whether to use a bias in the query, key, value and output projection layers during self-attention.

False
attention_dropout `float`, *optional*, defaults to 0.0

The dropout ratio for the attention probabilities.

0.0
mlp_bias `bool`, *optional*, defaults to `False`

Whether to use a bias in up_proj and down_proj layers in the MLP layers.

False
>>> from transformers import NemotronModel, NemotronConfig
>>> # Initializing a Nemotron nemotron-15b style configuration
>>> configuration = NemotronConfig()
>>> # Initializing a model from the nemotron-15b style configuration
>>> model = NemotronModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/nemotron.py
class NemotronConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronModel`]. It is used to instantiate a Nemotron model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Nemotron-8B.

    Configuration objects inherit from [`PretrainedConfig`] and can be
    used to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Nemotron model. Defines the number of
            different tokens that can be represented by the
            `inputs_ids` passed when calling [`NemotronModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        head_dim (`int`, *optional*):
            Projection weights dimension in multi-head attention. Set to
            hidden_size // num_attention_heads if None
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention
            (MQA) otherwise GQA is used. When converting a multi-head
            checkpoint to a GQA checkpoint, each group key and value
            head should be constructed by meanpooling all the original
            heads within that group. For more details checkout 
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
            is not specified, will default to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the
            decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        initializer_range (`float`, *optional*, defaults to 0.0134):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 3):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj and down_proj layers in the MLP
            layers.

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> # Initializing a Nemotron nemotron-15b style configuration
    >>> configuration = NemotronConfig()
    >>> # Initializing a model from the nemotron-15b style configuration
    >>> model = NemotronModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        head_dim = head_dim or kwargs.get("kv_channels")
        self.head_dim = head_dim if head_dim is not None else (
            hidden_size // num_attention_heads)

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # for backward compatibility
        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
            "rope_percentage") or partial_rotary_factor
        self.partial_rotary_factor = partial_rotary_factor
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(
                self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {self.rope_scaling}")
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in [
                "linear", "dynamic"
        ]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', "
                f"'dynamic'], got {rope_scaling_type}")
        if rope_scaling_factor is None or not isinstance(
                rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, got "
                f"{rope_scaling_factor}")

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

head_dim instance-attribute

head_dim = (
    head_dim
    if head_dim is not None
    else hidden_size // num_attention_heads
)

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_bias instance-attribute

mlp_bias = mlp_bias

model_type class-attribute instance-attribute

model_type = 'nemotron'

norm_eps instance-attribute

norm_eps = norm_eps

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

partial_rotary_factor instance-attribute

partial_rotary_factor = partial_rotary_factor

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=256000,
    hidden_size=6144,
    intermediate_size=24576,
    num_hidden_layers=32,
    num_attention_heads=48,
    head_dim=None,
    num_key_value_heads=None,
    hidden_act="relu2",
    max_position_embeddings=4096,
    initializer_range=0.0134,
    norm_eps=1e-05,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=2,
    eos_token_id=3,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.5,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron.py
def __init__(
    self,
    vocab_size=256000,
    hidden_size=6144,
    intermediate_size=24576,
    num_hidden_layers=32,
    num_attention_heads=48,
    head_dim=None,
    num_key_value_heads=None,
    hidden_act="relu2",
    max_position_embeddings=4096,
    initializer_range=0.0134,
    norm_eps=1e-5,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=2,
    eos_token_id=3,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.5,
    attention_bias=False,
    attention_dropout=0.0,
    mlp_bias=False,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    head_dim = head_dim or kwargs.get("kv_channels")
    self.head_dim = head_dim if head_dim is not None else (
        hidden_size // num_attention_heads)

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.norm_eps = norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    # for backward compatibility
    partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
        "rope_percentage") or partial_rotary_factor
    self.partial_rotary_factor = partial_rotary_factor
    self._rope_scaling_validation()
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.mlp_bias = mlp_bias

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )

_rope_scaling_validation

_rope_scaling_validation()

Validate the rope_scaling configuration.

Source code in vllm/transformers_utils/configs/nemotron.py
def _rope_scaling_validation(self):
    """
    Validate the `rope_scaling` configuration.
    """
    if self.rope_scaling is None:
        return

    if not isinstance(self.rope_scaling, dict) or len(
            self.rope_scaling) != 2:
        raise ValueError(
            "`rope_scaling` must be a dictionary with two fields, "
            f"`type` and `factor`, got {self.rope_scaling}")
    rope_scaling_type = self.rope_scaling.get("type", None)
    rope_scaling_factor = self.rope_scaling.get("factor", None)
    if rope_scaling_type is None or rope_scaling_type not in [
            "linear", "dynamic"
    ]:
        raise ValueError(
            "`rope_scaling`'s type field must be one of ['linear', "
            f"'dynamic'], got {rope_scaling_type}")
    if rope_scaling_factor is None or not isinstance(
            rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
        raise ValueError(
            "`rope_scaling`'s factor field must be a float > 1, got "
            f"{rope_scaling_factor}")

NemotronHConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [NemotronHModel]. It is used to instantiate a NemotronH model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the NemotronH-v0.1 model.

Parameters:

vocab_size (int, optional, defaults to 131072): Vocabulary size of the NemotronH model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [NemotronHModel].
tie_word_embeddings (bool, optional, defaults to False): Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has an output word embedding layer.
hidden_size (int, optional, defaults to 4096): Dimension of the hidden representations.
intermediate_size (int, optional, defaults to 21504): Dimension of the MLP representations.
num_hidden_layers (int, optional, defaults to 52): Number of hidden layers in the Transformer encoder.
hybrid_override_pattern (str, optional, defaults to "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"): The pattern of the hybrid model. The pattern is a string of characters where each character represents M: Mamba2, *: Attention, -: MLP.
num_attention_heads (int, optional, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder.
attention_head_dim (int, optional, defaults to 128): Dimension of each attention head.
num_key_value_heads (int, optional, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used.
mlp_hidden_act (str, optional, defaults to "relu2"): The non-linear activation function in the MLP layers.
attention_bias (bool, optional, defaults to False): Whether to use bias in attention layers.
mlp_bias (bool, optional, defaults to False): Whether to use bias in MLP layers.
use_bias (bool, optional, defaults to False): Whether to use bias in the model.
initializer_range (float, optional, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_epsilon (float, optional, defaults to 1e-5): The epsilon used by the layer normalization layers.
residual_in_fp32 (bool, optional, defaults to False): Whether or not residuals should be in float32. If set to False, residuals will keep the same dtype as the rest of the model.
use_cache (bool, optional, defaults to True): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.
num_logits_to_keep (int or None, optional, defaults to 1): Number of prompt logits to calculate during generation. If None, all logits will be calculated. If an integer value, only the last num_logits_to_keep logits will be calculated.
pad_token_id (int, optional, defaults to 0): The id of the padding token.
bos_token_id (int, optional, defaults to 1): The id of the "beginning-of-sequence" token.
eos_token_id (int, optional, defaults to 2): The id of the "end-of-sequence" token.
sliding_window (int, optional, defaults to None): Sliding window attention window size.
max_position_embeddings (int, optional, defaults to 4096): The maximum sequence length that this model might ever be used with.
attention_dropout (float, optional, defaults to 0.0): The dropout ratio for the attention probabilities.
hidden_dropout (float, optional, defaults to 0.0): The dropout ratio for the hidden states.
use_mamba_kernels (bool, optional, defaults to True): Flag indicating whether or not to use the fast mamba kernels. These are available only if mamba-ssm and causal-conv1d are installed, and the mamba modules are running on a CUDA device.
ssm_state_size (int, optional, defaults to 128): The dimension of the mamba state space latents.
mamba_num_heads (int, optional, defaults to 128): Number of heads in Mamba layers.
mamba_n_groups (int, optional, defaults to 8): Number of groups in Mamba layers.
mamba_head_dim (int, optional, defaults to 64): Dimension of each Mamba head.
mamba_d_conv (int, optional, defaults to 4): The size of the mamba convolution kernel.
mamba_expand (int, optional, defaults to 2): Expanding factor used to determine the mamba intermediate size.
mamba_hidden_act (str, optional, defaults to "silu"): The non-linear activation function in the Mamba layers.
mamba_dt_min (float, optional, defaults to 0.001): Minimum value for the time step in Mamba.
mamba_dt_max (float, optional, defaults to 0.1): Maximum value for the time step in Mamba.
mamba_dt_limit (tuple, optional, defaults to (0.0, float("inf"))): Limits for the time step in Mamba.
mamba_dt_init_floor (float, optional, defaults to 1e-4): Floor value for time step initialization in Mamba.
mamba_conv_bias (bool, optional, defaults to True): Whether to use bias in the convolution layer of the mamba mixer block.
mamba_proj_bias (bool, optional, defaults to False): Whether to use bias in the input and output projections of the mamba mixer block.
mamba_chunk_size (int, optional, defaults to 256): Size of chunks for Mamba processing.
rescale_prenorm_residual (bool, optional, defaults to True): Whether to rescale the pre-normalization residual connections.

Source code in vllm/transformers_utils/configs/nemotron_h.py
class NemotronHConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
    to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to
    that of the NemotronH-v0.1 model.
    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the NemotronH model. Defines the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`NemotronHModel`]
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be
            tied. Note that this is only relevant if the model has an output
            word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 52):
            Number of hidden layers in the Transformer encoder.
        hybrid_override_pattern (`str`, *optional*, defaults to
            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
            The pattern of the hybrid model. The pattern is a string of
            characters where each character represents
            M: Mamba2, *: Attention, -: MLP
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        attention_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA) otherwise GQA is used.
        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
            The non-linear activation function in the MLP layers.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in MLP layers.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
            Whether or not residuals should be in `float32`. If set to `False`
            residuals will keep the same `dtype` as the rest of the model.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`,
            all logits will be calculated. If an integer value, only last
            `num_logits_to_keep` logits will be calculated.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*, defaults to None):
            Sliding window attention window size.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the hidden states.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use the fast mamba kernels.
            These are available only if `mamba-ssm` and `causal-conv1d`
            are installed, and the mamba modules are running on a CUDA device.
        ssm_state_size (`int`, *optional*, defaults to 128):
            The dimension of the mamba state space latents.
        mamba_num_heads (`int`, *optional*, defaults to 128):
            Number of heads in Mamba layers.
        mamba_n_groups (`int`, *optional*, defaults to 8):
            Number of groups in Mamba layers.
        mamba_head_dim (`int`, *optional*, defaults to 64):
            Dimension of each Mamba head.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor used to determine the mamba intermediate size.
        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
            The non-linear activation function in the Mamba layers.
        mamba_dt_min (`float`, *optional*, defaults to 0.001):
            Minimum value for the time step in Mamba.
        mamba_dt_max (`float`, *optional*, defaults to 0.1):
            Maximum value for the time step in Mamba.
        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
            Limits for the time step in Mamba.
        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
            Floor value for time step initialization in Mamba.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the convolution layer of the mamba mixer
            block.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the input and output projections of the
            mamba mixer block.
        mamba_chunk_size (`int`, *optional*, defaults to 256):
            Size of chunks for Mamba processing.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether to rescale the pre-normalization residual connections.
    """

    model_type = "nemotron_h"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=131072,
        tie_word_embeddings=False,
        hidden_size=4096,
        intermediate_size=21504,
        num_hidden_layers=52,
        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
        num_attention_heads=32,
        head_dim=128,
        num_key_value_heads=8,  # nemo: num_query_groups
        mlp_hidden_act="relu2",
        attention_bias=False,
        mlp_bias=False,
        use_bias=False,
        initializer_range=0.02,  # nemo: init_method_std
        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
        residual_in_fp32=False,  #  Megatron Core default value
        use_cache=True,
        num_logits_to_keep=1,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        sliding_window=None,
        max_position_embeddings=4096,
        attention_dropout=0.0,
        hidden_dropout=0.0,  # * ADDED
        use_mamba_kernels=True,
        ssm_state_size=128,  # mamba_state_size
        mamba_num_heads=128,
        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
        mamba_head_dim=64,
        mamba_d_conv=4,
        mamba_expand=2,
        mamba_hidden_act="silu",
        mamba_dt_min=0.001,
        mamba_dt_max=0.1,
        mamba_dt_limit=(0.0, float("inf")),
        mamba_dt_init_floor=1e-4,
        mamba_conv_bias=True,
        mamba_proj_bias=False,
        mamba_chunk_size=256,
        rescale_prenorm_residual=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.tie_word_embeddings = tie_word_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.hybrid_override_pattern = hybrid_override_pattern
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.sliding_window = sliding_window
        self.max_position_embeddings = max_position_embeddings
        self.attention_dropout = attention_dropout
        self.hidden_dropout = hidden_dropout

        # Validate hybrid_override_pattern
        # M: Mamba2, *: Attention, -: MLP
        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
            "hybrid_override_pattern must have same length as "
            "num_hidden_layers")
        assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
            "hybrid_override_pattern must only contain characters "
            "'M', '*', or '-'")

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.mlp_hidden_act = mlp_hidden_act
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias
        self.use_bias = use_bias
        self.initializer_range = initializer_range
        self.layer_norm_epsilon = layer_norm_epsilon
        self.residual_in_fp32 = residual_in_fp32

        self.use_cache = use_cache
        self.num_logits_to_keep = num_logits_to_keep

        self.use_mamba_kernels = use_mamba_kernels
        self.n_groups = mamba_n_groups
        self.mamba_head_dim = mamba_head_dim
        self.ssm_state_size = ssm_state_size
        self.mamba_num_heads = mamba_num_heads
        self.conv_kernel = mamba_d_conv
        self.expand = mamba_expand
        self.mamba_hidden_act = mamba_hidden_act
        self.time_step_min = mamba_dt_min
        self.time_step_max = mamba_dt_max
        self.time_step_limit = mamba_dt_limit
        self.time_step_floor = mamba_dt_init_floor
        self.use_conv_bias = mamba_conv_bias
        self.mamba_proj_bias = mamba_proj_bias
        self.chunk_size = mamba_chunk_size
        self.rescale_prenorm_residual = rescale_prenorm_residual

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def layers_block_type(self):
        return [
            "mamba" if self.hybrid_override_pattern[i] == "M" else
            "attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
            for i in range(self.num_hidden_layers)
        ]

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

chunk_size instance-attribute

chunk_size = mamba_chunk_size

conv_kernel instance-attribute

conv_kernel = mamba_d_conv

expand instance-attribute

expand = mamba_expand

head_dim instance-attribute

head_dim = head_dim

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size

hybrid_override_pattern instance-attribute

hybrid_override_pattern = hybrid_override_pattern

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

layers_block_type property

layers_block_type

mamba_head_dim instance-attribute

mamba_head_dim = mamba_head_dim

mamba_hidden_act instance-attribute

mamba_hidden_act = mamba_hidden_act

mamba_num_heads instance-attribute

mamba_num_heads = mamba_num_heads

mamba_proj_bias instance-attribute

mamba_proj_bias = mamba_proj_bias

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_bias instance-attribute

mlp_bias = mlp_bias

mlp_hidden_act instance-attribute

mlp_hidden_act = mlp_hidden_act

model_type class-attribute instance-attribute

model_type = 'nemotron_h'

n_groups instance-attribute

n_groups = mamba_n_groups

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

num_logits_to_keep instance-attribute

num_logits_to_keep = num_logits_to_keep

rescale_prenorm_residual instance-attribute

rescale_prenorm_residual = rescale_prenorm_residual

residual_in_fp32 instance-attribute

residual_in_fp32 = residual_in_fp32

sliding_window instance-attribute

sliding_window = sliding_window

ssm_state_size instance-attribute

ssm_state_size = ssm_state_size

tie_word_embeddings instance-attribute

tie_word_embeddings = tie_word_embeddings

time_step_floor instance-attribute

time_step_floor = mamba_dt_init_floor

time_step_limit instance-attribute

time_step_limit = mamba_dt_limit

time_step_max instance-attribute

time_step_max = mamba_dt_max

time_step_min instance-attribute

time_step_min = mamba_dt_min

use_bias instance-attribute

use_bias = use_bias

use_cache instance-attribute

use_cache = use_cache

use_conv_bias instance-attribute

use_conv_bias = mamba_conv_bias

use_mamba_kernels instance-attribute

use_mamba_kernels = use_mamba_kernels

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=131072,
    tie_word_embeddings=False,
    hidden_size=4096,
    intermediate_size=21504,
    num_hidden_layers=52,
    hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
    num_attention_heads=32,
    head_dim=128,
    num_key_value_heads=8,
    mlp_hidden_act="relu2",
    attention_bias=False,
    mlp_bias=False,
    use_bias=False,
    initializer_range=0.02,
    layer_norm_epsilon=1e-05,
    residual_in_fp32=False,
    use_cache=True,
    num_logits_to_keep=1,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    sliding_window=None,
    max_position_embeddings=4096,
    attention_dropout=0.0,
    hidden_dropout=0.0,
    use_mamba_kernels=True,
    ssm_state_size=128,
    mamba_num_heads=128,
    mamba_n_groups=8,
    mamba_head_dim=64,
    mamba_d_conv=4,
    mamba_expand=2,
    mamba_hidden_act="silu",
    mamba_dt_min=0.001,
    mamba_dt_max=0.1,
    mamba_dt_limit=(0.0, float("inf")),
    mamba_dt_init_floor=0.0001,
    mamba_conv_bias=True,
    mamba_proj_bias=False,
    mamba_chunk_size=256,
    rescale_prenorm_residual=True,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron_h.py
def __init__(
    self,
    vocab_size=131072,
    tie_word_embeddings=False,
    hidden_size=4096,
    intermediate_size=21504,
    num_hidden_layers=52,
    hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
    num_attention_heads=32,
    head_dim=128,
    num_key_value_heads=8,  # nemo: num_query_groups
    mlp_hidden_act="relu2",
    attention_bias=False,
    mlp_bias=False,
    use_bias=False,
    initializer_range=0.02,  # nemo: init_method_std
    layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
    residual_in_fp32=False,  #  Megatron Core default value
    use_cache=True,
    num_logits_to_keep=1,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    sliding_window=None,
    max_position_embeddings=4096,
    attention_dropout=0.0,
    hidden_dropout=0.0,  # * ADDED
    use_mamba_kernels=True,
    ssm_state_size=128,  # mamba_state_size
    mamba_num_heads=128,
    mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
    mamba_head_dim=64,
    mamba_d_conv=4,
    mamba_expand=2,
    mamba_hidden_act="silu",
    mamba_dt_min=0.001,
    mamba_dt_max=0.1,
    mamba_dt_limit=(0.0, float("inf")),
    mamba_dt_init_floor=1e-4,
    mamba_conv_bias=True,
    mamba_proj_bias=False,
    mamba_chunk_size=256,
    rescale_prenorm_residual=True,
    **kwargs,
):
    self.vocab_size = vocab_size
    self.tie_word_embeddings = tie_word_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.hybrid_override_pattern = hybrid_override_pattern
    self.num_attention_heads = num_attention_heads
    self.head_dim = head_dim
    self.sliding_window = sliding_window
    self.max_position_embeddings = max_position_embeddings
    self.attention_dropout = attention_dropout
    self.hidden_dropout = hidden_dropout

    # Validate hybrid_override_pattern
    # M: Mamba2, *: Attention, -: MLP
    assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
        "hybrid_override_pattern must have same length as "
        "num_hidden_layers")
    assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
        "hybrid_override_pattern must only contain characters "
        "'M', '*', or '-'")

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.mlp_hidden_act = mlp_hidden_act
    self.attention_bias = attention_bias
    self.mlp_bias = mlp_bias
    self.use_bias = use_bias
    self.initializer_range = initializer_range
    self.layer_norm_epsilon = layer_norm_epsilon
    self.residual_in_fp32 = residual_in_fp32

    self.use_cache = use_cache
    self.num_logits_to_keep = num_logits_to_keep

    self.use_mamba_kernels = use_mamba_kernels
    self.n_groups = mamba_n_groups
    self.mamba_head_dim = mamba_head_dim
    self.ssm_state_size = ssm_state_size
    self.mamba_num_heads = mamba_num_heads
    self.conv_kernel = mamba_d_conv
    self.expand = mamba_expand
    self.mamba_hidden_act = mamba_hidden_act
    self.time_step_min = mamba_dt_min
    self.time_step_max = mamba_dt_max
    self.time_step_limit = mamba_dt_limit
    self.time_step_floor = mamba_dt_init_floor
    self.use_conv_bias = mamba_conv_bias
    self.mamba_proj_bias = mamba_proj_bias
    self.chunk_size = mamba_chunk_size
    self.rescale_prenorm_residual = rescale_prenorm_residual

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
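
As a sketch of how `hybrid_override_pattern` drives the layer layout: the pattern must be exactly `num_hidden_layers` characters long, and the `layers_block_type` property decodes each character (`M` = Mamba2, `*` = attention, `-` = MLP) into a per-layer block type:

```python
from vllm.transformers_utils.configs import NemotronHConfig

# Four layers: Mamba2, attention, MLP, Mamba2 (pattern length must match).
config = NemotronHConfig(hybrid_override_pattern="M*-M", num_hidden_layers=4)
assert config.layers_block_type == ["mamba", "attention", "mlp", "mamba"]
```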

Nemotron_Nano_VL_Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/nemotron_vl.py
class Nemotron_Nano_VL_Config(PretrainedConfig):
    model_type = 'Llama_Nemotron_Nano_VL'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        **kwargs
    ):
        super().__init__(**kwargs)

        if vision_config is not None:
            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
            vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
            self.vision_config = vision_auto_config(**vision_config)
        else:
            self.vision_config = PretrainedConfig()

        if llm_config is None:
            self.text_config = LlamaConfig()
        else:
            self.text_config = LlamaConfig(**llm_config)

        # Assign configuration values
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template  # TODO move out of here and into the tokenizer
        self.ps_version = ps_version  # Pixel shuffle version
        self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size

downsample_ratio instance-attribute

downsample_ratio = downsample_ratio

force_image_size instance-attribute

force_image_size = force_image_size

image_tag_type instance-attribute

image_tag_type = image_tag_type

is_composition class-attribute instance-attribute

is_composition = True

model_type class-attribute instance-attribute

model_type = 'Llama_Nemotron_Nano_VL'

projector_hidden_size instance-attribute

projector_hidden_size = projector_hidden_size

ps_version instance-attribute

ps_version = ps_version

template instance-attribute

template = template

text_config instance-attribute

text_config = LlamaConfig()

vision_config instance-attribute

vision_config = vision_auto_config(**vision_config)

vit_hidden_size instance-attribute

vit_hidden_size = vit_hidden_size

__init__

__init__(
    vision_config=None,
    llm_config=None,
    force_image_size=None,
    downsample_ratio=0.5,
    template=None,
    ps_version="v1",
    image_tag_type="internvl",
    projector_hidden_size=4096,
    vit_hidden_size=1280,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/nemotron_vl.py
def __init__(
    self,
    vision_config=None,
    llm_config=None,
    force_image_size=None,
    downsample_ratio=0.5,
    template=None,
    ps_version='v1',
    image_tag_type="internvl",
    projector_hidden_size=4096,
    vit_hidden_size=1280,
    **kwargs
):
    super().__init__(**kwargs)

    if vision_config is not None:
        assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
        vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
        self.vision_config = vision_auto_config(**vision_config)
    else:
        self.vision_config = PretrainedConfig()

    if llm_config is None:
        self.text_config = LlamaConfig()
    else:
        self.text_config = LlamaConfig(**llm_config)

    # Assign configuration values
    self.force_image_size = force_image_size
    self.downsample_ratio = downsample_ratio
    self.template = template  # TODO move out of here and into the tokenizer
    self.ps_version = ps_version  # Pixel shuffle version
    self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
    self.projector_hidden_size = projector_hidden_size
    self.vit_hidden_size = vit_hidden_size
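
A short usage sketch of the constructor above, with illustrative values only: when no vision_config with an auto_map entry is supplied, vision_config falls back to a bare PretrainedConfig, and an llm_config dict is turned into a LlamaConfig exposed as text_config.

from transformers import PretrainedConfig

from vllm.transformers_utils.configs import Nemotron_Nano_VL_Config

cfg = Nemotron_Nano_VL_Config(
    llm_config={"hidden_size": 4096, "num_hidden_layers": 32},  # illustrative values
    force_image_size=512,
)
print(type(cfg.text_config).__name__)                   # LlamaConfig
print(isinstance(cfg.vision_config, PretrainedConfig))  # True (fallback, no dynamic vision class)
print(cfg.downsample_ratio, cfg.ps_version)             # 0.5 v1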

Olmo3Config

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/olmo3.py
class Olmo3Config(PretrainedConfig):

    model_type = "olmo3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50304,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=None,
        eos_token_id=50279,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        rms_norm_eps=1e-5,
        sliding_window=4096,
        layer_types=None,
        **kwargs,
    ):
        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
        # in vLLM.
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["Olmo2ForCausalLM"]
        elif "Olmo3ForCausalLM" in kwargs["architectures"]:
            kwargs["architectures"].remove("Olmo3ForCausalLM")
            kwargs["architectures"].append("Olmo2ForCausalLM")

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        self.rms_norm_eps = rms_norm_eps

        self.sliding_window = sliding_window
        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
                for i in range(self.num_hidden_layers)
            ]

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_types instance-attribute

layer_types = layer_types

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

model_type class-attribute instance-attribute

model_type = 'olmo3'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

sliding_window instance-attribute

sliding_window = sliding_window

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=50304,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    use_cache=True,
    pad_token_id=1,
    bos_token_id=None,
    eos_token_id=50279,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    rms_norm_eps=1e-05,
    sliding_window=4096,
    layer_types=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/olmo3.py
def __init__(
    self,
    vocab_size=50304,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    use_cache=True,
    pad_token_id=1,
    bos_token_id=None,
    eos_token_id=50279,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    rms_norm_eps=1e-5,
    sliding_window=4096,
    layer_types=None,
    **kwargs,
):
    # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
    # in vLLM.
    if "architectures" not in kwargs:
        kwargs["architectures"] = ["Olmo2ForCausalLM"]
    elif "Olmo3ForCausalLM" in kwargs["architectures"]:
        kwargs["architectures"].remove("Olmo3ForCausalLM")
        kwargs["architectures"].append("Olmo2ForCausalLM")

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout

    self.rms_norm_eps = rms_norm_eps

    self.sliding_window = sliding_window
    self.layer_types = layer_types
    if self.layer_types is None:
        self.layer_types = [
            "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
            for i in range(self.num_hidden_layers)
        ]
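
A minimal sketch of the two behaviors implemented above: the architectures entry is rewritten so vLLM reuses its Olmo2ForCausalLM implementation, and when layer_types is unset every fourth layer defaults to full attention while the rest use sliding-window attention.

from vllm.transformers_utils.configs import Olmo3Config

cfg = Olmo3Config(architectures=["Olmo3ForCausalLM"])
print(cfg.architectures)    # ['Olmo2ForCausalLM']

# Default hybrid layout: three sliding-window layers, then one full-attention layer, repeating.
print(cfg.layer_types[:4])  # ['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']
print(cfg.sliding_window)   # 4096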

OvisConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/ovis.py
class OvisConfig(PretrainedConfig):
    model_type = "ovis"

    def __init__(self,
                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
                                                         dict]] = None,
                 multimodal_max_length=8192,
                 hidden_size=None,
                 conversation_formatter_class=None,
                 llm_attn_implementation=None,
                 disable_tie_weight=False,
                 **kwargs):
        super().__init__(**kwargs)
        if llm_config is not None:
            assert isinstance(llm_config, (PretrainedConfig, dict)), \
                f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
            if not isinstance(llm_config, PretrainedConfig):
                model_type = llm_config['model_type']
                llm_config.pop('model_type')
                llm_config = AutoConfig.for_model(model_type, **llm_config)

        # map llm_config to text_config
        self.text_config = llm_config
        if visual_tokenizer_config is not None:
            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
                f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
            if not isinstance(visual_tokenizer_config, PretrainedConfig):
                model_type = visual_tokenizer_config['model_type']
                visual_tokenizer_config.pop('model_type')
                visual_tokenizer_config = AutoConfig.for_model(
                    model_type, **visual_tokenizer_config)

        self.visual_tokenizer_config = visual_tokenizer_config
        self.multimodal_max_length = multimodal_max_length
        self.hidden_size = hidden_size
        self.conversation_formatter_class = conversation_formatter_class
        self.llm_attn_implementation = llm_attn_implementation
        self.disable_tie_weight = disable_tie_weight

conversation_formatter_class instance-attribute

conversation_formatter_class = conversation_formatter_class

disable_tie_weight instance-attribute

disable_tie_weight = disable_tie_weight

hidden_size instance-attribute

hidden_size = hidden_size

llm_attn_implementation instance-attribute

llm_attn_implementation = llm_attn_implementation

model_type class-attribute instance-attribute

model_type = 'ovis'

multimodal_max_length instance-attribute

multimodal_max_length = multimodal_max_length

text_config instance-attribute

text_config = llm_config

visual_tokenizer_config instance-attribute

visual_tokenizer_config = visual_tokenizer_config

__init__

__init__(
    llm_config: Optional[
        Union[PretrainedConfig, dict]
    ] = None,
    visual_tokenizer_config: Optional[
        Union[PretrainedConfig, dict]
    ] = None,
    multimodal_max_length=8192,
    hidden_size=None,
    conversation_formatter_class=None,
    llm_attn_implementation=None,
    disable_tie_weight=False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/ovis.py
def __init__(self,
             llm_config: Optional[Union[PretrainedConfig, dict]] = None,
             visual_tokenizer_config: Optional[Union[PretrainedConfig,
                                                     dict]] = None,
             multimodal_max_length=8192,
             hidden_size=None,
             conversation_formatter_class=None,
             llm_attn_implementation=None,
             disable_tie_weight=False,
             **kwargs):
    super().__init__(**kwargs)
    if llm_config is not None:
        assert isinstance(llm_config, (PretrainedConfig, dict)), \
            f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
        if not isinstance(llm_config, PretrainedConfig):
            model_type = llm_config['model_type']
            llm_config.pop('model_type')
            llm_config = AutoConfig.for_model(model_type, **llm_config)

    # map llm_config to text_config
    self.text_config = llm_config
    if visual_tokenizer_config is not None:
        assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
            f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
        if not isinstance(visual_tokenizer_config, PretrainedConfig):
            model_type = visual_tokenizer_config['model_type']
            visual_tokenizer_config.pop('model_type')
            visual_tokenizer_config = AutoConfig.for_model(
                model_type, **visual_tokenizer_config)

    self.visual_tokenizer_config = visual_tokenizer_config
    self.multimodal_max_length = multimodal_max_length
    self.hidden_size = hidden_size
    self.conversation_formatter_class = conversation_formatter_class
    self.llm_attn_implementation = llm_attn_implementation
    self.disable_tie_weight = disable_tie_weight
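
A minimal sketch of the dict-to-config promotion above: a plain llm_config dict has its model_type popped and is rebuilt through AutoConfig.for_model, then exposed as text_config. The values are illustrative only.

from vllm.transformers_utils.configs import OvisConfig

cfg = OvisConfig(
    llm_config={"model_type": "llama", "hidden_size": 2048},  # illustrative values
    multimodal_max_length=4096,
)
print(type(cfg.text_config).__name__)  # LlamaConfig, built via AutoConfig.for_model("llama", ...)
print(cfg.text_config.hidden_size)     # 2048
print(cfg.visual_tokenizer_config)     # None when no visual tokenizer config is supplied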

Qwen3NextConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [Qwen3NextModel]. It is used to instantiate a Qwen3-Next model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of Qwen3-Next-80B-A3B-Instruct (https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

Name Type Description Default
vocab_size `int`, *optional*, defaults to 151936

Vocabulary size of the model. Defines the number of different tokens that can be represented by the inputs_ids.

151936
hidden_size `int`, *optional*, defaults to 2048

Dimension of the hidden representations.

2048
intermediate_size `int`, *optional*, defaults to 5632

Dimension of the MLP representations.

5632
num_hidden_layers `int`, *optional*, defaults to 48

Number of hidden layers in the Transformer encoder.

48
num_attention_heads `int`, *optional*, defaults to 16

Number of attention heads for each attention layer in the Transformer encoder.

16
num_key_value_heads `int`, *optional*, defaults to 2

This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group's key and value head should be constructed by mean-pooling all the original heads within that group. For more details, see https://arxiv.org/pdf/2305.13245.pdf. If it is not specified, it defaults to 2.

2
hidden_act `str`, *optional*, defaults to `"silu"`

The non-linear activation function in the decoder.

'silu'
max_position_embeddings `int`, *optional*, defaults to 32768

The maximum sequence length that this model might ever be used with.

32768
initializer_range `float`, *optional*, defaults to 0.02

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

0.02
rms_norm_eps `float`, *optional*, defaults to 1e-06

The epsilon used by the rms normalization layers.

1e-06
use_cache `bool`, *optional*, defaults to `True`

Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.

True
tie_word_embeddings `bool`, *optional*, defaults to `False`

Whether the model's input and output word embeddings should be tied.

False
rope_theta `float`, *optional*, defaults to 10000.0

The base period of the RoPE embeddings.

10000.0
rope_scaling `Dict`, *optional*

Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type and you expect the model to work on a longer max_position_embeddings, we recommend you update this value accordingly. Expected contents:

  • rope_type (str): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
  • factor (float, optional): Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In most scaling types, a factor of x will enable the model to handle sequences of length x * the original maximum pre-trained length.
  • original_max_position_embeddings (int, optional): Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during pretraining.
  • attention_factor (float, optional): Used with 'yarn' and 'longrope'. The scaling factor to be applied to the attention computation. If unspecified, it defaults to the value recommended by the implementation, using the factor field to infer the suggested value.
  • beta_fast (float, optional): Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear ramp function. If unspecified, it defaults to 32.
  • beta_slow (float, optional): Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear ramp function. If unspecified, it defaults to 1.
  • short_factor (List[float], optional): Only used with 'longrope'. The scaling factor to be applied to short contexts (< original_max_position_embeddings). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2.
  • long_factor (List[float], optional): Only used with 'longrope'. The scaling factor to be applied to long contexts (> original_max_position_embeddings). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2.
  • low_freq_factor (float, optional): Only used with 'llama3'. Scaling factor applied to the low-frequency components of the RoPE.
  • high_freq_factor (float, optional): Only used with 'llama3'. Scaling factor applied to the high-frequency components of the RoPE.

None
partial_rotary_factor `float`, *optional*, defaults to 0.25

Percentage of the query and keys which will have rotary embedding.

0.25
attention_bias `bool`, *optional*, defaults to `False`

Whether to use a bias in the query, key, value and output projection layers during self-attention.

False
attention_dropout `float`, *optional*, defaults to 0.0

The dropout ratio for the attention probabilities.

0.0
head_dim `int`, *optional*, defaults to 256

Projection weights dimension in multi-head attention.

256
linear_conv_kernel_dim `int`, *optional*, defaults to 4

Kernel size of the convolution used in linear attention layers.

4
linear_key_head_dim `int`, *optional*, defaults to 128

Dimension of each key head in linear attention.

128
linear_value_head_dim `int`, *optional*, defaults to 128

Dimension of each value head in linear attention.

128
linear_num_key_heads `int`, *optional*, defaults to 16

Number of key heads used in linear attention layers.

16
linear_num_value_heads `int`, *optional*, defaults to 32

Number of value heads used in linear attention layers.

32
decoder_sparse_step `int`, *optional*, defaults to 1

The frequency of the MoE layer.

1
moe_intermediate_size `int`, *optional*, defaults to 512

Intermediate size of the routed expert.

512
shared_expert_intermediate_size `int`, *optional*, defaults to 512

Intermediate size of the shared expert.

512
num_experts_per_tok `int`, *optional*, defaults to 10

Number of selected experts.

10
num_experts `int`, *optional*, defaults to 512

Number of routed experts.

512
norm_topk_prob `bool`, *optional*, defaults to `True`

Whether to normalize the topk probabilities.

True
output_router_logits `bool`, *optional*, defaults to `False`

Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss, including load balancing loss and router z-loss.

False
router_aux_loss_coef `float`, *optional*, defaults to 0.001

The aux loss factor for the total loss.

0.001
mlp_only_layers `list[int]`, *optional*, defaults to `[]`

Indicates which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock. The list contains layer indices from 0 to num_layers-1 if we have num_layers layers. If mlp_only_layers is empty, decoder_sparse_step is used to determine the sparsity.

None
layer_types `list[str]`, *optional*

Types of each layer (attention or linear).

None
>>> from transformers import Qwen3NextModel, Qwen3NextConfig

>>> # Initializing a Qwen3Next style configuration
>>> configuration =  Qwen3NextConfig()

>>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
>>> model = Qwen3NextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
Source code in vllm/transformers_utils/configs/qwen3_next.py
class Qwen3NextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
    Qwen3-Next model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of
    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids`.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*, defaults to 256):
            Projection weights dimension in multi-head attention.
        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
            Kernel size of the convolution used in linear attention layers.
        linear_key_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each key head in linear attention.
        linear_value_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each value head in linear attention.
        linear_num_key_heads (`int`, *optional*, defaults to 16):
            Number of key heads used in linear attention layers.
        linear_num_value_heads (`int`, *optional*, defaults to 32):
            Number of value heads used in linear attention layers.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the routed expert.
        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the shared expert.
        num_experts_per_tok (`int`, *optional*, defaults to 10):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 512):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
            Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
        layer_types (`list[str]`, *optional*):
            Types of each layer (attention or linear).

    ```python
    >>> from transformers import Qwen3NextModel, Qwen3NextConfig

    >>> # Initializing a Qwen3Next style configuration
    >>> configuration =  Qwen3NextConfig()

    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
    >>> model = Qwen3NextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """  # noqa: E501

    model_type = "qwen3_next"
    keys_to_ignore_at_inference = ["past_key_values"]

    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.*.gate_proj": "colwise",
        "layers.*.mlp.experts.*.up_proj": "colwise",
        "layers.*.mlp.experts.*.down_proj": "rowwise",
        "layers.*.mlp.shared_experts.gate_proj": "colwise",
        "layers.*.mlp.shared_experts.up_proj": "colwise",
        "layers.*.mlp.shared_experts.down_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.25,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        decoder_sparse_step=1,
        moe_intermediate_size=512,
        shared_expert_intermediate_size=512,
        num_experts_per_tok=10,
        num_experts=512,
        norm_topk_prob=True,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        layer_types=None,
        **kwargs,
    ):
        if mlp_only_layers is None:
            mlp_only_layers = []
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
                "linear_attention" if bool((i + 1) % 4) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

        # linear attention part
        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.mlp_only_layers = mlp_only_layers

attention_bias instance-attribute

attention_bias = attention_bias

attention_dropout instance-attribute

attention_dropout = attention_dropout

base_model_pp_plan class-attribute instance-attribute

base_model_pp_plan = {
    "embed_tokens": (["input_ids"], ["inputs_embeds"]),
    "layers": (
        ["hidden_states", "attention_mask"],
        ["hidden_states"],
    ),
    "norm": (["hidden_states"], ["hidden_states"]),
}

base_model_tp_plan class-attribute instance-attribute

base_model_tp_plan = {
    "layers.*.self_attn.q_proj": "colwise",
    "layers.*.self_attn.k_proj": "colwise",
    "layers.*.self_attn.v_proj": "colwise",
    "layers.*.self_attn.o_proj": "rowwise",
    "layers.*.mlp.experts.*.gate_proj": "colwise",
    "layers.*.mlp.experts.*.up_proj": "colwise",
    "layers.*.mlp.experts.*.down_proj": "rowwise",
    "layers.*.mlp.shared_experts.gate_proj": "colwise",
    "layers.*.mlp.shared_experts.up_proj": "colwise",
    "layers.*.mlp.shared_experts.down_proj": "rowwise",
    "layers.*.mlp.gate_proj": "colwise",
    "layers.*.mlp.up_proj": "colwise",
    "layers.*.mlp.down_proj": "rowwise",
}

decoder_sparse_step instance-attribute

decoder_sparse_step = decoder_sparse_step

head_dim instance-attribute

head_dim = head_dim

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_types instance-attribute

layer_types = layer_types

linear_conv_kernel_dim instance-attribute

linear_conv_kernel_dim = linear_conv_kernel_dim

linear_key_head_dim instance-attribute

linear_key_head_dim = linear_key_head_dim

linear_num_key_heads instance-attribute

linear_num_key_heads = linear_num_key_heads

linear_num_value_heads instance-attribute

linear_num_value_heads = linear_num_value_heads

linear_value_head_dim instance-attribute

linear_value_head_dim = linear_value_head_dim

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

mlp_only_layers instance-attribute

mlp_only_layers = mlp_only_layers

model_type class-attribute instance-attribute

model_type = 'qwen3_next'

moe_intermediate_size instance-attribute

moe_intermediate_size = moe_intermediate_size

norm_topk_prob instance-attribute

norm_topk_prob = norm_topk_prob

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_experts instance-attribute

num_experts = num_experts

num_experts_per_tok instance-attribute

num_experts_per_tok = num_experts_per_tok

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

output_router_logits instance-attribute

output_router_logits = output_router_logits

partial_rotary_factor instance-attribute

partial_rotary_factor = partial_rotary_factor

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

router_aux_loss_coef instance-attribute

router_aux_loss_coef = router_aux_loss_coef

shared_expert_intermediate_size instance-attribute

shared_expert_intermediate_size = (
    shared_expert_intermediate_size
)

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=151936,
    hidden_size=2048,
    intermediate_size=5632,
    num_hidden_layers=48,
    num_attention_heads=16,
    num_key_value_heads=2,
    hidden_act="silu",
    max_position_embeddings=32768,
    initializer_range=0.02,
    rms_norm_eps=1e-06,
    use_cache=True,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.25,
    attention_bias=False,
    attention_dropout=0.0,
    head_dim=256,
    linear_conv_kernel_dim=4,
    linear_key_head_dim=128,
    linear_value_head_dim=128,
    linear_num_key_heads=16,
    linear_num_value_heads=32,
    decoder_sparse_step=1,
    moe_intermediate_size=512,
    shared_expert_intermediate_size=512,
    num_experts_per_tok=10,
    num_experts=512,
    norm_topk_prob=True,
    output_router_logits=False,
    router_aux_loss_coef=0.001,
    mlp_only_layers=None,
    layer_types=None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/qwen3_next.py
def __init__(
    self,
    vocab_size=151936,
    hidden_size=2048,
    intermediate_size=5632,
    num_hidden_layers=48,
    num_attention_heads=16,
    num_key_value_heads=2,
    hidden_act="silu",
    max_position_embeddings=32768,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    rope_scaling=None,
    partial_rotary_factor=0.25,
    attention_bias=False,
    attention_dropout=0.0,
    head_dim=256,
    linear_conv_kernel_dim=4,
    linear_key_head_dim=128,
    linear_value_head_dim=128,
    linear_num_key_heads=16,
    linear_num_value_heads=32,
    decoder_sparse_step=1,
    moe_intermediate_size=512,
    shared_expert_intermediate_size=512,
    num_experts_per_tok=10,
    num_experts=512,
    norm_topk_prob=True,
    output_router_logits=False,
    router_aux_loss_coef=0.001,
    mlp_only_layers=None,
    layer_types=None,
    **kwargs,
):
    if mlp_only_layers is None:
        mlp_only_layers = []
    super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.partial_rotary_factor = partial_rotary_factor
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.head_dim = head_dim
    rope_config_validation(self)

    self.layer_types = layer_types
    if self.layer_types is None:
        self.layer_types = [
            "linear_attention" if bool((i + 1) % 4) else "full_attention"
            for i in range(self.num_hidden_layers)
        ]
    layer_type_validation(self.layer_types)

    # linear attention part
    self.linear_conv_kernel_dim = linear_conv_kernel_dim
    self.linear_key_head_dim = linear_key_head_dim
    self.linear_value_head_dim = linear_value_head_dim
    self.linear_num_key_heads = linear_num_key_heads
    self.linear_num_value_heads = linear_num_value_heads

    # MoE arguments
    self.decoder_sparse_step = decoder_sparse_step
    self.moe_intermediate_size = moe_intermediate_size
    self.shared_expert_intermediate_size = shared_expert_intermediate_size
    self.num_experts_per_tok = num_experts_per_tok
    self.num_experts = num_experts
    self.norm_topk_prob = norm_topk_prob
    self.output_router_logits = output_router_logits
    self.router_aux_loss_coef = router_aux_loss_coef
    self.mlp_only_layers = mlp_only_layers
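
In addition to the docstring example above, a quick sketch of the default hybrid layer layout produced by the constructor: with layer_types unset, every fourth layer is full attention and the remaining layers use linear attention.

from vllm.transformers_utils.configs import Qwen3NextConfig

cfg = Qwen3NextConfig()
print(cfg.layer_types[:4])
# ['linear_attention', 'linear_attention', 'linear_attention', 'full_attention']
print(cfg.layer_types.count("full_attention"))   # 12 of the 48 default layers
print(cfg.num_experts, cfg.num_experts_per_tok)  # 512 10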

RWConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/falcon.py
class RWConfig(PretrainedConfig):
    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        return not self.alibi

alibi instance-attribute

alibi = alibi

attention_dropout instance-attribute

attention_dropout = attention_dropout

attribute_map class-attribute instance-attribute

attribute_map = {
    "num_hidden_layers": "n_layer",
    "num_attention_heads": "n_head",
    "num_kv_heads": "n_head_kv",
}

bias instance-attribute

bias = bias

bos_token_id instance-attribute

bos_token_id = bos_token_id

eos_token_id instance-attribute

eos_token_id = eos_token_id

head_dim property

head_dim

hidden_dropout instance-attribute

hidden_dropout = hidden_dropout

hidden_size instance-attribute

hidden_size = hidden_size if n_embed is None else n_embed

initializer_range instance-attribute

initializer_range = initializer_range

keys_to_ignore_at_inference class-attribute instance-attribute

keys_to_ignore_at_inference = ['past_key_values']

layer_norm_epsilon instance-attribute

layer_norm_epsilon = layer_norm_epsilon

model_type class-attribute instance-attribute

model_type = 'falcon'

multi_query instance-attribute

multi_query = multi_query

n_head instance-attribute

n_head = n_head

n_head_kv instance-attribute

n_head_kv = 1 if n_head_kv is None else n_head_kv

n_layer instance-attribute

n_layer = n_layer

new_decoder_architecture instance-attribute

new_decoder_architecture = new_decoder_architecture

parallel_attn instance-attribute

parallel_attn = parallel_attn

rotary property

rotary

use_cache instance-attribute

use_cache = use_cache

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    vocab_size=250880,
    hidden_size=64,
    n_layer=2,
    n_head=8,
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    multi_query=True,
    n_head_kv=None,
    alibi=False,
    bias=False,
    parallel_attn=False,
    new_decoder_architecture=False,
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/falcon.py
def __init__(
    self,
    vocab_size=250880,
    hidden_size=64,
    n_layer=2,
    n_head=8,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    multi_query=True,
    n_head_kv=None,
    alibi=False,
    bias=False,
    parallel_attn=False,
    new_decoder_architecture=False,
    **kwargs,
) -> None:
    self.vocab_size = vocab_size
    # Backward compatibility with n_embed kwarg
    n_embed = kwargs.pop("n_embed", None)
    self.hidden_size = hidden_size if n_embed is None else n_embed
    self.n_layer = n_layer
    self.n_head = n_head
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.use_cache = use_cache
    self.hidden_dropout = hidden_dropout
    self.attention_dropout = attention_dropout

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id
    self.multi_query = multi_query
    self.n_head_kv = 1 if n_head_kv is None else n_head_kv
    self.alibi = alibi
    self.bias = bias
    self.parallel_attn = parallel_attn
    self.new_decoder_architecture = new_decoder_architecture

    if self.hidden_size == 8192:
        # Hack for falcon-40b
        self.new_decoder_architecture = True

    super().__init__(bos_token_id=bos_token_id,
                     eos_token_id=eos_token_id,
                     **kwargs)
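
A small sketch of the attribute_map and the derived properties defined above: the HF-style names resolve to the legacy RW/Falcon fields, and head_dim and rotary are computed from them.

from vllm.transformers_utils.configs import RWConfig

cfg = RWConfig(hidden_size=64, n_layer=2, n_head=8)
print(cfg.num_hidden_layers, cfg.num_attention_heads)  # 2 8 (via attribute_map)
print(cfg.num_kv_heads)  # 1 (n_head_kv defaults to 1 when unspecified)
print(cfg.head_dim)      # 8, i.e. hidden_size // n_head
print(cfg.rotary)        # True while alibi=False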

RadioConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a Radio vision model. It is used to instantiate a Radio model according to the specified arguments, defining the model architecture.

Parameters:

Name Type Description Default
model_name str

Name of the vision transformer model (e.g., "vit_base_patch16_224"). Used to determine architecture dimensions from VIT_TIMM_DIM_BY_NAME.

required
image_size int

The size (resolution) of each image.

224
patch_size int

The size (resolution) of each patch.

16
qkv_bias bool

Whether to add a bias to the queries, keys and values.

True
qk_normalization bool

Whether to apply normalization to queries and keys.

False
norm_type str

The normalization type to use.

'layer_norm'
layer_norm_eps float

The epsilon used by the layer normalization layers.

1e-06
initializer_factor float

A factor for initializing all weight matrices.

1.0
hidden_act str

The non-linear activation function in the encoder.

'gelu'
max_img_size int

Maximum image size for position embeddings.

2048
norm_mean Union[tuple[float, float, float], list]

Mean values for image normalization (RGB channels). Defaults to (0.48145466, 0.4578275, 0.40821073).

OPENAI_CLIP_MEAN
norm_std Union[tuple[float, float, float], list]

Standard deviation values for image normalization (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711).

OPENAI_CLIP_STD
reg_tokens Optional[int]

Number of register tokens to use.

None
Source code in vllm/transformers_utils/configs/radio.py
class RadioConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a Radio
    vision model. It is used to instantiate a Radio model according to the
    specified arguments, defining the model architecture.

    Args:
        model_name: Name of the vision transformer model
            (e.g., "vit_base_patch16_224"). Used to determine architecture
            dimensions from `VIT_TIMM_DIM_BY_NAME`.
        image_size: The size (resolution) of each image.
        patch_size: The size (resolution) of each patch.
        qkv_bias: Whether to add a bias to the queries, keys and values.
        qk_normalization: Whether to apply normalization to queries and keys.
        norm_type: The normalization type to use.
        layer_norm_eps: The epsilon used by the layer normalization layers.
        initializer_factor: A factor for initializing all weight matrices.
        hidden_act: The non-linear activation function in the encoder.
        max_img_size: Maximum image size for position embeddings.
        norm_mean: Mean values for image normalization (RGB channels).
            Defaults to (0.48145466, 0.4578275, 0.40821073)).
        norm_std: Standard deviation values for image normalization
            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
        reg_tokens: Number of register tokens to use.
    """

    model_type = "radio"

    def __init__(
        self,
        model_name: str,
        image_size: int = 224,
        patch_size: int = 16,
        qkv_bias: bool = True,
        qk_normalization: bool = False,
        norm_type: str = "layer_norm",
        layer_norm_eps: float = 1e-6,
        initializer_factor: float = 1.0,
        hidden_act: str = "gelu",
        max_img_size: int = 2048,
        norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
        norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
        reg_tokens: Optional[int] = None,
        **kwargs,
    ):
        self.model_name = model_name
        (
            self.hidden_size,
            self.num_hidden_layers,
            self.num_attention_heads,
            self.intermediate_size,
        ) = VIT_TIMM_DIM_BY_NAME[model_name]
        self.image_size = image_size
        self.patch_size = patch_size
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.norm_type = norm_type
        self.layer_norm_eps = layer_norm_eps
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act
        self.max_img_size = max_img_size
        self.norm_mean = list(norm_mean) if isinstance(norm_mean,
                                                       (tuple,
                                                        list)) else norm_mean
        self.norm_std = list(norm_std) if isinstance(norm_std,
                                                     (tuple,
                                                      list)) else norm_std
        self.reg_tokens = reg_tokens
        super().__init__(**kwargs)

hidden_act instance-attribute

hidden_act = hidden_act

image_size instance-attribute

image_size = image_size

initializer_factor instance-attribute

initializer_factor = initializer_factor

layer_norm_eps instance-attribute

layer_norm_eps = layer_norm_eps

max_img_size instance-attribute

max_img_size = max_img_size

model_name instance-attribute

model_name = model_name

model_type class-attribute instance-attribute

model_type = 'radio'

norm_mean instance-attribute

norm_mean = (
    list(norm_mean)
    if isinstance(norm_mean, (tuple, list))
    else norm_mean
)

norm_std instance-attribute

norm_std = (
    list(norm_std)
    if isinstance(norm_std, (tuple, list))
    else norm_std
)

norm_type instance-attribute

norm_type = norm_type

patch_size instance-attribute

patch_size = patch_size

qk_normalization instance-attribute

qk_normalization = qk_normalization

qkv_bias instance-attribute

qkv_bias = qkv_bias

reg_tokens instance-attribute

reg_tokens = reg_tokens

__init__

__init__(
    model_name: str,
    image_size: int = 224,
    patch_size: int = 16,
    qkv_bias: bool = True,
    qk_normalization: bool = False,
    norm_type: str = "layer_norm",
    layer_norm_eps: float = 1e-06,
    initializer_factor: float = 1.0,
    hidden_act: str = "gelu",
    max_img_size: int = 2048,
    norm_mean: Union[
        tuple[float, float, float], list
    ] = OPENAI_CLIP_MEAN,
    norm_std: Union[
        tuple[float, float, float], list
    ] = OPENAI_CLIP_STD,
    reg_tokens: Optional[int] = None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/radio.py
def __init__(
    self,
    model_name: str,
    image_size: int = 224,
    patch_size: int = 16,
    qkv_bias: bool = True,
    qk_normalization: bool = False,
    norm_type: str = "layer_norm",
    layer_norm_eps: float = 1e-6,
    initializer_factor: float = 1.0,
    hidden_act: str = "gelu",
    max_img_size: int = 2048,
    norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
    norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
    reg_tokens: Optional[int] = None,
    **kwargs,
):
    self.model_name = model_name
    (
        self.hidden_size,
        self.num_hidden_layers,
        self.num_attention_heads,
        self.intermediate_size,
    ) = VIT_TIMM_DIM_BY_NAME[model_name]
    self.image_size = image_size
    self.patch_size = patch_size
    self.qkv_bias = qkv_bias
    self.qk_normalization = qk_normalization
    self.norm_type = norm_type
    self.layer_norm_eps = layer_norm_eps
    self.initializer_factor = initializer_factor
    self.hidden_act = hidden_act
    self.max_img_size = max_img_size
    self.norm_mean = list(norm_mean) if isinstance(norm_mean,
                                                   (tuple,
                                                    list)) else norm_mean
    self.norm_std = list(norm_std) if isinstance(norm_std,
                                                 (tuple,
                                                  list)) else norm_std
    self.reg_tokens = reg_tokens
    super().__init__(**kwargs)
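
A usage sketch for the constructor above. The timm model name follows the docstring's own example and is assumed here to be a key of VIT_TIMM_DIM_BY_NAME; the architecture dimensions are looked up from that table rather than passed explicitly.

from vllm.transformers_utils.configs import RadioConfig

# Assumption: "vit_base_patch16_224" (the docstring's example) is registered in VIT_TIMM_DIM_BY_NAME.
cfg = RadioConfig(model_name="vit_base_patch16_224", image_size=224, patch_size=16)
print(cfg.hidden_size, cfg.num_hidden_layers,
      cfg.num_attention_heads, cfg.intermediate_size)  # looked up from VIT_TIMM_DIM_BY_NAME
print(cfg.norm_mean)  # OPENAI_CLIP_MEAN, stored as a plain list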

SpeculatorsConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/speculators/base.py
class SpeculatorsConfig(PretrainedConfig):
    model_type = "speculators"

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "SpeculatorsConfig":
        """Load speculators Eagle config and convert to vLLM format."""
        config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
                                             **kwargs)

        vllm_config = cls.extract_vllm_speculative_config(config_dict)
        return cls(**vllm_config)

    @classmethod
    def extract_vllm_speculative_config(
            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
        speculators_model_type = config_dict.get("speculators_model_type")
        if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
            raise ValueError(
                f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
                "Please ensure you're loading a speculators-format model.")

        # validate fields
        # TODO: @dsikka - use speculators pydantic model to validate
        cls.validate_speculators_config(config_dict=config_dict)
        # Convert from speculators config -> format that can be ingested by vLLM
        vllm_config = cls.build_vllm_speculative_config(
            config_dict=config_dict)
        # Apply anything specific to the supported algorithm
        algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
        algo_updater(config_dict=config_dict, vllm_config=vllm_config)
        return vllm_config

    @classmethod
    def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
        try:
            spec_config = config_dict["speculators_config"]
            methods = spec_config["proposal_methods"]
            first_method = methods[0]
            _ = first_method["speculative_tokens"]
            _ = spec_config["verifier"]["name_or_path"]
            _ = config_dict["speculators_model_type"]
        except (KeyError, IndexError, TypeError) as e:
            raise ValueError("Invalid speculators config structure") from e

        if "transformer_layer_config" not in config_dict:
            raise ValueError("Must provide transformer_layer_config")

        if not isinstance(config_dict["transformer_layer_config"], dict):
            raise TypeError(
                "'transformer_layer_config' must be a dictionary if provided")

    @classmethod
    def build_vllm_speculative_config(
            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
        """
        Build vLLM-compatible speculative configuration from speculators format.

        This method extracts and transforms speculative configuration from the
        speculators format into the structure expected by vLLM.

        Args:
            config_dict: Configuration dictionary in speculators format

        Returns:
            Dictionary with vLLM-compatible speculative configuration
        """
        # Extract speculators configuration
        spec_config = config_dict["speculators_config"]

        # Currently we only support one proposal method
        proposal_methods = spec_config.get("proposal_methods")
        if not proposal_methods:
            raise ValueError("No proposal methods found in speculators config")

        first_method = proposal_methods[0]
        num_speculative_tokens = first_method.get("speculative_tokens")

        if num_speculative_tokens is None:
            raise ValueError(
                "Missing 'speculative_tokens' in proposal method. "
                f"Got: {first_method}")

        # Build base vLLM speculative configuration
        vllm_config = {
            "method": config_dict.get("speculators_model_type"),
            "num_speculative_tokens": num_speculative_tokens,
            "target_model": spec_config.get("verifier")["name_or_path"]
        }

        # Merge transformer layer configuration if present
        transformer_config = config_dict.get("transformer_layer_config", {})
        vllm_config.update(transformer_config)

        return vllm_config
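
A minimal conversion sketch using build_vllm_speculative_config directly; extract_vllm_speculative_config additionally checks speculators_model_type against SUPPORTED_SPECULATORS_TYPES, validates the dict, and applies the per-algorithm updater. All field values below are illustrative.

from vllm.transformers_utils.configs import SpeculatorsConfig

# Illustrative speculators-format dict; the model type and verifier path are
# placeholders, not a real checkpoint.
speculators_dict = {
    "speculators_model_type": "eagle3",
    "speculators_config": {
        "proposal_methods": [{"speculative_tokens": 5}],
        "verifier": {"name_or_path": "meta-llama/Llama-3.1-8B-Instruct"},
    },
    "transformer_layer_config": {"hidden_size": 4096, "num_hidden_layers": 1},
}

vllm_dict = SpeculatorsConfig.build_vllm_speculative_config(speculators_dict)
# -> {"method": "eagle3", "num_speculative_tokens": 5,
#     "target_model": "meta-llama/Llama-3.1-8B-Instruct",
#     "hidden_size": 4096, "num_hidden_layers": 1}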

model_type class-attribute instance-attribute

model_type = 'speculators'

build_vllm_speculative_config classmethod

build_vllm_speculative_config(
    config_dict: dict[str, Any],
) -> dict[str, Any]

Build vLLM-compatible speculative configuration from speculators format.

This method extracts and transforms speculative configuration from the speculators format into the structure expected by vLLM.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `config_dict` | `dict[str, Any]` | Configuration dictionary in speculators format | *required* |

Returns:

| Type | Description |
| --- | --- |
| `dict[str, Any]` | Dictionary with vLLM-compatible speculative configuration |

Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def build_vllm_speculative_config(
        cls, config_dict: dict[str, Any]) -> dict[str, Any]:
    """
    Build vLLM-compatible speculative configuration from speculators format.

    This method extracts and transforms speculative configuration from the
    speculators format into the structure expected by vLLM.

    Args:
        config_dict: Configuration dictionary in speculators format

    Returns:
        Dictionary with vLLM-compatible speculative configuration
    """
    # Extract speculators configuration
    spec_config = config_dict["speculators_config"]

    # Currently we only support one proposal method
    proposal_methods = spec_config.get("proposal_methods")
    if not proposal_methods:
        raise ValueError("No proposal methods found in speculators config")

    first_method = proposal_methods[0]
    num_speculative_tokens = first_method.get("speculative_tokens")

    if num_speculative_tokens is None:
        raise ValueError(
            "Missing 'speculative_tokens' in proposal method. "
            f"Got: {first_method}")

    # Build base vLLM speculative configuration
    vllm_config = {
        "method": config_dict.get("speculators_model_type"),
        "num_speculative_tokens": num_speculative_tokens,
        "target_model": spec_config.get("verifier")["name_or_path"]
    }

    # Merge transformer layer configuration if present
    transformer_config = config_dict.get("transformer_layer_config", {})
    vllm_config.update(transformer_config)

    return vllm_config

extract_vllm_speculative_config classmethod

extract_vllm_speculative_config(
    config_dict: dict[str, Any],
) -> dict[str, Any]
Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def extract_vllm_speculative_config(
        cls, config_dict: dict[str, Any]) -> dict[str, Any]:
    speculators_model_type = config_dict.get("speculators_model_type")
    if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
        raise ValueError(
            f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
            "Please ensure you're loading a speculators-format model.")

    # validate fields
    # TODO: @dsikka - use speculators pydantic model to validate
    cls.validate_speculators_config(config_dict=config_dict)
    # Convert from speculators config -> format that can be ingested by vLLM
    vllm_config = cls.build_vllm_speculative_config(
        config_dict=config_dict)
    # Apply anything specific to the supported algorithm
    algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
    algo_updater(config_dict=config_dict, vllm_config=vllm_config)
    return vllm_config

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: Union[str, PathLike],
    **kwargs,
) -> SpeculatorsConfig

Load speculators Eagle config and convert to vLLM format.

Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    **kwargs,
) -> "SpeculatorsConfig":
    """Load speculators Eagle config and convert to vLLM format."""
    config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
                                         **kwargs)

    vllm_config = cls.extract_vllm_speculative_config(config_dict)
    return cls(**vllm_config)

validate_speculators_config classmethod

validate_speculators_config(
    config_dict: dict[str, Any],
) -> None
Source code in vllm/transformers_utils/configs/speculators/base.py
@classmethod
def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
    try:
        spec_config = config_dict["speculators_config"]
        methods = spec_config["proposal_methods"]
        first_method = methods[0]
        _ = first_method["speculative_tokens"]
        _ = spec_config["verifier"]["name_or_path"]
        _ = config_dict["speculators_model_type"]
    except (KeyError, IndexError, TypeError) as e:
        raise ValueError("Invalid speculators config structure") from e

    if "transformer_layer_config" not in config_dict:
        raise ValueError("Must provide transformer_layer_config")

    if not isinstance(config_dict["transformer_layer_config"], dict):
        raise TypeError(
            "'transformer_layer_config' must be a dictionary if provided")

Step3TextConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/step3_vl.py
class Step3TextConfig(PretrainedConfig):
    model_type = "step3_text"
    architectures = ["Step3TextForCausalLM"]

    def __init__(
        self,
        hidden_size: int = 7168,
        intermediate_size: int = 18432,
        num_attention_heads: int = 64,
        num_attention_groups: int = 1,
        num_hidden_layers: int = 61,
        max_seq_len: int = 65536,
        vocab_size: int = 128815,
        rms_norm_eps: float = 1e-5,
        moe_intermediate_size: int = 5120,
        moe_num_experts: int = 48,
        moe_top_k: int = 3,
        rope_theta: float = 500000,
        rope_scaling: Optional[dict[str, Any]] = None,
        max_position_embedding: int = 65536,
        share_expert_dim: int = 5120,
        share_q_dim: int = 2048,
        head_dim: int = 256,
        norm_expert_weight: bool = False,
        moe_layers_enum: tuple[int,
                               ...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                       15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                       25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                                       35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                                       45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
                                       55, 56, 57, 58, 59),
        **kwargs,
    ) -> None:
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_hidden_layers = num_hidden_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.max_position_embedding = max_position_embedding
        self.share_expert_dim = share_expert_dim
        self.share_q_dim = share_q_dim
        self.head_dim = head_dim
        self.norm_expert_weight = norm_expert_weight
        self.moe_layers_enum = moe_layers_enum

        super().__init__(**kwargs)
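
A scaled-down instantiation sketch; only the overridden fields differ from the defaults documented above.

from vllm.transformers_utils.configs import Step3TextConfig

cfg = Step3TextConfig(num_hidden_layers=4, moe_layers_enum=(2, 3))
print(cfg.moe_num_experts, cfg.moe_top_k)             # 48 3
print(cfg.max_seq_len == cfg.max_position_embedding)  # True (both 65536)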

architectures class-attribute instance-attribute

architectures = ['Step3TextForCausalLM']

head_dim instance-attribute

head_dim = head_dim

hidden_size instance-attribute

hidden_size = hidden_size

intermediate_size instance-attribute

intermediate_size = intermediate_size

max_position_embedding instance-attribute

max_position_embedding = max_position_embedding

max_seq_len instance-attribute

max_seq_len = max_seq_len

model_type class-attribute instance-attribute

model_type = 'step3_text'

moe_intermediate_size instance-attribute

moe_intermediate_size = moe_intermediate_size

moe_layers_enum instance-attribute

moe_layers_enum = moe_layers_enum

moe_num_experts instance-attribute

moe_num_experts = moe_num_experts

moe_top_k instance-attribute

moe_top_k = moe_top_k

norm_expert_weight instance-attribute

norm_expert_weight = norm_expert_weight

num_attention_groups instance-attribute

num_attention_groups = num_attention_groups

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

share_expert_dim instance-attribute

share_expert_dim = share_expert_dim

share_q_dim instance-attribute

share_q_dim = share_q_dim

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(
    hidden_size: int = 7168,
    intermediate_size: int = 18432,
    num_attention_heads: int = 64,
    num_attention_groups: int = 1,
    num_hidden_layers: int = 61,
    max_seq_len: int = 65536,
    vocab_size: int = 128815,
    rms_norm_eps: float = 1e-05,
    moe_intermediate_size: int = 5120,
    moe_num_experts: int = 48,
    moe_top_k: int = 3,
    rope_theta: float = 500000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embedding: int = 65536,
    share_expert_dim: int = 5120,
    share_q_dim: int = 2048,
    head_dim: int = 256,
    norm_expert_weight: bool = False,
    moe_layers_enum: tuple[int, ...] = (
        4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
        53, 54, 55, 56, 57, 58, 59,
    ),
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/step3_vl.py
def __init__(
    self,
    hidden_size: int = 7168,
    intermediate_size: int = 18432,
    num_attention_heads: int = 64,
    num_attention_groups: int = 1,
    num_hidden_layers: int = 61,
    max_seq_len: int = 65536,
    vocab_size: int = 128815,
    rms_norm_eps: float = 1e-5,
    moe_intermediate_size: int = 5120,
    moe_num_experts: int = 48,
    moe_top_k: int = 3,
    rope_theta: float = 500000,
    rope_scaling: Optional[dict[str, Any]] = None,
    max_position_embedding: int = 65536,
    share_expert_dim: int = 5120,
    share_q_dim: int = 2048,
    head_dim: int = 256,
    norm_expert_weight: bool = False,
    moe_layers_enum: tuple[int,
                           ...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                   15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                   25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                                   35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                                   45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
                                   55, 56, 57, 58, 59),
    **kwargs,
) -> None:
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_attention_heads = num_attention_heads
    self.num_attention_groups = num_attention_groups
    self.num_hidden_layers = num_hidden_layers
    self.max_seq_len = max_seq_len
    self.vocab_size = vocab_size
    self.rms_norm_eps = rms_norm_eps
    self.moe_intermediate_size = moe_intermediate_size
    self.moe_num_experts = moe_num_experts
    self.moe_top_k = moe_top_k
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self.max_position_embedding = max_position_embedding
    self.share_expert_dim = share_expert_dim
    self.share_q_dim = share_q_dim
    self.head_dim = head_dim
    self.norm_expert_weight = norm_expert_weight
    self.moe_layers_enum = moe_layers_enum

    super().__init__(**kwargs)

Step3VLConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/step3_vl.py
class Step3VLConfig(PretrainedConfig):
    model_type = "step3_vl"

    def __init__(
        self,
        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
        text_config: Optional[Union[dict, Step3TextConfig]] = None,
        understand_projector_stride: int = 1,
        projector_bias: bool = True,
        image_token_id: int = 128001,
        **kwargs,
    ) -> None:
        if vision_config is None:
            vision_config = Step3VisionEncoderConfig()
        elif isinstance(vision_config, dict):
            vision_config = Step3VisionEncoderConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = Step3TextConfig()
        elif isinstance(text_config, dict):
            text_config = Step3TextConfig(**text_config)
        self.text_config = text_config

        self.understand_projector_stride = understand_projector_stride
        self.projector_bias = projector_bias
        self.hidden_size = text_config.hidden_size
        self.image_token_id = image_token_id

        super().__init__(**kwargs)
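
A minimal sketch of the wrapping behaviour: dict sub-configs are promoted to config objects, and hidden_size is mirrored from the resulting text config.

from vllm.transformers_utils.configs import Step3TextConfig, Step3VLConfig

cfg = Step3VLConfig(text_config={"hidden_size": 1024}, vision_config={})
assert isinstance(cfg.text_config, Step3TextConfig)
print(cfg.hidden_size)  # 1024, taken from the text config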

hidden_size instance-attribute

hidden_size = hidden_size

image_token_id instance-attribute

image_token_id = image_token_id

model_type class-attribute instance-attribute

model_type = 'step3_vl'

projector_bias instance-attribute

projector_bias = projector_bias

text_config instance-attribute

text_config = text_config

understand_projector_stride instance-attribute

understand_projector_stride = understand_projector_stride

vision_config instance-attribute

vision_config = vision_config

__init__

__init__(
    vision_config: Optional[
        Union[dict, Step3VisionEncoderConfig]
    ] = None,
    text_config: Optional[
        Union[dict, Step3TextConfig]
    ] = None,
    understand_projector_stride: int = 1,
    projector_bias: bool = True,
    image_token_id: int = 128001,
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/step3_vl.py
def __init__(
    self,
    vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
    text_config: Optional[Union[dict, Step3TextConfig]] = None,
    understand_projector_stride: int = 1,
    projector_bias: bool = True,
    image_token_id: int = 128001,
    **kwargs,
) -> None:
    if vision_config is None:
        vision_config = Step3VisionEncoderConfig()
    elif isinstance(vision_config, dict):
        vision_config = Step3VisionEncoderConfig(**vision_config)
    self.vision_config = vision_config

    if text_config is None:
        text_config = Step3TextConfig()
    elif isinstance(text_config, dict):
        text_config = Step3TextConfig(**text_config)
    self.text_config = text_config

    self.understand_projector_stride = understand_projector_stride
    self.projector_bias = projector_bias
    self.hidden_size = text_config.hidden_size
    self.image_token_id = image_token_id

    super().__init__(**kwargs)

Step3VisionEncoderConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/step3_vl.py
class Step3VisionEncoderConfig(PretrainedConfig):
    model_type = "step3_vision_encoder"

    def __init__(
        self,
        hidden_size=1792,
        intermediate_size=3072,
        output_hidden_size=4096,
        num_hidden_layers=63,
        num_attention_heads=16,
        num_channels=3,
        image_size=728,
        patch_size=14,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.output_hidden_size = output_hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        super().__init__(**kwargs)
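
A small sketch of what the defaults imply, assuming the encoder uses standard non-overlapping ViT patchification (an assumption about the embedding layer, not stated in this config):

from vllm.transformers_utils.configs import Step3VisionEncoderConfig

cfg = Step3VisionEncoderConfig()
patches_per_side = cfg.image_size // cfg.patch_size  # 728 // 14 = 52
print(patches_per_side * patches_per_side)           # 2704 patches per image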

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

image_size instance-attribute

image_size = image_size

intermediate_size instance-attribute

intermediate_size = intermediate_size

layer_norm_eps instance-attribute

layer_norm_eps = layer_norm_eps

model_type class-attribute instance-attribute

model_type = 'step3_vision_encoder'

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_channels instance-attribute

num_channels = num_channels

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

output_hidden_size instance-attribute

output_hidden_size = output_hidden_size

patch_size instance-attribute

patch_size = patch_size

__init__

__init__(
    hidden_size=1792,
    intermediate_size=3072,
    output_hidden_size=4096,
    num_hidden_layers=63,
    num_attention_heads=16,
    num_channels=3,
    image_size=728,
    patch_size=14,
    hidden_act="quick_gelu",
    layer_norm_eps=1e-05,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/step3_vl.py
def __init__(
    self,
    hidden_size=1792,
    intermediate_size=3072,
    output_hidden_size=4096,
    num_hidden_layers=63,
    num_attention_heads=16,
    num_channels=3,
    image_size=728,
    patch_size=14,
    hidden_act="quick_gelu",
    layer_norm_eps=1e-5,
    **kwargs,
):
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.output_hidden_size = output_hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.num_channels = num_channels
    self.patch_size = patch_size
    self.image_size = image_size
    self.layer_norm_eps = layer_norm_eps
    self.hidden_act = hidden_act
    super().__init__(**kwargs)

UltravoxConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [UltravoxForConditionalGeneration]. It is used to instantiate an Ultravox model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `audio_config` | `Union[AutoConfig, dict]`, *optional* | Custom audio config or dict. | `None` |
| `text_config` | `Union[AutoConfig, dict]`, *optional* | The config object of the text backbone. | `None` |
| `audio_model_id` | `str`, *optional* | The model ID of the audio backbone. | `None` |
| `text_model_id` | `str`, *optional* | The model ID of the text backbone. | `None` |
| `ignore_index` | `int`, *optional* | The ignore index for the loss function. | `-100` |
| `audio_token_index` | `int`, *optional* | The audio token index to encode the audio prompt. | `32000` |
| `stack_factor` | `int`, *optional* | Audio downsampling factor for the multimodal projector. | `8` |
| `norm_init` | `float`, *optional* | The initialization value for the layer normalization. | `0.4` |
| `projector_act` | `str`, *optional* | The activation function used by the multimodal projector. | `'swiglu'` |
| `projector_ln_mid` | `bool`, *optional* | Whether to apply layer normalization at the middle of the projector or at the end. Versions v0.4.1 and below use `False`, but v0.5 and above use `True`. | `False` |
Source code in vllm/transformers_utils/configs/ultravox.py
class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom audio config or dict.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone.
        audio_model_id (`str`, *optional*):
            The model ID of the audio backbone.
        text_model_id (`str`, *optional*):
            The model ID of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        projector_ln_mid (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization at the middle of the
            projector or at the end. Versions v0.4.1 and below
            use `False`, but v0.5 and above use `True`.
    """
    wrapped_model_config: transformers.PretrainedConfig
    model_type = "ultravox"
    audio_token = "<|audio|>"
    is_composition = False

    def __init__(
        self,
        audio_config: Optional[dict[str, Any]] = None,
        text_config: Optional[dict[str, Any]] = None,
        audio_model_id: Optional[str] = None,
        text_model_id: Optional[str] = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        projector_ln_mid: bool = False,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.audio_token_index = audio_token_index

        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act
        self.projector_ln_mid = projector_ln_mid

        # N.B. May set the wrapped_model_config below.
        self.text_model_id = text_model_id
        if text_model_id is None:
            text_config = text_config or {}
            self.wrapped_model_config = transformers.CONFIG_MAPPING[
                text_config.get("model_type", "llama")](**text_config)

        # N.B. May set the audio_config below.
        self.audio_model_id = audio_model_id
        if audio_model_id is None:
            self.audio_model_id = None
            audio_config = audio_config or {}
            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
                "model_type", "whisper")](**audio_config)

        super().__init__(**kwargs)

    def __setattr__(self, key, value):
        # Since --hf-overrides are applied _after_ the UltravoxConfig is
        # instantiated, load the configs implicitly when assigning text_model_id
        # or audio_model_id. This allows:
        #
        #   --hf-overrides.text_model_id=<quantized variant>
        #
        # to behave as intended.
        if key == "text_model_id" and value is not None:
            from vllm.transformers_utils.config import get_config

            self.wrapped_model_config = get_config(value,
                                                   trust_remote_code=False)
        elif key == "audio_model_id" and value is not None:
            from vllm.transformers_utils.config import get_config

            self.audio_config = get_config(value, trust_remote_code=False)

        return super().__setattr__(key, value)

    @property
    def text_config(self) -> transformers.PretrainedConfig:
        # When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
        # the full model, but the text config is the text config of the inner
        # model.
        return self.wrapped_model_config.get_text_config()
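
A minimal construction sketch: with no model IDs given, the wrapped text config falls back to a LLaMA-style config and the audio config to a Whisper-style config via transformers.CONFIG_MAPPING, and the text_config property returns the wrapped model's text config.

from vllm.transformers_utils.configs import UltravoxConfig

cfg = UltravoxConfig(text_config={"model_type": "llama", "hidden_size": 2048})
print(type(cfg.wrapped_model_config).__name__)  # LlamaConfig
print(type(cfg.audio_config).__name__)          # WhisperConfig
print(cfg.text_config.hidden_size)              # 2048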

audio_config instance-attribute

audio_config = CONFIG_MAPPING[get("model_type", "whisper")](
    **audio_config
)

audio_model_id instance-attribute

audio_model_id = audio_model_id

audio_token class-attribute instance-attribute

audio_token = '<|audio|>'

audio_token_index instance-attribute

audio_token_index = audio_token_index

hidden_size instance-attribute

hidden_size = hidden_size

ignore_index instance-attribute

ignore_index = ignore_index

is_composition class-attribute instance-attribute

is_composition = False

model_type class-attribute instance-attribute

model_type = 'ultravox'

norm_init instance-attribute

norm_init = norm_init

projector_act instance-attribute

projector_act = projector_act

projector_ln_mid instance-attribute

projector_ln_mid = projector_ln_mid

stack_factor instance-attribute

stack_factor = stack_factor

text_config property

text_config: PretrainedConfig

text_model_id instance-attribute

text_model_id = text_model_id

wrapped_model_config instance-attribute

wrapped_model_config: PretrainedConfig

__init__

__init__(
    audio_config: Optional[dict[str, Any]] = None,
    text_config: Optional[dict[str, Any]] = None,
    audio_model_id: Optional[str] = None,
    text_model_id: Optional[str] = None,
    ignore_index: int = -100,
    audio_token_index: int = 32000,
    hidden_size: int = 4096,
    stack_factor: int = 8,
    norm_init: float = 0.4,
    projector_act: str = "swiglu",
    projector_ln_mid: bool = False,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/ultravox.py
def __init__(
    self,
    audio_config: Optional[dict[str, Any]] = None,
    text_config: Optional[dict[str, Any]] = None,
    audio_model_id: Optional[str] = None,
    text_model_id: Optional[str] = None,
    ignore_index: int = -100,
    audio_token_index: int = 32000,
    hidden_size: int = 4096,
    stack_factor: int = 8,
    norm_init: float = 0.4,
    projector_act: str = "swiglu",
    projector_ln_mid: bool = False,
    **kwargs,
):
    self.ignore_index = ignore_index
    self.audio_token_index = audio_token_index

    self.hidden_size = hidden_size
    self.stack_factor = stack_factor
    self.norm_init = norm_init
    self.projector_act = projector_act
    self.projector_ln_mid = projector_ln_mid

    # N.B. May set the wrapped_model_config below.
    self.text_model_id = text_model_id
    if text_model_id is None:
        text_config = text_config or {}
        self.wrapped_model_config = transformers.CONFIG_MAPPING[
            text_config.get("model_type", "llama")](**text_config)

    # N.B. May set the audio_config below.
    self.audio_model_id = audio_model_id
    if audio_model_id is None:
        self.audio_model_id = None
        audio_config = audio_config or {}
        self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
            "model_type", "whisper")](**audio_config)

    super().__init__(**kwargs)

__setattr__

__setattr__(key, value)
Source code in vllm/transformers_utils/configs/ultravox.py
def __setattr__(self, key, value):
    # Since --hf-overrides are applied _after_ the UltravoxConfig is
    # instantiated, load the configs implicitly when assigning text_model_id
    # or audio_model_id. This allows:
    #
    #   --hf-overrides.text_model_id=<quantized variant>
    #
    # to behave as intended.
    if key == "text_model_id" and value is not None:
        from vllm.transformers_utils.config import get_config

        self.wrapped_model_config = get_config(value,
                                               trust_remote_code=False)
    elif key == "audio_model_id" and value is not None:
        from vllm.transformers_utils.config import get_config

        self.audio_config = get_config(value, trust_remote_code=False)

    return super().__setattr__(key, value)
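
A sketch of the override behaviour documented above: assigning text_model_id (or audio_model_id) after construction implicitly reloads the corresponding config via get_config. The model ID below is illustrative, and the call fetches config.json from the Hub, so it needs network access.

from vllm.transformers_utils.configs import UltravoxConfig

cfg = UltravoxConfig()
cfg.text_model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative checkpoint
print(type(cfg.wrapped_model_config).__name__)    # e.g. Qwen2Config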