vllm.model_executor.models.qwen ¶
Inference-only QWen model compatible with HuggingFace weights.
QWenAttention ¶
Bases: Module
Source code in vllm/model_executor/models/qwen.py
attn instance-attribute
¶
attn = Attention(
num_heads,
head_dim,
scaling,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
)
c_attn instance-attribute
¶
c_attn = QKVParallelLinear(
hidden_size,
head_dim,
total_num_heads,
bias=True,
quant_config=quant_config,
)
c_proj instance-attribute
¶
c_proj = RowParallelLinear(
total_num_heads * head_dim,
hidden_size,
bias=False,
quant_config=quant_config,
)
rotary_emb instance-attribute
¶
rotary_emb = get_rope(
head_dim,
rotary_dim=head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
)
__init__ ¶
__init__(
hidden_size: int,
num_heads: int,
max_position_embeddings: int,
rope_theta: float = 10000,
rope_scaling: Optional[dict[str, Any]] = None,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
)
Source code in vllm/model_executor/models/qwen.py
forward ¶
Source code in vllm/model_executor/models/qwen.py
QWenBaseModel ¶
Bases: Module
Source code in vllm/model_executor/models/qwen.py
lm_head instance-attribute
¶
lm_head = ParallelLMHead(
vocab_size,
hidden_size,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"),
)
make_empty_intermediate_tensors instance-attribute
¶
transformer instance-attribute
¶
transformer = transformer_type(
vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "transformer"),
)
__init__ ¶
__init__(
*,
vllm_config: VllmConfig,
prefix: str = "",
transformer_type: type[QWenModel] = QWenModel,
) -> None
Source code in vllm/model_executor/models/qwen.py
compute_logits ¶
load_weights ¶
Source code in vllm/model_executor/models/qwen.py
QWenBlock ¶
Bases: Module
Source code in vllm/model_executor/models/qwen.py
attn instance-attribute
¶
attn = QWenAttention(
hidden_size,
num_attention_heads,
max_position_embeddings,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
)
mlp instance-attribute
¶
mlp = QWenMLP(
hidden_size,
intermediate_size // 2,
quant_config=quant_config,
)
__init__ ¶
__init__(
config: PretrainedConfig,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
)
Source code in vllm/model_executor/models/qwen.py
forward ¶
forward(
positions: Tensor,
hidden_states: Tensor,
residual: Optional[Tensor],
) -> tuple[Tensor, Tensor]
Source code in vllm/model_executor/models/qwen.py
QWenLMHeadModel ¶
Bases: QWenBaseModel, SupportsPP, SupportsLoRA
Source code in vllm/model_executor/models/qwen.py
packed_modules_mapping class-attribute instance-attribute
¶
__init__ ¶
__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/qwen.py
forward ¶
forward(
input_ids: Tensor,
positions: Tensor,
intermediate_tensors: Optional[
IntermediateTensors
] = None,
inputs_embeds: Optional[Tensor] = None,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/qwen.py
QWenMLP ¶
Bases: Module
MLP for the language component of the Qwen model. It contains a MergedColumnParallelLinear (gate_up_proj) whose two outputs are merged via a SiLU activation.
Source code in vllm/model_executor/models/qwen.py
c_proj instance-attribute
¶
c_proj = RowParallelLinear(
intermediate_size,
hidden_size,
bias=False,
quant_config=quant_config,
)
gate_up_proj instance-attribute
¶
gate_up_proj = MergedColumnParallelLinear(
hidden_size,
[intermediate_size] * 2,
bias=False,
quant_config=quant_config,
)
__init__ ¶
__init__(
hidden_size: int,
intermediate_size: int,
hidden_act: str = "silu",
quant_config: Optional[QuantizationConfig] = None,
)
Source code in vllm/model_executor/models/qwen.py
QWenModel ¶
Bases: Module
Source code in vllm/model_executor/models/qwen.py
make_empty_intermediate_tensors instance-attribute
¶
make_empty_intermediate_tensors = (
make_empty_intermediate_tensors_factory(
["hidden_states", "residual"], hidden_size
)
)
__init__ ¶
__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/qwen.py
forward ¶
forward(
input_ids: Tensor,
positions: Tensor,
intermediate_tensors: Optional[IntermediateTensors],
inputs_embeds: Optional[Tensor] = None,
) -> Union[Tensor, IntermediateTensors]