vllm.envs — environment variable reference

CMAKE_BUILD_TYPE module-attribute

CMAKE_BUILD_TYPE: Optional[
    Literal["Debug", "Release", "RelWithDebInfo"]
] = None

CUDA_VISIBLE_DEVICES module-attribute

CUDA_VISIBLE_DEVICES: Optional[str] = None

GPT_OSS_SYSTEM_TOOL_MCP_LABELS module-attribute

GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = []

K_SCALE_CONSTANT module-attribute

K_SCALE_CONSTANT: int = 200

LD_LIBRARY_PATH module-attribute

LD_LIBRARY_PATH: Optional[str] = None

LOCAL_RANK module-attribute

LOCAL_RANK: int = 0

MAX_JOBS module-attribute

MAX_JOBS: Optional[str] = None

NVCC_THREADS module-attribute

NVCC_THREADS: Optional[str] = None

Q_SCALE_CONSTANT module-attribute

Q_SCALE_CONSTANT: int = 200

S3_ACCESS_KEY_ID module-attribute

S3_ACCESS_KEY_ID: Optional[str] = None

S3_ENDPOINT_URL module-attribute

S3_ENDPOINT_URL: Optional[str] = None

S3_SECRET_ACCESS_KEY module-attribute

S3_SECRET_ACCESS_KEY: Optional[str] = None

VERBOSE module-attribute

VERBOSE: bool = False

VLLM_ALL2ALL_BACKEND module-attribute

VLLM_ALL2ALL_BACKEND: Literal[
    "naive",
    "pplx",
    "deepep_high_throughput",
    "deepep_low_latency",
    "allgather_reducescatter",
    "flashinfer_all2allv",
] = "allgather_reducescatter"

VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE module-attribute

VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = (
    False
)

VLLM_ALLOW_INSECURE_SERIALIZATION module-attribute

VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False

VLLM_ALLOW_LONG_MAX_MODEL_LEN module-attribute

VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False

VLLM_ALLOW_RUNTIME_LORA_UPDATING module-attribute

VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False

VLLM_ALLREDUCE_USE_SYMM_MEM module-attribute

VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True

VLLM_API_KEY module-attribute

VLLM_API_KEY: Optional[str] = None

VLLM_ASSETS_CACHE module-attribute

VLLM_ASSETS_CACHE: str = join(VLLM_CACHE_ROOT, 'assets')

VLLM_ASSETS_CACHE_MODEL_CLEAN module-attribute

VLLM_ASSETS_CACHE_MODEL_CLEAN: bool = False

VLLM_ATTENTION_BACKEND module-attribute

VLLM_ATTENTION_BACKEND: Optional[str] = None

VLLM_AUDIO_FETCH_TIMEOUT module-attribute

VLLM_AUDIO_FETCH_TIMEOUT: int = 10

VLLM_CACHE_ROOT module-attribute

VLLM_CACHE_ROOT: str = expanduser('~/.cache/vllm')

VLLM_COMPUTE_NANS_IN_LOGITS module-attribute

VLLM_COMPUTE_NANS_IN_LOGITS: bool = False

VLLM_CONFIGURE_LOGGING module-attribute

VLLM_CONFIGURE_LOGGING: int = 1

VLLM_CONFIG_ROOT module-attribute

VLLM_CONFIG_ROOT: str = expanduser('~/.config/vllm')

VLLM_CPU_KVCACHE_SPACE module-attribute

VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0

VLLM_CPU_MOE_PREPACK module-attribute

VLLM_CPU_MOE_PREPACK: bool = True

VLLM_CPU_NUM_OF_RESERVED_CPU module-attribute

VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None

VLLM_CPU_OMP_THREADS_BIND module-attribute

VLLM_CPU_OMP_THREADS_BIND: str = ''

VLLM_CPU_SGL_KERNEL module-attribute

VLLM_CPU_SGL_KERNEL: bool = False

VLLM_CUDART_SO_PATH module-attribute

VLLM_CUDART_SO_PATH: Optional[str] = None

VLLM_CUSTOM_SCOPES_FOR_PROFILING module-attribute

VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False

VLLM_DBO_COMM_SMS module-attribute

VLLM_DBO_COMM_SMS: int = 20

VLLM_DEBUG_DUMP_PATH module-attribute

VLLM_DEBUG_DUMP_PATH: Optional[str] = None

VLLM_DEEPEP_BUFFER_SIZE_MB module-attribute

VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024

VLLM_DISABLED_KERNELS module-attribute

VLLM_DISABLED_KERNELS: list[str] = []

VLLM_DISABLE_COMPILE_CACHE module-attribute

VLLM_DISABLE_COMPILE_CACHE: bool = False

VLLM_DISABLE_FLASHINFER_PREFILL module-attribute

VLLM_DISABLE_FLASHINFER_PREFILL: bool = False

VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION module-attribute

VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION: bool = False

VLLM_DISABLE_PAD_FOR_CUDAGRAPH module-attribute

VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False

VLLM_DOCKER_BUILD_CONTEXT module-attribute

VLLM_DOCKER_BUILD_CONTEXT: bool = False

VLLM_DO_NOT_TRACK module-attribute

VLLM_DO_NOT_TRACK: bool = False

VLLM_DP_MASTER_IP module-attribute

VLLM_DP_MASTER_IP: str = ''

VLLM_DP_MASTER_PORT module-attribute

VLLM_DP_MASTER_PORT: int = 0

VLLM_DP_RANK module-attribute

VLLM_DP_RANK: int = 0

VLLM_DP_RANK_LOCAL module-attribute

VLLM_DP_RANK_LOCAL: int = -1

VLLM_DP_SIZE module-attribute

VLLM_DP_SIZE: int = 1

VLLM_ENABLE_CUDAGRAPH_GC module-attribute

VLLM_ENABLE_CUDAGRAPH_GC: bool = False

VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING module-attribute

VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True

VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING module-attribute

VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True

VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE module-attribute

VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE: bool = True

VLLM_ENABLE_RESPONSES_API_STORE module-attribute

VLLM_ENABLE_RESPONSES_API_STORE: bool = False

VLLM_ENABLE_V1_MULTIPROCESSING module-attribute

VLLM_ENABLE_V1_MULTIPROCESSING: bool = True

VLLM_ENGINE_ITERATION_TIMEOUT_S module-attribute

VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60

VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS module-attribute

VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300

VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION module-attribute

VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False

VLLM_FLASHINFER_MOE_BACKEND module-attribute

VLLM_FLASHINFER_MOE_BACKEND: Literal[
    "throughput", "latency"
] = "throughput"

VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH module-attribute

VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32

VLLM_FLASH_ATTN_VERSION module-attribute

VLLM_FLASH_ATTN_VERSION: Optional[int] = None

VLLM_FUSED_MOE_CHUNK_SIZE module-attribute

VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024

VLLM_GC_DEBUG module-attribute

VLLM_GC_DEBUG: str = ''

VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS module-attribute

VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False

VLLM_HAS_FLASHINFER_CUBIN module-attribute

VLLM_HAS_FLASHINFER_CUBIN: bool = False

VLLM_HOST_IP module-attribute

VLLM_HOST_IP: str = ''

VLLM_HTTP_TIMEOUT_KEEP_ALIVE module-attribute

VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5

VLLM_IMAGE_FETCH_TIMEOUT module-attribute

VLLM_IMAGE_FETCH_TIMEOUT: int = 5

VLLM_KEEP_ALIVE_ON_ENGINE_DEATH module-attribute

VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False

VLLM_KV_CACHE_LAYOUT module-attribute

VLLM_KV_CACHE_LAYOUT: Optional[Literal["NHD", "HND"]] = None

VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES module-attribute

VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True

VLLM_LOGGING_CONFIG_PATH module-attribute

VLLM_LOGGING_CONFIG_PATH: Optional[str] = None

VLLM_LOGGING_LEVEL module-attribute

VLLM_LOGGING_LEVEL: str = 'INFO'

VLLM_LOGGING_PREFIX module-attribute

VLLM_LOGGING_PREFIX: str = ''

VLLM_LOGGING_STREAM module-attribute

VLLM_LOGGING_STREAM: str = 'ext://sys.stdout'

VLLM_LOGITS_PROCESSOR_THREADS module-attribute

VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None

VLLM_LOG_BATCHSIZE_INTERVAL module-attribute

VLLM_LOG_BATCHSIZE_INTERVAL: float = -1

VLLM_LOG_STATS_INTERVAL module-attribute

VLLM_LOG_STATS_INTERVAL: float = 10.0

VLLM_LOOPBACK_IP module-attribute

VLLM_LOOPBACK_IP: str = ''

VLLM_LORA_RESOLVER_CACHE_DIR module-attribute

VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None

VLLM_MAIN_CUDA_VERSION module-attribute

VLLM_MAIN_CUDA_VERSION: str = '12.8'

VLLM_MARLIN_USE_ATOMIC_ADD module-attribute

VLLM_MARLIN_USE_ATOMIC_ADD: bool = False

VLLM_MAX_AUDIO_CLIP_FILESIZE_MB module-attribute

VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25

VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE module-attribute

VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840

VLLM_MEDIA_LOADING_THREAD_COUNT module-attribute

VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8

VLLM_MLA_DISABLE module-attribute

VLLM_MLA_DISABLE: bool = False

VLLM_MM_INPUT_CACHE_GIB module-attribute

VLLM_MM_INPUT_CACHE_GIB: int = 4

VLLM_MODEL_REDIRECT_PATH module-attribute

VLLM_MODEL_REDIRECT_PATH: Optional[str] = None

VLLM_MOE_DP_CHUNK_SIZE module-attribute

VLLM_MOE_DP_CHUNK_SIZE: int = 256

VLLM_MQ_MAX_CHUNK_BYTES_MB module-attribute

VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16

VLLM_MSGPACK_ZERO_COPY_THRESHOLD module-attribute

VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256

VLLM_MXFP4_USE_MARLIN module-attribute

VLLM_MXFP4_USE_MARLIN: Optional[bool] = None

VLLM_NCCL_INCLUDE_PATH module-attribute

VLLM_NCCL_INCLUDE_PATH: Optional[str] = None

VLLM_NCCL_SO_PATH module-attribute

VLLM_NCCL_SO_PATH: Optional[str] = None

VLLM_NIXL_ABORT_REQUEST_TIMEOUT module-attribute

VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480

VLLM_NIXL_SIDE_CHANNEL_HOST module-attribute

VLLM_NIXL_SIDE_CHANNEL_HOST: str = 'localhost'

VLLM_NIXL_SIDE_CHANNEL_PORT module-attribute

VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5600

VLLM_NO_USAGE_STATS module-attribute

VLLM_NO_USAGE_STATS: bool = False

VLLM_NVTX_SCOPES_FOR_PROFILING module-attribute

VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False

VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME module-attribute

VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = (
    "VLLM_OBJECT_STORAGE_SHM_BUFFER"
)

VLLM_PATTERN_MATCH_DEBUG module-attribute

VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None

VLLM_PLUGINS module-attribute

VLLM_PLUGINS: Optional[list[str]] = None

VLLM_PORT module-attribute

VLLM_PORT: Optional[int] = None

VLLM_PP_LAYER_PARTITION module-attribute

VLLM_PP_LAYER_PARTITION: Optional[str] = None

VLLM_RANDOMIZE_DP_DUMMY_INPUTS module-attribute

VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False

VLLM_RAY_BUNDLE_INDICES module-attribute

VLLM_RAY_BUNDLE_INDICES: str = ''

VLLM_RAY_PER_WORKER_GPUS module-attribute

VLLM_RAY_PER_WORKER_GPUS: float = 1.0

VLLM_RINGBUFFER_WARNING_INTERVAL module-attribute

VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60

VLLM_ROCM_CUSTOM_PAGED_ATTN module-attribute

VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True

VLLM_ROCM_FP8_MFMA_PAGE_ATTN module-attribute

VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False

VLLM_ROCM_FP8_PADDING module-attribute

VLLM_ROCM_FP8_PADDING: bool = True

VLLM_ROCM_MOE_PADDING module-attribute

VLLM_ROCM_MOE_PADDING: bool = True

VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16 module-attribute

VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True

VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB module-attribute

VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = (
    None
)

VLLM_ROCM_QUICK_REDUCE_QUANTIZATION module-attribute

VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal[
    "FP", "INT8", "INT6", "INT4", "NONE"
] = "NONE"

VLLM_ROCM_USE_AITER module-attribute

VLLM_ROCM_USE_AITER: bool = False

VLLM_ROCM_USE_AITER_FP4_ASM_GEMM module-attribute

VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: bool = False

VLLM_ROCM_USE_AITER_FP8BMM module-attribute

VLLM_ROCM_USE_AITER_FP8BMM: bool = True

VLLM_ROCM_USE_AITER_LINEAR module-attribute

VLLM_ROCM_USE_AITER_LINEAR: bool = True

VLLM_ROCM_USE_AITER_MHA module-attribute

VLLM_ROCM_USE_AITER_MHA: bool = True

VLLM_ROCM_USE_AITER_MLA module-attribute

VLLM_ROCM_USE_AITER_MLA: bool = True

VLLM_ROCM_USE_AITER_MOE module-attribute

VLLM_ROCM_USE_AITER_MOE: bool = True

VLLM_ROCM_USE_AITER_PAGED_ATTN module-attribute

VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False

VLLM_ROCM_USE_AITER_RMSNORM module-attribute

VLLM_ROCM_USE_AITER_RMSNORM: bool = True

VLLM_ROCM_USE_SKINNY_GEMM module-attribute

VLLM_ROCM_USE_SKINNY_GEMM: bool = True

VLLM_ROCM_USE_TRITON_ROPE module-attribute

VLLM_ROCM_USE_TRITON_ROPE: bool = False

VLLM_RPC_BASE_PATH module-attribute

VLLM_RPC_BASE_PATH: str = gettempdir()

VLLM_RPC_TIMEOUT module-attribute

VLLM_RPC_TIMEOUT: int = 10000

VLLM_SERVER_DEV_MODE module-attribute

VLLM_SERVER_DEV_MODE: bool = False

VLLM_SKIP_DEEP_GEMM_WARMUP module-attribute

VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False

VLLM_SKIP_P2P_CHECK module-attribute

VLLM_SKIP_P2P_CHECK: bool = False

VLLM_SLEEP_WHEN_IDLE module-attribute

VLLM_SLEEP_WHEN_IDLE: bool = False

VLLM_TARGET_DEVICE module-attribute

VLLM_TARGET_DEVICE: str = 'cuda'

VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL module-attribute

VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False

VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS module-attribute

VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1

VLLM_TORCH_PROFILER_DIR module-attribute

VLLM_TORCH_PROFILER_DIR: Optional[str] = None

VLLM_TORCH_PROFILER_RECORD_SHAPES module-attribute

VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False

VLLM_TORCH_PROFILER_WITH_FLOPS module-attribute

VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False

VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY module-attribute

VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False

VLLM_TORCH_PROFILER_WITH_STACK module-attribute

VLLM_TORCH_PROFILER_WITH_STACK: bool = True

VLLM_TPU_BUCKET_PADDING_GAP module-attribute

VLLM_TPU_BUCKET_PADDING_GAP: int = 0

VLLM_TPU_MOST_MODEL_LEN module-attribute

VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None

VLLM_TPU_USING_PATHWAYS module-attribute

VLLM_TPU_USING_PATHWAYS: bool = False

VLLM_TRACE_FUNCTION module-attribute

VLLM_TRACE_FUNCTION: int = 0

VLLM_TUNED_CONFIG_FOLDER module-attribute

VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None

VLLM_USAGE_SOURCE module-attribute

VLLM_USAGE_SOURCE: str = ''

VLLM_USAGE_STATS_SERVER module-attribute

VLLM_USAGE_STATS_SERVER: str = 'https://stats.vllm.ai'

VLLM_USE_AITER_UNIFIED_ATTENTION module-attribute

VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False

VLLM_USE_CUDNN_PREFILL module-attribute

VLLM_USE_CUDNN_PREFILL: bool = False

VLLM_USE_DEEP_GEMM module-attribute

VLLM_USE_DEEP_GEMM: bool = True

VLLM_USE_DEEP_GEMM_E8M0 module-attribute

VLLM_USE_DEEP_GEMM_E8M0: bool = True

VLLM_USE_DEEP_GEMM_E8M0_HOPPER module-attribute

VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False

VLLM_USE_FBGEMM module-attribute

VLLM_USE_FBGEMM: bool = False

VLLM_USE_FLASHINFER_MOE_FP16 module-attribute

VLLM_USE_FLASHINFER_MOE_FP16: bool = False

VLLM_USE_FLASHINFER_MOE_FP4 module-attribute

VLLM_USE_FLASHINFER_MOE_FP4: bool = False

VLLM_USE_FLASHINFER_MOE_FP8 module-attribute

VLLM_USE_FLASHINFER_MOE_FP8: bool = False

VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 module-attribute

VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False

VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 module-attribute

VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False

VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS module-attribute

VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False

VLLM_USE_FLASHINFER_SAMPLER module-attribute

VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None

VLLM_USE_FUSED_MOE_GROUPED_TOPK module-attribute

VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True

VLLM_USE_MODELSCOPE module-attribute

VLLM_USE_MODELSCOPE: bool = False

VLLM_USE_NCCL_SYMM_MEM module-attribute

VLLM_USE_NCCL_SYMM_MEM: bool = False

VLLM_USE_NVFP4_CT_EMULATIONS module-attribute

VLLM_USE_NVFP4_CT_EMULATIONS: bool = False

VLLM_USE_PRECOMPILED module-attribute

VLLM_USE_PRECOMPILED: bool = False

VLLM_USE_RAY_COMPILED_DAG module-attribute

VLLM_USE_RAY_COMPILED_DAG: bool = False

VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE module-attribute

VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal[
    "auto", "nccl", "shm"
] = "auto"

VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM module-attribute

VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False

VLLM_USE_RAY_SPMD_WORKER module-attribute

VLLM_USE_RAY_SPMD_WORKER: bool = False

VLLM_USE_RAY_WRAPPED_PP_COMM module-attribute

VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True

VLLM_USE_STANDALONE_COMPILE module-attribute

VLLM_USE_STANDALONE_COMPILE: bool = False

VLLM_USE_TRITON_AWQ module-attribute

VLLM_USE_TRITON_AWQ: bool = False

VLLM_USE_TRITON_FLASH_ATTN module-attribute

VLLM_USE_TRITON_FLASH_ATTN: bool = True

VLLM_USE_TRTLLM_ATTENTION module-attribute

VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None

VLLM_USE_V1 module-attribute

VLLM_USE_V1: bool = True

VLLM_V0_USE_OUTLINES_CACHE module-attribute

VLLM_V0_USE_OUTLINES_CACHE: bool = False

VLLM_V1_OUTPUT_PROC_CHUNK_SIZE module-attribute

VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128

VLLM_V1_USE_OUTLINES_CACHE module-attribute

VLLM_V1_USE_OUTLINES_CACHE: bool = False

VLLM_V1_USE_PREFILL_DECODE_ATTENTION module-attribute

VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False

VLLM_VIDEO_FETCH_TIMEOUT module-attribute

VLLM_VIDEO_FETCH_TIMEOUT: int = 30

VLLM_VIDEO_LOADER_BACKEND module-attribute

VLLM_VIDEO_LOADER_BACKEND: str = 'opencv'

VLLM_WORKER_MULTIPROC_METHOD module-attribute

VLLM_WORKER_MULTIPROC_METHOD: Literal["fork", "spawn"] = (
    "fork"
)

VLLM_XGRAMMAR_CACHE_MB module-attribute

VLLM_XGRAMMAR_CACHE_MB: int = 0

VLLM_XLA_CACHE_PATH module-attribute

VLLM_XLA_CACHE_PATH: str = join(
    VLLM_CACHE_ROOT, "xla_cache"
)

VLLM_XLA_CHECK_RECOMPILATION module-attribute

VLLM_XLA_CHECK_RECOMPILATION: bool = False

VLLM_XLA_USE_SPMD module-attribute

VLLM_XLA_USE_SPMD: bool = False

V_SCALE_CONSTANT module-attribute

V_SCALE_CONSTANT: int = 100

environment_variables module-attribute

environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_TARGET_DEVICE": lambda: lower(),
    "VLLM_MAIN_CUDA_VERSION": lambda: lower() or "12.8",
    "MAX_JOBS": lambda: getenv("MAX_JOBS", None),
    "NVCC_THREADS": lambda: getenv("NVCC_THREADS", None),
    "VLLM_USE_PRECOMPILED": lambda: lower() in ("1", "true")
    or bool(get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
    "VLLM_DOCKER_BUILD_CONTEXT": lambda: lower()
    in ("1", "true"),
    "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool(
        int(
            getenv(
                "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL",
                "0",
            )
        )
    ),
    "CMAKE_BUILD_TYPE": env_with_choices(
        "CMAKE_BUILD_TYPE",
        None,
        ["Debug", "Release", "RelWithDebInfo"],
    ),
    "VERBOSE": lambda: bool(int(getenv("VERBOSE", "0"))),
    "VLLM_CONFIG_ROOT": lambda: expanduser(
        getenv(
            "VLLM_CONFIG_ROOT",
            join(get_default_config_root(), "vllm"),
        )
    ),
    "VLLM_CACHE_ROOT": lambda: expanduser(
        getenv(
            "VLLM_CACHE_ROOT",
            join(get_default_cache_root(), "vllm"),
        )
    ),
    "VLLM_HOST_IP": lambda: getenv("VLLM_HOST_IP", ""),
    "VLLM_PORT": get_vllm_port,
    "VLLM_RPC_BASE_PATH": lambda: getenv(
        "VLLM_RPC_BASE_PATH", gettempdir()
    ),
    "VLLM_USE_MODELSCOPE": lambda: lower() == "true",
    "VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int(
        get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")
    ),
    "CUDA_HOME": lambda: get("CUDA_HOME", None),
    "VLLM_NCCL_SO_PATH": lambda: get(
        "VLLM_NCCL_SO_PATH", None
    ),
    "LD_LIBRARY_PATH": lambda: get("LD_LIBRARY_PATH", None),
    "VLLM_USE_TRITON_FLASH_ATTN": lambda: lower()
    in ("true", "1"),
    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: lower()
    in ("true", "1"),
    "VLLM_USE_AITER_UNIFIED_ATTENTION": lambda: lower()
    in ("true", "1"),
    "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
        get("VLLM_FLASH_ATTN_VERSION", None)
    ),
    "VLLM_USE_STANDALONE_COMPILE": lambda: get(
        "VLLM_USE_STANDALONE_COMPILE", "0"
    )
    == "1",
    "VLLM_PATTERN_MATCH_DEBUG": lambda: get(
        "VLLM_PATTERN_MATCH_DEBUG", None
    ),
    "VLLM_DEBUG_DUMP_PATH": lambda: get(
        "VLLM_DEBUG_DUMP_PATH", None
    ),
    "LOCAL_RANK": lambda: int(get("LOCAL_RANK", "0")),
    "CUDA_VISIBLE_DEVICES": lambda: get(
        "CUDA_VISIBLE_DEVICES", None
    ),
    "VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
        get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
    ),
    "VLLM_API_KEY": lambda: get("VLLM_API_KEY", None),
    "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": lambda: lower()
    == "true",
    "S3_ACCESS_KEY_ID": lambda: get(
        "S3_ACCESS_KEY_ID", None
    ),
    "S3_SECRET_ACCESS_KEY": lambda: get(
        "S3_SECRET_ACCESS_KEY", None
    ),
    "S3_ENDPOINT_URL": lambda: get("S3_ENDPOINT_URL", None),
    "VLLM_USAGE_STATS_SERVER": lambda: get(
        "VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
    ),
    "VLLM_NO_USAGE_STATS": lambda: get(
        "VLLM_NO_USAGE_STATS", "0"
    )
    == "1",
    "VLLM_DISABLE_FLASHINFER_PREFILL": lambda: get(
        "VLLM_DISABLE_FLASHINFER_PREFILL", "0"
    )
    == "1",
    "VLLM_DO_NOT_TRACK": lambda: (
        get("VLLM_DO_NOT_TRACK", None)
        or get("DO_NOT_TRACK", None)
        or "0"
    )
    == "1",
    "VLLM_USAGE_SOURCE": lambda: get(
        "VLLM_USAGE_SOURCE", "production"
    ),
    "VLLM_CONFIGURE_LOGGING": lambda: int(
        getenv("VLLM_CONFIGURE_LOGGING", "1")
    ),
    "VLLM_LOGGING_CONFIG_PATH": lambda: getenv(
        "VLLM_LOGGING_CONFIG_PATH"
    ),
    "VLLM_LOGGING_LEVEL": lambda: upper(),
    "VLLM_LOGGING_STREAM": lambda: getenv(
        "VLLM_LOGGING_STREAM", "ext://sys.stdout"
    ),
    "VLLM_LOGGING_PREFIX": lambda: getenv(
        "VLLM_LOGGING_PREFIX", ""
    ),
    "VLLM_LOGITS_PROCESSOR_THREADS": lambda: int(
        getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")
    )
    if "VLLM_LOGITS_PROCESSOR_THREADS" in environ
    else None,
    "VLLM_LOG_STATS_INTERVAL": lambda: val
    if (
        val := (
            float(getenv("VLLM_LOG_STATS_INTERVAL", "10."))
        )
    )
    > 0.0
    else 10.0,
    "VLLM_TRACE_FUNCTION": lambda: int(
        getenv("VLLM_TRACE_FUNCTION", "0")
    ),
    "VLLM_ATTENTION_BACKEND": env_with_choices(
        "VLLM_ATTENTION_BACKEND", None, lambda: list(keys())
    ),
    "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
        int(environ["VLLM_USE_FLASHINFER_SAMPLER"])
    )
    if "VLLM_USE_FLASHINFER_SAMPLER" in environ
    else None,
    "VLLM_PP_LAYER_PARTITION": lambda: getenv(
        "VLLM_PP_LAYER_PARTITION", None
    ),
    "VLLM_CPU_KVCACHE_SPACE": lambda: int(
        getenv("VLLM_CPU_KVCACHE_SPACE", "0")
    )
    if "VLLM_CPU_KVCACHE_SPACE" in environ
    else None,
    "VLLM_CPU_OMP_THREADS_BIND": lambda: getenv(
        "VLLM_CPU_OMP_THREADS_BIND", "auto"
    ),
    "VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
        getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
    )
    if "VLLM_CPU_NUM_OF_RESERVED_CPU" in environ
    else None,
    "VLLM_CPU_MOE_PREPACK": lambda: bool(
        int(getenv("VLLM_CPU_MOE_PREPACK", "1"))
    ),
    "VLLM_CPU_SGL_KERNEL": lambda: bool(
        int(getenv("VLLM_CPU_SGL_KERNEL", "0"))
    ),
    "VLLM_USE_RAY_SPMD_WORKER": lambda: bool(
        int(getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))
    ),
    "VLLM_USE_RAY_COMPILED_DAG": lambda: bool(
        int(getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))
    ),
    "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": env_with_choices(
        "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE",
        "auto",
        ["auto", "nccl", "shm"],
    ),
    "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool(
        int(
            getenv(
                "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM",
                "0",
            )
        )
    ),
    "VLLM_USE_RAY_WRAPPED_PP_COMM": lambda: bool(
        int(getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))
    ),
    "VLLM_WORKER_MULTIPROC_METHOD": env_with_choices(
        "VLLM_WORKER_MULTIPROC_METHOD",
        "fork",
        ["spawn", "fork"],
    ),
    "VLLM_ASSETS_CACHE": lambda: expanduser(
        getenv(
            "VLLM_ASSETS_CACHE",
            join(
                get_default_cache_root(), "vllm", "assets"
            ),
        )
    ),
    "VLLM_ASSETS_CACHE_MODEL_CLEAN": lambda: bool(
        int(getenv("VLLM_ASSETS_CACHE_MODEL_CLEAN", "0"))
    ),
    "VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(
        getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")
    ),
    "VLLM_VIDEO_FETCH_TIMEOUT": lambda: int(
        getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")
    ),
    "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
        getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
    ),
    "VLLM_MEDIA_LOADING_THREAD_COUNT": lambda: int(
        getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")
    ),
    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
        getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
    ),
    "VLLM_VIDEO_LOADER_BACKEND": lambda: getenv(
        "VLLM_VIDEO_LOADER_BACKEND", "opencv"
    ),
    "VLLM_MM_INPUT_CACHE_GIB": lambda: int(
        getenv("VLLM_MM_INPUT_CACHE_GIB", "4")
    ),
    "VLLM_XLA_CACHE_PATH": lambda: expanduser(
        getenv(
            "VLLM_XLA_CACHE_PATH",
            join(
                get_default_cache_root(),
                "vllm",
                "xla_cache",
            ),
        )
    ),
    "VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(
        int(getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))
    ),
    "VLLM_XLA_USE_SPMD": lambda: bool(
        int(getenv("VLLM_XLA_USE_SPMD", "0"))
    ),
    "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(
        getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")
    ),
    "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": lambda: bool(
        int(
            getenv(
                "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING",
                "1",
            )
        )
    ),
    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
        getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
    ),
    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": lambda: lower()
    in ("1", "true"),
    "VLLM_TEST_FORCE_FP8_MARLIN": lambda: lower()
    in ("1", "true"),
    "VLLM_TEST_FORCE_LOAD_FORMAT": lambda: getenv(
        "VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
    ),
    "VLLM_RPC_TIMEOUT": lambda: int(
        getenv("VLLM_RPC_TIMEOUT", "10000")
    ),
    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
        get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
    ),
    "VLLM_PLUGINS": lambda: None
    if "VLLM_PLUGINS" not in environ
    else split(","),
    "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: getenv(
        "VLLM_LORA_RESOLVER_CACHE_DIR", None
    ),
    "VLLM_TORCH_PROFILER_DIR": lambda: None
    if getenv("VLLM_TORCH_PROFILER_DIR", None) is None
    else abspath(
        expanduser(getenv("VLLM_TORCH_PROFILER_DIR", "."))
    ),
    "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool(
        getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0")
        != "0"
    ),
    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool(
        getenv(
            "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0"
        )
        != "0"
    ),
    "VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool(
        getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
    ),
    "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
        getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
    ),
    "VLLM_USE_TRITON_AWQ": lambda: bool(
        int(getenv("VLLM_USE_TRITON_AWQ", "0"))
    ),
    "VLLM_ALLOW_RUNTIME_LORA_UPDATING": lambda: lower()
    in ("1", "true"),
    "VLLM_SKIP_P2P_CHECK": lambda: getenv(
        "VLLM_SKIP_P2P_CHECK", "1"
    )
    == "1",
    "VLLM_DISABLED_KERNELS": lambda: []
    if "VLLM_DISABLED_KERNELS" not in environ
    else split(","),
    "VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION": lambda: lower()
    in ("true", "1"),
    "VLLM_USE_V1": lambda: bool(
        int(getenv("VLLM_USE_V1", "1"))
    ),
    "VLLM_ROCM_USE_AITER": lambda: lower() in ("true", "1"),
    "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_LINEAR": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_MOE": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_RMSNORM": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_MLA": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_MHA": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_TRITON_ROPE": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_FP8BMM": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_SKINNY_GEMM": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_FP8_PADDING": lambda: bool(
        int(getenv("VLLM_ROCM_FP8_PADDING", "1"))
    ),
    "VLLM_ROCM_MOE_PADDING": lambda: bool(
        int(getenv("VLLM_ROCM_MOE_PADDING", "1"))
    ),
    "VLLM_ROCM_CUSTOM_PAGED_ATTN": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": env_with_choices(
        "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION",
        "NONE",
        ["FP", "INT8", "INT6", "INT4", "NONE"],
    ),
    "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB": lambda: maybe_convert_int(
        get(
            "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None
        )
    ),
    "Q_SCALE_CONSTANT": lambda: int(
        getenv("Q_SCALE_CONSTANT", "200")
    ),
    "K_SCALE_CONSTANT": lambda: int(
        getenv("K_SCALE_CONSTANT", "200")
    ),
    "V_SCALE_CONSTANT": lambda: int(
        getenv("V_SCALE_CONSTANT", "100")
    ),
    "VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(
        int(getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))
    ),
    "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
        getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
    ),
    "VLLM_DISABLE_COMPILE_CACHE": lambda: bool(
        int(getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))
    ),
    "VLLM_SERVER_DEV_MODE": lambda: bool(
        int(getenv("VLLM_SERVER_DEV_MODE", "0"))
    ),
    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int(
        getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")
    ),
    "VLLM_MLA_DISABLE": lambda: bool(
        int(getenv("VLLM_MLA_DISABLE", "0"))
    ),
    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": lambda: int(
        getenv(
            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
            "32",
        )
    ),
    "VLLM_RAY_PER_WORKER_GPUS": lambda: float(
        getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")
    ),
    "VLLM_RAY_BUNDLE_INDICES": lambda: getenv(
        "VLLM_RAY_BUNDLE_INDICES", ""
    ),
    "VLLM_CUDART_SO_PATH": lambda: getenv(
        "VLLM_CUDART_SO_PATH", None
    ),
    "VLLM_DP_RANK": lambda: int(
        getenv("VLLM_DP_RANK", "0")
    ),
    "VLLM_DP_RANK_LOCAL": lambda: int(
        getenv("VLLM_DP_RANK_LOCAL", VLLM_DP_RANK)
    ),
    "VLLM_DP_SIZE": lambda: int(
        getenv("VLLM_DP_SIZE", "1")
    ),
    "VLLM_DP_MASTER_IP": lambda: getenv(
        "VLLM_DP_MASTER_IP", "127.0.0.1"
    ),
    "VLLM_DP_MASTER_PORT": lambda: int(
        getenv("VLLM_DP_MASTER_PORT", "0")
    ),
    "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(
        getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")
    ),
    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: get(
        "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
    )
    == "1",
    "VLLM_CI_USE_S3": lambda: get("VLLM_CI_USE_S3", "0")
    == "1",
    "VLLM_MODEL_REDIRECT_PATH": lambda: get(
        "VLLM_MODEL_REDIRECT_PATH", None
    ),
    "VLLM_MARLIN_USE_ATOMIC_ADD": lambda: get(
        "VLLM_MARLIN_USE_ATOMIC_ADD", "0"
    )
    == "1",
    "VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
        get("VLLM_MXFP4_USE_MARLIN", None)
    ),
    "VLLM_V0_USE_OUTLINES_CACHE": lambda: get(
        "VLLM_V0_USE_OUTLINES_CACHE", "0"
    )
    == "1",
    "VLLM_V1_USE_OUTLINES_CACHE": lambda: get(
        "VLLM_V1_USE_OUTLINES_CACHE", "0"
    )
    == "1",
    "VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
        environ["VLLM_TPU_BUCKET_PADDING_GAP"]
    )
    if "VLLM_TPU_BUCKET_PADDING_GAP" in environ
    else 0,
    "VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
        get("VLLM_TPU_MOST_MODEL_LEN", None)
    ),
    "VLLM_TPU_USING_PATHWAYS": lambda: bool(
        "proxy" in lower()
    ),
    "VLLM_USE_DEEP_GEMM": lambda: bool(
        int(getenv("VLLM_USE_DEEP_GEMM", "1"))
    ),
    "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
        int(getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
    ),
    "VLLM_USE_DEEP_GEMM_E8M0_HOPPER": lambda: bool(
        int(getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))
    ),
    "VLLM_SKIP_DEEP_GEMM_WARMUP": lambda: bool(
        int(getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))
    ),
    "VLLM_USE_FUSED_MOE_GROUPED_TOPK": lambda: bool(
        int(getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))
    ),
    "VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool(
        int(getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))
    ),
    "VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(
        int(getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))
    ),
    "VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(
        int(getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))
    ),
    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool(
        int(
            getenv(
                "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"
            )
        )
    ),
    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": lambda: bool(
        int(
            getenv(
                "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
                "0",
            )
        )
    ),
    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool(
        int(
            getenv(
                "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"
            )
        )
    ),
    "VLLM_XGRAMMAR_CACHE_MB": lambda: int(
        getenv("VLLM_XGRAMMAR_CACHE_MB", "512")
    ),
    "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(
        getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")
    ),
    "VLLM_ALLOW_INSECURE_SERIALIZATION": lambda: bool(
        int(
            getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0")
        )
    ),
    "VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: getenv(
        "VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"
    ),
    "VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(
        getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5600")
    ),
    "VLLM_ALL2ALL_BACKEND": env_with_choices(
        "VLLM_ALL2ALL_BACKEND",
        "allgather_reducescatter",
        [
            "naive",
            "pplx",
            "deepep_high_throughput",
            "deepep_low_latency",
            "allgather_reducescatter",
            "flashinfer_all2allv",
        ],
    ),
    "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
        "VLLM_FLASHINFER_MOE_BACKEND",
        "throughput",
        ["throughput", "latency"],
    ),
    "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(
        getenv(
            "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840"
        )
    ),
    "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": lambda: loads(
        getenv(
            "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB",
            "{}",
        )
    ),
    "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: lower(),
    "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(
        getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")
    ),
    "VLLM_SLEEP_WHEN_IDLE": lambda: bool(
        int(getenv("VLLM_SLEEP_WHEN_IDLE", "0"))
    ),
    "VLLM_MQ_MAX_CHUNK_BYTES_MB": lambda: int(
        getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")
    ),
    "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": lambda: int(
        getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")
    ),
    "VLLM_KV_CACHE_LAYOUT": env_with_choices(
        "VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"]
    ),
    "VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(
        int(getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))
    ),
    "VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool(
        int(getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))
    ),
    "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int(
        getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")
    ),
    "VLLM_USE_CUDNN_PREFILL": lambda: bool(
        int(getenv("VLLM_USE_CUDNN_PREFILL", "0"))
    ),
    "VLLM_USE_TRTLLM_ATTENTION": lambda: None
    if "VLLM_USE_TRTLLM_ATTENTION" not in environ
    else lower() in ("1", "true"),
    "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": lambda: bool(
        int(
            getenv(
                "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
                "0",
            )
        )
    ),
    "VLLM_HAS_FLASHINFER_CUBIN": lambda: getenv(
        "VLLM_HAS_FLASHINFER_CUBIN", False
    ),
    "VLLM_USE_TRTLLM_FP4_GEMM": lambda: bool(
        int(getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))
    ),
    "VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool(
        int(getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))
    ),
    "VLLM_DISABLE_PAD_FOR_CUDAGRAPH": lambda: bool(
        int(getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0"))
    ),
    "VLLM_LOOPBACK_IP": lambda: getenv(
        "VLLM_LOOPBACK_IP", ""
    ),
    "VLLM_PROCESS_NAME_PREFIX": lambda: getenv(
        "VLLM_PROCESS_NAME_PREFIX", "VLLM"
    ),
    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool(
        int(
            getenv(
                "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE",
                "0",
            )
        )
    ),
    "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(
        int(getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))
    ),
    "VLLM_ROCM_FP8_MFMA_PAGE_ATTN": lambda: bool(
        int(getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0"))
    ),
    "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
        int(getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))
    ),
    "VLLM_TUNED_CONFIG_FOLDER": lambda: getenv(
        "VLLM_TUNED_CONFIG_FOLDER", None
    ),
    "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": lambda: bool(
        int(
            getenv(
                "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS",
                "0",
            )
        )
    ),
    "VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(
        int(getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))
    ),
    "VLLM_NVTX_SCOPES_FOR_PROFILING": lambda: bool(
        int(getenv("VLLM_NVTX_SCOPES_FOR_PROFILING", "0"))
    ),
    "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": lambda: bool(
        int(
            getenv(
                "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"
            )
        )
    ),
    "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME": lambda: getenv(
        "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME",
        "VLLM_OBJECT_STORAGE_SHM_BUFFER",
    ),
    "VLLM_DEEPEP_BUFFER_SIZE_MB": lambda: int(
        getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")
    ),
    "VLLM_DBO_COMM_SMS": lambda: int(
        getenv("VLLM_DBO_COMM_SMS", "20")
    ),
    "GPT_OSS_SYSTEM_TOOL_MCP_LABELS": env_list_with_choices(
        "GPT_OSS_SYSTEM_TOOL_MCP_LABELS",
        [],
        [
            "container",
            "code_interpreter",
            "web_search_preview",
        ],
    ),
    "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE": lambda: bool(
        int(
            getenv("VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", "1")
        )
    ),
    "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING": lambda: bool(
        int(
            getenv(
                "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING",
                "1",
            )
        )
    ),
    "VLLM_USE_NCCL_SYMM_MEM": lambda: bool(
        int(getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))
    ),
    "VLLM_NCCL_INCLUDE_PATH": lambda: get(
        "VLLM_NCCL_INCLUDE_PATH", None
    ),
    "VLLM_USE_FBGEMM": lambda: bool(
        int(getenv("VLLM_USE_FBGEMM", "0"))
    ),
    "VLLM_GC_DEBUG": lambda: getenv("VLLM_GC_DEBUG", ""),
}

__dir__

__dir__()
Source code in vllm/envs.py
def __dir__():
    """Expose the lazily-evaluated env-var names to dir() and completion."""
    # Iterating the dict yields its keys, so this equals list(d.keys()).
    return [*environment_variables]

__getattr__

__getattr__(name: str)
Source code in vllm/envs.py
def __getattr__(name: str):
    """Resolve a module attribute by lazily evaluating its env-var lambda."""
    try:
        factory = environment_variables[name]
    except KeyError:
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}") from None
    return factory()

compute_hash

compute_hash() -> str

WARNING: Whenever a new key is added to these environment variables, ensure that it is included in the factors list if it affects the computation graph. For example, different values of VLLM_PP_LAYER_PARTITION will generate different computation graphs, so it is included in the factors list. The env vars that affect the choice of different kernels or attention backends should also be included in the factors list.

Source code in vllm/envs.py
def compute_hash() -> str:
    """
    WARNING: Whenever a new key is added to these environment
    variables, ensure that it is included in the factors list if
    it affects the computation graph. For example, different values
    of VLLM_PP_LAYER_PARTITION will generate different computation
    graphs, so it is included in the factors list. The env vars that
    affect the choice of different kernels or attention backends should
    also be included in the factors list.
    """

    # The values of envs may affect the computation graph.
    # TODO(DefTruth): hash all environment variables?
    # for key in environment_variables:
    #     factorize(key)
    environment_variables_to_hash = [
        "VLLM_PP_LAYER_PARTITION",
        "VLLM_MLA_DISABLE",
        "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
        "VLLM_USE_TRITON_FLASH_ATTN",
        "VLLM_USE_TRITON_AWQ",
        "VLLM_DP_RANK",
        "VLLM_DP_SIZE",
        "VLLM_USE_STANDALONE_COMPILE",
        "VLLM_FUSED_MOE_CHUNK_SIZE",
        "VLLM_FLASHINFER_MOE_BACKEND",
        "VLLM_V1_USE_PREFILL_DECODE_ATTENTION",
        "VLLM_USE_AITER_UNIFIED_ATTENTION",
        "VLLM_ATTENTION_BACKEND",
        "VLLM_USE_FLASHINFER_SAMPLER",
        "VLLM_DISABLED_KERNELS",
        "VLLM_USE_DEEP_GEMM",
        "VLLM_USE_DEEP_GEMM_E8M0",
        "VLLM_USE_DEEP_GEMM_E8M0_HOPPER",
        "VLLM_USE_TRTLLM_FP4_GEMM",
        "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
        "VLLM_USE_FLASHINFER_MOE_FP16",
        "VLLM_USE_FLASHINFER_MOE_FP8",
        "VLLM_USE_FLASHINFER_MOE_FP4",
        "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8",
        "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
        "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16",
        "VLLM_USE_CUDNN_PREFILL",
        "VLLM_USE_TRTLLM_ATTENTION",
        "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
        "VLLM_ROCM_USE_AITER",
        "VLLM_ROCM_USE_AITER_PAGED_ATTN",
        "VLLM_ROCM_USE_AITER_LINEAR",
        "VLLM_ROCM_USE_AITER_MOE",
        "VLLM_ROCM_USE_AITER_RMSNORM",
        "VLLM_ROCM_USE_AITER_MLA",
        "VLLM_ROCM_USE_AITER_MHA",
        "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM",
        "VLLM_ROCM_USE_TRITON_ROPE",
        "VLLM_ROCM_USE_AITER_FP8BMM",
        "VLLM_ROCM_USE_SKINNY_GEMM",
        "VLLM_ROCM_FP8_PADDING",
        "VLLM_ROCM_MOE_PADDING",
        "VLLM_ROCM_CUSTOM_PAGED_ATTN",
        "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION",
        "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16",
        "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB",
        "VLLM_ROCM_FP8_MFMA_PAGE_ATTN",
        "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE",
        "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING",
        "VLLM_USE_FBGEMM",
    ]
    # If this goes out of sync with environment_variables, it's not a
    # user error, it's a bug.  Collect all stale keys first so the
    # failure message pinpoints exactly what needs updating.
    missing = [
        key for key in environment_variables_to_hash
        if key not in environment_variables
    ]
    assert not missing, (
        "Please update environment_variables_to_hash in envs.py; "
        f"unknown keys: {missing}")

    factors = [
        environment_variables[key]() for key in environment_variables_to_hash
    ]

    # md5 is acceptable here: the digest keys caches, it is not a
    # security boundary.
    hash_str = hashlib.md5(str(factors).encode(),
                           usedforsecurity=False).hexdigest()

    return hash_str

env_list_with_choices

env_list_with_choices(
    env_name: str,
    default: list[str],
    choices: Union[list[str], Callable[[], list[str]]],
    case_sensitive: bool = True,
) -> Callable[[], list[str]]

Create a lambda that validates an environment variable containing comma-separated values against allowed choices

Parameters:

Name Type Description Default
env_name str

Name of the environment variable

required
default list[str]

Default list of values if not set

required
choices Union[list[str], Callable[[], list[str]]]

List of valid string options or callable that returns list

required
case_sensitive bool

Whether validation should be case sensitive

True

Returns:

Type Description
Callable[[], list[str]]

Lambda function for environment_variables

Callable[[], list[str]]

dict that returns list of strings

Source code in vllm/envs.py
def env_list_with_choices(
        env_name: str,
        default: list[str],
        choices: Union[list[str], Callable[[], list[str]]],
        case_sensitive: bool = True) -> Callable[[], list[str]]:
    """
    Create a lambda that validates an environment variable
    containing comma-separated values against allowed choices

    Args:
        env_name: Name of the environment variable
        default: Default list of values if not set
        choices: List of valid string options or callable that returns list
        case_sensitive: Whether validation should be case sensitive

    Returns:
        Lambda function for environment_variables
        dict that returns list of strings
    """

    def _get_validated_env_list() -> list[str]:
        value = os.getenv(env_name)
        if value is None:
            return default

        # Split comma-separated values and strip whitespace
        values = [v.strip() for v in value.split(",") if v.strip()]

        if not values:
            return default

        # Resolve choices if it's a callable (for lazy loading)
        actual_choices = choices() if callable(choices) else choices

        # Normalize the allowed set once, outside the loop (previously the
        # lowered choices list was rebuilt for every value), and use a set
        # for O(1) membership tests.
        if case_sensitive:
            allowed = set(actual_choices)
        else:
            allowed = {choice.lower() for choice in actual_choices}

        for val in values:
            check_value = val if case_sensitive else val.lower()
            if check_value not in allowed:
                raise ValueError(f"Invalid value '{val}' in {env_name}. "
                                 f"Valid options: {actual_choices}.")

        return values

    return _get_validated_env_list

env_with_choices

env_with_choices(
    env_name: str,
    default: Optional[str],
    choices: Union[list[str], Callable[[], list[str]]],
    case_sensitive: bool = True,
) -> Callable[[], Optional[str]]

Create a lambda that validates an environment variable against allowed choices

Parameters:

Name Type Description Default
env_name str

Name of the environment variable

required
default Optional[str]

Default value if not set (can be None)

required
choices Union[list[str], Callable[[], list[str]]]

List of valid string options or callable that returns list

required
case_sensitive bool

Whether validation should be case sensitive

True

Returns:

Type Description
Callable[[], Optional[str]]

Lambda function for environment_variables dict

Source code in vllm/envs.py
def env_with_choices(
        env_name: str,
        default: Optional[str],
        choices: Union[list[str], Callable[[], list[str]]],
        case_sensitive: bool = True) -> Callable[[], Optional[str]]:
    """
    Create a lambda that validates an environment variable against
    allowed choices

    Args:
        env_name: Name of the environment variable
        default: Default value if not set (can be None)
        choices: List of valid string options or callable that returns list
        case_sensitive: Whether validation should be case sensitive

    Returns:
        Lambda function for environment_variables dict
    """

    def _get_validated_env() -> Optional[str]:
        raw = os.getenv(env_name)
        if raw is None:
            return default

        # choices may be a callable so expensive lists can load lazily.
        valid = choices() if callable(choices) else choices

        if case_sensitive:
            matched = raw in valid
        else:
            matched = raw.lower() in (option.lower() for option in valid)

        if not matched:
            raise ValueError(f"Invalid value '{raw}' for {env_name}. "
                             f"Valid options: {valid}.")

        return raw

    return _get_validated_env

get_default_cache_root

get_default_cache_root()
Source code in vllm/envs.py
def get_default_cache_root():
    """Return $XDG_CACHE_HOME, falling back to the conventional ~/.cache."""
    fallback = os.path.join(os.path.expanduser("~"), ".cache")
    return os.getenv("XDG_CACHE_HOME", fallback)

get_default_config_root

get_default_config_root()
Source code in vllm/envs.py
def get_default_config_root():
    """Return $XDG_CONFIG_HOME, falling back to the conventional ~/.config."""
    fallback = os.path.join(os.path.expanduser("~"), ".config")
    return os.getenv("XDG_CONFIG_HOME", fallback)

get_vllm_port

get_vllm_port() -> Optional[int]

Get the port from VLLM_PORT environment variable.

Returns:

Type Description
Optional[int]

The port number as an integer if VLLM_PORT is set, None otherwise.

Raises:

Type Description
ValueError

If VLLM_PORT is a URI, suggest k8s service discovery issue.

Source code in vllm/envs.py
def get_vllm_port() -> Optional[int]:
    """Get the port from VLLM_PORT environment variable.

    Returns:
        The port number as an integer if VLLM_PORT is set, None otherwise.

    Raises:
        ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue.
    """
    raw_port = os.environ.get('VLLM_PORT')
    if raw_port is None:
        return None

    try:
        return int(raw_port)
    except ValueError as err:
        from urllib.parse import urlparse
        # A URI scheme (e.g. "tcp://...") suggests Kubernetes injected a
        # service URI into the variable instead of a bare port number.
        if urlparse(raw_port).scheme:
            raise ValueError(
                f"VLLM_PORT '{raw_port}' appears to be a URI. "
                "This may be caused by a Kubernetes service discovery issue,"
                "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
            ) from None
        raise ValueError(
            f"VLLM_PORT '{raw_port}' must be a valid integer") from err

is_set

is_set(name: str)

Check if an environment variable is explicitly set.

Source code in vllm/envs.py
def is_set(name: str):
    """Check if an environment variable is explicitly set."""
    # Guard clause: only names registered in environment_variables are valid.
    if name not in environment_variables:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return name in os.environ

maybe_convert_bool

maybe_convert_bool(value: Optional[str]) -> Optional[bool]
Source code in vllm/envs.py
def maybe_convert_bool(value: Optional[str]) -> Optional[bool]:
    """Parse *value* as an int-style boolean ("0"/"1"), passing None through."""
    return None if value is None else bool(int(value))

maybe_convert_int

maybe_convert_int(value: Optional[str]) -> Optional[int]
Source code in vllm/envs.py
def maybe_convert_int(value: Optional[str]) -> Optional[int]:
    """Parse *value* as an int, passing None through unchanged."""
    return None if value is None else int(value)

set_vllm_use_v1

set_vllm_use_v1(use_v1: bool)
Source code in vllm/envs.py
def set_vllm_use_v1(use_v1: bool):
    """Programmatically select the V1 engine unless the user already chose."""
    # Refuse to override an explicit user setting -- that would silently
    # ignore their configuration.
    if is_set("VLLM_USE_V1"):
        raise ValueError(
            "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
            "explicitly by the user. Please raise this as a Github "
            "Issue and explicitly set VLLM_USE_V1=0 or 1.")
    os.environ["VLLM_USE_V1"] = str(int(use_v1))