vllm.transformers_utils.configs.radio

Radio vision model configuration

OPENAI_CLIP_MEAN module-attribute

OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)

OPENAI_CLIP_STD module-attribute

OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
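These are the standard OpenAI CLIP per-channel statistics used for image normalization. A minimal preprocessing sketch (torch usage here is an assumption for illustration; this module only defines the constants, the actual preprocessing lives elsewhere):

import torch

# Assumes a float RGB tensor in [0, 1] with shape [3, H, W].
mean = torch.tensor(OPENAI_CLIP_MEAN).view(3, 1, 1)
std = torch.tensor(OPENAI_CLIP_STD).view(3, 1, 1)

image = torch.rand(3, 224, 224)    # placeholder input
normalized = (image - mean) / std  # per-channel CLIP normalization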

VIT_TIMM_DIM_BY_NAME module-attribute

VIT_TIMM_DIM_BY_NAME: dict[
    str, tuple[int, int, int, int]
] = {
    "vit_small_patch16_224": (384, 12, 6, 1536),
    "vit_base_patch16_224": (768, 12, 12, 3072),
    "vit_large_patch16_224": (1024, 24, 16, 4096),
    "vit_huge_patch16_224": (1280, 32, 16, 5120),
}
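Each value is the (hidden_size, num_hidden_layers, num_attention_heads, intermediate_size) tuple for the corresponding timm ViT variant, which RadioConfig unpacks in its constructor. A minimal lookup sketch (the variant name is an illustrative choice):

# Unpack the architecture dimensions the same way RadioConfig.__init__ does.
hidden_size, num_hidden_layers, num_attention_heads, intermediate_size = (
    VIT_TIMM_DIM_BY_NAME["vit_large_patch16_224"]
)
# hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, intermediate_size=4096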

logger module-attribute

logger = get_logger(__name__)

RadioConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a Radio vision model. It is used to instantiate a Radio model according to the specified arguments, defining the model architecture.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model_name | str | Name of the vision transformer model (e.g., "vit_base_patch16_224"). Used to determine architecture dimensions from VIT_TIMM_DIM_BY_NAME. | required |
| image_size | int | The size (resolution) of each image. | 224 |
| patch_size | int | The size (resolution) of each patch. | 16 |
| qkv_bias | bool | Whether to add a bias to the queries, keys, and values. | True |
| qk_normalization | bool | Whether to apply normalization to queries and keys. | False |
| norm_type | str | The normalization type to use. | 'layer_norm' |
| layer_norm_eps | float | The epsilon used by the layer normalization layers. | 1e-06 |
| initializer_factor | float | A factor for initializing all weight matrices. | 1.0 |
| hidden_act | str | The non-linear activation function in the encoder. | 'gelu' |
| max_img_size | int | Maximum image size for position embeddings. | 2048 |
| norm_mean | Union[tuple[float, float, float], list] | Mean values for image normalization (RGB channels). Defaults to (0.48145466, 0.4578275, 0.40821073). | OPENAI_CLIP_MEAN |
| norm_std | Union[tuple[float, float, float], list] | Standard deviation values for image normalization (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711). | OPENAI_CLIP_STD |
| reg_tokens | Optional[int] | Number of register tokens to use. | None |
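A minimal usage sketch (the model name and register-token count are illustrative choices, not values taken from any particular checkpoint); note that the architecture dimensions are derived from VIT_TIMM_DIM_BY_NAME rather than passed explicitly:

config = RadioConfig(
    model_name="vit_base_patch16_224",  # must be a key of VIT_TIMM_DIM_BY_NAME
    image_size=224,
    patch_size=16,
    reg_tokens=4,  # illustrative value
)

# Dimensions derived from the "vit_base_patch16_224" entry.
assert config.hidden_size == 768
assert config.num_hidden_layers == 12
assert config.num_attention_heads == 12
assert config.intermediate_size == 3072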
Source code in vllm/transformers_utils/configs/radio.py
class RadioConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a Radio
    vision model. It is used to instantiate a Radio model according to the
    specified arguments, defining the model architecture.

    Args:
        model_name: Name of the vision transformer model
            (e.g., "vit_base_patch16_224"). Used to determine architecture
            dimensions from `VIT_TIMM_DIM_BY_NAME`.
        image_size: The size (resolution) of each image.
        patch_size: The size (resolution) of each patch.
        qkv_bias: Whether to add a bias to the queries, keys and values.
        qk_normalization: Whether to apply normalization to queries and keys.
        norm_type: The normalization type to use.
        layer_norm_eps: The epsilon used by the layer normalization layers.
        initializer_factor: A factor for initializing all weight matrices.
        hidden_act: The non-linear activation function in the encoder.
        max_img_size: Maximum image size for position embeddings.
        norm_mean: Mean values for image normalization (RGB channels).
            Defaults to (0.48145466, 0.4578275, 0.40821073).
        norm_std: Standard deviation values for image normalization
            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711).
        reg_tokens: Number of register tokens to use.
    """

    model_type = "radio"

    def __init__(
        self,
        model_name: str,
        image_size: int = 224,
        patch_size: int = 16,
        qkv_bias: bool = True,
        qk_normalization: bool = False,
        norm_type: str = "layer_norm",
        layer_norm_eps: float = 1e-6,
        initializer_factor: float = 1.0,
        hidden_act: str = "gelu",
        max_img_size: int = 2048,
        norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
        norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
        reg_tokens: Optional[int] = None,
        **kwargs,
    ):
        self.model_name = model_name
        (
            self.hidden_size,
            self.num_hidden_layers,
            self.num_attention_heads,
            self.intermediate_size,
        ) = VIT_TIMM_DIM_BY_NAME[model_name]
        self.image_size = image_size
        self.patch_size = patch_size
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.norm_type = norm_type
        self.layer_norm_eps = layer_norm_eps
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act
        self.max_img_size = max_img_size
        self.norm_mean = (
            list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
        )
        self.norm_std = (
            list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
        )
        self.reg_tokens = reg_tokens
        super().__init__(**kwargs)

hidden_act instance-attribute

hidden_act = hidden_act

image_size instance-attribute

image_size = image_size

initializer_factor instance-attribute

initializer_factor = initializer_factor

layer_norm_eps instance-attribute

layer_norm_eps = layer_norm_eps

max_img_size instance-attribute

max_img_size = max_img_size

model_name instance-attribute

model_name = model_name

model_type class-attribute instance-attribute

model_type = 'radio'

norm_mean instance-attribute

norm_mean = (
    list(norm_mean)
    if isinstance(norm_mean, (tuple, list))
    else norm_mean
)

norm_std instance-attribute

norm_std = (
    list(norm_std)
    if isinstance(norm_std, (tuple, list))
    else norm_std
)

norm_type instance-attribute

norm_type = norm_type

patch_size instance-attribute

patch_size = patch_size

qk_normalization instance-attribute

qk_normalization = qk_normalization

qkv_bias instance-attribute

qkv_bias = qkv_bias

reg_tokens instance-attribute

reg_tokens = reg_tokens

__init__

__init__(
    model_name: str,
    image_size: int = 224,
    patch_size: int = 16,
    qkv_bias: bool = True,
    qk_normalization: bool = False,
    norm_type: str = "layer_norm",
    layer_norm_eps: float = 1e-06,
    initializer_factor: float = 1.0,
    hidden_act: str = "gelu",
    max_img_size: int = 2048,
    norm_mean: Union[
        tuple[float, float, float], list
    ] = OPENAI_CLIP_MEAN,
    norm_std: Union[
        tuple[float, float, float], list
    ] = OPENAI_CLIP_STD,
    reg_tokens: Optional[int] = None,
    **kwargs,
)
Source code in vllm/transformers_utils/configs/radio.py
def __init__(
    self,
    model_name: str,
    image_size: int = 224,
    patch_size: int = 16,
    qkv_bias: bool = True,
    qk_normalization: bool = False,
    norm_type: str = "layer_norm",
    layer_norm_eps: float = 1e-6,
    initializer_factor: float = 1.0,
    hidden_act: str = "gelu",
    max_img_size: int = 2048,
    norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
    norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
    reg_tokens: Optional[int] = None,
    **kwargs,
):
    self.model_name = model_name
    (
        self.hidden_size,
        self.num_hidden_layers,
        self.num_attention_heads,
        self.intermediate_size,
    ) = VIT_TIMM_DIM_BY_NAME[model_name]
    self.image_size = image_size
    self.patch_size = patch_size
    self.qkv_bias = qkv_bias
    self.qk_normalization = qk_normalization
    self.norm_type = norm_type
    self.layer_norm_eps = layer_norm_eps
    self.initializer_factor = initializer_factor
    self.hidden_act = hidden_act
    self.max_img_size = max_img_size
    self.norm_mean = (
        list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
    )
    self.norm_std = (
        list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
    )
    self.reg_tokens = reg_tokens
    super().__init__(**kwargs)
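Because RadioConfig subclasses PretrainedConfig, the standard Hugging Face serialization helpers are available. A brief round-trip sketch (the variant name is an illustrative choice):

config = RadioConfig(model_name="vit_huge_patch16_224")
config_dict = config.to_dict()                 # includes the derived dimensions
restored = RadioConfig.from_dict(config_dict)  # inherited from PretrainedConfig
assert restored.hidden_size == 1280            # from the "vit_huge_patch16_224" entry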