Skip to content

vLLM

vllm.model_executor.layers.quantization.utils.mxfp4_utils

vllm.model_executor.layers.quantization.utils.mxfp4_utils

OCP_MX_BLOCK_SIZE `module-attribute` ¶

OCP_MX_BLOCK_SIZE = 32

dequant_mxfp4 `module-attribute` ¶

dequant_mxfp4 = dequant_mxfp4

quant_dequant_mxfp4 `module-attribute` ¶

quant_dequant_mxfp4 = quant_dequant_mxfp4

_dequant_mxfp4 ¶

_dequant_mxfp4(
    x: Tensor, scale: Tensor, float_dtype: dtype
) -> Tensor

Source code in vllm/model_executor/layers/quantization/utils/mxfp4_utils.py

def _dequant_mxfp4(x: torch.Tensor, scale: torch.Tensor,
                   float_dtype: torch.dtype) -> torch.Tensor:
    try:
        from quark.torch.kernel import mx
    except ImportError as err:
        raise ImportError("The package `amd-quark` is required to use "
                          "MX-FP4 models. Please install it with `pip install "
                          "amd-quark`.") from err

    return mx.dq_mxfp4(x, scale, float_dtype)

_dequant_mxfp4_fake ¶

_dequant_mxfp4_fake(
    x: Tensor, scale: Tensor, float_dtype: dtype
) -> Tensor

Source code in vllm/model_executor/layers/quantization/utils/mxfp4_utils.py

def _dequant_mxfp4_fake(x: torch.Tensor, scale: torch.Tensor,
                        float_dtype: torch.dtype) -> torch.Tensor:
    return torch.empty((*x.shape[:-1], x.shape[-1] * 2),
                       dtype=float_dtype,
                       device=x.device)

_quant_dequant_mxfp4 ¶

_quant_dequant_mxfp4(
    x: Tensor, scale_calculation_mode: str = "even"
) -> Tensor

Source code in vllm/model_executor/layers/quantization/utils/mxfp4_utils.py

def _quant_dequant_mxfp4(x: torch.Tensor,
                         scale_calculation_mode: str = "even") -> torch.Tensor:
    try:
        from quark.torch.kernel import mx
    except ImportError as err:
        raise ImportError("The package `amd-quark` is required to use "
                          "MX-FP4 models. Please install it with `pip install "
                          "amd-quark`.") from err

    return mx.qdq_mxfp4(x, scale_calculation_mode)

_quant_dequant_mxfp4_fake ¶

_quant_dequant_mxfp4_fake(
    x: Tensor, scale_calculation_mode: str = "even"
) -> Tensor

Source code in vllm/model_executor/layers/quantization/utils/mxfp4_utils.py

def _quant_dequant_mxfp4_fake(x: torch.Tensor,
                              scale_calculation_mode: str = "even"
                              ) -> torch.Tensor:
    return torch.empty_like(x)