From 4b1bec587efae004d9e6772baa085e2c14db902d Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 00:38:12 -0800 Subject: [PATCH 1/7] fix a nvfp4 quantization amax attribute error Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/quant_utils.py | 20 ++++++++++++--- .../quantization/qtensor/nvfp4_tensor.py | 25 +++++++++++++++---- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 198dacc5f..7e342d845 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -282,10 +282,15 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8: # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6. # This is because the kernel dequantizes weight to fp8, which is in range 448. - weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: + weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 + else: + # Compute from weight if amax not set + from ..utils import reduce_amax + weight_scaling_factor_2 = reduce_amax(weight).float() / 448.0 else: weight_scaling_factor_2 = NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer( - weight_quantizer + weight_quantizer, weight ) return NVFP4QTensor.get_weights_scaling_factor( weight, @@ -312,11 +317,18 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, ]: - return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer) + weight = getattr(module, weight_name) + return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer, weight) elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8: # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6. # This is because the kernel dequantizes weight to fp8, which is in range 448. - return weight_quantizer._amax.float() / 448.0 + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: + return weight_quantizer._amax.float() / 448.0 + else: + # Compute from weight if amax not set + from ..quantization.utils import reduce_amax + weight = getattr(module, weight_name) + return reduce_amax(weight).float() / 448.0 # SequentialQuantizer is required if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled: diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index 2ff1b17e9..60ec265b9 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -53,11 +53,26 @@ def get_e2m1_bounds(cls, device): return cls.e2m1_bounds_on_device[device] @classmethod - def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer): - """Returns per tensor weight scaling factor from the weight_quantizer amax.""" - # Assert that weight_quantizer has attribute amax - assert hasattr(weight_quantizer, "_amax"), "Weight quantizer does not have attribute amax" - return weight_quantizer._amax.float() / (6.0 * 448.0) + def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer, weight=None): + """Returns per tensor weight scaling factor from the weight_quantizer amax. 
+ + Args: + weight_quantizer: The weight quantizer module + weight: Optional weight tensor to compute amax from if not set on quantizer + """ + # Check if weight_quantizer has amax attribute and it's not None + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: + return weight_quantizer._amax.float() / (6.0 * 448.0) + + # Fallback: compute amax from weight if provided + if weight is not None: + return cls.get_weights_scaling_factor_2(weight) + + # If neither amax nor weight is available, raise an error + raise ValueError( + "Weight quantizer does not have attribute amax and no weight tensor provided. " + "Cannot compute scaling factor." + ) @classmethod def get_weights_scaling_factor( From 928364987a6aea1ea6e76402b5026d2a7e4986dc Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 00:38:41 -0800 Subject: [PATCH 2/7] fix a nvfp4 quantization amax attribute error Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/quant_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 7e342d845..ef0f536ae 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -287,6 +287,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> else: # Compute from weight if amax not set from ..utils import reduce_amax + weight_scaling_factor_2 = reduce_amax(weight).float() / 448.0 else: weight_scaling_factor_2 = NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer( @@ -327,6 +328,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") else: # Compute from weight if amax not set from ..quantization.utils import reduce_amax + weight = getattr(module, weight_name) return reduce_amax(weight).float() / 448.0 From 97af258ab73f72f8cd00b8528dbeba1048366a6a Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 00:42:55 -0800 Subject: [PATCH 3/7] fix a nvfp4 quantization amax attribute error Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/quant_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index ef0f536ae..95addea11 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -286,7 +286,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 else: # Compute from weight if amax not set - from ..utils import reduce_amax + from ..quantization.utils import reduce_amax weight_scaling_factor_2 = reduce_amax(weight).float() / 448.0 else: From 1045532701154ea95ad721d664d083690254e153 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 15 Jan 2026 11:17:06 -0800 Subject: [PATCH 4/7] address reviews Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/quant_utils.py | 48 ++++++++++++------- .../quantization/qtensor/nvfp4_tensor.py | 25 ++-------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 95addea11..6846c8195 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -236,6 +236,25 @@ def get_scaling_factor(quantizer: TensorQuantizer) -> torch.Tensor: return scaling_factor +def _ensure_weight_quantizer_calibrated( + weight_quantizer: TensorQuantizer, weight: torch.Tensor +) -> None: + """Calibrate weight quantizer if amax 
is not set. + + This is a lazy calibration pattern used during export when weight quantizers + may not have been calibrated during the main calibration phase. + + Args: + weight_quantizer: The weight quantizer to calibrate + weight: The weight tensor to use for calibration + """ + if not hasattr(weight_quantizer, "_amax") or weight_quantizer._amax is None: + weight_quantizer.reset_amax() + enable_stats_collection(weight_quantizer) + weight_quantizer(weight) + finish_stats_collection(weight_quantizer) + + def get_activation_scaling_factor( module: nn.Module, input_quantizer_name: str = "input_quantizer" ) -> torch.Tensor: @@ -279,19 +298,16 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> QUANTIZATION_NVFP4_SVDQUANT, QUANTIZATION_W4A8_NVFP4_FP8, ]: + # Calibrate weight quantizer if amax is not set + _ensure_weight_quantizer_calibrated(weight_quantizer, weight) + if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8: # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6. # This is because the kernel dequantizes weight to fp8, which is in range 448. - if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: - weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 - else: - # Compute from weight if amax not set - from ..quantization.utils import reduce_amax - - weight_scaling_factor_2 = reduce_amax(weight).float() / 448.0 + weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 else: weight_scaling_factor_2 = NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer( - weight_quantizer, weight + weight_quantizer ) return NVFP4QTensor.get_weights_scaling_factor( weight, @@ -318,19 +334,17 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, ]: + # Calibrate weight quantizer if amax is not set weight = getattr(module, weight_name) - return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer, weight) + _ensure_weight_quantizer_calibrated(weight_quantizer, weight) + return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer) elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8: # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6. # This is because the kernel dequantizes weight to fp8, which is in range 448. 
- if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: - return weight_quantizer._amax.float() / 448.0 - else: - # Compute from weight if amax not set - from ..quantization.utils import reduce_amax - - weight = getattr(module, weight_name) - return reduce_amax(weight).float() / 448.0 + # Calibrate weight quantizer if amax is not set + weight = getattr(module, weight_name) + _ensure_weight_quantizer_calibrated(weight_quantizer, weight) + return weight_quantizer._amax.float() / 448.0 # SequentialQuantizer is required if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled: diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index 60ec265b9..2ff1b17e9 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -53,26 +53,11 @@ def get_e2m1_bounds(cls, device): return cls.e2m1_bounds_on_device[device] @classmethod - def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer, weight=None): - """Returns per tensor weight scaling factor from the weight_quantizer amax. - - Args: - weight_quantizer: The weight quantizer module - weight: Optional weight tensor to compute amax from if not set on quantizer - """ - # Check if weight_quantizer has amax attribute and it's not None - if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: - return weight_quantizer._amax.float() / (6.0 * 448.0) - - # Fallback: compute amax from weight if provided - if weight is not None: - return cls.get_weights_scaling_factor_2(weight) - - # If neither amax nor weight is available, raise an error - raise ValueError( - "Weight quantizer does not have attribute amax and no weight tensor provided. " - "Cannot compute scaling factor." 
-        )
+    def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer):
+        """Returns per tensor weight scaling factor from the weight_quantizer amax."""
+        # Assert that weight_quantizer has attribute amax
+        assert hasattr(weight_quantizer, "_amax"), "Weight quantizer does not have attribute amax"
+        return weight_quantizer._amax.float() / (6.0 * 448.0)
 
     @classmethod
     def get_weights_scaling_factor(

From 4d54f55f8406781df4f24ef9004e9bb6b791418b Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Wed, 21 Jan 2026 18:25:01 -0800
Subject: [PATCH 5/7] address reviews

Signed-off-by: Zhiyu Cheng
---
 modelopt/torch/export/quant_utils.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 6846c8195..43e197a5f 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -329,21 +329,23 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
     if weight_quantizer is None:
         return None
 
-    if get_quantization_format(module) in [
+    quantization_format = get_quantization_format(module)
+
+    # For all NVFP4 variants, calibrate the weight quantizer if amax is not set
+    if quantization_format in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
+        QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
-        # Calibrate weight quantizer if amax is not set
         weight = getattr(module, weight_name)
         _ensure_weight_quantizer_calibrated(weight_quantizer, weight)
+
+    if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
-    elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8:
+    elif quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
         # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6.
         # This is because the kernel dequantizes weight to fp8, which is in range 448.
-        # Calibrate weight quantizer if amax is not set
-        weight = getattr(module, weight_name)
-        _ensure_weight_quantizer_calibrated(weight_quantizer, weight)
         return weight_quantizer._amax.float() / 448.0
 
 # SequentialQuantizer is required

From eee6d2fcb3881132ff1be2f8a2e6a7207aba4801 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Thu, 22 Jan 2026 17:47:06 -0800
Subject: [PATCH 6/7] address reviews

Signed-off-by: Zhiyu Cheng
---
 modelopt/torch/export/quant_utils.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 43e197a5f..793e84e28 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -237,7 +237,7 @@ def get_scaling_factor(quantizer: TensorQuantizer) -> torch.Tensor:
 
 
 def _ensure_weight_quantizer_calibrated(
-    weight_quantizer: TensorQuantizer, weight: torch.Tensor
+    weight_quantizer: TensorQuantizer, weight: torch.Tensor, module_name: str = ""
 ) -> None:
     """Calibrate weight quantizer if amax is not set.
 
@@ -247,8 +247,14 @@ def _ensure_weight_quantizer_calibrated(
     Args:
         weight_quantizer: The weight quantizer to calibrate
         weight: The weight tensor to use for calibration
+        module_name: Optional module name for better warning messages
     """
     if not hasattr(weight_quantizer, "_amax") or weight_quantizer._amax is None:
+        warn(
+            f"Weight quantizer{f' for {module_name}' if module_name else ''} was not calibrated. "
+            f"Computing amax from weights. 
This may occur if "
+            f"some experts were not activated during calibration (expected for MoE models); try increasing --calib_size"
+        )
         weight_quantizer.reset_amax()
         enable_stats_collection(weight_quantizer)
         weight_quantizer(weight)
@@ -299,7 +305,8 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         # Calibrate weight quantizer if amax is not set
-        _ensure_weight_quantizer_calibrated(weight_quantizer, weight)
+        module_name = f"{type(module).__name__}.{weight_name}"
+        _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
 
         if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
             # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6.
@@ -339,7 +346,8 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         weight = getattr(module, weight_name)
-        _ensure_weight_quantizer_calibrated(weight_quantizer, weight)
+        module_name = f"{type(module).__name__}.{weight_name}"
+        _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
 
     if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)

From 8c0eb8fa7a3970afd183e48bbd02bec7fd052dd6 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Thu, 22 Jan 2026 22:38:33 -0800
Subject: [PATCH 7/7] fix ci

Signed-off-by: Zhiyu Cheng
---
 modelopt/torch/export/quant_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 793e84e28..5417b7fa2 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -349,7 +349,11 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
         module_name = f"{type(module).__name__}.{weight_name}"
         _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
 
-    if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
+    if quantization_format in [
+        QUANTIZATION_NVFP4,
+        QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_SVDQUANT,
+    ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
     elif quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
         # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6.
         # This is because the kernel dequantizes weight to fp8, which is in range 448.
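
The helper introduced in PATCH 4/7 and extended in PATCH 6/7 lazily calibrates a
weight quantizer at export time. Below is a minimal standalone sketch of that
pattern for review; it is not part of any commit, and it assumes that
enable_stats_collection/finish_stats_collection can be imported from
modelopt.torch.quantization.utils (the diffs call them unqualified, so the exact
import path is a guess).

    import torch

    # Assumed import path: quant_utils.py already has these helpers in scope,
    # but their defining module is not shown in the diffs above.
    from modelopt.torch.quantization.utils import (
        enable_stats_collection,
        finish_stats_collection,
    )

    def ensure_calibrated(weight_quantizer, weight: torch.Tensor) -> None:
        """Lazily calibrate a weight quantizer from its own weight tensor."""
        # Same check as _ensure_weight_quantizer_calibrated: act only when
        # amax was never recorded (e.g. an MoE expert no token reached).
        if getattr(weight_quantizer, "_amax", None) is not None:
            return
        weight_quantizer.reset_amax()              # clear stale statistics
        enable_stats_collection(weight_quantizer)  # forward passes record amax
        weight_quantizer(weight)                   # one pass over the weight
        finish_stats_collection(weight_quantizer)  # freeze the collected amax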
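The two 448 comments that recur through the series carry the only arithmetic in
the fix: 448 is the FP8 (e4m3) maximum and 6 is the FP4 (e2m1) maximum, both
taken verbatim from the diffs. A worked sketch of the two per-tensor scale
conventions, with an illustrative amax value:

    import torch

    amax = torch.tensor(12.0)  # illustrative per-tensor weight amax

    # NVFP4 / NVFP4_AWQ / NVFP4_SVDQUANT: double quantization folds both the
    # FP4 range (6) and the FP8 range of the per-block scales (448) into the
    # per-tensor scale.
    wsf2_nvfp4 = amax.float() / (6.0 * 448.0)

    # W4A8_NVFP4_FP8: the kernel dequantizes weights to FP8 (range 448), so
    # only the 448 factor is folded in, leaving the per-block scale in the
    # range [0, 448/6].
    wsf2_w4a8 = amax.float() / 448.0

    # The two conventions differ by exactly the FP4 range.
    assert torch.isclose(wsf2_w4a8, wsf2_nvfp4 * 6.0)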