From 4b1bec587efae004d9e6772baa085e2c14db902d Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 00:38:12 -0800 Subject: [PATCH 1/7] fix a nvfp4 quantization amax attribute error Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/quant_utils.py | 20 ++++++++++++--- .../quantization/qtensor/nvfp4_tensor.py | 25 +++++++++++++++---- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 198dacc5f..7e342d845 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -282,10 +282,15 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8: # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6. # This is because the kernel dequantizes weight to fp8, which is in range 448. - weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: + weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 + else: + # Compute from weight if amax not set + from ..utils import reduce_amax + weight_scaling_factor_2 = reduce_amax(weight).float() / 448.0 else: weight_scaling_factor_2 = NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer( - weight_quantizer + weight_quantizer, weight ) return NVFP4QTensor.get_weights_scaling_factor( weight, @@ -312,11 +317,18 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, ]: - return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer) + weight = getattr(module, weight_name) + return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer, weight) elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8: # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6. # This is because the kernel dequantizes weight to fp8, which is in range 448. - return weight_quantizer._amax.float() / 448.0 + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: + return weight_quantizer._amax.float() / 448.0 + else: + # Compute from weight if amax not set + from ..quantization.utils import reduce_amax + weight = getattr(module, weight_name) + return reduce_amax(weight).float() / 448.0 # SequentialQuantizer is required if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled: diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index 2ff1b17e9..60ec265b9 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -53,11 +53,26 @@ def get_e2m1_bounds(cls, device): return cls.e2m1_bounds_on_device[device] @classmethod - def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer): - """Returns per tensor weight scaling factor from the weight_quantizer amax.""" - # Assert that weight_quantizer has attribute amax - assert hasattr(weight_quantizer, "_amax"), "Weight quantizer does not have attribute amax" - return weight_quantizer._amax.float() / (6.0 * 448.0) + def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer, weight=None): + """Returns per tensor weight scaling factor from the weight_quantizer amax. 
+ + Args: + weight_quantizer: The weight quantizer module + weight: Optional weight tensor to compute amax from if not set on quantizer + """ + # Check if weight_quantizer has amax attribute and it's not None + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: + return weight_quantizer._amax.float() / (6.0 * 448.0) + + # Fallback: compute amax from weight if provided + if weight is not None: + return cls.get_weights_scaling_factor_2(weight) + + # If neither amax nor weight is available, raise an error + raise ValueError( + "Weight quantizer does not have attribute amax and no weight tensor provided. " + "Cannot compute scaling factor." + ) @classmethod def get_weights_scaling_factor( From 928364987a6aea1ea6e76402b5026d2a7e4986dc Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 00:38:41 -0800 Subject: [PATCH 2/7] fix a nvfp4 quantization amax attribute error Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/quant_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 7e342d845..ef0f536ae 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -287,6 +287,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> else: # Compute from weight if amax not set from ..utils import reduce_amax + weight_scaling_factor_2 = reduce_amax(weight).float() / 448.0 else: weight_scaling_factor_2 = NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer( @@ -327,6 +328,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") else: # Compute from weight if amax not set from ..quantization.utils import reduce_amax + weight = getattr(module, weight_name) return reduce_amax(weight).float() / 448.0 From 97af258ab73f72f8cd00b8528dbeba1048366a6a Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 00:42:55 -0800 Subject: [PATCH 3/7] fix a nvfp4 quantization amax attribute error Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/quant_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index ef0f536ae..95addea11 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -286,7 +286,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 else: # Compute from weight if amax not set - from ..utils import reduce_amax + from ..quantization.utils import reduce_amax weight_scaling_factor_2 = reduce_amax(weight).float() / 448.0 else: From 1045532701154ea95ad721d664d083690254e153 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 15 Jan 2026 11:17:06 -0800 Subject: [PATCH 4/7] address reviews Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/quant_utils.py | 48 ++++++++++++------- .../quantization/qtensor/nvfp4_tensor.py | 25 ++-------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 95addea11..6846c8195 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -236,6 +236,25 @@ def get_scaling_factor(quantizer: TensorQuantizer) -> torch.Tensor: return scaling_factor +def _ensure_weight_quantizer_calibrated( + weight_quantizer: TensorQuantizer, weight: torch.Tensor +) -> None: + """Calibrate weight quantizer if amax 
is not set. + + This is a lazy calibration pattern used during export when weight quantizers + may not have been calibrated during the main calibration phase. + + Args: + weight_quantizer: The weight quantizer to calibrate + weight: The weight tensor to use for calibration + """ + if not hasattr(weight_quantizer, "_amax") or weight_quantizer._amax is None: + weight_quantizer.reset_amax() + enable_stats_collection(weight_quantizer) + weight_quantizer(weight) + finish_stats_collection(weight_quantizer) + + def get_activation_scaling_factor( module: nn.Module, input_quantizer_name: str = "input_quantizer" ) -> torch.Tensor: @@ -279,19 +298,16 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> QUANTIZATION_NVFP4_SVDQUANT, QUANTIZATION_W4A8_NVFP4_FP8, ]: + # Calibrate weight quantizer if amax is not set + _ensure_weight_quantizer_calibrated(weight_quantizer, weight) + if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8: # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6. # This is because the kernel dequantizes weight to fp8, which is in range 448. - if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: - weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 - else: - # Compute from weight if amax not set - from ..quantization.utils import reduce_amax - - weight_scaling_factor_2 = reduce_amax(weight).float() / 448.0 + weight_scaling_factor_2 = weight_quantizer._amax.float() / 448.0 else: weight_scaling_factor_2 = NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer( - weight_quantizer, weight + weight_quantizer ) return NVFP4QTensor.get_weights_scaling_factor( weight, @@ -318,19 +334,17 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT, ]: + # Calibrate weight quantizer if amax is not set weight = getattr(module, weight_name) - return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer, weight) + _ensure_weight_quantizer_calibrated(weight_quantizer, weight) + return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer) elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8: # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6. # This is because the kernel dequantizes weight to fp8, which is in range 448. 
- if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: - return weight_quantizer._amax.float() / 448.0 - else: - # Compute from weight if amax not set - from ..quantization.utils import reduce_amax - - weight = getattr(module, weight_name) - return reduce_amax(weight).float() / 448.0 + # Calibrate weight quantizer if amax is not set + weight = getattr(module, weight_name) + _ensure_weight_quantizer_calibrated(weight_quantizer, weight) + return weight_quantizer._amax.float() / 448.0 # SequentialQuantizer is required if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled: diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index 60ec265b9..2ff1b17e9 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -53,26 +53,11 @@ def get_e2m1_bounds(cls, device): return cls.e2m1_bounds_on_device[device] @classmethod - def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer, weight=None): - """Returns per tensor weight scaling factor from the weight_quantizer amax. - - Args: - weight_quantizer: The weight quantizer module - weight: Optional weight tensor to compute amax from if not set on quantizer - """ - # Check if weight_quantizer has amax attribute and it's not None - if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: - return weight_quantizer._amax.float() / (6.0 * 448.0) - - # Fallback: compute amax from weight if provided - if weight is not None: - return cls.get_weights_scaling_factor_2(weight) - - # If neither amax nor weight is available, raise an error - raise ValueError( - "Weight quantizer does not have attribute amax and no weight tensor provided. " - "Cannot compute scaling factor." 
-        )
+    def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer):
+        """Returns per tensor weight scaling factor from the weight_quantizer amax."""
+        # Assert that weight_quantizer has attribute amax
+        assert hasattr(weight_quantizer, "_amax"), "Weight quantizer does not have attribute amax"
+        return weight_quantizer._amax.float() / (6.0 * 448.0)
 
     @classmethod
     def get_weights_scaling_factor(

From 4d54f55f8406781df4f24ef9004e9bb6b791418b Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Wed, 21 Jan 2026 18:25:01 -0800
Subject: [PATCH 5/7] address reviews

Signed-off-by: Zhiyu Cheng
---
 modelopt/torch/export/quant_utils.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 6846c8195..43e197a5f 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -329,21 +329,23 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
     if weight_quantizer is None:
         return None
 
-    if get_quantization_format(module) in [
+    quantization_format = get_quantization_format(module)
+
+    # For all NVFP4 variants, calibrate the weight quantizer if amax is not set
+    if quantization_format in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
+        QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
-        # Calibrate weight quantizer if amax is not set
         weight = getattr(module, weight_name)
         _ensure_weight_quantizer_calibrated(weight_quantizer, weight)
+
+    if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
-    elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8:
+    elif quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
         # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6.
         # This is because the kernel dequantizes weight to fp8, which is in range 448.
-        # Calibrate weight quantizer if amax is not set
-        weight = getattr(module, weight_name)
-        _ensure_weight_quantizer_calibrated(weight_quantizer, weight)
         return weight_quantizer._amax.float() / 448.0
 
 # SequentialQuantizer is required

From eee6d2fcb3881132ff1be2f8a2e6a7207aba4801 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Thu, 22 Jan 2026 17:47:06 -0800
Subject: [PATCH 6/7] address reviews

Signed-off-by: Zhiyu Cheng
---
 modelopt/torch/export/quant_utils.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 43e197a5f..793e84e28 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -237,7 +237,7 @@ def get_scaling_factor(quantizer: TensorQuantizer) -> torch.Tensor:
 
 
 def _ensure_weight_quantizer_calibrated(
-    weight_quantizer: TensorQuantizer, weight: torch.Tensor
+    weight_quantizer: TensorQuantizer, weight: torch.Tensor, module_name: str = ""
 ) -> None:
     """Calibrate weight quantizer if amax is not set.
 
@@ -247,8 +247,14 @@ def _ensure_weight_quantizer_calibrated(
     Args:
         weight_quantizer: The weight quantizer to calibrate
         weight: The weight tensor to use for calibration
+        module_name: Optional module name for better warning messages
     """
     if not hasattr(weight_quantizer, "_amax") or weight_quantizer._amax is None:
+        warn(
+            f"Weight quantizer{f' for {module_name}' if module_name else ''} was not calibrated. "
+            f"Computing amax from weights. 
This may occur if "
+            f"some experts were not activated during calibration (expected for MoE models); try increasing --calib_size"
+        )
         weight_quantizer.reset_amax()
         enable_stats_collection(weight_quantizer)
         weight_quantizer(weight)
@@ -299,7 +305,8 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         # Calibrate weight quantizer if amax is not set
-        _ensure_weight_quantizer_calibrated(weight_quantizer, weight)
+        module_name = f"{type(module).__name__}.{weight_name}"
+        _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
 
         if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
             # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6.
@@ -339,7 +346,8 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         weight = getattr(module, weight_name)
-        _ensure_weight_quantizer_calibrated(weight_quantizer, weight)
+        module_name = f"{type(module).__name__}.{weight_name}"
+        _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
 
     if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)

From 8c0eb8fa7a3970afd183e48bbd02bec7fd052dd6 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Thu, 22 Jan 2026 22:38:33 -0800
Subject: [PATCH 7/7] fix ci

Signed-off-by: Zhiyu Cheng
---
 modelopt/torch/export/quant_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 793e84e28..5417b7fa2 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -349,7 +349,11 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
         module_name = f"{type(module).__name__}.{weight_name}"
         _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
 
-    if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
+    if quantization_format in [
+        QUANTIZATION_NVFP4,
+        QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_SVDQUANT,
+    ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
     elif quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
         # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6.
         # This is because the kernel dequantizes weight to fp8, which is in range 448.
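
The helper introduced in PATCH 4/7 and extended in PATCH 6/7 lazily calibrates a
weight quantizer at export time. Below is a minimal standalone sketch of that
pattern for review; it is not part of any commit, and it assumes that
enable_stats_collection/finish_stats_collection can be imported from
modelopt.torch.quantization.utils (the diffs call them unqualified, so the exact
import path is a guess).

    import torch

    # Assumed import path: quant_utils.py already has these helpers in scope,
    # but their defining module is not shown in the diffs above.
    from modelopt.torch.quantization.utils import (
        enable_stats_collection,
        finish_stats_collection,
    )

    def ensure_calibrated(weight_quantizer, weight: torch.Tensor) -> None:
        """Lazily calibrate a weight quantizer from its own weight tensor."""
        # Same check as _ensure_weight_quantizer_calibrated: act only when
        # amax was never recorded (e.g. an MoE expert no token reached).
        if getattr(weight_quantizer, "_amax", None) is not None:
            return
        weight_quantizer.reset_amax()              # clear stale statistics
        enable_stats_collection(weight_quantizer)  # forward passes record amax
        weight_quantizer(weight)                   # one pass over the weight
        finish_stats_collection(weight_quantizer)  # freeze the collected amax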
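The two 448 comments that recur through the series carry the only arithmetic in
the fix: 448 is the FP8 (e4m3) maximum and 6 is the FP4 (e2m1) maximum, both
taken verbatim from the diffs. A worked sketch of the two per-tensor scale
conventions, with an illustrative amax value:

    import torch

    amax = torch.tensor(12.0)  # illustrative per-tensor weight amax

    # NVFP4 / NVFP4_AWQ / NVFP4_SVDQUANT: double quantization folds both the
    # FP4 range (6) and the FP8 range of the per-block scales (448) into the
    # per-tensor scale.
    wsf2_nvfp4 = amax.float() / (6.0 * 448.0)

    # W4A8_NVFP4_FP8: the kernel dequantizes weights to FP8 (range 448), so
    # only the 448 factor is folded in, leaving the per-block scale in the
    # range [0, 448/6].
    wsf2_w4a8 = amax.float() / 448.0

    # The two conventions differ by exactly the FP4 range.
    assert torch.isclose(wsf2_w4a8, wsf2_nvfp4 * 6.0)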