From 3bc8bc835cfc47e2c5283910f39819a689b67ecc Mon Sep 17 00:00:00 2001
From: Shiyang Chen
Date: Tue, 28 Oct 2025 15:16:12 -0700
Subject: [PATCH 1/5] support nvfp4_svdquant hf export

Signed-off-by: Shiyang Chen
---
 examples/llm_ptq/hf_ptq.py                  |  1 +
 .../llm_ptq/scripts/huggingface_example.sh  |  4 +--
 modelopt/torch/export/model_config.py       | 13 ++++++--
 modelopt/torch/export/postprocess.py        |  7 ++++-
 modelopt/torch/export/quant_utils.py        | 30 +++++++++++++++----
 modelopt/torch/export/unified_export_hf.py  |  8 ++++-
 6 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index ddf6bfbdc..5da22bf6d 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -83,6 +83,7 @@
     "w4a8_nvfp4_fp8": mtq.W4A8_NVFP4_FP8_CFG,
     "w4a8_mxfp4_fp8": mtq.W4A8_MXFP4_FP8_CFG,
     "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG,
+    "nvfp4_svdquant": mtq.NVFP4_SVDQUANT_DEFAULT_CFG,
 }

 KV_QUANT_CFG_CHOICES = {
diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh
index 3ea85de9e..f8b05189b 100755
--- a/examples/llm_ptq/scripts/huggingface_example.sh
+++ b/examples/llm_ptq/scripts/huggingface_example.sh
@@ -53,9 +53,9 @@ esac
 IFS=","
 for qformat in $QFORMAT; do
     case $qformat in
-    fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_mlp_only) ;;
+    fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_mlp_only | nvfp4_svdquant) ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_mlp_only]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_mlp_only, nvfp4_svdquant]" >&2
         exit 1
         ;;
     esac
diff --git a/modelopt/torch/export/model_config.py b/modelopt/torch/export/model_config.py
index 306348f2c..73ed85e3a 100755
--- a/modelopt/torch/export/model_config.py
+++ b/modelopt/torch/export/model_config.py
@@ -33,6 +33,7 @@
 QUANTIZATION_INT4_AWQ = "int4_awq"
 QUANTIZATION_W4A8_AWQ = "w4a8_awq"
 QUANTIZATION_NVFP4 = "nvfp4"
+QUANTIZATION_NVFP4_SVDQUANT = "nvfp4_svdquant"
 QUANTIZATION_W4A8_NVFP4_FP8 = "w4a8_nvfp4_fp8"
 QUANTIZATION_MXFP4 = "mxfp4"
 QUANTIZATION_W4A8_MXFP4_FP8 = "w4a8_mxfp4_fp8"
@@ -507,12 +508,20 @@ def hidden_size(self):
         """Returns the hidden size of the transformer model."""
         if isinstance(self.mlp, MOEConfig):
             # fc.weight for MOE is stacked
-            if self.mlp.fc.quantization in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
+            if self.mlp.fc.quantization in [
+                QUANTIZATION_NVFP4,
+                QUANTIZATION_NVFP4_AWQ,
+                QUANTIZATION_NVFP4_SVDQUANT,
+            ]:
                 return self.mlp.fc.weight.shape[-1] * 2
             return self.mlp.fc.weight.shape[-1]
         else:
             k = self.mlp.fc.weight.shape[1]
-            if self.mlp.fc.quantization in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
+            if self.mlp.fc.quantization in [
+                QUANTIZATION_NVFP4,
+                QUANTIZATION_NVFP4_AWQ,
+                QUANTIZATION_NVFP4_SVDQUANT,
+            ]:
                 return k * 2
             return k

diff --git a/modelopt/torch/export/postprocess.py b/modelopt/torch/export/postprocess.py
index 5c3d0fcf3..376a52a41 100644
--- a/modelopt/torch/export/postprocess.py
+++ b/modelopt/torch/export/postprocess.py
@@ -35,6 +35,7 @@
     LINEAR_ROW,
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
+    QUANTIZATION_NVFP4_SVDQUANT,
     ConvConfig,
     EmbeddingConfig,
     ExpertConfig,
@@ -398,7 +399,10 @@ def _merge_model_configs_to_first_tp(config, ranks: list[int], group=None):
                     group_size=config.awq_block_size,
                     quantization=config.quantization,
                 )
-                if config.quantization == QUANTIZATION_NVFP4_AWQ:
+                if config.quantization in [
+                    QUANTIZATION_NVFP4_AWQ,
+                    QUANTIZATION_NVFP4_SVDQUANT,
+                ]:
                     # We have to update weight_scaling_factor and weight_scaling_factor_2
                     config.weights_scaling_factor, config.weights_scaling_factor_2 = (
                         NVFP4QTensor.get_weights_scaling_factor(
@@ -430,6 +434,7 @@ def _merge_model_configs_to_first_tp(config, ranks: list[int], group=None):
             if config.quantization in [
                 QUANTIZATION_NVFP4,
                 QUANTIZATION_NVFP4_AWQ,
+                QUANTIZATION_NVFP4_SVDQUANT,
             ]:
                 (
                     config.weights_scaling_factor,
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 1a64d028f..7584a6259 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -57,6 +57,7 @@
     QUANTIZATION_NONE,
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
+    QUANTIZATION_NVFP4_SVDQUANT,
     QUANTIZATION_W4A8_AWQ,
     QUANTIZATION_W4A8_MXFP4_FP8,
     QUANTIZATION_W4A8_NVFP4_FP8,
@@ -165,7 +166,7 @@ def resmooth_and_get_scale(
         )
         new_weights.append(weight)
         # If NVFP4_AWQ then we view the scales as uint8 to allow for cat later
-        if quantization == QUANTIZATION_NVFP4_AWQ:
+        if quantization in [QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT]:
            scale, _ = NVFP4QTensor.get_weights_scaling_factor(weight, group_size).view(torch.uint8)
         else:
             scale = get_scaling_factor_from_weight(weight, group_size)
@@ -176,7 +177,7 @@
     return (
         torch.cat(new_weights, dim=0),
         resmoothed_scales.view(torch.float8_e4m3fn)
-        if quantization == QUANTIZATION_NVFP4_AWQ
+        if quantization in [QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT]
         else resmoothed_scales,  # if NVFP4_AWQ we view the scales back as float8_e4m3fn after cat
         new_pre_quant_scale,
     )
@@ -243,6 +244,7 @@ def get_activation_scaling_factor(
     if get_quantization_format(module) in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_SVDQUANT,
     ]:
         return NVFP4QTensor.get_activation_scaling_factor(input_quantizer)
     return get_scaling_factor(input_quantizer)
@@ -270,6 +272,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
     if quantization_format in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_SVDQUANT,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
@@ -303,6 +306,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
     if get_quantization_format(module) in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_SVDQUANT,
     ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
     elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8:
@@ -487,6 +491,12 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
         block_sizes = getattr(weight_quantizer, "block_sizes")
         scale_bits = block_sizes.get("scale_bits")

+    if (
+        input_quantizer is not None
+        and hasattr(input_quantizer, "_pre_quant_scale")
+        and hasattr(weight_quantizer, "svdquant_lora_a")
+    ):
+        return QUANTIZATION_NVFP4_SVDQUANT
     if input_quantizer is not None and hasattr(input_quantizer, "_pre_quant_scale"):
         return QUANTIZATION_NVFP4_AWQ
     if getattr(layer, "fused_with_prequant", False):
@@ -660,15 +670,18 @@ def process_layer_quant_config(layer_config_dict):
         elif v == "w4a8_nvfp4_fp8":
"w4a8_nvfp4_fp8": layer_config = { "quant_algo": "W4A8_NVFP4_FP8", - "group_size": layer_config_dict[prefix + ".awq_block_size"], - "has_zero_point": False, - "pre_quant_scale": True, + "group_size": block_size_value, } elif v == "w4a8_mxfp4_fp8": layer_config = { "quant_algo": "W4A8_MXFP4_FP8", "group_size": block_size_value, } + elif v == "nvfp4_svdquant": + layer_config = { + "quant_algo": "NVFP4_SVD", + "group_size": block_size_value, + } else: layer_config = {"quant_algo": v} @@ -813,7 +826,12 @@ def to_quantized_weight( if quantization in [QUANTIZATION_INT4_AWQ, QUANTIZATION_W4A8_AWQ]: return pack_int4_in_uint8(weight, weights_scaling_factor) - if quantization in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_W4A8_NVFP4_FP8]: + if quantization in [ + QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_W4A8_NVFP4_FP8, + QUANTIZATION_NVFP4_SVDQUANT, + ]: assert block_size is not None, "Block size not passed. Unable to quantize to NVFP4 format." assert weights_scaling_factor2 is not None, ( "Weights scaling factor 2 not passed. Unable to quantize to NVFP4 format" diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 37b7f3b64..2a7e07dba 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -62,6 +62,7 @@ QUANTIZATION_NONE, QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_NVFP4_SVDQUANT, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ) @@ -414,6 +415,7 @@ def _export_quantized_weight( if quantization_format in [ QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_NVFP4_SVDQUANT, QUANTIZATION_NVFP4, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, @@ -434,7 +436,11 @@ def _export_quantized_weight( for expert_type in ["Llama4TextExperts", "GptOssExperts"] ) - if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + if quantization_format in [ + QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_NVFP4_SVDQUANT, + ]: # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim) # for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization weight, _ = maybe_transpose_expert_weight_dimensions( From 8a2cab285cf954ceec6a9905ac7add51eb128128 Mon Sep 17 00:00:00 2001 From: Shiyang Chen Date: Tue, 6 Jan 2026 14:23:58 -0800 Subject: [PATCH 2/5] handle q/k/v and gate/up merging for svdquant Signed-off-by: Shiyang Chen --- modelopt/torch/export/quant_utils.py | 55 ++++++++++++++++++---- modelopt/torch/export/unified_export_hf.py | 7 ++- modelopt/torch/quantization/model_calib.py | 29 +++++++----- 3 files changed, 68 insertions(+), 23 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 7584a6259..3beb100a6 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -25,7 +25,11 @@ import torch.nn as nn from modelopt import __version__ -from modelopt.torch.quantization.model_calib import enable_stats_collection, finish_stats_collection +from modelopt.torch.quantization.model_calib import ( + enable_stats_collection, + finish_stats_collection, + svd, +) from modelopt.torch.quantization.nn.modules.quant_linear import RealQuantLinear from modelopt.torch.quantization.qtensor import ( FP8QTensor, @@ -491,11 +495,7 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames block_sizes = getattr(weight_quantizer, "block_sizes") scale_bits = 
block_sizes.get("scale_bits") - if ( - input_quantizer is not None - and hasattr(input_quantizer, "_pre_quant_scale") - and hasattr(weight_quantizer, "svdquant_lora_a") - ): + if input_quantizer is not None and hasattr(weight_quantizer, "svdquant_lora_a"): return QUANTIZATION_NVFP4_SVDQUANT if input_quantizer is not None and hasattr(input_quantizer, "_pre_quant_scale"): return QUANTIZATION_NVFP4_AWQ @@ -1032,6 +1032,40 @@ def _update_pre_quant_scale(module, new_pre_quant_scale): finish_stats_collection(module.weight_quantizer) +def _update_svdquant(modules, new_pre_quant_scale): + """Updates the pre_quant_scale, svdquant_lora_a and svdquant_lora_b matrices when pre_quant_scale is changed.""" + new_pre_quant_scale = new_pre_quant_scale.to(torch.float32) + lora_a = [m.weight_quantizer.svdquant_lora_a.to(torch.float32) for m in modules] + lora_b = [m.weight_quantizer.svdquant_lora_b.to(torch.float32) for m in modules] + weight = [m.weight.to(torch.float32) for m in modules] + old_pre_quant_scale = [m.input_quantizer._pre_quant_scale.to(torch.float32) for m in modules] + weight = [ + (w + (lb @ la)) * (s / new_pre_quant_scale) + for w, la, lb, s in zip(weight, lora_a, lora_b, old_pre_quant_scale) + ] + weight_concatenated = torch.cat(weight, dim=0) + lb, la = svd(weight_concatenated, rank=lora_a[0].shape[0]) + weight_concatenated -= lb @ la + weight_concatenated = weight_concatenated.to(modules[0].weight.dtype) + la = la.to(modules[0].weight_quantizer.svdquant_lora_a.dtype) + lb = lb.to(modules[0].weight_quantizer.svdquant_lora_b.dtype) + new_pre_quant_scale = new_pre_quant_scale.to(modules[0].input_quantizer.pre_quant_scale.dtype) + + index = 0 + for i, module in enumerate(modules): + module.input_quantizer.pre_quant_scale = new_pre_quant_scale + module.weight_quantizer.svdquant_lora_a = la + assert lora_b[i].shape[0] == module.weight.shape[0] + module.weight_quantizer.svdquant_lora_b = lb[index : index + lora_b[i].shape[0], :] + module.weight = nn.Parameter(weight_concatenated[index : index + lora_b[i].shape[0], :]) + index += lora_b[i].shape[0] + # Redo weights collection + module.weight_quantizer.reset_amax() + enable_stats_collection(module.weight_quantizer) + module.weight_quantizer(module.weight) + finish_stats_collection(module.weight_quantizer) + + # Format: (list of target modules, tuple of (linear_to_fuse_into, linear_from_with_scale)) PQS_FUSE_MODULE_MAPPING = [ # Attention: Fuse o_proj's pre_quant_scale into v_proj's output dimension @@ -1184,9 +1218,12 @@ def preprocess_linear_fusion(modules: list[torch.nn.Module], resmooth_only=False dim=0, ) - for module in modules: - if not torch.equal(module.input_quantizer.pre_quant_scale, avg_prequant_scale): - _update_pre_quant_scale(module, avg_prequant_scale) + if hasattr(modules[0].weight_quantizer, "svdquant_lora_a"): + _update_svdquant(modules, avg_prequant_scale) + else: + for module in modules: + if not torch.equal(module.input_quantizer.pre_quant_scale, avg_prequant_scale): + _update_pre_quant_scale(module, avg_prequant_scale) if resmooth_only: return diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 2a7e07dba..189c8f5f9 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -234,6 +234,10 @@ def requantize_resmooth_fused_llm_layers(model: torch.nn.Module): model_type = type(model).__name__.lower() module_names = set() + # NVFP4 SVDQuant does not need pre-quant scale fusion (either into previous linear or layernorm) because 
+    # 1) its kernel handles pre-quant scale.
+    # 2) fusing into the previous linear would need to change the lora_up in up_proj, which may cause issues in
+    #    the later gate/up fusion.
     # Fuse pre_quant_scale to the linear weights if possible
     if quantization_format is not None and "nvfp4_awq" in quantization_format.lower():
         fuse_prequant_to_linear(model)
@@ -244,7 +248,8 @@ def requantize_resmooth_fused_llm_layers(model: torch.nn.Module):

         # For MoE models update pre_quant_scale to average pre_quant_scale amongst experts
         if is_moe(module) and (
-            quantization_format is not QUANTIZATION_NONE and "awq" in quantization_format
+            quantization_format is not QUANTIZATION_NONE
+            and ("awq" in quantization_format or quantization_format == QUANTIZATION_NVFP4_SVDQUANT)
         ):
             # update_experts_avg_prequant_scale(module)
             grouped_experts = get_experts_list(module, model_type)
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
index b8461a080..0cd9b63f8 100644
--- a/modelopt/torch/quantization/model_calib.py
+++ b/modelopt/torch/quantization/model_calib.py
@@ -1075,6 +1075,18 @@ def _get_awq_quantizer_block_size(tensor: torch.Tensor, quantizer: TensorQuantiz
     return blocksize


+def svd(weight, rank):
+    original_device = weight.device
+    original_dtype = weight.dtype
+    weight_f64 = weight.to(dtype=torch.float64, device=original_device)
+    u, s, vt = torch.linalg.svd(weight_f64, full_matrices=False)
+    us = u[:, :rank] * s[:rank]
+    vt = vt[:rank]
+    return us.to(device=original_device, dtype=original_dtype), vt.to(
+        device=original_device, dtype=original_dtype
+    )
+
+
 @torch.no_grad()
 def svdquant(
     model: nn.Module,
@@ -1096,25 +1108,16 @@ def svdquant(
     def postprocess(module, name):
         print_rank_0(f"SVD {name}")
         weight = module.weight.data
-        original_device = weight.device
-        original_dtype = weight.dtype
-        weight_f64 = weight.to(dtype=torch.float64, device=original_device)
-        u, s, vt = torch.linalg.svd(weight_f64, full_matrices=False)
-        if u.shape[1] < lowrank or vt.shape[0] < lowrank:
+        us, vt = svd(weight, lowrank)
+        if us.shape[1] < lowrank or vt.shape[0] < lowrank:
             warnings.warn(
                 "The low-rank dimensions do not match the layer dimensions. "
                 "Please verify your configuration and model settings. "
                 f"SVD will be skipped for this layer {name}."
             )
             return
-        us = u[:, :lowrank] * s[:lowrank]
-        vt = vt[:lowrank]
-        module.weight_quantizer.svdquant_lora_a = vt.to(
-            dtype=original_dtype, device=original_device
-        )
-        module.weight_quantizer.svdquant_lora_b = us.to(
-            dtype=original_dtype, device=original_device
-        )
+        module.weight_quantizer.svdquant_lora_a = vt
+        module.weight_quantizer.svdquant_lora_b = us
         module.weight.data.sub_(
             module.weight_quantizer.svdquant_lora_b @ module.weight_quantizer.svdquant_lora_a
         )

From b7dabf54ba0d7caae46250957311bf684589a232 Mon Sep 17 00:00:00 2001
From: Shiyang Chen
Date: Fri, 16 Jan 2026 14:41:35 -0800
Subject: [PATCH 3/5] update based on review

Signed-off-by: Shiyang Chen
---
 examples/llm_ptq/hf_ptq.py           | 4 ++++
 modelopt/torch/export/quant_utils.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 5da22bf6d..051ee95b7 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -507,6 +507,10 @@ def export_quantized(
         or args.sparsity_fmt != "dense"
         or "int8_sq" in args.qformat
     ):
+        if (
+            args.inference_tensor_parallel != 1 or args.inference_pipeline_parallel != 1
+        ) and args.qformat == "nvfp4_svdquant":
+            raise NotImplementedError("Svdquant does not support mulitple GPUs yet.")
         warnings.warn(
             "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime."
         )
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 3beb100a6..f76539048 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -1218,7 +1218,7 @@ def preprocess_linear_fusion(modules: list[torch.nn.Module], resmooth_only=False
             dim=0,
         )

-        if hasattr(modules[0].weight_quantizer, "svdquant_lora_a"):
+        if getattr(modules[0].weight_quantizer, "svdquant_lora_a", None) is not None:
             _update_svdquant(modules, avg_prequant_scale)
         else:
             for module in modules:

From 55ca49f4abcf8635e660529da18ce90e4dd83b9a Mon Sep 17 00:00:00 2001
From: Shiyang Chen
Date: Tue, 20 Jan 2026 13:12:06 -0800
Subject: [PATCH 4/5] add unittest and fix a bug in existing unittests

Signed-off-by: Shiyang Chen
---
 modelopt/torch/export/quant_utils.py           |  4 +-
 modelopt/torch/quantization/model_calib.py     | 25 +++++----
 ...unified_hf_export_and_check_safetensors.py  | 55 ++++++++++---------
 3 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index f76539048..81ea8561c 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -1218,7 +1218,9 @@ def preprocess_linear_fusion(modules: list[torch.nn.Module], resmooth_only=False
             dim=0,
         )

-        if getattr(modules[0].weight_quantizer, "svdquant_lora_a", None) is not None:
+        if all(
+            getattr(m.weight_quantizer, "svdquant_lora_a", None) is not None for m in modules
+        ):
             _update_svdquant(modules, avg_prequant_scale)
         else:
             for module in modules:
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
index 0cd9b63f8..d27fe7214 100644
--- a/modelopt/torch/quantization/model_calib.py
+++ b/modelopt/torch/quantization/model_calib.py
@@ -1082,9 +1082,21 @@ def svd(weight, rank):
     u, s, vt = torch.linalg.svd(weight_f64, full_matrices=False)
     us = u[:, :rank] * s[:rank]
     vt = vt[:rank]
-    return us.to(device=original_device, dtype=original_dtype), vt.to(
-        device=original_device, dtype=original_dtype
-    )
+    us = us.to(device=original_device, dtype=original_dtype)
+    vt = vt.to(device=original_device, dtype=original_dtype)
+    if us.shape[1] < rank or vt.shape[0] < rank:
+        warnings.warn(
+            "The low-rank dimensions do not match the layer dimensions. "
+            "Please verify your configuration and model settings. "
+            f"Rank is {us.shape[1]} and {vt.shape[0]}"
+        )
+        us_temp = torch.zeros((us.shape[0], rank), dtype=us.dtype, device=us.device)
+        vt_temp = torch.zeros((rank, vt.shape[1]), dtype=vt.dtype, device=vt.device)
+        us_temp[:, : us.shape[1]] = us
+        vt_temp[: vt.shape[0], :] = vt
+        us = us_temp
+        vt = vt_temp
+    return us, vt


 @torch.no_grad()
@@ -1109,13 +1121,6 @@ def postprocess(module, name):
         print_rank_0(f"SVD {name}")
         weight = module.weight.data
         us, vt = svd(weight, lowrank)
-        if us.shape[1] < lowrank or vt.shape[0] < lowrank:
-            warnings.warn(
-                "The low-rank dimensions do not match the layer dimensions. "
-                "Please verify your configuration and model settings. "
-                f"SVD will be skipped for this layer {name}."
-            )
-            return
         module.weight_quantizer.svdquant_lora_a = vt
         module.weight_quantizer.svdquant_lora_b = us
         module.weight.data.sub_(
             module.weight_quantizer.svdquant_lora_b @ module.weight_quantizer.svdquant_lora_a
         )
diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
index a6f360872..23a1439e9 100644
--- a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
+++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
@@ -29,15 +29,17 @@
         "fuse_input_scale",
         "fuse_weight_scale",
         "fuse_weight_scale_2",
-        "fuse_prequant_scale",
+        "fuse_pre_quant_scale",
+        "fuse_svdquant_lora_a",
     ),
     [
-        ("fp8", "tiny_llama-fp8", True, False, True, True),
-        ("nvfp4", "tiny_llama-nvfp4", True, False, True, True),
-        ("nvfp4_awq", "tiny_llama-nvfp4-awq", True, False, True, True),
-        ("int4_awq", "tiny_llama-int4-awq", True, False, True, True),
-        ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, True),
-        ("int8_wo", "tiny_llama-int8-wo", False, False, False, False),
+        ("fp8", "tiny_llama-fp8", True, False, True, True, False),
+        ("nvfp4", "tiny_llama-nvfp4", True, False, True, True, False),
+        ("nvfp4_awq", "tiny_llama-nvfp4-awq", True, False, True, True, False),
+        ("int4_awq", "tiny_llama-int4-awq", True, False, True, True, False),
+        ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, True, False),
+        ("int8_wo", "tiny_llama-int8-wo", False, False, False, False, False),
+        ("nvfp4_svdquant", "tiny_llama-nvfp4-svdquant", True, False, True, True, True),
     ],
 )
 def test_unified_hf_export_and_check_safetensors(
@@ -47,7 +49,8 @@ def test_unified_hf_export_and_check_safetensors(
     fuse_input_scale,
     fuse_weight_scale,
     fuse_weight_scale_2,
-    fuse_prequant_scale,
+    fuse_pre_quant_scale,
+    fuse_svdquant_lora_a,
 ):
     """
     1) Generates a .safetensors file by running hf_ptq.py with each --qformat.
@@ -92,6 +95,18 @@
         f"Expected .safetensors file not found for qformat={qformat}: {generated_file}"
     )

+    # Map scale types to their conditions
+    scale_types = [
+        ("input_scale", fuse_input_scale),
+        ("weight_scale", fuse_weight_scale),
+        ("weight_scale_2", fuse_weight_scale_2),
+        ("pre_quant_scale", fuse_pre_quant_scale),
+        ("weight_quantizer._svdquant_lora_a", fuse_svdquant_lora_a),
+    ]
+
+    # Projection pairs to check for equality
+    proj_pairs = [("gate_proj", "up_proj"), ("q_proj", "k_proj"), ("q_proj", "v_proj")]
+
     def _same_scale(name, key1, key2, f):
         if key1 in name:
             tensor1 = f.get_tensor(name)
@@ -108,23 +123,11 @@ def _same_scale(name, key1, key2, f):
             assert tensor.shape is not None, f"Tensor '{name}' shape is None!"
             assert tensor.dtype is not None, f"Tensor '{name}' dtype is None!"

-            if "scale" in name:
-                # Map scale types to their conditions
-                scale_types = [
-                    ("input_scale", fuse_input_scale),
-                    ("weight_scale", fuse_weight_scale),
-                    ("weight_scale_2", fuse_weight_scale_2),
-                    ("prequant_scale", fuse_prequant_scale),
-                ]
-
-                # Projection pairs to check for equality
-                proj_pairs = [("gate_proj", "up_proj"), ("q_proj", "k_proj"), ("q_proj", "v_proj")]
-
-                # Check each scale type if its condition is met
-                for scale_suffix, condition in scale_types:
-                    if name.endswith(scale_suffix) and condition:
-                        # Check each projection pair
-                        for proj1, proj2 in proj_pairs:
-                            _same_scale(name, proj1, proj2, f)
+            # Check each scale type if its condition is met
+            for scale_suffix, condition in scale_types:
+                if name.endswith(scale_suffix) and condition:
+                    # Check each projection pair
+                    for proj1, proj2 in proj_pairs:
+                        _same_scale(name, proj1, proj2, f)

     # TODO: Load a pre-dumped log to compare textually or use pre-defined dict for sanity checks

From 9ed776847ff07af55a6093d9154cd8e337f2e984 Mon Sep 17 00:00:00 2001
From: Shiyang Chen
Date: Wed, 21 Jan 2026 13:03:18 -0800
Subject: [PATCH 5/5] fix typo

Signed-off-by: Shiyang Chen
---
 examples/llm_ptq/hf_ptq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 051ee95b7..e970f7f7c 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -510,7 +510,7 @@ def export_quantized(
         if (
             args.inference_tensor_parallel != 1 or args.inference_pipeline_parallel != 1
         ) and args.qformat == "nvfp4_svdquant":
-            raise NotImplementedError("Svdquant does not support mulitple GPUs yet.")
+            raise NotImplementedError("Svdquant does not support multiple GPUs yet.")
         warnings.warn(
             "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime."
         )
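
Note: the short sketch below is illustrative only and is not part of the patch series above. It restates, with plain PyTorch tensors, the weight decomposition that the new svd() helper and the svdquant() postprocess perform: the weight is split into lora_b @ lora_a plus a residual, the factors are stored as svdquant_lora_b / svdquant_lora_a on the weight quantizer, and only the residual remains in module.weight for NVFP4 quantization. The helper name svd_lowrank_split and the toy shapes are hypothetical, not taken from the patches.

import torch

def svd_lowrank_split(weight: torch.Tensor, rank: int):
    # Factor W in float64 for numerical stability, as the svd() helper does,
    # and return (lora_b, lora_a) with shapes (out_features, rank) and (rank, in_features).
    w64 = weight.to(torch.float64)
    u, s, vt = torch.linalg.svd(w64, full_matrices=False)
    lora_b = (u[:, :rank] * s[:rank]).to(weight.dtype)
    lora_a = vt[:rank].to(weight.dtype)
    return lora_b, lora_a

# Toy usage: the residual is what module.weight holds after the postprocess
# subtracts lora_b @ lora_a; only the residual is later quantized to NVFP4.
w = torch.randn(128, 64)
lora_b, lora_a = svd_lowrank_split(w, rank=16)
residual = w - lora_b @ lora_a
assert torch.allclose(lora_b @ lora_a + residual, w, atol=1e-5)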