From 3bc8bc835cfc47e2c5283910f39819a689b67ecc Mon Sep 17 00:00:00 2001
From: Shiyang Chen
Date: Tue, 28 Oct 2025 15:16:12 -0700
Subject: [PATCH 1/5] support nvfp4_svdquant hf export

Signed-off-by: Shiyang Chen
---
 examples/llm_ptq/hf_ptq.py                  |  1 +
 .../llm_ptq/scripts/huggingface_example.sh  |  4 +--
 modelopt/torch/export/model_config.py       | 13 ++++++--
 modelopt/torch/export/postprocess.py        |  7 ++++-
 modelopt/torch/export/quant_utils.py        | 30 +++++++++++++++----
 modelopt/torch/export/unified_export_hf.py  |  8 ++++-
 6 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index ddf6bfbdc..5da22bf6d 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -83,6 +83,7 @@
     "w4a8_nvfp4_fp8": mtq.W4A8_NVFP4_FP8_CFG,
     "w4a8_mxfp4_fp8": mtq.W4A8_MXFP4_FP8_CFG,
     "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG,
+    "nvfp4_svdquant": mtq.NVFP4_SVDQUANT_DEFAULT_CFG,
 }

 KV_QUANT_CFG_CHOICES = {
diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh
index 3ea85de9e..f8b05189b 100755
--- a/examples/llm_ptq/scripts/huggingface_example.sh
+++ b/examples/llm_ptq/scripts/huggingface_example.sh
@@ -53,9 +53,9 @@ esac
 IFS=","
 for qformat in $QFORMAT; do
     case $qformat in
-    fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_mlp_only) ;;
+    fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_mlp_only | nvfp4_svdquant) ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_mlp_only]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_mlp_only, nvfp4_svdquant]" >&2
         exit 1
         ;;
     esac
diff --git a/modelopt/torch/export/model_config.py b/modelopt/torch/export/model_config.py
index 306348f2c..73ed85e3a 100755
--- a/modelopt/torch/export/model_config.py
+++ b/modelopt/torch/export/model_config.py
@@ -33,6 +33,7 @@
 QUANTIZATION_INT4_AWQ = "int4_awq"
 QUANTIZATION_W4A8_AWQ = "w4a8_awq"
 QUANTIZATION_NVFP4 = "nvfp4"
+QUANTIZATION_NVFP4_SVDQUANT = "nvfp4_svdquant"
 QUANTIZATION_W4A8_NVFP4_FP8 = "w4a8_nvfp4_fp8"
 QUANTIZATION_MXFP4 = "mxfp4"
 QUANTIZATION_W4A8_MXFP4_FP8 = "w4a8_mxfp4_fp8"
@@ -507,12 +508,20 @@ def hidden_size(self):
         """Returns the hidden size of the transformer model."""
         if isinstance(self.mlp, MOEConfig):
             # fc.weight for MOE is stacked
-            if self.mlp.fc.quantization in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
+            if self.mlp.fc.quantization in [
+                QUANTIZATION_NVFP4,
+                QUANTIZATION_NVFP4_AWQ,
+                QUANTIZATION_NVFP4_SVDQUANT,
+            ]:
                 return self.mlp.fc.weight.shape[-1] * 2
             return self.mlp.fc.weight.shape[-1]
         else:
             k = self.mlp.fc.weight.shape[1]
-            if self.mlp.fc.quantization in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]:
+            if self.mlp.fc.quantization in [
+                QUANTIZATION_NVFP4,
+                QUANTIZATION_NVFP4_AWQ,
+                QUANTIZATION_NVFP4_SVDQUANT,
+            ]:
                 return k * 2
             return k

diff --git a/modelopt/torch/export/postprocess.py b/modelopt/torch/export/postprocess.py
index 5c3d0fcf3..376a52a41 100644
--- a/modelopt/torch/export/postprocess.py
+++ b/modelopt/torch/export/postprocess.py
@@ -35,6 +35,7 @@
     LINEAR_ROW,
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
+    QUANTIZATION_NVFP4_SVDQUANT,
     ConvConfig,
     EmbeddingConfig,
     ExpertConfig,
@@ -398,7 +399,10 @@ def _merge_model_configs_to_first_tp(config, ranks: list[int], group=None):
                     group_size=config.awq_block_size,
                     quantization=config.quantization,
                 )
-                if config.quantization == QUANTIZATION_NVFP4_AWQ:
+                if config.quantization in [
+                    QUANTIZATION_NVFP4_AWQ,
+                    QUANTIZATION_NVFP4_SVDQUANT,
+                ]:
                     # We have to update weight_scaling_factor and weight_scaling_factor_2
                     config.weights_scaling_factor, config.weights_scaling_factor_2 = (
                         NVFP4QTensor.get_weights_scaling_factor(
@@ -430,6 +434,7 @@ def _merge_model_configs_to_first_tp(config, ranks: list[int], group=None):
             if config.quantization in [
                 QUANTIZATION_NVFP4,
                 QUANTIZATION_NVFP4_AWQ,
+                QUANTIZATION_NVFP4_SVDQUANT,
             ]:
                 (
                     config.weights_scaling_factor,
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 1a64d028f..7584a6259 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -57,6 +57,7 @@
     QUANTIZATION_NONE,
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
+    QUANTIZATION_NVFP4_SVDQUANT,
     QUANTIZATION_W4A8_AWQ,
     QUANTIZATION_W4A8_MXFP4_FP8,
     QUANTIZATION_W4A8_NVFP4_FP8,
@@ -165,7 +166,7 @@ def resmooth_and_get_scale(
         )
         new_weights.append(weight)
         # If NVFP4_AWQ then we view the scales as uint8 to allow for cat later
-        if quantization == QUANTIZATION_NVFP4_AWQ:
+        if quantization in [QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT]:
            scale, _ = NVFP4QTensor.get_weights_scaling_factor(weight, group_size).view(torch.uint8)
         else:
             scale = get_scaling_factor_from_weight(weight, group_size)
@@ -176,7 +177,7 @@
     return (
         torch.cat(new_weights, dim=0),
         resmoothed_scales.view(torch.float8_e4m3fn)
-        if quantization == QUANTIZATION_NVFP4_AWQ
+        if quantization in [QUANTIZATION_NVFP4_AWQ, QUANTIZATION_NVFP4_SVDQUANT]
         else resmoothed_scales,  # if NVFP4_AWQ we view the scales back as float8_e4m3fn after cat
         new_pre_quant_scale,
     )
@@ -243,6 +244,7 @@ def get_activation_scaling_factor(
     if get_quantization_format(module) in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_SVDQUANT,
     ]:
         return NVFP4QTensor.get_activation_scaling_factor(input_quantizer)
     return get_scaling_factor(input_quantizer)
@@ -270,6 +272,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
     if quantization_format in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_SVDQUANT,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
@@ -303,6 +306,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
     if get_quantization_format(module) in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_SVDQUANT,
     ]:
         return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
     elif get_quantization_format(module) == QUANTIZATION_W4A8_NVFP4_FP8:
@@ -487,6 +491,12 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
         block_sizes = getattr(weight_quantizer, "block_sizes")
         scale_bits = block_sizes.get("scale_bits")

+    if (
+        input_quantizer is not None
+        and hasattr(input_quantizer, "_pre_quant_scale")
+        and hasattr(weight_quantizer, "svdquant_lora_a")
+    ):
+        return QUANTIZATION_NVFP4_SVDQUANT
     if input_quantizer is not None and hasattr(input_quantizer, "_pre_quant_scale"):
         return QUANTIZATION_NVFP4_AWQ
     if getattr(layer, "fused_with_prequant", False):
@@ -660,15 +670,18 @@ def process_layer_quant_config(layer_config_dict):
         elif v == "w4a8_nvfp4_fp8":
"w4a8_nvfp4_fp8": layer_config = { "quant_algo": "W4A8_NVFP4_FP8", - "group_size": layer_config_dict[prefix + ".awq_block_size"], - "has_zero_point": False, - "pre_quant_scale": True, + "group_size": block_size_value, } elif v == "w4a8_mxfp4_fp8": layer_config = { "quant_algo": "W4A8_MXFP4_FP8", "group_size": block_size_value, } + elif v == "nvfp4_svdquant": + layer_config = { + "quant_algo": "NVFP4_SVD", + "group_size": block_size_value, + } else: layer_config = {"quant_algo": v} @@ -813,7 +826,12 @@ def to_quantized_weight( if quantization in [QUANTIZATION_INT4_AWQ, QUANTIZATION_W4A8_AWQ]: return pack_int4_in_uint8(weight, weights_scaling_factor) - if quantization in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, QUANTIZATION_W4A8_NVFP4_FP8]: + if quantization in [ + QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_W4A8_NVFP4_FP8, + QUANTIZATION_NVFP4_SVDQUANT, + ]: assert block_size is not None, "Block size not passed. Unable to quantize to NVFP4 format." assert weights_scaling_factor2 is not None, ( "Weights scaling factor 2 not passed. Unable to quantize to NVFP4 format" diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 37b7f3b64..2a7e07dba 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -62,6 +62,7 @@ QUANTIZATION_NONE, QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_NVFP4_SVDQUANT, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ) @@ -414,6 +415,7 @@ def _export_quantized_weight( if quantization_format in [ QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_NVFP4_SVDQUANT, QUANTIZATION_NVFP4, QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, @@ -434,7 +436,11 @@ def _export_quantized_weight( for expert_type in ["Llama4TextExperts", "GptOssExperts"] ) - if quantization_format in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + if quantization_format in [ + QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_NVFP4_SVDQUANT, + ]: # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim) # for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization weight, _ = maybe_transpose_expert_weight_dimensions( From 8a2cab285cf954ceec6a9905ac7add51eb128128 Mon Sep 17 00:00:00 2001 From: Shiyang Chen Date: Tue, 6 Jan 2026 14:23:58 -0800 Subject: [PATCH 2/5] handle q/k/v and gate/up merging for svdquant Signed-off-by: Shiyang Chen --- modelopt/torch/export/quant_utils.py | 55 ++++++++++++++++++---- modelopt/torch/export/unified_export_hf.py | 7 ++- modelopt/torch/quantization/model_calib.py | 29 +++++++----- 3 files changed, 68 insertions(+), 23 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 7584a6259..3beb100a6 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -25,7 +25,11 @@ import torch.nn as nn from modelopt import __version__ -from modelopt.torch.quantization.model_calib import enable_stats_collection, finish_stats_collection +from modelopt.torch.quantization.model_calib import ( + enable_stats_collection, + finish_stats_collection, + svd, +) from modelopt.torch.quantization.nn.modules.quant_linear import RealQuantLinear from modelopt.torch.quantization.qtensor import ( FP8QTensor, @@ -491,11 +495,7 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames block_sizes = getattr(weight_quantizer, "block_sizes") scale_bits = 
block_sizes.get("scale_bits") - if ( - input_quantizer is not None - and hasattr(input_quantizer, "_pre_quant_scale") - and hasattr(weight_quantizer, "svdquant_lora_a") - ): + if input_quantizer is not None and hasattr(weight_quantizer, "svdquant_lora_a"): return QUANTIZATION_NVFP4_SVDQUANT if input_quantizer is not None and hasattr(input_quantizer, "_pre_quant_scale"): return QUANTIZATION_NVFP4_AWQ @@ -1032,6 +1032,40 @@ def _update_pre_quant_scale(module, new_pre_quant_scale): finish_stats_collection(module.weight_quantizer) +def _update_svdquant(modules, new_pre_quant_scale): + """Updates the pre_quant_scale, svdquant_lora_a and svdquant_lora_b matrices when pre_quant_scale is changed.""" + new_pre_quant_scale = new_pre_quant_scale.to(torch.float32) + lora_a = [m.weight_quantizer.svdquant_lora_a.to(torch.float32) for m in modules] + lora_b = [m.weight_quantizer.svdquant_lora_b.to(torch.float32) for m in modules] + weight = [m.weight.to(torch.float32) for m in modules] + old_pre_quant_scale = [m.input_quantizer._pre_quant_scale.to(torch.float32) for m in modules] + weight = [ + (w + (lb @ la)) * (s / new_pre_quant_scale) + for w, la, lb, s in zip(weight, lora_a, lora_b, old_pre_quant_scale) + ] + weight_concatenated = torch.cat(weight, dim=0) + lb, la = svd(weight_concatenated, rank=lora_a[0].shape[0]) + weight_concatenated -= lb @ la + weight_concatenated = weight_concatenated.to(modules[0].weight.dtype) + la = la.to(modules[0].weight_quantizer.svdquant_lora_a.dtype) + lb = lb.to(modules[0].weight_quantizer.svdquant_lora_b.dtype) + new_pre_quant_scale = new_pre_quant_scale.to(modules[0].input_quantizer.pre_quant_scale.dtype) + + index = 0 + for i, module in enumerate(modules): + module.input_quantizer.pre_quant_scale = new_pre_quant_scale + module.weight_quantizer.svdquant_lora_a = la + assert lora_b[i].shape[0] == module.weight.shape[0] + module.weight_quantizer.svdquant_lora_b = lb[index : index + lora_b[i].shape[0], :] + module.weight = nn.Parameter(weight_concatenated[index : index + lora_b[i].shape[0], :]) + index += lora_b[i].shape[0] + # Redo weights collection + module.weight_quantizer.reset_amax() + enable_stats_collection(module.weight_quantizer) + module.weight_quantizer(module.weight) + finish_stats_collection(module.weight_quantizer) + + # Format: (list of target modules, tuple of (linear_to_fuse_into, linear_from_with_scale)) PQS_FUSE_MODULE_MAPPING = [ # Attention: Fuse o_proj's pre_quant_scale into v_proj's output dimension @@ -1184,9 +1218,12 @@ def preprocess_linear_fusion(modules: list[torch.nn.Module], resmooth_only=False dim=0, ) - for module in modules: - if not torch.equal(module.input_quantizer.pre_quant_scale, avg_prequant_scale): - _update_pre_quant_scale(module, avg_prequant_scale) + if hasattr(modules[0].weight_quantizer, "svdquant_lora_a"): + _update_svdquant(modules, avg_prequant_scale) + else: + for module in modules: + if not torch.equal(module.input_quantizer.pre_quant_scale, avg_prequant_scale): + _update_pre_quant_scale(module, avg_prequant_scale) if resmooth_only: return diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 2a7e07dba..189c8f5f9 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -234,6 +234,10 @@ def requantize_resmooth_fused_llm_layers(model: torch.nn.Module): model_type = type(model).__name__.lower() module_names = set() + # NVFP4 SVDQuant does not need pre-quant scale fusion (either into previous linear or layernorm) because 
+    # 1) its kernel handles pre-quant scale.
+    # 2) fusing into the previous linear would need to change the lora_up in up_proj, which may cause issues in
+    #    the later gate/up fusion.
     # Fuse pre_quant_scale to the linear weights if possible
     if quantization_format is not None and "nvfp4_awq" in quantization_format.lower():
         fuse_prequant_to_linear(model)
@@ -244,7 +248,8 @@ def requantize_resmooth_fused_llm_layers(model: torch.nn.Module):

         # For MoE models update pre_quant_scale to average pre_quant_scale amongst experts
         if is_moe(module) and (
-            quantization_format is not QUANTIZATION_NONE and "awq" in quantization_format
+            quantization_format is not QUANTIZATION_NONE
+            and ("awq" in quantization_format or quantization_format == QUANTIZATION_NVFP4_SVDQUANT)
         ):
             # update_experts_avg_prequant_scale(module)
             grouped_experts = get_experts_list(module, model_type)
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
index b8461a080..0cd9b63f8 100644
--- a/modelopt/torch/quantization/model_calib.py
+++ b/modelopt/torch/quantization/model_calib.py
@@ -1075,6 +1075,18 @@ def _get_awq_quantizer_block_size(tensor: torch.Tensor, quantizer: TensorQuantiz
     return blocksize


+def svd(weight, rank):
+    original_device = weight.device
+    original_dtype = weight.dtype
+    weight_f64 = weight.to(dtype=torch.float64, device=original_device)
+    u, s, vt = torch.linalg.svd(weight_f64, full_matrices=False)
+    us = u[:, :rank] * s[:rank]
+    vt = vt[:rank]
+    return us.to(device=original_device, dtype=original_dtype), vt.to(
+        device=original_device, dtype=original_dtype
+    )
+
+
 @torch.no_grad()
 def svdquant(
     model: nn.Module,
@@ -1096,25 +1108,16 @@ def svdquant(
     def postprocess(module, name):
         print_rank_0(f"SVD {name}")
         weight = module.weight.data
-        original_device = weight.device
-        original_dtype = weight.dtype
-        weight_f64 = weight.to(dtype=torch.float64, device=original_device)
-        u, s, vt = torch.linalg.svd(weight_f64, full_matrices=False)
-        if u.shape[1] < lowrank or vt.shape[0] < lowrank:
+        us, vt = svd(weight, lowrank)
+        if us.shape[1] < lowrank or vt.shape[0] < lowrank:
             warnings.warn(
                 "The low-rank dimensions do not match the layer dimensions. "
                 "Please verify your configuration and model settings. "
                 f"SVD will be skipped for this layer {name}."
             )
             return
-        us = u[:, :lowrank] * s[:lowrank]
-        vt = vt[:lowrank]
-        module.weight_quantizer.svdquant_lora_a = vt.to(
-            dtype=original_dtype, device=original_device
-        )
-        module.weight_quantizer.svdquant_lora_b = us.to(
-            dtype=original_dtype, device=original_device
-        )
+        module.weight_quantizer.svdquant_lora_a = vt
+        module.weight_quantizer.svdquant_lora_b = us
         module.weight.data.sub_(
             module.weight_quantizer.svdquant_lora_b @ module.weight_quantizer.svdquant_lora_a
         )

From b7dabf54ba0d7caae46250957311bf684589a232 Mon Sep 17 00:00:00 2001
From: Shiyang Chen
Date: Fri, 16 Jan 2026 14:41:35 -0800
Subject: [PATCH 3/5] update based on review

Signed-off-by: Shiyang Chen
---
 examples/llm_ptq/hf_ptq.py           | 4 ++++
 modelopt/torch/export/quant_utils.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 5da22bf6d..051ee95b7 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -507,6 +507,10 @@ def export_quantized(
         or args.sparsity_fmt != "dense"
         or "int8_sq" in args.qformat
     ):
+        if (
+            args.inference_tensor_parallel != 1 or args.inference_pipeline_parallel != 1
+        ) and args.qformat == "nvfp4_svdquant":
+            raise NotImplementedError("Svdquant does not support mulitple GPUs yet.")
         warnings.warn(
             "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime."
         )
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 3beb100a6..f76539048 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -1218,7 +1218,7 @@ def preprocess_linear_fusion(modules: list[torch.nn.Module], resmooth_only=False
             dim=0,
         )

-        if hasattr(modules[0].weight_quantizer, "svdquant_lora_a"):
+        if getattr(modules[0].weight_quantizer, "svdquant_lora_a", None) is not None:
             _update_svdquant(modules, avg_prequant_scale)
         else:
             for module in modules:

From 55ca49f4abcf8635e660529da18ce90e4dd83b9a Mon Sep 17 00:00:00 2001
From: Shiyang Chen
Date: Tue, 20 Jan 2026 13:12:06 -0800
Subject: [PATCH 4/5] add unittest and fix a bug in existing unittests

Signed-off-by: Shiyang Chen
---
 modelopt/torch/export/quant_utils.py           |  4 +-
 modelopt/torch/quantization/model_calib.py     | 25 +++++----
 ...unified_hf_export_and_check_safetensors.py  | 55 ++++++++++---------
 3 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index f76539048..81ea8561c 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -1218,7 +1218,9 @@ def preprocess_linear_fusion(modules: list[torch.nn.Module], resmooth_only=False
             dim=0,
         )

-        if getattr(modules[0].weight_quantizer, "svdquant_lora_a", None) is not None:
+        if all(
+            getattr(m.weight_quantizer, "svdquant_lora_a", None) is not None for m in modules
+        ):
             _update_svdquant(modules, avg_prequant_scale)
         else:
             for module in modules:
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
index 0cd9b63f8..d27fe7214 100644
--- a/modelopt/torch/quantization/model_calib.py
+++ b/modelopt/torch/quantization/model_calib.py
@@ -1082,9 +1082,21 @@ def svd(weight, rank):
     u, s, vt = torch.linalg.svd(weight_f64, full_matrices=False)
     us = u[:, :rank] * s[:rank]
     vt = vt[:rank]
-    return us.to(device=original_device, dtype=original_dtype), vt.to(
-        device=original_device, dtype=original_dtype
-    )
+    us = us.to(device=original_device, dtype=original_dtype)
+    vt = vt.to(device=original_device, dtype=original_dtype)
+    if us.shape[1] < rank or vt.shape[0] < rank:
+        warnings.warn(
+            "The low-rank dimensions do not match the layer dimensions. "
+            "Please verify your configuration and model settings. "
+            f"Rank is {us.shape[1]} and {vt.shape[0]}"
+        )
+        us_temp = torch.zeros((us.shape[0], rank), dtype=us.dtype, device=us.device)
+        vt_temp = torch.zeros((rank, vt.shape[1]), dtype=vt.dtype, device=vt.device)
+        us_temp[:, : us.shape[1]] = us
+        vt_temp[: vt.shape[0], :] = vt
+        us = us_temp
+        vt = vt_temp
+    return us, vt


 @torch.no_grad()
@@ -1109,13 +1121,6 @@ def postprocess(module, name):
         print_rank_0(f"SVD {name}")
         weight = module.weight.data
         us, vt = svd(weight, lowrank)
-        if us.shape[1] < lowrank or vt.shape[0] < lowrank:
-            warnings.warn(
-                "The low-rank dimensions do not match the layer dimensions. "
-                "Please verify your configuration and model settings. "
-                f"SVD will be skipped for this layer {name}."
-            )
-            return
         module.weight_quantizer.svdquant_lora_a = vt
         module.weight_quantizer.svdquant_lora_b = us
         module.weight.data.sub_(
             module.weight_quantizer.svdquant_lora_b @ module.weight_quantizer.svdquant_lora_a
         )
diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
index a6f360872..23a1439e9 100644
--- a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
+++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
@@ -29,15 +29,17 @@
         "fuse_input_scale",
         "fuse_weight_scale",
         "fuse_weight_scale_2",
-        "fuse_prequant_scale",
+        "fuse_pre_quant_scale",
+        "fuse_svdquant_lora_a",
     ),
     [
-        ("fp8", "tiny_llama-fp8", True, False, True, True),
-        ("nvfp4", "tiny_llama-nvfp4", True, False, True, True),
-        ("nvfp4_awq", "tiny_llama-nvfp4-awq", True, False, True, True),
-        ("int4_awq", "tiny_llama-int4-awq", True, False, True, True),
-        ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, True),
-        ("int8_wo", "tiny_llama-int8-wo", False, False, False, False),
+        ("fp8", "tiny_llama-fp8", True, False, True, True, False),
+        ("nvfp4", "tiny_llama-nvfp4", True, False, True, True, False),
+        ("nvfp4_awq", "tiny_llama-nvfp4-awq", True, False, True, True, False),
+        ("int4_awq", "tiny_llama-int4-awq", True, False, True, True, False),
+        ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, True, False),
+        ("int8_wo", "tiny_llama-int8-wo", False, False, False, False, False),
+        ("nvfp4_svdquant", "tiny_llama-nvfp4-svdquant", True, False, True, True, True),
     ],
 )
 def test_unified_hf_export_and_check_safetensors(
@@ -47,7 +49,8 @@ def test_unified_hf_export_and_check_safetensors(
     fuse_input_scale,
     fuse_weight_scale,
     fuse_weight_scale_2,
-    fuse_prequant_scale,
+    fuse_pre_quant_scale,
+    fuse_svdquant_lora_a,
 ):
     """
     1) Generates a .safetensors file by running hf_ptq.py with each --qformat.
@@ -92,6 +95,18 @@
         f"Expected .safetensors file not found for qformat={qformat}: {generated_file}"
     )

+    # Map scale types to their conditions
+    scale_types = [
+        ("input_scale", fuse_input_scale),
+        ("weight_scale", fuse_weight_scale),
+        ("weight_scale_2", fuse_weight_scale_2),
+        ("pre_quant_scale", fuse_pre_quant_scale),
+        ("weight_quantizer._svdquant_lora_a", fuse_svdquant_lora_a),
+    ]
+
+    # Projection pairs to check for equality
+    proj_pairs = [("gate_proj", "up_proj"), ("q_proj", "k_proj"), ("q_proj", "v_proj")]
+
     def _same_scale(name, key1, key2, f):
         if key1 in name:
             tensor1 = f.get_tensor(name)
@@ -108,23 +123,11 @@ def _same_scale(name, key1, key2, f):
             assert tensor.shape is not None, f"Tensor '{name}' shape is None!"
             assert tensor.dtype is not None, f"Tensor '{name}' dtype is None!"

-            if "scale" in name:
-                # Map scale types to their conditions
-                scale_types = [
-                    ("input_scale", fuse_input_scale),
-                    ("weight_scale", fuse_weight_scale),
-                    ("weight_scale_2", fuse_weight_scale_2),
-                    ("prequant_scale", fuse_prequant_scale),
-                ]
-
-                # Projection pairs to check for equality
-                proj_pairs = [("gate_proj", "up_proj"), ("q_proj", "k_proj"), ("q_proj", "v_proj")]
-
-                # Check each scale type if its condition is met
-                for scale_suffix, condition in scale_types:
-                    if name.endswith(scale_suffix) and condition:
-                        # Check each projection pair
-                        for proj1, proj2 in proj_pairs:
-                            _same_scale(name, proj1, proj2, f)
+            # Check each scale type if its condition is met
+            for scale_suffix, condition in scale_types:
+                if name.endswith(scale_suffix) and condition:
+                    # Check each projection pair
+                    for proj1, proj2 in proj_pairs:
+                        _same_scale(name, proj1, proj2, f)

     # TODO: Load a pre-dumped log to compare textually or use pre-defined dict for sanity checks

From 9ed776847ff07af55a6093d9154cd8e337f2e984 Mon Sep 17 00:00:00 2001
From: Shiyang Chen
Date: Wed, 21 Jan 2026 13:03:18 -0800
Subject: [PATCH 5/5] fix typo

Signed-off-by: Shiyang Chen
---
 examples/llm_ptq/hf_ptq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 051ee95b7..e970f7f7c 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -510,7 +510,7 @@ def export_quantized(
         if (
             args.inference_tensor_parallel != 1 or args.inference_pipeline_parallel != 1
         ) and args.qformat == "nvfp4_svdquant":
-            raise NotImplementedError("Svdquant does not support mulitple GPUs yet.")
+            raise NotImplementedError("Svdquant does not support multiple GPUs yet.")
         warnings.warn(
             "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime."
         )
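
Note: the short sketch below is illustrative only and is not part of the patch series above. It restates, with plain PyTorch tensors, the weight decomposition that the new svd() helper and the svdquant() postprocess perform: the weight is split into lora_b @ lora_a plus a residual, the factors are stored as svdquant_lora_b / svdquant_lora_a on the weight quantizer, and only the residual remains in module.weight for NVFP4 quantization. The helper name svd_lowrank_split and the toy shapes are hypothetical, not taken from the patches.

import torch

def svd_lowrank_split(weight: torch.Tensor, rank: int):
    # Factor W in float64 for numerical stability, as the svd() helper does,
    # and return (lora_b, lora_a) with shapes (out_features, rank) and (rank, in_features).
    w64 = weight.to(torch.float64)
    u, s, vt = torch.linalg.svd(w64, full_matrices=False)
    lora_b = (u[:, :rank] * s[:rank]).to(weight.dtype)
    lora_a = vt[:rank].to(weight.dtype)
    return lora_b, lora_a

# Toy usage: the residual is what module.weight holds after the postprocess
# subtracts lora_b @ lora_a; only the residual is later quantized to NVFP4.
w = torch.randn(128, 64)
lora_b, lora_a = svd_lowrank_split(w, rank=16)
residual = w - lora_b @ lora_a
assert torch.allclose(lora_b @ lora_a + residual, w, atol=1e-5)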