From 44402d26da893ee199a2c7397dece32fb8a549f9 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:35:04 -0800 Subject: [PATCH 1/6] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 79 ++++++--- examples/llm_ptq/hf_ptq.py | 186 +++++++++++++++++---- examples/llm_ptq/vlm_utils.py | 131 +++++++++++---- modelopt/torch/export/model_utils.py | 12 +- modelopt/torch/export/unified_export_hf.py | 68 +++++--- 5 files changed, 363 insertions(+), 113 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 40f700781..1929ef2ce 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -28,6 +28,7 @@ from accelerate.utils import get_max_memory from transformers import ( AutoConfig, + AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer, @@ -64,27 +65,39 @@ def run_nemotron_vl_preview( """ from vlm_utils import run_text_only_generation, run_vl_preview_generation - print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") - question = tokenizer.decode(input_ids[0], skip_special_tokens=True) - generation_config = { - "max_new_tokens": 100, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } - - # Try text-only generation - text_response = run_text_only_generation( - full_model, tokenizer, question, generation_config, pyt_ckpt_path - ) + # Check if this is Nemotron-Parse (encoder-decoder model that requires images) + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + generated_ids = None + + if not is_nemotron_parse: + # Only try text-only generation for models that support it (not Nemotron-Parse) + print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") + question = tokenizer.decode(input_ids[0], skip_special_tokens=True) + generation_config = { + "max_new_tokens": 100, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + + # Try text-only generation + text_response = run_text_only_generation( + full_model, tokenizer, question, generation_config, pyt_ckpt_path + ) - if text_response is not None: - print(f"✅ Text-only generation successful: {text_response[:100]}...") - generated_ids = text_response - elif allow_fallback: - print("Text-only generation failed, falling back to standard generate...") - generated_ids = full_model.generate(input_ids, max_new_tokens=100) + if text_response is not None: + print(f"✅ Text-only generation successful: {text_response[:100]}...") + generated_ids = text_response + elif allow_fallback: + print("Text-only generation failed, falling back to standard generate...") + generated_ids = full_model.generate(input_ids, max_new_tokens=100) else: - generated_ids = None + print( + f"Skipping text-only generation for Nemotron-Parse ({stage_name}) - " + "this encoder-decoder model requires images for all operations." 
+ ) # Run additional VL test with images print(f"Running additional VL test with images ({stage_name})...") @@ -95,6 +108,10 @@ def run_nemotron_vl_preview( def _is_multimodal_config(config): """Check if a config indicates a multimodal model (config-only version of is_multimodal_model).""" + # Check for Nemotron-Parse encoder-decoder architecture + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal @@ -103,6 +120,7 @@ def _is_multimodal_config(config): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers + or is_nemotron_parse # Nemotron-Parse conditional generation model ) @@ -257,8 +275,19 @@ def get_processor( ) return MllamaImageProcessor(processor, device) - - return None + else: + # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) + # This will only work if the model has a processor config + try: + processor = AutoProcessor.from_pretrained( + ckpt_path, + **model_kwargs, + ) + print(f"Loaded AutoProcessor for model type: {model_type}") + return processor + except Exception as e: + print(f"Could not load processor for {model_type}: {e}") + return None def get_dtype(dtype): @@ -320,8 +349,6 @@ def get_model( model_kwargs.setdefault("torch_dtype", "auto") if "vila" in ckpt_path.lower(): - from transformers import AutoModel - hf_vila = AutoModel.from_pretrained( ckpt_path, device_map=device_map, @@ -353,13 +380,13 @@ def get_model( if not hasattr(transformers, architecture): warnings.warn( f"Architecture {architecture} not found in transformers: {transformers.__version__}. " - "Falling back to AutoModelForCausalLM." + "Falling back to AutoModel." ) assert trust_remote_code, ( "Please set trust_remote_code to True if you want to use this architecture" ) - auto_model_module = AutoModelForCausalLM + auto_model_module = AutoModel from_config = auto_model_module.from_config else: auto_model_module = getattr(transformers, architecture) @@ -370,7 +397,7 @@ def get_model( # unless specified by the hf_config. torch_dtype = getattr(hf_config, "torch_dtype", torch.float16) model_kwargs2 = model_kwargs.copy() - if auto_model_module != AutoModelForCausalLM: + if auto_model_module not in [AutoModelForCausalLM, AutoModel]: model_kwargs2.pop("trust_remote_code", None) model_kwargs2["torch_dtype"] = torch_dtype model_kwargs2.pop("max_memory", None) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index a9862a742..962a564f7 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -97,6 +97,76 @@ mto.enable_huggingface_checkpointing() +def create_nemotron_parse_calib_wrapper(base_dataloader, processor, device, decoder_only=False): + """Wrap a text-only dataloader to add dummy images for Nemotron-Parse calibration. + + Nemotron-Parse is an encoder-decoder model that requires pixel_values (for encoder) + and decoder_input_ids (for decoder) during calibration. This wrapper adds properly + formatted dummy images and decoder inputs. 
+ + Args: + base_dataloader: The base text-only dataloader + processor: The Nemotron-Parse processor + device: Device to place tensors on + decoder_only: If True, only provide decoder inputs (for when quantizing just the decoder) + """ + import torch + from PIL import Image + + class NemotronParseCalibWrapper: + def __init__(self, base_dataloader, processor, device, decoder_only=False): + self.base_dataloader = base_dataloader + self.processor = processor + self.device = device + self.decoder_only = decoder_only + # Create a simple dummy image (will be processed by the model's processor) + self.dummy_image = Image.new('RGB', (1024, 1280), color='white') + + def __iter__(self): + for batch in self.base_dataloader: + # batch contains input_ids and attention_mask from text data + batch_size = batch['input_ids'].shape[0] + + if self.decoder_only: + # When calibrating just the decoder, it expects input_ids directly + # (not decoder_input_ids, as that's only for the full encoder-decoder forward) + # Just pass through the original batch + yield batch + else: + # When calibrating the full model, we need pixel_values and decoder_input_ids + # Process dummy images using the Nemotron-Parse processor + dummy_images = [self.dummy_image] * batch_size + + # Use the processor to get properly formatted pixel_values + prompts = [""] * batch_size + processed = self.processor( + text=prompts, + images=dummy_images, + return_tensors="pt" + ) + + # For encoder-decoder models like Nemotron-Parse: + # - pixel_values go to the vision encoder + # - decoder_input_ids are needed for the decoder + batch['pixel_values'] = processed['pixel_values'].to(self.device) + batch['decoder_input_ids'] = processed['input_ids'].to(self.device) + batch['decoder_attention_mask'] = processed['attention_mask'].to(self.device) + + # Remove the encoder input_ids and attention_mask as they're not needed + # The model will use pixel_values for the encoder + if 'input_ids' in batch: + del batch['input_ids'] + if 'attention_mask' in batch: + del batch['attention_mask'] + + yield batch + + def __len__(self): + return len(self.base_dataloader) + + return NemotronParseCalibWrapper(base_dataloader, processor, device, decoder_only) + + def make_calib_dataloader( args: argparse.Namespace, language_model: torch.nn.Module, @@ -317,6 +387,18 @@ def load_model(args: argparse.Namespace): args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] + + # Check if this is a Nemotron VL model that needs a processor + is_nemotron_vl_model = is_nemotron_vl(full_model) + if is_nemotron_vl_model: + # Load processor for Nemotron VL models (like Nemotron-Parse) + processor = get_processor( + args.pyt_ckpt_path, + model_type, + device, + trust_remote_code=args.trust_remote_code, + ) + tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code) default_padding_side = tokenizer.padding_side @@ -569,10 +651,20 @@ def pre_quantize( post-quantize generation. 
""" + # Check if this is Nemotron-Parse (encoder-decoder model) + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + # Only run single sample for preview - preview_input_ids = next(iter(calib_dataloader))[ - "input_features" if model_type == "whisper" else "input_ids" - ][0:1] + # For Nemotron-Parse, use decoder_input_ids instead of input_ids + sample_batch = next(iter(calib_dataloader)) + if is_nemotron_parse and "decoder_input_ids" in sample_batch: + preview_input_ids = sample_batch["decoder_input_ids"][0:1] + elif model_type == "whisper": + preview_input_ids = sample_batch["input_features"][0:1] + else: + preview_input_ids = sample_batch["input_ids"][0:1] # Generate preview before quantization if is_nemotron_vl_model and tokenizer is not None: @@ -693,36 +785,46 @@ def quantize_main( device: torch.device, ): if args.batch_size == 0: - # Calibration/sparsification will actually take much more memory than regular inference - # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio - # to 2 to avoid OOM for AWQ/SmoothQuant fake quantization as it will take more memory than inference. - sample_memory_usage_ratio = 2 if "awq" in args.qformat or "sq" in args.qformat else 1.1 - # Whisper model expects mel-spectrogram input features of length 3000 - # Whisper model needs input of shape (batch_size, num_mel_bins, 3000) - # As the encoder of Whisper doesn't have embedding layer, input dtype has to be float - # For non-Whisper models (language models), sample_input will be set up inside get_max_batch_size() - if model_type == "whisper": - max_sample_length = 3000 - num_mel_bins = language_model.config.num_mel_bins - sample_input_single_batch = ( - torch.ones([1, num_mel_bins, max_sample_length], dtype=language_model.dtype).to( - language_model.device - ) - * 100 + # Check if this is a vision-language model + # For VL models, skip automatic batch size detection and use a conservative default + # since proper multimodal input preparation is complex + if is_multimodal_model(full_model) or is_nemotron_vl(full_model): + print( + "Vision-language model detected. Using default batch_size=1 for calibration " + "to ensure proper handling of multimodal inputs." ) + args.batch_size = 1 else: - sample_input_single_batch = None + # Calibration/sparsification will actually take much more memory than regular inference + # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio + # to 2 to avoid OOM for AWQ/SmoothQuant fake quantization as it will take more memory than inference. 
+ sample_memory_usage_ratio = 2 if "awq" in args.qformat or "sq" in args.qformat else 1.1 + # Whisper model expects mel-spectrogram input features of length 3000 + # Whisper model needs input of shape (batch_size, num_mel_bins, 3000) + # As the encoder of Whisper doesn't have embedding layer, input dtype has to be float + # For non-Whisper models (language models), sample_input will be set up inside get_max_batch_size() + if model_type == "whisper": + max_sample_length = 3000 + num_mel_bins = language_model.config.num_mel_bins + sample_input_single_batch = ( + torch.ones([1, num_mel_bins, max_sample_length], dtype=language_model.dtype).to( + language_model.device + ) + * 100 + ) + else: + sample_input_single_batch = None - run_auto_quant = args.auto_quantize_bits is not None + run_auto_quant = args.auto_quantize_bits is not None - args.batch_size = get_max_batch_size( - language_model, - max_sample_length=args.calib_seq, - sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0, - sample_input_single_batch=sample_input_single_batch, - enable_grad=run_auto_quant, - ) - args.batch_size = min(args.batch_size, sum(args.calib_size)) + args.batch_size = get_max_batch_size( + language_model, + max_sample_length=args.calib_seq, + sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0, + sample_input_single_batch=sample_input_single_batch, + enable_grad=run_auto_quant, + ) + args.batch_size = min(args.batch_size, sum(args.calib_size)) print(f"Use calib batch_size {args.batch_size}") @@ -733,6 +835,32 @@ def quantize_main( # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) + # For Nemotron-Parse, wrap the text-only dataloader to add dummy images + # Nemotron-Parse is an encoder-decoder model that requires pixel_values + if is_nemotron_vl_model and processor is not None: + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + if is_nemotron_parse: + # Check if we're quantizing just the decoder or the full model + decoder_only = language_model is not full_model + + if decoder_only: + print( + "Calibration will use text-only inputs for Nemotron-Parse decoder. " + "Vision encoder is excluded from quantization." + ) + else: + print( + "Wrapping calibration dataloader for Nemotron-Parse to add dummy images. " + "Nemotron-Parse requires pixel_values for full model calibration." 
+ ) + + calib_dataloader = create_nemotron_parse_calib_wrapper( + calib_dataloader, processor, device, decoder_only=decoder_only + ) + preview_input_ids, generated_ids_before_ptq = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 6c9d921b8..4789130cd 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -18,7 +18,7 @@ import os from PIL import Image -from transformers import AutoImageProcessor, AutoProcessor +from transformers import AutoImageProcessor, AutoProcessor, GenerationConfig def run_vl_preview_generation(model, tokenizer, model_path, stage_name): @@ -73,13 +73,34 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(" Skipping VL preview generation.") return None + # Check if this is Nemotron-Parse early to set up proper generation config + config = model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + # Generate response question = "Describe this image briefly." # Updated for single image - generation_config = { - "max_new_tokens": 50, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } + + # Use model's GenerationConfig for Nemotron-Parse, dict for others + if is_nemotron_parse: + try: + generation_config = GenerationConfig.from_pretrained( + model_path, trust_remote_code=True + ) + print("Using Nemotron-Parse GenerationConfig from model") + except Exception as e: + print(f"Warning: Could not load GenerationConfig: {e}, using defaults") + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + else: + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } print(f"Generating VL response ({stage_name})...") @@ -105,27 +126,39 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): else: processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - messages = [ - {"role": "system", "content": "/no_think"}, - { - "role": "user", - "content": [ - { - "type": "image", - "image": "", - }, - { - "type": "text", - "text": question, - }, - ], - }, - ] + # Check if this is Nemotron-Parse (uses task prompts instead of chat templates) + config = model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - # Apply chat template - prompt = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) + if is_nemotron_parse: + # Nemotron-Parse uses a specific task prompt format + # See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example + prompt = "" + print(f"Using Nemotron-Parse task prompt: {prompt}") + else: + # Other VL models use chat templates + messages = [ + {"role": "system", "content": "/no_think"}, + { + "role": "user", + "content": [ + { + "type": "image", + "image": "", + }, + { + "type": "text", + "text": question, + }, + ], + }, + ] + + # Apply chat template + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Process inputs using the processor with single image inputs = processor( @@ -139,21 +172,49 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): inputs = inputs.to(model_device) print(f" Moved inputs to 
{model_device}") + # Verify we have pixel_values for the vision encoder + if not hasattr(inputs, 'pixel_values') or inputs.pixel_values is None: + raise ValueError("Processor did not generate pixel_values. Check processor configuration.") + # Generate response using model.generate - generated_ids = model.generate( - pixel_values=inputs.pixel_values, - input_ids=inputs.input_ids, - attention_mask=inputs.attention_mask, - **generation_config, - ) + if isinstance(generation_config, GenerationConfig): + # For Nemotron-Parse with GenerationConfig object + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + generation_config=generation_config, + ) + else: + # For other models with dict generation config + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + **generation_config, + ) # Decode the response (trim input tokens like in the working example) + if generated_ids is None: + raise ValueError("Model generate returned None") + generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] - output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - ) + + # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode + if is_nemotron_parse and hasattr(tokenizer, 'batch_decode'): + output_text = tokenizer.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + else: + output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + if output_text is None or len(output_text) == 0: + raise ValueError("Decoding returned empty output") + response = output_text[0] print(f"✅ VL generation {stage_name} successful!") diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 5a24429ad..40c313ad2 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -85,6 +85,7 @@ def is_multimodal_model(model): - Vision LoRA configurations - Audio processing capabilities - Image embedding layers + - Nemotron-Parse conditional generation models Args: model: The HuggingFace model instance to check @@ -103,6 +104,10 @@ def is_multimodal_model(model): """ config = model.config + # Check for Nemotron-Parse encoder-decoder architecture + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or hasattr(model, "language_model") # Language model attribute (e.g., LLaVA) @@ -112,6 +117,7 @@ def is_multimodal_model(model): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers + or is_nemotron_parse # Nemotron-Parse conditional generation model ) @@ -141,5 +147,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: if hasattr(model, "language_model"): return [model, model.language_model] - # Pattern 3: No language_model found + # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model + if hasattr(model, "decoder"): + return [model, model.decoder] + + # Pattern 4: No language_model found return None diff --git 
a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index ccfc01200..76b982a3a 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -155,12 +155,14 @@ def _output_hook(module, input, output): # Run forward pass so that all modules sharing the same input are collected using forward hook. + # Check if this is Nemotron-Parse (encoder-decoder VL model) + architectures = getattr(model.config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + with set_quantizer_by_cfg_context(model, {"*": {"enable": False}}): - if getattr(model.config, "is_encoder_decoder", False): - # For encoder-decoder models, we need to pass both the encoder and decoder input ids - model(fake_input, decoder_input_ids=decoder_fake_input) - elif is_vl_model and "nemotron" in model_type: - # For Nemotron VL models, try to run optimization on just the language model part + if is_vl_model and ("nemotron" in model_type or is_nemotron_parse): + # For Nemotron VL models (including Nemotron-Parse), run optimization on just the language model/decoder + # This avoids needing to create proper pixel_values for the vision encoder language_model_lineage = get_language_model_from_vl(model) if language_model_lineage is not None: @@ -177,6 +179,9 @@ def _output_hook(module, input, output): "This is required for requantization/resmoothing optimization. " "Please ensure the model architecture is supported or file an issue." ) + elif getattr(model.config, "is_encoder_decoder", False): + # For other encoder-decoder models (non-VL), we need to pass both encoder and decoder input ids + model(fake_input, decoder_input_ids=decoder_fake_input) else: model(fake_input) @@ -257,25 +262,36 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 - weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) - - if weight_quantizer._amax.dim() == 1: - # Per-tensor amax - weight_scaling_factor = torch.tensor( - weight_quantizer.amax.item() / weight_quantizer.maxbound - ) + # Note: Use the public 'amax' property, not the private '_amax' attribute + if hasattr(weight_quantizer, '_amax') and weight_quantizer._amax is not None: + weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) + amax_tensor = weight_quantizer._amax else: - # Per-channel amax - weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) + # Fallback to public amax property + amax_tensor = weight_quantizer.amax + if amax_tensor is not None and hasattr(amax_tensor, 'to'): + amax_tensor = amax_tensor.to(torch.float32) + + # Only compute scaling factor if amax_tensor is valid + if amax_tensor is not None and hasattr(amax_tensor, 'dim'): + if amax_tensor.dim() == 1: + # Per-tensor amax + weight_scaling_factor = torch.tensor( + weight_quantizer.amax.item() / weight_quantizer.maxbound + ) + else: + # Per-channel amax + weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) - sub_module.register_buffer( - quantizer_attrs.weight_scale, - weight_scaling_factor, - ) + sub_module.register_buffer( + quantizer_attrs.weight_scale, + weight_scaling_factor, + ) - if hasattr(input_quantizer, "_amax"): + if hasattr(input_quantizer, "_amax") or (hasattr(input_quantizer, "amax") and input_quantizer.amax is not None): assert input_quantizer is not None - input_quantizer._amax = input_quantizer._amax.to(torch.float32) + if 
hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: + input_quantizer._amax = input_quantizer._amax.to(torch.float32) sub_module.register_buffer( quantizer_attrs.input_scale, @@ -284,9 +300,10 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax"): + if hasattr(output_quantizer, "_amax") or (hasattr(output_quantizer, "amax") and output_quantizer.amax is not None): assert output_quantizer is not None - output_quantizer._amax = output_quantizer._amax.to(torch.float32) + if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: + output_quantizer._amax = output_quantizer._amax.to(torch.float32) else: # Register weight_scale and input_scale if quantization_format == QUANTIZATION_FP8_PB_REAL: @@ -327,6 +344,13 @@ def _export_quantized_weight( weight_scale: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale, None) weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None) + # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module + # This can happen for modules that were disabled from quantization or have invalid calibration data + if weight_scale is None and quantization_format not in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + # For NVFP4, weight_scale is computed later, so we can't check here + print(f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found") + return + # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) # Check if this is a BMM-style expert weight that needs transposition is_bmm_expert_weight = weight.dim() == 3 and any( From 6870525add1215eb409c2e3e127e365db3cfe00a Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:36:03 -0800 Subject: [PATCH 2/6] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/hf_ptq.py | 27 +++++++++++----------- examples/llm_ptq/vlm_utils.py | 16 +++++++++---- modelopt/torch/export/unified_export_hf.py | 27 +++++++++++++++------- 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 962a564f7..b583fe48b 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -110,7 +110,6 @@ def create_nemotron_parse_calib_wrapper(base_dataloader, processor, device, deco device: Device to place tensors on decoder_only: If True, only provide decoder inputs (for when quantizing just the decoder) """ - import torch from PIL import Image class NemotronParseCalibWrapper: @@ -120,12 +119,12 @@ def __init__(self, base_dataloader, processor, device, decoder_only=False): self.device = device self.decoder_only = decoder_only # Create a simple dummy image (will be processed by the model's processor) - self.dummy_image = Image.new('RGB', (1024, 1280), color='white') + self.dummy_image = Image.new("RGB", (1024, 1280), color="white") def __iter__(self): for batch in self.base_dataloader: # batch contains input_ids and attention_mask from text data - batch_size = batch['input_ids'].shape[0] + batch_size = batch["input_ids"].shape[0] if self.decoder_only: # When calibrating just the decoder, it expects input_ids directly @@ -138,26 +137,26 @@ def __iter__(self): dummy_images = [self.dummy_image] * batch_size # Use the processor to get properly formatted pixel_values - prompts = [""] * batch_size + prompts = [ + "" + ] * batch_size processed = self.processor( - text=prompts, - images=dummy_images, - return_tensors="pt" + 
text=prompts, images=dummy_images, return_tensors="pt" ) # For encoder-decoder models like Nemotron-Parse: # - pixel_values go to the vision encoder # - decoder_input_ids are needed for the decoder - batch['pixel_values'] = processed['pixel_values'].to(self.device) - batch['decoder_input_ids'] = processed['input_ids'].to(self.device) - batch['decoder_attention_mask'] = processed['attention_mask'].to(self.device) + batch["pixel_values"] = processed["pixel_values"].to(self.device) + batch["decoder_input_ids"] = processed["input_ids"].to(self.device) + batch["decoder_attention_mask"] = processed["attention_mask"].to(self.device) # Remove the encoder input_ids and attention_mask as they're not needed # The model will use pixel_values for the encoder - if 'input_ids' in batch: - del batch['input_ids'] - if 'attention_mask' in batch: - del batch['attention_mask'] + if "input_ids" in batch: + del batch["input_ids"] + if "attention_mask" in batch: + del batch["attention_mask"] yield batch diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 4789130cd..2d3d9f82c 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -173,8 +173,10 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(f" Moved inputs to {model_device}") # Verify we have pixel_values for the vision encoder - if not hasattr(inputs, 'pixel_values') or inputs.pixel_values is None: - raise ValueError("Processor did not generate pixel_values. Check processor configuration.") + if not hasattr(inputs, "pixel_values") or inputs.pixel_values is None: + raise ValueError( + "Processor did not generate pixel_values. Check processor configuration." + ) # Generate response using model.generate if isinstance(generation_config, GenerationConfig): @@ -203,13 +205,17 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): ] # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode - if is_nemotron_parse and hasattr(tokenizer, 'batch_decode'): + if is_nemotron_parse and hasattr(tokenizer, "batch_decode"): output_text = tokenizer.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, ) else: output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, ) if output_text is None or len(output_text) == 0: diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 76b982a3a..32a5d49d4 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -263,17 +263,17 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 # Note: Use the public 'amax' property, not the private '_amax' attribute - if hasattr(weight_quantizer, '_amax') and weight_quantizer._amax is not None: + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) amax_tensor = weight_quantizer._amax else: # Fallback to public amax property amax_tensor = weight_quantizer.amax - if amax_tensor is not None and hasattr(amax_tensor, 'to'): + if amax_tensor is not None and hasattr(amax_tensor, "to"): amax_tensor = amax_tensor.to(torch.float32) # Only compute scaling 
factor if amax_tensor is valid - if amax_tensor is not None and hasattr(amax_tensor, 'dim'): + if amax_tensor is not None and hasattr(amax_tensor, "dim"): if amax_tensor.dim() == 1: # Per-tensor amax weight_scaling_factor = torch.tensor( @@ -281,14 +281,18 @@ def _export_quantized_weight( ) else: # Per-channel amax - weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) + weight_scaling_factor = torch.tensor( + weight_quantizer.amax / weight_quantizer.maxbound + ) sub_module.register_buffer( quantizer_attrs.weight_scale, weight_scaling_factor, ) - if hasattr(input_quantizer, "_amax") or (hasattr(input_quantizer, "amax") and input_quantizer.amax is not None): + if hasattr(input_quantizer, "_amax") or ( + hasattr(input_quantizer, "amax") and input_quantizer.amax is not None + ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: input_quantizer._amax = input_quantizer._amax.to(torch.float32) @@ -300,7 +304,9 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax") or (hasattr(output_quantizer, "amax") and output_quantizer.amax is not None): + if hasattr(output_quantizer, "_amax") or ( + hasattr(output_quantizer, "amax") and output_quantizer.amax is not None + ): assert output_quantizer is not None if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: output_quantizer._amax = output_quantizer._amax.to(torch.float32) @@ -346,9 +352,14 @@ def _export_quantized_weight( # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module # This can happen for modules that were disabled from quantization or have invalid calibration data - if weight_scale is None and quantization_format not in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + if weight_scale is None and quantization_format not in [ + QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_AWQ, + ]: # For NVFP4, weight_scale is computed later, so we can't check here - print(f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found") + print( + f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found" + ) return # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) From e1bd0137d682de23a3feb37c4df13149cc344838 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:38:52 -0800 Subject: [PATCH 3/6] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/unified_export_hf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 32a5d49d4..eaefcbe9d 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -291,7 +291,9 @@ def _export_quantized_weight( ) if hasattr(input_quantizer, "_amax") or ( - hasattr(input_quantizer, "amax") and input_quantizer.amax is not None + input_quantizer is not None + and hasattr(input_quantizer, "amax") + and input_quantizer.amax is not None ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: @@ -305,7 +307,9 @@ def _export_quantized_weight( ) if hasattr(output_quantizer, "_amax") or ( - hasattr(output_quantizer, "amax") and output_quantizer.amax is not None + output_quantizer is not None + and hasattr(output_quantizer, "amax") + and output_quantizer.amax is not None ): assert output_quantizer is not None 
if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: From 52eee8444ccce560679c19eb148b353c5e7fd15e Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 20 Jan 2026 17:42:11 -0800 Subject: [PATCH 4/6] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 90 ++++++++-- examples/llm_ptq/hf_ptq.py | 198 +++++++++++++++------ modelopt/torch/export/unified_export_hf.py | 6 +- 3 files changed, 222 insertions(+), 72 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 1929ef2ce..3aaae3556 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -221,9 +221,33 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok if "vila" in ckpt_path.lower(): ckpt_path += "/llm" - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, trust_remote_code=trust_remote_code, **kwargs - ) + # Suppress verbose tokenizer output (e.g., printing all special tokens) + import contextlib + import io + import logging + import os + + # Save current settings + old_verbosity = os.environ.get("TOKENIZERS_PARALLELISM", None) + transformers_log_level = logging.getLogger("transformers").level + + # Suppress output + os.environ["TOKENIZERS_PARALLELISM"] = "false" + logging.getLogger("transformers").setLevel(logging.ERROR) + + # Also capture stdout to suppress verbose tokenizer printing + with contextlib.redirect_stdout(io.StringIO()): + try: + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, trust_remote_code=trust_remote_code, **kwargs + ) + finally: + # Restore original settings + if old_verbosity is not None: + os.environ["TOKENIZERS_PARALLELISM"] = old_verbosity + else: + os.environ.pop("TOKENIZERS_PARALLELISM", None) + logging.getLogger("transformers").setLevel(transformers_log_level) # can't set attribute 'pad_token' for "" # We skip this step for Nemo models @@ -279,10 +303,23 @@ def get_processor( # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) # This will only work if the model has a processor config try: - processor = AutoProcessor.from_pretrained( - ckpt_path, - **model_kwargs, - ) + import contextlib + import io + import logging + + # Suppress verbose output from processor/tokenizer loading + transformers_log_level = logging.getLogger("transformers").level + logging.getLogger("transformers").setLevel(logging.ERROR) + + with contextlib.redirect_stdout(io.StringIO()): + processor = AutoProcessor.from_pretrained( + ckpt_path, + **model_kwargs, + ) + + # Restore logging + logging.getLogger("transformers").setLevel(transformers_log_level) + print(f"Loaded AutoProcessor for model type: {model_type}") return processor except Exception as e: @@ -330,12 +367,26 @@ def get_model( # Load config once and handle VL model detection try: hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs) + + # Check specifically for Nemotron-Parse + architectures = getattr(hf_config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + if is_nemotron_vl(hf_config): - print( - "Detected Nemotron VL model from config. " - "Disabling automatic device mapping for compatibility." - ) - device_map = None + if is_nemotron_parse: + # Nemotron-Parse works fine with device_map="auto" + # Keep device_map="auto" to ensure proper device placement + print( + "Detected Nemotron-Parse model from config. " + "Using automatic device mapping." 
+ ) + else: + # For other Nemotron VL models, disable device_map for compatibility + print( + "Detected Nemotron VL model from config. " + "Disabling automatic device mapping for compatibility." + ) + device_map = None except Exception as e: print(f"Error: Could not load config from {ckpt_path}: {e}") raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e @@ -433,6 +484,21 @@ def get_model( print(f"Moving model to {device} device...") model = model.to(device) + # For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device + # The RADIO encoder has buffers that might not be properly moved even with device_map="auto" + # This is because custom RADIO modules might not fully support accelerate's device_map + if device != "cpu" and hasattr(model, "encoder"): + # Check if encoder has any buffers on CPU + cpu_buffers = [] + for name, buffer in model.encoder.named_buffers(): + if buffer.device.type == "cpu": + cpu_buffers.append(name) + + if cpu_buffers: + print(f"Found {len(cpu_buffers)} encoder buffers on CPU. Moving encoder to {device}...") + model.encoder = model.encoder.to(device) + print(f"Encoder moved to {device}") + if device == "cuda" and not is_model_on_gpu(model): print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM") diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b583fe48b..7e0c0316e 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -64,6 +64,7 @@ ) from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor from modelopt.torch.utils.memory_monitor import launch_memory_monitor +from modelopt.torch.utils.nemotron_vlm_dataset_utils import get_nemotron_vlm_dataset_dataloader from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader @@ -173,9 +174,50 @@ def make_calib_dataloader( tokenizer: PreTrainedTokenizerBase | None, device: torch.device, model_type: str | None, + full_model: torch.nn.Module | None = None, ) -> tuple[DataLoader, str | None]: calib_dataloader = None first_text_speech_dataset = None + + # Check if this is Nemotron-Parse - use image-text data for better calibration + if full_model is not None: + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + if is_nemotron_parse and processor is not None: + print( + "✓ Detected Nemotron-Parse model. Using image-text dataset for calibration " + "to provide realistic visual embeddings to the decoder." + ) + + # Override dataset to use image-text dataset if not specified + supported_datasets = ["nemotron_vlm_v2", "chartqa", "scienceqa"] + if not args.dataset or args.dataset[0] not in supported_datasets: + print( + f"[INFO] Dataset '{args.dataset}' is not a supported image-text dataset. " + f"Automatically using 'nemotron_vlm_v2' for Nemotron-Parse calibration." 
+ ) + dataset_to_use = "nemotron_vlm_v2" + else: + dataset_to_use = args.dataset[0] + + # Nemotron-Parse needs single dataset for now + if len(args.calib_size) > 1: + print(f"[INFO] Using first calib_size value: {args.calib_size[0]}") + calib_size_to_use = args.calib_size[0] + else: + calib_size_to_use = args.calib_size[0] if args.calib_size else 512 + + calib_dataloader = get_nemotron_vlm_dataset_dataloader( + dataset_name=dataset_to_use, + processor=processor, + batch_size=args.batch_size, + num_samples=calib_size_to_use, + device=device, # Move data to model's device + ) + return calib_dataloader, first_text_speech_dataset + if model_type == "mllama": assert processor is not None and isinstance(processor, MllamaImageProcessor), ( "The MllamaImageProcessor must be set." @@ -377,18 +419,35 @@ def load_model(args: argparse.Namespace): trust_remote_code=args.trust_remote_code, ) else: + # Check if this is a Nemotron VL model that needs a processor + # Do this BEFORE setting default datasets so we can use image-text data for Nemotron-Parse + is_nemotron_vl_model = is_nemotron_vl(full_model) + + # Check specifically for Nemotron-Parse to set appropriate dataset defaults + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + if args.dataset is None: - args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] - warnings.warn( - "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." - ) + if is_nemotron_parse: + # For Nemotron-Parse, default to Nemotron VLM Dataset v2 + args.dataset = ["nemotron_vlm_v2"] + print( + "No dataset specified. Defaulting to 'nemotron_vlm_v2' for Nemotron-Parse " + "(NVIDIA's image-text dataset for better calibration)." + ) + else: + # For other models, use text-only datasets + args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] + warnings.warn( + "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." + ) + # Adjust calib_size to match dataset length by extending or truncating as needed args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] - # Check if this is a Nemotron VL model that needs a processor - is_nemotron_vl_model = is_nemotron_vl(full_model) if is_nemotron_vl_model: # Load processor for Nemotron VL models (like Nemotron-Parse) processor = get_processor( @@ -404,26 +463,41 @@ def load_model(args: argparse.Namespace): # Left padding usually provides better calibration result. tokenizer.padding_side = "left" - # We only quantize the language model for VLMs other than the type supported above. - language_model_lineage = get_language_model_from_vl(full_model) - if language_model_lineage is not None: - language_model = language_model_lineage.pop(-1) - ancestors = language_model_lineage - # Apply disabled quant to all modules that are not part of language_model so we can exclude them during - # HF export. 
- disabled_quant_cfg = { - "quant_cfg": {"default": {"enable": False}}, - "algorithm": "max", - } - - memo = set(ancestors) | {language_model} - for ancestor in ancestors: - for _, module in ancestor.named_children(): - if module not in memo: - mtq.quantize(module, disabled_quant_cfg, forward_loop=None) - memo.add(module) - - model_type = get_model_type(language_model) + # Check if this is Nemotron-Parse + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + # For Nemotron-Parse, DON'T extract the decoder + # We want to calibrate the full model so the decoder sees realistic visual embeddings + # The vision encoder won't be quantized (disabled via quant_cfg in mono_quantize) + if is_nemotron_parse: + print( + "Nemotron-Parse detected: Keeping full encoder-decoder model for calibration " + "with image-text data. Vision encoder will be disabled from quantization." + ) + # language_model = full_model (already set above) + else: + # For other VLMs, extract the language model for quantization + language_model_lineage = get_language_model_from_vl(full_model) + if language_model_lineage is not None: + language_model = language_model_lineage.pop(-1) + ancestors = language_model_lineage + # Apply disabled quant to all modules that are not part of language_model so we can exclude them during + # HF export. + disabled_quant_cfg = { + "quant_cfg": {"default": {"enable": False}}, + "algorithm": "max", + } + + memo = set(ancestors) | {language_model} + for ancestor in ancestors: + for _, module in ancestor.named_children(): + if module not in memo: + mtq.quantize(module, disabled_quant_cfg, forward_loop=None) + memo.add(module) + + model_type = get_model_type(language_model) if model_type == "phi4mm": warnings.warn("Please set the default input_mode to InputMode.LANGUAGE before quantizing.") @@ -494,14 +568,23 @@ def mono_quantize( "Consider reducing calib_size to reduce calibration time.\n####\n" ) + # Check if this is Nemotron-Parse + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + original_forward = None # Track original forward method if we wrap it + # For Nemotron VL models, disable quantization of vision components if is_nemotron_vl_model: print("Disabling quantization for vision components in Nemotron VL model") quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} quant_cfg["quant_cfg"]["*image*"] = {"enable": False} - # Also disable radio model components specifically + # Also disable radio model components specifically (for Nemotron-Parse) quant_cfg["quant_cfg"]["*radio*"] = {"enable": False} quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + quant_cfg["quant_cfg"]["*encoder*"] = {"enable": False} # Disable encoder + quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False} # Nemotron-Parse specific + print("Quantization will only be applied to the decoder (text generation) component") if not model_is_already_quantized or calibration_only: if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only": @@ -513,9 +596,25 @@ def mono_quantize( if not use_calibration: warnings.warn("Dynamic quantization. 
Calibration skipped.") - calibrate_loop = ( - create_forward_loop(dataloader=calib_dataloader) if use_calibration else None - ) + + # Create calibration loop + if use_calibration: + if is_nemotron_parse: + # For Nemotron-Parse, wrap the model to force use_cache=False + print("Wrapping Nemotron-Parse model for calibration (use_cache=False)") + original_forward = language_model.forward + + def wrapped_forward(*args, **kwargs): + kwargs["use_cache"] = False + return original_forward(*args, **kwargs) + + # Temporarily replace forward method + language_model.forward = wrapped_forward + calibrate_loop = create_forward_loop(dataloader=calib_dataloader) + else: + calibrate_loop = create_forward_loop(dataloader=calib_dataloader) + else: + calibrate_loop = None if calibration_only: language_model = mtq.calibrate( @@ -524,8 +623,15 @@ def mono_quantize( else: language_model = mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop) - # For VL models, update full_model to use the quantized language model - if is_nemotron_vl_model: + # Restore original forward method if we wrapped it for Nemotron-Parse + if is_nemotron_parse and original_forward is not None: + print("Restoring original forward method after calibration") + language_model.forward = original_forward + original_forward = None + + # For VL models (except Nemotron-Parse), update full_model to use the quantized language model + # For Nemotron-Parse, language_model IS full_model, so no update needed + if is_nemotron_vl_model and language_model is not full_model: language_model_lineage = get_language_model_from_vl(full_model) if language_model_lineage is not None: print("Updating full_model with quantized language_model...") @@ -828,38 +934,12 @@ def quantize_main( print(f"Use calib batch_size {args.batch_size}") calib_dataloader, first_text_speech_dataset = make_calib_dataloader( - args, language_model, processor, tokenizer, device, model_type + args, language_model, processor, tokenizer, device, model_type, full_model ) # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) - # For Nemotron-Parse, wrap the text-only dataloader to add dummy images - # Nemotron-Parse is an encoder-decoder model that requires pixel_values - if is_nemotron_vl_model and processor is not None: - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - - if is_nemotron_parse: - # Check if we're quantizing just the decoder or the full model - decoder_only = language_model is not full_model - - if decoder_only: - print( - "Calibration will use text-only inputs for Nemotron-Parse decoder. " - "Vision encoder is excluded from quantization." - ) - else: - print( - "Wrapping calibration dataloader for Nemotron-Parse to add dummy images. " - "Nemotron-Parse requires pixel_values for full model calibration." 
- ) - - calib_dataloader = create_nemotron_parse_calib_wrapper( - calib_dataloader, processor, device, decoder_only=decoder_only - ) - preview_input_ids, generated_ids_before_ptq = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index eaefcbe9d..3c8dde154 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -172,7 +172,11 @@ def _output_hook(module, input, output): print( f"Running optimization on language model with fake_input shape: {fake_input.shape}" ) - language_model(fake_input) + # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors + if is_nemotron_parse: + language_model(fake_input, use_cache=False) + else: + language_model(fake_input) else: raise ValueError( f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " From e252fb27ee88d04f7c20c87a35d7e46d95e345d8 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 20 Jan 2026 17:42:46 -0800 Subject: [PATCH 5/6] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 3aaae3556..a244fe862 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -376,10 +376,7 @@ def get_model( if is_nemotron_parse: # Nemotron-Parse works fine with device_map="auto" # Keep device_map="auto" to ensure proper device placement - print( - "Detected Nemotron-Parse model from config. " - "Using automatic device mapping." - ) + print("Detected Nemotron-Parse model from config. Using automatic device mapping.") else: # For other Nemotron VL models, disable device_map for compatibility print( From 64fa1fa250e4d5ef26266802ce1ea72a70ccd73e Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 21 Jan 2026 01:17:41 -0800 Subject: [PATCH 6/6] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/hf_ptq.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 7e0c0316e..802b658f2 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -602,11 +602,14 @@ def mono_quantize( if is_nemotron_parse: # For Nemotron-Parse, wrap the model to force use_cache=False print("Wrapping Nemotron-Parse model for calibration (use_cache=False)") - original_forward = language_model.forward + # Store original forward before wrapping + _original_forward = language_model.forward + original_forward = _original_forward # Capture in outer scope def wrapped_forward(*args, **kwargs): kwargs["use_cache"] = False - return original_forward(*args, **kwargs) + # Call the captured forward method + return _original_forward(*args, **kwargs) # Temporarily replace forward method language_model.forward = wrapped_forward