From 44402d26da893ee199a2c7397dece32fb8a549f9 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:35:04 -0800 Subject: [PATCH 1/6] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 79 ++++++--- examples/llm_ptq/hf_ptq.py | 186 +++++++++++++++++---- examples/llm_ptq/vlm_utils.py | 131 +++++++++++---- modelopt/torch/export/model_utils.py | 12 +- modelopt/torch/export/unified_export_hf.py | 68 +++++--- 5 files changed, 363 insertions(+), 113 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 40f700781..1929ef2ce 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -28,6 +28,7 @@ from accelerate.utils import get_max_memory from transformers import ( AutoConfig, + AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer, @@ -64,27 +65,39 @@ def run_nemotron_vl_preview( """ from vlm_utils import run_text_only_generation, run_vl_preview_generation - print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") - question = tokenizer.decode(input_ids[0], skip_special_tokens=True) - generation_config = { - "max_new_tokens": 100, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } - - # Try text-only generation - text_response = run_text_only_generation( - full_model, tokenizer, question, generation_config, pyt_ckpt_path - ) + # Check if this is Nemotron-Parse (encoder-decoder model that requires images) + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + generated_ids = None + + if not is_nemotron_parse: + # Only try text-only generation for models that support it (not Nemotron-Parse) + print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") + question = tokenizer.decode(input_ids[0], skip_special_tokens=True) + generation_config = { + "max_new_tokens": 100, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + + # Try text-only generation + text_response = run_text_only_generation( + full_model, tokenizer, question, generation_config, pyt_ckpt_path + ) - if text_response is not None: - print(f"✅ Text-only generation successful: {text_response[:100]}...") - generated_ids = text_response - elif allow_fallback: - print("Text-only generation failed, falling back to standard generate...") - generated_ids = full_model.generate(input_ids, max_new_tokens=100) + if text_response is not None: + print(f"✅ Text-only generation successful: {text_response[:100]}...") + generated_ids = text_response + elif allow_fallback: + print("Text-only generation failed, falling back to standard generate...") + generated_ids = full_model.generate(input_ids, max_new_tokens=100) else: - generated_ids = None + print( + f"Skipping text-only generation for Nemotron-Parse ({stage_name}) - " + "this encoder-decoder model requires images for all operations." 
+ ) # Run additional VL test with images print(f"Running additional VL test with images ({stage_name})...") @@ -95,6 +108,10 @@ def run_nemotron_vl_preview( def _is_multimodal_config(config): """Check if a config indicates a multimodal model (config-only version of is_multimodal_model).""" + # Check for Nemotron-Parse encoder-decoder architecture + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal @@ -103,6 +120,7 @@ def _is_multimodal_config(config): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers + or is_nemotron_parse # Nemotron-Parse conditional generation model ) @@ -257,8 +275,19 @@ def get_processor( ) return MllamaImageProcessor(processor, device) - - return None + else: + # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) + # This will only work if the model has a processor config + try: + processor = AutoProcessor.from_pretrained( + ckpt_path, + **model_kwargs, + ) + print(f"Loaded AutoProcessor for model type: {model_type}") + return processor + except Exception as e: + print(f"Could not load processor for {model_type}: {e}") + return None def get_dtype(dtype): @@ -320,8 +349,6 @@ def get_model( model_kwargs.setdefault("torch_dtype", "auto") if "vila" in ckpt_path.lower(): - from transformers import AutoModel - hf_vila = AutoModel.from_pretrained( ckpt_path, device_map=device_map, @@ -353,13 +380,13 @@ def get_model( if not hasattr(transformers, architecture): warnings.warn( f"Architecture {architecture} not found in transformers: {transformers.__version__}. " - "Falling back to AutoModelForCausalLM." + "Falling back to AutoModel." ) assert trust_remote_code, ( "Please set trust_remote_code to True if you want to use this architecture" ) - auto_model_module = AutoModelForCausalLM + auto_model_module = AutoModel from_config = auto_model_module.from_config else: auto_model_module = getattr(transformers, architecture) @@ -370,7 +397,7 @@ def get_model( # unless specified by the hf_config. torch_dtype = getattr(hf_config, "torch_dtype", torch.float16) model_kwargs2 = model_kwargs.copy() - if auto_model_module != AutoModelForCausalLM: + if auto_model_module not in [AutoModelForCausalLM, AutoModel]: model_kwargs2.pop("trust_remote_code", None) model_kwargs2["torch_dtype"] = torch_dtype model_kwargs2.pop("max_memory", None) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index a9862a742..962a564f7 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -97,6 +97,76 @@ mto.enable_huggingface_checkpointing() +def create_nemotron_parse_calib_wrapper(base_dataloader, processor, device, decoder_only=False): + """Wrap a text-only dataloader to add dummy images for Nemotron-Parse calibration. + + Nemotron-Parse is an encoder-decoder model that requires pixel_values (for encoder) + and decoder_input_ids (for decoder) during calibration. This wrapper adds properly + formatted dummy images and decoder inputs. 
+ + Args: + base_dataloader: The base text-only dataloader + processor: The Nemotron-Parse processor + device: Device to place tensors on + decoder_only: If True, only provide decoder inputs (for when quantizing just the decoder) + """ + import torch + from PIL import Image + + class NemotronParseCalibWrapper: + def __init__(self, base_dataloader, processor, device, decoder_only=False): + self.base_dataloader = base_dataloader + self.processor = processor + self.device = device + self.decoder_only = decoder_only + # Create a simple dummy image (will be processed by the model's processor) + self.dummy_image = Image.new('RGB', (1024, 1280), color='white') + + def __iter__(self): + for batch in self.base_dataloader: + # batch contains input_ids and attention_mask from text data + batch_size = batch['input_ids'].shape[0] + + if self.decoder_only: + # When calibrating just the decoder, it expects input_ids directly + # (not decoder_input_ids, as that's only for the full encoder-decoder forward) + # Just pass through the original batch + yield batch + else: + # When calibrating the full model, we need pixel_values and decoder_input_ids + # Process dummy images using the Nemotron-Parse processor + dummy_images = [self.dummy_image] * batch_size + + # Use the processor to get properly formatted pixel_values + prompts = [""] * batch_size + processed = self.processor( + text=prompts, + images=dummy_images, + return_tensors="pt" + ) + + # For encoder-decoder models like Nemotron-Parse: + # - pixel_values go to the vision encoder + # - decoder_input_ids are needed for the decoder + batch['pixel_values'] = processed['pixel_values'].to(self.device) + batch['decoder_input_ids'] = processed['input_ids'].to(self.device) + batch['decoder_attention_mask'] = processed['attention_mask'].to(self.device) + + # Remove the encoder input_ids and attention_mask as they're not needed + # The model will use pixel_values for the encoder + if 'input_ids' in batch: + del batch['input_ids'] + if 'attention_mask' in batch: + del batch['attention_mask'] + + yield batch + + def __len__(self): + return len(self.base_dataloader) + + return NemotronParseCalibWrapper(base_dataloader, processor, device, decoder_only) + + def make_calib_dataloader( args: argparse.Namespace, language_model: torch.nn.Module, @@ -317,6 +387,18 @@ def load_model(args: argparse.Namespace): args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] + + # Check if this is a Nemotron VL model that needs a processor + is_nemotron_vl_model = is_nemotron_vl(full_model) + if is_nemotron_vl_model: + # Load processor for Nemotron VL models (like Nemotron-Parse) + processor = get_processor( + args.pyt_ckpt_path, + model_type, + device, + trust_remote_code=args.trust_remote_code, + ) + tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code) default_padding_side = tokenizer.padding_side @@ -569,10 +651,20 @@ def pre_quantize( post-quantize generation. 
""" + # Check if this is Nemotron-Parse (encoder-decoder model) + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + # Only run single sample for preview - preview_input_ids = next(iter(calib_dataloader))[ - "input_features" if model_type == "whisper" else "input_ids" - ][0:1] + # For Nemotron-Parse, use decoder_input_ids instead of input_ids + sample_batch = next(iter(calib_dataloader)) + if is_nemotron_parse and "decoder_input_ids" in sample_batch: + preview_input_ids = sample_batch["decoder_input_ids"][0:1] + elif model_type == "whisper": + preview_input_ids = sample_batch["input_features"][0:1] + else: + preview_input_ids = sample_batch["input_ids"][0:1] # Generate preview before quantization if is_nemotron_vl_model and tokenizer is not None: @@ -693,36 +785,46 @@ def quantize_main( device: torch.device, ): if args.batch_size == 0: - # Calibration/sparsification will actually take much more memory than regular inference - # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio - # to 2 to avoid OOM for AWQ/SmoothQuant fake quantization as it will take more memory than inference. - sample_memory_usage_ratio = 2 if "awq" in args.qformat or "sq" in args.qformat else 1.1 - # Whisper model expects mel-spectrogram input features of length 3000 - # Whisper model needs input of shape (batch_size, num_mel_bins, 3000) - # As the encoder of Whisper doesn't have embedding layer, input dtype has to be float - # For non-Whisper models (language models), sample_input will be set up inside get_max_batch_size() - if model_type == "whisper": - max_sample_length = 3000 - num_mel_bins = language_model.config.num_mel_bins - sample_input_single_batch = ( - torch.ones([1, num_mel_bins, max_sample_length], dtype=language_model.dtype).to( - language_model.device - ) - * 100 + # Check if this is a vision-language model + # For VL models, skip automatic batch size detection and use a conservative default + # since proper multimodal input preparation is complex + if is_multimodal_model(full_model) or is_nemotron_vl(full_model): + print( + "Vision-language model detected. Using default batch_size=1 for calibration " + "to ensure proper handling of multimodal inputs." ) + args.batch_size = 1 else: - sample_input_single_batch = None + # Calibration/sparsification will actually take much more memory than regular inference + # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio + # to 2 to avoid OOM for AWQ/SmoothQuant fake quantization as it will take more memory than inference. 
+ sample_memory_usage_ratio = 2 if "awq" in args.qformat or "sq" in args.qformat else 1.1 + # Whisper model expects mel-spectrogram input features of length 3000 + # Whisper model needs input of shape (batch_size, num_mel_bins, 3000) + # As the encoder of Whisper doesn't have embedding layer, input dtype has to be float + # For non-Whisper models (language models), sample_input will be set up inside get_max_batch_size() + if model_type == "whisper": + max_sample_length = 3000 + num_mel_bins = language_model.config.num_mel_bins + sample_input_single_batch = ( + torch.ones([1, num_mel_bins, max_sample_length], dtype=language_model.dtype).to( + language_model.device + ) + * 100 + ) + else: + sample_input_single_batch = None - run_auto_quant = args.auto_quantize_bits is not None + run_auto_quant = args.auto_quantize_bits is not None - args.batch_size = get_max_batch_size( - language_model, - max_sample_length=args.calib_seq, - sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0, - sample_input_single_batch=sample_input_single_batch, - enable_grad=run_auto_quant, - ) - args.batch_size = min(args.batch_size, sum(args.calib_size)) + args.batch_size = get_max_batch_size( + language_model, + max_sample_length=args.calib_seq, + sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0, + sample_input_single_batch=sample_input_single_batch, + enable_grad=run_auto_quant, + ) + args.batch_size = min(args.batch_size, sum(args.calib_size)) print(f"Use calib batch_size {args.batch_size}") @@ -733,6 +835,32 @@ def quantize_main( # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) + # For Nemotron-Parse, wrap the text-only dataloader to add dummy images + # Nemotron-Parse is an encoder-decoder model that requires pixel_values + if is_nemotron_vl_model and processor is not None: + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + if is_nemotron_parse: + # Check if we're quantizing just the decoder or the full model + decoder_only = language_model is not full_model + + if decoder_only: + print( + "Calibration will use text-only inputs for Nemotron-Parse decoder. " + "Vision encoder is excluded from quantization." + ) + else: + print( + "Wrapping calibration dataloader for Nemotron-Parse to add dummy images. " + "Nemotron-Parse requires pixel_values for full model calibration." 
+ ) + + calib_dataloader = create_nemotron_parse_calib_wrapper( + calib_dataloader, processor, device, decoder_only=decoder_only + ) + preview_input_ids, generated_ids_before_ptq = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 6c9d921b8..4789130cd 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -18,7 +18,7 @@ import os from PIL import Image -from transformers import AutoImageProcessor, AutoProcessor +from transformers import AutoImageProcessor, AutoProcessor, GenerationConfig def run_vl_preview_generation(model, tokenizer, model_path, stage_name): @@ -73,13 +73,34 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(" Skipping VL preview generation.") return None + # Check if this is Nemotron-Parse early to set up proper generation config + config = model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + # Generate response question = "Describe this image briefly." # Updated for single image - generation_config = { - "max_new_tokens": 50, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } + + # Use model's GenerationConfig for Nemotron-Parse, dict for others + if is_nemotron_parse: + try: + generation_config = GenerationConfig.from_pretrained( + model_path, trust_remote_code=True + ) + print("Using Nemotron-Parse GenerationConfig from model") + except Exception as e: + print(f"Warning: Could not load GenerationConfig: {e}, using defaults") + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + else: + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } print(f"Generating VL response ({stage_name})...") @@ -105,27 +126,39 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): else: processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - messages = [ - {"role": "system", "content": "/no_think"}, - { - "role": "user", - "content": [ - { - "type": "image", - "image": "", - }, - { - "type": "text", - "text": question, - }, - ], - }, - ] + # Check if this is Nemotron-Parse (uses task prompts instead of chat templates) + config = model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - # Apply chat template - prompt = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) + if is_nemotron_parse: + # Nemotron-Parse uses a specific task prompt format + # See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example + prompt = "" + print(f"Using Nemotron-Parse task prompt: {prompt}") + else: + # Other VL models use chat templates + messages = [ + {"role": "system", "content": "/no_think"}, + { + "role": "user", + "content": [ + { + "type": "image", + "image": "", + }, + { + "type": "text", + "text": question, + }, + ], + }, + ] + + # Apply chat template + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Process inputs using the processor with single image inputs = processor( @@ -139,21 +172,49 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): inputs = inputs.to(model_device) print(f" Moved inputs to 
{model_device}") + # Verify we have pixel_values for the vision encoder + if not hasattr(inputs, 'pixel_values') or inputs.pixel_values is None: + raise ValueError("Processor did not generate pixel_values. Check processor configuration.") + # Generate response using model.generate - generated_ids = model.generate( - pixel_values=inputs.pixel_values, - input_ids=inputs.input_ids, - attention_mask=inputs.attention_mask, - **generation_config, - ) + if isinstance(generation_config, GenerationConfig): + # For Nemotron-Parse with GenerationConfig object + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + generation_config=generation_config, + ) + else: + # For other models with dict generation config + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + **generation_config, + ) # Decode the response (trim input tokens like in the working example) + if generated_ids is None: + raise ValueError("Model generate returned None") + generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] - output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - ) + + # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode + if is_nemotron_parse and hasattr(tokenizer, 'batch_decode'): + output_text = tokenizer.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + else: + output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + if output_text is None or len(output_text) == 0: + raise ValueError("Decoding returned empty output") + response = output_text[0] print(f"✅ VL generation {stage_name} successful!") diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 5a24429ad..40c313ad2 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -85,6 +85,7 @@ def is_multimodal_model(model): - Vision LoRA configurations - Audio processing capabilities - Image embedding layers + - Nemotron-Parse conditional generation models Args: model: The HuggingFace model instance to check @@ -103,6 +104,10 @@ def is_multimodal_model(model): """ config = model.config + # Check for Nemotron-Parse encoder-decoder architecture + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or hasattr(model, "language_model") # Language model attribute (e.g., LLaVA) @@ -112,6 +117,7 @@ def is_multimodal_model(model): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers + or is_nemotron_parse # Nemotron-Parse conditional generation model ) @@ -141,5 +147,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: if hasattr(model, "language_model"): return [model, model.language_model] - # Pattern 3: No language_model found + # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model + if hasattr(model, "decoder"): + return [model, model.decoder] + + # Pattern 4: No language_model found return None diff --git 
a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index ccfc01200..76b982a3a 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -155,12 +155,14 @@ def _output_hook(module, input, output): # Run forward pass so that all modules sharing the same input are collected using forward hook. + # Check if this is Nemotron-Parse (encoder-decoder VL model) + architectures = getattr(model.config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + with set_quantizer_by_cfg_context(model, {"*": {"enable": False}}): - if getattr(model.config, "is_encoder_decoder", False): - # For encoder-decoder models, we need to pass both the encoder and decoder input ids - model(fake_input, decoder_input_ids=decoder_fake_input) - elif is_vl_model and "nemotron" in model_type: - # For Nemotron VL models, try to run optimization on just the language model part + if is_vl_model and ("nemotron" in model_type or is_nemotron_parse): + # For Nemotron VL models (including Nemotron-Parse), run optimization on just the language model/decoder + # This avoids needing to create proper pixel_values for the vision encoder language_model_lineage = get_language_model_from_vl(model) if language_model_lineage is not None: @@ -177,6 +179,9 @@ def _output_hook(module, input, output): "This is required for requantization/resmoothing optimization. " "Please ensure the model architecture is supported or file an issue." ) + elif getattr(model.config, "is_encoder_decoder", False): + # For other encoder-decoder models (non-VL), we need to pass both encoder and decoder input ids + model(fake_input, decoder_input_ids=decoder_fake_input) else: model(fake_input) @@ -257,25 +262,36 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 - weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) - - if weight_quantizer._amax.dim() == 1: - # Per-tensor amax - weight_scaling_factor = torch.tensor( - weight_quantizer.amax.item() / weight_quantizer.maxbound - ) + # Note: Use the public 'amax' property, not the private '_amax' attribute + if hasattr(weight_quantizer, '_amax') and weight_quantizer._amax is not None: + weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) + amax_tensor = weight_quantizer._amax else: - # Per-channel amax - weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) + # Fallback to public amax property + amax_tensor = weight_quantizer.amax + if amax_tensor is not None and hasattr(amax_tensor, 'to'): + amax_tensor = amax_tensor.to(torch.float32) + + # Only compute scaling factor if amax_tensor is valid + if amax_tensor is not None and hasattr(amax_tensor, 'dim'): + if amax_tensor.dim() == 1: + # Per-tensor amax + weight_scaling_factor = torch.tensor( + weight_quantizer.amax.item() / weight_quantizer.maxbound + ) + else: + # Per-channel amax + weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) - sub_module.register_buffer( - quantizer_attrs.weight_scale, - weight_scaling_factor, - ) + sub_module.register_buffer( + quantizer_attrs.weight_scale, + weight_scaling_factor, + ) - if hasattr(input_quantizer, "_amax"): + if hasattr(input_quantizer, "_amax") or (hasattr(input_quantizer, "amax") and input_quantizer.amax is not None): assert input_quantizer is not None - input_quantizer._amax = input_quantizer._amax.to(torch.float32) + if 
hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: + input_quantizer._amax = input_quantizer._amax.to(torch.float32) sub_module.register_buffer( quantizer_attrs.input_scale, @@ -284,9 +300,10 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax"): + if hasattr(output_quantizer, "_amax") or (hasattr(output_quantizer, "amax") and output_quantizer.amax is not None): assert output_quantizer is not None - output_quantizer._amax = output_quantizer._amax.to(torch.float32) + if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: + output_quantizer._amax = output_quantizer._amax.to(torch.float32) else: # Register weight_scale and input_scale if quantization_format == QUANTIZATION_FP8_PB_REAL: @@ -327,6 +344,13 @@ def _export_quantized_weight( weight_scale: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale, None) weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None) + # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module + # This can happen for modules that were disabled from quantization or have invalid calibration data + if weight_scale is None and quantization_format not in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + # For NVFP4, weight_scale is computed later, so we can't check here + print(f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found") + return + # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) # Check if this is a BMM-style expert weight that needs transposition is_bmm_expert_weight = weight.dim() == 3 and any( From 6870525add1215eb409c2e3e127e365db3cfe00a Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:36:03 -0800 Subject: [PATCH 2/6] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/hf_ptq.py | 27 +++++++++++----------- examples/llm_ptq/vlm_utils.py | 16 +++++++++---- modelopt/torch/export/unified_export_hf.py | 27 +++++++++++++++------- 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 962a564f7..b583fe48b 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -110,7 +110,6 @@ def create_nemotron_parse_calib_wrapper(base_dataloader, processor, device, deco device: Device to place tensors on decoder_only: If True, only provide decoder inputs (for when quantizing just the decoder) """ - import torch from PIL import Image class NemotronParseCalibWrapper: @@ -120,12 +119,12 @@ def __init__(self, base_dataloader, processor, device, decoder_only=False): self.device = device self.decoder_only = decoder_only # Create a simple dummy image (will be processed by the model's processor) - self.dummy_image = Image.new('RGB', (1024, 1280), color='white') + self.dummy_image = Image.new("RGB", (1024, 1280), color="white") def __iter__(self): for batch in self.base_dataloader: # batch contains input_ids and attention_mask from text data - batch_size = batch['input_ids'].shape[0] + batch_size = batch["input_ids"].shape[0] if self.decoder_only: # When calibrating just the decoder, it expects input_ids directly @@ -138,26 +137,26 @@ def __iter__(self): dummy_images = [self.dummy_image] * batch_size # Use the processor to get properly formatted pixel_values - prompts = [""] * batch_size + prompts = [ + "" + ] * batch_size processed = self.processor( - text=prompts, - images=dummy_images, - return_tensors="pt" + 
text=prompts, images=dummy_images, return_tensors="pt" ) # For encoder-decoder models like Nemotron-Parse: # - pixel_values go to the vision encoder # - decoder_input_ids are needed for the decoder - batch['pixel_values'] = processed['pixel_values'].to(self.device) - batch['decoder_input_ids'] = processed['input_ids'].to(self.device) - batch['decoder_attention_mask'] = processed['attention_mask'].to(self.device) + batch["pixel_values"] = processed["pixel_values"].to(self.device) + batch["decoder_input_ids"] = processed["input_ids"].to(self.device) + batch["decoder_attention_mask"] = processed["attention_mask"].to(self.device) # Remove the encoder input_ids and attention_mask as they're not needed # The model will use pixel_values for the encoder - if 'input_ids' in batch: - del batch['input_ids'] - if 'attention_mask' in batch: - del batch['attention_mask'] + if "input_ids" in batch: + del batch["input_ids"] + if "attention_mask" in batch: + del batch["attention_mask"] yield batch diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 4789130cd..2d3d9f82c 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -173,8 +173,10 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(f" Moved inputs to {model_device}") # Verify we have pixel_values for the vision encoder - if not hasattr(inputs, 'pixel_values') or inputs.pixel_values is None: - raise ValueError("Processor did not generate pixel_values. Check processor configuration.") + if not hasattr(inputs, "pixel_values") or inputs.pixel_values is None: + raise ValueError( + "Processor did not generate pixel_values. Check processor configuration." + ) # Generate response using model.generate if isinstance(generation_config, GenerationConfig): @@ -203,13 +205,17 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): ] # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode - if is_nemotron_parse and hasattr(tokenizer, 'batch_decode'): + if is_nemotron_parse and hasattr(tokenizer, "batch_decode"): output_text = tokenizer.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, ) else: output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, ) if output_text is None or len(output_text) == 0: diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 76b982a3a..32a5d49d4 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -263,17 +263,17 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 # Note: Use the public 'amax' property, not the private '_amax' attribute - if hasattr(weight_quantizer, '_amax') and weight_quantizer._amax is not None: + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) amax_tensor = weight_quantizer._amax else: # Fallback to public amax property amax_tensor = weight_quantizer.amax - if amax_tensor is not None and hasattr(amax_tensor, 'to'): + if amax_tensor is not None and hasattr(amax_tensor, "to"): amax_tensor = amax_tensor.to(torch.float32) # Only compute scaling 
factor if amax_tensor is valid - if amax_tensor is not None and hasattr(amax_tensor, 'dim'): + if amax_tensor is not None and hasattr(amax_tensor, "dim"): if amax_tensor.dim() == 1: # Per-tensor amax weight_scaling_factor = torch.tensor( @@ -281,14 +281,18 @@ def _export_quantized_weight( ) else: # Per-channel amax - weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) + weight_scaling_factor = torch.tensor( + weight_quantizer.amax / weight_quantizer.maxbound + ) sub_module.register_buffer( quantizer_attrs.weight_scale, weight_scaling_factor, ) - if hasattr(input_quantizer, "_amax") or (hasattr(input_quantizer, "amax") and input_quantizer.amax is not None): + if hasattr(input_quantizer, "_amax") or ( + hasattr(input_quantizer, "amax") and input_quantizer.amax is not None + ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: input_quantizer._amax = input_quantizer._amax.to(torch.float32) @@ -300,7 +304,9 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax") or (hasattr(output_quantizer, "amax") and output_quantizer.amax is not None): + if hasattr(output_quantizer, "_amax") or ( + hasattr(output_quantizer, "amax") and output_quantizer.amax is not None + ): assert output_quantizer is not None if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: output_quantizer._amax = output_quantizer._amax.to(torch.float32) @@ -346,9 +352,14 @@ def _export_quantized_weight( # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module # This can happen for modules that were disabled from quantization or have invalid calibration data - if weight_scale is None and quantization_format not in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + if weight_scale is None and quantization_format not in [ + QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_AWQ, + ]: # For NVFP4, weight_scale is computed later, so we can't check here - print(f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found") + print( + f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found" + ) return # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) From e1bd0137d682de23a3feb37c4df13149cc344838 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:38:52 -0800 Subject: [PATCH 3/6] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/unified_export_hf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 32a5d49d4..eaefcbe9d 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -291,7 +291,9 @@ def _export_quantized_weight( ) if hasattr(input_quantizer, "_amax") or ( - hasattr(input_quantizer, "amax") and input_quantizer.amax is not None + input_quantizer is not None + and hasattr(input_quantizer, "amax") + and input_quantizer.amax is not None ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: @@ -305,7 +307,9 @@ def _export_quantized_weight( ) if hasattr(output_quantizer, "_amax") or ( - hasattr(output_quantizer, "amax") and output_quantizer.amax is not None + output_quantizer is not None + and hasattr(output_quantizer, "amax") + and output_quantizer.amax is not None ): assert output_quantizer is not None 
if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: From 52eee8444ccce560679c19eb148b353c5e7fd15e Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 20 Jan 2026 17:42:11 -0800 Subject: [PATCH 4/6] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 90 ++++++++-- examples/llm_ptq/hf_ptq.py | 198 +++++++++++++++------ modelopt/torch/export/unified_export_hf.py | 6 +- 3 files changed, 222 insertions(+), 72 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 1929ef2ce..3aaae3556 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -221,9 +221,33 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok if "vila" in ckpt_path.lower(): ckpt_path += "/llm" - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, trust_remote_code=trust_remote_code, **kwargs - ) + # Suppress verbose tokenizer output (e.g., printing all special tokens) + import contextlib + import io + import logging + import os + + # Save current settings + old_verbosity = os.environ.get("TOKENIZERS_PARALLELISM", None) + transformers_log_level = logging.getLogger("transformers").level + + # Suppress output + os.environ["TOKENIZERS_PARALLELISM"] = "false" + logging.getLogger("transformers").setLevel(logging.ERROR) + + # Also capture stdout to suppress verbose tokenizer printing + with contextlib.redirect_stdout(io.StringIO()): + try: + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, trust_remote_code=trust_remote_code, **kwargs + ) + finally: + # Restore original settings + if old_verbosity is not None: + os.environ["TOKENIZERS_PARALLELISM"] = old_verbosity + else: + os.environ.pop("TOKENIZERS_PARALLELISM", None) + logging.getLogger("transformers").setLevel(transformers_log_level) # can't set attribute 'pad_token' for "" # We skip this step for Nemo models @@ -279,10 +303,23 @@ def get_processor( # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) # This will only work if the model has a processor config try: - processor = AutoProcessor.from_pretrained( - ckpt_path, - **model_kwargs, - ) + import contextlib + import io + import logging + + # Suppress verbose output from processor/tokenizer loading + transformers_log_level = logging.getLogger("transformers").level + logging.getLogger("transformers").setLevel(logging.ERROR) + + with contextlib.redirect_stdout(io.StringIO()): + processor = AutoProcessor.from_pretrained( + ckpt_path, + **model_kwargs, + ) + + # Restore logging + logging.getLogger("transformers").setLevel(transformers_log_level) + print(f"Loaded AutoProcessor for model type: {model_type}") return processor except Exception as e: @@ -330,12 +367,26 @@ def get_model( # Load config once and handle VL model detection try: hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs) + + # Check specifically for Nemotron-Parse + architectures = getattr(hf_config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + if is_nemotron_vl(hf_config): - print( - "Detected Nemotron VL model from config. " - "Disabling automatic device mapping for compatibility." - ) - device_map = None + if is_nemotron_parse: + # Nemotron-Parse works fine with device_map="auto" + # Keep device_map="auto" to ensure proper device placement + print( + "Detected Nemotron-Parse model from config. " + "Using automatic device mapping." 
+ ) + else: + # For other Nemotron VL models, disable device_map for compatibility + print( + "Detected Nemotron VL model from config. " + "Disabling automatic device mapping for compatibility." + ) + device_map = None except Exception as e: print(f"Error: Could not load config from {ckpt_path}: {e}") raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e @@ -433,6 +484,21 @@ def get_model( print(f"Moving model to {device} device...") model = model.to(device) + # For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device + # The RADIO encoder has buffers that might not be properly moved even with device_map="auto" + # This is because custom RADIO modules might not fully support accelerate's device_map + if device != "cpu" and hasattr(model, "encoder"): + # Check if encoder has any buffers on CPU + cpu_buffers = [] + for name, buffer in model.encoder.named_buffers(): + if buffer.device.type == "cpu": + cpu_buffers.append(name) + + if cpu_buffers: + print(f"Found {len(cpu_buffers)} encoder buffers on CPU. Moving encoder to {device}...") + model.encoder = model.encoder.to(device) + print(f"Encoder moved to {device}") + if device == "cuda" and not is_model_on_gpu(model): print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM") diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b583fe48b..7e0c0316e 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -64,6 +64,7 @@ ) from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor from modelopt.torch.utils.memory_monitor import launch_memory_monitor +from modelopt.torch.utils.nemotron_vlm_dataset_utils import get_nemotron_vlm_dataset_dataloader from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader @@ -173,9 +174,50 @@ def make_calib_dataloader( tokenizer: PreTrainedTokenizerBase | None, device: torch.device, model_type: str | None, + full_model: torch.nn.Module | None = None, ) -> tuple[DataLoader, str | None]: calib_dataloader = None first_text_speech_dataset = None + + # Check if this is Nemotron-Parse - use image-text data for better calibration + if full_model is not None: + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + if is_nemotron_parse and processor is not None: + print( + "✓ Detected Nemotron-Parse model. Using image-text dataset for calibration " + "to provide realistic visual embeddings to the decoder." + ) + + # Override dataset to use image-text dataset if not specified + supported_datasets = ["nemotron_vlm_v2", "chartqa", "scienceqa"] + if not args.dataset or args.dataset[0] not in supported_datasets: + print( + f"[INFO] Dataset '{args.dataset}' is not a supported image-text dataset. " + f"Automatically using 'nemotron_vlm_v2' for Nemotron-Parse calibration." 
+ ) + dataset_to_use = "nemotron_vlm_v2" + else: + dataset_to_use = args.dataset[0] + + # Nemotron-Parse needs single dataset for now + if len(args.calib_size) > 1: + print(f"[INFO] Using first calib_size value: {args.calib_size[0]}") + calib_size_to_use = args.calib_size[0] + else: + calib_size_to_use = args.calib_size[0] if args.calib_size else 512 + + calib_dataloader = get_nemotron_vlm_dataset_dataloader( + dataset_name=dataset_to_use, + processor=processor, + batch_size=args.batch_size, + num_samples=calib_size_to_use, + device=device, # Move data to model's device + ) + return calib_dataloader, first_text_speech_dataset + if model_type == "mllama": assert processor is not None and isinstance(processor, MllamaImageProcessor), ( "The MllamaImageProcessor must be set." @@ -377,18 +419,35 @@ def load_model(args: argparse.Namespace): trust_remote_code=args.trust_remote_code, ) else: + # Check if this is a Nemotron VL model that needs a processor + # Do this BEFORE setting default datasets so we can use image-text data for Nemotron-Parse + is_nemotron_vl_model = is_nemotron_vl(full_model) + + # Check specifically for Nemotron-Parse to set appropriate dataset defaults + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + if args.dataset is None: - args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] - warnings.warn( - "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." - ) + if is_nemotron_parse: + # For Nemotron-Parse, default to Nemotron VLM Dataset v2 + args.dataset = ["nemotron_vlm_v2"] + print( + "No dataset specified. Defaulting to 'nemotron_vlm_v2' for Nemotron-Parse " + "(NVIDIA's image-text dataset for better calibration)." + ) + else: + # For other models, use text-only datasets + args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] + warnings.warn( + "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." + ) + # Adjust calib_size to match dataset length by extending or truncating as needed args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] - # Check if this is a Nemotron VL model that needs a processor - is_nemotron_vl_model = is_nemotron_vl(full_model) if is_nemotron_vl_model: # Load processor for Nemotron VL models (like Nemotron-Parse) processor = get_processor( @@ -404,26 +463,41 @@ def load_model(args: argparse.Namespace): # Left padding usually provides better calibration result. tokenizer.padding_side = "left" - # We only quantize the language model for VLMs other than the type supported above. - language_model_lineage = get_language_model_from_vl(full_model) - if language_model_lineage is not None: - language_model = language_model_lineage.pop(-1) - ancestors = language_model_lineage - # Apply disabled quant to all modules that are not part of language_model so we can exclude them during - # HF export. 
- disabled_quant_cfg = { - "quant_cfg": {"default": {"enable": False}}, - "algorithm": "max", - } - - memo = set(ancestors) | {language_model} - for ancestor in ancestors: - for _, module in ancestor.named_children(): - if module not in memo: - mtq.quantize(module, disabled_quant_cfg, forward_loop=None) - memo.add(module) - - model_type = get_model_type(language_model) + # Check if this is Nemotron-Parse + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + # For Nemotron-Parse, DON'T extract the decoder + # We want to calibrate the full model so the decoder sees realistic visual embeddings + # The vision encoder won't be quantized (disabled via quant_cfg in mono_quantize) + if is_nemotron_parse: + print( + "Nemotron-Parse detected: Keeping full encoder-decoder model for calibration " + "with image-text data. Vision encoder will be disabled from quantization." + ) + # language_model = full_model (already set above) + else: + # For other VLMs, extract the language model for quantization + language_model_lineage = get_language_model_from_vl(full_model) + if language_model_lineage is not None: + language_model = language_model_lineage.pop(-1) + ancestors = language_model_lineage + # Apply disabled quant to all modules that are not part of language_model so we can exclude them during + # HF export. + disabled_quant_cfg = { + "quant_cfg": {"default": {"enable": False}}, + "algorithm": "max", + } + + memo = set(ancestors) | {language_model} + for ancestor in ancestors: + for _, module in ancestor.named_children(): + if module not in memo: + mtq.quantize(module, disabled_quant_cfg, forward_loop=None) + memo.add(module) + + model_type = get_model_type(language_model) if model_type == "phi4mm": warnings.warn("Please set the default input_mode to InputMode.LANGUAGE before quantizing.") @@ -494,14 +568,23 @@ def mono_quantize( "Consider reducing calib_size to reduce calibration time.\n####\n" ) + # Check if this is Nemotron-Parse + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + original_forward = None # Track original forward method if we wrap it + # For Nemotron VL models, disable quantization of vision components if is_nemotron_vl_model: print("Disabling quantization for vision components in Nemotron VL model") quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} quant_cfg["quant_cfg"]["*image*"] = {"enable": False} - # Also disable radio model components specifically + # Also disable radio model components specifically (for Nemotron-Parse) quant_cfg["quant_cfg"]["*radio*"] = {"enable": False} quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + quant_cfg["quant_cfg"]["*encoder*"] = {"enable": False} # Disable encoder + quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False} # Nemotron-Parse specific + print("Quantization will only be applied to the decoder (text generation) component") if not model_is_already_quantized or calibration_only: if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only": @@ -513,9 +596,25 @@ def mono_quantize( if not use_calibration: warnings.warn("Dynamic quantization. 
Calibration skipped.") - calibrate_loop = ( - create_forward_loop(dataloader=calib_dataloader) if use_calibration else None - ) + + # Create calibration loop + if use_calibration: + if is_nemotron_parse: + # For Nemotron-Parse, wrap the model to force use_cache=False + print("Wrapping Nemotron-Parse model for calibration (use_cache=False)") + original_forward = language_model.forward + + def wrapped_forward(*args, **kwargs): + kwargs["use_cache"] = False + return original_forward(*args, **kwargs) + + # Temporarily replace forward method + language_model.forward = wrapped_forward + calibrate_loop = create_forward_loop(dataloader=calib_dataloader) + else: + calibrate_loop = create_forward_loop(dataloader=calib_dataloader) + else: + calibrate_loop = None if calibration_only: language_model = mtq.calibrate( @@ -524,8 +623,15 @@ def mono_quantize( else: language_model = mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop) - # For VL models, update full_model to use the quantized language model - if is_nemotron_vl_model: + # Restore original forward method if we wrapped it for Nemotron-Parse + if is_nemotron_parse and original_forward is not None: + print("Restoring original forward method after calibration") + language_model.forward = original_forward + original_forward = None + + # For VL models (except Nemotron-Parse), update full_model to use the quantized language model + # For Nemotron-Parse, language_model IS full_model, so no update needed + if is_nemotron_vl_model and language_model is not full_model: language_model_lineage = get_language_model_from_vl(full_model) if language_model_lineage is not None: print("Updating full_model with quantized language_model...") @@ -828,38 +934,12 @@ def quantize_main( print(f"Use calib batch_size {args.batch_size}") calib_dataloader, first_text_speech_dataset = make_calib_dataloader( - args, language_model, processor, tokenizer, device, model_type + args, language_model, processor, tokenizer, device, model_type, full_model ) # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) - # For Nemotron-Parse, wrap the text-only dataloader to add dummy images - # Nemotron-Parse is an encoder-decoder model that requires pixel_values - if is_nemotron_vl_model and processor is not None: - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - - if is_nemotron_parse: - # Check if we're quantizing just the decoder or the full model - decoder_only = language_model is not full_model - - if decoder_only: - print( - "Calibration will use text-only inputs for Nemotron-Parse decoder. " - "Vision encoder is excluded from quantization." - ) - else: - print( - "Wrapping calibration dataloader for Nemotron-Parse to add dummy images. " - "Nemotron-Parse requires pixel_values for full model calibration." 
- ) - - calib_dataloader = create_nemotron_parse_calib_wrapper( - calib_dataloader, processor, device, decoder_only=decoder_only - ) - preview_input_ids, generated_ids_before_ptq = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index eaefcbe9d..3c8dde154 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -172,7 +172,11 @@ def _output_hook(module, input, output): print( f"Running optimization on language model with fake_input shape: {fake_input.shape}" ) - language_model(fake_input) + # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors + if is_nemotron_parse: + language_model(fake_input, use_cache=False) + else: + language_model(fake_input) else: raise ValueError( f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " From e252fb27ee88d04f7c20c87a35d7e46d95e345d8 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 20 Jan 2026 17:42:46 -0800 Subject: [PATCH 5/6] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 3aaae3556..a244fe862 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -376,10 +376,7 @@ def get_model( if is_nemotron_parse: # Nemotron-Parse works fine with device_map="auto" # Keep device_map="auto" to ensure proper device placement - print( - "Detected Nemotron-Parse model from config. " - "Using automatic device mapping." - ) + print("Detected Nemotron-Parse model from config. Using automatic device mapping.") else: # For other Nemotron VL models, disable device_map for compatibility print( From 64fa1fa250e4d5ef26266802ce1ea72a70ccd73e Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 21 Jan 2026 01:17:41 -0800 Subject: [PATCH 6/6] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/hf_ptq.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 7e0c0316e..802b658f2 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -602,11 +602,14 @@ def mono_quantize( if is_nemotron_parse: # For Nemotron-Parse, wrap the model to force use_cache=False print("Wrapping Nemotron-Parse model for calibration (use_cache=False)") - original_forward = language_model.forward + # Store original forward before wrapping + _original_forward = language_model.forward + original_forward = _original_forward # Capture in outer scope def wrapped_forward(*args, **kwargs): kwargs["use_cache"] = False - return original_forward(*args, **kwargs) + # Call the captured forward method + return _original_forward(*args, **kwargs) # Temporarily replace forward method language_model.forward = wrapped_forward