Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/multimodal_vision/gemma3_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
splits=DATASET_SPLIT,
recipe=recipe,
batch_size=BATCH_SIZE,
shuffle_calibration_samples=False,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
Expand Down
2 changes: 1 addition & 1 deletion src/llmcompressor/args/dataset_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ class DatasetArguments(CustomDatasetArguments):
metadata={"help": "Number of samples to use for one-shot calibration"},
)
shuffle_calibration_samples: bool = field(
default=False,
default=True,
metadata={
"help": "whether to shuffle the dataset before selecting calibration data"
},
Expand Down
30 changes: 29 additions & 1 deletion src/llmcompressor/entrypoints/oneshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def oneshot(
batch_size: int = 1,
data_collator: str | Callable = "truncation",
num_calibration_samples: int = 512,
shuffle_calibration_samples: bool = False,
shuffle_calibration_samples: bool = True,
max_seq_length: int = 384,
pad_to_max_length: bool = True,
text_column: str = "text",
Expand All @@ -263,6 +263,23 @@ def oneshot(
preprocessing_num_workers: int | None = None,
min_tokens_per_module: float | None = None,
moe_calibrate_all_experts: bool = True,
pipeline: str | None = "independent",
tracing_ignore: list[str] = [
"_update_causal_mask",
"create_causal_mask",
"_update_mamba_mask",
"make_causal_mask",
"get_causal_mask",
"mask_interface",
"mask_function",
"_prepare_4d_causal_attention_mask",
"_prepare_fsmt_decoder_inputs",
"_prepare_4d_causal_attention_mask_with_cache_position",
"_update_linear_attn_mask",
"project_per_layer_inputs",
],
sequential_targets: list[str] | None = None,
sequential_offload_device: str = "cpu",
quantization_aware_calibration: bool = True,
# Miscellaneous arguments
output_dir: str | None = None,
Expand Down Expand Up @@ -335,6 +352,17 @@ def oneshot(
model calibration. When True, all experts will see all tokens during
calibration, ensuring proper quantization statistics. When False, only
routed experts will be used. Only relevant for MoE models. Default is True.
:param pipeline: Calibration pipeline used to calibrate the model. Options:
    ['basic', 'datafree', 'sequential', 'independent']
:param tracing_ignore: List of functions to ignore during tracing, either
{module}.{method_name} or {function_name}
:param sequential_targets: List of layer targets for the sequential pipeline.
This is typically a single DecoderLayer. Not specifying this argument will
cause the sequential pipeline to default to using the `no_split_params`
specified by the HF model definition
:param sequential_offload_device: Device used to offload intermediate activations
between sequential layers. It is recommended to use `cuda:1` if using more
than one gpu. Default is cpu.
:param quantization_aware_calibration: Whether to enable quantization-aware
calibration in the sequential pipeline. When True, quantization is applied
during forward pass in calibration. When False, quantization is disabled
Expand Down