@@ -253,7 +253,7 @@ def oneshot(
253253 batch_size : int = 1 ,
254254 data_collator : str | Callable = "truncation" ,
255255 num_calibration_samples : int = 512 ,
256- shuffle_calibration_samples : bool = False ,
256+ shuffle_calibration_samples : bool = True ,
257257 max_seq_length : int = 384 ,
258258 pad_to_max_length : bool = True ,
259259 text_column : str = "text" ,
@@ -263,6 +263,23 @@ def oneshot(
263263 preprocessing_num_workers : int | None = None ,
264264 min_tokens_per_module : float | None = None ,
265265 moe_calibrate_all_experts : bool = True ,
266+ pipeline : str | None = "independent" ,
267+ tracing_ignore : list [str ] = [
268+ "_update_causal_mask" ,
269+ "create_causal_mask" ,
270+ "_update_mamba_mask" ,
271+ "make_causal_mask" ,
272+ "get_causal_mask" ,
273+ "mask_interface" ,
274+ "mask_function" ,
275+ "_prepare_4d_causal_attention_mask" ,
276+ "_prepare_fsmt_decoder_inputs" ,
277+ "_prepare_4d_causal_attention_mask_with_cache_position" ,
278+ "_update_linear_attn_mask" ,
279+ "project_per_layer_inputs" ,
280+ ],
281+ sequential_targets : list [str ] | None = None ,
282+ sequential_offload_device : str = "cpu" ,
266283 quantization_aware_calibration : bool = True ,
267284 # Miscellaneous arguments
268285 output_dir : str | None = None ,
@@ -335,6 +352,17 @@ def oneshot(
335352 model calibration. When True, all experts will see all tokens during
336353 calibration, ensuring proper quantization statistics. When False, only
337354 routed experts will be used. Only relevant for MoE models. Default is True.
355+ :param pipeline: Calibration pipeline used to calibrate the model. Options:
356+ ['basic', 'datafree', 'sequential', 'independent']
357+ :param tracing_ignore: List of functions to ignore during tracing, either
358+ {module}.{method_name} or {function_name}
359+ :param sequential_targets: List of layer targets for the sequential pipeline.
360+ This is typically a single DecoderLayer. Not specifying this argument will
361+ cause the sequential pipeline to default to using the `no_split_params`
362+ specified by the HF model definition
363+ :param sequential_offload_device: Device used to offload intermediate activations
364+ between sequential layers. It is recommended to use `cuda:1` if using more
365+ than one GPU. Default is "cpu".
338366 :param quantization_aware_calibration: Whether to enable quantization-aware
339367 calibration in the sequential pipeline. When True, quantization is applied
340368 during forward pass in calibration. When False, quantization is disabled
0 commit comments