Update lm-eval set-up to address regression (#2142)

dsikka · brian-dellabetta · web-flow · commit 24278256ec2c · 2025-12-19T05:19:32.000-05:00
SUMMARY:
- It seems that switching the data collator from truncation to the default
collator and shuffling the calibration samples addresses the regression
we're seeing in lm-eval
- Given that the recovery values in these tests were determined using
these settings, I think this is how we should evaluate our lm-eval tests
for the time being

---------

Signed-off-by: Dipika Sikka <ds3822@columbia.edu>
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
@@ -1,8 +1,10 @@
+from typing import Callable
+
 import torch
 import transformers
 from datasets import load_dataset
 from loguru import logger
-from transformers import AutoProcessor
+from transformers import AutoProcessor, DefaultDataCollator
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
@@ -34,9 +36,12 @@ def run_oneshot_for_e2e_testing(
     dataset_config: str,
     scheme: str,
     quant_type: str,
+    shuffle_calibration_samples: bool = True,
+    data_collator: str | Callable = DefaultDataCollator(),
 ):
     # Load model.
     oneshot_kwargs = {}
+    oneshot_kwargs["data_collator"] = data_collator
 
     loaded_model = load_model(model=model, model_class=model_class)
     processor = AutoProcessor.from_pretrained(model)
@@ -74,6 +79,7 @@ def data_collator(batch):
             oneshot_kwargs["data_collator"] = data_collator
 
     oneshot_kwargs["model"] = loaded_model
+    oneshot_kwargs["shuffle_calibration_samples"] = shuffle_calibration_samples
     if recipe:
         oneshot_kwargs["recipe"] = recipe
     else: