From cc0a12af04dd438199e17688da5aaf48ed28de32 Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Fri, 5 Dec 2025 08:55:20 +0000
Subject: [PATCH 1/5] add kv quant example

Signed-off-by: Mengni Wang
---
 examples/autoround/README.md         | 41 +++++++++++++++++++++++++---
 examples/autoround/llama3_example.py | 40 +++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index 3abdd7d17..cfa76bca9 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -70,15 +70,39 @@ ds = get_dataset(
 ### 3) Apply Quantization
 
 With the dataset ready, we will now apply AutoRound quantization to the model.
+Add `--fp8_kv` when running the script if you want to quantize the kv cache.
 
 ```python
 from llmcompressor import oneshot
-from llmcompressor.modifiers.autoround import AutoRoundModifier
 
 # Configure the quantization algorithm to run.
-recipe = AutoRoundModifier(
-    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200
-)
+if args.fp8_kv:
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: 8
+                    type: float
+                    strategy: tensor
+                    dynamic: false
+                    symmetric: true
+            AutoRoundModifier:
+                targets: [Linear]
+                scheme: W4A16
+                ignore: [lm_head]
+                iters: 200
+    """
+else:
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            AutoRoundModifier:
+                targets: [Linear]
+                scheme: W4A16
+                ignore: [lm_head]
+                iters: 200
+    """
 
 # Apply quantization.
 oneshot(
@@ -116,6 +140,7 @@ Run the following to test accuracy on GSM-8K:
 
 ```bash
 lm_eval --model vllm \
+  # use pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true,kv_cache_dtype=fp8 if kv cache is quantized
   --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true \
   --tasks gsm8k \
   --num_fewshot 5 \
@@ -126,10 +151,18 @@ lm_eval --model vllm \
 We can see the resulting scores look good!
 
 ```bash
+w/o kv cache quantization:
 | Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
 | ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |
 | gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.737 | ± | 0.0139 |
 | | | strict-match | 5 | exact_match | ↑ | 0.736 | ± | 0.0139 |
+
+w/ kv cache quantizzation:
+
+| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
+| ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |
+| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.740 | ± | 0.0139 |
+| | | strict-match | 5 | exact_match | ↑ | 0.742 | ± | 0.0138 |
 ```
 
 > Note: quantized model accuracy may vary slightly due to nondeterminism.
diff --git a/examples/autoround/llama3_example.py b/examples/autoround/llama3_example.py
index 9843073bd..c445781b2 100644
--- a/examples/autoround/llama3_example.py
+++ b/examples/autoround/llama3_example.py
@@ -1,10 +1,15 @@
+import argparse
 from auto_round.calib_dataset import get_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.autoround import AutoRoundModifier
 from llmcompressor.utils import dispatch_for_generation
 
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--fp8_kv", action="store_true")
+args = parser.parse_args()
+
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
@@ -21,13 +26,36 @@
     nsamples=NUM_CALIBRATION_SAMPLES,
 )
 
-
 # Configure the quantization algorithm to run.
 # * quantize the weights to 4 bit with AutoRound with a group size 128
-recipe = AutoRoundModifier(
-    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200
-)
-
+# * quantize the kv cache to fp8
+if args.fp8_kv:
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: 8
+                    type: float
+                    strategy: tensor
+                    dynamic: false
+                    symmetric: true
+            AutoRoundModifier:
+                targets: [Linear]
+                scheme: W4A16
+                ignore: [lm_head]
+                iters: 200
+    """
+else:
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            AutoRoundModifier:
+                targets: [Linear]
+                scheme: W4A16
+                ignore: [lm_head]
+                iters: 200
+    """
 
 # Apply algorithms.
 oneshot(

From bc591df8ed1a527cac2ca9ea7dda8c178f6beb18 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Fri, 5 Dec 2025 17:10:45 +0800
Subject: [PATCH 2/5] Update examples/autoround/README.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Wang, Mengni
---
 examples/autoround/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index cfa76bca9..153b1bfcd 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -157,7 +157,7 @@ w/o kv cache quantization:
 | gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.737 | ± | 0.0139 |
 | | | strict-match | 5 | exact_match | ↑ | 0.736 | ± | 0.0139 |
 
-w/ kv cache quantizzation:
+w/ kv cache quantization:
 
 | Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
 | ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |

From 322933977d98ac36ce25c2cc42cc862f2ec92623 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 8 Dec 2025 09:48:36 +0800
Subject: [PATCH 3/5] Update examples/autoround/README.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Wang, Mengni
---
 examples/autoround/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index 153b1bfcd..b119958de 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -150,7 +150,7 @@ lm_eval --model vllm \
 
 We can see the resulting scores look good!
 
-```bash
+```text
 w/o kv cache quantization:
 | Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
 | ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |

From 411653a4a636781cce7148a46e1212b8068801ec Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 8 Dec 2025 09:55:14 +0800
Subject: [PATCH 4/5] Update README.md

Signed-off-by: Wang, Mengni
---
 examples/autoround/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index b119958de..e667ff7ad 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -140,7 +140,7 @@ Run the following to test accuracy on GSM-8K:
 
 ```bash
 lm_eval --model vllm \
-  # use pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true,kv_cache_dtype=fp8 if kv cache is quantized
+  # If KV cache is quantized, add 'kv_cache_dtype=fp8' to the --model_args below.
   --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true \
   --tasks gsm8k \
   --num_fewshot 5 \

From 674fe1f0ff48c6a23fb5bd6e2453792856e37810 Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Tue, 23 Dec 2025 12:52:50 +0000
Subject: [PATCH 5/5] update example

Signed-off-by: Mengni Wang
---
 examples/autoround/README.md                 | 42 ++----------
 examples/autoround/llama3_example.py         | 37 +---------
 .../quantization_kv_cache/llama3_example.py  | 68 +++++++++++++++++++
 3 files changed, 76 insertions(+), 71 deletions(-)
 create mode 100644 examples/autoround/quantization_kv_cache/llama3_example.py

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index e667ff7ad..b5f1ed738 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -70,39 +70,15 @@ ds = get_dataset(
 ### 3) Apply Quantization
 
 With the dataset ready, we will now apply AutoRound quantization to the model.
-Add `--fp8_kv` when running the script if you want to quantize the kv cache.
 
 ```python
 from llmcompressor import oneshot
+from llmcompressor.modifiers.autoround import AutoRoundModifier
 
 # Configure the quantization algorithm to run.
-if args.fp8_kv:
-    recipe = """
-    quant_stage:
-        quant_modifiers:
-            QuantizationModifier:
-                kv_cache_scheme:
-                    num_bits: 8
-                    type: float
-                    strategy: tensor
-                    dynamic: false
-                    symmetric: true
-            AutoRoundModifier:
-                targets: [Linear]
-                scheme: W4A16
-                ignore: [lm_head]
-                iters: 200
-    """
-else:
-    recipe = """
-    quant_stage:
-        quant_modifiers:
-            AutoRoundModifier:
-                targets: [Linear]
-                scheme: W4A16
-                ignore: [lm_head]
-                iters: 200
-    """
+recipe = AutoRoundModifier(
+    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200
+)
 
 # Apply quantization.
 oneshot(
@@ -115,7 +91,6 @@ oneshot(
     shuffle_calibration_samples=False,
 )
 
-
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
@@ -140,7 +115,6 @@ Run the following to test accuracy on GSM-8K:
 
 ```bash
 lm_eval --model vllm \
-  # If KV cache is quantized, add 'kv_cache_dtype=fp8' to the --model_args below.
   --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true \
   --tasks gsm8k \
   --num_fewshot 5 \
@@ -151,18 +125,10 @@ lm_eval --model vllm \
 We can see the resulting scores look good!
 
 ```text
-w/o kv cache quantization:
 | Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
 | ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |
 | gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.737 | ± | 0.0139 |
 | | | strict-match | 5 | exact_match | ↑ | 0.736 | ± | 0.0139 |
-
-w/ kv cache quantization:
-
-| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
-| ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |
-| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.740 | ± | 0.0139 |
-| | | strict-match | 5 | exact_match | ↑ | 0.742 | ± | 0.0138 |
 ```
 
 > Note: quantized model accuracy may vary slightly due to nondeterminism.
diff --git a/examples/autoround/llama3_example.py b/examples/autoround/llama3_example.py
index c445781b2..d2a2f3bec 100644
--- a/examples/autoround/llama3_example.py
+++ b/examples/autoround/llama3_example.py
@@ -1,4 +1,3 @@
-import argparse
 from auto_round.calib_dataset import get_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -6,10 +5,7 @@
+from llmcompressor.modifiers.autoround import AutoRoundModifier
 from llmcompressor.utils import dispatch_for_generation
 
 
-parser = argparse.ArgumentParser()
-parser.add_argument("--fp8_kv", action="store_true")
-args = parser.parse_args()
-
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
@@ -28,34 +24,9 @@
 
 # Configure the quantization algorithm to run.
 # * quantize the weights to 4 bit with AutoRound with a group size 128
-# * quantize the kv cache to fp8
-if args.fp8_kv:
-    recipe = """
-    quant_stage:
-        quant_modifiers:
-            QuantizationModifier:
-                kv_cache_scheme:
-                    num_bits: 8
-                    type: float
-                    strategy: tensor
-                    dynamic: false
-                    symmetric: true
-            AutoRoundModifier:
-                targets: [Linear]
-                scheme: W4A16
-                ignore: [lm_head]
-                iters: 200
-    """
-else:
-    recipe = """
-    quant_stage:
-        quant_modifiers:
-            AutoRoundModifier:
-                targets: [Linear]
-                scheme: W4A16
-                ignore: [lm_head]
-                iters: 200
-    """
+recipe = AutoRoundModifier(
+    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200
+)
 
 # Apply algorithms.
 oneshot(
diff --git a/examples/autoround/quantization_kv_cache/llama3_example.py b/examples/autoround/quantization_kv_cache/llama3_example.py
new file mode 100644
index 000000000..b47b3df61
--- /dev/null
+++ b/examples/autoround/quantization_kv_cache/llama3_example.py
@@ -0,0 +1,68 @@
+from auto_round.calib_dataset import get_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.utils import dispatch_for_generation
+
+
+# Select model and load it.
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Select calibration dataset.
+NUM_CALIBRATION_SAMPLES = 128
+MAX_SEQUENCE_LENGTH = 2048
+# Get aligned calibration dataset.
+
+ds = get_dataset(
+    tokenizer=tokenizer,
+    seqlen=MAX_SEQUENCE_LENGTH,
+    nsamples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Configure the quantization algorithm to run.
+# * quantize the weights to 4 bit with AutoRound with a group size 128
+# * quantize the kv cache to fp8
+recipe = """
+quant_stage:
+    quant_modifiers:
+        QuantizationModifier:
+            kv_cache_scheme:
+                num_bits: 8
+                type: float
+                strategy: tensor
+                dynamic: false
+                symmetric: true
+        AutoRoundModifier:
+            targets: [Linear]
+            scheme: W4A16
+            ignore: [lm_head]
+            iters: 200
+"""
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    # disable shuffling to get slightly better mmlu score
+    shuffle_calibration_samples=False,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to(model.device) for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
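
For reference, the KV cache example above writes its compressed model to `Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound`, and earlier revisions of this series evaluated such a checkpoint by passing vLLM's `kv_cache_dtype=fp8` through lm_eval. A minimal sketch of that invocation, assuming the default save path and the GSM-8K settings shown in the README (illustrative only, not part of the patches):

```bash
# Sketch: kv_cache_dtype=fp8 asks vLLM to keep the KV cache in FP8,
# matching the kv_cache_scheme calibrated by the example script above.
lm_eval --model vllm \
  --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true,kv_cache_dtype=fp8 \
  --tasks gsm8k \
  --num_fewshot 5
```

Leaving out `kv_cache_dtype=fp8` evaluates the same checkpoint with the KV cache kept in the model's original dtype.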