From cc0a12af04dd438199e17688da5aaf48ed28de32 Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Fri, 5 Dec 2025 08:55:20 +0000
Subject: [PATCH 1/5] add kv quant example

Signed-off-by: Mengni Wang
---
 examples/autoround/README.md         | 41 +++++++++++++++++++++++++---
 examples/autoround/llama3_example.py | 40 +++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index 3abdd7d17..cfa76bca9 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -70,15 +70,39 @@ ds = get_dataset(
 ### 3) Apply Quantization
 
 With the dataset ready, we will now apply AutoRound quantization to the model.
+Add `--fp8_kv` when running the script if you want to quantize the kv cache.
 
 ```python
 from llmcompressor import oneshot
-from llmcompressor.modifiers.autoround import AutoRoundModifier
 
 # Configure the quantization algorithm to run.
-recipe = AutoRoundModifier(
-    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200
-)
+if args.fp8_kv:
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: 8
+                    type: float
+                    strategy: tensor
+                    dynamic: false
+                    symmetric: true
+            AutoRoundModifier:
+                targets: [Linear]
+                scheme: W4A16
+                ignore: [lm_head]
+                iters: 200
+    """
+else:
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            AutoRoundModifier:
+                targets: [Linear]
+                scheme: W4A16
+                ignore: [lm_head]
+                iters: 200
+    """
 
 # Apply quantization.
 oneshot(
@@ -116,6 +140,7 @@ Run the following to test accuracy on GSM-8K:
 
 ```bash
 lm_eval --model vllm \
+  # use pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true,kv_cache_dtype=fp8 if kv cache is quantized
   --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true \
   --tasks gsm8k \
   --num_fewshot 5 \
@@ -126,10 +151,18 @@ lm_eval --model vllm \
 We can see the resulting scores look good!
 
 ```bash
+w/o kv cache quantization:
 | Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
 | ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |
 | gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.737 | ± | 0.0139 |
 | | | strict-match | 5 | exact_match | ↑ | 0.736 | ± | 0.0139 |
+
+w/ kv cache quantizzation:
+
+| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
+| ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |
+| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.740 | ± | 0.0139 |
+| | | strict-match | 5 | exact_match | ↑ | 0.742 | ± | 0.0138 |
 ```
 
 > Note: quantized model accuracy may vary slightly due to nondeterminism.
diff --git a/examples/autoround/llama3_example.py b/examples/autoround/llama3_example.py
index 9843073bd..c445781b2 100644
--- a/examples/autoround/llama3_example.py
+++ b/examples/autoround/llama3_example.py
@@ -1,10 +1,15 @@
+import argparse
 from auto_round.calib_dataset import get_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.autoround import AutoRoundModifier
 from llmcompressor.utils import dispatch_for_generation
 
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--fp8_kv", action="store_true")
+args = parser.parse_args()
+
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
@@ -21,13 +26,36 @@
     nsamples=NUM_CALIBRATION_SAMPLES,
 )
 
-
 # Configure the quantization algorithm to run.
 # * quantize the weights to 4 bit with AutoRound with a group size 128
-recipe = AutoRoundModifier(
-    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200
-)
-
+# * quantize the kv cache to fp8
+if args.fp8_kv:
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: 8
+                    type: float
+                    strategy: tensor
+                    dynamic: false
+                    symmetric: true
+            AutoRoundModifier:
+                targets: [Linear]
+                scheme: W4A16
+                ignore: [lm_head]
+                iters: 200
+    """
+else:
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            AutoRoundModifier:
+                targets: [Linear]
+                scheme: W4A16
+                ignore: [lm_head]
+                iters: 200
+    """
 
 # Apply algorithms.
 oneshot(

From bc591df8ed1a527cac2ca9ea7dda8c178f6beb18 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Fri, 5 Dec 2025 17:10:45 +0800
Subject: [PATCH 2/5] Update examples/autoround/README.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Wang, Mengni
---
 examples/autoround/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index cfa76bca9..153b1bfcd 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -157,7 +157,7 @@ w/o kv cache quantization:
 | gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.737 | ± | 0.0139 |
 | | | strict-match | 5 | exact_match | ↑ | 0.736 | ± | 0.0139 |
 
-w/ kv cache quantizzation:
+w/ kv cache quantization:
 
 | Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
 | ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |

From 322933977d98ac36ce25c2cc42cc862f2ec92623 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 8 Dec 2025 09:48:36 +0800
Subject: [PATCH 3/5] Update examples/autoround/README.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Wang, Mengni
---
 examples/autoround/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index 153b1bfcd..b119958de 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -150,7 +150,7 @@ lm_eval --model vllm \
 
 We can see the resulting scores look good!
 
-```bash
+```text
 w/o kv cache quantization:
 | Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
 | ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |

From 411653a4a636781cce7148a46e1212b8068801ec Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 8 Dec 2025 09:55:14 +0800
Subject: [PATCH 4/5] Update README.md

Signed-off-by: Wang, Mengni
---
 examples/autoround/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index b119958de..e667ff7ad 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -140,7 +140,7 @@ Run the following to test accuracy on GSM-8K:
 
 ```bash
 lm_eval --model vllm \
-  # use pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true,kv_cache_dtype=fp8 if kv cache is quantized
+  # If KV cache is quantized, add 'kv_cache_dtype=fp8' to the --model_args below.
   --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true \
   --tasks gsm8k \
   --num_fewshot 5 \

From 674fe1f0ff48c6a23fb5bd6e2453792856e37810 Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Tue, 23 Dec 2025 12:52:50 +0000
Subject: [PATCH 5/5] update example

Signed-off-by: Mengni Wang
---
 examples/autoround/README.md                 | 42 ++----------
 examples/autoround/llama3_example.py         | 37 +---------
 .../quantization_kv_cache/llama3_example.py  | 68 +++++++++++++++++++
 3 files changed, 76 insertions(+), 71 deletions(-)
 create mode 100644 examples/autoround/quantization_kv_cache/llama3_example.py

diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index e667ff7ad..b5f1ed738 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -70,39 +70,15 @@ ds = get_dataset(
 ### 3) Apply Quantization
 
 With the dataset ready, we will now apply AutoRound quantization to the model.
-Add `--fp8_kv` when running the script if you want to quantize the kv cache.
 
 ```python
 from llmcompressor import oneshot
+from llmcompressor.modifiers.autoround import AutoRoundModifier
 
 # Configure the quantization algorithm to run.
-if args.fp8_kv:
-    recipe = """
-    quant_stage:
-        quant_modifiers:
-            QuantizationModifier:
-                kv_cache_scheme:
-                    num_bits: 8
-                    type: float
-                    strategy: tensor
-                    dynamic: false
-                    symmetric: true
-            AutoRoundModifier:
-                targets: [Linear]
-                scheme: W4A16
-                ignore: [lm_head]
-                iters: 200
-    """
-else:
-    recipe = """
-    quant_stage:
-        quant_modifiers:
-            AutoRoundModifier:
-                targets: [Linear]
-                scheme: W4A16
-                ignore: [lm_head]
-                iters: 200
-    """
+recipe = AutoRoundModifier(
+    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200
+)
 
 # Apply quantization.
 oneshot(
@@ -115,7 +91,6 @@ oneshot(
     shuffle_calibration_samples=False,
 )
 
-
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
@@ -140,7 +115,6 @@ Run the following to test accuracy on GSM-8K:
 
 ```bash
 lm_eval --model vllm \
-  # If KV cache is quantized, add 'kv_cache_dtype=fp8' to the --model_args below.
   --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true \
   --tasks gsm8k \
   --num_fewshot 5 \
@@ -151,18 +125,10 @@ lm_eval --model vllm \
 We can see the resulting scores look good!
 
 ```text
-w/o kv cache quantization:
 | Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
 | ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |
 | gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.737 | ± | 0.0139 |
 | | | strict-match | 5 | exact_match | ↑ | 0.736 | ± | 0.0139 |
-
-w/ kv cache quantization:
-
-| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
-| ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |
-| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.740 | ± | 0.0139 |
-| | | strict-match | 5 | exact_match | ↑ | 0.742 | ± | 0.0138 |
 ```
 
 > Note: quantized model accuracy may vary slightly due to nondeterminism.
diff --git a/examples/autoround/llama3_example.py b/examples/autoround/llama3_example.py
index c445781b2..d2a2f3bec 100644
--- a/examples/autoround/llama3_example.py
+++ b/examples/autoround/llama3_example.py
@@ -1,4 +1,3 @@
-import argparse
 from auto_round.calib_dataset import get_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -6,10 +5,7 @@
+from llmcompressor.modifiers.autoround import AutoRoundModifier
 from llmcompressor.utils import dispatch_for_generation
 
 
-parser = argparse.ArgumentParser()
-parser.add_argument("--fp8_kv", action="store_true")
-args = parser.parse_args()
-
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
@@ -28,34 +24,9 @@
 
 # Configure the quantization algorithm to run.
 # * quantize the weights to 4 bit with AutoRound with a group size 128
-# * quantize the kv cache to fp8
-if args.fp8_kv:
-    recipe = """
-    quant_stage:
-        quant_modifiers:
-            QuantizationModifier:
-                kv_cache_scheme:
-                    num_bits: 8
-                    type: float
-                    strategy: tensor
-                    dynamic: false
-                    symmetric: true
-            AutoRoundModifier:
-                targets: [Linear]
-                scheme: W4A16
-                ignore: [lm_head]
-                iters: 200
-    """
-else:
-    recipe = """
-    quant_stage:
-        quant_modifiers:
-            AutoRoundModifier:
-                targets: [Linear]
-                scheme: W4A16
-                ignore: [lm_head]
-                iters: 200
-    """
+recipe = AutoRoundModifier(
+    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200
+)
 
 # Apply algorithms.
 oneshot(
diff --git a/examples/autoround/quantization_kv_cache/llama3_example.py b/examples/autoround/quantization_kv_cache/llama3_example.py
new file mode 100644
index 000000000..b47b3df61
--- /dev/null
+++ b/examples/autoround/quantization_kv_cache/llama3_example.py
@@ -0,0 +1,68 @@
+from auto_round.calib_dataset import get_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.utils import dispatch_for_generation
+
+
+# Select model and load it.
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Select calibration dataset.
+NUM_CALIBRATION_SAMPLES = 128
+MAX_SEQUENCE_LENGTH = 2048
+# Get aligned calibration dataset.
+
+ds = get_dataset(
+    tokenizer=tokenizer,
+    seqlen=MAX_SEQUENCE_LENGTH,
+    nsamples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Configure the quantization algorithm to run.
+# * quantize the weights to 4 bit with AutoRound with a group size 128
+# * quantize the kv cache to fp8
+recipe = """
+quant_stage:
+    quant_modifiers:
+        QuantizationModifier:
+            kv_cache_scheme:
+                num_bits: 8
+                type: float
+                strategy: tensor
+                dynamic: false
+                symmetric: true
+        AutoRoundModifier:
+            targets: [Linear]
+            scheme: W4A16
+            ignore: [lm_head]
+            iters: 200
+"""
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    # disable shuffling to get slightly better mmlu score
+    shuffle_calibration_samples=False,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+sample = tokenizer("Hello my name is", return_tensors="pt")
+sample = {key: value.to(model.device) for key, value in sample.items()}
+output = model.generate(**sample, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
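
For reference, the KV cache example above writes its compressed model to `Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound`, and earlier revisions of this series evaluated such a checkpoint by passing vLLM's `kv_cache_dtype=fp8` through lm_eval. A minimal sketch of that invocation, assuming the default save path and the GSM-8K settings shown in the README (illustrative only, not part of the patches):

```bash
# Sketch: kv_cache_dtype=fp8 asks vLLM to keep the KV cache in FP8,
# matching the kv_cache_scheme calibrated by the example script above.
lm_eval --model vllm \
  --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128-AutoRound",add_bos_token=true,kv_cache_dtype=fp8 \
  --tasks gsm8k \
  --num_fewshot 5
```

Leaving out `kv_cache_dtype=fp8` evaluates the same checkpoint with the KV cache kept in the model's original dtype.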