diff --git a/examples/autoround/README.md b/examples/autoround/README.md
index 3abdd7d17..b5f1ed738 100644
--- a/examples/autoround/README.md
+++ b/examples/autoround/README.md
@@ -91,7 +91,6 @@ oneshot(
     shuffle_calibration_samples=False,
 )
 
-
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
@@ -125,6 +124,6 @@ lm_eval --model vllm \
 
 We can see the resulting scores look good!
 
-```bash
+```text
 | Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
 | ----- | ------: | ---------------- | -----: | ----------- | --- | ----: | --- | -----: |
diff --git a/examples/autoround/llama3_example.py b/examples/autoround/llama3_example.py
index 9843073bd..d2a2f3bec 100644
--- a/examples/autoround/llama3_example.py
+++ b/examples/autoround/llama3_example.py
@@ -2,9 +2,10 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.autoround import AutoRoundModifier
 from llmcompressor.utils import dispatch_for_generation
 
+
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
@@ -21,14 +22,12 @@
     nsamples=NUM_CALIBRATION_SAMPLES,
 )
 
-
 # Configure the quantization algorithm to run.
 # * quantize the weights to 4 bit with AutoRound with a group size 128
 recipe = AutoRoundModifier(
     targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200
 )
 
-
 # Apply algorithms.
 oneshot(
     model=model,
diff --git a/examples/autoround/quantization_kv_cache/llama3_example.py b/examples/autoround/quantization_kv_cache/llama3_example.py
new file mode 100644
index 000000000..b47b3df61
--- /dev/null
+++ b/examples/autoround/quantization_kv_cache/llama3_example.py
@@ -0,0 +1,68 @@
+from auto_round.calib_dataset import get_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.utils import dispatch_for_generation
+
+
+# Select model and load it.
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Select calibration dataset.
+NUM_CALIBRATION_SAMPLES = 128
+MAX_SEQUENCE_LENGTH = 2048
+# Get aligned calibration dataset.
+
+ds = get_dataset(
+    tokenizer=tokenizer,
+    seqlen=MAX_SEQUENCE_LENGTH,
+    nsamples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Configure the quantization algorithm to run.
+# * quantize the weights to 4 bit with AutoRound with a group size 128
+# * quantize the kv cache to fp8
+recipe = """
+quant_stage:
+  quant_modifiers:
+    QuantizationModifier:
+      kv_cache_scheme:
+        num_bits: 8
+        type: float
+        strategy: tensor
+        dynamic: false
+        symmetric: true
+    AutoRoundModifier:
+      targets: [Linear]
+      scheme: W4A16
+      ignore: [lm_head]
+      iters: 200
+"""
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    # disable shuffling to get slightly better mmlu score
+    shuffle_calibration_samples=False,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n") +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to(model.device) for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR)