From 91551907cbe5c407ce5beeca3914d432045084ec Mon Sep 17 00:00:00 2001
From: "omobayode.fagbohungbe"
Date: Wed, 16 Jul 2025 22:29:08 +0000
Subject: [PATCH 1/3] feat: enabled GPTQv2

Signed-off-by: omobayode.fagbohungbe
---
 examples/GPTQ/README.md | 81 +++++++++++++++++++++++++++--------------
 fms_mo/run_quant.py     | 25 +++++++++----
 fms_mo/training_args.py |  3 +-
 3 files changed, 73 insertions(+), 36 deletions(-)

diff --git a/examples/GPTQ/README.md b/examples/GPTQ/README.md
index f5661a12..d173a7f2 100644
--- a/examples/GPTQ/README.md
+++ b/examples/GPTQ/README.md
@@ -7,6 +7,7 @@ For generative LLMs, very often the bottleneck of inference is no longer the com
 - [FMS Model Optimizer requirements](../../README.md#requirements)
 - `gptqmodel` is needed for this example. Use `pip install gptqmodel` or [install from source](https://github.com/ModelCloud/GPTQModel/tree/main?tab=readme-ov-file)
+  - It is advised to install from source if you plan to use `GPTQv2`
 - Optionally for the evaluation section below, install [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness)
 ```
 pip install lm-eval
 ```
@@ -32,7 +33,7 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
 > - Tokenized data will be saved in `_train` and `_test`
 > - If you have trouble downloading Llama family of models from Hugging Face ([LLama models require access](https://www.llama.com/docs/getting-the-models/hugging-face/)), you can use `ibm-granite/granite-8b-code` instead

-2. **Quantize the model** using the data generated above, the following command will kick off the quantization job (by invoking `gptqmodel` under the hood.) Additional acceptable arguments can be found here in [GPTQArguments](../../fms_mo/training_args.py#L127).
+2. **Quantize the model** using the data generated above. The following command will kick off the `GPTQv1` quantization job (by invoking `gptqmodel` under the hood). Additional acceptable arguments can be found in [GPTQArguments](../../fms_mo/training_args.py#L127).

     ```bash
     python -m fms_mo.run_quant \
         --model_name_or_path  \
         --training_data_path data_train \
         --quant_method gptq \
         --output_dir Meta-Llama-3-8B-GPTQ \
         --bits 4 \
         --group_size 128
     ```

-    The model that can be found in the specified output directory (`Meta-Llama-3-8B-GPTQ` in our case) can be deployed and inferenced via `vLLM`.
+    The quantized model in the specified output directory (`Meta-Llama-3-8B-GPTQ` in our case) can be deployed and served for inference via `vLLM`. To enable `GPTQv2`, set the `quant_method` argument to `gptqv2`.

     > [!NOTE]
     > - In GPTQ, `group_size` is a trade-off between accuracy and speed, but there is an additional constraint that `in_features` of the Linear layer to be quantized needs to be an **integer multiple** of `group_size`, i.e. some models may have to use smaller `group_size` than default.
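+
+    For a quick sanity check that the exported checkpoint loads for inference, a minimal `vLLM` snippet along the following lines can be used. This is an illustrative sketch rather than part of `fms_mo`: it assumes `vllm` is installed, that the model path matches the `output_dir` used above, and the prompt and sampling settings are only placeholders.
+
+    ```python
+    # Illustrative only: load the GPTQ-quantized checkpoint produced above with vLLM.
+    from vllm import LLM, SamplingParams
+
+    llm = LLM(model="Meta-Llama-3-8B-GPTQ")  # vLLM picks up the GPTQ settings stored in the checkpoint
+    sampling = SamplingParams(temperature=0.0, max_tokens=32)
+    outputs = llm.generate(["The capital of France is"], sampling)
+    print(outputs[0].outputs[0].text)
+    ```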
@@ -82,25 +84,33 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
 ## Example Test Results

 - Unquantized Model
--
-|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|------------|--------------|------:|------|-----:|----------|---|-----:|---|-----:|
-| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.7103|± |0.0063|
-| | | |none | 5|perplexity|↓ |3.7915|± |0.0727|
+
+  |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+  |------------|--------------|------:|------|-----:|----------|---|-----:|---|-----:|
+  | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.7103|± |0.0063|
+  | | | |none | 5|perplexity|↓ |3.7915|± |0.0727|

 - Quantized model with the settings showed above (`desc_act` default to False.)
--
-|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
-| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6365 |± |0.0067|
-| | | |none | 5|perplexity|↓ |5.9307 |± |0.1830|
+  - `GPTQv1`
+
+    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6365 |± |0.0067|
+    | | | |none | 5|perplexity|↓ |5.9307 |± |0.1830|
+
+  - `GPTQv2`
+
+    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6817 |± |0.0065|
+    | | | |none | 5|perplexity|↓ |4.3994 |± |0.0995|

 - Quantized model with `desc_act` set to `True` (could improve the model quality, but at the cost of inference speed.)
--
-|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
-| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6193 |± |0.0068|
-| | | |none | 5|perplexity|↓ |5.8879 |± |0.1546|
+  - `GPTQv1`
+
+    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6193 |± |0.0068|
+    | | | |none | 5|perplexity|↓ |5.8879 |± |0.1546|

 > [!NOTE]
 > There is some randomness in generating the model and data, the resulting accuracy may vary ~$\pm$ 0.05.
@@ -108,18 +118,33 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m

 ## Code Walk-through

-1. Command line arguments will be used to create a GPTQ quantization config. Information about the required arguments and their default values can be found [here](../../fms_mo/training_args.py)
+1. Command line arguments will be used to create a GPTQ quantization config. Information about the required arguments and their default values can be found [here](../../fms_mo/training_args.py). Both `GPTQv1` and `GPTQv2` are supported.

-    ```python
-    from gptqmodel import GPTQModel, QuantizeConfig
+    - To use `GPTQv1`, set the parameter `quant_method` to `gptq` in the command line.

-    quantize_config = QuantizeConfig(
-        bits=gptq_args.bits,
-        group_size=gptq_args.group_size,
-        desc_act=gptq_args.desc_act,
-        damp_percent=gptq_args.damp_percent,
-    )
+    ```python
+    from gptqmodel import GPTQModel, QuantizeConfig
+
+    quantize_config = QuantizeConfig(
+        bits=gptq_args.bits,
+        group_size=gptq_args.group_size,
+        desc_act=gptq_args.desc_act,
+        damp_percent=gptq_args.damp_percent,
+    )
+    ```
+    - To use `GPTQv2`, simply set `quant_method` to `gptqv2` in the command line. Under the hood, two additional arguments are added to `QuantizeConfig`, i.e. `v2=True` and `v2_memory_device='cpu'`.
+    ```python
+    from gptqmodel import GPTQModel, QuantizeConfig
+
+    quantize_config = QuantizeConfig(
+        bits=gptq_args.bits,
+        group_size=gptq_args.group_size,
+        desc_act=gptq_args.desc_act,
+        damp_percent=gptq_args.damp_percent,
+        v2=True,
+        v2_memory_device='cpu',
+    )
     ```

 2. Load the pre_trained model with `gptqmodel` class/wrapper. Tokenizer is optional because we already tokenized the data in a previous step.

@@ -158,4 +183,4 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
     tokenizer.save_pretrained(output_dir) # optional
     ```
 > [!NOTE]
-> 1. GPTQ of a 70B model usually takes ~4-10 hours on A100.
+> 1. GPTQ of a 70B model usually takes ~4-10 hours on A100 with `GPTQv1`.
diff --git a/fms_mo/run_quant.py b/fms_mo/run_quant.py
index 06ceecdc..e9062112 100644
--- a/fms_mo/run_quant.py
+++ b/fms_mo/run_quant.py
@@ -88,7 +88,7 @@ def quantize(

     logger.info(f"{fms_mo_args}\n{opt_args.quant_method}\n")

-    if opt_args.quant_method == "gptq":
+    if opt_args.quant_method in ["gptq","gptqv2"]:
         if not available_packages["gptqmodel"]:
             raise ImportError(
                 "Quantization method has been selected as gptq but unable to use external library, "
@@ -138,12 +138,23 @@ def run_gptq(model_args, data_args, opt_args, gptq_args):

     logger = set_log_level(opt_args.log_level, "fms_mo.run_gptq")

-    quantize_config = QuantizeConfig(
-        bits=gptq_args.bits,
-        group_size=gptq_args.group_size,
-        desc_act=gptq_args.desc_act,
-        damp_percent=gptq_args.damp_percent,
-    )
+    if opt_args.quant_method == "gptq":
+        quantize_config = QuantizeConfig(
+            bits=gptq_args.bits,
+            group_size=gptq_args.group_size,
+            desc_act=gptq_args.desc_act,
+            damp_percent=gptq_args.damp_percent,
+        )
+    else:
+        quantize_config = QuantizeConfig(
+            bits=gptq_args.bits,
+            group_size=gptq_args.group_size,
+            desc_act=gptq_args.desc_act,
+            damp_percent=gptq_args.damp_percent,
+            v2=True,
+            v2_memory_device="cpu",
+        )
+
     # Add custom model_type mapping to gptqmodel LUT so GPTQModel can recognize them.
for mtype, cls in custom_gptq_classes.items(): diff --git a/fms_mo/training_args.py b/fms_mo/training_args.py index 661f72bd..ae041d59 100644 --- a/fms_mo/training_args.py +++ b/fms_mo/training_args.py @@ -138,7 +138,7 @@ class OptArguments(TypeChecker): """Dataclass for optimization related arguments.""" quant_method: str = field( - metadata={"choices": ["gptq", "fp8", "dq"], "help": "Quantization technique"} + metadata={"choices": ["gptq", "gptqv2", "fp8", "dq"], "help": "Quantization technique"} ) output_dir: str = field( metadata={ @@ -226,6 +226,7 @@ class GPTQArguments(TypeChecker): cache_examples_on_gpu: bool = True + @dataclass class FP8Arguments(TypeChecker): """Dataclass for FP8 related arguments that will be used by llm-compressor.""" From 3a069a2acfb15772ad3e263c013ee15f6da0a449 Mon Sep 17 00:00:00 2001 From: "omobayode.fagbohungbe" Date: Thu, 17 Jul 2025 11:17:47 +0000 Subject: [PATCH 2/3] fix: implementing edits for lint Signed-off-by: omobayode.fagbohungbe --- fms_mo/run_quant.py | 2 +- fms_mo/training_args.py | 5 +- tutorials/quantization_tutorial.ipynb | 133 +++++++++++++------------- 3 files changed, 72 insertions(+), 68 deletions(-) diff --git a/fms_mo/run_quant.py b/fms_mo/run_quant.py index e9062112..f13ee0bc 100644 --- a/fms_mo/run_quant.py +++ b/fms_mo/run_quant.py @@ -88,7 +88,7 @@ def quantize( logger.info(f"{fms_mo_args}\n{opt_args.quant_method}\n") - if opt_args.quant_method in ["gptq","gptqv2"]: + if opt_args.quant_method in ["gptq", "gptqv2"]: if not available_packages["gptqmodel"]: raise ImportError( "Quantization method has been selected as gptq but unable to use external library, " diff --git a/fms_mo/training_args.py b/fms_mo/training_args.py index ae041d59..9f7da8e0 100644 --- a/fms_mo/training_args.py +++ b/fms_mo/training_args.py @@ -138,7 +138,10 @@ class OptArguments(TypeChecker): """Dataclass for optimization related arguments.""" quant_method: str = field( - metadata={"choices": ["gptq", "gptqv2", "fp8", "dq"], "help": "Quantization technique"} + metadata={ + "choices": ["gptq", "gptqv2", "fp8", "dq"], + "help": "Quantization technique" + } ) output_dir: str = field( metadata={ diff --git a/tutorials/quantization_tutorial.ipynb b/tutorials/quantization_tutorial.ipynb index 387685b2..5354b3d4 100644 --- a/tutorials/quantization_tutorial.ipynb +++ b/tutorials/quantization_tutorial.ipynb @@ -162,7 +162,7 @@ "\n", "# Plotting the histogram.\n", "plt.figure(figsize=(16, 10))\n", - "plt.hist(raw_data, density=True, bins=128, alpha=0.8, label='y')\n", + "plt.hist(raw_data, density=True, bins=128, alpha=0.8, label=\"y\")\n", "#plt.legend(loc='upper right')\n", "plt.xlabel(\"Data\")\n", "plt.ylabel(\"density\")\n", @@ -220,9 +220,9 @@ "isClipped=np.logical_or(raw_data>clip_max, raw_data Date: Thu, 17 Jul 2025 15:49:00 +0000 Subject: [PATCH 3/3] fix: tutorial file restored Signed-off-by: omobayode.fagbohungbe --- tutorials/quantization_tutorial.ipynb | 133 +++++++++++++------------- 1 file changed, 66 insertions(+), 67 deletions(-) diff --git a/tutorials/quantization_tutorial.ipynb b/tutorials/quantization_tutorial.ipynb index 5354b3d4..387685b2 100644 --- a/tutorials/quantization_tutorial.ipynb +++ b/tutorials/quantization_tutorial.ipynb @@ -162,7 +162,7 @@ "\n", "# Plotting the histogram.\n", "plt.figure(figsize=(16, 10))\n", - "plt.hist(raw_data, density=True, bins=128, alpha=0.8, label=\"y\")\n", + "plt.hist(raw_data, density=True, bins=128, alpha=0.8, label='y')\n", "#plt.legend(loc='upper right')\n", "plt.xlabel(\"Data\")\n", "plt.ylabel(\"density\")\n", 
@@ -220,9 +220,9 @@ "isClipped=np.logical_or(raw_data>clip_max, raw_data