From a8346415021dae356d0bb9866ac63ab86fca2f98 Mon Sep 17 00:00:00 2001
From: "omobayode.fagbohungbe"
Date: Fri, 13 Jun 2025 17:33:48 +0000
Subject: [PATCH] feat: GPTQv2 enablement

Signed-off-by: omobayode.fagbohungbe
---
 examples/GPTQ/README.md | 40 +++++++++++++++++++++++++++-------------
 fms_mo/run_quant.py     |  2 ++
 fms_mo/training_args.py |  2 ++
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/examples/GPTQ/README.md b/examples/GPTQ/README.md
index f5661a12..05ab1272 100644
--- a/examples/GPTQ/README.md
+++ b/examples/GPTQ/README.md
@@ -7,6 +7,7 @@ For generative LLMs, very often the bottleneck of inference is no longer the com

 - [FMS Model Optimizer requirements](../../README.md#requirements)
 - `gptqmodel` is needed for this example. Use `pip install gptqmodel` or [install from source](https://github.com/ModelCloud/GPTQModel/tree/main?tab=readme-ov-file)
+  - Installing from source is recommended if you plan to use GPTQv2
 - Optionally for the evaluation section below, install [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness)
 ```
 pip install lm-eval
@@ -41,7 +42,9 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
       --quant_method gptq \
       --output_dir Meta-Llama-3-8B-GPTQ \
       --bits 4 \
-      --group_size 128
+      --group_size 128 \
+      --use_version2 False \
+      --v2_mem_device cpu
     ```

 The model that can be found in the specified output directory (`Meta-Llama-3-8B-GPTQ` in our case) can be deployed and inferenced via `vLLM`.
@@ -89,18 +92,27 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
 | | | |none | 5|perplexity|↓ |3.7915|± |0.0727|

 - Quantized model with the settings showed above (`desc_act` default to False.)
--
-|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
-| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6365 |± |0.0067|
-| | | |none | 5|perplexity|↓ |5.9307 |± |0.1830|
+  - GPTQv1
+
+    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6365 |± |0.0067|
+    | | | |none | 5|perplexity|↓ |5.9307 |± |0.1830|
+
+  - GPTQv2
+
+    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6817 |± |0.0065|
+    | | | |none | 5|perplexity|↓ |4.3994 |± |0.0995|

 - Quantized model with `desc_act` set to `True` (could improve the model quality, but at the cost of inference speed.)
--
-|Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
-|------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
-| LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6193 |± |0.0068|
-| | | |none | 5|perplexity|↓ |5.8879 |± |0.1546|
+  - GPTQv1
+
+    |Model | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
+    |------------|--------------|------:|------|-----:|----------|---|------:|---|-----:|
+    | LLAMA3-8B |lambada_openai| 1|none | 5|acc |↑ |0.6193 |± |0.0068|
+    | | | |none | 5|perplexity|↓ |5.8879 |± |0.1546|

 > [!NOTE]
 > There is some randomness in generating the model and data, the resulting accuracy may vary ~$\pm$ 0.05.
@@ -108,7 +120,7 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m

 ## Code Walk-through

-1. Command line arguments will be used to create a GPTQ quantization config. Information about the required arguments and their default values can be found [here](../../fms_mo/training_args.py)
+1. Command line arguments will be used to create a GPTQ quantization config. Information about the required arguments and their default values can be found [here](../../fms_mo/training_args.py). GPTQv1 is used by default. To use GPTQv2, set `use_version2` to `True` and, if needed, set `v2_mem_device` to one of `auto`, `cpu`, or `cuda` (default `cpu`).

    ```python
    from gptqmodel import GPTQModel, QuantizeConfig
@@ -118,6 +130,8 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
        group_size=gptq_args.group_size,
        desc_act=gptq_args.desc_act,
        damp_percent=gptq_args.damp_percent,
+       v2=gptq_args.use_version2,
+       v2_memory_device=gptq_args.v2_mem_device,
    )
    ```

@@ -158,4 +172,4 @@ This end-to-end example utilizes the common set of interfaces provided by `fms_m
    tokenizer.save_pretrained(output_dir) # optional
    ```
 > [!NOTE]
-> 1. GPTQ of a 70B model usually takes ~4-10 hours on A100.
+> 1. GPTQ of a 70B model usually takes ~4-10 hours on an A100 with GPTQv1.
diff --git a/fms_mo/run_quant.py b/fms_mo/run_quant.py
index a7a60b9b..a537eb4f 100644
--- a/fms_mo/run_quant.py
+++ b/fms_mo/run_quant.py
@@ -140,6 +140,8 @@ def run_gptq(model_args, data_args, opt_args, gptq_args):
         group_size=gptq_args.group_size,
         desc_act=gptq_args.desc_act,
         damp_percent=gptq_args.damp_percent,
+        v2=gptq_args.use_version2,
+        v2_memory_device=gptq_args.v2_mem_device,
     )

     # Add custom model_type mapping to gptqmodel LUT so GPTQModel can recognize them.
diff --git a/fms_mo/training_args.py b/fms_mo/training_args.py
index e7beafc6..c92775c2 100644
--- a/fms_mo/training_args.py
+++ b/fms_mo/training_args.py
@@ -206,6 +206,8 @@ class GPTQArguments(TypeChecker):
     use_cuda_fp16: bool = True
     autotune_warmup_after_quantized: bool = False
     cache_examples_on_gpu: bool = True
+    use_version2: bool = False
+    v2_mem_device: Optional[str] = field(default="cpu", metadata={"choices": ["auto", "cpu", "cuda"]})


 @dataclass
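
For reference, a minimal sketch (not part of the patch above) of the `QuantizeConfig` that `run_gptq()` ends up building when the new GPTQv2 flags are enabled; the `bits` and `group_size` values simply mirror the README example:

```python
from gptqmodel import QuantizeConfig

# Assumed mapping from the new fms-mo flags to gptqmodel's QuantizeConfig:
#   --use_version2 True  -> v2=True                 (GPTQv2; False keeps GPTQv1)
#   --v2_mem_device cpu  -> v2_memory_device="cpu"  (choices: "auto", "cpu", "cuda")
quantize_config = QuantizeConfig(
    bits=4,
    group_size=128,
    v2=True,
    v2_memory_device="cpu",
)
```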