
Commit abb99f1

Add DS/QWEN Examples (#2333)
* add qwen-ds example Signed-off-by: yiliu30 <yi4.liu@intel.com>
1 parent e77a309 commit abb99f1

15 files changed (+1110 −0 lines)


examples/README.md

Lines changed: 12 additions & 0 deletions
```diff
@@ -15,6 +15,18 @@ Intel® Neural Compressor validated examples with multiple compression technique
   </tr>
 </thead>
 <tbody>
+  <tr>
+    <td>deepseek-ai/DeepSeek-R1</td>
+    <td>Natural Language Processing</td>
+    <td>Quantization (MXFP8/MXFP4)</td>
+    <td><a href="./pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek">link</a></td>
+  </tr>
+  <tr>
+    <td>Qwen/Qwen3-235B-A22B</td>
+    <td>Natural Language Processing</td>
+    <td>Quantization (MXFP8/MXFP4)</td>
+    <td><a href="./pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen">link</a></td>
+  </tr>
 <tr>
   <td>Framepack</td>
   <td>Image + Text to Video</td>
```
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8 and evaluate them using a custom vLLM fork.

## Requirements
```bash
pip install neural-compressor-pt==3.7
# auto-round
pip install auto-round==0.9.2
# vLLM
git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
# other requirements
pip install -r requirements.txt
pip uninstall flash_attn
```
## Quantize Model
- Export the model path
```bash
export MODEL=deepseek-ai/DeepSeek-R1
```
- MXFP8
```bash
bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels
```
- MXFP4
```bash
bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels
```
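For orientation, the sketch below shows the quantization flow these commands drive, mirroring the quantization script added later in this commit; the model id and output path are just the README's example values, not fixed requirements.

```python
# Minimal sketch of the underlying flow (mirrors the quantization script in
# this commit); the model id and output path are placeholders from the README.
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

model_name = "deepseek-ai/DeepSeek-R1"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

quant_config = AutoRoundConfig(
    tokenizer=tokenizer,
    scheme="MXFP8",              # "MXFP4" for the 4-bit variant
    iters=0,                     # no tuning iterations (RTN-style rounding)
    fp_layers="lm_head",         # layers kept in original precision
    export_format="auto_round",
    output_dir="./qmodels/quantized_model_mxfp8",
)
model = prepare(model=model, quant_config=quant_config)
model = convert(model)           # quantizes and saves to output_dir
```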
## Evaluation

### Prompt Tests

Usage:
```bash
bash ./run_generate.sh -s [mxfp4|mxfp8] -tp [tensor_parallel_size] -m [model_path]
```
- MXFP8
```bash
bash ./run_generate.sh -s mxfp8 -tp 8 -m /path/to/ds_mxfp8
```
- MXFP4
```bash
bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
```
### Accuracy Evaluation

Usage:
```bash
bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
```
- MXFP8
```bash
bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
```
- MXFP4
```bash
bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
```
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copied from https://github.com/vllm-project/vllm/

from vllm import LLM, EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser


def create_parser():
    parser = FlexibleArgumentParser()
    # Add engine args
    EngineArgs.add_cli_args(parser)
    parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)
    sampling_group.add_argument("--temperature", type=float)
    sampling_group.add_argument("--top-p", type=float)
    sampling_group.add_argument("--top-k", type=int)

    return parser


def main(args: dict):
    # Pop arguments not used by LLM
    max_tokens = args.pop("max_tokens")
    temperature = args.pop("temperature")
    top_p = args.pop("top_p")
    top_k = args.pop("top_k")

    # Create an LLM
    llm = LLM(**args)

    # Create a sampling params object
    sampling_params = llm.get_default_sampling_params()
    if max_tokens is not None:
        sampling_params.max_tokens = max_tokens
    if temperature is not None:
        sampling_params.temperature = temperature
    if top_p is not None:
        sampling_params.top_p = top_p
    if top_k is not None:
        sampling_params.top_k = top_k

    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)


if __name__ == "__main__":
    parser = create_parser()
    args: dict = vars(parser.parse_args())
    main(args)
```
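The README's `run_generate.sh` presumably wraps this script. As a hypothetical standalone check (the checkpoint path is a placeholder; the engine flags come from vLLM's standard `EngineArgs` CLI), the helpers above can be driven directly:

```python
# Hypothetical direct use of create_parser()/main(); path is a placeholder.
parser = create_parser()
cli_args = [
    "--model", "/path/to/ds_mxfp8",
    "--tensor-parallel-size", "8",
    "--max-tokens", "32",
]
main(vars(parser.parse_args(cli_args)))
```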
Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
```python
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


topologies_config = {
    "mxfp8": {
        "scheme": "MXFP8",
        "fp_layers": "lm_head",
        "iters": 0,
    },
    "mxfp4": {
        "scheme": "MXFP4",
        "fp_layers": "lm_head,self_attn",
        "iters": 0,
    },
}


def get_model_and_tokenizer(model_name):
    # Load model and tokenizer
    fp32_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
    return fp32_model, tokenizer


def quant_model(args):
    from neural_compressor.torch.quantization import (
        AutoRoundConfig,
        convert,
        prepare,
    )

    config = topologies_config[args.t]
    export_format = "auto_round" if args.use_autoround_format else "llm_compressor"
    output_dir = f"{args.output_dir}/quantized_model_{args.t}"
    fp32_model, tokenizer = get_model_and_tokenizer(args.model)
    quant_config = AutoRoundConfig(
        tokenizer=tokenizer,
        scheme=config["scheme"],
        enable_torch_compile=args.enable_torch_compile,
        iters=config["iters"],
        fp_layers=config["fp_layers"],
        export_format=export_format,
        output_dir=output_dir,
    )

    # Quantize: prepare the model, then convert (quantize and save).
    model = prepare(model=fp32_model, quant_config=quant_config)
    inc_model = convert(model)
    logger.info(f"Quantized model saved to {output_dir}")


if __name__ == "__main__":
    import argparse

    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Select a quantization scheme.")
    parser.add_argument(
        "--model",
        type=str,
        help="Path to the pre-trained model or model identifier from Hugging Face Hub.",
    )
    parser.add_argument(
        "-t",
        type=str,
        choices=topologies_config.keys(),
        default="mxfp4",
        help="Quantization scheme to use. Available options: " + ", ".join(topologies_config.keys()),
    )

    parser.add_argument(
        "--enable_torch_compile",
        action="store_true",
        help="Enable torch.compile for the model.",
    )
    parser.add_argument(
        "--use_autoround_format",
        action="store_true",
        help="Use the AutoRound format for saving the quantized model.",
    )

    parser.add_argument(
        "--skip_attn",
        action="store_true",
        help="Skip quantizing attention layers.",
    )
    parser.add_argument(
        "--iters",
        type=int,
        default=0,
        help="Number of iterations for quantization.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./",
        help="Directory to save the quantized model.",
    )

    args = parser.parse_args()

    quant_model(args)
```
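For completeness, here is a hedged sketch of calling `quant_model()` programmatically with an `argparse.Namespace` that mirrors the README's `run_quant.sh` invocation (`run_quant.sh` itself is the supported entry point and is not shown in this excerpt; the field values are the README's examples).

```python
# Hypothetical programmatic call, mirroring
# `bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels`.
import argparse

ns = argparse.Namespace(
    model="deepseek-ai/DeepSeek-R1",
    t="mxfp8",
    enable_torch_compile=False,
    use_autoround_format=True,  # assumption: export in AutoRound format
    skip_attn=False,
    iters=0,
    output_dir="./qmodels",
)
quant_model(ns)
```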
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
```
lm-eval==0.4.9.1
loguru
```
