README.md
````diff
@@ -7,14 +7,14 @@ This example quantizes and validates the accuracy of Llama4.
 ## 1. Environment
 
 ```shell
-docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.05-py3 /bin/bash
+docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.08-py3 /bin/bash
 docker exec -it llama4 bash
 git clone https://github.com/intel/neural-compressor.git
 cd neural-compressor/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4
-# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.6 release
-pip install neural-compressor-pt==3.6
-# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc2` for the latest updates before auto-round v0.8.0 release
-pip install auto-round==0.8.0
+# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master` for the latest updates before neural-compressor v3.7 release
+pip install neural-compressor-pt==3.7
+# Use `pip install git+https://github.com/intel/auto-round.git@main` for the latest updates before auto-round v0.9.3 release
+pip install auto-round==0.9.3
 bash setup.sh
 ```
 
````
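The hunk above bumps the base container to pytorch:25.08 and moves the pinned stack to neural-compressor-pt 3.7 with auto-round 0.9.3. A quick post-install sanity check, sketched under the assumption that both packages expose standard pip metadata and a `__version__` attribute:

```shell
# Confirm the pins resolved as expected (package names as published on PyPI)
pip show neural-compressor-pt auto-round | grep -E "^(Name|Version):"
python -c "import auto_round; print(auto_round.__version__)"
```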

````diff
@@ -36,5 +36,5 @@ CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=L
 ## 2. Benchmark
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results --tasks=piqa --batch_size=1 --tp_size=4
 ```
````
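The benchmark command now points --input_model at saved_results, matching the quantization script's default output directory instead of a hard-coded checkpoint name. For coverage beyond piqa, lm-eval-style harnesses generally accept a comma-separated task list; a hypothetical variation, assuming run_benchmark.sh forwards --tasks unchanged:

```shell
# Hypothetical run over two tasks on a smaller tensor-parallel group
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --topology=llama4_mxfp4 \
    --input_model=saved_results --tasks=piqa,hellaswag --batch_size=1 --tp_size=2
```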
main.py
````diff
@@ -85,7 +85,7 @@ def tune(args):
         iters=args.iters,
         scheme=args.scheme,
         layer_config=layer_config,
-        export_format="llm_compressor",
+        export_format=args.export_format,
         output_dir=args.output_dir,
         processor=processor,
     )
````
run_benchmark.sh
````diff
@@ -40,6 +40,12 @@ function run_benchmark {
     batch_size=${batch_size:=1}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
+        export VLLM_AR_MXFP4_MODULAR_MOE=1
+        export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1
+        export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
+        export VLLM_ENABLE_STATIC_MOE=0
+        export VLLM_USE_DEEP_GEMM=0
+        export VLLM_ENABLE_AR_EXT=1
         extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"
         extra_cmd="--gen_kwargs max_gen_toks=2048"
     fi
````
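The six VLLM_* switches are not upstream vLLM settings; they appear to come from the fused-moe-ar vllm-fork installed by setup.sh, enabling its auto-round extension and MXFP4 MoE path while disabling the static-MoE and DeepGEMM code paths that would bypass it. If you launch the harness by hand instead of through run_benchmark.sh, the same toggles must reach the vLLM process; a sketch, with the flag meanings assumed from their names:

```shell
# Hypothetical manual launch with the fork-specific toggles exported inline
VLLM_ENABLE_AR_EXT=1 VLLM_AR_MXFP4_MODULAR_MOE=1 VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 \
VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 VLLM_ENABLE_STATIC_MOE=0 VLLM_USE_DEEP_GEMM=0 \
lm_eval --model vllm --model_args pretrained=saved_results,tensor_parallel_size=4 --tasks piqa
```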
run_quant.sh
````diff
@@ -44,7 +44,7 @@ function run_tuning {
     iters=${iters:=0}
 
     if [ "${topology}" = "llama4_mxfp4" ]; then
-        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
+        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4 --export_format auto_round"
    fi
 
     python3 main.py \
````
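The only change here is appending --export_format auto_round, so the quantized checkpoint is written in AutoRound's native format, presumably what the VLLM_ENABLE_AR_EXT path in run_benchmark.sh consumes. For reference, a hypothetical direct invocation using just the flags visible in this diff (run_quant.sh normally appends the model, output, and iteration arguments as well):

```shell
# Hypothetical direct call to main.py, bypassing run_quant.sh
python3 main.py \
    --fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert \
    --scheme MXFP4 \
    --export_format auto_round
```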
setup.sh
````diff
@@ -2,7 +2,6 @@ pip install -r requirements.txt
 pip install setuptools --upgrade
 pip install packaging --upgrade
 pip install -U "huggingface_hub[cli]"
-git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
-cd vllm-fork
-VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
-cd ..
+git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
+VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
+pip uninstall flash_attn -y
````
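setup.sh now tracks the fused-moe-ar branch of yiliu30/vllm-fork, installs it editable while reusing upstream's precompiled kernels via VLLM_USE_PRECOMPILED=1, and then removes flash_attn, presumably to avoid conflicts with the fork's attention path. A quick check that the editable install imports cleanly:

```shell
# Verify the fork is importable and report its version string
python -c "import vllm; print(vllm.__version__)"
```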