vllm-project
diff --git a/‎.github/workflows/test-check.yaml‎
Lines changed: 5 additions & 2 deletions b/‎.github/workflows/test-check.yaml‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎Makefile‎
Lines changed: 1 addition & 1 deletion b/‎Makefile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/awq/fp8_block_llama_example.py‎
Lines changed: 81 additions & 0 deletions b/‎examples/awq/fp8_block_llama_example.py‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎examples/awq/fp8_dynamic_llama_example.py‎
Lines changed: 81 additions & 0 deletions b/‎examples/awq/fp8_dynamic_llama_example.py‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎examples/quantization_w4a8/gpt_oss_20b_example.py‎
Lines changed: 79 additions & 0 deletions b/‎examples/quantization_w4a8/gpt_oss_20b_example.py‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎src/llmcompressor/core/lifecycle.py‎
Lines changed: 3 additions & 3 deletions b/‎src/llmcompressor/core/lifecycle.py‎
Lines changed: 3 additions & 3 deletions
@@ -13,15 +13,17 @@ on:
 
 env:
   CADENCE: "commit"
+  HF_TOKEN: ${{ secrets.HF_TOKEN_READ }}
   UV_SYSTEM_PYTHON: 1
   UV_TORCH_BACKEND: "auto"
 
 jobs:
 
   base-tests:
-    runs-on: ubuntu-22.04
+    runs-on: ibm-wdc-k8s-vllm-h100-solo
     env:
       COVERAGE_FILE: ".coverage.base"
+      HF_TOKEN: ${{ secrets.HF_TOKEN_READ }}
     strategy:
       matrix:
         python: ["3.10", "3.13"]
@@ -74,9 +76,10 @@ jobs:
           coverage report --data-file="$COVERAGE_FILE" --skip-empty --format="markdown" > "$GITHUB_STEP_SUMMARY"
 
   pytorch-tests:
-    runs-on: ubuntu-22.04
+    runs-on: ibm-wdc-k8s-vllm-h100-solo
     env:
       COVERAGE_FILE: ".coverage.pytorch"
+      HF_TOKEN: ${{ secrets.HF_TOKEN_READ }}
     strategy:
       matrix:
         python: ["3.10", "3.13"]
 
@@ -39,7 +39,7 @@ style:
 # run tests for the repo
 test:
 	@echo "Running python tests";
-	pytest tests $(PYTEST_ARGS)
+	pytest -ra tests $(PYTEST_ARGS) --ignore tests/lmeval
 
 # creates wheel file
 .PHONY: build
 
@@ -0,0 +1,81 @@
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# Hub ID of the checkpoint to quantize.
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+# Load the model (dtype chosen via torch_dtype="auto") and its tokenizer.
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+# Calibration dataset ID and the split to draw samples from.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Select number of samples. 256 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 256
+MAX_SEQUENCE_LENGTH = 512  # passed to oneshot as max_seq_length
+
+# Load only the first NUM_CALIBRATION_SAMPLES rows, then shuffle with a fixed seed.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+# Configure the quantization algorithm to run.
+recipe = [
+    AWQModifier(
+        ignore=["lm_head"], scheme="FP8_BLOCK", targets=["Linear"], duo_scaling="both"
+    ),
+]
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
@@ -0,0 +1,81 @@
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# Hub ID of the checkpoint to quantize.
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+# Load the model (dtype chosen via torch_dtype="auto") and its tokenizer.
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+# Calibration dataset ID and the split to draw samples from.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Select number of samples. 256 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 256
+MAX_SEQUENCE_LENGTH = 512  # passed to oneshot as max_seq_length
+
+# Load only the first NUM_CALIBRATION_SAMPLES rows, then shuffle with a fixed seed.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+# Configure the quantization algorithm to run.
+recipe = [
+    AWQModifier(
+        ignore=["lm_head"], scheme="FP8_DYNAMIC", targets=["Linear"], duo_scaling="both"
+    ),
+]
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
@@ -0,0 +1,79 @@
+import torch
+from compressed_tensors.quantization import QuantizationScheme
+from compressed_tensors.quantization.quant_args import (
+    QuantizationArgs,
+    QuantizationStrategy,
+    QuantizationType,
+)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modeling.gpt_oss import convert_model_for_quantization_gptoss
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+
+def main():
+    MODEL_ID = "openai/gpt-oss-20b"
+    BASE_NAME = MODEL_ID.rstrip("/").split("/")[-1]
+    OUTPUT_DIR = f"{BASE_NAME}-w4a8-channelwise"
+
+    print(f"[GPT-OSS] Loading model: {MODEL_ID}")
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+    # ---- GPT-OSS MoE → linear experts conversion ----
+    print("[GPT-OSS] Converting fused MoE experts to LinearExperts for quantization...")
+    convert_model_for_quantization_gptoss(model)
+    print("[GPT-OSS] Conversion completed.")
+
+    # ---- Quantization config: W4A8 (int4 weights, int8 activations) ----
+
+    # Weights: 4-bit, channelwise, symmetric, static
+    weights_args = QuantizationArgs(
+        num_bits=4,
+        type=QuantizationType.INT,
+        strategy=QuantizationStrategy.CHANNEL,
+        symmetric=True,
+        dynamic=False,
+    )
+
+    # Activations: 8-bit, per-token, asymmetric, dynamic
+    activations_args = QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.INT,
+        strategy=QuantizationStrategy.TOKEN,
+        symmetric=False,
+        dynamic=True,
+        observer=None,
+    )
+
+    # Apply to all Linear layers, excluding lm_head
+    scheme = QuantizationScheme(
+        targets=["Linear"],
+        weights=weights_args,
+        input_activations=activations_args,
+    )
+
+    recipe = QuantizationModifier(
+        config_groups={"group_0": scheme},
+        ignore=["lm_head"],
+    )
+
+    print(f"[GPT-OSS] Starting oneshot quantization → {OUTPUT_DIR}")
+    oneshot(
+        model=model,
+        recipe=recipe,
+        tokenizer=tokenizer,
+        output_dir=OUTPUT_DIR,
+        trust_remote_code_model=True,
+    )
+    print(f"[GPT-OSS] Quantization finished. Quantized model written to: {OUTPUT_DIR}")
+
+
+if __name__ == "__main__":
+    main()  # run the example only when executed as a script, not on import
@@ -206,9 +206,9 @@ def event(
             if data is not None:
                 mod_data.append(data)
 
-        assert event is not None, (
-            f"Event lifecycle did not return an event for {event_type}"
-        )
+        assert (
+            event is not None
+        ), f"Event lifecycle did not return an event for {event_type}"
 
         return mod_data