
Commit d91b3bd

brian-dellabetta and jangel97 authored and committed
Merge branch 'main' into deprecation/torch-dtype-to-dtype
2 parents a957fe2 + 6bb7905 commit d91b3bd

27 files changed: +655, -198 lines changed


.github/workflows/test-check.yaml

Lines changed: 5 additions & 2 deletions

@@ -13,15 +13,17 @@ on:
 
 env:
   CADENCE: "commit"
+  HF_TOKEN: ${{ secrets.HF_TOKEN_READ }}
   UV_SYSTEM_PYTHON: 1
   UV_TORCH_BACKEND: "auto"
 
 jobs:
 
   base-tests:
-    runs-on: ubuntu-22.04
+    runs-on: ibm-wdc-k8s-vllm-h100-solo
     env:
       COVERAGE_FILE: ".coverage.base"
+      HF_TOKEN: ${{ secrets.HF_TOKEN_READ }}
     strategy:
       matrix:
         python: ["3.10", "3.13"]
@@ -74,9 +76,10 @@ jobs:
         coverage report --data-file="$COVERAGE_FILE" --skip-empty --format="markdown" > "$GITHUB_STEP_SUMMARY"
 
   pytorch-tests:
-    runs-on: ubuntu-22.04
+    runs-on: ibm-wdc-k8s-vllm-h100-solo
     env:
       COVERAGE_FILE: ".coverage.pytorch"
+      HF_TOKEN: ${{ secrets.HF_TOKEN_READ }}
     strategy:
       matrix:
         python: ["3.10", "3.13"]

Makefile

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ style:
 # run tests for the repo
 test:
 	@echo "Running python tests";
-	pytest tests $(PYTEST_ARGS)
+	pytest -ra tests $(PYTEST_ARGS) --ignore tests/lmeval
 
 # creates wheel file
 .PHONY: build
Lines changed: 81 additions & 0 deletions (new file)

@@ -0,0 +1,81 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


# Configure the quantization algorithm to run.
recipe = [
    AWQModifier(
        ignore=["lm_head"], scheme="FP8_BLOCK", targets=["Linear"], duo_scaling="both"
    ),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
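
Since this merge targets the deprecation/torch-dtype-to-dtype branch, the torch_dtype="auto" call above uses the older keyword. A minimal sketch of the newer spelling, assuming a transformers release that already accepts the dtype keyword:

from transformers import AutoModelForCausalLM

# Recent transformers releases accept `dtype` in place of the deprecated
# `torch_dtype` keyword; "auto" still loads the checkpoint's native precision.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", dtype="auto"
)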
Lines changed: 81 additions & 0 deletions (new file)

@@ -0,0 +1,81 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


# Configure the quantization algorithm to run.
recipe = [
    AWQModifier(
        ignore=["lm_head"], scheme="FP8_DYNAMIC", targets=["Linear"], duo_scaling="both"
    ),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
Lines changed: 79 additions & 0 deletions (new file)

@@ -0,0 +1,79 @@
import torch
from compressed_tensors.quantization import QuantizationScheme
from compressed_tensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
    QuantizationType,
)
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modeling.gpt_oss import convert_model_for_quantization_gptoss
from llmcompressor.modifiers.quantization import QuantizationModifier


def main():
    MODEL_ID = "openai/gpt-oss-20b"
    BASE_NAME = MODEL_ID.rstrip("/").split("/")[-1]
    OUTPUT_DIR = f"{BASE_NAME}-w4a8-channelwise"

    print(f"[GPT-OSS] Loading model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # ---- GPT-OSS MoE → linear experts conversion ----
    print("[GPT-OSS] Converting fused MoE experts to LinearExperts for quantization...")
    convert_model_for_quantization_gptoss(model)
    print("[GPT-OSS] Conversion completed.")

    # ---- Quantization config: W4A8 (int4 weights, int8 activations) ----

    # Weights: 4-bit, channelwise, symmetric, static
    weights_args = QuantizationArgs(
        num_bits=4,
        type=QuantizationType.INT,
        strategy=QuantizationStrategy.CHANNEL,
        symmetric=True,
        dynamic=False,
    )

    # Activations: 8-bit, per-token, asymmetric, dynamic
    activations_args = QuantizationArgs(
        num_bits=8,
        type=QuantizationType.INT,
        strategy=QuantizationStrategy.TOKEN,
        symmetric=False,
        dynamic=True,
        observer=None,
    )

    # Apply to all Linear layers, excluding lm_head
    scheme = QuantizationScheme(
        targets=["Linear"],
        weights=weights_args,
        input_activations=activations_args,
    )

    recipe = QuantizationModifier(
        config_groups={"group_0": scheme},
        ignore=["lm_head"],
    )

    print(f"[GPT-OSS] Starting oneshot quantization → {OUTPUT_DIR}")
    oneshot(
        model=model,
        recipe=recipe,
        tokenizer=tokenizer,
        output_dir=OUTPUT_DIR,
        trust_remote_code_model=True,
    )
    print(f"[GPT-OSS] Quantization finished. Quantized model written to: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()
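
Unlike the AWQ examples above, this script does not end with a sample-generation sanity check. A minimal, optional sketch of one, reusing the model and tokenizer already created in main(); placing it after oneshot() returns is an assumption:

# Optional sanity check mirroring the AWQ examples: run a short generation
# with the quantized model. `model` and `tokenizer` are the objects created
# earlier in main().
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output[0]))
print("==========================================")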

src/llmcompressor/core/lifecycle.py

Lines changed: 3 additions & 3 deletions

@@ -206,9 +206,9 @@ def event(
             if data is not None:
                 mod_data.append(data)
 
-        assert event is not None, (
-            f"Event lifecycle did not return an event for {event_type}"
-        )
+        assert (
+            event is not None
+        ), f"Event lifecycle did not return an event for {event_type}"
 
         return mod_data