
Commit 804e346

Update AMD test definitions (2025-12-08) (#30298)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>

Parent: 83319b4

1 file changed: .buildkite/test-amd.yaml (+130 additions, -58 deletions)
@@ -398,7 +398,8 @@ steps:
   timeout_in_minutes: 25
   gpu: h100
   source_file_dependencies:
-  - vllm/
+  - vllm/v1/attention
+  - vllm/model_executor/layers
   - tests/v1/determinism/
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
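For orientation: every entry under steps: in this file follows the same schema the hunks in this commit touch. A minimal sketch assembled from the fields used in the hunk above (values are illustrative, largely reconstructing that determinism step; source_file_dependencies appears to gate the step on which paths a change touches):

- label: Example Accuracy Test        # illustrative step, not part of this commit
  timeout_in_minutes: 25              # per-step time budget
  gpu: h100                           # hardware the step is pinned to
  source_file_dependencies:           # run only when changed files match these paths
  - vllm/v1/attention
  - tests/v1/determinism/
  commands:                           # shell commands, executed in order
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s tests/v1/determinism/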
@@ -440,23 +441,29 @@ steps:
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
   - vllm/entrypoints
+  - vllm/multimodal
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
+  # for basic
+  - python3 offline_inference/basic/chat.py
   - python3 offline_inference/basic/generate.py --model facebook/opt-125m
   - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-  - python3 offline_inference/basic/chat.py
-  - python3 offline_inference/prefix_caching.py
-  - python3 offline_inference/llm_engine_example.py
+  - python3 offline_inference/basic/classify.py
+  - python3 offline_inference/basic/embed.py
+  - python3 offline_inference/basic/score.py
+  # for multi-modal models
   - python3 offline_inference/audio_language.py --seed 0
   - python3 offline_inference/vision_language.py --seed 0
   - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-  - python3 offline_inference/basic/classify.py
-  - python3 offline_inference/basic/embed.py
-  - python3 offline_inference/basic/score.py
+  # for pooling models
+  - python3 pooling/pooling/vision_language_pooling.py --seed 0
+  # for features demo
+  - python3 offline_inference/prefix_caching.py
+  - python3 offline_inference/llm_engine_example.py
+  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
   # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
   - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -718,6 +725,18 @@ steps:
   - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

+- label: LM Eval Small Models # 53min
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
 - label: OpenAI API correctness # 10min
   timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -727,7 +746,7 @@ steps:
   - csrc/
   - vllm/entrypoints/openai/
   - vllm/model_executor/models/whisper.py
-  commands: # LMEval
+  commands: # LMEval+Transcription WER check
   # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
   - pytest -s entrypoints/openai/correctness/

@@ -963,6 +982,19 @@ steps:
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

+- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1 # 60min
   timeout_in_minutes: 120
   mirror_hardwares: [amdexperimental]
@@ -1098,7 +1130,6 @@ steps:
   - vllm/model_executor/layers/layernorm.py
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - vllm/model_executor/layers/fused_moe/layer.py
   - tests/compile/test_fusion_attn.py
   - tests/compile/test_silu_mul_quant_fusion.py
   - tests/compile/distributed/test_fusion_all_reduce.py
@@ -1132,12 +1163,25 @@ steps:
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
   - tests/compile/distributed/test_fusions_e2e.py
-  - tests/compile/fullgraph/test_full_graph.py
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
   - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

+- label: Blackwell GPT-OSS Eval
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+  - uv pip install --system 'gpt-oss[eval]==0.0.5'
+  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
 - label: Blackwell Quantized MoE Test
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
@@ -1155,6 +1199,16 @@ steps:
   commands:
   - pytest -s -v tests/quantization/test_blackwell_moe.py

+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
 ##### 1 GPU test #####
 ##### multi gpus test #####

@@ -1397,6 +1451,39 @@ steps:
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py

+
+- label: LM Eval Large Models # optional
+  gpu: a100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
   mirror_hardwares: [amdexperimental]
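As an aside, the new H100 eval step can be approximated outside Buildkite. A sketch based only on that step's own working_dir and commands (it assumes a vLLM checkout with four H100s and the lm-eval-harness dependencies already installed):

cd .buildkite/lm-eval-harness
# Per the comment in the step, Triton was found faster than DeepGEMM on H100, so DeepGEMM is disabled.
export VLLM_USE_DEEP_GEMM=0
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4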
@@ -1440,29 +1527,6 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

-- label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 120
-  gpu: b200
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
-
-- label: Multi-Modal Accuracy Eval (Small Models) # 10min
-  timeout_in_minutes: 70
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - vllm/multimodal/
-  - vllm/inputs/
-  - vllm/v1/core/
-  commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
 - label: LM Eval Large Models (4 Card)
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
@@ -1478,21 +1542,6 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

-- label: LM Eval Large Models (H100) # optional
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
 - label: ROCm LM Eval Large Models (8 Card)
   mirror_hardwares: [amdproduction]
   agent_pool: mi325_8
@@ -1517,6 +1566,20 @@ steps:
   - uv pip install --system 'gpt-oss[eval]==0.0.5'
   - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+  - bash .buildkite/scripts/run-prime-rl-test.sh
 - label: DeepSeek V2-Lite Accuracy
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
@@ -1550,17 +1613,26 @@ steps:
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

-##### RL Integration Tests #####
-- label: Prime-RL Integration Test # 15min
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_2
+  agent_pool: mi325_4
   # grade: Blocking
-  timeout_in_minutes: 30
+  gpu: h100
   optional: true
-  num_gpus: 2
+  num_gpus: 4
   working_dir: "/vllm-workspace"
-  source_file_dependencies:
-  - vllm/
-  - .buildkite/scripts/run-prime-rl-test.sh
   commands:
-  - bash .buildkite/scripts/run-prime-rl-test.sh
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
