@@ -398,7 +398,8 @@ steps:
398398 timeout_in_minutes : 25
399399 gpu : h100
400400 source_file_dependencies :
401- - vllm/
401+ - vllm/v1/attention
402+ - vllm/model_executor/layers
402403 - tests/v1/determinism/
403404 commands :
404405 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -440,23 +441,29 @@ steps:
440441 working_dir : " /vllm-workspace/examples"
441442 source_file_dependencies :
442443 - vllm/entrypoints
444+ - vllm/multimodal
443445 - examples/
444446 commands :
445447 - pip install tensorizer # for tensorizer test
448+ # for basic
449+ - python3 offline_inference/basic/chat.py
446450 - python3 offline_inference/basic/generate.py --model facebook/opt-125m
447451 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
448- - python3 offline_inference/basic/chat.py
449- - python3 offline_inference/prefix_caching.py
450- - python3 offline_inference/llm_engine_example.py
452+ - python3 offline_inference/basic/classify.py
453+ - python3 offline_inference/basic/embed.py
454+ - python3 offline_inference/basic/score.py
455+ # for multi-modal models
451456 - python3 offline_inference/audio_language.py --seed 0
452457 - python3 offline_inference/vision_language.py --seed 0
453458 - python3 offline_inference/vision_language_pooling.py --seed 0
454459 - python3 offline_inference/vision_language_multi_image.py --seed 0
455- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
456460 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
457- - python3 offline_inference/basic/classify.py
458- - python3 offline_inference/basic/embed.py
459- - python3 offline_inference/basic/score.py
461+ # for pooling models
462+ - python3 pooling/vision_language_pooling.py --seed 0
463+ # for feature demos
464+ - python3 offline_inference/prefix_caching.py
465+ - python3 offline_inference/llm_engine_example.py
466+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
460467 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
461468 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
462469 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -718,6 +725,18 @@ steps:
718725 - uv pip install --system conch-triton-kernels
719726 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
720727
728+ - label : LM Eval Small Models # 53min
729+ timeout_in_minutes : 75
730+ mirror_hardwares : [amdexperimental]
731+ agent_pool : mi325_1
732+ # grade: Blocking
733+ source_file_dependencies :
734+ - csrc/
735+ - vllm/model_executor/layers/quantization
736+ autorun_on_main : true
737+ commands :
738+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
739+
721740- label : OpenAI API correctness # 10min
722741 timeout_in_minutes : 15
723742 mirror_hardwares : [amdexperimental, amdproduction]
@@ -727,7 +746,7 @@ steps:
727746 - csrc/
728747 - vllm/entrypoints/openai/
729748 - vllm/model_executor/models/whisper.py
730- commands : # LMEval
749+ commands : # LMEval+Transcription WER check
731750 # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
732751 - pytest -s entrypoints/openai/correctness/
733752
@@ -963,6 +982,19 @@ steps:
963982 - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
964983 - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
965984
985+ - label : Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
986+ timeout_in_minutes : 180
987+ mirror_hardwares : [amdexperimental, amdproduction]
988+ agent_pool : mi325_1
989+ # grade: Blocking
990+ working_dir : " /vllm-workspace/.buildkite/lm-eval-harness"
991+ source_file_dependencies :
992+ - vllm/multimodal/
993+ - vllm/inputs/
994+ - vllm/v1/core/
995+ commands :
996+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
997+
966998- label : Multi-Modal Models Test (Extended) 1 # 60min
967999 timeout_in_minutes : 120
9681000 mirror_hardwares : [amdexperimental]
@@ -1098,7 +1130,6 @@ steps:
10981130 - vllm/model_executor/layers/layernorm.py
10991131 - vllm/model_executor/layers/activation.py
11001132 - vllm/model_executor/layers/quantization/input_quant_fp8.py
1101- - vllm/model_executor/layers/fused_moe/layer.py
11021133 - tests/compile/test_fusion_attn.py
11031134 - tests/compile/test_silu_mul_quant_fusion.py
11041135 - tests/compile/distributed/test_fusion_all_reduce.py
@@ -1132,12 +1163,25 @@ steps:
11321163 - vllm/model_executor/layers/activation.py
11331164 - vllm/model_executor/layers/quantization/input_quant_fp8.py
11341165 - tests/compile/distributed/test_fusions_e2e.py
1135- - tests/compile/fullgraph/test_full_graph.py
11361166 commands :
11371167 - nvidia-smi
11381168 # Run all e2e fusion tests
11391169 - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
11401170
1171+ - label : Blackwell GPT-OSS Eval
1172+ timeout_in_minutes : 60
1173+ working_dir : " /vllm-workspace/"
1174+ gpu : b200
1175+ optional : true # run on nightlies
1176+ source_file_dependencies :
1177+ - tests/evals/gpt_oss
1178+ - vllm/model_executor/models/gpt_oss.py
1179+ - vllm/model_executor/layers/quantization/mxfp4.py
1180+ - vllm/v1/attention/backends/flashinfer.py
1181+ commands :
1182+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
1183+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
1184+
11411185- label : Blackwell Quantized MoE Test
11421186 timeout_in_minutes : 60
11431187 working_dir : " /vllm-workspace/"
@@ -1155,6 +1199,16 @@ steps:
11551199 commands :
11561200 - pytest -s -v tests/quantization/test_blackwell_moe.py
11571201
1202+ - label : Blackwell LM Eval Small Models
1203+ timeout_in_minutes : 120
1204+ gpu : b200
1205+ optional : true # run on nightlies
1206+ source_file_dependencies :
1207+ - csrc/
1208+ - vllm/model_executor/layers/quantization
1209+ commands :
1210+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
1211+
11581212# #### 1 GPU test #####
11591213# #### multi gpus test #####
11601214
@@ -1397,6 +1451,39 @@ steps:
13971451 - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
13981452 - pytest -v -s -x lora/test_mixtral.py
13991453
1454+
1455+ - label : LM Eval Large Models # optional
1456+ gpu : a100
1457+ optional : true
1458+ mirror_hardwares : [amdexperimental]
1459+ agent_pool : mi325_4
1460+ # grade: Blocking
1461+ num_gpus : 4
1462+ working_dir : " /vllm-workspace/.buildkite/lm-eval-harness"
1463+ source_file_dependencies :
1464+ - csrc/
1465+ - vllm/model_executor/layers/quantization
1466+ commands :
1467+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
1468+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
1469+
1470+ # #### H100 test #####
1471+ - label : LM Eval Large Models (H100) # optional
1472+ gpu : h100
1473+ optional : true
1474+ mirror_hardwares : [amdexperimental]
1475+ agent_pool : mi325_4
1476+ # grade: Blocking
1477+ num_gpus : 4
1478+ working_dir : " /vllm-workspace/.buildkite/lm-eval-harness"
1479+ source_file_dependencies :
1480+ - csrc/
1481+ - vllm/model_executor/layers/quantization
1482+ commands :
1483+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
1484+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
1485+
1486+
14001487# #### H200 test #####
14011488- label : Distributed Tests (H200) # optional
14021489 mirror_hardwares : [amdexperimental]
@@ -1440,29 +1527,6 @@ steps:
14401527 commands :
14411528 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
14421529
1443- - label : Blackwell LM Eval Small Models
1444- timeout_in_minutes : 120
1445- gpu : b200
1446- optional : true # run on nightlies
1447- source_file_dependencies :
1448- - csrc/
1449- - vllm/model_executor/layers/quantization
1450- commands :
1451- - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
1452-
1453- - label : Multi-Modal Accuracy Eval (Small Models) # 10min
1454- timeout_in_minutes : 70
1455- mirror_hardwares : [amdexperimental, amdproduction]
1456- agent_pool : mi325_1
1457- # grade: Blocking
1458- working_dir : " /vllm-workspace/.buildkite/lm-eval-harness"
1459- source_file_dependencies :
1460- - vllm/multimodal/
1461- - vllm/inputs/
1462- - vllm/v1/core/
1463- commands :
1464- - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
1465-
14661530- label : LM Eval Large Models (4 Card)
14671531 mirror_hardwares : [amdexperimental, amdproduction]
14681532 agent_pool : mi325_4
@@ -1478,21 +1542,6 @@ steps:
14781542 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
14791543 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
14801544
1481- - label : LM Eval Large Models (H100) # optional
1482- mirror_hardwares : [amdexperimental, amdproduction]
1483- agent_pool : mi325_4
1484- # grade: Blocking
1485- gpu : h100
1486- optional : true
1487- num_gpus : 4
1488- working_dir : " /vllm-workspace/.buildkite/lm-eval-harness"
1489- source_file_dependencies :
1490- - csrc/
1491- - vllm/model_executor/layers/quantization
1492- commands :
1493- - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
1494- - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
1495-
14961545- label : ROCm LM Eval Large Models (8 Card)
14971546 mirror_hardwares : [amdproduction]
14981547 agent_pool : mi325_8
@@ -1517,6 +1566,20 @@ steps:
15171566 - uv pip install --system 'gpt-oss[eval]==0.0.5'
15181567 - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
15191568
1569+ # #### RL Integration Tests #####
1570+ - label : Prime-RL Integration Test # 15min
1571+ mirror_hardwares : [amdexperimental]
1572+ agent_pool : mi325_2
1573+ # grade: Blocking
1574+ timeout_in_minutes : 30
1575+ optional : true
1576+ num_gpus : 2
1577+ working_dir : " /vllm-workspace"
1578+ source_file_dependencies :
1579+ - vllm/
1580+ - .buildkite/scripts/run-prime-rl-test.sh
1581+ commands :
1582+ - bash .buildkite/scripts/run-prime-rl-test.sh
15201583- label : DeepSeek V2-Lite Accuracy
15211584 mirror_hardwares : [amdexperimental, amdproduction]
15221585 agent_pool : mi325_4
@@ -1550,17 +1613,26 @@ steps:
15501613 commands :
15511614 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
15521615
1553- # #### RL Integration Tests #####
1554- - label : Prime-RL Integration Test # 15min
1616+ - label : DeepSeek V2-Lite Async EPLB Accuracy
1617+ timeout_in_minutes : 60
15551618 mirror_hardwares : [amdexperimental]
1556- agent_pool : mi325_2
1619+ agent_pool : mi325_4
15571620 # grade: Blocking
1558- timeout_in_minutes : 30
1621+ gpu : h100
15591622 optional : true
1560- num_gpus : 2
1623+ num_gpus : 4
15611624 working_dir : " /vllm-workspace"
1562- source_file_dependencies :
1563- - vllm/
1564- - .buildkite/scripts/run-prime-rl-test.sh
15651625 commands :
1566- - bash .buildkite/scripts/run-prime-rl-test.sh
1626+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
1627+
1628+ - label : Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
1629+ timeout_in_minutes : 60
1630+ mirror_hardwares : [amdexperimental]
1631+ agent_pool : mi325_4
1632+ # grade: Blocking
1633+ gpu : h100
1634+ optional : true
1635+ num_gpus : 4
1636+ working_dir : " /vllm-workspace"
1637+ commands :
1638+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040