10 | 10 | BACKENDS, |
11 | 11 | _extract_step_logprobs, |
12 | 12 | _random_prompt, |
| 13 | + is_device_capability_below_90, |
13 | 14 | resolve_model_name, |
14 | 15 | skip_unsupported, |
15 | 16 | ) |
16 | 17 |
17 | 18 | import vllm.model_executor.layers.batch_invariant as batch_invariant |
18 | 19 | from vllm import LLM, SamplingParams |
19 | 20 |
| 21 | +IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90() |
| 22 | + |
20 | 23 |
21 | 24 | @skip_unsupported |
22 | 25 | @pytest.mark.timeout(1000) |
@@ -190,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( |
190 | 193 | max_model_len=8192, |
191 | 194 | dtype="bfloat16", # not everything is supported |
192 | 195 | gpu_memory_utilization=0.9, |
| 196 | + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, |
193 | 197 | ) |
194 | 198 |
195 | 199 | # Use more realistic prompts for better token generation |
@@ -393,6 +397,8 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): |
393 | 397 | gpu_memory_utilization=0.9, |
394 | 398 | max_model_len=2048, |
395 | 399 | dtype="bfloat16", |
| 400 | + enable_prefix_caching=False, |
| 401 | + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, |
396 | 402 | ) |
397 | 403 |
398 | 404 | prompt = "the capital of france is" |
@@ -459,6 +465,7 @@ def test_logprobs_without_batch_invariance_should_fail( |
459 | 465 | max_num_seqs=32, |
460 | 466 | max_model_len=8192, |
461 | 467 | dtype="bfloat16", |
| 468 | + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, |
462 | 469 | ) |
463 | 470 |
464 | 471 | # build ragged prompts to change shapes significantly across BS=1 vs BS=N |
@@ -682,6 +689,7 @@ def test_decode_logprobs_match_prefill_logprobs( |
682 | 689 | max_num_seqs=32, |
683 | 690 | max_model_len=8192, |
684 | 691 | dtype="bfloat16", |
| 692 | + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, |
685 | 693 | ) |
686 | 694 |
687 | 695 | # Use a few test prompts |
@@ -925,6 +933,8 @@ def LLM_with_max_seqs( |
925 | 933 | max_model_len=max_model_len, |
926 | 934 | dtype="bfloat16", |
927 | 935 | tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), |
| 936 | + enable_prefix_caching=False, |
| 937 | + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, |
928 | 938 | # Enable for MOE models |
929 | 939 | # enable_expert_parallel=True, |
930 | 940 | ) |
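The hunks above all apply the same knob: every `LLM` the tests construct now receives `enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90`, and the tests whose prompt shapes vary also pass `enable_prefix_caching=False`. The diff does not include the body of `is_device_capability_below_90`, so the following is only a plausible sketch of what that helper might check, assuming it wraps `torch.cuda.get_device_capability()`; the real helper in the test utilities may be implemented differently.

```python
import torch


def is_device_capability_below_90() -> bool:
    """Plausible sketch: report whether the active CUDA device is pre-Hopper (< SM 9.0)."""
    if not torch.cuda.is_available():
        # No GPU visible; leave the flag False so no extra constraints are applied.
        return False
    major, minor = torch.cuda.get_device_capability()
    return (major, minor) < (9, 0)


# Evaluated once at import time, mirroring the module-level constant added in the diff.
IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
```

Under that assumption, `enforce_eager` only becomes `True` on pre-SM90 GPUs, so those devices run in eager mode while Hopper-class GPUs keep the default execution path; disabling prefix caching keeps cached blocks from influencing the shape-sensitive batch-invariance comparisons.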