Skip to content

Commit 60d1725

Browse files
authored
[Disagg] Support large batch size in proxy server and update NixlConnector doc for DP (#28782)
Signed-off-by: Ming Yang <minos.future@gmail.com>
1 parent 1fb632f commit 60d1725

File tree

3 files changed

+42
-4
lines changed

3 files changed

+42
-4
lines changed

docs/features/nixl_connector_usage.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,8 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
146146
--decoder-ports 8000 8000
147147
```
148148

149+
For multi-host DP deployments, you only need to provide the host/port of the head instances.
150+
149151
### KV Role Options
150152

151153
- **kv_producer**: For prefiller instances that generate KV caches

examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,21 @@ async def lifespan(app: FastAPI):
2626
)
2727

2828
app.state.prefill_client = httpx.AsyncClient(
29-
timeout=None, base_url=prefiller_base_url
29+
timeout=None,
30+
base_url=prefiller_base_url,
31+
limits=httpx.Limits(
32+
max_connections=None,
33+
max_keepalive_connections=None,
34+
),
35+
)
36+
app.state.decode_client = httpx.AsyncClient(
37+
timeout=None,
38+
base_url=decoder_base_url,
39+
limits=httpx.Limits(
40+
max_connections=None,
41+
max_keepalive_connections=None,
42+
),
3043
)
31-
app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url)
3244

3345
yield
3446

@@ -105,6 +117,11 @@ async def send_request_to_service(
105117
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
106118
response = await client.post(endpoint, json=req_data, headers=headers)
107119
response.raise_for_status()
120+
121+
# read/consume the response body to release the connection
122+
# otherwise, it would raise httpx.ReadError
123+
await response.aread()
124+
108125
return response
109126

110127

tests/v1/kv_connector/nixl_integration/toy_proxy_server.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,14 @@ async def lifespan(app: FastAPI):
3030
prefiller_base_url = f"http://{host}:{port}/v1"
3131
app.state.prefill_clients.append(
3232
{
33-
"client": httpx.AsyncClient(timeout=None, base_url=prefiller_base_url),
33+
"client": httpx.AsyncClient(
34+
timeout=None,
35+
base_url=prefiller_base_url,
36+
limits=httpx.Limits(
37+
max_connections=None,
38+
max_keepalive_connections=None,
39+
),
40+
),
3441
"host": host,
3542
"port": port,
3643
"id": i,
@@ -42,7 +49,14 @@ async def lifespan(app: FastAPI):
4249
decoder_base_url = f"http://{host}:{port}/v1"
4350
app.state.decode_clients.append(
4451
{
45-
"client": httpx.AsyncClient(timeout=None, base_url=decoder_base_url),
52+
"client": httpx.AsyncClient(
53+
timeout=None,
54+
base_url=decoder_base_url,
55+
limits=httpx.Limits(
56+
max_connections=None,
57+
max_keepalive_connections=None,
58+
),
59+
),
4660
"host": host,
4761
"port": port,
4862
"id": i,
@@ -169,6 +183,10 @@ async def send_request_to_service(
169183
)
170184
response.raise_for_status()
171185

186+
# read/consume the response body to release the connection
187+
# otherwise, it would raise httpx.ReadError
188+
await response.aread()
189+
172190
return response
173191

174192

@@ -206,6 +224,7 @@ async def _handle_completions(api: str, request: Request):
206224

207225
# Extract the needed fields
208226
response_json = response.json()
227+
await response.aclose() # CRITICAL: Release connection back to pool
209228
kv_transfer_params = response_json.get("kv_transfer_params", {})
210229
if kv_transfer_params:
211230
req_data["kv_transfer_params"] = kv_transfer_params

0 commit comments

Comments
 (0)