From c2ff41392225b712c76b40a18a0375606e7fe4a4 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 19 Feb 2026 17:55:14 +0545 Subject: [PATCH 1/5] Add pd-disaggregation docs --- examples/inference/sglang/README.md | 80 +++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md index 5b7dc640a..3d4595764 100644 --- a/examples/inference/sglang/README.md +++ b/examples/inference/sglang/README.md @@ -113,6 +113,86 @@ curl http://127.0.0.1:3000/proxy/services/main/deepseek-r1/v1/chat/completions \ > If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling or HTTPs, rate-limits, etc), the service endpoint will be available at `https://deepseek-r1./`. +## PD-Disaggregation + +To run PD-Disaggregated inference using SGLang Model Gateway. + +Create a SGLang-enabled gateway in the same network where prefill and decode workers will be deployed. Here we are using a Kubernetes cluster to ensure the gateway and workers share the same network. + +```yaml +type: gateway +name: gateway-name + +backend: kubernetes +region: any + +# This domain will be used to access the endpoint +domain: example.com +router: + type: sglang +``` + +After the gateway is ready, create a node group with at least two instances—one for the Prefill worker and one for the Decode worker—within the same Kubernetes cluster where the gateway is running. Then apply below service configuration to the GPU nodes. 
+ +```yaml +type: service +name: prefill-decode +image: lmsysorg/sglang:latest + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +replicas: + - count: 1..4 + scaling: + metric: rps + target: 3 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 \ + --disaggregation-bootstrap-port 8998 \ + --log-level debug \ + > worker-server.log 2>&1 + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 \ + --log-level debug \ + > worker-server.log 2>&1 + resources: + gpu: H200 + +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 + +# Custom probe is required for PD disaggregation +probes: + - type: http + url: /health_generate + interval: 15s + +router: + type: sglang + pd_disaggregation: true +``` + ## Source code The source-code of this example can be found in From f5a32c349c2c969b21cfd97713d048a1e61d7e21 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 19 Feb 2026 18:12:03 +0545 Subject: [PATCH 2/5] Add pd.dstack.yml file --- examples/inference/sglang/README.md | 12 ++---- examples/inference/sglang/pd.dstack.yml | 52 +++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 8 deletions(-) create mode 100644 examples/inference/sglang/pd.dstack.yml diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md index 3d4595764..c4ef2687d 100644 --- a/examples/inference/sglang/README.md +++ b/examples/inference/sglang/README.md @@ -156,9 +156,7 @@ replicas: --disaggregation-transfer-backend mooncake \ --host 0.0.0.0 \ --port 8000 \ - --disaggregation-bootstrap-port 8998 \ - --log-level debug \ - > worker-server.log 2>&1 + --disaggregation-bootstrap-port 8998 resources: gpu: H200 @@ -173,9 
+171,7 @@ replicas: --disaggregation-mode decode \ --disaggregation-transfer-backend mooncake \ --host 0.0.0.0 \ - --port 8000 \ - --log-level debug \ - > worker-server.log 2>&1 + --port 8000 resources: gpu: H200 @@ -195,8 +191,8 @@ router: ## Source code -The source-code of this example can be found in -[`examples/llms/deepseek/sglang`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/sglang). +The source-code of these examples can be found in +[`examples/llms/deepseek/sglang`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/sglang) and [`examples/inference/sglang`](https://github.com/dstackai/dstack/blob/master/examples/inference/sglang). ## What's next? diff --git a/examples/inference/sglang/pd.dstack.yml b/examples/inference/sglang/pd.dstack.yml new file mode 100644 index 000000000..714cb9cb1 --- /dev/null +++ b/examples/inference/sglang/pd.dstack.yml @@ -0,0 +1,52 @@ +type: service +name: prefill-decode-test +https: false +image: lmsysorg/sglang:latest + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +replicas: + - count: 1..2 + scaling: + metric: rps + target: 3 + commands: + - echo "Group Prefill" > /tmp/version.txt + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 \ + --disaggregation-bootstrap-port 8998 + resources: + gpu: 1 + + - count: 1 + commands: + - echo "Group Decode" > /tmp/version.txt + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 + resources: + gpu: 1 + +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 + +probes: + - type: http + url: /health_generate + interval: 15s + +router: + type: sglang + policy: round_robin + pd_disaggregation: true From 28e41a9c40851242d0f8676175569bd7109bd5cc Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 
19 Feb 2026 18:16:54 +0545 Subject: [PATCH 3/5] Minor update --- examples/inference/sglang/pd.dstack.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/inference/sglang/pd.dstack.yml b/examples/inference/sglang/pd.dstack.yml index 714cb9cb1..614d4e72b 100644 --- a/examples/inference/sglang/pd.dstack.yml +++ b/examples/inference/sglang/pd.dstack.yml @@ -1,6 +1,5 @@ type: service -name: prefill-decode-test -https: false +name: prefill-decode image: lmsysorg/sglang:latest env: @@ -8,12 +7,11 @@ env: - MODEL_ID=zai-org/GLM-4.5-Air-FP8 replicas: - - count: 1..2 + - count: 1..4 scaling: metric: rps target: 3 commands: - - echo "Group Prefill" > /tmp/version.txt - | python -m sglang.launch_server \ --model-path $MODEL_ID \ @@ -25,9 +23,11 @@ replicas: resources: gpu: 1 - - count: 1 + - count: 1..8 + scaling: + metric: rps + target: 2 commands: - - echo "Group Decode" > /tmp/version.txt - | python -m sglang.launch_server \ --model-path $MODEL_ID \ @@ -48,5 +48,4 @@ probes: router: type: sglang - policy: round_robin pd_disaggregation: true From 651c8d0d0fa959d10a2ab0619fd09475f249c182 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 19 Feb 2026 20:12:23 +0545 Subject: [PATCH 4/5] Update gateway and services docs --- docs/docs/concepts/gateways.md | 4 ++-- docs/docs/concepts/services.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 55573bd74..0b01dc156 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -119,9 +119,9 @@ router: * `round_robin` — Cycles through workers in order. -> Currently, services using this type of gateway must run standard SGLang workers. See the [example](../../examples/inference/sglang/index.md). +> Services using this type of gateway can run PD-disaggregated inference. 
To run PD disaggregation inference, refer to the [SGLang PD-Disaggregation](../../examples/inference/sglang/index.md#pd-disaggregation) example. > -> Support for prefill/decode disaggregation and auto-scaling based on inter-token latency is coming soon. +> Support for auto-scaling based on TTFT and ITL is coming soon. ### Public IP diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index d40984866..8983f1b51 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -231,7 +231,7 @@ Setting the minimum number of replicas to `0` allows the service to scale down t > Properties such as `regions`, `port`, `image`, `env` and some other cannot be configured per replica group. This support is coming soon. ??? info "Disaggregated serving" - Native support for disaggregated prefill and decode, allowing both worker types to run within a single service, is coming soon. + Replica groups support disaggregated prefill and decode, allowing both worker types to run within a single service. To run PD disaggregated inference, refer to the [SGLang PD-Disaggregation](../../examples/inference/sglang/index.md#pd-disaggregation) example. ### Authorization From f4bc18564ab41c9d2ff0427952a4ccfd6e8c37e6 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 20 Feb 2026 14:51:56 +0100 Subject: [PATCH 5/5] [Docs] Minor changes related to PD disaggregation --- docs/docs/concepts/gateways.md | 9 +++-- docs/docs/concepts/services.md | 7 +++- examples/inference/sglang/README.md | 57 ++++++++++++++++++----------- 3 files changed, 46 insertions(+), 27 deletions(-) diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 0b01dc156..6ed19c2a0 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -110,7 +110,11 @@ router: -!!! 
info "Policy" +If you configure the `sglang` router, [services](../concepts/services.md) can run either [standard SGLang workers](../../examples/inference/sglang/index.md) or [Prefill-Decode workers](../../examples/inference/sglang/index.md#pd-disaggregation) (aka PD disaggregation). + +> Note, if you want to run services with PD disaggregation, the gateway must currently run in the same cluster as the service. + +??? info "Policy" The `policy` property allows you to configure the routing policy: * `cache_aware` — Default policy; combines cache locality with load balancing, falling back to shortest queue. @@ -119,9 +123,6 @@ router: * `round_robin` — Cycles through workers in order. -> Services using this type of gateway can run PD-disaggregated inference. To run PD disaggregation inference, refer to the [SGLang PD-Disaggregation](../../examples/inference/sglang/index.md#pd-disaggregation) example. -> -> Support for auto-scaling based on TTFT and ITL is coming soon. ### Public IP diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 8983f1b51..1eb63dd01 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -182,6 +182,8 @@ Setting the minimum number of replicas to `0` allows the service to scale down t > The `scaling` property requires creating a [gateway](gateways.md). + + ??? info "Replica groups" A service can include multiple replica groups. Each group can define its own `commands`, `resources` requirements, and `scaling` rules. @@ -230,8 +232,9 @@ Setting the minimum number of replicas to `0` allows the service to scale down t > Properties such as `regions`, `port`, `image`, `env` and some other cannot be configured per replica group. This support is coming soon. -??? info "Disaggregated serving" - Replica groups support disaggregated prefill and decode, allowing both worker types to run within a single service. 
To run PD disaggregated inference, refer to the [SGLang PD-Disaggregation](../../examples/inference/sglang/index.md#pd-disaggregation) example. +### PD disaggregation + +If you create a gateway with the [`sglang` router](gateways.md#sglang), you can run SGLang with [Prefill-Decode disaggregation](https://docs.sglang.ai/advanced_features/pd_disaggregation.html). See the [corresponding example](../../examples/inference/sglang/index.md#pd-disaggregation). ### Authorization diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md index c4ef2687d..6549afe5c 100644 --- a/examples/inference/sglang/README.md +++ b/examples/inference/sglang/README.md @@ -9,7 +9,7 @@ This example shows how to deploy DeepSeek-R1-Distill-Llama 8B and 70B using [SGL ## Apply a configuration -Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B using SgLang. +Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B using SGLang. === "NVIDIA" @@ -108,31 +108,18 @@ curl http://127.0.0.1:3000/proxy/services/main/deepseek-r1/v1/chat/completions \ ``` -!!! info "SGLang Model Gateway" - If you'd like to use a custom routing policy, e.g. by leveraging the [SGLang Model Gateway](https://docs.sglang.ai/advanced_features/router.html#), create a gateway with `router` set to `sglang`. Check out [gateways](https://dstack.ai/docs/concepts/gateways#router) for more details. +!!! info "Router policy" + If you'd like to use a custom routing policy, create a gateway with `router` set to `sglang`. Check out [gateways](https://dstack.ai/docs/concepts/gateways#router) for more details. -> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling or HTTPs, rate-limits, etc), the service endpoint will be available at `https://deepseek-r1./`. +> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. 
to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://deepseek-r1./`. -## PD-Disaggregation +## Configuration options -To run PD-Disaggregated inference using SGLang Model Gateway. +### PD disaggregation -Create a SGLang-enabled gateway in the same network where prefill and decode workers will be deployed. Here we are using a Kubernetes cluster to ensure the gateway and workers share the same network. +If you create a gateway with the [`sglang` router](https://dstack.ai/docs/concepts/gateways/#sglang), you can run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/pd_disaggregation.html). -```yaml -type: gateway -name: gateway-name - -backend: kubernetes -region: any - -# This domain will be used to access the endpoint -domain: example.com -router: - type: sglang -``` - -After the gateway is ready, create a node group with at least two instances—one for the Prefill worker and one for the Decode worker—within the same Kubernetes cluster where the gateway is running. Then apply below service configuration to the GPU nodes. +
```yaml type: service @@ -189,6 +176,34 @@ router: pd_disaggregation: true ``` +
+ +Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon. + +#### Gateway + +Note, running services with PD disaggregation currently requires the gateway to run in the same cluster as the service. + +For example, if you run services on the `kubernetes` backend, make sure to also create the gateway in the same backend: + +
+ +```yaml +type: gateway +name: gateway-name + +backend: kubernetes +region: any + +domain: example.com +router: + type: sglang +``` + +
+ + + ## Source code The source-code of these examples can be found in