From 966d15aee14f6a12aa13132c4e2f9b1d32379e99 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Mon, 15 Jun 2026 20:16:32 +0545 Subject: [PATCH 1/4] [Docs]: PD Disaggregation gRPC workers --- mkdocs/docs/concepts/services.md | 50 ++++++++------- mkdocs/docs/examples/inference/sglang.md | 2 - mkdocs/docs/examples/inference/vllm.md | 78 ++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 24 deletions(-) diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md index 757546483..1f6dc5c4a 100644 --- a/mkdocs/docs/concepts/services.md +++ b/mkdocs/docs/concepts/services.md @@ -357,7 +357,6 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: ```yaml type: service name: prefill-decode - image: lmsysorg/sglang:v0.5.10.post1 env: - HF_TOKEN @@ -365,62 +364,69 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: replicas: - count: 1 - # For now replica group with router must have count: 1 + python: "3.12" commands: - pip install smg - | smg launch \ + --enable-igw \ + --pd-disaggregation \ + --model-path $MODEL_ID \ --host 0.0.0.0 \ --port 8000 \ - --pd-disaggregation \ --prefill-policy cache_aware - resources: - cpu: 4 router: type: sglang + resources: + cpu: 4 - - count: 1..4 + - count: 1..2 scaling: metric: rps - target: 3 + target: 300 + image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10 commands: - | - python -m sglang.launch_server \ + python3 -m sglang.launch_server \ --model-path $MODEL_ID \ + --host 0.0.0.0 \ + --port 8000 \ + --grpc-mode \ --disaggregation-mode prefill \ --disaggregation-transfer-backend nixl \ - --port 8000 \ --disaggregation-bootstrap-port 8998 resources: gpu: H200 - - count: 1..8 + - count: 1..4 scaling: metric: rps - target: 2 + target: 300 + image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10 commands: - | - python -m sglang.launch_server \ + python3 -m sglang.launch_server \ --model-path $MODEL_ID \ + --host 0.0.0.0 \ + --port 8000 \ + --grpc-mode \ --disaggregation-mode 
decode \ - --disaggregation-transfer-backend nixl \ - --port 8000 + --disaggregation-transfer-backend nixl resources: gpu: H200 port: 8000 - model: zai-org/GLM-4.5-Air-FP8 - - # Custom probe is required for PD disaggregation. - probes: - - type: http - url: /health - interval: 15s ``` - > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon. + > With the `smg` router, workers communicate via gRPC as well as HTTP. + > + > On the router side, `--enable-igw` and `--model-path` are required for gRPC worker registration via HTTP endpoint. This is how `dstack` registers workers with SMG router. + > + > With SGLang gRPC workers, pass `--grpc-mode` to the worker launch command.To use [Mooncake Transfer](https://github.com/kvcache-ai/Mooncake), set `--disaggregation-transfer-backend mooncake`. For PD disaggregation with SGLang HTTP workers, see [SGLang PD Disaggregation](../examples/inference/sglang.md#pd-disaggregation). + > + > The SMG router supports only gRPC communication mode with vLLM workers. For PD disaggregation with vLLM, see [here](../examples/inference/vllm.md#pd-disaggregation). === "Dynamo" diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md index 1ea9e6e06..7c8004f9d 100644 --- a/mkdocs/docs/examples/inference/sglang.md +++ b/mkdocs/docs/examples/inference/sglang.md @@ -211,8 +211,6 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/ - > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon. 
- === "AMD" The example below deploys `Qwen/Qwen2.5-72B-Instruct` on a multi-node cluster with AMD MI300X GPUs: diff --git a/mkdocs/docs/examples/inference/vllm.md b/mkdocs/docs/examples/inference/vllm.md index dd6909ba6..4546ff4b3 100644 --- a/mkdocs/docs/examples/inference/vllm.md +++ b/mkdocs/docs/examples/inference/vllm.md @@ -124,6 +124,84 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ > If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. +## Configuration options + +### PD disaggregation + +To run vLLM with [PD disaggregation](https://docs.vllm.ai/en/latest/serving/disagg_prefill.html), use replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers (`kv_producer`), and one for decode workers (`kv_consumer`). + +
+ +```yaml +type: service +name: prefill-decode + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +replicas: + - count: 1 + python: "3.12" + commands: + - pip install smg + - | + smg launch \ + --pd-disaggregation \ + --model-path $MODEL_ID \ + --enable-igw \ + --host 0.0.0.0 \ + --port 8000 \ + --prefill-policy cache_aware + router: + type: sglang + resources: + cpu: 4 + + - count: 1..4 + scaling: + metric: rps + target: 3 + image: ghcr.io/lightseekorg/smg:1.4.1-vllm-v0.18.0 + commands: + - | + python3 -m vllm.entrypoints.grpc_server \ + --model "$MODEL_ID" \ + --host 0.0.0.0 \ + --port 8000 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + image: ghcr.io/lightseekorg/smg:1.4.1-vllm-v0.18.0 + commands: + - | + python3 -m vllm.entrypoints.grpc_server \ + --model "$MODEL_ID" \ + --host 0.0.0.0 \ + --port 8000 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' + resources: + gpu: H200 + +port: 8000 +``` + +
+ +> To use the [Mooncake Transfer](https://github.com/kvcache-ai/Mooncake) backend, set `"kv_connector": "MooncakeConnector"` in `--kv-transfer-config`. + +Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon. + +!!! info "Cluster" + PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances. + + While the prefill and decode replicas run on GPUs, the router replica requires a CPU instance in the same cluster. + ## What's next? 1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) From 6b525767a8dbdfcc7f861a467e2de1fa59290c36 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Mon, 15 Jun 2026 20:24:27 +0545 Subject: [PATCH 2/4] Minor Update --- mkdocs/docs/concepts/services.md | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md index 1f6dc5c4a..99bf961f9 100644 --- a/mkdocs/docs/concepts/services.md +++ b/mkdocs/docs/concepts/services.md @@ -364,6 +364,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: replicas: - count: 1 + # For now replica group with router must have count: 1 python: "3.12" commands: - pip install smg From 790f48351ab096cd8785858ff3e9cce8c056fdd6 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Mon, 29 Jun 2026 12:20:50 +0200 Subject: [PATCH 3/4] [Docs] Simplify PD disaggregation gRPC docs Keep services.md on the simple HTTP example, state the two worker communication modes (HTTP/gRPC) in one line, and move the gRPC config into a collapsed "gRPC mode" note on the SGLang and vLLM example pages. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- mkdocs/docs/concepts/services.md | 52 ++++++++-------- mkdocs/docs/examples/inference/sglang.md | 75 ++++++++++++++++++++++++ mkdocs/docs/examples/inference/vllm.md | 2 +- 3 files changed, 100 insertions(+), 29 deletions(-) diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md index 99bf961f9..7af900f41 100644 --- a/mkdocs/docs/concepts/services.md +++ b/mkdocs/docs/concepts/services.md @@ -357,6 +357,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: ```yaml type: service name: prefill-decode + image: lmsysorg/sglang:v0.5.10.post1 env: - HF_TOKEN @@ -365,69 +366,64 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: replicas: - count: 1 # For now replica group with router must have count: 1 - python: "3.12" commands: - pip install smg - | smg launch \ - --enable-igw \ - --pd-disaggregation \ - --model-path $MODEL_ID \ --host 0.0.0.0 \ --port 8000 \ + --pd-disaggregation \ --prefill-policy cache_aware - router: - type: sglang resources: cpu: 4 + router: + type: sglang - - count: 1..2 + - count: 1..4 scaling: metric: rps - target: 300 - image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10 + target: 3 commands: - | - python3 -m sglang.launch_server \ + python -m sglang.launch_server \ --model-path $MODEL_ID \ - --host 0.0.0.0 \ - --port 8000 \ - --grpc-mode \ --disaggregation-mode prefill \ --disaggregation-transfer-backend nixl \ + --port 8000 \ --disaggregation-bootstrap-port 8998 resources: gpu: H200 - - count: 1..4 + - count: 1..8 scaling: metric: rps - target: 300 - image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10 + target: 2 commands: - | - python3 -m sglang.launch_server \ + python -m sglang.launch_server \ --model-path $MODEL_ID \ - --host 0.0.0.0 \ - --port 8000 \ - --grpc-mode \ --disaggregation-mode decode \ - --disaggregation-transfer-backend nixl + --disaggregation-transfer-backend nixl \ + --port 8000 resources: gpu: H200 port: 8000 
+ model: zai-org/GLM-4.5-Air-FP8 + + # Custom probe is required for PD disaggregation. + probes: + - type: http + url: /health + interval: 15s ``` - > With the `smg` router, workers communicate via gRPC as well as HTTP. - > - > On the router side, `--enable-igw` and `--model-path` are required for gRPC worker registration via HTTP endpoint. This is how `dstack` registers workers with SMG router. - > - > With SGLang gRPC workers, pass `--grpc-mode` to the worker launch command.To use [Mooncake Transfer](https://github.com/kvcache-ai/Mooncake), set `--disaggregation-transfer-backend mooncake`. For PD disaggregation with SGLang HTTP workers, see [SGLang PD Disaggregation](../examples/inference/sglang.md#pd-disaggregation). - > - > The SMG router supports only gRPC communication mode with vLLM workers. For PD disaggregation with vLLM, see [here](../examples/inference/vllm.md#pd-disaggregation). + > SMG workers connect to the router over HTTP or gRPC. The example above uses HTTP. SGLang workers support both modes; vLLM workers support gRPC only. + + ??? info "gRPC mode" + Over gRPC, workers run from an SMG worker image, and `smg launch` needs `--enable-igw` and `--model-path` so the router can register the workers. See the full configurations in [SGLang PD disaggregation](../examples/inference/sglang.md#pd-disaggregation) and [vLLM PD disaggregation](../examples/inference/vllm.md#pd-disaggregation). === "Dynamo" diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md index 7c8004f9d..0d386aa42 100644 --- a/mkdocs/docs/examples/inference/sglang.md +++ b/mkdocs/docs/examples/inference/sglang.md @@ -211,6 +211,81 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/ + ??? info "gRPC mode" + + SGLang workers can also connect to the SMG router over gRPC. 
Use an SMG worker image, pass `--grpc-mode` to the worker, and add `--enable-igw` and `--model-path` to `smg launch` so the router can register the workers. + +
+ + ```yaml + type: service + name: prefill-decode + + env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + + replicas: + - count: 1 + # For now replica group with router must have count: 1 + python: "3.12" + commands: + - pip install smg + - | + smg launch \ + --enable-igw \ + --pd-disaggregation \ + --model-path $MODEL_ID \ + --host 0.0.0.0 \ + --port 8000 \ + --prefill-policy cache_aware + router: + type: sglang + resources: + cpu: 4 + + - count: 1..4 + scaling: + metric: rps + target: 3 + image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10 + commands: + - | + python3 -m sglang.launch_server \ + --model-path $MODEL_ID \ + --host 0.0.0.0 \ + --port 8000 \ + --grpc-mode \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 8998 + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10 + commands: + - | + python3 -m sglang.launch_server \ + --model-path $MODEL_ID \ + --host 0.0.0.0 \ + --port 8000 \ + --grpc-mode \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl + resources: + gpu: H200 + + port: 8000 + ``` + +
+ + To use the [Mooncake](https://github.com/kvcache-ai/Mooncake) transfer backend, set `--disaggregation-transfer-backend mooncake`. + === "AMD" The example below deploys `Qwen/Qwen2.5-72B-Instruct` on a multi-node cluster with AMD MI300X GPUs: diff --git a/mkdocs/docs/examples/inference/vllm.md b/mkdocs/docs/examples/inference/vllm.md index 4546ff4b3..fe1575ed8 100644 --- a/mkdocs/docs/examples/inference/vllm.md +++ b/mkdocs/docs/examples/inference/vllm.md @@ -128,7 +128,7 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ ### PD disaggregation -To run vLLM with [PD disaggregation](https://docs.vllm.ai/en/latest/serving/disagg_prefill.html), use replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers (`kv_producer`), and one for decode workers (`kv_consumer`). +To run vLLM with [PD disaggregation](https://docs.vllm.ai/en/latest/serving/disagg_prefill.html), use replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers, and one for decode workers.
From ab53b6b288ae42149acc79bdf5700dc5702cd2cf Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Mon, 29 Jun 2026 15:47:23 +0200 Subject: [PATCH 4/4] [Docs] Clarify SMG worker images in gRPC notes Address review feedback: "SMG worker image" was ambiguous. Note that gRPC workers run from SMG images bundling a specific backend version. Co-Authored-By: Claude Opus 4.8 (1M context) --- mkdocs/docs/concepts/services.md | 2 +- mkdocs/docs/examples/inference/sglang.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md index 7af900f41..000ad7de8 100644 --- a/mkdocs/docs/concepts/services.md +++ b/mkdocs/docs/concepts/services.md @@ -423,7 +423,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`: > SMG workers connect to the router over HTTP or gRPC. The example above uses HTTP. SGLang workers support both modes; vLLM workers support gRPC only. ??? info "gRPC mode" - Over gRPC, workers run from an SMG worker image, and `smg launch` needs `--enable-igw` and `--model-path` so the router can register the workers. See the full configurations in [SGLang PD disaggregation](../examples/inference/sglang.md#pd-disaggregation) and [vLLM PD disaggregation](../examples/inference/vllm.md#pd-disaggregation). + Over gRPC, workers run from SMG images that bundle a specific backend version (SGLang or vLLM), and `smg launch` needs `--enable-igw` and `--model-path` so the router can register the workers. See the full configurations in [SGLang PD disaggregation](../examples/inference/sglang.md#pd-disaggregation) and [vLLM PD disaggregation](../examples/inference/vllm.md#pd-disaggregation). 
=== "Dynamo" diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md index 0d386aa42..5bf25ed5d 100644 --- a/mkdocs/docs/examples/inference/sglang.md +++ b/mkdocs/docs/examples/inference/sglang.md @@ -213,7 +213,7 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/ ??? info "gRPC mode" - SGLang workers can also connect to the SMG router over gRPC. Use an SMG worker image, pass `--grpc-mode` to the worker, and add `--enable-igw` and `--model-path` to `smg launch` so the router can register the workers. + SGLang workers can also connect to the SMG router over gRPC. Run the workers from an SMG image that bundles a specific SGLang version, pass `--grpc-mode` to the worker launch command, and add `--enable-igw` and `--model-path` to `smg launch` so the router can register them.