From 966d15aee14f6a12aa13132c4e2f9b1d32379e99 Mon Sep 17 00:00:00 2001
From: Bihan  Rana <bihan@Bihans-MacBook-Pro.local>
Date: Mon, 15 Jun 2026 20:16:32 +0545
Subject: [PATCH 1/4] [Docs]: PD Disaggregation gRPC workers

---
 mkdocs/docs/concepts/services.md         | 50 ++++++++-------
 mkdocs/docs/examples/inference/sglang.md |  2 -
 mkdocs/docs/examples/inference/vllm.md   | 78 ++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 24 deletions(-)
diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md
index 757546483..1f6dc5c4a 100644
--- a/mkdocs/docs/concepts/services.md
+++ b/mkdocs/docs/concepts/services.md
@@ -357,7 +357,6 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`:
     ```yaml
     type: service
     name: prefill-decode
-    image: lmsysorg/sglang:v0.5.10.post1
 
     env:
       - HF_TOKEN
@@ -365,62 +364,69 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`:
 
     replicas:
       - count: 1
-        # For now replica group with router must have count: 1
+        python: "3.12"
         commands:
           - pip install smg
           - |
             smg launch \
+              --enable-igw \
+              --pd-disaggregation \
+              --model-path $MODEL_ID \
               --host 0.0.0.0 \
               --port 8000 \
-              --pd-disaggregation \
               --prefill-policy cache_aware
-        resources:
-          cpu: 4
         router:
           type: sglang
+        resources:
+          cpu: 4
 
-      - count: 1..4
+      - count: 1..2
         scaling:
           metric: rps
-          target: 3
+          target: 300
+        image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10
         commands:
           - |
-            python -m sglang.launch_server \
+            python3 -m sglang.launch_server \
               --model-path $MODEL_ID \
+              --host 0.0.0.0 \
+              --port 8000 \
+              --grpc-mode \
               --disaggregation-mode prefill \
               --disaggregation-transfer-backend nixl \
-              --port 8000 \
               --disaggregation-bootstrap-port 8998
         resources:
           gpu: H200
 
-      - count: 1..8
+      - count: 1..4
         scaling:
           metric: rps
-          target: 2
+          target: 300
+        image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10
         commands:
           - |
-            python -m sglang.launch_server \
+            python3 -m sglang.launch_server \
               --model-path $MODEL_ID \
+              --host 0.0.0.0 \
+              --port 8000 \
+              --grpc-mode \
               --disaggregation-mode decode \
-              --disaggregation-transfer-backend nixl \
-              --port 8000
+              --disaggregation-transfer-backend nixl
         resources:
           gpu: H200
 
     port: 8000
-    model: zai-org/GLM-4.5-Air-FP8
-
-    # Custom probe is required for PD disaggregation.
-    probes:
-      - type: http
-        url: /health
-        interval: 15s
     ```
 
     </div>
 
-    > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon.
+    > With the `smg` router, workers communicate via gRPC as well as HTTP.
+    >
+    > On the router side, `--enable-igw` and `--model-path` are required for gRPC worker registration via HTTP endpoint. This is how `dstack` registers workers with SMG router.
+    >
+    > With SGLang gRPC workers, pass `--grpc-mode` to the worker launch command.To use [Mooncake Transfer](https://github.com/kvcache-ai/Mooncake), set `--disaggregation-transfer-backend mooncake`. For PD disaggregation with SGLang HTTP workers, see [SGLang PD Disaggregation](../examples/inference/sglang.md#pd-disaggregation).
+    >
+    > The SMG router supports only gRPC communication mode with vLLM workers. For PD disaggregation with vLLM, see [here](../examples/inference/vllm.md#pd-disaggregation).
 
 === "Dynamo"
 
diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md
index 1ea9e6e06..7c8004f9d 100644
--- a/mkdocs/docs/examples/inference/sglang.md
+++ b/mkdocs/docs/examples/inference/sglang.md
@@ -211,8 +211,6 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/
 
     </div>
 
-    > With the `sglang` router, you can use SGLang prefill and decode workers. Support for vLLM and TensorRT-LLM workers is coming soon.
-
 === "AMD"
 
     The example below deploys `Qwen/Qwen2.5-72B-Instruct` on a multi-node cluster with AMD MI300X GPUs:
diff --git a/mkdocs/docs/examples/inference/vllm.md b/mkdocs/docs/examples/inference/vllm.md
index dd6909ba6..4546ff4b3 100644
--- a/mkdocs/docs/examples/inference/vllm.md
+++ b/mkdocs/docs/examples/inference/vllm.md
@@ -124,6 +124,84 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
 
 > If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36.<gateway domain>/`.
 
+## Configuration options
+
+### PD disaggregation
+
+To run vLLM with [PD disaggregation](https://docs.vllm.ai/en/latest/serving/disagg_prefill.html), use replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers (`kv_producer`), and one for decode workers (`kv_consumer`).
+
+<div editor-title="pd.dstack.yml">
+
+```yaml
+type: service
+name: prefill-decode
+
+env:
+  - HF_TOKEN
+  - MODEL_ID=zai-org/GLM-4.5-Air-FP8
+
+replicas:
+  - count: 1
+    python: "3.12"
+    commands:
+      - pip install smg
+      - |
+        smg launch \
+          --pd-disaggregation \
+          --model-path $MODEL_ID \
+          --enable-igw \
+          --host 0.0.0.0 \
+          --port 8000 \
+          --prefill-policy cache_aware
+    router:
+      type: sglang
+    resources:
+      cpu: 4
+
+  - count: 1..4
+    scaling:
+      metric: rps
+      target: 3
+    image: ghcr.io/lightseekorg/smg:1.4.1-vllm-v0.18.0
+    commands:
+      - |
+        python3 -m vllm.entrypoints.grpc_server \
+          --model "$MODEL_ID" \
+          --host 0.0.0.0 \
+          --port 8000 \
+          --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
+    resources:
+      gpu: H200
+
+  - count: 1..8
+    scaling:
+      metric: rps
+      target: 2
+    image: ghcr.io/lightseekorg/smg:1.4.1-vllm-v0.18.0
+    commands:
+      - |
+        python3 -m vllm.entrypoints.grpc_server \
+          --model "$MODEL_ID" \
+          --host 0.0.0.0 \
+          --port 8000 \
+          --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
+    resources:
+      gpu: H200
+
+port: 8000
+```
+
+</div>
+
+> To use the [Mooncake Transfer](https://github.com/kvcache-ai/Mooncake) backend, set `"kv_connector": "MooncakeConnector"` in `--kv-transfer-config`.
+
+Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon.
+
+!!! info "Cluster"
+    PD disaggregation requires the service to run in a fleet with `placement` set to `cluster`, because the replicas require an interconnect between instances.
+
+    While the prefill and decode replicas run on GPUs, the router replica requires a CPU instance in the same cluster.
+
 ## What's next?
 
 1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md)

From 6b525767a8dbdfcc7f861a467e2de1fa59290c36 Mon Sep 17 00:00:00 2001
From: Bihan  Rana <bihan@Bihans-MacBook-Pro.local>
Date: Mon, 15 Jun 2026 20:24:27 +0545
Subject: [PATCH 2/4] Minor Update

---
 mkdocs/docs/concepts/services.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md
index 1f6dc5c4a..99bf961f9 100644
--- a/mkdocs/docs/concepts/services.md
+++ b/mkdocs/docs/concepts/services.md
@@ -364,6 +364,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`:
 
     replicas:
       - count: 1
+        # For now replica group with router must have count: 1
         python: "3.12"
         commands:
           - pip install smg

From 790f48351ab096cd8785858ff3e9cce8c056fdd6 Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov <andrey.cheptsov@github.com>
Date: Mon, 29 Jun 2026 12:20:50 +0200
Subject: [PATCH 3/4] [Docs] Simplify PD disaggregation gRPC docs

Keep services.md on the simple HTTP example, state the two worker
communication modes (HTTP/gRPC) in one line, and move the gRPC config
into a collapsed "gRPC mode" note on the SGLang and vLLM example pages.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mkdocs/docs/concepts/services.md         | 52 ++++++++--------
 mkdocs/docs/examples/inference/sglang.md | 75 ++++++++++++++++++++++++
 mkdocs/docs/examples/inference/vllm.md   |  2 +-
 3 files changed, 100 insertions(+), 29 deletions(-)

diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md
index 99bf961f9..7af900f41 100644
--- a/mkdocs/docs/concepts/services.md
+++ b/mkdocs/docs/concepts/services.md
@@ -357,6 +357,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`:
     ```yaml
     type: service
     name: prefill-decode
+    image: lmsysorg/sglang:v0.5.10.post1
 
     env:
       - HF_TOKEN
@@ -365,69 +366,64 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`:
     replicas:
       - count: 1
         # For now replica group with router must have count: 1
-        python: "3.12"
         commands:
           - pip install smg
           - |
             smg launch \
-              --enable-igw \
-              --pd-disaggregation \
-              --model-path $MODEL_ID \
               --host 0.0.0.0 \
               --port 8000 \
+              --pd-disaggregation \
               --prefill-policy cache_aware
-        router:
-          type: sglang
         resources:
           cpu: 4
+        router:
+          type: sglang
 
-      - count: 1..2
+      - count: 1..4
         scaling:
           metric: rps
-          target: 300
-        image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10
+          target: 3
         commands:
           - |
-            python3 -m sglang.launch_server \
+            python -m sglang.launch_server \
               --model-path $MODEL_ID \
-              --host 0.0.0.0 \
-              --port 8000 \
-              --grpc-mode \
               --disaggregation-mode prefill \
               --disaggregation-transfer-backend nixl \
+              --port 8000 \
               --disaggregation-bootstrap-port 8998
         resources:
           gpu: H200
 
-      - count: 1..4
+      - count: 1..8
         scaling:
           metric: rps
-          target: 300
-        image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10
+          target: 2
         commands:
           - |
-            python3 -m sglang.launch_server \
+            python -m sglang.launch_server \
               --model-path $MODEL_ID \
-              --host 0.0.0.0 \
-              --port 8000 \
-              --grpc-mode \
               --disaggregation-mode decode \
-              --disaggregation-transfer-backend nixl
+              --disaggregation-transfer-backend nixl \
+              --port 8000
         resources:
           gpu: H200
 
     port: 8000
+    model: zai-org/GLM-4.5-Air-FP8
+
+    # Custom probe is required for PD disaggregation.
+    probes:
+      - type: http
+        url: /health
+        interval: 15s
     ```
 
     </div>
 
-    > With the `smg` router, workers communicate via gRPC as well as HTTP.
-    >
-    > On the router side, `--enable-igw` and `--model-path` are required for gRPC worker registration via HTTP endpoint. This is how `dstack` registers workers with SMG router.
-    >
-    > With SGLang gRPC workers, pass `--grpc-mode` to the worker launch command.To use [Mooncake Transfer](https://github.com/kvcache-ai/Mooncake), set `--disaggregation-transfer-backend mooncake`. For PD disaggregation with SGLang HTTP workers, see [SGLang PD Disaggregation](../examples/inference/sglang.md#pd-disaggregation).
-    >
-    > The SMG router supports only gRPC communication mode with vLLM workers. For PD disaggregation with vLLM, see [here](../examples/inference/vllm.md#pd-disaggregation).
+    > SMG workers connect to the router over HTTP or gRPC. The example above uses HTTP. SGLang workers support both modes; vLLM workers support gRPC only.
+
+    ??? info "gRPC mode"
+        Over gRPC, workers run from an SMG worker image, and `smg launch` needs `--enable-igw` and `--model-path` so the router can register the workers. See the full configurations in [SGLang PD disaggregation](../examples/inference/sglang.md#pd-disaggregation) and [vLLM PD disaggregation](../examples/inference/vllm.md#pd-disaggregation).
 
 === "Dynamo"
 
diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md
index 7c8004f9d..0d386aa42 100644
--- a/mkdocs/docs/examples/inference/sglang.md
+++ b/mkdocs/docs/examples/inference/sglang.md
@@ -211,6 +211,81 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/
 
     </div>
 
+    ??? info "gRPC mode"
+
+        SGLang workers can also connect to the SMG router over gRPC. Use an SMG worker image, pass `--grpc-mode` to the worker, and add `--enable-igw` and `--model-path` to `smg launch` so the router can register the workers.
+
+        <div editor-title="pd-grpc.dstack.yml">
+
+        ```yaml
+        type: service
+        name: prefill-decode
+
+        env:
+          - HF_TOKEN
+          - MODEL_ID=zai-org/GLM-4.5-Air-FP8
+
+        replicas:
+          - count: 1
+            # For now, the replica group with the router must have count: 1
+            python: "3.12"
+            commands:
+              - pip install smg
+              - |
+                smg launch \
+                  --enable-igw \
+                  --pd-disaggregation \
+                  --model-path $MODEL_ID \
+                  --host 0.0.0.0 \
+                  --port 8000 \
+                  --prefill-policy cache_aware
+            router:
+              type: sglang
+            resources:
+              cpu: 4
+
+          - count: 1..4
+            scaling:
+              metric: rps
+              target: 3
+            image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10
+            commands:
+              - |
+                python3 -m sglang.launch_server \
+                  --model-path $MODEL_ID \
+                  --host 0.0.0.0 \
+                  --port 8000 \
+                  --grpc-mode \
+                  --disaggregation-mode prefill \
+                  --disaggregation-transfer-backend nixl \
+                  --disaggregation-bootstrap-port 8998
+            resources:
+              gpu: H200
+
+          - count: 1..8
+            scaling:
+              metric: rps
+              target: 2
+            image: ghcr.io/lightseekorg/smg:1.4.1-sglang-v0.5.10
+            commands:
+              - |
+                python3 -m sglang.launch_server \
+                  --model-path $MODEL_ID \
+                  --host 0.0.0.0 \
+                  --port 8000 \
+                  --grpc-mode \
+                  --disaggregation-mode decode \
+                  --disaggregation-transfer-backend nixl
+            resources:
+              gpu: H200
+
+        port: 8000
+        ```
+
+        </div>
+
+        To use the [Mooncake](https://github.com/kvcache-ai/Mooncake) transfer backend, set `--disaggregation-transfer-backend mooncake`.
+
 === "AMD"
 
     The example below deploys `Qwen/Qwen2.5-72B-Instruct` on a multi-node cluster with AMD MI300X GPUs:
diff --git a/mkdocs/docs/examples/inference/vllm.md b/mkdocs/docs/examples/inference/vllm.md
index 4546ff4b3..fe1575ed8 100644
--- a/mkdocs/docs/examples/inference/vllm.md
+++ b/mkdocs/docs/examples/inference/vllm.md
@@ -128,7 +128,7 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
 
 ### PD disaggregation
 
-To run vLLM with [PD disaggregation](https://docs.vllm.ai/en/latest/serving/disagg_prefill.html), use replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers (`kv_producer`), and one for decode workers (`kv_consumer`).
+To run vLLM with [PD disaggregation](https://docs.vllm.ai/en/latest/serving/disagg_prefill.html), use three replica groups: one for [Shepherd Model Gateway (SMG)](https://docs.sglang.io/advanced_features/sgl_model_gateway.html), one for prefill workers, and one for decode workers.
 
 <div editor-title="pd.dstack.yml">
 

From ab53b6b288ae42149acc79bdf5700dc5702cd2cf Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov <andrey.cheptsov@github.com>
Date: Mon, 29 Jun 2026 15:47:23 +0200
Subject: [PATCH 4/4] [Docs] Clarify SMG worker images in gRPC notes

Address review feedback: "SMG worker image" was ambiguous. Note that
gRPC workers run from SMG images bundling a specific backend version.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mkdocs/docs/concepts/services.md         | 2 +-
 mkdocs/docs/examples/inference/sglang.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mkdocs/docs/concepts/services.md b/mkdocs/docs/concepts/services.md
index 7af900f41..000ad7de8 100644
--- a/mkdocs/docs/concepts/services.md
+++ b/mkdocs/docs/concepts/services.md
@@ -423,7 +423,7 @@ Below is an example for running `zai-org/GLM-4.5-Air-FP8` on `H200`:
     > SMG workers connect to the router over HTTP or gRPC. The example above uses HTTP. SGLang workers support both modes; vLLM workers support gRPC only.
 
     ??? info "gRPC mode"
-        Over gRPC, workers run from an SMG worker image, and `smg launch` needs `--enable-igw` and `--model-path` so the router can register the workers. See the full configurations in [SGLang PD disaggregation](../examples/inference/sglang.md#pd-disaggregation) and [vLLM PD disaggregation](../examples/inference/vllm.md#pd-disaggregation).
+        Over gRPC, workers run from SMG images that bundle a specific backend version (SGLang or vLLM), and `smg launch` needs `--enable-igw` and `--model-path` so the router can register the workers. See the full configurations in [SGLang PD disaggregation](../examples/inference/sglang.md#pd-disaggregation) and [vLLM PD disaggregation](../examples/inference/vllm.md#pd-disaggregation).
 
 === "Dynamo"
 
diff --git a/mkdocs/docs/examples/inference/sglang.md b/mkdocs/docs/examples/inference/sglang.md
index 0d386aa42..5bf25ed5d 100644
--- a/mkdocs/docs/examples/inference/sglang.md
+++ b/mkdocs/docs/examples/inference/sglang.md
@@ -213,7 +213,7 @@ To run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/
 
     ??? info "gRPC mode"
 
-        SGLang workers can also connect to the SMG router over gRPC. Use an SMG worker image, pass `--grpc-mode` to the worker, and add `--enable-igw` and `--model-path` to `smg launch` so the router can register the workers.
+        SGLang workers can also connect to the SMG router over gRPC. Run the workers from an SMG image that bundles a specific SGLang version, pass `--grpc-mode` to the worker launch command, and add `--enable-igw` and `--model-path` to `smg launch` so the router can register the workers.
 
         <div editor-title="pd-grpc.dstack.yml">