diff --git a/embodichain/gen_sim/prompt2scene/.gitignore b/embodichain/gen_sim/prompt2scene/.gitignore
new file mode 100644
index 00000000..75f4908e
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/.gitignore
@@ -0,0 +1,7 @@
+cli/preview*
+cli/export*
+agent_tools/servers/geometry_generation_server/*
+
+# Python cache
+__pycache__/
+*.py[cod]
diff --git a/embodichain/gen_sim/prompt2scene/__init__.py b/embodichain/gen_sim/prompt2scene/__init__.py
new file mode 100644
index 00000000..01ece10d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/__init__.py
@@ -0,0 +1,15 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
\ No newline at end of file
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/__init__.py
new file mode 100644
index 00000000..a4b11ff0
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/__init__.py
@@ -0,0 +1 @@
+"""Internal client + External server for agent tool calling."""
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/__init__.py
new file mode 100644
index 00000000..3afc32bd
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/__init__.py
@@ -0,0 +1,31 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.base import BaseHttpClient
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import ClientError
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+    DEFAULT_CLIENT_CONFIG_PATH,
+    load_client_config,
+)
+
+__all__ = [
+    "BaseHttpClient",
+    "ClientError",
+    "DEFAULT_CLIENT_CONFIG_PATH",
+    "load_client_config",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/base.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/base.py
new file mode 100644
index 00000000..8981602f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/base.py
@@ -0,0 +1,131 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import time
+from pathlib import Path
+from typing import Callable
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+    ClientError,
+    build_client_error,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+    load_client_config,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import (
+    log_api_request_start,
+    log_info,
+    log_warning,
+)
+
+__all__ = ["BaseHttpClient"]
+
+
+class BaseHttpClient:
+    """Shared HTTP client behavior for agent-tool service clients.
+
+    Reads one section of the client config, keeps a ``requests`` session, and
+    provides a health probe plus a retrying request runner for subclasses.
+    """
+
+    def __init__(
+        self,
+        *,
+        config_key: str,
+        server_name: str,
+        base_url: str | None = None,
+        timeout_s: int | None = None,
+        config_path: Path | None = None,
+        session: requests.Session | None = None,
+        trust_env: bool = True,
+    ) -> None:
+        """Initialize common service client fields from config.
+
+        Args:
+            config_key: Config section passed to ``load_client_config``.
+            server_name: Service name used in log and error messages.
+            base_url: Used instead of the configured ``base_url`` when truthy.
+            timeout_s: Used instead of the configured ``timeout_s`` when
+                truthy; 120 is the fallback when the config has none.
+            config_path: Config file path passed to ``load_client_config``.
+            session: Session to reuse; a new ``requests.Session`` is created
+                when this is falsy.
+            trust_env: Written to ``session.trust_env``, including on a
+                session supplied by the caller.
+        """
+        self.config = load_client_config(config_key, config_path)
+        self.server_name = server_name
+        # Trailing slashes are dropped so endpoint paths can be appended as-is.
+        self.base_url = (base_url or str(self.config["base_url"])).rstrip("/")
+        self.timeout_s = int(timeout_s or self.config.get("timeout_s", 120))
+        self.health_path = str(self.config.get("health_path", "/health"))
+        self.session = session or requests.Session()
+        self.session.trust_env = trust_env
+        log_info(f"{self.server_name} client initialized for {self.base_url}")
+
+    def health_check(self) -> bool:
+        """Check whether the configured service is healthy.
+
+        Returns:
+            ``True`` when a GET on ``base_url + health_path`` completes without
+            an HTTP error status; ``False`` (after logging a warning) when the
+            request raises anything.
+        """
+        try:
+            response = self.session.get(
+                f"{self.base_url}{self.health_path}",
+                timeout=5,
+            )
+            response.raise_for_status()
+            return True
+        except Exception as exc:
+            # Deliberately best-effort: a failed probe is reported, not raised.
+            log_warning(f"{self.server_name} health check failed: {exc}")
+            return False
+
+    def post_with_retries(
+        self,
+        request_fn: Callable[[], requests.Response],
+        *,
+        max_retries: int,
+        error_cls: type[ClientError] = ClientError,
+        request_label: str | None = None,
+    ) -> requests.Response | ClientError:
+        """Run a POST request function with retry and HTTP error handling.
+
+        Args:
+            request_fn: Zero-argument callable that performs the request. It
+                is called once per attempt.
+            max_retries: Total number of attempts; must be at least 1.
+            error_cls: Error dataclass type handed to ``build_client_error``
+                for HTTP error responses.
+            request_label: When given, each attempt is announced through
+                ``log_api_request_start``.
+
+        Returns:
+            The response of the first attempt whose ``raise_for_status``
+            passes, or an ``error_cls`` instance for an HTTP error response
+            (status below 500 immediately, 500 and above once attempts run out).
+
+        Raises:
+            ValueError: If ``max_retries`` is smaller than 1.
+            ConnectionError: If the last attempt fails to connect.
+            TimeoutError: If a timeout reaches the timeout handler.
+            RuntimeError: If an HTTP error carries no response object.
+        """
+        if max_retries < 1:
+            # With no attempts the loop below would be skipped entirely and the
+            # caller would only see a misleading "failed unexpectedly" error.
+            raise ValueError("max_retries must be at least 1.")
+
+        for attempt in range(max_retries):
+            try:
+                if request_label is not None:
+                    log_api_request_start(
+                        step=self.server_name,
+                        request=request_label,
+                        attempt=attempt + 1,
+                    )
+                response = request_fn()
+                response.raise_for_status()
+                return response
+
+            except requests.exceptions.ConnectionError as exc:
+                if attempt < max_retries - 1:
+                    log_warning(
+                        f"{self.server_name} connection failed; retrying "
+                        f"({attempt + 1}/{max_retries})."
+                    )
+                    # Exponential backoff (1s, 2s, 4s, ...) capped at 60 seconds.
+                    time.sleep(min(2**attempt, 60))
+                    continue
+                raise ConnectionError(
+                    f"Failed to connect to {self.server_name} at {self.base_url}"
+                ) from exc
+
+            except requests.exceptions.HTTPError as exc:
+                response = exc.response
+                if response is None:
+                    raise RuntimeError(f"{self.server_name} HTTP request failed.") from exc
+                # Only server-side errors are retried; other statuses are
+                # returned to the caller as an error object right away.
+                if response.status_code >= 500 and attempt < max_retries - 1:
+                    log_warning(
+                        f"{self.server_name} server error; retrying "
+                        f"({attempt + 1}/{max_retries})."
+                    )
+                    time.sleep(min(2**attempt, 60))
+                    continue
+                return build_client_error(
+                    response,
+                    server_name=self.server_name,
+                    error_cls=error_cls,
+                )
+
+            except requests.exceptions.Timeout as exc:
+                # Timeouts that reach this handler are not retried.
+                raise TimeoutError(f"{self.server_name} request timed out.") from exc
+
+        # Defensive: every attempt above returns, continues, or raises.
+        raise RuntimeError(f"{self.server_name} request failed unexpectedly.")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/common.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/common.py
new file mode 100644
index 00000000..f1c7dc69
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/common.py
@@ -0,0 +1,139 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import requests
+
+__all__ = [
+    "ClientError",
+    "build_client_error",
+    "first_string",
+    "format_http_error",
+    "parse_error_response",
+    "parse_json_object_response",
+    "validate_required_strings",
+    "validate_png_response",
+]
+
+
+@dataclass(frozen=True)
+class ClientError:
+    """Common HTTP client error response.
+
+    Immutable record of a failed HTTP call. ``build_client_error`` in this
+    module fills every field from a ``requests.Response``.
+    """
+
+    # Human-readable description of the failure.
+    error_message: str
+    # HTTP status code of the failing response, when one is known.
+    status_code: int | None = None
+    # Value of the response ``Content-Type`` header, when present.
+    content_type: str | None = None
+    # Copy of the response headers; a fresh empty dict per instance by default.
+    headers: dict[str, str] = field(default_factory=dict)
+    # Parsed JSON object body of the error response, if it had one.
+    raw_response: dict[str, Any] | None = None
+
+
+def validate_png_response(
+    response: requests.Response,
+    png_bytes: bytes,
+) -> None:
+    """Ensure a response declares PNG content and carries a PNG signature.
+
+    Args:
+        response: Response whose ``Content-Type`` header is inspected.
+        png_bytes: Body bytes expected to start with the PNG signature.
+
+    Raises:
+        RuntimeError: If the header does not mention ``image/png`` or the
+            bytes do not start with the PNG signature.
+    """
+    declared_type = response.headers.get("Content-Type", "")
+    declares_png = "image/png" in declared_type.lower()
+    if not declares_png:
+        shown_type = declared_type if declared_type else "unknown"
+        raise RuntimeError(
+            f"Image generation server returned non-PNG content: {shown_type}"
+        )
+
+    png_signature = b"\x89PNG\r\n\x1a\n"
+    if png_bytes[: len(png_signature)] != png_signature:
+        raise RuntimeError("Image generation server returned invalid PNG bytes.")
+
+
+def validate_required_strings(fields: dict[str, object]) -> None:
+    """Validate required client request string fields.
+
+    Args:
+        fields: Mapping of a display name to the value that must be present.
+            Values are stringified, so ``Path`` objects are accepted.
+
+    Raises:
+        ValueError: If a value is ``None`` or is blank once stringified and
+            stripped; the message starts with the field's display name.
+    """
+    for field_name, value in fields.items():
+        # ``str(None)`` is the non-empty text "None", so a missing value has to
+        # be rejected explicitly before the blank-string check.
+        if value is None or not str(value).strip():
+            raise ValueError(f"{field_name} must be non-empty.")
+
+
+def format_http_error(response: requests.Response, *, server_name: str) -> str:
+    """Format an HTTP error response from an agent-tool server.
+
+    Args:
+        response: The failing response.
+        server_name: Service name used as the message prefix.
+
+    Returns:
+        ``"<server_name> error: <message>"`` when the JSON object body holds a
+        string under ``error``, ``error_message``, ``message`` or ``detail``
+        (checked in that order); otherwise
+        ``"<server_name> HTTP error: <status_code>"``.
+    """
+    fallback = f"{server_name} HTTP error: {response.status_code}"
+    try:
+        response_data = response.json()
+    except ValueError:
+        return fallback
+
+    # A valid JSON body may still be a list, string or number; only objects
+    # can be searched for an error message.
+    if not isinstance(response_data, dict):
+        return fallback
+
+    error_message = first_string(
+        response_data,
+        "error",
+        "error_message",
+        "message",
+        "detail",
+    )
+    if error_message:
+        return f"{server_name} error: {error_message}"
+    return fallback
+
+
+def parse_error_response(response: requests.Response) -> dict[str, Any] | None:
+    """Return the error body as a dict, or ``None`` when that is not possible.
+
+    ``None`` covers both a body that is not valid JSON and a JSON body that
+    is not an object.
+    """
+    try:
+        payload = response.json()
+    except ValueError:
+        return None
+    if isinstance(payload, dict):
+        return payload
+    return None
+
+
+def build_client_error(
+    response: requests.Response,
+    *,
+    server_name: str,
+    error_cls: type[ClientError] = ClientError,
+) -> ClientError:
+    """Turn a failing HTTP response into an ``error_cls`` instance.
+
+    Args:
+        response: The failing response.
+        server_name: Service name used when formatting the message.
+        error_cls: Dataclass type to instantiate; it must accept the
+            ``ClientError`` field names as keyword arguments.
+
+    Returns:
+        The populated error object.
+    """
+    message = format_http_error(response, server_name=server_name)
+    status = response.status_code
+    declared_type = response.headers.get("Content-Type")
+    header_copy = dict(response.headers)
+    parsed_body = parse_error_response(response)
+    return error_cls(
+        error_message=message,
+        status_code=status,
+        content_type=declared_type,
+        headers=header_copy,
+        raw_response=parsed_body,
+    )
+
+
+def parse_json_object_response(
+    response: requests.Response,
+    *,
+    server_name: str,
+) -> dict[str, Any]:
+    """Decode a response body that is required to be a JSON object.
+
+    Args:
+        response: Response to decode.
+        server_name: Service name used in error messages.
+
+    Returns:
+        The decoded JSON object.
+
+    Raises:
+        RuntimeError: If the body is not valid JSON or is not an object.
+    """
+    try:
+        payload = response.json()
+    except ValueError as exc:
+        reported_type = response.headers.get("Content-Type") or "unknown"
+        raise RuntimeError(
+            f"{server_name} returned invalid JSON content: {reported_type}"
+        ) from exc
+
+    if isinstance(payload, dict):
+        return payload
+    raise RuntimeError(f"{server_name} response must be a JSON object.")
+
+
+def first_string(data: dict[str, Any], *keys: str) -> str | None:
+    """Look up ``keys`` in order and return the first value that is a string.
+
+    Returns ``None`` when no key maps to a string (missing keys and
+    non-string values are skipped).
+    """
+    looked_up = (data.get(key) for key in keys)
+    return next(
+        (candidate for candidate in looked_up if isinstance(candidate, str)),
+        None,
+    )
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/config.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/config.py
new file mode 100644
index 00000000..5592806a
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/config.py
@@ -0,0 +1,50 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+__all__ = ["DEFAULT_CLIENT_CONFIG_PATH", "load_client_config"]
+
+# Default client config file, located relative to this module:
+# ``parents[2]`` walks up from ``clients/`` through ``agent_tools/`` to the
+# ``prompt2scene`` package directory, which holds ``configs/client_config.json``.
+DEFAULT_CLIENT_CONFIG_PATH = (
+    Path(__file__).resolve().parents[2] / "configs" / "client_config.json"
+)
+
+
+def load_client_config(
+    config_key: str,
+    config_path: Path | None = None,
+) -> dict[str, Any]:
+    """Load one agent-tool client config section.
+
+    Args:
+        config_key: Name of the top-level section to return.
+        config_path: JSON config file to read; ``DEFAULT_CLIENT_CONFIG_PATH``
+            is used when this is ``None``.
+
+    Returns:
+        The section stored under ``config_key``.
+
+    Raises:
+        FileNotFoundError: If the config file does not exist.
+        ValueError: If the file's top-level JSON value is not an object, the
+            section is missing or not an object, or it has no ``base_url``.
+    """
+    resolved_config_path = (config_path or DEFAULT_CLIENT_CONFIG_PATH).resolve()
+    if not resolved_config_path.is_file():
+        raise FileNotFoundError(f"Client config not found: {resolved_config_path}")
+
+    with resolved_config_path.open("r", encoding="utf-8") as f:
+        raw_config = json.load(f)
+
+    # Valid JSON can be a list or scalar, which has no ``.get``; report that as
+    # a config problem instead of failing with AttributeError.
+    if not isinstance(raw_config, dict):
+        raise ValueError(
+            f"Client config {resolved_config_path} must contain a JSON object."
+        )
+
+    config = raw_config.get(config_key)
+    if not isinstance(config, dict):
+        raise ValueError(
+            f"Client config section {config_key!r} not found in "
+            f"{resolved_config_path}"
+        )
+    if not config.get("base_url"):
+        raise ValueError(f"Client config section {config_key!r} requires base_url.")
+    return config
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/__init__.py
new file mode 100644
index 00000000..3fa63f3b
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/__init__.py
@@ -0,0 +1,49 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.client import (
+    GeometryGenerationClient,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.schemas import (
+    GeometryGenerationError,
+    GeometryGenerationResult,
+    GeometryGenerationServerRequest,
+    GeometryGenerationServerResponse,
+    MultiObjectGenerationError,
+    MultiObjectGenerationObject,
+    MultiObjectGenerationResult,
+    MultiObjectGenerationServerRequest,
+    MultiObjectGenerationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+    DEFAULT_CLIENT_CONFIG_PATH,
+)
+
+__all__ = [
+    "DEFAULT_CLIENT_CONFIG_PATH",
+    "GeometryGenerationClient",
+    "GeometryGenerationError",
+    "GeometryGenerationResult",
+    "GeometryGenerationServerRequest",
+    "GeometryGenerationServerResponse",
+    "MultiObjectGenerationError",
+    "MultiObjectGenerationObject",
+    "MultiObjectGenerationResult",
+    "MultiObjectGenerationServerRequest",
+    "MultiObjectGenerationServerResponse",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/client.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/client.py
new file mode 100644
index 00000000..0615c6d2
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/client.py
@@ -0,0 +1,213 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Client for the SAM3D geometry generation server."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.base import BaseHttpClient
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+    validate_required_strings,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+    DEFAULT_CLIENT_CONFIG_PATH,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.parser import (
+    parse_geometry_generation_response,
+    parse_multi_object_generation_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.schemas import (
+    GeometryGenerationError,
+    GeometryGenerationResult,
+    GeometryGenerationServerRequest,
+    GeometryGenerationServerResponse,
+    MultiObjectGenerationError,
+    MultiObjectGenerationObject,
+    MultiObjectGenerationServerRequest,
+    MultiObjectGenerationServerResponse,
+)
+
+__all__ = [
+    "DEFAULT_CLIENT_CONFIG_PATH",
+    "GeometryGenerationClient",
+    "GeometryGenerationError",
+    "GeometryGenerationResult",
+    "GeometryGenerationServerRequest",
+    "GeometryGenerationServerResponse",
+    "MultiObjectGenerationError",
+    "MultiObjectGenerationObject",
+    "MultiObjectGenerationServerRequest",
+    "MultiObjectGenerationServerResponse",
+]
+
+
+class GeometryGenerationClient(BaseHttpClient):
+    """Client for SAM3D geometry generation requests.
+
+    Covers both the single-object endpoint (``generate``) and the
+    multi-object endpoint (``generate_multiple_objects``).
+    """
+
+    def __init__(
+        self,
+        *,
+        base_url: str | None = None,
+        timeout_s: int | None = None,
+        config_path: Path | None = None,
+        config_key: str = "sam3d_generation",
+        session: requests.Session | None = None,
+    ) -> None:
+        """Set up the client and read the endpoint paths from the config.
+
+        All arguments are forwarded to ``BaseHttpClient``; the session is
+        always configured with ``trust_env=False``.
+        """
+        super().__init__(
+            config_key=config_key,
+            server_name="Geometry generation server",
+            base_url=base_url,
+            timeout_s=timeout_s,
+            config_path=config_path,
+            session=session,
+            trust_env=False,
+        )
+        settings = self.config
+        single_path = settings.get(
+            "generate_single_object_path", "/generate_single_object"
+        )
+        multi_path = settings.get(
+            "generate_multiple_objects_path", "/generate_multiple_objects"
+        )
+        self.generate_single_object_path = str(single_path)
+        self.generate_multiple_objects_path = str(multi_path)
+
+    def generate(
+        self,
+        request: GeometryGenerationServerRequest,
+        *,
+        max_retries: int = 3,
+    ) -> GeometryGenerationServerResponse | GeometryGenerationError:
+        """Request one GLB mesh for an object image and store it locally.
+
+        Args:
+            request: Input image path and destination GLB path.
+            max_retries: Attempt count handed to ``post_with_retries``.
+
+        Returns:
+            The parsed success response, or a ``GeometryGenerationError``
+            when the server answered with an HTTP error.
+        """
+        _validate_request(request)
+        endpoint = f"{self.base_url}{self.generate_single_object_path}"
+
+        def send() -> requests.Response:
+            return _post_geometry_generation_request(self, endpoint, request)
+
+        outcome = self.post_with_retries(
+            send,
+            max_retries=max_retries,
+            error_cls=GeometryGenerationError,
+            request_label="geometry_generation",
+        )
+        if not isinstance(outcome, GeometryGenerationError):
+            return parse_geometry_generation_response(outcome, request)
+        return outcome
+
+    def generate_multiple_objects(
+        self,
+        request: MultiObjectGenerationServerRequest,
+        *,
+        output_dir: Path | None = None,
+        max_retries: int = 3,
+    ) -> MultiObjectGenerationServerResponse | MultiObjectGenerationError:
+        """Request GLB meshes for several masked objects in one image.
+
+        Args:
+            request: Input image path plus one mask path per object.
+            output_dir: Forwarded to the response parser, which downloads the
+                GLB files there when it is given.
+            max_retries: Attempt count handed to ``post_with_retries``.
+
+        Returns:
+            The parsed success response, or a ``MultiObjectGenerationError``
+            when the server answered with an HTTP error.
+        """
+        _validate_multi_object_request(request)
+        endpoint = f"{self.base_url}{self.generate_multiple_objects_path}"
+
+        def send() -> requests.Response:
+            return _post_multi_object_generation_request(self, endpoint, request)
+
+        outcome = self.post_with_retries(
+            send,
+            max_retries=max_retries,
+            error_cls=MultiObjectGenerationError,
+            request_label="multi_object_geometry_generation",
+        )
+        if not isinstance(outcome, MultiObjectGenerationError):
+            return parse_multi_object_generation_response(
+                outcome,
+                self.base_url,
+                output_dir=output_dir,
+                session=self.session,
+            )
+        return outcome
+
+
+def _validate_request(request: GeometryGenerationServerRequest) -> None:
+    """Check a single-object request before anything is sent.
+
+    Raises:
+        ValueError: If a path is blank or ``output_path`` does not end in
+            ``.glb`` (case-insensitive).
+        FileNotFoundError: If the input image is not an existing file.
+    """
+    required = {
+        "Geometry generation image_path": request.image_path,
+        "Geometry generation output_path": request.output_path,
+    }
+    validate_required_strings(required)
+
+    source_image = Path(request.image_path).expanduser()
+    if not source_image.is_file():
+        raise FileNotFoundError(f"Geometry generation input not found: {source_image}")
+
+    targets_glb = str(request.output_path).lower().endswith(".glb")
+    if not targets_glb:
+        raise ValueError("Geometry generation output_path must be a GLB file path.")
+
+
+def _post_geometry_generation_request(
+    client: GeometryGenerationClient,
+    url: str,
+    request: GeometryGenerationServerRequest,
+) -> requests.Response:
+    """POST the request image to ``url`` as a multipart ``image`` upload.
+
+    The image file stays open only for the duration of the call. The timeout
+    is a (connect, read) pair: 10 seconds and ``client.timeout_s``.
+    """
+    upload_name = Path(request.image_path).name
+    with _open_image_file(request.image_path) as image_handle:
+        form_fields = request.to_form_data()
+        multipart = {"image": (upload_name, image_handle)}
+        return client.session.post(
+            url,
+            data=form_fields,
+            files=multipart,
+            timeout=(10, client.timeout_s),
+        )
+
+
+def _open_image_file(image_path: str | Path) -> Any:
+    """Open ``image_path`` for binary reading after expanding ``~`` and resolving it.
+
+    The caller is responsible for closing the returned file object.
+    """
+    resolved_path = Path(image_path).expanduser().resolve()
+    return resolved_path.open("rb")
+
+
+def _validate_multi_object_request(
+    request: MultiObjectGenerationServerRequest,
+) -> None:
+    """Check a multi-object request before anything is sent.
+
+    Raises:
+        ValueError: If ``image_path`` is blank or ``mask_paths`` is empty.
+        FileNotFoundError: If the image or any mask is not an existing file;
+            the first missing mask in list order is reported.
+    """
+    validate_required_strings(
+        {"Multi-object geometry generation image_path": request.image_path}
+    )
+
+    source_image = Path(request.image_path).expanduser()
+    if not source_image.is_file():
+        raise FileNotFoundError(
+            f"Multi-object geometry generation input not found: {source_image}"
+        )
+
+    if not request.mask_paths:
+        raise ValueError("mask_paths must be non-empty.")
+
+    missing_masks = (
+        candidate
+        for candidate in request.mask_paths
+        if not Path(candidate).expanduser().is_file()
+    )
+    first_missing = next(missing_masks, None)
+    if first_missing is not None:
+        raise FileNotFoundError(
+            f"Multi-object geometry mask not found: {first_missing}"
+        )
+
+
+def _post_multi_object_generation_request(
+    client: GeometryGenerationClient,
+    url: str,
+    request: MultiObjectGenerationServerRequest,
+) -> requests.Response:
+    """POST the image and all masks to ``url`` as one multipart upload.
+
+    The image is sent under the ``image`` field, followed by one ``masks``
+    part per entry of ``request.mask_paths`` in order. Every file opened here
+    is closed before returning, including when opening a later file or the
+    request itself fails.
+
+    Args:
+        client: Client providing the session and read timeout.
+        url: Full endpoint URL.
+        request: Request holding the image path, mask paths and form fields.
+
+    Returns:
+        The raw HTTP response.
+    """
+    from contextlib import ExitStack
+
+    # ExitStack closes whatever was opened so far, so neither the image handle
+    # nor a partially opened list of masks can leak.
+    with ExitStack() as stack:
+        image_file = stack.enter_context(_open_image_file(request.image_path))
+        files = [("image", (Path(request.image_path).name, image_file))]
+        for mask_path in request.mask_paths:
+            mask_file = stack.enter_context(_open_image_file(mask_path))
+            files.append(("masks", (Path(mask_path).name, mask_file)))
+        return client.session.post(
+            url,
+            data=request.to_form_data(),
+            files=files,
+            timeout=(10, client.timeout_s),
+        )
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/parser.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/parser.py
new file mode 100644
index 00000000..4d3c0967
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/parser.py
@@ -0,0 +1,255 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.schemas import (
+    GeometryGenerationResult,
+    GeometryGenerationServerRequest,
+    GeometryGenerationServerResponse,
+    MultiObjectGenerationObject,
+    MultiObjectGenerationResult,
+    MultiObjectGenerationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+
+__all__ = ["parse_geometry_generation_response", "parse_multi_object_generation_response"]
+
+
+def parse_geometry_generation_response(
+    response: requests.Response,
+    request: GeometryGenerationServerRequest,
+) -> GeometryGenerationServerResponse:
+    """Validate the GLB body, write it to ``request.output_path``, and wrap it.
+
+    Returns:
+        A success response whose result points at the written GLB file and
+        which carries the HTTP status code, content type and headers.
+    """
+    payload = response.content
+    _validate_glb_response(response, payload)
+    saved_path = _write_glb_output(request, payload)
+    return GeometryGenerationServerResponse(
+        ok=True,
+        status="ok",
+        result=GeometryGenerationResult(geometry_path=str(saved_path)),
+        status_code=response.status_code,
+        content_type=response.headers.get("Content-Type"),
+        headers=dict(response.headers),
+    )
+
+
+def _validate_glb_response(
+    response: requests.Response,
+    glb_bytes: bytes,
+) -> None:
+    """Raise ``RuntimeError`` unless ``glb_bytes`` starts with the ``glTF`` magic.
+
+    The error message reports the response ``Content-Type`` header, or
+    ``unknown`` when it is missing or empty.
+    """
+    if glb_bytes.startswith(b"glTF"):
+        return
+    reported_type = response.headers.get("Content-Type", "")
+    raise RuntimeError(
+        "Geometry generation server returned invalid GLB content: "
+        f"{reported_type or 'unknown'}"
+    )
+
+
+def _write_glb_output(
+    request: GeometryGenerationServerRequest,
+    glb_bytes: bytes,
+) -> Path:
+    """Write ``glb_bytes`` to the request's output path and return that path.
+
+    The path is expanded and resolved, and missing parent directories are
+    created first.
+
+    Raises:
+        FileNotFoundError: If the file does not exist after writing.
+    """
+    target = Path(request.output_path).expanduser().resolve()
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_bytes(glb_bytes)
+    if target.is_file():
+        log_info(f"Generated geometry written: {target}")
+        return target
+    raise FileNotFoundError(f"Generated geometry was not written: {target}")
+
+
+def parse_multi_object_generation_response(
+    response: requests.Response,
+    base_url: str,
+    *,
+    output_dir: Path | None = None,
+    session: requests.Session | None = None,
+) -> MultiObjectGenerationServerResponse:
+    """Parse the multi-object JSON response into a typed success response.
+
+    Args:
+        response: Successful HTTP response from the multi-object endpoint.
+        base_url: Server base URL used to resolve mesh references.
+        output_dir: When given, each GLB is downloaded into this directory.
+        session: Session used for the downloads.
+
+    Returns:
+        The typed response with one entry per generated object.
+
+    Raises:
+        RuntimeError: If ``ok`` is not the boolean ``True`` or the ``result``
+            object is missing.
+    """
+    body = _parse_json_body(response)
+
+    ok_flag = body.get("ok", False)
+    if not (isinstance(ok_flag, bool) and ok_flag):
+        reason = body.get("error", "ok is not true")
+        raise RuntimeError(f"Multi-object geometry generation failed: {reason}")
+
+    result_section = body.get("result")
+    if not isinstance(result_section, dict):
+        raise RuntimeError(
+            "Multi-object geometry generation response missing 'result' object"
+        )
+
+    parsed_objects = _parse_multi_object_items(
+        result_section,
+        base_url.rstrip("/"),
+        output_dir=output_dir,
+        session=session,
+    )
+    status_text = str(body.get("status") or "ok")
+    return MultiObjectGenerationServerResponse(
+        ok=True,
+        status=status_text,
+        result=MultiObjectGenerationResult(objects=parsed_objects),
+        status_code=response.status_code,
+        content_type=response.headers.get("Content-Type"),
+        headers=dict(response.headers),
+    )
+
+
+def _parse_multi_object_items(
+    body: dict[str, object],
+    base_url: str,
+    *,
+    output_dir: Path | None,
+    session: requests.Session | None,
+) -> list[MultiObjectGenerationObject]:
+    """Parse every entry of ``body["objects"]`` in order.
+
+    Raises:
+        RuntimeError: If ``objects`` is not a non-empty list.
+    """
+    raw_items = body.get("objects")
+    if not (isinstance(raw_items, list) and raw_items):
+        raise RuntimeError(
+            "Multi-object geometry generation response missing 'result.objects' list"
+        )
+
+    parsed: list[MultiObjectGenerationObject] = []
+    for position, entry in enumerate(raw_items):
+        parsed.append(
+            _parse_multi_object_item(
+                entry,
+                index=position,
+                base_url=base_url,
+                output_dir=output_dir,
+                session=session,
+            )
+        )
+    return parsed
+
+
+def _parse_multi_object_item(
+    item: object,
+    *,
+    index: int,
+    base_url: str,
+    output_dir: Path | None,
+    session: requests.Session | None,
+) -> MultiObjectGenerationObject:
+    """Convert one entry of the response ``objects`` list into a typed object.
+
+    The name falls back from the entry's ``name`` to the stem of the mesh
+    path and finally to ``index``. The mesh is resolved (and downloaded when
+    ``output_dir`` is set) before the pose fields are validated.
+
+    Raises:
+        RuntimeError: If the entry is not a JSON object, has no ``mesh``
+            string, or a pose field has the wrong length or content.
+    """
+    if not isinstance(item, dict):
+        raise RuntimeError(f"Multi-object item {index} must be a JSON object")
+
+    mesh_ref = item.get("mesh")
+    if not (isinstance(mesh_ref, str) and mesh_ref):
+        raise RuntimeError(f"Multi-object item {index} missing 'mesh'")
+
+    label = str(item.get("name") or Path(mesh_ref).stem or index)
+    mesh_location = _resolve_or_download_glb(
+        base_url,
+        mesh_ref,
+        name=label,
+        index=index,
+        output_dir=output_dir,
+        session=session,
+    )
+
+    rotation = _float_list(
+        item.get("rotation_quaternion_wxyz"),
+        expected_len=4,
+        field_name=f"objects[{index}].rotation_quaternion_wxyz",
+    )
+    position = _float_list(
+        item.get("translation"),
+        expected_len=3,
+        field_name=f"objects[{index}].translation",
+    )
+    scaling = _float_list(
+        item.get("scale"),
+        expected_len=3,
+        field_name=f"objects[{index}].scale",
+    )
+    return MultiObjectGenerationObject(
+        name=label,
+        geometry_path=mesh_location,
+        rotation_quaternion_wxyz=rotation,
+        translation=position,
+        scale=scaling,
+    )
+
+
+def _resolve_or_download_glb(
+    base_url: str,
+    mesh_rel_path: str,
+    *,
+    name: str,
+    index: int,
+    output_dir: Path | None,
+    session: requests.Session | None,
+) -> str:
+    """Return the mesh URL, or download the mesh and return its local path.
+
+    Args:
+        base_url: Server base URL the mesh reference is joined onto.
+        mesh_rel_path: Mesh reference from the server response.
+        name: Object name used for the local filename.
+        index: Position of the object, used as filename when ``name`` yields
+            no usable filename.
+        output_dir: Download directory; when ``None`` nothing is downloaded.
+        session: Session used for the download.
+
+    Returns:
+        The mesh URL when ``output_dir`` is ``None``, otherwise the path of
+        the downloaded ``.glb`` file inside ``output_dir``.
+    """
+    url = _join_url(base_url, mesh_rel_path)
+    if output_dir is None:
+        return url
+
+    output_dir = output_dir.expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # ``name`` originates from the server response. Keep only its last path
+    # component so separators or ``..`` segments cannot place the file outside
+    # ``output_dir`` (or in a sub-directory that does not exist).
+    safe_name = Path(name.replace("\\", "/")).name
+    filename = f"{safe_name}.glb" if safe_name else f"{index}.glb"
+    dest = output_dir / filename
+    _download_glb(url, dest, session=session)
+    return str(dest)
+
+
+def _join_url(base_url: str, path_or_url: str) -> str:
+    """Combine ``base_url`` with a path, leaving absolute http(s) URLs untouched.
+
+    Exactly one ``/`` separates the two parts, assuming ``base_url`` has no
+    trailing slash.
+    """
+    if path_or_url.startswith(("http://", "https://")):
+        return path_or_url
+    separator = "" if path_or_url.startswith("/") else "/"
+    return f"{base_url}{separator}{path_or_url}"
+
+
+def _float_list(value: object, *, expected_len: int, field_name: str) -> list[float]:
+    """Convert a JSON list of exactly ``expected_len`` items to floats.
+
+    Raises:
+        RuntimeError: If ``value`` is not a list of the expected length, or
+            an item cannot be converted with ``float``.
+    """
+    well_formed = isinstance(value, list) and len(value) == expected_len
+    if not well_formed:
+        raise RuntimeError(f"Multi-object geometry response missing '{field_name}'")
+    try:
+        numbers = list(map(float, value))
+    except (TypeError, ValueError) as exc:
+        raise RuntimeError(
+            f"Multi-object geometry response field '{field_name}' must be numeric"
+        ) from exc
+    return numbers
+
+
+def _parse_json_body(response: requests.Response) -> dict[str, object]:
+    """Decode the response body, which has to be a JSON object.
+
+    Raises:
+        RuntimeError: If the body is not valid JSON or is not an object.
+    """
+    try:
+        payload = response.json()
+    except ValueError as exc:
+        raise RuntimeError(
+            "Multi-object geometry generation server returned invalid JSON"
+        ) from exc
+    if isinstance(payload, dict):
+        return payload
+    raise RuntimeError(
+        "Multi-object geometry generation response must be a JSON object"
+    )
+
+
+def _download_glb(
+    url: str,
+    dest: Path,
+    *,
+    session: requests.Session | None,
+) -> None:
+    """Download a GLB from the geometry server.
+
+    Args:
+        url: Location of the GLB file.
+        dest: Local file the validated bytes are written to.
+        session: Session to use; when ``None`` a temporary session is created
+            for this call and closed afterwards.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        RuntimeError: If the downloaded bytes do not start with ``glTF``.
+    """
+    owns_session = session is None
+    http = requests.Session() if owns_session else session
+    try:
+        r = http.get(url, timeout=30)
+        r.raise_for_status()
+        _validate_glb_response(r, r.content)
+        dest.write_bytes(r.content)
+    finally:
+        # Only close a session created here; a caller-supplied one stays open.
+        if owns_session:
+            http.close()
+    log_info(f"Generated geometry written: {dest}")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/schemas.py
new file mode 100644
index 00000000..d8ede9ee
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/schemas.py
@@ -0,0 +1,134 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import ClientError
+
+__all__ = [
+    "GeometryGenerationError",
+    "GeometryGenerationResult",
+    "GeometryGenerationServerRequest",
+    "GeometryGenerationServerResponse",
+    "MultiObjectGenerationError",
+    "MultiObjectGenerationObject",
+    "MultiObjectGenerationResult",
+    "MultiObjectGenerationServerRequest",
+    "MultiObjectGenerationServerResponse",
+]
+
+
+@dataclass(frozen=True)
+class GeometryGenerationServerRequest:
+    """Single-object request for the Geometry Generation server.
+
+    Args:
+        image_path: Local path of the object image.
+        output_path: Local GLB path where the client stores the generated
+            geometry.
+    """
+
+    image_path: str | Path
+    output_path: str | Path
+
+    def to_form_data(self) -> dict[str, str]:
+        """Return the multipart form fields for this request (currently none)."""
+        form_fields: dict[str, str] = {}
+        return form_fields
+
+
+@dataclass(frozen=True)
+class GeometryGenerationResult:
+    """Successful Geometry Generation result.
+
+    Attributes:
+        geometry_path: Path of the generated geometry file, stored as a
+            string.
+    """
+
+    geometry_path: str
+
+
+@dataclass(frozen=True)
+class GeometryGenerationServerResponse:
+    """Parsed successful response from the Geometry Generation server.
+
+    Attributes:
+        ok: Success flag for the call.
+        result: The generated geometry.
+        status: Optional status text.
+        error: Optional error text.
+        status_code: Optional HTTP status code of the response.
+        content_type: Optional content type of the response.
+        headers: Response headers; defaults to a fresh empty dict per
+            instance.
+    """
+
+    ok: bool
+    result: GeometryGenerationResult
+    status: str | None = None
+    error: str | None = None
+    status_code: int | None = None
+    content_type: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class GeometryGenerationError(ClientError):
+    """Geometry generation failure returned by the server.
+
+    Declares no fields of its own; everything it carries is inherited from
+    ``ClientError``.
+    """
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationServerRequest:
+    """Multi-object request for the Geometry Generation server.
+
+    Args:
+        image_path: Local path of the scene RGB image.
+        mask_paths: Local mask PNG paths, one per object.
+    """
+
+    image_path: str | Path
+    mask_paths: list[Path]
+
+    def to_form_data(self) -> dict[str, str]:
+        """Return the multipart form fields sent alongside the uploads."""
+        form_fields: dict[str, str] = {}
+        form_fields["json"] = "1"
+        return form_fields
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationObject:
+    """One generated object within a Multi-Object Geometry Generation result.
+
+    Attributes:
+        name: Name of the object.
+        geometry_path: Path of the object's generated geometry file, stored
+            as a string.
+        rotation_quaternion_wxyz: Rotation as a quaternion; the field name
+            indicates (w, x, y, z) component order.
+        translation: Translation components.
+        scale: Scale components.
+    """
+
+    # NOTE(review): the expected lengths of the three pose lists are not
+    # enforced here; presumably 4 / 3 / 3, confirm against the parser.
+    name: str
+    geometry_path: str
+    rotation_quaternion_wxyz: list[float]
+    translation: list[float]
+    scale: list[float]
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationResult:
+    """Objects produced by one Multi-Object Geometry Generation call."""
+
+    objects: list[MultiObjectGenerationObject]
+
+    @property
+    def geometry_paths(self) -> list[str]:
+        """Return each object's ``geometry_path``, in ``objects`` order."""
+        collected: list[str] = []
+        for generated in self.objects:
+            collected.append(generated.geometry_path)
+        return collected
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationServerResponse:
+    """Parsed successful multi-object response from the Geometry Generation server.
+
+    Attributes:
+        ok: Success flag for the call.
+        result: The generated objects.
+        status: Optional status text.
+        error: Optional error text.
+        status_code: Optional HTTP status code of the response.
+        content_type: Optional content type of the response.
+        headers: Response headers; defaults to a fresh empty dict per
+            instance.
+    """
+
+    ok: bool
+    result: MultiObjectGenerationResult
+    status: str | None = None
+    error: str | None = None
+    status_code: int | None = None
+    content_type: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationError(ClientError):
+    """Multi-object geometry generation failure returned by the server.
+
+    Declares no fields of its own; everything it carries is inherited from
+    ``ClientError``.
+    """
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/__init__.py
new file mode 100644
index 00000000..c112bd3d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/__init__.py
@@ -0,0 +1,39 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.client import (
+    ImageGenerationClient,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.schemas import (
+    ImageGenerationError,
+    ImageGenerationResult,
+    ImageGenerationServerRequest,
+    ImageGenerationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+    DEFAULT_CLIENT_CONFIG_PATH,
+)
+
+__all__ = [
+    "DEFAULT_CLIENT_CONFIG_PATH",
+    "ImageGenerationClient",
+    "ImageGenerationError",
+    "ImageGenerationResult",
+    "ImageGenerationServerRequest",
+    "ImageGenerationServerResponse",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/client.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/client.py
new file mode 100644
index 00000000..6f23d47b
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/client.py
@@ -0,0 +1,117 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Client for the Z-Image image generation server."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.base import BaseHttpClient
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+    validate_required_strings,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+    DEFAULT_CLIENT_CONFIG_PATH,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.parser import (
+    parse_generation_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.schemas import (
+    ImageGenerationError,
+    ImageGenerationResult,
+    ImageGenerationServerRequest,
+    ImageGenerationServerResponse,
+)
+
+__all__ = [
+    "DEFAULT_CLIENT_CONFIG_PATH",
+    "ImageGenerationClient",
+    "ImageGenerationError",
+    "ImageGenerationResult",
+    "ImageGenerationServerRequest",
+    "ImageGenerationServerResponse",
+]
+
+
+class ImageGenerationClient(BaseHttpClient):
+    """HTTP client that requests one generated image per call from Z-Image."""
+
+    def __init__(
+        self,
+        *,
+        base_url: str | None = None,
+        timeout_s: int | None = None,
+        config_path: Path | None = None,
+        config_key: str = "zimage",
+        session: requests.Session | None = None,
+    ) -> None:
+        """Set up the base HTTP client and read the generation endpoint path.
+
+        All arguments are forwarded to ``BaseHttpClient``. The endpoint path
+        comes from the ``generate_single_object_path`` config entry and
+        falls back to ``/generate.png``.
+        """
+        super().__init__(
+            config_key=config_key,
+            server_name="Image generation server",
+            base_url=base_url,
+            timeout_s=timeout_s,
+            config_path=config_path,
+            session=session,
+        )
+        configured_path = self.config.get(
+            "generate_single_object_path", "/generate.png"
+        )
+        self.generate_single_object_path = str(configured_path)
+
+    def generate(
+        self,
+        request: ImageGenerationServerRequest,
+        *,
+        max_retries: int = 3,
+    ) -> ImageGenerationServerResponse | ImageGenerationError:
+        """Request one image and store the returned PNG locally.
+
+        Args:
+            request: Prompt and local output path.
+            max_retries: Retry budget handed to ``post_with_retries``.
+
+        Returns:
+            The parsed response, or the ``ImageGenerationError`` handed back
+            by ``post_with_retries``.
+        """
+        _validate_request(request)
+        endpoint = f"{self.base_url}{self.generate_single_object_path}"
+
+        def send() -> requests.Response:
+            return _post_generation_request(self, endpoint, request)
+
+        outcome = self.post_with_retries(
+            send,
+            max_retries=max_retries,
+            error_cls=ImageGenerationError,
+            request_label="image_generation",
+        )
+        if not isinstance(outcome, ImageGenerationError):
+            return parse_generation_response(outcome, request)
+        return outcome
+
+
+def _validate_request(request: ImageGenerationServerRequest) -> None:
+    """Check an image generation request before it is sent.
+
+    Raises:
+        ValueError: If ``output_path`` does not end in ``.png``
+            (case-insensitive).
+
+    ``validate_required_strings`` runs first on the prompt and output path
+    and may raise on its own.
+    """
+    required = {
+        "Image generation prompt": request.prompt,
+        "Image generation output_path": request.output_path,
+    }
+    validate_required_strings(required)
+    lowered_target = str(request.output_path).lower()
+    if lowered_target.endswith(".png"):
+        return
+    raise ValueError("Image generation output_path must be a PNG file path.")
+
+
+def _post_generation_request(
+    client: ImageGenerationClient,
+    url: str,
+    request: ImageGenerationServerRequest,
+) -> requests.Response:
+    """POST the request's JSON payload to ``url`` using the client's session.
+
+    The timeout is a ``(connect, read)`` pair: 10 seconds to connect and
+    ``client.timeout_s`` to read.
+    """
+    payload = request.to_dict()
+    timeouts = (10, client.timeout_s)
+    return client.session.post(url, json=payload, timeout=timeouts)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/parser.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/parser.py
new file mode 100644
index 00000000..a43ee030
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/parser.py
@@ -0,0 +1,65 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+    validate_png_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.schemas import (
+    ImageGenerationResult,
+    ImageGenerationServerRequest,
+    ImageGenerationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+
+__all__ = ["parse_generation_response"]
+
+
+def parse_generation_response(
+    response: requests.Response,
+    request: ImageGenerationServerRequest,
+) -> ImageGenerationServerResponse:
+    """Validate a Z-Image PNG response and save it to the request output path.
+
+    Args:
+        response: HTTP response whose body is the generated PNG.
+        request: Originating request; supplies the output path.
+
+    Returns:
+        A success record holding the saved image path plus the response's
+        status code, ``Content-Type`` and headers.
+    """
+    body = response.content
+    validate_png_response(response, body)
+    saved_to = _write_png_output(request, body)
+    response_headers = response.headers
+    return ImageGenerationServerResponse(
+        ok=True,
+        status="ok",
+        result=ImageGenerationResult(image_path=str(saved_to)),
+        status_code=response.status_code,
+        content_type=response_headers.get("Content-Type"),
+        headers=dict(response_headers),
+    )
+
+
+def _write_png_output(
+    request: ImageGenerationServerRequest,
+    png_bytes: bytes,
+) -> Path:
+    """Write ``png_bytes`` to the request's output path and return that path.
+
+    The path is user-expanded and resolved, and missing parent directories
+    are created.
+
+    Raises:
+        FileNotFoundError: If the file does not exist after writing.
+    """
+    destination = Path(request.output_path).expanduser().resolve()
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    destination.write_bytes(png_bytes)
+    if destination.is_file():
+        log_info(f"Generated image written: {destination}")
+        return destination
+    raise FileNotFoundError(f"Generated image was not written: {destination}")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/schemas.py
new file mode 100644
index 00000000..09c845ba
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/schemas.py
@@ -0,0 +1,72 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import ClientError
+
+__all__ = [
+    "ImageGenerationError",
+    "ImageGenerationResult",
+    "ImageGenerationServerRequest",
+    "ImageGenerationServerResponse",
+]
+
+
+@dataclass(frozen=True)
+class ImageGenerationServerRequest:
+    """Request for the Z-Image server.
+
+    Args:
+        prompt: Text prompt describing the image to generate.
+        output_path: Local PNG path where the client stores the response.
+    """
+
+    prompt: str
+    output_path: str | Path
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the JSON payload for the server; only the prompt is sent."""
+        payload: dict[str, Any] = {}
+        payload["prompt"] = self.prompt
+        return payload
+
+
+@dataclass(frozen=True)
+class ImageGenerationResult:
+    """Successful Z-Image generation result.
+
+    Attributes:
+        image_path: Path of the saved image file, stored as a string.
+    """
+
+    image_path: str
+
+
+@dataclass(frozen=True)
+class ImageGenerationServerResponse:
+    """Parsed successful response from the Z-Image server.
+
+    Attributes:
+        ok: Success flag for the call.
+        result: The saved image.
+        status: Optional status text.
+        error: Optional error text.
+        status_code: Optional HTTP status code of the response.
+        content_type: Optional ``Content-Type`` of the response.
+        headers: Response headers; defaults to a fresh empty dict per
+            instance.
+    """
+
+    ok: bool
+    result: ImageGenerationResult
+    status: str | None = None
+    error: str | None = None
+    status_code: int | None = None
+    content_type: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class ImageGenerationError(ClientError):
+    """Image generation failure returned by the server.
+
+    Declares no fields of its own; everything it carries is inherited from
+    ``ClientError``.
+    """
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/__init__.py
new file mode 100644
index 00000000..a503f287
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/__init__.py
@@ -0,0 +1,61 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+    DEFAULT_CLIENT_CONFIG_PATH,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.client import (
+    ImageSegmentationClient,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.schemas import (
+    ImageSegmentationCandidate,
+    ImageSegmentationError,
+    ImageSegmentationResult,
+    ImageSegmentationServerRequest,
+    ImageSegmentationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.utils import (
+    apply_mask_to_alpha,
+    bbox_iou,
+    decode_rle_mask,
+    draw_labeled_bboxes,
+    draw_numbered_bboxes,
+    draw_numbered_masks,
+    is_usable_segmentation_candidate,
+    save_candidate_rgba_and_mask,
+    sort_segments_by_bbox,
+)
+
+__all__ = [
+    "DEFAULT_CLIENT_CONFIG_PATH",
+    "ImageSegmentationCandidate",
+    "ImageSegmentationClient",
+    "ImageSegmentationError",
+    "ImageSegmentationResult",
+    "ImageSegmentationServerRequest",
+    "ImageSegmentationServerResponse",
+    "apply_mask_to_alpha",
+    "bbox_iou",
+    "decode_rle_mask",
+    "draw_labeled_bboxes",
+    "draw_numbered_bboxes",
+    "draw_numbered_masks",
+    "is_usable_segmentation_candidate",
+    "save_candidate_rgba_and_mask",
+    "sort_segments_by_bbox",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/client.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/client.py
new file mode 100644
index 00000000..1a880bb6
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/client.py
@@ -0,0 +1,132 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Client for the SAM3 image segmentation server."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.base import BaseHttpClient
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+    validate_required_strings,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+    DEFAULT_CLIENT_CONFIG_PATH,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.parser import (
+    parse_segmentation_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.schemas import (
+    ImageSegmentationCandidate,
+    ImageSegmentationError,
+    ImageSegmentationResult,
+    ImageSegmentationServerRequest,
+    ImageSegmentationServerResponse,
+)
+
+__all__ = [
+    "DEFAULT_CLIENT_CONFIG_PATH",
+    "ImageSegmentationCandidate",
+    "ImageSegmentationClient",
+    "ImageSegmentationError",
+    "ImageSegmentationResult",
+    "ImageSegmentationServerRequest",
+    "ImageSegmentationServerResponse",
+]
+
+
+class ImageSegmentationClient(BaseHttpClient):
+    """HTTP client that segments one image per call via the SAM3 server."""
+
+    def __init__(
+        self,
+        *,
+        base_url: str | None = None,
+        timeout_s: int | None = None,
+        config_path: Path | None = None,
+        config_key: str = "sam3_segmentation",
+        session: requests.Session | None = None,
+    ) -> None:
+        """Set up the base HTTP client and read the segmentation endpoint path.
+
+        All arguments are forwarded to ``BaseHttpClient`` together with
+        ``trust_env=False``. The endpoint path comes from the
+        ``segment_single_object_path`` config entry and falls back to
+        ``/segment_single_object``.
+        """
+        # NOTE(review): trust_env=False presumably stops the session from
+        # picking up proxy settings from the environment; confirm in
+        # BaseHttpClient.
+        super().__init__(
+            config_key=config_key,
+            server_name="Image segmentation server",
+            base_url=base_url,
+            timeout_s=timeout_s,
+            config_path=config_path,
+            session=session,
+            trust_env=False,
+        )
+        configured_path = self.config.get(
+            "segment_single_object_path", "/segment_single_object"
+        )
+        self.segmentation_path = str(configured_path)
+
+    def segment(
+        self,
+        request: ImageSegmentationServerRequest,
+        *,
+        max_retries: int = 3,
+    ) -> ImageSegmentationServerResponse | ImageSegmentationError:
+        """Segment one image using the request's text prompt.
+
+        Args:
+            request: Prompt and local image path.
+            max_retries: Retry budget handed to ``post_with_retries``.
+
+        Returns:
+            The parsed response, or the ``ImageSegmentationError`` handed
+            back by ``post_with_retries``.
+        """
+        _validate_request(request)
+        endpoint = f"{self.base_url}{self.segmentation_path}"
+
+        def send() -> requests.Response:
+            return _post_segmentation_request(self, endpoint, request)
+
+        outcome = self.post_with_retries(
+            send,
+            max_retries=max_retries,
+            error_cls=ImageSegmentationError,
+            request_label="image_segmentation",
+        )
+        if not isinstance(outcome, ImageSegmentationError):
+            return parse_segmentation_response(outcome, request)
+        return outcome
+
+
+def _validate_request(request: ImageSegmentationServerRequest) -> None:
+    """Check a segmentation request before it is sent.
+
+    Raises:
+        FileNotFoundError: If the user-expanded ``image_path`` is not an
+            existing file.
+
+    ``validate_required_strings`` runs first on the image path and may raise
+    on its own.
+    """
+    validate_required_strings(
+        {"Image segmentation image_path": request.image_path}
+    )
+    candidate = Path(request.image_path).expanduser()
+    if candidate.is_file():
+        return
+    raise FileNotFoundError(f"Image segmentation input not found: {candidate}")
+
+
+def _post_segmentation_request(
+    client: ImageSegmentationClient,
+    url: str,
+    request: ImageSegmentationServerRequest,
+) -> requests.Response:
+    """POST the image and form fields to ``url`` as a multipart upload.
+
+    The image file stays open only for the duration of the request. The
+    timeout is a ``(connect, read)`` pair: 10 seconds to connect and
+    ``client.timeout_s`` to read.
+    """
+    upload_name = Path(request.image_path).name
+    with _open_image_file(request.image_path) as handle:
+        form_fields = request.to_form_data()
+        upload = {"image": (upload_name, handle)}
+        return client.session.post(
+            url,
+            data=form_fields,
+            files=upload,
+            timeout=(10, client.timeout_s),
+        )
+
+
+def _open_image_file(image_path: str | Path) -> Any:
+    """Open ``image_path`` for binary reading after expanding and resolving it.
+
+    The caller is responsible for closing the returned file object.
+    """
+    absolute_path = Path(image_path).expanduser().resolve()
+    return absolute_path.open("rb")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/parser.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/parser.py
new file mode 100644
index 00000000..762a1b43
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/parser.py
@@ -0,0 +1,218 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+    parse_json_object_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.schemas import (
+    ImageSegmentationCandidate,
+    ImageSegmentationResult,
+    ImageSegmentationServerRequest,
+    ImageSegmentationServerResponse,
+)
+
+__all__ = ["parse_segmentation_response"]
+
+SERVER_NAME = "Image segmentation server"
+
+
+def parse_segmentation_response(
+    response: requests.Response,
+    request: ImageSegmentationServerRequest,
+) -> ImageSegmentationServerResponse:
+    """Turn a SAM3 server JSON response into typed segmentation records.
+
+    Args:
+        response: HTTP response carrying a JSON object.
+        request: Originating request; supplies fallbacks for fields the
+            server omits.
+
+    Returns:
+        The typed response. ``ok`` defaults to ``True`` and ``status`` to
+        ``"ok"`` when the server does not send them.
+    """
+    payload = parse_json_object_response(response, server_name=SERVER_NAME)
+    parsed_result = _parse_segmentation_result(payload, request)
+    is_ok = bool(payload.get("ok", True))
+    status_text = _string_or_none(payload.get("status")) or "ok"
+    response_headers = response.headers
+    return ImageSegmentationServerResponse(
+        ok=is_ok,
+        status=status_text,
+        result=parsed_result,
+        status_code=response.status_code,
+        content_type=response_headers.get("Content-Type"),
+        headers=dict(response_headers),
+    )
+
+
+def _parse_segmentation_result(
+    response_data: dict[str, Any],
+    request: ImageSegmentationServerRequest,
+) -> ImageSegmentationResult:
+    """Build the typed result from the decoded response body.
+
+    The payload is read from ``response_data["result"]`` when that is a
+    dict, else from ``response_data["data"]``, else from the top-level
+    object itself. Missing image path and prompt fall back to the request;
+    box and mask formats default to ``"xyxy"`` and ``"rle"``.
+    """
+    result_data = response_data
+    for wrapper_key in ("result", "data"):
+        wrapped = response_data.get(wrapper_key)
+        if isinstance(wrapped, dict):
+            result_data = wrapped
+            break
+
+    image_path = _string_or_none(result_data.get("image_path"))
+    prompt = _string_or_none(result_data.get("prompt"))
+    box_format = _string_or_none(result_data.get("box_format"))
+    mask_format = _string_or_none(result_data.get("mask_format"))
+
+    return ImageSegmentationResult(
+        image_path=image_path or str(request.image_path),
+        prompt=prompt or request.prompt,
+        candidates=_parse_candidates(result_data),
+        request_id=_string_or_none(result_data.get("request_id")),
+        elapsed_sec=_float_or_none(result_data.get("elapsed_sec")),
+        count=_int_or_none(result_data.get("count")),
+        image_width=_parse_image_width(result_data),
+        image_height=_parse_image_height(result_data),
+        box_format=box_format or "xyxy",
+        mask_format=mask_format or "rle",
+    )
+
+
+def _parse_candidates(result_data: dict[str, Any]) -> list[ImageSegmentationCandidate]:
+    """Extract segmentation candidates from either supported payload layout.
+
+    Layout 1: a list of per-candidate dicts under the first of
+    ``instances`` / ``candidates`` / ``segmentations`` / ``detections`` that
+    holds a list; non-dict entries are skipped.
+
+    Layout 2: parallel ``boxes`` / ``scores`` / ``masks`` lists, combined by
+    position. Returns an empty list when ``boxes`` is not a list.
+    """
+    for list_key in ("instances", "candidates", "segmentations", "detections"):
+        entries = result_data.get(list_key)
+        if not isinstance(entries, list):
+            continue
+        parsed: list[ImageSegmentationCandidate] = []
+        # Positions come from the raw list so fallback ids stay aligned
+        # with the server's ordering even when entries are skipped.
+        for position, entry in enumerate(entries):
+            if isinstance(entry, dict):
+                parsed.append(_parse_candidate_item(entry, position))
+        return parsed
+
+    boxes = result_data.get("boxes", [])
+    scores = result_data.get("scores", [])
+    masks = result_data.get("masks", [])
+    if not isinstance(boxes, list):
+        return []
+
+    return [
+        ImageSegmentationCandidate(
+            candidate_id=f"candidate_{position}",
+            bbox_xyxy=_float_list(box),
+            score=_float_or_zero(_list_get(scores, position)),
+            mask_rle=_mask_or_none(_list_get(masks, position)),
+        )
+        for position, box in enumerate(boxes)
+    ]
+
+
+def _parse_candidate_item(
+    item: dict[str, Any],
+    index: int,
+) -> ImageSegmentationCandidate:
+    """Normalize one per-candidate dict into an ``ImageSegmentationCandidate``.
+
+    Args:
+        item: Raw candidate dict from the server.
+        index: Position of the item in the server's list, used for the
+            fallback id.
+
+    Returns:
+        The candidate. Alternative key spellings are accepted for the id,
+        box and mask, and any unrecognized keys are kept in ``metadata``.
+    """
+    recognized = frozenset(
+        (
+            "candidate_id",
+            "id",
+            "index",
+            "bbox_xyxy",
+            "box_xyxy",
+            "box",
+            "bbox",
+            "score",
+            "mask_rle",
+            "mask",
+            "segmentation",
+            "mask_path",
+            "label",
+        )
+    )
+
+    def first_truthy(*keys: str) -> Any:
+        # Same result as chaining ``item.get(k)`` with ``or``: the first
+        # truthy value wins, otherwise the last looked-up value is returned.
+        found = None
+        for key in keys:
+            found = item.get(key)
+            if found:
+                break
+        return found
+
+    raw_mask = first_truthy("mask_rle", "mask", "segmentation")
+    raw_box = first_truthy("bbox_xyxy", "box_xyxy", "box", "bbox")
+    identifier = (
+        _string_or_none(item.get("candidate_id"))
+        or _string_or_none(item.get("id"))
+        or _index_id_or_none(item.get("index"))
+        or f"candidate_{index}"
+    )
+    extras = {key: value for key, value in item.items() if key not in recognized}
+
+    return ImageSegmentationCandidate(
+        candidate_id=identifier,
+        bbox_xyxy=_float_list(raw_box),
+        score=_float_or_zero(item.get("score")),
+        mask_rle=_mask_or_none(raw_mask),
+        mask_path=_string_or_none(item.get("mask_path")),
+        label=_string_or_none(item.get("label")),
+        metadata=extras,
+    )
+
+
+def _list_get(values: Any, index: int) -> Any:
+    """Return ``values[index]``, or ``None`` if ``values`` is not a list or is too short."""
+    if isinstance(values, list) and index < len(values):
+        return values[index]
+    return None
+
+
+def _float_list(value: Any) -> list[float]:
+    """Convert a list to floats, dropping entries that cannot be converted.
+
+    Args:
+        value: Decoded JSON value, expected to be a list.
+
+    Returns:
+        The convertible entries as floats in their original order, or an
+        empty list when ``value`` is not a list. The result can therefore be
+        shorter than the input.
+    """
+    if not isinstance(value, list):
+        return []
+    parsed: list[float] = []
+    for item in value:
+        try:
+            parsed.append(float(item))
+        # OverflowError: an integer too large for a float is skipped like
+        # any other unusable entry instead of aborting the whole parse.
+        except (TypeError, ValueError, OverflowError):
+            continue
+    return parsed
+
+
+def _float_or_zero(value: Any) -> float:
+    """Return ``float(value)``, or ``0.0`` when the conversion fails."""
+    try:
+        return float(value)
+    # OverflowError covers integers too large to represent as a float.
+    except (TypeError, ValueError, OverflowError):
+        return 0.0
+
+
+def _float_or_none(value: Any) -> float | None:
+    """Return ``float(value)``, or ``None`` when the conversion fails."""
+    try:
+        return float(value)
+    # OverflowError covers integers too large to represent as a float.
+    except (TypeError, ValueError, OverflowError):
+        return None
+
+
+def _int_or_none(value: Any) -> int | None:
+    """Return ``int(value)``, or ``None`` when the conversion fails.
+
+    Finite floats are truncated toward zero by ``int()``.
+    """
+    try:
+        return int(value)
+    # OverflowError: int(float("inf")) raises it, and the stdlib JSON decoder
+    # accepts ``Infinity``, so a tolerant parser has to absorb it as well.
+    except (TypeError, ValueError, OverflowError):
+        return None
+
+
+def _string_or_none(value: Any) -> str | None:
+    """Return ``value`` when it is a ``str``; otherwise ``None`` (no coercion)."""
+    if isinstance(value, str):
+        return value
+    return None
+
+
+def _mask_or_none(value: Any) -> dict[str, Any] | None:
+    """Return ``value`` when it is a ``dict``; otherwise ``None``."""
+    if isinstance(value, dict):
+        return value
+    return None
+
+
+def _index_id_or_none(value: Any) -> str | None:
+    """Return ``"candidate_<n>"`` when ``value`` converts to an int, else ``None``."""
+    parsed = _int_or_none(value)
+    if parsed is None:
+        return None
+    return f"candidate_{parsed}"
+
+
+def _parse_image_width(result_data: dict[str, Any]) -> int | None:
+    """Read the image width from ``image_size["width"]``, else ``image_width``.
+
+    Returns ``None`` when neither location holds an int-convertible value.
+    """
+    nested = result_data.get("image_size")
+    nested_width = (
+        _int_or_none(nested.get("width")) if isinstance(nested, dict) else None
+    )
+    if nested_width is not None:
+        return nested_width
+    return _int_or_none(result_data.get("image_width"))
+
+
+def _parse_image_height(result_data: dict[str, Any]) -> int | None:
+    """Read the image height from ``image_size["height"]``, else ``image_height``.
+
+    Returns ``None`` when neither location holds an int-convertible value.
+    """
+    nested = result_data.get("image_size")
+    nested_height = (
+        _int_or_none(nested.get("height")) if isinstance(nested, dict) else None
+    )
+    if nested_height is not None:
+        return nested_height
+    return _int_or_none(result_data.get("image_height"))
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/schemas.py
new file mode 100644
index 00000000..3945bf4b
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/schemas.py
@@ -0,0 +1,103 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import ClientError
+
+__all__ = [
+    "ImageSegmentationCandidate",
+    "ImageSegmentationError",
+    "ImageSegmentationResult",
+    "ImageSegmentationServerRequest",
+    "ImageSegmentationServerResponse",
+]
+
+
+@dataclass(frozen=True)
+class ImageSegmentationServerRequest:
+    """Immutable request sent to the SAM3 server.
+
+    Args:
+        prompt: Short text concept prompt, emitted as the ``prompt`` form field.
+        image_path: Local input image path; ``to_form_data`` does not emit it.
+    """
+
+    prompt: str
+    image_path: str | Path
+
+    def to_form_data(self) -> dict[str, str]:
+        """Convert the request to the SAM3 server multipart form fields."""
+        return {
+            "prompt": self.prompt,
+            "score_threshold": "0.0",  # fixed value; not configurable per request
+            "max_instances": "5",  # fixed value; not configurable per request
+        }
+
+
+@dataclass(frozen=True)
+class ImageSegmentationCandidate:
+    """One SAM3 segmentation candidate for a prompted concept.
+
+    SAM3 image inference returns parallel masks, boxes, and scores. The client
+    normalizes one aligned mask/box/score item into this candidate record.
+    """
+
+    candidate_id: str
+    bbox_xyxy: list[float]  # box corners in x1, y1, x2, y2 order
+    score: float
+    mask_rle: dict[str, Any] | None = None  # run-length mask; None when absent
+    mask_path: str | None = None
+    label: str | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)  # fresh dict per instance
+
+
+@dataclass(frozen=True)
+class ImageSegmentationResult:
+    """Successful SAM3 segmentation result: the prompted image and its candidates."""
+
+    image_path: str
+    prompt: str
+    candidates: list[ImageSegmentationCandidate]
+    request_id: str | None = None
+    elapsed_sec: float | None = None
+    count: int | None = None  # stored as given; not derived from candidates here
+    image_width: int | None = None
+    image_height: int | None = None
+    box_format: str = "xyxy"  # default agrees with the candidates' bbox_xyxy naming
+    mask_format: str | None = None
+
+
+@dataclass(frozen=True)
+class ImageSegmentationServerResponse:
+    """Parsed successful response from the SAM3 server."""
+
+    ok: bool
+    result: ImageSegmentationResult  # required: every instance carries a result
+    status: str | None = None
+    error: str | None = None
+    status_code: int | None = None  # NOTE(review): presumably the HTTP status; confirm
+    content_type: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)  # fresh dict per instance
+
+
+@dataclass(frozen=True)
+class ImageSegmentationError(ClientError):
+    """Segmentation failure returned by the server; declares no extra fields."""
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/utils.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/utils.py
new file mode 100644
index 00000000..83457358
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/utils.py
@@ -0,0 +1,322 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from PIL import Image, ImageDraw, ImageFont
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.schemas import (
+    ImageSegmentationCandidate,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+
+__all__ = [
+    "apply_mask_to_alpha",
+    "bbox_iou",
+    "decode_rle_mask",
+    "draw_labeled_bboxes",
+    "draw_numbered_bboxes",
+    "draw_numbered_masks",
+    "is_usable_segmentation_candidate",
+    "save_candidate_rgba_and_mask",
+    "sort_segments_by_bbox",
+]
+
+
+def decode_rle_mask(mask_rle: dict[str, Any]) -> Image.Image:
+    """Decode an uncompressed SAM3 RLE mask into a grayscale PIL image.
+
+    Runs alternate between background (0) and foreground (255), beginning with
+    foreground only when ``starts_with`` is non-zero after ``int()`` conversion.
+    """
+    shape = mask_rle.get("size")
+    runs = mask_rle.get("counts")
+    if not _is_size_pair(shape):
+        raise ValueError("SAM3 mask_rle requires size=[height, width].")
+    if not isinstance(runs, list):
+        raise ValueError("SAM3 mask_rle counts must be an uncompressed list.")
+
+    rows, cols = int(shape[0]), int(shape[1])
+    total = rows * cols
+    foreground = bool(int(mask_rle.get("starts_with", 0)))
+    buffer = bytearray(total)  # zero-initialised, so only foreground is written
+    cursor = 0
+    for raw_run in runs:
+        run = int(raw_run)
+        if run < 0:
+            raise ValueError("SAM3 mask_rle counts must be non-negative.")
+        end = cursor + run
+        if end > total:
+            raise ValueError("SAM3 mask_rle counts exceed the expected image size.")
+        if foreground:
+            buffer[cursor:end] = b"\xff" * run
+        cursor, foreground = end, not foreground
+
+    if cursor != total:
+        raise ValueError(
+            "SAM3 mask_rle counts do not cover the expected image size: "
+            f"{cursor} != {total}."
+        )
+    return Image.frombytes("L", (cols, rows), bytes(buffer))
+
+
+def apply_mask_to_alpha(image_path: str | Path, mask: Image.Image) -> Image.Image:
+    """Return an RGBA image whose alpha channel is the provided mask."""
+    # The context manager closes the source file deterministically; Pillow can
+    # otherwise keep the handle open (e.g. for multi-frame formats).
+    with Image.open(image_path) as source:
+        image = source.convert("RGBA")
+    alpha = mask.convert("L")
+    if alpha.size != image.size:
+        alpha = alpha.resize(image.size, Image.Resampling.NEAREST)
+    image.putalpha(alpha)
+    return image
+
+
+def save_candidate_rgba_and_mask(
+    *,
+    image_path: str | Path,
+    candidate: ImageSegmentationCandidate,
+    output_dir: str | Path,
+    prefix: str | None = None,
+) -> dict[str, str]:
+    """Write a candidate's decoded mask and masked RGBA image as PNG files.
+
+    Returns the written paths under the ``mask_path``/``rgba_path`` keys.
+    """
+    rle = candidate.mask_rle
+    if rle is None:
+        raise ValueError(f"Candidate {candidate.candidate_id} has no mask_rle.")
+
+    target_dir = Path(output_dir).expanduser().resolve()
+    target_dir.mkdir(parents=True, exist_ok=True)
+    stem = prefix if prefix else candidate.candidate_id
+    mask_file = target_dir / f"{stem}_mask.png"
+    rgba_file = target_dir / f"{stem}_rgba.png"
+
+    decoded = decode_rle_mask(rle)
+    decoded.save(mask_file)
+    apply_mask_to_alpha(image_path, decoded).save(rgba_file)
+    log_info(f"SAM3 mask written: {mask_file}")
+    log_info(f"SAM3 RGBA image written: {rgba_file}")
+    return {"mask_path": str(mask_file), "rgba_path": str(rgba_file)}
+
+
+def draw_numbered_bboxes(
+    *,
+    image_path: str | Path,
+    segments: list[dict[str, Any]],
+    output_path: str | Path,
+) -> Path:
+    """Draw 1-based numbered boxes from each segment's ``bbox_xyxy`` and save."""
+    with Image.open(image_path) as source:  # close the input file promptly
+        image = source.convert("RGB")
+    draw = ImageDraw.Draw(image)
+    font = _load_label_font(image.width)
+    for index, segment in enumerate(segments, start=1):
+        _draw_bbox_label(
+            draw=draw,
+            bbox_xyxy=segment["bbox_xyxy"],
+            label=str(index),
+            font=font,
+        )
+    output_path = Path(output_path).expanduser().resolve()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    image.save(output_path)
+    return output_path
+
+
+def draw_numbered_masks(
+    *,
+    image_path: str | Path,
+    segments: list[dict[str, Any]],
+    output_path: str | Path,
+) -> Path:
+    """Draw numbered segmentation masks for visual segmentation verification."""
+    with Image.open(image_path) as source:  # close the input file promptly
+        image = source.convert("RGBA")
+    overlay = Image.new("RGBA", image.size, (0, 0, 0, 0))
+    transparent = Image.new("RGBA", image.size)  # loop-invariant, built once
+    draw_overlay = ImageDraw.Draw(overlay)
+    font = _load_label_font(image.width)
+    colors = [
+        (255, 64, 64, 110),
+        (64, 160, 255, 110),
+        (64, 220, 120, 110),
+        (255, 190, 64, 110),
+        (190, 96, 255, 110),
+        (255, 96, 190, 110),
+    ]
+
+    for index, segment in enumerate(segments, start=1):
+        mask_rle = segment.get("mask_rle")
+        if mask_rle is None:
+            continue  # numbering still advances for segments without a mask
+        mask = decode_rle_mask(mask_rle)
+        if mask.size != image.size:
+            mask = mask.resize(image.size, Image.Resampling.NEAREST)
+        color_layer = Image.new("RGBA", image.size, colors[(index - 1) % len(colors)])
+        overlay.alpha_composite(Image.composite(color_layer, transparent, mask))
+        _draw_mask_label(
+            draw=draw_overlay,
+            segment=segment,
+            mask=mask,
+            label=str(index),
+            font=font,
+        )
+
+    result = Image.alpha_composite(image, overlay).convert("RGB")
+    output_path = Path(output_path).expanduser().resolve()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    result.save(output_path)
+    return output_path
+
+
+def draw_labeled_bboxes(
+    *,
+    image_path: str | Path,
+    boxes: list[dict[str, Any]],
+    output_path: str | Path,
+) -> Path:
+    """Draw labeled bounding boxes for final segmentation visualization."""
+    with Image.open(image_path) as source:  # close the input file promptly
+        image = source.convert("RGB")
+    draw = ImageDraw.Draw(image)
+    font = _load_label_font(image.width)
+    for box in boxes:
+        x1, y1, x2, y2 = box["bbox_xyxy"]  # unpacking rejects non-4-value boxes
+        _draw_bbox_label(
+            draw=draw,
+            bbox_xyxy=[x1, y1, x2, y2],
+            label=str(box["label"]),
+            font=font,
+        )
+
+    output_path = Path(output_path).expanduser().resolve()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    image.save(output_path)
+    return output_path
+
+
+def sort_segments_by_bbox(segments: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Return a new list ordered by bbox top (y1), then left (x1), then -score."""
+
+    def _rank(segment: dict[str, Any]) -> tuple[float, float, float]:
+        left, top = segment["bbox_xyxy"][0], segment["bbox_xyxy"][1]
+        return float(top), float(left), -float(segment["score"])
+
+    ordered = list(segments)
+    ordered.sort(key=_rank)
+    return ordered
+
+
+def bbox_iou(bbox_a: list[float], bbox_b: list[float]) -> float:
+    """Return intersection-over-union of two xyxy boxes (0.0 for empty union)."""
+    a_left, a_top, a_right, a_bottom = bbox_a
+    b_left, b_top, b_right, b_bottom = bbox_b
+
+    def _extent(low: float, high: float) -> float:
+        return max(0.0, high - low)
+
+    overlap_w = _extent(max(a_left, b_left), min(a_right, b_right))
+    overlap_h = _extent(max(a_top, b_top), min(a_bottom, b_bottom))
+    overlap = overlap_w * overlap_h
+    area_a = _extent(a_left, a_right) * _extent(a_top, a_bottom)
+    area_b = _extent(b_left, b_right) * _extent(b_top, b_bottom)
+    combined = area_a + area_b - overlap
+    return overlap / combined if combined > 0 else 0.0
+
+
+def is_usable_segmentation_candidate(
+    candidate: ImageSegmentationCandidate,
+) -> bool:
+    """Return True when the candidate has a mask and a four-value bbox."""
+    return not (candidate.mask_rle is None or len(candidate.bbox_xyxy) != 4)
+
+
+def _is_size_pair(value: Any) -> bool:
+    """Return True only for a two-item list of non-negative, non-bool ints."""
+    if not isinstance(value, list) or len(value) != 2:
+        return False
+    # bool is an int subclass, so it must be excluded explicitly; negative
+    # dimensions cannot describe an image and would only fail later on.
+    return all(isinstance(v, int) and not isinstance(v, bool) and v >= 0 for v in value)
+
+
+def _load_label_font(image_width: int) -> ImageFont.ImageFont:
+    font_size = max(24, image_width // 80)  # grows with width, never below 24
+    try:
+        return ImageFont.truetype("DejaVuSans-Bold.ttf", font_size)
+    except OSError:  # truetype() could not read the font file
+        return ImageFont.load_default()  # fallback is not passed font_size
+
+
+def _draw_bbox_label(
+    *,
+    draw: ImageDraw.ImageDraw,
+    bbox_xyxy: list[float],
+    label: str,
+    font: ImageFont.ImageFont,
+) -> None:
+    """Outline the box in red and stamp a white-on-red label at its top-left."""
+    left, top, right, bottom = bbox_xyxy
+    anchor = (left, top)
+    draw.rectangle((left, top, right, bottom), outline="red", width=6)
+
+    # Pad the measured text extent on every side to form the red backdrop:
+    # left/top move outward by subtracting, right/bottom by adding.
+    margin = 8
+    text_box = draw.textbbox(anchor, label, font=font)
+    backdrop = tuple(
+        edge + sign * margin for edge, sign in zip(text_box, (-1, -1, 1, 1))
+    )
+    draw.rectangle(backdrop, fill="red")
+    draw.text(anchor, label, fill="white", font=font)
+
+
+def _draw_mask_label(
+    *,
+    draw: ImageDraw.ImageDraw,
+    segment: dict[str, Any],
+    mask: Image.Image,
+    label: str,
+    font: ImageFont.ImageFont,
+) -> None:
+    """Stamp a white-on-red label at the centre of the mask's bounding box.
+
+    When the mask has no bounding box (``getbbox()`` returns None), the
+    segment's ``bbox_xyxy`` supplies the centre instead.
+    """
+    # Prefer the mask's own extent; fall back to the reported box otherwise.
+    extent = mask.getbbox()
+    if extent is None:
+        extent = segment["bbox_xyxy"]
+    left, top, right, bottom = extent
+    centre = (float(left + right) * 0.5, float(top + bottom) * 0.5)
+
+    # Pad the measured text extent on every side to form the red backdrop:
+    # left/top move outward by subtracting, right/bottom by adding.
+    margin = 8
+    text_box = draw.textbbox(centre, label, font=font)
+    backdrop = tuple(
+        edge + sign * margin for edge, sign in zip(text_box, (-1, -1, 1, 1))
+    )
+
+    draw.rectangle(backdrop, fill="red")
+    draw.text(centre, label, fill="white", font=font)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/__init__.py
new file mode 100644
index 00000000..32f8ef6c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/__init__.py
@@ -0,0 +1,31 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.blender_rendering_manager.manager import (
+    BlenderRenderingManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.blender_rendering_manager.schemas import (
+    RenderObjectScenesRequest,
+    RenderObjectScenesResult,
+)
+
+__all__ = [
+    "BlenderRenderingManager",
+    "RenderObjectScenesRequest",
+    "RenderObjectScenesResult",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/manager.py
new file mode 100644
index 00000000..8617f297
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/manager.py
@@ -0,0 +1,175 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+import subprocess
+import tempfile
+from pathlib import Path
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.blender_rendering_manager.schemas import (
+    RenderObjectScenesRequest,
+    RenderObjectScenesResult,
+)
+
+__all__ = ["BlenderRenderingManager"]
+
+
+class BlenderRenderingManager:
+    """Render simulation scenes through Blender's background CLI."""
+
+    def render_object_scenes(
+        self,
+        request: RenderObjectScenesRequest,
+    ) -> RenderObjectScenesResult:
+        """Render a front-oblique view of a collection of Z-up scenes."""
+        output_path = request.output_path.expanduser().resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with tempfile.TemporaryDirectory(prefix="p2s_blender_render_") as tmp_dir:
+            glb_paths = self._export_y_up_scenes(  # temporary GLBs, one per scene
+                request.object_scenes,
+                Path(tmp_dir),
+            )
+            self._render_glbs(  # must run while the temporary GLBs still exist
+                glb_paths,
+                output_path,
+                timeout_seconds=request.timeout_seconds,
+            )
+        return RenderObjectScenesResult(output_path=output_path)  # resolved path
+
+    @staticmethod
+    def _export_y_up_scenes(
+        object_scenes: list[tuple[str, object]],
+        output_dir: Path,
+    ) -> list[Path]:
+        z_up_to_y_up = np.array(  # rows encode x' = x, y' = z, z' = -y
+            [
+                [1.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 1.0, 0.0],
+                [0.0, -1.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 1.0],
+            ],
+            dtype=np.float64,
+        )
+        paths: list[Path] = []
+        for object_id, scene in object_scenes:
+            path = output_dir / f"{object_id}_render.glb"  # object_id used verbatim
+            copied = scene.copy()  # the transform is applied to the copy
+            copied.apply_transform(z_up_to_y_up)
+            copied.export(path)
+            paths.append(path)
+        return paths
+
+    @classmethod
+    def _render_glbs(
+        cls,
+        glb_paths: list[Path],
+        output_path: Path,
+        *,
+        timeout_seconds: int,
+    ) -> None:
+        script = cls._front_oblique_script(glb_paths, output_path)
+        with tempfile.NamedTemporaryFile(
+            mode="w",
+            suffix=".py",
+            encoding="utf-8",
+            delete=False,  # kept after the with-block; removed in finally below
+        ) as file:
+            script_path = Path(file.name)
+            file.write(script)
+        try:
+            subprocess.run(
+                ["blender", "--background", "--python", str(script_path)],
+                check=True,  # non-zero exit raises CalledProcessError
+                timeout=timeout_seconds,  # TimeoutExpired is not caught here
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+        except subprocess.CalledProcessError as exc:
+            stderr_tail = (exc.stderr or "").strip()[-4000:]  # last 4000 chars only
+            raise RuntimeError(
+                f"Blender front-oblique render failed:\n{stderr_tail}"
+            ) from exc
+        finally:
+            script_path.unlink(missing_ok=True)  # always remove the script file
+        if not output_path.is_file():  # process succeeded but wrote no image
+            raise FileNotFoundError(f"Blender render was not written: {output_path}")
+
+    @staticmethod
+    def _front_oblique_script(glb_paths: list[Path], output_path: Path) -> str:
+        object_paths_json = json.dumps([str(path.resolve()) for path in glb_paths])
+        output_path_json = json.dumps(str(output_path.resolve()))  # parsed in script
+        return f"""\
+import bpy
+import json
+import mathutils
+
+object_paths = json.loads({object_paths_json!r})
+output_path = json.loads({output_path_json!r})
+bpy.ops.object.select_all(action="SELECT")
+bpy.ops.object.delete()
+for path in object_paths:
+    bpy.ops.import_scene.gltf(filepath=path)
+mesh_objects = [obj for obj in bpy.context.scene.objects if obj.type == "MESH"]
+if not mesh_objects:
+    raise RuntimeError("No mesh objects were imported.")
+min_corner = mathutils.Vector((float("inf"), float("inf"), float("inf")))
+max_corner = mathutils.Vector((float("-inf"), float("-inf"), float("-inf")))
+for obj in mesh_objects:
+    for corner in obj.bound_box:
+        world = obj.matrix_world @ mathutils.Vector(corner)
+        min_corner.x = min(min_corner.x, world.x)
+        min_corner.y = min(min_corner.y, world.y)
+        min_corner.z = min(min_corner.z, world.z)
+        max_corner.x = max(max_corner.x, world.x)
+        max_corner.y = max(max_corner.y, world.y)
+        max_corner.z = max(max_corner.z, world.z)
+center = (min_corner + max_corner) * 0.5
+span_x = max(max_corner.x - min_corner.x, 1.0e-4)
+span_y = max(max_corner.y - min_corner.y, 1.0e-4)
+span_z = max(max_corner.z - min_corner.z, 1.0e-4)
+camera_data = bpy.data.cameras.new("front_oblique_camera")
+camera = bpy.data.objects.new("front_oblique_camera", camera_data)
+bpy.context.collection.objects.link(camera)
+view_distance = max(span_x, span_y, span_z) * 2.4
+camera.location = (center.x, center.y - view_distance, center.z + view_distance * 0.75)
+camera.rotation_euler = (center - camera.location).to_track_quat("-Z", "Y").to_euler()
+camera_data.type = "ORTHO"
+camera_data.ortho_scale = max(span_x, span_y, span_z * 1.8) * 1.35
+bpy.context.scene.camera = camera
+light_data = bpy.data.lights.new("front_oblique_area_light", "AREA")
+light = bpy.data.objects.new("front_oblique_area_light", light_data)
+bpy.context.collection.objects.link(light)
+light.location = camera.location
+light_data.energy = 600.0
+light_data.size = max(span_x, span_y) * 2.0
+bpy.context.scene.world.color = (1.0, 1.0, 1.0)
+try:
+    bpy.context.scene.render.engine = "BLENDER_EEVEE_NEXT"
+except Exception:
+    bpy.context.scene.render.engine = "BLENDER_EEVEE"
+bpy.context.scene.render.resolution_x = 768
+bpy.context.scene.render.resolution_y = 768
+bpy.context.scene.render.film_transparent = False
+bpy.context.scene.view_settings.view_transform = "Standard"
+bpy.context.scene.view_settings.look = "Medium High Contrast"
+bpy.context.scene.render.filepath = output_path
+bpy.ops.render.render(write_still=True)
+"""
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/schemas.py
new file mode 100644
index 00000000..e3f986c7
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/schemas.py
@@ -0,0 +1,39 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+__all__ = ["RenderObjectScenesRequest", "RenderObjectScenesResult"]
+
+
+@dataclass(frozen=True)
+class RenderObjectScenesRequest:
+    """Request to render internal Z-up object scenes with Blender."""
+
+    object_scenes: list[tuple[str, Any]]  # (object_id, scene) pairs
+    output_path: Path  # where the rendered image is written
+    timeout_seconds: int = 180  # limit for the Blender subprocess
+
+
+@dataclass(frozen=True)
+class RenderObjectScenesResult:
+    """Result of rendering object scenes."""
+
+    output_path: Path  # path of the rendered image
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/__init__.py
new file mode 100644
index 00000000..ef8b9315
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/__init__.py
@@ -0,0 +1,45 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager.manager import (
+    GeometryGenerationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager.schemas import (
+    GeometryGenerationRequest,
+    GeometryGenerationResult,
+    MultiObjectGenerationObject,
+    MultiObjectGenerationRequest,
+    MultiObjectGenerationResult,
+    RgbaImageToGeometryRequest,
+    RgbaImagesToGeometriesObject,
+    RgbaImagesToGeometriesRequest,
+    RgbaImagesToGeometriesResult,
+)
+
+__all__ = [
+    "GeometryGenerationManager",
+    "GeometryGenerationRequest",
+    "GeometryGenerationResult",
+    "MultiObjectGenerationObject",
+    "MultiObjectGenerationRequest",
+    "MultiObjectGenerationResult",
+    "RgbaImageToGeometryRequest",
+    "RgbaImagesToGeometriesObject",
+    "RgbaImagesToGeometriesRequest",
+    "RgbaImagesToGeometriesResult",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/manager.py
new file mode 100644
index 00000000..d30ea09a
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/manager.py
@@ -0,0 +1,209 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from PIL import Image
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client import (
+    GeometryGenerationClient,
+    GeometryGenerationError,
+    GeometryGenerationServerRequest,
+    MultiObjectGenerationError,
+    MultiObjectGenerationServerRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager.schemas import (
+    GeometryGenerationRequest,
+    GeometryGenerationResult,
+    MultiObjectGenerationObject,
+    MultiObjectGenerationRequest,
+    MultiObjectGenerationResult,
+    RgbaImageToGeometryRequest,
+    RgbaImagesToGeometriesObject,
+    RgbaImagesToGeometriesRequest,
+    RgbaImagesToGeometriesResult,
+)
+
+
+class GeometryGenerationManager:
+    """Geometry generation domain operations backed by a generation client."""
+
+    def __init__(self, *, client: GeometryGenerationClient | None = None) -> None:
+        self.client = client or GeometryGenerationClient()  # default when omitted
+
+    def generate_single_object_mesh(
+        self,
+        request: GeometryGenerationRequest,
+    ) -> GeometryGenerationResult:
+        image_path = request.image_path.expanduser().resolve()
+        output_path = request.output_path.expanduser().resolve()
+        _validate_single_object_request(image_path=image_path, output_path=output_path)
+
+        response = self.client.generate(
+            GeometryGenerationServerRequest(
+                image_path=image_path,
+                output_path=output_path,
+            ),
+        )
+        if isinstance(response, GeometryGenerationError):  # errors come back as values
+            raise RuntimeError(response.error_message)
+
+        return GeometryGenerationResult(
+            output_path=Path(response.result.geometry_path).expanduser().resolve(),
+        )
+
+    def generate_multi_object_meshes(
+        self,
+        request: MultiObjectGenerationRequest,
+    ) -> MultiObjectGenerationResult:
+        image_path = request.image_path.expanduser().resolve()
+        output_dir = request.output_dir.expanduser().resolve()
+        _validate_multi_object_request(
+            image_path=image_path,
+            mask_paths=request.mask_paths,
+            output_dir=output_dir,
+        )
+
+        response = self.client.generate_multiple_objects(
+            MultiObjectGenerationServerRequest(
+                image_path=image_path,
+                mask_paths=[p.expanduser().resolve() for p in request.mask_paths],
+            ),
+            output_dir=output_dir,
+        )
+        if isinstance(response, MultiObjectGenerationError):  # errors come back as values
+            raise RuntimeError(response.error_message)
+
+        objects = [
+            MultiObjectGenerationObject(
+                name=item.name,
+                geometry_path=Path(item.geometry_path).expanduser().resolve(),
+                rotation_quaternion_wxyz=item.rotation_quaternion_wxyz,
+                translation=item.translation,
+                scale=item.scale,
+            )
+            for item in response.result.objects
+        ]
+        return MultiObjectGenerationResult(objects=objects)
+
+    def convert_rgba_image_to_geometry(
+        self,
+        request: RgbaImageToGeometryRequest,
+    ) -> Path:
+        image_path = request.image_path.expanduser().resolve()
+        output_path = request.output_path.expanduser().resolve()
+        _validate_rgba_image(image_path)  # rejects images without an alpha-capable mode
+
+        result = self.generate_single_object_mesh(
+            GeometryGenerationRequest(image_path=image_path, output_path=output_path)
+        )
+        return _postprocess_mesh(result.output_path)  # currently only normalises the path
+
+    def convert_rgba_images_to_geometries(
+        self,
+        request: RgbaImagesToGeometriesRequest,
+    ) -> RgbaImagesToGeometriesResult:
+        image_path = request.image_path.expanduser().resolve()
+        output_dir = request.output_dir.expanduser().resolve()
+        _validate_rgba_images_request(image_path, request.mask_paths)
+
+        result = self.generate_multi_object_meshes(  # validates its inputs again
+            MultiObjectGenerationRequest(
+                image_path=image_path,
+                mask_paths=request.mask_paths,
+                output_dir=output_dir,
+            )
+        )
+        objects = [
+            RgbaImagesToGeometriesObject(
+                name=item.name,
+                geometry_path=_postprocess_mesh(item.geometry_path),
+                rotation_quaternion_wxyz=item.rotation_quaternion_wxyz,
+                translation=item.translation,
+                scale=item.scale,
+            )
+            for item in result.objects
+        ]
+        return RgbaImagesToGeometriesResult(objects=objects)
+
+
+def _validate_single_object_request(*, image_path: Path, output_path: Path) -> None:
+    if not image_path.is_file():  # the input image must already exist
+        raise FileNotFoundError(f"Geometry generation input not found: {image_path}")
+    if output_path.suffix.lower() != ".glb":  # suffix check is case-insensitive
+        raise ValueError("Geometry generation output_path must be a GLB file path.")
+    if output_path.exists() and output_path.is_dir():  # a missing output is fine
+        raise ValueError(f"Geometry generation output_path is a directory: {output_path}")
+
+
+def _validate_multi_object_request(
+    *,
+    image_path: Path,
+    mask_paths: list[Path],
+    output_dir: Path,
+) -> None:
+    """Raise unless the image and every mask exist and output_dir can be a dir."""
+    if not image_path.is_file():
+        raise FileNotFoundError(
+            f"Multi-object geometry generation input not found: {image_path}"
+        )
+    if not mask_paths:
+        raise ValueError("mask_paths must be non-empty.")
+    # Resolve lazily so the first missing mask, in input order, is reported.
+    resolved = (candidate.expanduser().resolve() for candidate in mask_paths)
+    absent = next((path for path in resolved if not path.is_file()), None)
+    if absent is not None:
+        raise FileNotFoundError(f"Multi-object geometry mask not found: {absent}")
+    if output_dir.exists() and not output_dir.is_dir():
+        raise ValueError(
+            f"Multi-object geometry output_dir is not a directory: {output_dir}"
+        )
+
+
+def _validate_rgba_image(image_path: Path) -> None:
+    """Raise unless the file exists and its image mode can carry transparency."""
+    if not image_path.is_file():
+        raise FileNotFoundError(f"RGBA image not found: {image_path}")
+
+    with Image.open(image_path) as image:
+        mode, info = image.mode, image.info
+    if mode in {"RGBA", "LA"} or (mode == "P" and "transparency" in info):
+        return
+    raise ValueError(
+        "Geometry tool requires an image with an alpha channel, "
+        f"got mode={mode!r}: {image_path}"
+    )
+
+
+def _validate_rgba_images_request(image_path: Path, mask_paths: list[Path]) -> None:
+    """Raise unless the scene image opens and every mask path is a file."""
+    if not image_path.is_file():
+        raise FileNotFoundError(f"Scene image not found: {image_path}")
+    with Image.open(image_path):  # opened only to confirm Pillow accepts it
+        pass
+    if not mask_paths:
+        raise ValueError("mask_paths must be non-empty.")
+
+    missing = (m for m in mask_paths if not m.expanduser().resolve().is_file())
+    first_missing = next(missing, None)
+    if first_missing is not None:
+        raise FileNotFoundError(f"Mask not found: {first_missing}")
+
+
+def _postprocess_mesh(mesh_path: Path) -> Path:
+    return mesh_path.expanduser().resolve()  # only normalises the path; mesh untouched
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/schemas.py
new file mode 100644
index 00000000..81f6816a
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/schemas.py
@@ -0,0 +1,105 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class RgbaImageToGeometryRequest:
+    """Request for converting one RGBA asset image to one mesh.
+
+    Has the same two fields as ``GeometryGenerationRequest`` declared later
+    in this module.
+    """
+
+    # Path of the input RGBA asset image.
+    image_path: Path
+    # Presumably the destination path of the generated mesh -- confirm with
+    # the manager that consumes this request.
+    output_path: Path
+
+
+@dataclass(frozen=True)
+class RgbaImagesToGeometriesRequest:
+    """Request for converting a scene image with object masks to meshes.
+
+    ``frozen=True`` only prevents rebinding the attributes; the ``mask_paths``
+    list itself remains mutable.
+    """
+
+    # Path of the scene image.
+    image_path: Path
+    # Object mask files; presumably one per object to reconstruct -- confirm.
+    mask_paths: list[Path]
+    # Presumably the directory that receives the generated meshes -- confirm.
+    output_dir: Path
+
+
+@dataclass(frozen=True)
+class RgbaImagesToGeometriesObject:
+    """One generated object mesh and its scene placement.
+
+    ``frozen=True`` only prevents rebinding the attributes; the list-valued
+    fields remain mutable.
+    """
+
+    # Identifier of the generated object.
+    name: str
+    # Location of the generated mesh file.
+    geometry_path: Path
+    # Rotation as a quaternion; the field name indicates (w, x, y, z) order
+    # -- confirm against the producer.
+    rotation_quaternion_wxyz: list[float]
+    # Translation of the object; frame and units are not visible here.
+    translation: list[float]
+    # Scale of the object; presumably per-axis -- confirm.
+    scale: list[float]
+
+
+@dataclass(frozen=True)
+class RgbaImagesToGeometriesResult:
+    """Result of multi-object geometry generation."""
+
+    objects: list[RgbaImagesToGeometriesObject]
+
+    @property
+    def geometry_paths(self) -> list[Path]:
+        """Return each object's ``geometry_path``, in ``objects`` order."""
+        return [entry.geometry_path for entry in self.objects]
+
+
+@dataclass(frozen=True)
+class GeometryGenerationRequest:
+    """Request for generating one object mesh from one image.
+
+    Has the same two fields as ``RgbaImageToGeometryRequest`` declared
+    earlier in this module.
+    """
+
+    # Path of the input image.
+    image_path: Path
+    # Presumably the destination path of the generated mesh -- confirm.
+    output_path: Path
+
+
+@dataclass(frozen=True)
+class GeometryGenerationResult:
+    """Generated mesh path."""
+
+    # Path of the generated mesh.
+    output_path: Path
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationRequest:
+    """Request to generate multiple object meshes from one image and masks.
+
+    Has the same fields as ``RgbaImagesToGeometriesRequest`` declared earlier
+    in this module. ``frozen=True`` only prevents rebinding the attributes;
+    the ``mask_paths`` list itself remains mutable.
+    """
+
+    # Path of the scene image.
+    image_path: Path
+    # Object mask files; presumably one per object to reconstruct -- confirm.
+    mask_paths: list[Path]
+    # Presumably the directory that receives the generated meshes -- confirm.
+    output_dir: Path
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationObject:
+    """One generated object mesh and its scene placement.
+
+    Has the same fields as ``RgbaImagesToGeometriesObject`` declared earlier
+    in this module. ``frozen=True`` only prevents rebinding the attributes;
+    the list-valued fields remain mutable.
+    """
+
+    # Identifier of the generated object.
+    name: str
+    # Location of the generated mesh file.
+    geometry_path: Path
+    # Rotation as a quaternion; the field name indicates (w, x, y, z) order
+    # -- confirm against the producer.
+    rotation_quaternion_wxyz: list[float]
+    # Translation of the object; frame and units are not visible here.
+    translation: list[float]
+    # Scale of the object; presumably per-axis -- confirm.
+    scale: list[float]
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationResult:
+    """Result of multi-object geometry generation."""
+
+    objects: list[MultiObjectGenerationObject]
+
+    @property
+    def geometry_paths(self) -> list[Path]:
+        """Return each object's ``geometry_path``, in ``objects`` order."""
+        return [entry.geometry_path for entry in self.objects]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/__init__.py
new file mode 100644
index 00000000..7d70c81c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/__init__.py
@@ -0,0 +1,69 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.manager import (
+    DEFAULT_INPUT_UP_AXIS,
+    DEFAULT_UP_AXIS,
+    GeometryManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.schemas import (
+    AlignToAxisRequest,
+    AlignToAxisResult,
+    AlignXYLongAxisRequest,
+    AlignXYLongAxisResult,
+    CenterMeshRequest,
+    CenterMeshResult,
+    ConvertUpAxisRequest,
+    ConvertUpAxisResult,
+    DetectTabletopRequest,
+    DetectTabletopResult,
+    ExportMeshRequest,
+    ExportMeshResult,
+    LoadMeshRequest,
+    LoadMeshResult,
+    NormalizeRequest,
+    NormalizeResult,
+    PlaceAbovePlaneRequest,
+    PlaceAbovePlaneResult,
+    SupportPlaneCandidate,
+)
+
+# Public API of this package: the manager, its two default-axis constants,
+# and every request/result schema imported above.
+__all__ = [
+    "AlignToAxisRequest",
+    "AlignToAxisResult",
+    "AlignXYLongAxisRequest",
+    "AlignXYLongAxisResult",
+    "CenterMeshRequest",
+    "CenterMeshResult",
+    "ConvertUpAxisRequest",
+    "ConvertUpAxisResult",
+    "DEFAULT_INPUT_UP_AXIS",
+    "DEFAULT_UP_AXIS",
+    "DetectTabletopRequest",
+    "DetectTabletopResult",
+    "ExportMeshRequest",
+    "ExportMeshResult",
+    "GeometryManager",
+    "LoadMeshRequest",
+    "LoadMeshResult",
+    "NormalizeRequest",
+    "NormalizeResult",
+    "PlaceAbovePlaneRequest",
+    "PlaceAbovePlaneResult",
+    "SupportPlaneCandidate",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/manager.py
new file mode 100644
index 00000000..2e5c88ab
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/manager.py
@@ -0,0 +1,584 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Geometry manager for mesh I/O, transforms, and tabletop detection."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import trimesh
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.schemas import (
+    AlignToAxisRequest,
+    AlignToAxisResult,
+    AlignXYLongAxisRequest,
+    AlignXYLongAxisResult,
+    CenterMeshRequest,
+    CenterMeshResult,
+    ConvertUpAxisRequest,
+    ConvertUpAxisResult,
+    DetectTabletopRequest,
+    DetectTabletopResult,
+    ExportMeshRequest,
+    ExportMeshResult,
+    LoadMeshRequest,
+    LoadMeshResult,
+    NormalizeRequest,
+    NormalizeResult,
+    PlaceAbovePlaneRequest,
+    PlaceAbovePlaneResult,
+    SupportPlaneCandidate,
+)
+
+# Only the manager is exported by ``import *``; the default-axis constants
+# below are still importable by name.
+__all__ = ["GeometryManager"]
+
+# Fallback axes used by ``GeometryManager.convert_up_axis`` when the request
+# leaves ``input_up_axis`` / ``output_up_axis`` falsy (e.g. ``None`` or an
+# empty list). Both are module-level lists and therefore mutable and shared.
+DEFAULT_INPUT_UP_AXIS = [0.0, 1.0, 0.0]
+DEFAULT_UP_AXIS = [0.0, 0.0, 1.0]
+
+
+class GeometryManager:
+    """Manager for mesh geometry operations.
+
+    Provides typed methods for mesh I/O, axis conversion, bounding-box
+    transforms, tabletop plane detection, and PCA alignment, following
+    the same pattern as service clients.
+    """
+
+
+    @staticmethod
+    def load_mesh(request: LoadMeshRequest) -> LoadMeshResult:
+        """Read the file at ``request.mesh_path`` and return it as one mesh.
+
+        A loaded ``trimesh.Scene`` is dumped into its parts; parts exposing
+        both ``vertices`` and ``faces`` are kept and concatenated. Any other
+        loaded object is returned unchanged.
+
+        Raises:
+            FileNotFoundError: If the resolved path is not a regular file.
+            ValueError: If a loaded scene contains no mesh geometry.
+        """
+        mesh_path = request.mesh_path.expanduser().resolve()
+        if not mesh_path.is_file():
+            raise FileNotFoundError(f"Mesh file not found: {mesh_path}")
+
+        loaded = trimesh.load(mesh_path, force=None)
+        if not isinstance(loaded, trimesh.Scene):
+            return LoadMeshResult(mesh=loaded)
+
+        mesh_parts = [
+            part
+            for part in loaded.dump(concatenate=False)
+            if hasattr(part, "vertices") and hasattr(part, "faces")
+        ]
+        if mesh_parts:
+            return LoadMeshResult(mesh=trimesh.util.concatenate(mesh_parts))
+        raise ValueError(f"Scene contains no mesh geometry: {mesh_path}")
+
+    @staticmethod
+    def export_mesh(request: ExportMeshRequest) -> ExportMeshResult:
+        """Write ``request.mesh`` to ``request.output_path`` and report the path.
+
+        The output path is expanded and resolved, and missing parent
+        directories are created before ``mesh.export`` is called.
+
+        Raises:
+            FileNotFoundError: If no file exists at the target after export.
+        """
+        target = request.output_path.expanduser().resolve()
+        target.parent.mkdir(parents=True, exist_ok=True)
+        request.mesh.export(target)
+        if target.is_file():
+            return ExportMeshResult(output_path=target)
+        raise FileNotFoundError(f"Mesh was not written: {target}")
+
+
+    @staticmethod
+    def convert_up_axis(request: ConvertUpAxisRequest) -> ConvertUpAxisResult:
+        """Rotate a copy of the mesh from its input up-axis to the output up-axis.
+
+        A falsy ``input_up_axis`` / ``output_up_axis`` falls back to
+        ``DEFAULT_INPUT_UP_AXIS`` / ``DEFAULT_UP_AXIS`` respectively.
+        """
+        source_axis = request.input_up_axis or DEFAULT_INPUT_UP_AXIS
+        target_axis = request.output_up_axis or DEFAULT_UP_AXIS
+        converted = GeometryManager._align_vector_to_axis(
+            request.mesh,
+            source_axis=source_axis,
+            target_axis=target_axis,
+        )
+        return ConvertUpAxisResult(mesh=converted)
+
+    @staticmethod
+    def center_by_bbox(request: CenterMeshRequest) -> CenterMeshResult:
+        """Translate a copy of the mesh so its bounding-box center is the origin.
+
+        Returns the translated copy together with the bounding-box center the
+        mesh had before the move.
+
+        Raises:
+            ValueError: If the mesh is invalid or its bounds are not (2, 3).
+        """
+        source_mesh = request.mesh
+        GeometryManager._validate_mesh(source_mesh)
+
+        bounds = np.asarray(source_mesh.bounds, dtype=float)
+        if bounds.shape != (2, 3):
+            raise ValueError("Mesh bounds must have shape (2, 3).")
+
+        lower, upper = bounds
+        midpoint = (lower + upper) * 0.5
+        recentered = source_mesh.copy()
+        recentered.apply_translation(-midpoint)
+        return CenterMeshResult(mesh=recentered, bbox_center=midpoint.tolist())
+
+    @staticmethod
+    def align_to_axis(request: AlignToAxisRequest) -> AlignToAxisResult:
+        """Rotate a copy of the mesh so ``source_axis`` points along ``target_axis``."""
+        return AlignToAxisResult(
+            mesh=GeometryManager._align_vector_to_axis(
+                request.mesh,
+                source_axis=request.source_axis,
+                target_axis=request.target_axis,
+            )
+        )
+
+    @staticmethod
+    def place_above_plane(
+        request: PlaceAbovePlaneRequest,
+    ) -> PlaceAbovePlaneResult:
+        """Shift a copy of the mesh along Z so its AABB bottom sits at ``clearance``.
+
+        Raises:
+            ValueError: If ``clearance`` is negative or the mesh bounds are
+                not shaped (2, 3).
+        """
+        clearance = request.clearance
+        if clearance < 0.0:
+            raise ValueError("clearance must be non-negative.")
+
+        bounds = np.asarray(request.mesh.bounds, dtype=float)
+        if bounds.shape != (2, 3):
+            raise ValueError("Mesh bounds must have shape (2, 3).")
+
+        # Lowest Z of the bounding box; lifting by the difference puts it at
+        # exactly ``clearance``.
+        z_shift = clearance - float(bounds[0, 2])
+        lifted = request.mesh.copy()
+        lifted.apply_translation([0.0, 0.0, z_shift])
+        return PlaceAbovePlaneResult(mesh=lifted)
+
+    @staticmethod
+    def normalize(request: NormalizeRequest) -> NormalizeResult:
+        """Uniformly scale a mesh so its longest oriented-bbox extent is target_size.
+
+        The reference length is the largest extent of the mesh's *oriented*
+        bounding box (``bounding_box_oriented``), not of its axis-aligned box.
+
+        Args:
+            request: Carries the ``mesh`` to scale and the ``target_size``.
+
+        Returns:
+            A ``NormalizeResult`` holding the scaled copy and the applied
+            ``scale_factor``.
+
+        Raises:
+            ValueError: If ``target_size`` is not positive, or the longest
+                extent is zero, negative, or non-finite.
+        """
+        if request.target_size <= 0.0:
+            raise ValueError("target_size must be positive.")
+
+        extents = np.asarray(
+            request.mesh.bounding_box_oriented.primitive.extents, dtype=float
+        )
+        longest_extent = float(np.max(extents))
+        # A degenerate mesh would otherwise fail with a bare ZeroDivisionError
+        # or silently produce a nan/inf scale factor.
+        if not np.isfinite(longest_extent) or longest_extent <= 0.0:
+            raise ValueError(
+                "Mesh bounding-box extents must be positive and finite, "
+                f"got {extents.tolist()}."
+            )
+
+        scale_factor = request.target_size / longest_extent
+        normalized = request.mesh.copy()
+        normalized.apply_scale(scale_factor)
+        return NormalizeResult(mesh=normalized, scale_factor=scale_factor)
+
+    @staticmethod
+    def mesh_aabb_size(mesh: Any) -> Any:
+        """Return the per-axis size of the mesh's axis-aligned bounding box.
+
+        Raises:
+            ValueError: If the bounds are not shaped (2, 3) or any axis has a
+                size that is zero or negative.
+        """
+        bounds = np.asarray(mesh.bounds, dtype=np.float64)
+        if bounds.shape != (2, 3):
+            raise ValueError("Mesh bounds must have shape (2, 3).")
+        lower, upper = bounds
+        size = upper - lower
+        if (size <= 0.0).any():
+            raise ValueError(f"Mesh AABB size must be positive, got {size.tolist()}.")
+        return size
+
+    @staticmethod
+    def bbox_ratio(size: Any) -> Any:
+        """Divide bbox dimensions by their largest entry (largest becomes 1.0).
+
+        Raises:
+            ValueError: If the largest dimension is not positive.
+        """
+        dims = np.asarray(size, dtype=np.float64)
+        largest = float(np.max(dims))
+        if largest <= 0.0:
+            raise ValueError("bbox size max must be positive.")
+        return dims / largest
+
+    @staticmethod
+    def best_axis_bbox_scale_match(
+        *,
+        source_size_cm: Any,
+        target_size_cm: Any,
+    ) -> dict[str, Any]:
+        """Pick the ordering of target bbox axes whose proportions best fit the source.
+
+        Each of the six axis orderings of ``target_size_cm`` is compared with
+        ``source_size_cm`` through the mean absolute log-ratio of their
+        normalized dimensions; the ordering with the smallest error wins (the
+        earliest ordering on ties). The returned dict describes that ordering,
+        including the per-axis scales and their median as ``scale_factor``.
+
+        Raises:
+            ValueError: If either size is not shaped (3,) or has a
+                non-positive entry.
+        """
+        source = np.asarray(source_size_cm, dtype=np.float64)
+        target = np.asarray(target_size_cm, dtype=np.float64)
+        if source.shape != (3,) or target.shape != (3,):
+            raise ValueError("source_size_cm and target_size_cm must have shape (3,).")
+        if np.any(source <= 0.0) or np.any(target <= 0.0):
+            raise ValueError("source_size_cm and target_size_cm must be positive.")
+
+        source_ratio = GeometryManager.bbox_ratio(source)
+        # Every ordering of the three axes, in lexicographic order.
+        axis_orders = (
+            (0, 1, 2),
+            (0, 2, 1),
+            (1, 0, 2),
+            (1, 2, 0),
+            (2, 0, 1),
+            (2, 1, 0),
+        )
+
+        def describe(order: tuple[int, int, int]) -> dict[str, Any]:
+            reordered = target[list(order)]
+            reordered_ratio = GeometryManager.bbox_ratio(reordered)
+            mismatch = GeometryManager._mean_abs_log_ratio_error(
+                source_ratio,
+                reordered_ratio,
+            )
+            axis_scales = reordered / source
+            return {
+                "target_permutation": list(order),
+                "source_size_cm": source.tolist(),
+                "target_size_cm_original_order": target.tolist(),
+                "target_size_cm_matched_to_source_axes": reordered.tolist(),
+                "source_ratio": source_ratio.tolist(),
+                "target_ratio_matched": reordered_ratio.tolist(),
+                "per_axis_scale": axis_scales.tolist(),
+                "scale_factor": float(np.median(axis_scales)),
+                "shape_ratio_error": float(mismatch),
+            }
+
+        # min() keeps the first of several equally good candidates.
+        return min(
+            (describe(order) for order in axis_orders),
+            key=lambda candidate: candidate["shape_ratio_error"],
+        )
+
+    @staticmethod
+    def scene_to_mesh(scene: Any) -> Any:
+        """Collapse a trimesh Scene (or mesh) into a single ``trimesh.Trimesh``.
+
+        A ``Trimesh`` input is returned as is. Otherwise the scene is dumped
+        with ``concatenate=True``; if that does not already yield a
+        ``Trimesh``, the ``Trimesh`` items of the dump are concatenated.
+
+        Raises:
+            ValueError: If the dump contains no ``Trimesh`` item.
+        """
+        if isinstance(scene, trimesh.Trimesh):
+            return scene
+
+        merged = scene.dump(concatenate=True)
+        if isinstance(merged, trimesh.Trimesh):
+            return merged
+
+        parts = [part for part in merged if isinstance(part, trimesh.Trimesh)]
+        if parts:
+            return trimesh.util.concatenate(parts)
+        raise ValueError("Scene contains no mesh geometry.")
+
+    @staticmethod
+    def detect_tabletop(
+        request: DetectTabletopRequest,
+    ) -> DetectTabletopResult:
+        """Find support-plane candidates in the mesh and pick the best-scoring one.
+
+        The result carries the selected candidate, its normal after
+        orientation by ``_orient_plane_normal``, and the full candidate list.
+        """
+        mesh = request.mesh
+        plane_candidates = GeometryManager._find_support_plane_candidates(
+            mesh,
+            normal_angle_tol_deg=request.normal_angle_tol_deg,
+            plane_distance_tol=request.plane_distance_tol,
+            min_area_ratio=request.min_area_ratio,
+            max_candidates=request.max_candidates,
+        )
+        tabletop = GeometryManager._select_tabletop_plane(plane_candidates)
+        facing_normal = GeometryManager._orient_plane_normal(
+            mesh,
+            plane_normal=tabletop.normal,
+            plane_center=tabletop.center,
+        )
+        return DetectTabletopResult(
+            selected=tabletop,
+            oriented_normal=facing_normal,
+            candidates=plane_candidates,
+        )
+
+
+    @staticmethod
+    def align_xy_long_axis(
+        request: AlignXYLongAxisRequest,
+    ) -> AlignXYLongAxisResult:
+        """Yaw a copy of the mesh so its dominant XY direction lies along Y.
+
+        The dominant direction is the eigenvector with the largest eigenvalue
+        of the covariance of the XY coordinates (of all vertices, or only of
+        those used by ``request.face_indices``). The applied rotation about Z
+        is the smaller of the two that bring that direction onto +Y or -Y.
+
+        Raises:
+            ValueError: If fewer than two vertices are available or the
+                dominant direction has zero length.
+        """
+        all_vertices = np.asarray(request.mesh.vertices, dtype=float)
+        points_xy = GeometryManager._select_xy_vertices(
+            request.mesh, all_vertices, request.face_indices
+        )
+        point_count = points_xy.shape[0]
+        if point_count < 2:
+            raise ValueError(
+                "Mesh must contain at least two vertices for PCA alignment."
+            )
+
+        deviations = points_xy - np.mean(points_xy, axis=0)
+        covariance = deviations.T @ deviations / max(point_count - 1, 1)
+        eigenvalues, eigenvectors = np.linalg.eigh(covariance)
+        principal = eigenvectors[:, int(np.argmax(eigenvalues))]
+        if float(np.linalg.norm(principal)) == 0.0:
+            raise ValueError("PCA long axis is degenerate.")
+
+        heading = float(np.arctan2(principal[1], principal[0]))
+        yaw = GeometryManager._minimal_angle_to_align_axis(heading, np.pi / 2.0)
+        yaw_transform = GeometryManager._z_axis_rotation_transform(yaw)
+        rotated = request.mesh.copy()
+        rotated.apply_transform(yaw_transform)
+        return AlignXYLongAxisResult(
+            mesh=rotated,
+            yaw_angle_degrees=float(np.rad2deg(yaw)),
+        )
+
+
+    @staticmethod
+    def _align_vector_to_axis(
+        mesh: Any,
+        *,
+        source_axis: list[float],
+        target_axis: list[float],
+    ) -> Any:
+        """Return a copy of ``mesh`` rotated so ``source_axis`` maps onto ``target_axis``.
+
+        Both axes are normalized first; a zero-length axis is rejected.
+
+        Raises:
+            ValueError: If either axis has zero length.
+        """
+        source_unit = GeometryManager._normalize(np.asarray(source_axis, dtype=float))
+        target_unit = GeometryManager._normalize(np.asarray(target_axis, dtype=float))
+        # ``_normalize`` hands a zero vector back unchanged, so a zero norm
+        # here means the caller passed a zero-length axis.
+        if np.linalg.norm(source_unit) == 0:
+            raise ValueError("source_axis must be non-zero.")
+        if np.linalg.norm(target_unit) == 0:
+            raise ValueError("target_axis must be non-zero.")
+
+        rotated = mesh.copy()
+        rotated.apply_transform(
+            GeometryManager._rotation_transform_between_vectors(
+                source_unit, target_unit
+            )
+        )
+        return rotated
+
+
+    @staticmethod
+    def _find_support_plane_candidates(
+        mesh: Any,
+        *,
+        normal_angle_tol_deg: float = 8.0,
+        plane_distance_tol: float | None = None,
+        min_area_ratio: float = 0.02,
+        max_candidates: int = 24,
+    ) -> list[SupportPlaneCandidate]:
+        """Greedily group mesh faces into near-coplanar clusters and score them.
+
+        Faces are visited from largest to smallest area. Each unused face
+        seeds a cluster made of every still-unused face whose normal is within
+        ``normal_angle_tol_deg`` of the seed normal and whose center lies
+        within ``plane_distance_tol`` of the seed plane. Clusters covering at
+        least ``min_area_ratio`` of the total face area become candidates.
+
+        Args:
+            mesh: Object exposing ``vertices``, ``faces``, ``face_normals``,
+                ``triangles_center``, ``area_faces`` and ``extents``.
+            normal_angle_tol_deg: Maximum angle, in degrees, between a face
+                normal and the seed normal.
+            plane_distance_tol: Maximum offset along the seed normal between
+                a face center and the seed plane. ``None`` uses 1% of the
+                norm of ``mesh.extents``, but never less than 1e-4.
+            min_area_ratio: Minimum cluster area as a fraction of total area.
+            max_candidates: Maximum number of candidates returned.
+
+        Returns:
+            Candidates sorted by descending ``score``, truncated to
+            ``max_candidates``.
+
+        Raises:
+            ValueError: If the mesh is invalid or has no positive face area.
+        """
+        GeometryManager._validate_mesh(mesh)
+
+        normals = np.asarray(mesh.face_normals, dtype=float)
+        centers = np.asarray(mesh.triangles_center, dtype=float)
+        areas = np.asarray(mesh.area_faces, dtype=float)
+        vertices = np.asarray(mesh.vertices, dtype=float)
+        total_area = float(np.sum(areas))
+        if total_area <= 0:
+            raise ValueError("Mesh has no positive face area.")
+
+        if plane_distance_tol is None:
+            # Scale the default tolerance with the mesh size.
+            extent = float(
+                np.linalg.norm(np.asarray(mesh.extents, dtype=float))
+            )
+            plane_distance_tol = max(extent * 0.01, 1e-4)
+
+        cos_tol = float(np.cos(np.deg2rad(normal_angle_tol_deg)))
+        min_area = total_area * min_area_ratio
+        # Largest faces first, so big flat regions seed their cluster.
+        order = np.argsort(-areas)
+        used = np.zeros(len(areas), dtype=bool)
+        candidates: list[SupportPlaneCandidate] = []
+
+        for seed_index in order:
+            if used[seed_index]:
+                continue
+            seed_normal = GeometryManager._normalize(normals[seed_index])
+            if np.linalg.norm(seed_normal) == 0:
+                # A face with a zero normal cannot define a plane.
+                used[seed_index] = True
+                continue
+
+            seed_center = centers[seed_index]
+            seed_offset = float(np.dot(seed_normal, seed_center))
+            # NOTE(review): the dot-product test equals an angle test only if
+            # ``normals`` rows are unit length -- confirm for the mesh type.
+            normal_match = normals @ seed_normal >= cos_tol
+            offsets = centers @ seed_normal
+            plane_match = np.abs(offsets - seed_offset) <= plane_distance_tol
+            face_mask = normal_match & plane_match & ~used
+            face_indices = np.flatnonzero(face_mask)
+            if len(face_indices) == 0:
+                used[seed_index] = True
+                continue
+
+            # Faces are consumed even when the cluster is later rejected for
+            # being too small, so they cannot join a later cluster.
+            used[face_indices] = True
+            area = float(np.sum(areas[face_indices]))
+            if area < min_area:
+                continue
+
+            # Area-weighted mean normal and center of the cluster.
+            weighted_normal = GeometryManager._normalize(
+                np.sum(
+                    normals[face_indices] * areas[face_indices, None], axis=0
+                ),
+            )
+            center = (
+                np.sum(
+                    centers[face_indices] * areas[face_indices, None], axis=0
+                )
+                / area
+            )
+            candidate = GeometryManager._build_candidate(
+                normal=weighted_normal,
+                center=center,
+                area=area,
+                face_indices=face_indices,
+                vertices=vertices,
+            )
+            candidates.append(candidate)
+
+        candidates.sort(key=lambda c: c.score, reverse=True)
+        return candidates[:max_candidates]
+
+    @staticmethod
+    def _select_tabletop_plane(
+        candidates: list[SupportPlaneCandidate],
+    ) -> SupportPlaneCandidate:
+        """Return the candidate with the highest ``score`` (first one on ties).
+
+        Raises:
+            ValueError: If ``candidates`` is empty.
+        """
+        if candidates:
+            return max(candidates, key=lambda plane: plane.score)
+        raise ValueError("No support-plane candidates were found.")
+
+    @staticmethod
+    def _orient_plane_normal(
+        mesh: Any,
+        *,
+        plane_normal: list[float],
+        plane_center: list[float],
+    ) -> list[float]:
+        """Return the unit plane normal pointing away from most of the mesh.
+
+        Vertex distances to the plane are summed separately on each side
+        (ignoring vertices within 1e-6 of it). If the side the normal points
+        to carries the larger sum, the normal is flipped.
+
+        Raises:
+            ValueError: If the mesh is invalid or ``plane_normal`` is zero.
+        """
+        GeometryManager._validate_mesh(mesh)
+
+        unit_normal = GeometryManager._normalize(
+            np.asarray(plane_normal, dtype=float)
+        )
+        origin = np.asarray(plane_center, dtype=float)
+        if np.linalg.norm(unit_normal) == 0:
+            raise ValueError("plane_normal must be non-zero.")
+
+        heights = (np.asarray(mesh.vertices, dtype=float) - origin) @ unit_normal
+        mass_in_front = float(np.sum(np.abs(heights[heights > 1e-6])))
+        mass_behind = float(np.sum(np.abs(heights[heights < -1e-6])))
+
+        facing = -unit_normal if mass_in_front > mass_behind else unit_normal
+        return [float(component) for component in facing]
+
+    @staticmethod
+    def _build_candidate(
+        *,
+        normal: Any,
+        center: Any,
+        area: float,
+        face_indices: Any,
+        vertices: Any,
+    ) -> SupportPlaneCandidate:
+        """Assemble a ``SupportPlaneCandidate`` and compute its score.
+
+        Vertices are split by the sign of their distance to the plane
+        (ignoring those within 1e-6). The score is the cluster area times an
+        asymmetry factor: the ratio of the larger to the smaller per-side
+        distance sum, capped at 10.
+        """
+        heights = (vertices - center) @ normal
+        is_below = heights < -1e-6
+        is_above = heights > 1e-6
+        below_total = float(np.sum(np.abs(heights[is_below])))
+        above_total = float(np.sum(np.abs(heights[is_above])))
+
+        # The 1e-9 terms keep the ratio finite when one side is empty.
+        dominant = max(below_total, above_total) + 1e-9
+        minor = min(below_total, above_total) + 1e-9
+        asymmetry = min(dominant / minor, 10.0)
+
+        return SupportPlaneCandidate(
+            normal=list(map(float, normal)),
+            center=list(map(float, center)),
+            area=area,
+            face_indices=list(map(int, face_indices)),
+            below_vertex_count=int(np.count_nonzero(is_below)),
+            above_vertex_count=int(np.count_nonzero(is_above)),
+            below_area_score=below_total,
+            above_area_score=above_total,
+            score=float(area * asymmetry),
+        )
+
+
+    @staticmethod
+    def _select_xy_vertices(
+        mesh: Any,
+        vertices: Any,
+        face_indices: list[int] | None,
+    ) -> Any:
+        """Return the XY columns of the vertices to use for alignment.
+
+        With ``face_indices=None`` every vertex is used; otherwise only the
+        unique vertices referenced by the selected faces, in ascending index
+        order.
+        """
+        if face_indices is None:
+            return vertices[:, :2]
+
+        all_faces = np.asarray(mesh.faces, dtype=int)
+        chosen_faces = all_faces[np.asarray(face_indices, dtype=int)]
+        # np.unique flattens its input and returns sorted unique indices.
+        vertex_ids = np.unique(chosen_faces)
+        return vertices[vertex_ids, :2]
+
+    @staticmethod
+    def _minimal_angle_to_align_axis(
+        source_angle: float, target_angle: float
+    ) -> float:
+        """Return the smallest rotation taking an undirected axis onto the target.
+
+        An axis at ``source_angle`` is equivalent to one at
+        ``source_angle + pi``, so both the direct rotation and the one to the
+        opposite direction are considered; the one with the smaller magnitude
+        is returned (the direct one on ties).
+        """
+        direct = GeometryManager._wrap_to_pi(target_angle - source_angle)
+        flipped = GeometryManager._wrap_to_pi(
+            target_angle + np.pi - source_angle
+        )
+        return flipped if abs(flipped) < abs(direct) else direct
+
+    @staticmethod
+    def _wrap_to_pi(angle: float) -> float:
+        """Wrap an angle in radians into the half-open interval [-pi, pi)."""
+        full_turn = 2.0 * np.pi
+        return (angle + np.pi) % full_turn - np.pi
+
+    @staticmethod
+    def _z_axis_rotation_transform(angle: float) -> Any:
+        """Build the 4x4 homogeneous matrix rotating by ``angle`` radians about Z."""
+        cos_a = float(np.cos(angle))
+        sin_a = float(np.sin(angle))
+        transform = np.eye(4)
+        # Only the upper-left 2x2 block differs from the identity.
+        transform[0, 0] = cos_a
+        transform[0, 1] = -sin_a
+        transform[1, 0] = sin_a
+        transform[1, 1] = cos_a
+        return transform
+
+
+    @staticmethod
+    def _rotation_transform_between_vectors(
+        source: Any, target: Any
+    ) -> Any:
+        """Return a 4x4 rotation taking the direction ``source`` onto ``target``.
+
+        The dot product is used directly as the cosine of the angle, so both
+        vectors are expected to be unit length. Nearly parallel vectors give
+        the identity; nearly opposite vectors give a half turn about an axis
+        orthogonal to ``source``.
+        """
+        cosine = float(np.clip(np.dot(source, target), -1.0, 1.0))
+        transform = np.eye(4)
+        if cosine > 1.0 - 1e-8:
+            return transform
+
+        if cosine < -1.0 + 1e-8:
+            # The cross product vanishes for opposite vectors, so any
+            # perpendicular axis is used instead.
+            axis = GeometryManager._orthogonal_axis(source)
+            angle = np.pi
+        else:
+            axis = GeometryManager._normalize(np.cross(source, target))
+            angle = float(np.arccos(cosine))
+        transform[:3, :3] = GeometryManager._axis_angle_rotation(axis, angle)
+        return transform
+
+    @staticmethod
+    def _axis_angle_rotation(axis: Any, angle: float) -> Any:
+        """Return the 3x3 matrix rotating by ``angle`` radians about ``axis``.
+
+        ``axis`` is normalized first; the matrix is written out entry by
+        entry in the axis-angle (Rodrigues) form.
+        """
+        x, y, z = GeometryManager._normalize(axis)
+        c = float(np.cos(angle))
+        s = float(np.sin(angle))
+        one_c = 1.0 - c
+
+        # Symmetric parts of the off-diagonal entries.
+        xy = x * y * one_c
+        xz = x * z * one_c
+        yz = y * z * one_c
+        return np.array(
+            [
+                [c + x * x * one_c, xy - z * s, xz + y * s],
+                [xy + z * s, c + y * y * one_c, yz - x * s],
+                [xz - y * s, yz + x * s, c + z * z * one_c],
+            ],
+            dtype=float,
+        )
+
+    @staticmethod
+    def _orthogonal_axis(vector: Any) -> Any:
+        """Return a unit vector perpendicular to ``vector``.
+
+        The X axis is the reference for the cross product unless ``vector``
+        is close to it (absolute dot product above 0.9), in which case the Y
+        axis is used.
+        """
+        x_axis = np.array([1.0, 0.0, 0.0])
+        near_x = abs(float(np.dot(vector, x_axis))) > 0.9
+        reference = np.array([0.0, 1.0, 0.0]) if near_x else x_axis
+        return GeometryManager._normalize(np.cross(vector, reference))
+
+    @staticmethod
+    def _normalize(vector: Any) -> Any:
+        """Return ``vector`` scaled to unit length; a zero vector is returned as is."""
+        length = float(np.linalg.norm(vector))
+        return vector / length if length != 0.0 else vector
+
+    @staticmethod
+    def _mean_abs_log_ratio_error(lhs: Any, rhs: Any) -> float:
+        """Return the mean of ``|log(lhs / rhs)|`` over all entries.
+
+        Both inputs are clamped from below at 1e-6 before the division.
+        """
+        floor = 1.0e-6
+        numerator = np.maximum(np.asarray(lhs, dtype=np.float64), floor)
+        denominator = np.maximum(np.asarray(rhs, dtype=np.float64), floor)
+        log_ratio = np.log(numerator / denominator)
+        return float(np.mean(np.abs(log_ratio)))
+
+    @staticmethod
+    def _validate_mesh(mesh: Any) -> None:
+        """Raise ``ValueError`` unless ``mesh`` has non-empty vertices and faces."""
+        looks_like_mesh = hasattr(mesh, "vertices") and hasattr(mesh, "faces")
+        if not looks_like_mesh:
+            raise ValueError("Loaded geometry is not a mesh.")
+        if len(mesh.vertices) == 0 or len(mesh.faces) == 0:
+            raise ValueError("Mesh must contain vertices and faces.")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/scene_geometry.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/scene_geometry.py
new file mode 100644
index 00000000..be502fbb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/scene_geometry.py
@@ -0,0 +1,567 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+    DetectTabletopRequest,
+    GeometryManager,
+)
+
+# Names made available to ``from ... import *``. Every entry is an
+# underscore-prefixed helper, so listing it here is what makes it star-importable.
+# The table-fit helpers defined further down in this module
+# (``_detect_table_fit_support_quad``, ``_largest_centered_table_fit_inscribed_rect``,
+# ``_load_table_fit_scene_internal_z`` and the ``_table_fit_*`` functions) are
+# not listed.
+__all__ = [
+    "_compose_json_matrices",
+    "_compose_simready_to_aligned_matrix",
+    "_decompose_transform_matrix",
+    "_aabb_bottom_to_xy_plane_transform",
+    "_aabb_center",
+    "_compose_sam3d_multi_object_transform",
+    "_copy_scene_with_transform",
+    "_estimate_support_normal",
+    "_glb_to_sam3d_local_matrix",
+    "_load_scene_with_transform",
+    "_matrix_from_json",
+    "_quaternion_wxyz_to_matrix",
+    "_rotation_between_vectors",
+    "_row_linear_to_trimesh_matrix",
+    "_scale_transform",
+    "_scene_to_mesh",
+    "_support_normal_flip_transform",
+    "_transform_point",
+    "_validate_vector",
+    "_xy_aabb_center",
+    "_xy_aabb_size",
+    "_z_up_to_glb_y_up_transform",
+    "_z_yaw_transform",
+]
+
+
+def _compose_json_matrices(*values: Any) -> list[list[float]]:
+    """Multiply 4x4 matrices left to right and return the product as nested lists.
+
+    All inputs are converted to float64 arrays first. If any of them is not of
+    shape (4, 4) an empty list is returned; with no inputs the result is the
+    identity.
+    """
+    factors = [np.asarray(value, dtype=np.float64) for value in values]
+    if not all(factor.shape == (4, 4) for factor in factors):
+        return []
+    product = np.eye(4, dtype=np.float64)
+    for factor in factors:
+        product = product @ factor
+    return product.tolist()
+
+
+def _compose_simready_to_aligned_matrix(
+    *, raw_to_aligned_matrix: Any, raw_to_simready_matrix: Any
+) -> list[list[float]]:
+    """Return ``raw_to_aligned @ inv(raw_to_simready)`` as nested lists.
+
+    An empty list is returned when either input is not of shape (4, 4) or when
+    ``raw_to_simready_matrix`` cannot be inverted.
+    """
+    aligned_from_raw = np.asarray(raw_to_aligned_matrix, dtype=np.float64)
+    simready_from_raw = np.asarray(raw_to_simready_matrix, dtype=np.float64)
+    expected_shape = (4, 4)
+    shapes_ok = (
+        aligned_from_raw.shape == expected_shape
+        and simready_from_raw.shape == expected_shape
+    )
+    if not shapes_ok:
+        return []
+    try:
+        raw_from_simready = np.linalg.inv(simready_from_raw)
+    except np.linalg.LinAlgError:
+        return []
+    return (aligned_from_raw @ raw_from_simready).tolist()
+
+
+def _decompose_transform_matrix(matrix_value: Any) -> dict[str, Any]:
+    """Split a 4x4 transform into translation, rotation and per-axis scale.
+
+    Scale is the Euclidean norm of each column of the upper-left 3x3 block, so
+    it is never negative. Columns whose norm is at most 1e-12 keep the matching
+    identity column in the rotation. An input that does not convert to shape
+    (4, 4) yields empty lists for all three keys.
+    """
+    transform = np.asarray(matrix_value, dtype=np.float64)
+    if transform.shape != (4, 4):
+        return {"translation": [], "rotation_matrix": [], "scale": []}
+    upper_left = transform[:3, :3]
+    column_norms = np.linalg.norm(upper_left, axis=0)
+    usable = column_norms > 1.0e-12
+    rotation = np.eye(3, dtype=np.float64)
+    # Divide only the columns that have a usable length; the rest stay identity.
+    rotation[:, usable] = upper_left[:, usable] / column_norms[usable]
+    return {
+        "translation": transform[:3, 3].tolist(),
+        "rotation_matrix": rotation.tolist(),
+        "scale": column_norms.tolist(),
+    }
+
+
+def _support_normal_flip_transform(
+    *,
+    support_normal: np.ndarray,
+    normal_alignment: np.ndarray,
+) -> np.ndarray:
+    """Return the transform that replaces ``normal_alignment`` with its flipped twin.
+
+    The result is ``R(-support_normal -> +Z) @ inv(normal_alignment)``, i.e. it
+    undoes ``normal_alignment`` and then aligns the negated normal with +Z.
+    """
+    up_axis = np.array([0.0, 0.0, 1.0], dtype=np.float64)
+    flipped_alignment = _rotation_between_vectors(-support_normal, up_axis)
+    undo_alignment = np.linalg.inv(normal_alignment)
+    return flipped_alignment @ undo_alignment
+
+
+def _z_yaw_transform(yaw_degrees: float) -> np.ndarray:
+    """Return the 4x4 rotation by ``yaw_degrees`` degrees about the Z axis."""
+    theta = np.deg2rad(yaw_degrees)
+    cos_t, sin_t = float(np.cos(theta)), float(np.sin(theta))
+    matrix = np.eye(4, dtype=np.float64)
+    # Only the XY block differs from the identity.
+    matrix[0, 0], matrix[0, 1] = cos_t, -sin_t
+    matrix[1, 0], matrix[1, 1] = sin_t, cos_t
+    return matrix
+
+
+def _z_up_to_glb_y_up_transform() -> np.ndarray:
+    """Return the 4x4 rotation that carries the +Z axis onto the +Y axis."""
+    z_axis = np.array([0.0, 0.0, 1.0], dtype=np.float64)
+    y_axis = np.array([0.0, 1.0, 0.0], dtype=np.float64)
+    return _rotation_between_vectors(z_axis, y_axis)
+
+
+def _copy_scene_with_transform(scene: Any, transform: np.ndarray) -> Any:
+    """Return ``scene.copy()`` after calling ``apply_transform(transform)`` on the copy."""
+    duplicate = scene.copy()
+    duplicate.apply_transform(transform)
+    return duplicate
+
+
+def _matrix_from_json(value: Any, *, name: str) -> np.ndarray:
+    """Convert ``value`` to a float64 array and require shape (4, 4).
+
+    Raises:
+        ValueError: If the converted array has any other shape; ``name`` is
+            used in the message.
+    """
+    candidate = np.asarray(value, dtype=np.float64)
+    if candidate.shape == (4, 4):
+        return candidate
+    raise ValueError(f"{name} must be a 4x4 matrix.")
+
+
+def _load_scene_with_transform(
+    *,
+    path: Path,
+    transform: np.ndarray,
+    trimesh: Any,
+) -> Any:
+    """Load ``path`` via ``trimesh.load(..., force="scene")`` and transform it in place.
+
+    ``trimesh`` is the module (or a compatible object) supplied by the caller.
+    """
+    loaded = trimesh.load(path, force="scene")
+    loaded.apply_transform(transform)
+    return loaded
+
+
+def _scene_to_mesh(scene: Any, *, trimesh: Any) -> Any:
+    """Collapse ``scene`` into a single ``trimesh.Trimesh``.
+
+    A ``Trimesh`` input is returned unchanged. Otherwise the scene is dumped
+    with ``concatenate=True``; if that does not already give a ``Trimesh``, the
+    ``Trimesh`` items of the dump are concatenated.
+
+    Raises:
+        ValueError: If the dump contains no ``Trimesh`` items.
+    """
+    mesh_type = trimesh.Trimesh
+    if isinstance(scene, mesh_type):
+        return scene
+    flattened = scene.dump(concatenate=True)
+    if isinstance(flattened, mesh_type):
+        return flattened
+    parts = [part for part in flattened if isinstance(part, mesh_type)]
+    if parts:
+        return trimesh.util.concatenate(parts)
+    raise ValueError("Scene contains no mesh geometry.")
+
+
+def _estimate_support_normal(mesh: Any) -> np.ndarray:
+    """Estimate a unit support normal for ``mesh``.
+
+    The oriented normal from ``GeometryManager.detect_tabletop`` is preferred.
+    If detection raises or yields a zero-length normal, the normal of the
+    largest-area face is used; +Z is the last resort when the mesh has no face
+    data or that normal has zero length.
+    """
+    manager = GeometryManager()
+    try:
+        detected = manager.detect_tabletop(DetectTabletopRequest(mesh=mesh))
+        candidate = np.asarray(detected.oriented_normal, dtype=np.float64)
+        length = np.linalg.norm(candidate)
+        if length > 0.0:
+            return candidate / length
+    except Exception:
+        # Deliberate best-effort: any detection failure falls through to the
+        # largest-face heuristic below.
+        pass
+
+    z_up = np.array([0.0, 0.0, 1.0], dtype=np.float64)
+    face_normals = np.asarray(mesh.face_normals, dtype=np.float64)
+    face_areas = np.asarray(mesh.area_faces, dtype=np.float64)
+    if face_normals.size == 0 or face_areas.size == 0:
+        return z_up
+    largest_face_normal = face_normals[int(np.argmax(face_areas))]
+    length = np.linalg.norm(largest_face_normal)
+    return z_up if length == 0.0 else largest_face_normal / length
+
+
+def _rotation_between_vectors(source: np.ndarray, target: np.ndarray) -> np.ndarray:
+    """Return the 4x4 rotation turning the direction of ``source`` onto ``target``.
+
+    Both vectors are normalised internally, so only their directions matter.
+    Near-parallel inputs (cross-product norm below 1e-8) return the identity;
+    near-antiparallel inputs return a half turn about an axis perpendicular to
+    ``source``.
+
+    Args:
+        source: 3-vector giving the starting direction.
+        target: 3-vector giving the desired direction.
+
+    Returns:
+        A homogeneous 4x4 matrix with the rotation in the upper-left 3x3 block
+        and no translation.
+
+    Raises:
+        ValueError: If either vector has zero or non-finite length. Such input
+            would otherwise divide by zero and yield a NaN-filled matrix.
+    """
+    source = np.asarray(source, dtype=np.float64)
+    target = np.asarray(target, dtype=np.float64)
+    source_length = float(np.linalg.norm(source))
+    target_length = float(np.linalg.norm(target))
+    for label, length in (("source", source_length), ("target", target_length)):
+        if not np.isfinite(length) or length <= 0.0:
+            raise ValueError(f"{label} must be a non-zero, finite vector.")
+    source = source / source_length
+    target = target / target_length
+
+    cross = np.cross(source, target)
+    # For unit vectors |cross| is sin(angle) and the dot product is cos(angle);
+    # using them directly avoids the precision loss of an arccos round trip.
+    sin_angle = float(np.linalg.norm(cross))
+    cos_angle = float(np.clip(np.dot(source, target), -1.0, 1.0))
+    if sin_angle < 1e-8:
+        if cos_angle > 0.0:
+            return np.eye(4, dtype=np.float64)
+        # Antiparallel: the rotation axis is not unique, so pick any axis
+        # perpendicular to source and rotate by exactly 180 degrees.
+        helper = np.array([1.0, 0.0, 0.0], dtype=np.float64)
+        if abs(float(np.dot(source, helper))) > 0.9:
+            helper = np.array([0.0, 1.0, 0.0], dtype=np.float64)
+        cross = np.cross(source, helper)
+        axis = cross / np.linalg.norm(cross)
+        sin_angle, cos_angle = 0.0, -1.0
+    else:
+        axis = cross / sin_angle
+
+    skew = np.array(
+        [
+            [0.0, -axis[2], axis[1]],
+            [axis[2], 0.0, -axis[0]],
+            [-axis[1], axis[0], 0.0],
+        ],
+        dtype=np.float64,
+    )
+    # Rodrigues' formula.
+    rotation = (
+        np.eye(3, dtype=np.float64)
+        + sin_angle * skew
+        + (1.0 - cos_angle) * (skew @ skew)
+    )
+    transform = np.eye(4, dtype=np.float64)
+    transform[:3, :3] = rotation
+    return transform
+
+
+def _transform_point(transform: np.ndarray, point: np.ndarray) -> np.ndarray:
+    """Apply a 4x4 homogeneous ``transform`` to a 3D ``point``.
+
+    The point is lifted to ``[x, y, z, 1]`` and the first three components of
+    the product are returned.
+    """
+    lifted = np.ones(4, dtype=np.float64)
+    lifted[0:3] = point
+    mapped = transform @ lifted
+    return mapped[:3]
+
+
+def _aabb_center(bounds: np.ndarray) -> np.ndarray:
+    """Return the midpoint of ``bounds[0]`` and ``bounds[1]`` as float64."""
+    lower = np.asarray(bounds[0], dtype=np.float64)
+    upper = np.asarray(bounds[1], dtype=np.float64)
+    return 0.5 * (lower + upper)
+
+
+def _xy_aabb_center(bounds: np.ndarray) -> np.ndarray:
+    """Return the midpoint of the first two columns of rows 0 and 1 of ``bounds``."""
+    corners = np.asarray(bounds, dtype=np.float64)
+    lower_xy, upper_xy = corners[0, :2], corners[1, :2]
+    return 0.5 * (lower_xy + upper_xy)
+
+
+def _xy_aabb_size(bounds: np.ndarray) -> np.ndarray:
+    """Return the XY extent ``bounds[1] - bounds[0]``, clamped to at least 1e-6."""
+    corners = np.asarray(bounds, dtype=np.float64)
+    extent_xy = corners[1, :2] - corners[0, :2]
+    return np.maximum(extent_xy, 1e-6)
+
+
+def _aabb_bottom_to_xy_plane_transform(bounds: np.ndarray) -> np.ndarray:
+    """Return the 4x4 translation along Z by ``-bounds[0][2]``.
+
+    Applying it moves the minimum-Z face of the box onto the z = 0 plane.
+    """
+    lowest_z = float(np.asarray(bounds, dtype=np.float64)[0][2])
+    shift = np.eye(4, dtype=np.float64)
+    shift[2, 3] = -lowest_z
+    return shift
+
+
+def _scale_transform(scale: float) -> np.ndarray:
+    """Return the 4x4 uniform scaling matrix for ``scale`` (no translation)."""
+    factor = float(scale)
+    matrix = np.eye(4, dtype=np.float64)
+    matrix[:3, :3] *= factor
+    return matrix
+
+
+def _compose_sam3d_multi_object_transform(
+    *,
+    rotation_quaternion_wxyz: list[float],
+    translation: list[float],
+    scale: list[float],
+) -> np.ndarray:
+    """Compose the transform equivalent to the old baked multi-object export.
+
+    The row-vector linear part is ``basis @ diag(scale) @ rotation``, where
+    ``basis`` comes from ``_glb_to_sam3d_local_matrix``; it is then converted
+    to a 4x4 matrix together with ``translation``.
+    """
+    rotation_3x3 = _quaternion_wxyz_to_matrix(rotation_quaternion_wxyz)
+    scale_vector = _validate_vector(scale, expected_len=3, name="scale")
+    basis = _glb_to_sam3d_local_matrix()
+    row_linear = basis @ np.diag(scale_vector) @ rotation_3x3
+    return _row_linear_to_trimesh_matrix(
+        linear_row=row_linear,
+        translation=translation,
+    )
+
+
+def _row_linear_to_trimesh_matrix(
+    *,
+    linear_row: np.ndarray,
+    translation: list[float],
+) -> np.ndarray:
+    """Convert a row-vector linear transform to trimesh's 4x4 matrix format.
+
+    The transpose of ``linear_row`` becomes the upper-left 3x3 block and the
+    validated ``translation`` becomes the last column.
+    """
+    offset = _validate_vector(translation, expected_len=3, name="translation")
+    matrix = np.eye(4, dtype=np.float64)
+    matrix[:3, 3] = offset
+    matrix[:3, :3] = linear_row.T
+    return matrix
+
+
+def _validate_vector(
+    values: list[float],
+    *,
+    expected_len: int,
+    name: str,
+) -> np.ndarray:
+    """Validate and convert a numeric vector.
+
+    Raises:
+        ValueError: If ``values`` does not have exactly ``expected_len`` items.
+    """
+    if len(values) == expected_len:
+        return np.asarray(values, dtype=np.float64)
+    raise ValueError(f"{name} must have {expected_len} values")
+
+
+def _glb_to_sam3d_local_matrix() -> np.ndarray:
+    """Return the basis conversion used by the old baked multi-object exporter.
+
+    The matrix is a constant 3x3 axis swap with one sign flip.
+    """
+    first_row = [1.0, 0.0, 0.0]
+    second_row = [0.0, 0.0, 1.0]
+    third_row = [0.0, -1.0, 0.0]
+    return np.array([first_row, second_row, third_row], dtype=np.float64)
+
+
+def _quaternion_wxyz_to_matrix(quaternion: list[float]) -> np.ndarray:
+    """Convert a wxyz quaternion to a 3x3 rotation matrix.
+
+    The quaternion is normalised before use.
+
+    Raises:
+        ValueError: If it does not have four components or has zero norm.
+    """
+    if len(quaternion) != 4:
+        raise ValueError("rotation_quaternion_wxyz must have 4 values")
+    w, x, y, z = [float(component) for component in quaternion]
+    magnitude = np.sqrt(w * w + x * x + y * y + z * z)
+    if magnitude == 0.0:
+        raise ValueError("rotation quaternion must be non-zero")
+    w, x, y, z = w / magnitude, x / magnitude, y / magnitude, z / magnitude
+
+    # Pairwise products shared by the matrix entries below.
+    xx, yy, zz = x * x, y * y, z * z
+    xy, xz, yz = x * y, x * z, y * z
+    xw, yw, zw = x * w, y * w, z * w
+    rows = [
+        [1.0 - 2.0 * (yy + zz), 2.0 * (xy - zw), 2.0 * (xz + yw)],
+        [2.0 * (xy + zw), 1.0 - 2.0 * (xx + zz), 2.0 * (yz - xw)],
+        [2.0 * (xz - yw), 2.0 * (yz + xw), 1.0 - 2.0 * (xx + yy)],
+    ]
+    return np.array(rows, dtype=np.float64)
+
+
+def _detect_table_fit_support_quad(
+    mesh: Any,
+    *,
+    target_aspect: float,
+) -> dict[str, Any]:
+    """Fit a rectangle of the given aspect ratio inside the detected support surface.
+
+    The vertices of the faces selected by ``GeometryManager.detect_tabletop``
+    are projected onto XY, their convex hull is taken, and the largest
+    centred rectangle found by sampling is reported. ``target_aspect`` is
+    clamped to at least 1e-6. The reported centre Z is the mean Z of the
+    support vertices.
+    """
+    detection = GeometryManager().detect_tabletop(DetectTabletopRequest(mesh=mesh))
+    face_table = np.asarray(mesh.faces, dtype=np.int64)
+    vertex_table = np.asarray(mesh.vertices, dtype=np.float64)
+    support_face_ids = np.asarray(detection.selected.face_indices, dtype=np.int64)
+    # De-duplicate the vertex ids referenced by the support faces.
+    support_points = vertex_table[np.unique(face_table[support_face_ids])]
+    footprint = _table_fit_convex_hull_2d(support_points[:, :2])
+    rect = _largest_centered_table_fit_inscribed_rect(
+        footprint,
+        target_aspect=max(float(target_aspect), 1.0e-6),
+    )
+    mean_z = float(np.mean(support_points[:, 2]))
+    center_xy = rect["center_xy"]
+    yaw = rect["yaw_radians"]
+    return {
+        "method": "sampled_centered_inscribed_rectangle_on_support_convex_hull",
+        "normal": detection.oriented_normal,
+        "area": float(detection.selected.area),
+        "center": [center_xy[0], center_xy[1], mean_z],
+        "center_xy": center_xy,
+        "size_xy": rect["size_xy"],
+        "yaw_radians": yaw,
+        "yaw_degrees": float(np.rad2deg(yaw)),
+        "corners_xy": rect["corners_xy"],
+        "support_hull_xy": footprint.tolist(),
+    }
+
+
+def _largest_centered_table_fit_inscribed_rect(
+    hull_xy: np.ndarray,
+    *,
+    target_aspect: float,
+    yaw_samples: int = 180,
+) -> dict[str, Any]:
+    """Search for the largest fixed-aspect rectangle inside a convex hull.
+
+    For each of ``yaw_samples`` orientations in ``[0, pi)`` and each of two
+    candidate centres (vertex mean and bounding-box centre of the hull), a
+    40-step bisection grows a rectangle of width ``target_aspect * depth``
+    until one of its corners leaves the hull. The candidate with the largest
+    area wins; ties keep the earlier candidate.
+
+    Args:
+        hull_xy: ``(N, 2)`` array of convex hull vertices.
+        target_aspect: Width-to-depth ratio of the rectangle; must be non-zero
+            because the search divides by it.
+        yaw_samples: Number of orientations to test.
+
+    Returns:
+        A dict with ``center_xy``, ``size_xy``, ``yaw_radians``, ``corners_xy``
+        and ``area``.
+
+    Raises:
+        ValueError: If the hull has fewer than 3 points or no candidate was
+            evaluated.
+    """
+    if hull_xy.shape[0] < 3:
+        raise ValueError("Support hull must contain at least 3 points.")
+    candidate_centers = [
+        np.mean(hull_xy, axis=0),
+        0.5 * (np.min(hull_xy, axis=0) + np.max(hull_xy, axis=0)),
+    ]
+    best: dict[str, Any] | None = None
+    for yaw in np.linspace(0.0, np.pi, yaw_samples, endpoint=False):
+        to_local = _table_fit_rot2(-yaw)
+        to_world = _table_fit_rot2(yaw)
+        local_hull = hull_xy @ to_local.T
+        # The rotated bounding box depends only on yaw, so the bisection upper
+        # bound is computed once here rather than once per candidate centre.
+        local_extent = np.max(local_hull, axis=0) - np.min(local_hull, axis=0)
+        upper_start = float(
+            max(local_extent[0] / target_aspect, local_extent[1], 1.0e-6)
+        )
+        for center_world in candidate_centers:
+            center_local = center_world @ to_local.T
+            lo, hi = 0.0, upper_start
+            for _ in range(40):
+                mid = 0.5 * (lo + hi)
+                trial_corners = (
+                    _table_fit_rect_corners(
+                        center=center_local,
+                        width=target_aspect * mid,
+                        depth=mid,
+                    )
+                    @ to_world.T
+                )
+                fits = all(
+                    _table_fit_point_in_convex_polygon(corner, hull_xy)
+                    for corner in trial_corners
+                )
+                if fits:
+                    lo = mid
+                else:
+                    hi = mid
+            # ``lo`` is the largest depth known to fit for this yaw and centre.
+            width = target_aspect * lo
+            depth = lo
+            area = width * depth
+            corners_world = (
+                _table_fit_rect_corners(center=center_local, width=width, depth=depth)
+                @ to_world.T
+            )
+            if best is None or area > float(best["area"]):
+                best = {
+                    "center_xy": center_world.tolist(),
+                    "size_xy": [float(width), float(depth)],
+                    "yaw_radians": float(yaw),
+                    "corners_xy": corners_world.tolist(),
+                    "area": float(area),
+                }
+    if best is None:
+        raise ValueError("Failed to estimate an inscribed support rectangle.")
+    return best
+
+
+def _load_table_fit_scene_internal_z(
+    path: Path,
+    *,
+    trimesh: Any,
+    y_to_z: np.ndarray,
+) -> Any:
+    """Load the file at ``path`` as a scene and apply ``y_to_z`` to it in place.
+
+    Raises:
+        FileNotFoundError: If ``path`` is not an existing file.
+    """
+    if path.is_file():
+        loaded = trimesh.load(path, force="scene")
+        loaded.apply_transform(y_to_z)
+        return loaded
+    raise FileNotFoundError(f"GLB not found: {path}")
+
+
+def _table_fit_scene_union_bounds(scenes: list[Any], *, trimesh: Any) -> np.ndarray:
+    """Return the ``(2, dims)`` min/max bounds enclosing every scene in ``scenes``.
+
+    Each scene is collapsed with ``_scene_to_mesh`` and its ``bounds`` read;
+    row 0 of the result is the element-wise minimum of the lower corners and
+    row 1 the element-wise maximum of the upper corners.
+    """
+    lower_corners = []
+    upper_corners = []
+    for scene in scenes:
+        box = np.asarray(
+            _scene_to_mesh(scene, trimesh=trimesh).bounds, dtype=np.float64
+        )
+        lower_corners.append(box[0])
+        upper_corners.append(box[1])
+    union_min = np.vstack(lower_corners).min(axis=0)
+    union_max = np.vstack(upper_corners).max(axis=0)
+    return np.vstack([union_min, union_max])
+
+
+def _table_fit_bounds_xy_manifest(
+    bounds: np.ndarray,
+    *,
+    unit_scale: float,
+) -> dict[str, Any]:
+    """Describe the XY footprint of ``bounds`` after multiplying by ``unit_scale``.
+
+    NOTE(review): the ``"unit"`` label is always ``"cm"``; presumably callers
+    pass a ``unit_scale`` that converts to centimetres — confirm.
+    """
+    lower_xy = bounds[0, :2] * unit_scale
+    upper_xy = bounds[1, :2] * unit_scale
+    extent_xy = upper_xy - lower_xy
+    midpoint_xy = 0.5 * (lower_xy + upper_xy)
+    footprint_area = float(extent_xy[0] * extent_xy[1])
+    return {
+        "unit": "cm",
+        "min_xy": lower_xy.tolist(),
+        "max_xy": upper_xy.tolist(),
+        "center_xy": midpoint_xy.tolist(),
+        "size_xy": extent_xy.tolist(),
+        "area": footprint_area,
+    }
+
+
+def _table_fit_uniform_xy_scale_transform(
+    *,
+    center_xy: np.ndarray,
+    scale: float,
+) -> np.ndarray:
+    """Return the 4x4 transform scaling X and Y by ``scale`` about ``center_xy``.
+
+    Z is left untouched. The result is composed as translate-to-origin, scale,
+    translate-back, so ``center_xy`` itself is a fixed point.
+    """
+    cx, cy = float(center_xy[0]), float(center_xy[1])
+    move_back = np.eye(4, dtype=np.float64)
+    move_back[:3, 3] = [cx, cy, 0.0]
+    move_to_origin = np.eye(4, dtype=np.float64)
+    move_to_origin[:3, 3] = [-cx, -cy, 0.0]
+    scale_xy = np.eye(4, dtype=np.float64)
+    scale_xy[0, 0] = scale_xy[1, 1] = float(scale)
+    return move_back @ scale_xy @ move_to_origin
+
+
+def _table_fit_safe_positive_ratio(numerator: float, denominator: float) -> float:
+    """Return ``numerator / denominator`` with both denominator and result floored at 1e-6."""
+    floor = 1.0e-6
+    top = float(numerator)
+    bottom = max(float(denominator), floor)
+    return max(top / bottom, floor)
+
+
+def _table_fit_convex_hull_2d(points: np.ndarray) -> np.ndarray:
+    """Return the 2D convex hull of ``points`` via the monotone-chain method.
+
+    Only the first two columns are used and duplicates are removed. The hull
+    starts at the lexicographically smallest point; collinear points are
+    dropped. With zero or one distinct point the distinct points are returned
+    as-is.
+    """
+    distinct = sorted({(float(px), float(py)) for px, py in np.asarray(points)[:, :2]})
+    if len(distinct) <= 1:
+        return np.asarray(distinct, dtype=np.float64)
+
+    def turn(
+        origin: tuple[float, float],
+        first: tuple[float, float],
+        second: tuple[float, float],
+    ) -> float:
+        # Z component of (first - origin) x (second - origin).
+        return (first[0] - origin[0]) * (second[1] - origin[1]) - (
+            first[1] - origin[1]
+        ) * (second[0] - origin[0])
+
+    def half_hull(ordered: list[tuple[float, float]]) -> list[tuple[float, float]]:
+        chain: list[tuple[float, float]] = []
+        for candidate in ordered:
+            # Pop while the last two points and the candidate fail to turn left.
+            while len(chain) >= 2 and turn(chain[-2], chain[-1], candidate) <= 0.0:
+                chain.pop()
+            chain.append(candidate)
+        return chain
+
+    lower = half_hull(distinct)
+    upper = half_hull(distinct[::-1])
+    # The last point of each chain is the first point of the other.
+    return np.asarray(lower[:-1] + upper[:-1], dtype=np.float64)
+
+
+def _table_fit_point_in_convex_polygon(
+    point: np.ndarray,
+    polygon: np.ndarray,
+) -> bool:
+    """Return whether ``point`` lies inside or on a convex ``polygon``.
+
+    The point is inside when it is on the same side of every edge. Edges whose
+    cross product with the point is below 1e-9 in magnitude are treated as
+    "on the edge" and skipped. The side of the first non-skipped edge is the
+    reference that all later edges are compared against.
+
+    Args:
+        point: 2D point.
+        polygon: ``(N, 2)`` vertices in consistent winding order.
+
+    Returns:
+        ``True`` if no edge puts the point on the opposite side of the
+        reference edge, which includes the case where every edge is skipped.
+    """
+    reference = 0.0
+    vertex_count = len(polygon)
+    for index in range(vertex_count):
+        start = polygon[index]
+        end = polygon[(index + 1) % vertex_count]
+        # Scalar 2D cross product written out by hand: ``np.cross`` on
+        # 2-element vectors is deprecated as of NumPy 2.0.
+        side = float(
+            (end[0] - start[0]) * (point[1] - start[1])
+            - (end[1] - start[1]) * (point[0] - start[0])
+        )
+        if abs(side) < 1.0e-9:
+            continue
+        if reference == 0.0:
+            reference = side
+        elif side * reference < -1.0e-9:
+            return False
+    return True
+
+
+def _table_fit_rect_corners(
+    *,
+    center: np.ndarray,
+    width: float,
+    depth: float,
+) -> np.ndarray:
+    """Return the four corners of an axis-aligned rectangle as a ``(4, 2)`` array.
+
+    ``width`` spans the first coordinate and ``depth`` the second. Corners are
+    ordered (min, min), (max, min), (max, max), (min, max).
+    """
+    half_width = 0.5 * float(width)
+    half_depth = 0.5 * float(depth)
+    left, right = center[0] - half_width, center[0] + half_width
+    bottom, top = center[1] - half_depth, center[1] + half_depth
+    corners = [[left, bottom], [right, bottom], [right, top], [left, top]]
+    return np.asarray(corners, dtype=np.float64)
+
+
+def _table_fit_rot2(angle: float) -> np.ndarray:
+    """Return the 2x2 counter-clockwise rotation matrix for ``angle`` radians."""
+    cos_a, sin_a = float(np.cos(angle)), float(np.sin(angle))
+    top_row = [cos_a, -sin_a]
+    bottom_row = [sin_a, cos_a]
+    return np.asarray([top_row, bottom_row], dtype=np.float64)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/schemas.py
new file mode 100644
index 00000000..f001720f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/schemas.py
@@ -0,0 +1,201 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Public names of this module: every request/result dataclass defined below
+# plus ``SupportPlaneCandidate``.
+__all__ = [
+    "AlignToAxisRequest",
+    "AlignToAxisResult",
+    "AlignXYLongAxisRequest",
+    "AlignXYLongAxisResult",
+    "CenterMeshRequest",
+    "NormalizeRequest",
+    "NormalizeResult",
+    "CenterMeshResult",
+    "ConvertUpAxisRequest",
+    "ConvertUpAxisResult",
+    "DetectTabletopRequest",
+    "DetectTabletopResult",
+    "ExportMeshRequest",
+    "ExportMeshResult",
+    "LoadMeshRequest",
+    "LoadMeshResult",
+    "PlaceAbovePlaneRequest",
+    "PlaceAbovePlaneResult",
+    "SupportPlaneCandidate",
+]
+
+
+@dataclass(frozen=True)
+class SupportPlaneCandidate:
+    """Candidate planar tabletop support surface.
+
+    Immutable record (``frozen=True``). Field meanings marked "presumably"
+    are inferred from the names only; confirm against the detector that
+    produces these objects.
+    """
+
+    # Plane normal and a point on the plane, stored as plain float lists
+    # (presumably xyz).
+    normal: list[float]
+    center: list[float]
+    # Area of the candidate surface.
+    area: float
+    # Indices into the mesh face array of the faces that make up this plane.
+    face_indices: list[int]
+    # Presumably vertex counts and area-weighted scores on either side of the
+    # plane, plus the overall ranking score — TODO confirm.
+    below_vertex_count: int
+    above_vertex_count: int
+    below_area_score: float
+    above_area_score: float
+    score: float
+
+
+@dataclass(frozen=True)
+class LoadMeshRequest:
+    """Request to load a GLB/mesh file (immutable)."""
+
+    # Filesystem location of the file to load.
+    mesh_path: Path
+
+
+@dataclass(frozen=True)
+class LoadMeshResult:
+    """Result of loading a mesh file (immutable)."""
+
+    # The loaded mesh object; typed ``Any``, presumably a trimesh mesh — confirm.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class ExportMeshRequest:
+    """Request to export a mesh to a file (immutable)."""
+
+    # Mesh object to write out.
+    mesh: Any
+    # Destination file path.
+    output_path: Path
+
+
+@dataclass(frozen=True)
+class ExportMeshResult:
+    """Result of exporting a mesh (immutable)."""
+
+    # Path reported for the exported file.
+    output_path: Path
+
+
+@dataclass(frozen=True)
+class ConvertUpAxisRequest:
+    """Request to convert a mesh from one up-axis convention to another."""
+
+    # Mesh to convert.
+    mesh: Any
+    # Up-axis vectors before and after conversion. Both default to ``None``;
+    # presumably the manager substitutes its own defaults then — TODO confirm.
+    input_up_axis: list[float] | None = None
+    output_up_axis: list[float] | None = None
+
+
+@dataclass(frozen=True)
+class ConvertUpAxisResult:
+    """Result of converting a mesh up-axis (immutable)."""
+
+    # The converted mesh.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class CenterMeshRequest:
+    """Request to center a mesh by its bounding-box center (immutable)."""
+
+    # Mesh to center.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class CenterMeshResult:
+    """Result of centering a mesh (immutable)."""
+
+    # The centered mesh.
+    mesh: Any
+    # Bounding-box center as a float list; presumably the center measured
+    # before the mesh was moved — TODO confirm.
+    bbox_center: list[float]
+
+
+@dataclass(frozen=True)
+class AlignToAxisRequest:
+    """Request to rotate a mesh so a source axis aligns to a target axis."""
+
+    # Mesh to rotate.
+    mesh: Any
+    # Direction to rotate from, and the direction it should end up pointing in.
+    source_axis: list[float]
+    target_axis: list[float]
+
+
+@dataclass(frozen=True)
+class AlignToAxisResult:
+    """Result of aligning a mesh vector to an axis (immutable)."""
+
+    # The rotated mesh.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class PlaceAbovePlaneRequest:
+    """Request to translate a mesh so its AABB bottom sits above the XY plane."""
+
+    # Mesh to translate.
+    mesh: Any
+    # Defaults to 0.01; presumably the gap left between the AABB bottom and
+    # the plane, in mesh units — TODO confirm.
+    clearance: float = 0.01
+
+
+@dataclass(frozen=True)
+class PlaceAbovePlaneResult:
+    """Result of placing a mesh above the XY plane (immutable)."""
+
+    # The translated mesh.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class DetectTabletopRequest:
+    """Request to detect the most likely tabletop plane in a mesh.
+
+    Only ``mesh`` is required; the tuning fields below have defaults. Their
+    exact effect is defined by the detector, which is not part of this module.
+    """
+
+    # Mesh to analyse.
+    mesh: Any
+    # Defaults to 8.0; presumably the angular tolerance, in degrees, used when
+    # grouping face normals — TODO confirm.
+    normal_angle_tol_deg: float = 8.0
+    # Defaults to ``None``; presumably lets the detector derive a distance
+    # tolerance itself — TODO confirm.
+    plane_distance_tol: float | None = None
+    # Defaults to 0.02; presumably the minimum fraction of mesh area a
+    # candidate must cover — TODO confirm.
+    min_area_ratio: float = 0.02
+    # Defaults to 24; presumably caps the number of candidates considered.
+    max_candidates: int = 24
+
+
+@dataclass(frozen=True)
+class DetectTabletopResult:
+    """Result of detecting the tabletop plane with oriented normal."""
+
+    # The candidate chosen as the tabletop.
+    selected: SupportPlaneCandidate
+    # Normal reported for the chosen plane, as a float list; presumably
+    # oriented to point away from the supporting body — TODO confirm.
+    oriented_normal: list[float]
+    # Candidate planes reported alongside the selection.
+    candidates: list[SupportPlaneCandidate]
+
+
+@dataclass(frozen=True)
+class AlignXYLongAxisRequest:
+    """Request to align a mesh XY long axis to the Y axis via PCA."""
+
+    # Mesh to rotate.
+    mesh: Any
+    # Defaults to ``None``; presumably restricts the PCA to these faces and
+    # uses the whole mesh when omitted — TODO confirm.
+    face_indices: list[int] | None = None
+
+
+@dataclass(frozen=True)
+class AlignXYLongAxisResult:
+    """Result of PCA yaw alignment (immutable)."""
+
+    # The rotated mesh.
+    mesh: Any
+    # Yaw angle in degrees; presumably the rotation that was applied.
+    yaw_angle_degrees: float
+
+
+@dataclass(frozen=True)
+class NormalizeRequest:
+    """Request to normalize a mesh to a target size (immutable)."""
+
+    # Mesh to rescale.
+    mesh: Any
+    # Defaults to 1.0; which extent is matched to this size is decided by the
+    # manager — TODO confirm.
+    target_size: float = 1.0
+
+
+@dataclass(frozen=True)
+class NormalizeResult:
+    """Result of normalizing a mesh (immutable)."""
+
+    # The rescaled mesh.
+    mesh: Any
+    # Scale factor reported by the normalization; presumably the multiplier
+    # that was applied to the mesh.
+    scale_factor: float
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/__init__.py
new file mode 100644
index 00000000..c7a200a5
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/__init__.py
@@ -0,0 +1,35 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager.manager import (
+    ASSET_IMAGE_PROMPT_SUFFIX,
+    ImageGenerationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager.schemas import (
+    ImageGenerationRequest,
+    ImageGenerationResult,
+    TextToAssetImageRequest,
+)
+
+# Package-level re-exports: the manager and its prompt suffix from ``manager``
+# and the request/result dataclasses from ``schemas``.
+__all__ = [
+    "ASSET_IMAGE_PROMPT_SUFFIX",
+    "ImageGenerationManager",
+    "ImageGenerationRequest",
+    "ImageGenerationResult",
+    "TextToAssetImageRequest",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/manager.py
new file mode 100644
index 00000000..6406f74d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/manager.py
@@ -0,0 +1,76 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client import (
+    ImageGenerationClient,
+    ImageGenerationError,
+    ImageGenerationServerRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager.schemas import (
+    ImageGenerationRequest,
+    ImageGenerationResult,
+    TextToAssetImageRequest,
+)
+
+# Text appended to asset prompts by ``_build_asset_image_prompt``. Note that
+# the string ends with a trailing space after the final period.
+ASSET_IMAGE_PROMPT_SUFFIX = (
+    "single isolated object, centered, fully visible, "
+    "on a high contrast colored background. "
+)
+
+
+class ImageGenerationManager:
+    """Image generation domain operations.
+
+    Wraps an ``ImageGenerationClient``; a default client is constructed when
+    none is supplied.
+    """
+
+    def __init__(self, *, client: ImageGenerationClient | None = None) -> None:
+        self.client = client or ImageGenerationClient()
+
+    def generate_image(self, request: ImageGenerationRequest) -> ImageGenerationResult:
+        """Generate one image for ``request.prompt`` at ``request.output_path``.
+
+        The output path is user-expanded and resolved before it is sent to the
+        client, and the returned image path is normalised the same way.
+
+        Raises:
+            RuntimeError: If the client returns an ``ImageGenerationError``;
+                its ``error_message`` becomes the exception text.
+        """
+        resolved_output = request.output_path.expanduser().resolve()
+        server_request = ImageGenerationServerRequest(
+            prompt=request.prompt,
+            output_path=resolved_output,
+        )
+        outcome = self.client.generate(server_request)
+        if isinstance(outcome, ImageGenerationError):
+            raise RuntimeError(outcome.error_message)
+
+        generated_path = Path(outcome.result.image_path).expanduser().resolve()
+        return ImageGenerationResult(image_path=generated_path)
+
+    def generate_asset_image_from_text(
+        self,
+        request: TextToAssetImageRequest,
+    ) -> Path:
+        """Generate an asset image from text and return the image path.
+
+        The prompt is first passed through ``_build_asset_image_prompt``.
+        """
+        asset_prompt = _build_asset_image_prompt(request.prompt)
+        image_request = ImageGenerationRequest(
+            prompt=asset_prompt,
+            output_path=request.output_path,
+        )
+        return self.generate_image(image_request).image_path
+
+
+def _build_asset_image_prompt(prompt: str) -> str:
+    """Return ``prompt`` with ``ASSET_IMAGE_PROMPT_SUFFIX`` appended at most once.
+
+    The prompt is stripped of surrounding whitespace first. The suffix ends
+    with a space, so the "already present" check compares against the stripped
+    suffix; otherwise a prompt previously produced by this function would lose
+    its trailing space to ``strip()``, fail the check, and get the suffix
+    appended a second time.
+
+    Raises:
+        ValueError: If the prompt is empty after stripping.
+    """
+    prompt = prompt.strip()
+    if not prompt:
+        raise ValueError("Text-to-asset image prompt must be non-empty.")
+    if ASSET_IMAGE_PROMPT_SUFFIX.strip() in prompt:
+        return prompt
+    return f"{prompt}, {ASSET_IMAGE_PROMPT_SUFFIX}"
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/schemas.py
new file mode 100644
index 00000000..ac4a9cd7
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/schemas.py
@@ -0,0 +1,43 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class TextToAssetImageRequest:
+    """Request for generating an asset image from a text prompt (immutable)."""
+
+    # Free-text description of the asset.
+    prompt: str
+    # Where the generated image should be written.
+    output_path: Path
+
+
+@dataclass(frozen=True)
+class ImageGenerationRequest:
+    """Request for generating one image from text (immutable)."""
+
+    # Prompt text passed to the image generation backend.
+    prompt: str
+    # Where the generated image should be written.
+    output_path: Path
+
+
+@dataclass(frozen=True)
+class ImageGenerationResult:
+    """Generated image path (immutable)."""
+
+    # Location of the generated image file.
+    image_path: Path
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/__init__.py
new file mode 100644
index 00000000..2ad8f11a
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/__init__.py
@@ -0,0 +1,29 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_scene_manager.alignment import (
+    _export_support_aligned_layout_glbs,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_scene_manager.manifests import (
+    _write_multi_object_layout_manifests,
+)
+
+# Both entries are underscore-prefixed helpers imported above from the
+# ``alignment`` and ``manifests`` submodules; listing them here makes them
+# star-importable from the package.
+__all__ = [
+    "_export_support_aligned_layout_glbs",
+    "_write_multi_object_layout_manifests",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/alignment.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/alignment.py
new file mode 100644
index 00000000..6d7084f4
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/alignment.py
@@ -0,0 +1,537 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+import traceback
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.workflows.llm_output import (
+    call_structured_json_model_step,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_scene_manager.prompts import (
+    build_up_down_flip_check_messages,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.metric_scale_manager import (
+    GlobalMetricScaleRequest,
+    MetricScaleManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_scene_manager.schemas import (
+    UP_DOWN_FLIP_CHECK_JSON_SCHEMA,
+)
+
+# Minimum model confidence at which a "flipped" selection from the up-down
+# flip check is actually applied.
+UP_DOWN_FLIP_CHECK_CONFIDENCE_THRESHOLD = 0.6
+# Step name passed to ``call_structured_json_model_step`` for the flip check.
+UNIFIED_SCENE_STEP = "unified_scene"
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.blender_rendering_manager import (
+    BlenderRenderingManager,
+    RenderObjectScenesRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager import (
+    MatplotlibManager,
+    RenderImageComparisonRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.scene_geometry import (
+    _aabb_center,
+    _copy_scene_with_transform,
+    _estimate_support_normal,
+    _load_scene_with_transform,
+    _matrix_from_json,
+    _rotation_between_vectors,
+    _scale_transform,
+    _scene_to_mesh,
+    _support_normal_flip_transform,
+    _xy_aabb_center,
+    _z_up_to_glb_y_up_transform,
+    _z_yaw_transform,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import (
+    relative_path,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.optimization_manager import (
+    _object_scenes_xy_aabb_manifest,
+    _settle_and_pack_object_footprints,
+)
+
+__all__ = ["_export_support_aligned_layout_glbs"]
+
+
+def _export_support_aligned_layout_glbs(
+    *,
+    table: dict[str, Any],
+    objects: list[dict[str, Any]],
+    spatial_relations: list[dict[str, Any]],
+    original_image_path: Path | None,
+    llm: Any | None,
+    output_dir: Path,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Export layout-baked GLBs aligned by support normal and left-right order.
+
+    The support-reference GLB and every object GLB are loaded with their
+    transform matrices. The objects are then rotated by the alignment derived
+    from the estimated support normal and the +Z axis, centred on the combined
+    object AABB, turned into "normal"/"flipped" up-down candidates, scaled by
+    the global metric scale factor, passed through footprint settling/packing
+    and finally exported as ``<object_id>_aligned.glb`` files in
+    ``output_dir``.
+
+    Args:
+        table: Table record; read for ``support_reference_geometry_path`` /
+            ``raw_geometry_path`` and ``support_reference_transform_matrix`` /
+            ``transform_matrix``.
+        objects: Object records; only those with both ``raw_geometry_path``
+            and ``transform_matrix`` set are exported.
+        spatial_relations: Relations forwarded to the yaw search of the
+            up-down candidates.
+        original_image_path: Original image forwarded to the up-down flip
+            check.
+        llm: Model handle forwarded to the up-down flip check.
+        output_dir: Directory (created if missing) that receives the aligned
+            GLBs and the intermediate artifacts.
+        output_root: Root used to resolve relative input paths and passed to
+            ``relative_path`` for the paths reported in the result.
+
+    Returns:
+        A manifest dict with ``status`` "ok", the alignment matrices, the
+        global metric scale, the footprint layout, yaw sampling metadata, the
+        flip check result and one ``{"id", "aligned_geometry_path"}`` entry
+        per exported object.
+
+    Raises:
+        RuntimeError: If ``trimesh`` cannot be imported.
+        FileNotFoundError: If the support reference GLB is not a file.
+        ValueError: If no object has both a raw GLB path and a transform.
+    """
+    try:
+        import trimesh
+    except ImportError as exc:
+        raise RuntimeError("Support-aligned GLB export requires trimesh.") from exc
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Prefer the dedicated support-reference geometry; fall back to the raw one.
+    support_reference_path = _resolve_generated_path(
+        table.get("support_reference_geometry_path") or table.get("raw_geometry_path"),
+        output_root,
+    )
+    # (id, resolved GLB path, JSON transform) for every object that has both
+    # a raw geometry path and a transform matrix; the others are dropped.
+    object_paths = [
+        (
+            str(item["id"]),
+            _resolve_generated_path(item.get("raw_geometry_path"), output_root),
+            item.get("transform_matrix"),
+        )
+        for item in objects
+        if item.get("raw_geometry_path") and item.get("transform_matrix")
+    ]
+    if not support_reference_path.is_file():
+        raise FileNotFoundError(
+            f"Support reference table GLB not found: {support_reference_path}"
+        )
+    support_reference_transform = _matrix_from_json(
+        table.get("support_reference_transform_matrix")
+        or table.get("transform_matrix"),
+        name="table.support_reference_transform_matrix",
+    )
+    if not object_paths:
+        raise ValueError("No raw object GLBs with transform matrices available.")
+
+    support_reference_scene = trimesh.load(support_reference_path, force="scene")
+    support_reference_scene.apply_transform(support_reference_transform)
+    object_scenes = [
+        (
+            object_id,
+            _load_scene_with_transform(
+                path=path,
+                transform=_matrix_from_json(
+                    transform,
+                    name=f"{object_id}.transform_matrix",
+                ),
+                trimesh=trimesh,
+            ),
+        )
+        for object_id, path, transform in object_paths
+    ]
+    # The support normal is estimated from the table mesh only; the resulting
+    # alignment towards +Z is then applied to the object scenes.
+    table_mesh = _scene_to_mesh(support_reference_scene, trimesh=trimesh)
+    support_normal = _estimate_support_normal(table_mesh)
+    normal_alignment = _rotation_between_vectors(
+        support_normal,
+        np.array([0.0, 0.0, 1.0]),
+    )
+
+    for _, scene in object_scenes:
+        scene.apply_transform(normal_alignment)
+
+    # Union of all per-object bounds: row 0 is the minimum corner, row 1 the
+    # maximum corner.
+    object_bounds = [
+        _scene_to_mesh(scene, trimesh=trimesh).bounds for _, scene in object_scenes
+    ]
+    clutter_bounds = np.vstack(
+        [
+            np.vstack([bounds[0] for bounds in object_bounds]).min(axis=0),
+            np.vstack([bounds[1] for bounds in object_bounds]).max(axis=0),
+        ]
+    )
+    clutter_center = 0.5 * (clutter_bounds[0] + clutter_bounds[1])
+    # Translation that moves the combined bounds center to the origin.
+    center_transform = np.eye(4, dtype=np.float64)
+    center_transform[:3, 3] = [
+        -float(clutter_center[0]),
+        -float(clutter_center[1]),
+        -float(clutter_center[2]),
+    ]
+
+    for _, scene in object_scenes:
+        scene.apply_transform(center_transform)
+
+    # Build both up-down candidates and let the flip check choose between them.
+    alignment_candidates = _build_up_down_alignment_candidates(
+        object_scenes=object_scenes,
+        support_normal=support_normal,
+        normal_alignment=normal_alignment,
+        spatial_relations=spatial_relations,
+        trimesh=trimesh,
+    )
+    vlm_check_dir = output_dir / "vlm_up_down_flip_check"
+    up_down_flip_check_result = _run_aligned_up_down_flip_vlm_check(
+        llm=llm,
+        original_image_path=original_image_path,
+        normal_object_scenes=alignment_candidates["normal"]["object_scenes"],
+        flipped_object_scenes=alignment_candidates["flipped"]["object_scenes"],
+        output_dir=vlm_check_dir,
+    )
+    # A missing, empty or unknown variant in the check result falls back to
+    # "normal".
+    selected_variant = str(
+        up_down_flip_check_result.get("selected_variant") or "normal"
+    )
+    if selected_variant not in alignment_candidates:
+        selected_variant = "normal"
+    selected_candidate = alignment_candidates[selected_variant]
+    object_scenes = selected_candidate["object_scenes"]
+    selected_extra_transform = selected_candidate["extra_transform"]
+    apply_up_down_flip = selected_variant == "flipped"
+
+    global_metric_scale = MetricScaleManager.compute_global_from_object_scenes(
+        GlobalMetricScaleRequest(
+            objects=objects,
+            object_scenes=object_scenes,
+        )
+    )
+    metric_scale_transform = _scale_transform(global_metric_scale["scale_factor"])
+    # Skip the scene update when the scale factor is exactly 1.
+    if float(global_metric_scale["scale_factor"]) != 1.0:
+        for _, scene in object_scenes:
+            scene.apply_transform(metric_scale_transform)
+
+    footprint_result = _settle_and_pack_object_footprints(
+        object_scenes=object_scenes,
+        output_dir=output_dir / "footprint_layout",
+        output_root=output_root,
+        trimesh=trimesh,
+    )
+    object_scenes = footprint_result["object_scenes"]
+
+    # The axis conversion is applied to a copy at export time, so the scenes
+    # kept in ``object_scenes`` stay in the internal frame.
+    output_axis_transform = _z_up_to_glb_y_up_transform()
+    object_outputs = []
+    for object_id, scene in object_scenes:
+        object_output = output_dir / f"{object_id}_aligned.glb"
+        _copy_scene_with_transform(scene, output_axis_transform).export(object_output)
+        object_outputs.append(
+            {
+                "id": object_id,
+                "aligned_geometry_path": relative_path(str(object_output), output_root),
+            }
+        )
+
+    # Composite of the transforms applied above; the right-most factor is the
+    # one that was applied first (normal alignment, centering, selected
+    # flip/yaw), with the metric scale applied last.
+    alignment_matrix = selected_extra_transform @ center_transform @ normal_alignment
+    scaled_alignment_matrix = metric_scale_transform @ alignment_matrix
+    final_clutter_aabb_2d_cm = _object_scenes_xy_aabb_manifest(
+        object_scenes=object_scenes,
+        trimesh=trimesh,
+        unit_scale=100.0,
+        unit="cm",
+    )
+    return {
+        "status": "ok",
+        "output_dir": relative_path(str(output_dir), output_root),
+        "support_normal": support_normal.tolist(),
+        "clutter_aabb_center_before_centering": clutter_center.tolist(),
+        "alignment_matrix": scaled_alignment_matrix.tolist(),
+        "pre_metric_scale_alignment_matrix": alignment_matrix.tolist(),
+        "global_metric_scale": global_metric_scale,
+        "final_clutter_2d_aabb_cm": final_clutter_aabb_2d_cm,
+        "internal_up_axis": [0.0, 0.0, 1.0],
+        "glb_output_up_axis": [0.0, 1.0, 0.0],
+        "glb_output_axis_transform": output_axis_transform.tolist(),
+        "selected_up_down_variant": selected_variant,
+        "applied_up_down_flip": apply_up_down_flip,
+        "selected_extra_transform": selected_extra_transform.tolist(),
+        # Per-object matrix: the footprint layout transform composed on top of
+        # the shared scaled alignment matrix.
+        "object_alignment_matrices": {
+            object_id: (object_transform @ scaled_alignment_matrix).tolist()
+            for object_id, object_transform in footprint_result[
+                "object_layout_transforms"
+            ].items()
+        },
+        "footprint_layout": footprint_result["manifest"],
+        "yaw_sampling": {
+            "sample_count_per_variant": 360,
+            "score_type": "center_left_of_hard_count",
+            "top_view_plane": "XY",
+            "yaw_axis": "Z",
+            "left_right_axis": "X",
+            "front_back_axis": "Y",
+            "front_direction": "+Y",
+            "normal": alignment_candidates["normal"]["yaw_metadata"],
+            "flipped": alignment_candidates["flipped"]["yaw_metadata"],
+        },
+        "up_down_flip_check": up_down_flip_check_result,
+        "objects": object_outputs,
+    }
+
+
+def _build_up_down_alignment_candidates(
+    *,
+    object_scenes: list[tuple[str, Any]],
+    support_normal: np.ndarray,
+    normal_alignment: np.ndarray,
+    spatial_relations: list[dict[str, Any]],
+    trimesh: Any,
+) -> dict[str, dict[str, Any]]:
+    """Build the "normal" and "flipped" up-down candidates for the objects.
+
+    Each candidate copies every input scene with a pre-yaw transform (identity
+    for "normal", the support-normal flip for "flipped"), searches the best
+    yaw for the ``left_of`` relations on the copied scenes' bounds and applies
+    that yaw to the copies. The input scenes themselves are not transformed
+    here.
+
+    Returns:
+        A dict keyed by "normal" and "flipped"; each value holds
+        ``object_scenes``, ``pre_yaw_transform``, ``yaw_transform``,
+        ``extra_transform`` (yaw composed after the pre-yaw transform) and
+        ``yaw_metadata``.
+    """
+    flipped_pre_yaw = _support_normal_flip_transform(
+        support_normal=support_normal,
+        normal_alignment=normal_alignment,
+    )
+    left_of_relations = _spatial_directional_relations(spatial_relations)
+
+    def _candidate(pre_yaw: np.ndarray) -> dict[str, Any]:
+        # Work on copies so both variants start from the same input scenes.
+        copies = [
+            (scene_id, _copy_scene_with_transform(source, pre_yaw))
+            for scene_id, source in object_scenes
+        ]
+        bounds_by_id = {}
+        for scene_id, copied in copies:
+            mesh = _scene_to_mesh(copied, trimesh=trimesh)
+            bounds_by_id[scene_id] = np.asarray(mesh.bounds, dtype=np.float64)
+        yaw_info = _best_spatial_yaw(
+            object_bounds=bounds_by_id,
+            relations=left_of_relations,
+        )
+        yaw = _z_yaw_transform(float(yaw_info["yaw_degrees"]))
+        for _, copied in copies:
+            copied.apply_transform(yaw)
+        return {
+            "object_scenes": copies,
+            "pre_yaw_transform": pre_yaw,
+            "yaw_transform": yaw,
+            "extra_transform": yaw @ pre_yaw,
+            "yaw_metadata": yaw_info,
+        }
+
+    identity = np.eye(4, dtype=np.float64)
+    return {
+        "normal": _candidate(identity),
+        "flipped": _candidate(flipped_pre_yaw),
+    }
+
+
+def _best_spatial_yaw(
+    *,
+    object_bounds: dict[str, np.ndarray],
+    relations: list[dict[str, str]],
+) -> dict[str, Any]:
+    """Pick the whole-degree Z yaw that best satisfies the ``left_of`` relations.
+
+    Every integer yaw in ``range(360)`` is scored with
+    ``_center_left_of_score`` on the rotated AABB centers. The yaw with the
+    highest score wins; ties go to the larger raw gap sum, and remaining ties
+    keep the smallest yaw because a later candidate must be strictly better.
+
+    Args:
+        object_bounds: Bounds per object id, reduced to a center with
+            ``_aabb_center``.
+        relations: ``left_of`` relations with ``subject`` and ``object`` ids.
+
+    Returns:
+        A dict with ``yaw_degrees``, ``score``, ``raw_gap_sum``,
+        ``relation_count``, ``score_type`` and ``relation_scores``. Without
+        relations the yaw is 0 and ``relation_scores`` is an empty list, so
+        both branches expose the same keys.
+    """
+    if not relations:
+        return {
+            "yaw_degrees": 0,
+            "score": 0,
+            "raw_gap_sum": 0.0,
+            "relation_count": 0,
+            "score_type": "center_left_of_hard_count",
+            # Present in the searched branch too; keep the schema uniform.
+            "relation_scores": [],
+        }
+
+    object_centers = {
+        object_id: _aabb_center(bounds) for object_id, bounds in object_bounds.items()
+    }
+    best_yaw = 0
+    # Start below every reachable score so the first sampled yaw is accepted.
+    best_score = -1
+    best_raw_gap_sum = float("-inf")
+    best_relation_scores: list[dict[str, Any]] = []
+    for yaw_degrees in range(360):
+        rotation = _z_yaw_transform(float(yaw_degrees))
+        rotated_centers = {
+            object_id: _transform_point(rotation, center)
+            for object_id, center in object_centers.items()
+        }
+        score, raw_gap_sum, relation_scores = _center_left_of_score(
+            centers=rotated_centers,
+            relations=relations,
+        )
+        if score > best_score or (
+            score == best_score and raw_gap_sum > best_raw_gap_sum
+        ):
+            best_yaw = yaw_degrees
+            best_score = score
+            best_raw_gap_sum = raw_gap_sum
+            best_relation_scores = relation_scores
+    return {
+        "yaw_degrees": best_yaw,
+        "score": best_score,
+        "raw_gap_sum": best_raw_gap_sum,
+        "relation_count": len(relations),
+        "score_type": "center_left_of_hard_count",
+        "relation_scores": best_relation_scores,
+    }
+
+
+def _spatial_directional_relations(
+    spatial_relations: list[dict[str, Any]],
+) -> list[dict[str, str]]:
+    """Return the distinct ``left_of`` relations as string-valued dicts.
+
+    Entries are dropped when the subject or object is empty, when both are the
+    same id, or when the relation is anything other than ``left_of``. The first
+    occurrence of each (subject, relation, object) triple is kept, in input
+    order.
+    """
+    unique: dict[tuple[str, str, str], dict[str, str]] = {}
+    for entry in spatial_relations:
+        subject = str(entry.get("subject") or "")
+        target = str(entry.get("object") or "")
+        name = str(entry.get("relation") or "")
+        usable = (
+            bool(subject)
+            and bool(target)
+            and subject != target
+            and name == "left_of"
+        )
+        if not usable:
+            continue
+        # ``setdefault`` keeps the first occurrence and its insertion position.
+        unique.setdefault(
+            (subject, name, target),
+            {
+                "subject": subject,
+                "relation": name,
+                "object": target,
+            },
+        )
+    return list(unique.values())
+
+
+def _center_left_of_score(
+    centers: dict[str, np.ndarray],
+    relations: list[dict[str, str]],
+) -> tuple[int, float, list[dict[str, Any]]]:
+    """Score how many ``left_of`` relations hold for the given centers.
+
+    For each relation whose subject and object both have a center, the gap is
+    the object's first coordinate minus the subject's; the relation counts as
+    satisfied when that gap is strictly positive. Relations that reference an
+    unknown id are ignored.
+
+    Returns:
+        ``(satisfied_count, sum_of_gaps, per_relation_details)``.
+    """
+    satisfied_total = 0
+    gap_total = 0.0
+    details: list[dict[str, Any]] = []
+    for relation in relations:
+        left_id, right_id = relation["subject"], relation["object"]
+        if left_id in centers and right_id in centers:
+            left_center = centers[left_id]
+            right_center = centers[right_id]
+            gap = float(right_center[0] - left_center[0])
+            satisfied = int(gap > 0.0)
+            satisfied_total += satisfied
+            gap_total += gap
+            details.append(
+                {
+                    "subject": left_id,
+                    "relation": "left_of",
+                    "object": right_id,
+                    "gap": gap,
+                    "score": satisfied,
+                }
+            )
+    return satisfied_total, gap_total, details
+
+
+def _transform_point(transform: np.ndarray, point: np.ndarray) -> np.ndarray:
+    """Apply ``transform`` to ``point`` in homogeneous form and return XYZ.
+
+    The point is extended to a float64 4-vector with a trailing 1 before the
+    matrix product; the first three components of the product are returned.
+    """
+    augmented = np.empty(4, dtype=np.float64)
+    augmented[:3] = point
+    augmented[3] = 1.0
+    transformed = transform @ augmented
+    return transformed[:3]
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Resolve ``value`` to a path, anchoring relative ones at ``output_root``.
+
+    ``value`` is stringified (falsy values become ``""``) and ``~`` is expanded
+    before the absolute/relative decision is made.
+    """
+    candidate = Path(str(value or "")).expanduser()
+    anchored = candidate if candidate.is_absolute() else output_root / candidate
+    return anchored.resolve()
+
+
+def _run_aligned_up_down_flip_vlm_check(
+    *,
+    llm: Any | None,
+    original_image_path: Path | None,
+    normal_object_scenes: list[tuple[str, Any]],
+    flipped_object_scenes: list[tuple[str, Any]],
+    output_dir: Path,
+) -> dict[str, Any]:
+    """Ask the model which up-down candidate matches the original image.
+
+    Both candidates are rendered and combined into one comparison image inside
+    ``output_dir``. When an LLM and the original image are available, the
+    model returns a structured verdict; the flip is only applied when it picks
+    candidate 2 with a confidence of at least
+    ``UP_DOWN_FLIP_CHECK_CONFIDENCE_THRESHOLD``.
+
+    Args:
+        llm: Model handle; ``None`` skips the check after rendering.
+        original_image_path: Reference image; a missing path or file skips the
+            check after rendering.
+        normal_object_scenes: Scenes of the "normal" candidate (number 1).
+        flipped_object_scenes: Scenes of the "flipped" candidate (number 2).
+        output_dir: Directory (created if missing) for renders and the raw
+            model output.
+
+    Returns:
+        A dict whose ``status`` is "skipped", "ok" or "failed". "ok" results
+        carry the selected and model-selected variant/number, the confidence
+        and the model's reason. Exceptions raised while rendering or querying
+        the model are caught and reported as "failed" with the traceback in
+        ``reason``.
+    """
+    import json
+    import logging
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    result: dict[str, Any] = {
+        "status": "skipped",
+        "applied_up_down_flip": False,
+        "confidence_threshold": UP_DOWN_FLIP_CHECK_CONFIDENCE_THRESHOLD,
+        "reason": "",
+    }
+    if not normal_object_scenes or not flipped_object_scenes:
+        result["reason"] = "missing_object_scenes"
+        return result
+
+    try:
+        normal_render_path = output_dir / "normal_object_only_front_oblique_view.png"
+        flipped_render_path = output_dir / "flipped_object_only_front_oblique_view.png"
+        comparison_image_path = output_dir / "numbered_up_down_candidates.png"
+        BlenderRenderingManager().render_object_scenes(
+            RenderObjectScenesRequest(
+                object_scenes=normal_object_scenes,
+                output_path=normal_render_path,
+            )
+        )
+        BlenderRenderingManager().render_object_scenes(
+            RenderObjectScenesRequest(
+                object_scenes=flipped_object_scenes,
+                output_path=flipped_render_path,
+            )
+        )
+        MatplotlibManager(figsize=(12, 6), dpi=180).render_image_comparison(
+            RenderImageComparisonRequest(
+                first_image_path=normal_render_path,
+                second_image_path=flipped_render_path,
+                output_path=comparison_image_path,
+            )
+        )
+        # The renders above are produced even when the model cannot be asked.
+        if llm is None:
+            result["reason"] = "missing_llm"
+            return result
+        if original_image_path is None or not original_image_path.is_file():
+            result["reason"] = "missing_original_image"
+            return result
+
+        raw_model_output = call_structured_json_model_step(
+            llm=llm,
+            schema=UP_DOWN_FLIP_CHECK_JSON_SCHEMA,
+            messages=build_up_down_flip_check_messages(
+                original_image_path=original_image_path,
+                comparison_image_path=comparison_image_path,
+            ),
+            context="Unified scene aligned up-down flip check",
+            step_name=UNIFIED_SCENE_STEP,
+            output_root=None,
+            attempt_count=0,
+        )
+        # Persist the raw model output alongside the comparison renders. This
+        # is best-effort: a failure must not discard the verdict, but it is
+        # logged instead of being silently swallowed.
+        vlm_result_path = output_dir / "vlm_flip_check_result.json"
+        try:
+            vlm_result_path.write_text(
+                json.dumps(raw_model_output, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+        except Exception:
+            logging.getLogger(__name__).warning(
+                "Could not persist up-down flip check output to %s",
+                vlm_result_path,
+                exc_info=True,
+            )
+        confidence = float(raw_model_output.get("confidence", 0.0))
+        selected_number = int(raw_model_output.get("selected_number", 1))
+        if selected_number not in {1, 2}:
+            selected_number = 1
+        model_selected_variant = "flipped" if selected_number == 2 else "normal"
+        # A "flipped" pick below the confidence threshold is not applied.
+        should_apply = (
+            model_selected_variant == "flipped"
+            and confidence >= UP_DOWN_FLIP_CHECK_CONFIDENCE_THRESHOLD
+        )
+        selected_variant = "flipped" if should_apply else "normal"
+        selected_number = 2 if selected_variant == "flipped" else 1
+        result.update(
+            {
+                "status": "ok",
+                "selected_number": selected_number,
+                "selected_variant": selected_variant,
+                "applied_up_down_flip": should_apply,
+                "model_selected_number": raw_model_output.get("selected_number"),
+                "model_selected_variant": model_selected_variant,
+                "confidence": confidence,
+                "reason": str(raw_model_output.get("reason", "")),
+            }
+        )
+        return result
+    except Exception:
+        # Report the failure in the result; the caller falls back to "normal".
+        result.update(
+            {
+                "status": "failed",
+                "reason": traceback.format_exc(),
+            }
+        )
+        return result
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/manifests.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/manifests.py
new file mode 100644
index 00000000..6ae379c3
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/manifests.py
@@ -0,0 +1,212 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.io import (
+    relative_path,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.scene_geometry import (
+    _compose_json_matrices,
+    _compose_simready_to_aligned_matrix,
+    _decompose_transform_matrix,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+
+__all__ = ["_write_multi_object_layout_manifests"]
+
+
+def _write_multi_object_layout_manifests(
+    *,
+    glb_gen_dir: Path,
+    output_root: Path,
+    table: dict[str, Any] | None,
+    objects: list[dict[str, Any]],
+    alignment: dict[str, Any] | None,
+) -> dict[str, str]:
+    """Write ``simready_to_aligned_manifest.json`` into ``glb_gen_dir``.
+
+    Returns:
+        A dict with ``simready_to_aligned_manifest_path``: the manifest path
+        as produced by ``relative_path`` for ``output_root``.
+    """
+    manifest_path = glb_gen_dir / "simready_to_aligned_manifest.json"
+    manifest = _simready_to_aligned_manifest(
+        table=table,
+        items=objects,
+        alignment=alignment,
+        output_root=output_root,
+    )
+    write_json(manifest_path, manifest)
+
+    reported_path = relative_path(str(manifest_path), output_root)
+    return {"simready_to_aligned_manifest_path": reported_path}
+
+
+def _simready_to_aligned_manifest(
+    *,
+    table: dict[str, Any] | None,
+    items: list[dict[str, Any]],
+    alignment: dict[str, Any] | None,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Assemble the simready-to-aligned manifest from an alignment result.
+
+    A missing ``alignment`` is treated as an empty dict, so every alignment
+    field falls back to its default. ``table`` is optional and yields ``None``
+    in the manifest when absent; ``items`` produce one entry each.
+    """
+    info = alignment or {}
+    shared_matrix = info.get("alignment_matrix", [])
+    axis_transform = info.get("glb_output_axis_transform", [])
+    per_object_matrices = info.get("object_alignment_matrices", {})
+    aligned_paths = _aligned_outputs_by_id(info)
+
+    table_entry = None
+    if table is not None:
+        table_entry = _simready_manifest_table_item(table, output_root=output_root)
+
+    item_entries = [
+        _simready_to_aligned_manifest_item(
+            entry,
+            aligned_by_id=aligned_paths,
+            alignment_matrix=shared_matrix,
+            object_alignment_matrices=per_object_matrices,
+            glb_output_axis_transform=axis_transform,
+            output_root=output_root,
+        )
+        for entry in items
+    ]
+
+    note = (
+        "Aligned GLBs are generated from raw_downloads plus SAM3D layout "
+        "matrices in memory; simready paths are recorded here as the "
+        "simulation-ready counterpart for each raw GLB."
+    )
+    return {
+        "note": note,
+        "alignment_status": info.get("status", ""),
+        "alignment_reason": info.get("reason", ""),
+        "selected_up_down_variant": info.get("selected_up_down_variant", ""),
+        "applied_up_down_flip": info.get("applied_up_down_flip", False),
+        "alignment_matrix": shared_matrix,
+        "global_metric_scale": info.get("global_metric_scale"),
+        "final_clutter_2d_aabb_cm": info.get("final_clutter_2d_aabb_cm"),
+        "glb_output_axis_transform": axis_transform,
+        "table": table_entry,
+        "items": item_entries,
+    }
+
+
+def _aligned_outputs_by_id(alignment: dict[str, Any]) -> dict[str, str]:
+    """Map each object id in ``alignment["objects"]`` to its aligned GLB path.
+
+    Entries that are not dicts or have a falsy ``id`` are ignored; a missing
+    ``aligned_geometry_path`` is recorded as an empty string.
+    """
+    entries = alignment.get("objects", []) or []
+    return {
+        str(entry["id"]): str(entry.get("aligned_geometry_path", ""))
+        for entry in entries
+        if isinstance(entry, dict) and entry.get("id")
+    }
+
+
+def _simready_manifest_table_item(
+    item: dict[str, Any],
+    *,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Build the manifest entry for the table record.
+
+    The two geometry paths are resolved against ``output_root`` and passed
+    through ``relative_path``; an unset path is reported as an empty string.
+    All other fields are copied from ``item`` with their defaults.
+    """
+
+    def _reported_path(key: str) -> str:
+        # Unset or empty paths are reported as "" without being resolved.
+        raw_value = item.get(key)
+        if not raw_value:
+            return ""
+        resolved = _resolve_generated_path(raw_value, output_root)
+        return relative_path(str(resolved), output_root)
+
+    return {
+        "id": item.get("id", ""),
+        "name": item.get("name", ""),
+        "kind": item.get("kind", "table"),
+        "status": item.get("status", ""),
+        "simready_geometry_path": _reported_path("simready_geometry_path"),
+        "support_reference_geometry_path": _reported_path(
+            "support_reference_geometry_path"
+        ),
+        "table_asset_source": item.get("table_asset_source", ""),
+        "support_normal_source": item.get("support_normal_source", ""),
+        "is_complete_visible_table": item.get("is_complete_visible_table", False),
+        "complete_table_description": item.get("complete_table_description", ""),
+    }
+
+
+def _simready_to_aligned_manifest_item(
+    item: dict[str, Any],
+    *,
+    aligned_by_id: dict[str, str],
+    alignment_matrix: Any,
+    object_alignment_matrices: Any,
+    glb_output_axis_transform: Any,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Build the manifest entry for one object record.
+
+    The per-object alignment matrix is used when ``object_alignment_matrices``
+    is a dict containing the item id; otherwise the shared
+    ``alignment_matrix`` applies. The composed simready-to-aligned matrix is
+    reported together with its decomposed translation, rotation and scale.
+    """
+    item_id = str(item.get("id", ""))
+    chosen_alignment = (
+        object_alignment_matrices.get(item_id, alignment_matrix)
+        if isinstance(object_alignment_matrices, dict)
+        else alignment_matrix
+    )
+    raw_to_aligned = _compose_json_matrices(
+        glb_output_axis_transform,
+        chosen_alignment,
+        item.get("transform_matrix", []),
+    )
+    simready_to_aligned = _compose_simready_to_aligned_matrix(
+        raw_to_aligned_matrix=raw_to_aligned,
+        raw_to_simready_matrix=item.get("raw_to_simready_glb_matrix", []),
+    )
+    parts = _decompose_transform_matrix(simready_to_aligned)
+
+    entry = {
+        "id": item_id,
+        "name": item.get("name", ""),
+        "kind": item.get("kind", ""),
+        "simready_geometry_path": item.get("simready_geometry_path", ""),
+        "aligned_geometry_path": aligned_by_id.get(item_id, ""),
+        "metric_scale": _trim_metric_scale(item.get("metric_scale")),
+        "simready_to_aligned_matrix": simready_to_aligned,
+        "translation": parts["translation"],
+        "rotation_matrix": parts["rotation_matrix"],
+        "scale": parts["scale"],
+    }
+    return entry
+
+
+def _trim_metric_scale(value: Any) -> dict[str, Any] | None:
+    """Return a copy of ``value`` without its file-path bookkeeping keys.
+
+    Non-dict input yields ``None``. The input dict is left unmodified.
+    """
+    if isinstance(value, dict):
+        dropped = ("result_path", "raw_model_output_path")
+        return {key: entry for key, entry in value.items() if key not in dropped}
+    return None
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Return the resolved path for ``value``, relative to ``output_root``.
+
+    ``value`` is stringified (falsy values become ``""``) and ``~`` is
+    expanded; absolute paths are resolved as they are, relative ones are
+    joined onto ``output_root`` first.
+    """
+    location = Path(str(value or "")).expanduser()
+    if not location.is_absolute():
+        location = output_root / location
+    return location.resolve()
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/prompts.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/prompts.py
new file mode 100644
index 00000000..85b41388
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/prompts.py
@@ -0,0 +1,106 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.prompts import render_prompt
+from embodichain.gen_sim.prompt2scene.utils.io import image_to_data_url
+
+__all__ = [
+    "build_image_metric_scale_messages",
+    "build_up_down_flip_check_messages",
+]
+
+# Prompt file name passed to ``render_prompt`` by the message builders below.
+UNIFIED_SCENE_GEN_PROMPT_NAME = "unified_scene_gen.yaml"
+
+
+def build_image_metric_scale_messages(
+    *,
+    bbox_name_image_path: Path,
+    objects_json: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Build the system and user messages for the image metric scale prompt.
+
+    The user message carries the rendered prompt text, with ``objects_json``
+    serialized as indented JSON, followed by the image at
+    ``bbox_name_image_path`` encoded as a data URL.
+    """
+    system_text = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        prompt_key="image_metric_scale_system",
+    )
+    objects_payload = json.dumps(objects_json, ensure_ascii=False, indent=2)
+    user_text = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        {"objects_json": objects_payload},
+        prompt_key="image_metric_scale_user",
+    )
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": image_to_data_url(bbox_name_image_path)},
+    }
+
+    system_message = {"role": "system", "content": system_text}
+    user_message = {
+        "role": "user",
+        "content": [{"type": "text", "text": user_text}, image_part],
+    }
+    return [system_message, user_message]
+
+
+def build_up_down_flip_check_messages(
+    *,
+    original_image_path: Path,
+    comparison_image_path: Path,
+) -> list[dict[str, Any]]:
+    """Build the system and user messages for the up-down flip check prompt.
+
+    The user message carries the rendered prompt text followed by two images
+    as data URLs: the original image first, then the comparison image.
+    """
+    system_text = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        prompt_key="up_down_flip_check_system",
+    )
+    user_text = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        prompt_key="up_down_flip_check_user",
+    )
+    image_parts = [
+        {"type": "image_url", "image_url": {"url": image_to_data_url(image_path)}}
+        for image_path in (original_image_path, comparison_image_path)
+    ]
+    return [
+        {"role": "system", "content": system_text},
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": user_text}, *image_parts],
+        },
+    ]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/schemas.py
new file mode 100644
index 00000000..b22fcebb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_scene_manager/schemas.py
@@ -0,0 +1,71 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+__all__ = [
+    "IMAGE_METRIC_SCALE_JSON_SCHEMA",
+    "UP_DOWN_FLIP_CHECK_JSON_SCHEMA",
+]
+
+UP_DOWN_FLIP_CHECK_JSON_SCHEMA: dict[str, Any] = {
+    "title": "AlignedUpDownFlipCheckOutput",
+    "type": "object",
+    "additionalProperties": False,  # reject keys not declared below
+    "properties": {
+        "selected_number": {"type": "integer", "enum": [1, 2]},  # only 1 or 2
+        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
+        "reason": {"type": "string"},
+    },
+    "required": ["selected_number", "confidence", "reason"],  # all mandatory
+}
+
+IMAGE_METRIC_SCALE_JSON_SCHEMA: dict[str, Any] = {
+    "title": "ImageMetricScaleEstimate",
+    "type": "object",
+    "additionalProperties": False,  # reject keys not declared below
+    "properties": {
+        "object_scales": {  # array of per-object estimates
+            "type": "array",
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    "object_id": {"type": "string"},
+                    "bbox_dims_cm": {  # unit presumably cm, going by the key name
+                        "type": "array",
+                        "minItems": 3,  # minItems == maxItems: exactly three values
+                        "maxItems": 3,
+                        "items": {
+                            "type": "number",
+                            "minimum": 1.0e-6,  # each value must be >= 1e-6
+                        },
+                    },
+                    "confidence": {
+                        "type": "number",
+                        "minimum": 0.0,
+                        "maximum": 1.0,
+                    },
+                    "reason": {"type": "string"},
+                },
+                "required": ["object_id", "bbox_dims_cm", "confidence", "reason"],
+            },
+        },
+    },
+    "required": ["object_scales"],  # the array itself may still be empty
+}
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/__init__.py
new file mode 100644
index 00000000..fbbf3148
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/__init__.py
@@ -0,0 +1,33 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager.manager import (
+    ImageSegmentationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager.schemas import (
+    AssetImageToRgbaRequest,
+    ImageSegmentationRequest,
+    ImageSegmentationResult,
+)
+
+__all__ = [
+    "AssetImageToRgbaRequest",
+    "ImageSegmentationManager",
+    "ImageSegmentationRequest",
+    "ImageSegmentationResult",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/manager.py
new file mode 100644
index 00000000..052b8d7d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/manager.py
@@ -0,0 +1,90 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+    ImageSegmentationClient,
+    ImageSegmentationError,
+    ImageSegmentationServerRequest,
+    apply_mask_to_alpha,
+    decode_rle_mask,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager.schemas import (
+    AssetImageToRgbaRequest,
+    ImageSegmentationRequest,
+    ImageSegmentationResult,
+)
+
+
+class ImageSegmentationManager:
+    """Image segmentation domain operations."""
+
+    def __init__(self, *, client: ImageSegmentationClient | None = None) -> None:
+        self.client = client or ImageSegmentationClient()
+
+    def segment_image(
+        self,
+        request: ImageSegmentationRequest,
+    ) -> ImageSegmentationResult:
+        """Segment one image with one text prompt and return the candidates.
+
+        Raises ``RuntimeError`` when the client returns an error response.
+        """
+        image_path = request.image_path.expanduser().resolve()
+        _validate_segment_request(image_path=image_path, prompt=request.prompt)
+        response = self.client.segment(
+            ImageSegmentationServerRequest(
+                prompt=request.prompt.strip(),
+                image_path=image_path,
+            ),
+        )
+        if isinstance(response, ImageSegmentationError):
+            raise RuntimeError(response.error_message)
+        return ImageSegmentationResult(candidates=list(response.result.candidates))
+
+    def convert_asset_image_to_rgba(
+        self,
+        request: AssetImageToRgbaRequest,
+    ) -> Path:
+        """Write an RGBA cutout from the first candidate's mask; return its path."""
+        # Resolve once so segmentation and compositing open the same file.
+        image_path = request.image_path.expanduser().resolve()
+        segmentation_result = self.segment_image(
+            ImageSegmentationRequest(image_path=image_path, prompt=request.prompt)
+        )
+        if not segmentation_result.candidates:
+            raise ValueError("Image segmentation returned no candidates.")
+        candidate = segmentation_result.candidates[0]
+        if candidate.mask_rle is None:
+            raise ValueError(f"Candidate {candidate.candidate_id} has no mask_rle.")
+        output_path = request.output_path.expanduser().resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        mask = decode_rle_mask(candidate.mask_rle)
+        rgba = apply_mask_to_alpha(image_path, mask)
+        rgba.save(output_path)
+        if not output_path.is_file():
+            raise FileNotFoundError(f"RGBA image was not written: {output_path}")
+        return output_path
+
+
+def _validate_segment_request(*, image_path: Path, prompt: str) -> None:
+    if not image_path.is_file():  # missing paths and directories both fail here
+        raise FileNotFoundError(f"Image segmentation input not found: {image_path}")
+    if not prompt.strip():  # whitespace-only prompts are treated as empty
+        raise ValueError("Image segmentation prompt must be non-empty.")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/schemas.py
new file mode 100644
index 00000000..d59b7e7a
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/schemas.py
@@ -0,0 +1,48 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+    ImageSegmentationCandidate,
+)
+
+
+@dataclass(frozen=True)
+class AssetImageToRgbaRequest:
+    """Request for converting an asset image to an RGBA cutout."""
+
+    image_path: Path  # source image that is segmented and then masked
+    prompt: str  # text prompt forwarded to the segmentation request
+    output_path: Path  # destination of the RGBA image; parents are created
+
+
+@dataclass(frozen=True)
+class ImageSegmentationRequest:
+    """Request for segmenting one image with one text prompt."""
+
+    image_path: Path  # expanded and resolved by the manager; must be a file
+    prompt: str  # must contain non-whitespace text; stripped before sending
+
+
+@dataclass(frozen=True)
+class ImageSegmentationResult:
+    """Segmentation candidates copied from the client response."""
+
+    candidates: list[ImageSegmentationCandidate]  # may be empty
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/__init__.py
new file mode 100644
index 00000000..21cf6c25
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/__init__.py
@@ -0,0 +1,43 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.manager import (
+    MatplotlibManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.schemas import (
+    RenderFootprintLayoutRequest,
+    RenderFootprintLayoutResult,
+    RenderImageComparisonRequest,
+    RenderImageComparisonResult,
+    RenderSupportRegionRequest,
+    RenderSupportRegionResult,
+    RenderXYComparisonRequest,
+    RenderXYComparisonResult,
+)
+
+__all__ = [
+    "MatplotlibManager",
+    "RenderFootprintLayoutRequest",
+    "RenderFootprintLayoutResult",
+    "RenderImageComparisonRequest",
+    "RenderImageComparisonResult",
+    "RenderSupportRegionRequest",
+    "RenderSupportRegionResult",
+    "RenderXYComparisonRequest",
+    "RenderXYComparisonResult",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/manager.py
new file mode 100644
index 00000000..1feb13c3
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/manager.py
@@ -0,0 +1,401 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Matplotlib manager for mesh visualization."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from matplotlib.collections import PolyCollection
+from matplotlib.patches import Rectangle
+from mpl_toolkits.mplot3d.art3d import Poly3DCollection
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.schemas import (
+    RenderFootprintLayoutRequest,
+    RenderFootprintLayoutResult,
+    RenderImageComparisonRequest,
+    RenderImageComparisonResult,
+    RenderSupportRegionRequest,
+    RenderSupportRegionResult,
+    RenderXYComparisonRequest,
+    RenderXYComparisonResult,
+)
+
+__all__ = ["MatplotlibManager"]
+
+
+class MatplotlibManager:
+    """Manager for mesh visualization via matplotlib.
+
+    Wraps matplotlib rendering with typed request/response methods,
+    following the same pattern as service clients.
+    """
+
+    def __init__(
+        self,
+        *,
+        figsize: tuple[float, float] = (8, 8),
+        dpi: int = 180,
+    ) -> None:
+        """Store render defaults shared by the ``render_*`` methods.
+
+        Args:
+            figsize: Figure size, except ``render_image_comparison`` uses (12, 6).
+            dpi: Resolution passed to ``savefig`` by every render method.
+        """
+        self._figsize = figsize
+        self._dpi = dpi
+
+    def render_footprint_layout(
+        self,
+        request: RenderFootprintLayoutRequest,
+    ) -> RenderFootprintLayoutResult:
+        """Render labeled XY footprints with full-length coordinate axes.
+
+        Each object id becomes a rectangle centred on ``request.centers[id]``
+        and sized by ``request.xy_sizes[id]``. When ``request.object_ids`` is
+        empty nothing is drawn or saved, but the resolved path is still returned.
+
+        Raises:
+            KeyError: An object id is missing from ``centers`` or ``xy_sizes``.
+        """
+        output_path = request.output_path.expanduser().resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        if not request.object_ids:
+            return RenderFootprintLayoutResult(output_path=output_path)
+
+        ids = request.object_ids
+        centers = {
+            object_id: np.asarray(request.centers[object_id], dtype=float)
+            for object_id in ids
+        }
+        sizes = {
+            object_id: np.asarray(request.xy_sizes[object_id], dtype=float)
+            for object_id in ids
+        }
+        # Union of all footprint rectangles, padded so edges stay visible.
+        data_min = np.vstack([centers[i] - 0.5 * sizes[i] for i in ids]).min(axis=0)
+        data_max = np.vstack([centers[i] + 0.5 * sizes[i] for i in ids]).max(axis=0)
+        span = np.maximum(data_max - data_min, 1.0e-6)
+        padding = max(float(span.max()) * 0.12, 1.0e-3)
+        x_limits = (float(data_min[0] - padding), float(data_max[0] + padding))
+        y_limits = (float(data_min[1] - padding), float(data_max[1] + padding))
+
+        fig, ax = plt.subplots(figsize=self._figsize)
+        try:
+            for object_id in ids:
+                center = centers[object_id]
+                size = sizes[object_id]
+                ax.add_patch(
+                    Rectangle(
+                        (center[0] - 0.5 * size[0], center[1] - 0.5 * size[1]),
+                        size[0],
+                        size[1],
+                        facecolor=(0.35, 0.60, 0.95, 0.30),
+                        edgecolor=(0.08, 0.22, 0.60, 1.0),
+                        linewidth=1.5,
+                    )
+                )
+                label = object_id.replace("interact_", "").removesuffix("_0")
+                ax.text(
+                    center[0],
+                    center[1],
+                    label,
+                    ha="center",
+                    va="center",
+                    fontsize=9,
+                    color="black",
+                )
+            self._draw_full_xy_axes(ax, x_limits=x_limits, y_limits=y_limits)
+            ax.set_xlim(*x_limits)
+            ax.set_ylim(*y_limits)
+            ax.set_aspect("equal", adjustable="box")
+            ax.set_title(request.title)
+            ax.grid(True, linestyle=":", linewidth=0.6, alpha=0.30)
+            fig.tight_layout()
+            fig.savefig(output_path, dpi=self._dpi)
+        finally:
+            # pyplot keeps figures registered until closed; always release it.
+            plt.close(fig)
+        return RenderFootprintLayoutResult(output_path=output_path)
+
+    def render_image_comparison(
+        self,
+        request: RenderImageComparisonRequest,
+    ) -> RenderImageComparisonResult:
+        """Render two images side by side with numbered labels."""
+        output_path = request.output_path.expanduser().resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        first_image = plt.imread(request.first_image_path.expanduser().resolve())
+        second_image = plt.imread(request.second_image_path.expanduser().resolve())
+        labels = (request.first_label, request.second_label)
+        fig, axes = plt.subplots(1, 2, figsize=(12, 6))
+        try:
+            for ax, image, label in zip(axes, (first_image, second_image), labels):
+                ax.imshow(image)
+                ax.set_title(label, fontsize=16, loc="left")
+                ax.axis("off")
+            fig.tight_layout()
+            fig.savefig(output_path, dpi=self._dpi, facecolor="white")
+        finally:
+            # pyplot keeps figures registered until closed; always release it.
+            plt.close(fig)
+        return RenderImageComparisonResult(output_path=output_path)
+
+    @staticmethod
+    def _draw_full_xy_axes(
+        ax: Any,
+        *,
+        x_limits: tuple[float, float],
+        y_limits: tuple[float, float],
+    ) -> None:
+        """Draw full-length X and Y arrows crossing at the viewport midpoint.
+
+        The arrows span ``x_limits`` and ``y_limits`` end to end. The point
+        captioned "Origin" is the midpoint of those limits, which is not
+        necessarily data coordinate (0, 0).
+        """
+        axis_color = "#303030"
+        x_lo, x_hi = x_limits[0], x_limits[1]
+        y_lo, y_hi = y_limits[0], y_limits[1]
+        x_mid = 0.5 * (x_lo + x_hi)
+        y_mid = 0.5 * (y_lo + y_hi)
+        x_span = x_hi - x_lo
+        y_span = y_hi - y_lo
+        for tail, head in (
+            ((x_lo, y_mid), (x_hi, y_mid)),  # horizontal X axis
+            ((x_mid, y_lo), (x_mid, y_hi)),  # vertical Y axis
+        ):
+            ax.annotate(
+                "",
+                xy=head,
+                xytext=tail,
+                arrowprops={"arrowstyle": "->", "color": axis_color, "lw": 1.8},
+                zorder=8,
+            )
+        label_style = {"color": axis_color, "fontsize": 11}
+        ax.text(
+            x_hi - 0.03 * x_span,
+            y_mid + 0.02 * y_span,
+            "+X",
+            ha="right",
+            va="bottom",
+            **label_style,
+        )
+        ax.text(
+            x_mid + 0.02 * x_span,
+            y_hi - 0.03 * y_span,
+            "+Y",
+            ha="left",
+            va="top",
+            **label_style,
+        )
+        ax.plot(x_mid, y_mid, "o", color=axis_color, markersize=6, zorder=9)
+        ax.text(
+            x_mid + 0.015 * x_span,
+            y_mid + 0.015 * y_span,
+            "Origin",
+            fontsize=8,
+            color=axis_color,
+            ha="left",
+            va="bottom",
+            zorder=9,
+        )
+
+    def render_selected_support_region(
+        self, request: RenderSupportRegionRequest
+    ) -> RenderSupportRegionResult:
+        """Render a mesh with the selected support region highlighted."""
+        output_path = request.output_path.expanduser().resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        vertices = np.asarray(request.mesh.vertices, dtype=float)
+        faces = np.asarray(request.mesh.faces, dtype=int)
+        selected_faces = faces[np.asarray(request.face_indices, dtype=int)]
+        fig = plt.figure(figsize=self._figsize)
+        try:
+            ax = fig.add_subplot(111, projection="3d")
+            # Faint full mesh first, then the selected faces opaque on top.
+            full_mesh = Poly3DCollection(
+                vertices[faces],
+                facecolors=(0.65, 0.68, 0.72, 0.16),
+                edgecolors=(0.35, 0.37, 0.40, 0.08),
+                linewidths=0.15,
+            )
+            ax.add_collection3d(full_mesh)
+            highlight = Poly3DCollection(
+                vertices[selected_faces],
+                facecolors=(1.0, 0.18, 0.05, 0.88),
+                edgecolors=(0.55, 0.02, 0.0, 1.0),
+                linewidths=0.8,
+            )
+            ax.add_collection3d(highlight)
+            self._set_equal_axes(ax, vertices)
+            ax.view_init(elev=25.0, azim=-45.0)
+            ax.set_xlabel("X")
+            ax.set_ylabel("Y")
+            ax.set_zlabel("Z")
+            ax.set_title("Selected Support Region")
+            fig.tight_layout()
+            fig.savefig(output_path, dpi=self._dpi)
+        finally:
+            # pyplot keeps figures registered until closed; always release it.
+            plt.close(fig)
+        return RenderSupportRegionResult(output_path=output_path)
+
+    def render_xy_alignment_comparison(
+        self, request: RenderXYComparisonRequest
+    ) -> RenderXYComparisonResult:
+        """Render before/after XY projections for PCA yaw alignment.
+
+        Both panels share one centre and half-width computed from the union of
+        the two meshes, so they are drawn at the same scale.
+
+        The figure is closed even when drawing or saving raises.
+        """
+        output_path = request.output_path.expanduser().resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        before_polygons, before_xy = self._xy_polygons_and_vertices(request.before_mesh)
+        after_polygons, after_xy = self._xy_polygons_and_vertices(request.after_mesh)
+        center, view_half = self._xy_view_bounds(before_xy, after_xy)
+        after_title = f"After PCA yaw ({request.angle_degrees:.2f} deg)"
+
+        fig, axes = plt.subplots(1, 2, figsize=self._figsize)
+        try:
+            panels = (
+                (before_polygons, before_xy, "Before PCA yaw"),
+                (after_polygons, after_xy, after_title),
+            )
+            for ax, (polygons, xy, title) in zip(axes, panels):
+                self._draw_xy_projection(ax, polygons, xy, title, center, view_half)
+            fig.tight_layout()
+            fig.savefig(output_path, dpi=self._dpi)
+        finally:
+            # pyplot keeps figures registered until closed; always release it.
+            plt.close(fig)
+        return RenderXYComparisonResult(output_path=output_path)
+
+    @staticmethod
+    def _xy_polygons_and_vertices(mesh: Any) -> tuple[Any, Any]:
+        """Return per-face XY polygons and per-vertex XY coordinates of ``mesh``."""
+        points = np.asarray(mesh.vertices, dtype=float)
+        return points[np.asarray(mesh.faces, dtype=int)][:, :, :2], points[:, :2]
+
+    @staticmethod
+    def _xy_view_bounds(before_xy: Any, after_xy: Any) -> tuple[Any, float]:
+        """Return the joint bounds centre and a view half-width (at least 0.5)."""
+        points = np.concatenate([before_xy, after_xy], axis=0)
+        lower = points.min(axis=0)
+        upper = points.max(axis=0)
+        # 0.65 x the larger extent leaves a margin around the combined bounds.
+        widest = float(np.maximum(upper - lower, 1e-3).max())
+        return 0.5 * (lower + upper), max(widest * 0.65, 0.5)
+
+    def _draw_xy_projection(
+        self,
+        ax: Any,
+        polygons_xy: Any,
+        vertices_xy: Any,
+        title: str,
+        center: Any,
+        view_half: float,
+    ) -> None:
+        """Draw one projection panel: mesh faces, bounding box, axes and grid."""
+        face_layer = PolyCollection(
+            polygons_xy,
+            facecolors=(0.24, 0.50, 0.90, 0.28),
+            edgecolors=(0.05, 0.16, 0.35, 0.20),
+            linewidths=0.20,
+        )
+        ax.add_collection(face_layer)
+        self._draw_xy_aabb(ax, vertices_xy)
+        self._add_xy_axes(ax, view_half)
+        ax.set_xlim(center[0] - view_half, center[0] + view_half)
+        ax.set_ylim(center[1] - view_half, center[1] + view_half)
+        ax.set_aspect("equal", adjustable="box")
+        ax.set_xlabel("X")
+        ax.set_ylabel("Y")
+        ax.set_title(title)
+        ax.grid(True, which="major", linestyle="-", linewidth=0.7, alpha=0.35)
+        ax.minorticks_on()
+        ax.grid(True, which="minor", linestyle=":", linewidth=0.45, alpha=0.25)
+
+    @staticmethod
+    def _draw_xy_aabb(ax: Any, vertices_xy: Any) -> None:
+        """Outline the axis-aligned bounding box of ``vertices_xy``."""
+        lower = vertices_xy.min(axis=0)
+        # Unpacking requires exactly two columns (X and Y).
+        width, height = vertices_xy.max(axis=0) - lower
+        outline = Rectangle(
+            (lower[0], lower[1]),
+            width,
+            height,
+            fill=False,
+            edgecolor="#d62828",
+            linewidth=1.6,
+            linestyle="-",
+            alpha=0.95,
+        )
+        ax.add_patch(outline)
+
+    @staticmethod
+    def _add_xy_axes(ax: Any, view_half: float) -> None:
+        """Mark data coordinate (0, 0) and draw +X / +Y arrows from it.
+
+        The arrow length is 35% of ``view_half`` but never below 0.2; shaft
+        width and head size are fixed fractions of that length, and each
+        axis label sits at 1.08x the arrow length along its axis.
+        """
+        arrow_len = max(view_half * 0.35, 0.2)
+        ax.scatter([0.0], [0.0], color="black", s=22, zorder=8)
+        ax.text(0.0, 0.0, " Origin", fontsize=9, ha="left", va="bottom")
+
+        # One (dx, dy, label, color) row per axis, drawn in this order.
+        axis_specs = (
+            (arrow_len, 0.0, "+X", "#d62828"),
+            (0.0, arrow_len, "+Y", "#2a9d8f"),
+        )
+        for dx, dy, label, color in axis_specs:
+            ax.arrow(
+                0.0,
+                0.0,
+                dx,
+                dy,
+                width=arrow_len * 0.015,
+                head_width=arrow_len * 0.06,
+                head_length=arrow_len * 0.08,
+                color=color,
+                length_includes_head=True,
+                zorder=9,
+            )
+            ax.text(dx * 1.08, dy * 1.08, label, color=color, fontsize=11)
+
+    @staticmethod
+    def _set_equal_axes(ax: Any, vertices: Any) -> None:
+        """Set equal-length X, Y and Z limits centred on the vertex bounds."""
+        lower = np.min(vertices, axis=0)
+        upper = np.max(vertices, axis=0)
+        midpoint = (lower + upper) * 0.5
+        half = max(float(np.max(upper - lower)) * 0.5, 1e-6)
+        for axis, setter in enumerate(("set_xlim", "set_ylim", "set_zlim")):
+            getattr(ax, setter)(midpoint[axis] - half, midpoint[axis] + half)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/schemas.py
new file mode 100644
index 00000000..764383f3
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/schemas.py
@@ -0,0 +1,101 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+__all__ = [
+    "RenderFootprintLayoutRequest",
+    "RenderFootprintLayoutResult",
+    "RenderImageComparisonRequest",
+    "RenderImageComparisonResult",
+    "RenderSupportRegionRequest",
+    "RenderSupportRegionResult",
+    "RenderXYComparisonRequest",
+    "RenderXYComparisonResult",
+]
+
+
+@dataclass(frozen=True)
+class RenderFootprintLayoutRequest:
+    """Request to render labeled top-down object footprints."""
+
+    object_ids: list[str]  # objects to draw; an empty list skips rendering
+    centers: dict[str, Any]  # object id -> footprint centre, read as floats
+    xy_sizes: dict[str, Any]  # object id -> footprint size, indexed as (x, y)
+    output_path: Path  # image destination; parent directories are created
+    title: str = ""  # used as the axes title
+
+
+@dataclass(frozen=True)
+class RenderFootprintLayoutResult:
+    """Result of rendering a footprint layout."""
+
+    output_path: Path  # resolved path; no file is written for empty object_ids
+
+
+@dataclass(frozen=True)
+class RenderImageComparisonRequest:
+    """Request to render two labeled images side by side."""
+
+    first_image_path: Path  # image shown in the first (left) panel
+    second_image_path: Path  # image shown in the second (right) panel
+    output_path: Path  # image destination; parent directories are created
+    first_label: str = "1: normal"  # title of the first panel
+    second_label: str = "2: flipped"  # title of the second panel
+
+
+@dataclass(frozen=True)
+class RenderImageComparisonResult:
+    """Result of rendering an image comparison."""
+
+    output_path: Path  # resolved path the comparison figure was saved to
+
+
+@dataclass(frozen=True)
+class RenderSupportRegionRequest:
+    """Request to render a mesh with the selected support region highlighted."""
+
+    mesh: Any  # must expose ``vertices`` and ``faces`` attributes
+    face_indices: list[int]  # rows of ``mesh.faces`` to highlight
+    output_path: Path  # image destination; parent directories are created
+
+
+@dataclass(frozen=True)
+class RenderSupportRegionResult:
+    """Result of rendering the support region."""
+
+    output_path: Path  # resolved path the figure was saved to
+
+
+@dataclass(frozen=True)
+class RenderXYComparisonRequest:
+    """Request to render before/after XY projections for PCA yaw alignment."""
+
+    before_mesh: Any  # must expose ``vertices`` and ``faces``; first panel
+    after_mesh: Any  # must expose ``vertices`` and ``faces``; second panel
+    angle_degrees: float  # shown in the second panel title with two decimals
+    output_path: Path  # image destination; parent directories are created
+
+
+@dataclass(frozen=True)
+class RenderXYComparisonResult:
+    """Result of rendering the XY alignment comparison."""
+
+    output_path: Path  # resolved path the figure was saved to
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/metric_scale_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/metric_scale_manager/__init__.py
new file mode 100644
index 00000000..8eca3510
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/metric_scale_manager/__init__.py
@@ -0,0 +1,37 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.metric_scale_manager.manager import (
+    METRIC_SCALE_ENABLED,
+    MetricScaleManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.metric_scale_manager.schemas import (
+    EstimateMetricScalesRequest,
+    EstimateMetricScalesResult,
+    GlobalMetricScaleRequest,
+    MetricScaleObjectInput,
+)
+
+__all__ = [
+    "METRIC_SCALE_ENABLED",
+    "EstimateMetricScalesRequest",
+    "EstimateMetricScalesResult",
+    "GlobalMetricScaleRequest",
+    "MetricScaleManager",
+    "MetricScaleObjectInput",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/metric_scale_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/metric_scale_manager/manager.py
new file mode 100644
index 00000000..ce1d47e9
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/metric_scale_manager/manager.py
@@ -0,0 +1,431 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+    GeometryManager,
+    LoadMeshRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.metric_scale_manager.schemas import (
+    EstimateMetricScalesRequest,
+    EstimateMetricScalesResult,
+    GlobalMetricScaleRequest,
+    MetricScaleObjectInput,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.workflows.llm_output import (
+    call_structured_json_model_step,
+)
+
+__all__ = ["METRIC_SCALE_ENABLED", "MetricScaleManager"]
+
+METRIC_SCALE_ENABLED = True  # when False, global scale aggregation reports "disabled"
+
+
+class MetricScaleManager:
+    """Manager for metric scale estimation and scale aggregation."""
+
+    @staticmethod
+    def estimate_metric_scales(
+        request: EstimateMetricScalesRequest,
+    ) -> EstimateMetricScalesResult:
+        """Estimate per-object metric scales with one structured LLM call.
+
+        Measures each mesh bbox, queries the model with the request's messages
+        and schema, and returns the derived scale records with ``status="ok"``.
+        """
+        measured = MetricScaleManager.build_object_payload(request.objects)
+        raw_path = request.raw_output_path
+        if raw_path is not None:
+            raw_path = raw_path.expanduser().resolve()
+        # Only hand the model step a writer when a dump location was requested.
+        writer = None if raw_path is None else (lambda p: write_json(raw_path, p))
+        model_output = call_structured_json_model_step(
+            llm=request.llm,
+            schema=request.schema,
+            messages=request.messages,
+            context=request.context,
+            step_name=request.step_name,
+            output_root=None,
+            attempt_count=0,
+            raw_output_writer=writer,
+        )
+        scales = MetricScaleManager.apply_model_output(
+            object_payload=measured,
+            raw_model_output=model_output,
+            method=request.method,
+        )
+        return EstimateMetricScalesResult(
+            status="ok",
+            object_scales=scales,
+            object_payload=measured,
+            raw_model_output=model_output,
+        )
+
+    @staticmethod
+    def build_object_payload(
+        objects: list[MetricScaleObjectInput],
+    ) -> list[dict[str, Any]]:
+        """Measure each object's mesh and describe it for scale estimation.
+
+        Entries hold id, name, description, mesh AABB size and its bbox ratio.
+        """
+        loader = GeometryManager()
+
+        def describe(item: MetricScaleObjectInput) -> dict[str, Any]:
+            loaded = loader.load_mesh(LoadMeshRequest(mesh_path=item.mesh_path))
+            size_m = GeometryManager.mesh_aabb_size(loaded.mesh)
+            return {
+                "object_id": item.object_id,
+                "object_name": item.object_name,
+                "object_description": item.object_description,
+                "normalized_bbox_size_m": size_m.tolist(),
+                "normalized_bbox_ratio": GeometryManager.bbox_ratio(size_m).tolist(),
+            }
+
+        return [describe(item) for item in objects]
+
+    @staticmethod
+    def object_prompt_payload(
+        objects: list[MetricScaleObjectInput],
+    ) -> list[dict[str, str]]:
+        """Project objects onto the fields used in LLM prompts.
+
+        Only ``object_id``, ``object_name`` and ``object_description`` are kept.
+        """
+        fields = ("object_id", "object_name", "object_description")
+        prompt_items: list[dict[str, str]] = []
+        for item in objects:
+            prompt_items.append({field: getattr(item, field) for field in fields})
+        return prompt_items
+
+    @staticmethod
+    def apply_model_output(
+        *,
+        object_payload: list[dict[str, Any]],
+        raw_model_output: dict[str, Any],
+        method: str,
+    ) -> list[dict[str, Any]]:
+        """Convert model bbox predictions into per-object metric-scale records.
+
+        Null ``object_scales`` and unparsable ``confidence`` values are tolerated.
+        """
+        predictions = (raw_model_output or {}).get("object_scales") or []
+        model_by_id = {
+            str(p.get("object_id", "")): p for p in predictions if isinstance(p, dict)
+        }
+        missing_reason = "missing_object_scale_from_model"
+        estimates: list[dict[str, Any]] = []
+        for payload in object_payload:
+            object_id = str(payload.get("object_id", ""))
+            model_item = model_by_id.get(object_id)
+            if model_item is None:
+                record = MetricScaleManager.failure(
+                    object_id=object_id, reason=missing_reason, method=method
+                )
+            else:
+                try:
+                    confidence = float(model_item.get("confidence", 0.0))
+                except (TypeError, ValueError):
+                    confidence = 0.0  # malformed value must not abort the batch
+                size = payload["normalized_bbox_size_m"]
+                record = MetricScaleManager.select_candidate(
+                    object_id=object_id,
+                    object_name=str(payload.get("object_name", "")),
+                    object_description=str(payload.get("object_description", "")),
+                    bbox_dims_cm=model_item.get("bbox_dims_cm", []),
+                    confidence=confidence,
+                    reason=str(model_item.get("reason", "")),
+                    normalized_bbox_size_m=np.asarray(size, dtype=np.float64),
+                    method=method,
+                )
+            estimates.append(record)
+        return estimates
+
+    @staticmethod
+    def apply_to_objects(
+        *,
+        objects: list[dict[str, Any]],
+        object_scales: list[dict[str, Any]],
+    ) -> None:
+        """Attach each scale record to the object dict that shares its id."""
+        records = {str(rec.get("object_id", "")): rec for rec in object_scales}
+        matched = (obj for obj in objects if str(obj.get("id", "")) in records)
+        for obj in matched:
+            # With repeated ids the last record wins, as the dict keeps one per key.
+            obj["metric_scale"] = records[str(obj.get("id", ""))]
+
+    @staticmethod
+    def select_candidate(
+        *,
+        object_id: str,
+        object_name: str,
+        object_description: str,
+        bbox_dims_cm: Any,
+        confidence: float,
+        reason: str,
+        normalized_bbox_size_m: np.ndarray,
+        method: str,
+    ) -> dict[str, Any]:
+        """Build the metric-scale record for one object from its predicted bbox.
+
+        Returns an ``"ok"`` record holding the measured bbox, the predicted
+        ``bbox_dims_cm`` and the resulting ``scale_factor``; predictions that
+        raise ``TypeError``/``ValueError`` yield an ``invalid_bbox_dims_cm`` failure.
+        """
+        try:
+            chosen = MetricScaleManager.compute_from_bbox_dims(
+                bbox_dims_cm=bbox_dims_cm,
+                confidence=confidence,
+                reason=reason,
+                normalized_bbox_size_m=normalized_bbox_size_m,
+            )
+        except (TypeError, ValueError):
+            return MetricScaleManager.failure(
+                object_id=object_id, reason="invalid_bbox_dims_cm", method=method
+            )
+        size_cm = np.asarray(normalized_bbox_size_m, dtype=np.float64) * 100.0
+        ratio = GeometryManager.bbox_ratio(normalized_bbox_size_m)
+        return {
+            "status": "ok",
+            "method": method,
+            "object_id": object_id,
+            "object_name": object_name,
+            "object_description": object_description,
+            "normalized_bbox_size_m": normalized_bbox_size_m.tolist(),
+            "normalized_bbox_size_cm": size_cm.tolist(),
+            "normalized_bbox_ratio": ratio.tolist(),
+            "bbox_dims_cm": chosen["bbox_dims_cm"],
+            "axis_match": chosen["axis_match"],
+            "scale_factor": chosen["scale_factor"],
+            "confidence": chosen["confidence"],
+            "reason": chosen["reason"],
+            "unit_note": "scale_factor is not baked into this GLB.",
+        }
+
+    @staticmethod
+    def compute_from_bbox_dims(
+        *,
+        bbox_dims_cm: Any,
+        confidence: float,
+        reason: str,
+        normalized_bbox_size_m: np.ndarray,
+    ) -> dict[str, Any]:
+        """Compute one scale candidate from model-predicted bbox dimensions.
+
+        Raises ``TypeError``/``ValueError`` unless given three finite positive numbers.
+        """
+        # Strings ("123" -> [1, 2, 3]) and NaN/inf would pass the naive checks.
+        is_text = isinstance(bbox_dims_cm, (str, bytes))
+        dims_cm = np.asarray([float(v) for v in bbox_dims_cm], dtype=np.float64)
+        valid = dims_cm.shape == (3,) and np.all(np.isfinite(dims_cm) & (dims_cm > 0.0))
+        if is_text or not valid:
+            raise ValueError("bbox_dims_cm must contain three positive values.")
+        size_cm = np.asarray(normalized_bbox_size_m, dtype=np.float64) * 100.0
+        axis_match = GeometryManager.best_axis_bbox_scale_match(
+            source_size_cm=size_cm, target_size_cm=dims_cm
+        )
+        return {
+            "bbox_dims_cm": dims_cm.tolist(),
+            "axis_match": axis_match,
+            "scale_factor": float(axis_match["scale_factor"]),
+            "confidence": confidence,
+            "reason": reason,
+        }
+
+    @staticmethod
+    def failure(
+        *,
+        object_id: str,
+        reason: str,
+        method: str,
+    ) -> dict[str, Any]:
+        """Return a ``"failed"`` scale record with the neutral factor 1.0.
+
+        The record echoes ``object_id``, ``method`` and ``reason`` unchanged, in
+        the key order status, method, object_id, scale_factor, reason.
+        """
+        record: dict[str, Any] = {"status": "failed", "method": method}
+        record.update(object_id=object_id, scale_factor=1.0, reason=reason)
+        return record
+
+    @staticmethod
+    def set_for_all_objects(
+        *,
+        objects: list[dict[str, Any]],
+        status: str,
+        reason: str,
+        method: str,
+    ) -> None:
+        """Overwrite ``metric_scale`` on every object with a factor-1.0 record.
+
+        Each record carries the given ``status``, ``method`` and ``reason`` plus
+        the object's own stringified ``id``.
+        """
+        head = {"status": status, "method": method}
+        tail = {"scale_factor": 1.0, "reason": reason}
+        for obj in objects:
+            obj["metric_scale"] = {**head, "object_id": str(obj.get("id", "")), **tail}
+
+    @staticmethod
+    def compute_global_from_object_scenes(
+        request: GlobalMetricScaleRequest,
+    ) -> dict[str, Any]:
+        """Aggregate object metric scales into one global scale for a scene layout.
+
+        Each object's ``scale_factor`` is divided by the median sorted-axis ratio
+        between its current scene bbox and its recorded normalized bbox; the mean
+        of these effective scales is clamped to ``[min_scale, max_scale]``.
+
+        Args:
+            request: Object records, ``(object_id, scene)`` pairs and clamp range.
+
+        Returns:
+            A report whose ``status`` is ``"disabled"``, ``"fallback"`` (no usable
+            object, factor 1.0) or ``"ok"``, listing used and skipped objects.
+        """
+        if not METRIC_SCALE_ENABLED:
+            return {
+                "status": "disabled",
+                "method": "metric_scale_disabled",
+                "scale_factor": 1.0,
+                "object_count": len(request.objects),
+                "used_count": 0,
+                "skipped_count": len(request.objects),
+                "used": [],
+                "skipped": [
+                    {"id": str(item.get("id", "")), "reason": "metric_scale_disabled"}
+                    for item in request.objects
+                ],
+                "unit_note": (
+                    "Metric scale is disabled; aligned GLBs keep simready "
+                    "normalized size."
+                ),
+            }
+
+        used: list[dict[str, Any]] = []
+        skipped: list[dict[str, Any]] = []
+
+        def skip(object_id: Any, why: str) -> None:
+            skipped.append({"id": object_id, "reason": why})
+
+        object_by_id = {str(item.get("id", "")): item for item in request.objects}
+        for object_id, scene in request.object_scenes:
+            # Keys are stringified above, so the lookup is stringified as well.
+            item = object_by_id.get(str(object_id))
+            if item is None:
+                skip(object_id, "missing_object_record")
+                continue
+            metric_scale = item.get("metric_scale")
+            if not isinstance(metric_scale, dict):
+                skip(object_id, "missing_metric_scale")
+                continue
+            if metric_scale.get("status") != "ok":
+                skip(object_id, str(metric_scale.get("status") or "not_ok"))
+                continue
+
+            try:
+                scale_factor_simready = float(metric_scale.get("scale_factor", 1.0))
+            except (TypeError, ValueError):
+                # A non-numeric factor is skipped like an out-of-range one.
+                skip(object_id, "invalid_simready_scale_factor")
+                continue
+            if not np.isfinite(scale_factor_simready) or scale_factor_simready <= 0.0:
+                skip(object_id, "invalid_simready_scale_factor")
+                continue
+            try:
+                simready_size_m = np.asarray(
+                    [float(v) for v in metric_scale.get("normalized_bbox_size_m", [])],
+                    dtype=np.float64,
+                )
+            except (TypeError, ValueError):
+                skip(object_id, "invalid_normalized_bbox_size_m")
+                continue
+            if simready_size_m.shape != (3,) or np.any(simready_size_m <= 0.0):
+                skip(object_id, "invalid_normalized_bbox_size_m")
+                continue
+
+            current_bounds = np.asarray(GeometryManager.scene_to_mesh(scene).bounds)
+            current_size_m = current_bounds[1] - current_bounds[0]
+            if current_size_m.shape != (3,) or np.any(current_size_m <= 0.0):
+                skip(object_id, "invalid_current_scene_aabb")
+                continue
+
+            # Sorting both extents makes the ratio independent of axis order.
+            geo_ratio = np.sort(current_size_m) / np.sort(simready_size_m)
+            geo_scale = float(np.median(geo_ratio))
+            if not np.isfinite(geo_scale) or geo_scale <= 0.0:
+                skip(object_id, "non_positive_geo_scale")
+                continue
+
+            effective_scale = scale_factor_simready / geo_scale
+            if not np.isfinite(effective_scale) or effective_scale <= 0.0:
+                skip(object_id, "non_positive_effective_scale")
+                continue
+
+            used.append(
+                {
+                    "id": object_id,
+                    "effective_scale": effective_scale,
+                    "scale_factor_simready": scale_factor_simready,
+                    "geo_scale": geo_scale,
+                    "simready_bbox_size_m": simready_size_m.tolist(),
+                    "simready_bbox_size_cm": (simready_size_m * 100.0).tolist(),
+                    "current_scene_bbox_size_m": current_size_m.tolist(),
+                    "current_scene_bbox_size_cm": (current_size_m * 100.0).tolist(),
+                    "target_bbox_dims_cm": metric_scale.get("bbox_dims_cm"),
+                    "confidence": metric_scale.get("confidence"),
+                }
+            )
+
+        if used:
+            raw_scale = float(np.mean([entry["effective_scale"] for entry in used]))
+            scale = float(np.clip(raw_scale, request.min_scale, request.max_scale))
+            status = "ok"
+            unit_note = (
+                "Global scale derived from scene-level VLM per-object scale_factor "
+                "divided by the geometric scale ratio between simready normalized "
+                "bbox and current aligned scene bbox (sorted, permutation-invariant). "
+                f"Aggregated via mean across objects, clamped to "
+                f"[{request.min_scale:.2f}, {request.max_scale:.2f}]."
+            )
+        else:
+            # Nothing usable: report a fallback with the neutral factor 1.0.
+            raw_scale = scale = 1.0
+            status = "fallback"
+            unit_note = (
+                "No valid metric scale was available; image clutter keeps the "
+                "SAM3D layout scale without an additional metric scale."
+            )
+        return {
+            "status": status,
+            "method": "simready_reference_geo_ratio_mean_with_clamp",
+            "scale_factor": scale,
+            "raw_scale_factor": raw_scale,
+            "was_clamped": bool(scale != raw_scale),
+            "clamp": {"min": request.min_scale, "max": request.max_scale},
+            "object_count": len(request.objects),
+            "used_count": len(used),
+            "skipped_count": len(skipped),
+            "used": used,
+            "skipped": skipped,
+            "unit_note": unit_note,
+        }
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/metric_scale_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/metric_scale_manager/schemas.py
new file mode 100644
index 00000000..dd2de343
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/metric_scale_manager/schemas.py
@@ -0,0 +1,73 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+__all__ = [
+    "EstimateMetricScalesRequest",
+    "EstimateMetricScalesResult",
+    "GlobalMetricScaleRequest",
+    "MetricScaleObjectInput",
+]
+
+
+@dataclass(frozen=True)
+class MetricScaleObjectInput:
+    """Object input for metric-scale estimation."""
+
+    object_id: str  # key used to match model output back to this object
+    object_name: str  # name included in the prompt payload
+    object_description: str  # description included in the prompt payload
+    mesh_path: Path  # mesh file that is loaded and measured
+
+
+@dataclass(frozen=True)
+class EstimateMetricScalesRequest:
+    """Request to estimate metric scale for a set of normalized objects."""
+
+    objects: list[MetricScaleObjectInput]  # objects whose meshes are measured
+    messages: list[dict[str, Any]]  # prompt messages forwarded to the model step
+    schema: dict[str, Any]  # JSON schema forwarded to the model step
+    llm: Any  # model handle forwarded to the model step
+    context: str  # forwarded to the model step as ``context``
+    method: str  # label copied into every per-object record
+    step_name: str = "metric_scale"  # step name passed to the model step
+    raw_output_path: Path | None = None  # optional JSON dump target for raw output
+
+
+@dataclass(frozen=True)
+class EstimateMetricScalesResult:
+    """Result of estimating metric scale for normalized objects."""
+
+    status: str  # "ok" is the value set by estimate_metric_scales
+    object_scales: list[dict[str, Any]]  # one scale record per payload entry
+    object_payload: list[dict[str, Any]]  # measured bbox payload per object
+    raw_model_output: dict[str, Any] | None = None  # model output as returned
+    reason: str = ""  # NOTE(review): presumably explains non-"ok" statuses; confirm
+
+
+@dataclass(frozen=True)
+class GlobalMetricScaleRequest:
+    """Request to aggregate per-object metric scales into one scene scale."""
+
+    objects: list[dict[str, Any]]  # object dicts read via "id" and "metric_scale"
+    object_scenes: list[tuple[str, Any]]  # (object id, scene) pairs
+    min_scale: float = 0.10  # lower clamp bound of the global scale
+    max_scale: float = 10.00  # upper clamp bound of the global scale
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/optimization_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/optimization_manager/__init__.py
new file mode 100644
index 00000000..b61756bf
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/optimization_manager/__init__.py
@@ -0,0 +1,37 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.optimization_manager.manager import (
+    _center_xy_aabb_layout,
+    _footprint_layout_diagnostics,
+    _object_scenes_xy_aabb_manifest,
+    _settle_and_pack_object_footprints,
+    _xy_aabb_overlap,
+    _xy_union_area,
+    _xy_union_bounds,
+)
+
+__all__ = [
+    "_center_xy_aabb_layout",
+    "_footprint_layout_diagnostics",
+    "_object_scenes_xy_aabb_manifest",
+    "_settle_and_pack_object_footprints",
+    "_xy_aabb_overlap",
+    "_xy_union_area",
+    "_xy_union_bounds",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/optimization_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/optimization_manager/manager.py
new file mode 100644
index 00000000..d7ed1348
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/optimization_manager/manager.py
@@ -0,0 +1,633 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+import tempfile
+import traceback
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager import (
+    SimulationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+    GravityDropRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.scene_geometry import (
+    _aabb_bottom_to_xy_plane_transform,
+    _copy_scene_with_transform,
+    _matrix_from_json,
+    _scene_to_mesh,
+    _xy_aabb_center,
+    _xy_aabb_size,
+    _z_up_to_glb_y_up_transform,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import (
+    relative_path,
+)
+
+__all__ = [
+    "_center_xy_aabb_layout",
+    "_object_scenes_xy_aabb_manifest",
+    "_settle_and_pack_object_footprints",
+    "_xy_aabb_overlap",
+    "_xy_union_area",
+    "_xy_union_bounds",
+]
+
+def _object_scenes_xy_aabb_manifest(
+    *,
+    object_scenes: list[tuple[str, Any]],
+    trimesh: Any,
+    unit_scale: float,
+    unit: str,
+) -> dict[str, Any]:
+    """Summarize the XY bounding rectangle shared by all object scenes.
+
+    Returns an ``"empty"`` manifest without scenes, else the union's XY stats.
+
+    Args:
+        object_scenes: ``(object_id, scene)`` pairs to measure.
+        trimesh: Passed through to the scene-to-mesh conversion.
+        unit_scale: Multiplier applied to the XY bounds before reporting.
+        unit: Unit label copied into the manifest.
+    """
+    if not object_scenes:
+        return {"status": "empty", "unit": unit, "object_count": 0}
+    all_bounds = [
+        np.asarray(_scene_to_mesh(scene, trimesh=trimesh).bounds, dtype=np.float64)
+        for _, scene in object_scenes
+    ]
+    lower = np.vstack([bounds[0] for bounds in all_bounds]).min(axis=0)
+    upper = np.vstack([bounds[1] for bounds in all_bounds]).max(axis=0)
+    min_xy, max_xy = lower[:2] * unit_scale, upper[:2] * unit_scale
+    extent_xy = max_xy - min_xy
+    return {
+        "status": "ok",
+        "unit": unit,
+        "object_count": len(object_scenes),
+        "min_xy": min_xy.tolist(),
+        "max_xy": max_xy.tolist(),
+        "center_xy": (0.5 * (min_xy + max_xy)).tolist(),
+        "size_xy": extent_xy.tolist(),
+        "area": float(extent_xy[0] * extent_xy[1]),
+    }
+
+
+
+def _settle_and_pack_object_footprints(
+    *,
+    object_scenes: list[tuple[str, Any]],
+    output_dir: Path,
+    output_root: Path,
+    trimesh: Any,
+) -> dict[str, Any]:
+    """Gravity-settle every object on the XY plane, then pack their footprints.
+
+    Each scene is bottom-aligned to the XY plane, exported as a temporary GLB
+    and dropped in simulation; a failed drop is recorded and replaced by the
+    identity pose. The settled XY footprints go through the layout optimizer
+    and each scene gets placement @ gravity @ bottom-alignment applied.
+
+    Returns:
+        Dict with ``object_scenes``, ``object_layout_transforms`` and ``manifest``.
+    """
+    sim = SimulationManager(headless=True, sim_device="cpu")
+    footprint_items: list[dict[str, Any]] = []
+    settled_entries: list[dict[str, Any]] = []
+    output_axis_transform = _z_up_to_glb_y_up_transform()
+    output_to_internal_transform = np.linalg.inv(output_axis_transform)
+
+    with tempfile.TemporaryDirectory(prefix="p2s_footprint_drop_") as tmp_dir:
+        tmp_path = Path(tmp_dir)
+        for object_id, scene in object_scenes:
+            mesh = _scene_to_mesh(scene, trimesh=trimesh)
+            mesh_bounds = np.asarray(mesh.bounds, dtype=np.float64)
+            mesh_z_height = max(float(mesh_bounds[1][2] - mesh_bounds[0][2]), 0.0)
+            to_floor = _aabb_bottom_to_xy_plane_transform(mesh_bounds)
+            normalized_scene = _copy_scene_with_transform(scene, to_floor)
+            # The GLB handed to the simulator is Y-up; internal math stays Z-up.
+            pre_gravity_path = tmp_path / f"{object_id}_pre_gravity.glb"
+            _copy_scene_with_transform(normalized_scene, output_axis_transform).export(
+                pre_gravity_path
+            )
+            gravity_initial_height = mesh_z_height * 0.1
+
+            gravity_status = "ok"
+            gravity_transform = np.eye(4, dtype=np.float64)
+            gravity_reason = ""
+            try:
+                gravity_result = sim.run_gravity_simulation(
+                    GravityDropRequest(
+                        glb_path=pre_gravity_path,
+                        max_convex_hull_num=32,
+                        initial_height=gravity_initial_height,
+                    )
+                )
+                gravity_transform = _matrix_from_json(
+                    gravity_result.final_pose,
+                    name=f"{object_id}.gravity_final_pose",
+                )
+            except Exception:
+                # Best effort: keep the identity pose and record the traceback.
+                gravity_status = "failed"
+                gravity_reason = traceback.format_exc()
+
+            settled = _copy_scene_with_transform(normalized_scene, gravity_transform)
+            settled_bounds = np.asarray(
+                _scene_to_mesh(settled, trimesh=trimesh).bounds, dtype=np.float64
+            )
+            settled_entries.append(
+                {
+                    "id": object_id,
+                    "scene": scene,
+                    "bottom_to_xy_plane_transform": to_floor,
+                    "mesh_z_height": mesh_z_height,
+                    "gravity_initial_height": gravity_initial_height,
+                    "gravity_transform": gravity_transform,
+                    "settled_bounds": settled_bounds,
+                    # Measured once here so the layout step need not rebuild meshes.
+                    "source_xy_center": _xy_aabb_center(mesh_bounds),
+                    "settled_xy_center": _xy_aabb_center(settled_bounds),
+                    "settled_xy_size": _xy_aabb_size(settled_bounds),
+                    "gravity_status": gravity_status,
+                    "gravity_reason": gravity_reason,
+                }
+            )
+
+    layout_result = _optimize_xy_aabb_footprint_layout(
+        object_ids=[str(entry["id"]) for entry in settled_entries],
+        xy_sizes={
+            str(entry["id"]): np.asarray(entry["settled_xy_size"], dtype=np.float64)
+            for entry in settled_entries
+        },
+        current_centers={
+            str(entry["id"]): entry["source_xy_center"] for entry in settled_entries
+        },
+    )
+    target_centers = layout_result["centers"]
+
+    packed_object_scenes: list[tuple[str, Any]] = []
+    object_layout_transforms: dict[str, np.ndarray] = {}
+    for entry in settled_entries:
+        object_id = str(entry["id"])
+        target_xy = target_centers[object_id]
+        settled_center = entry["settled_xy_center"]
+        # Shift to the optimized XY center and rest the settled bottom on z = 0.
+        placement_transform = np.eye(4, dtype=np.float64)
+        placement_transform[:3, 3] = [
+            float(target_xy[0] - settled_center[0]),
+            float(target_xy[1] - settled_center[1]),
+            -float(entry["settled_bounds"][0][2]),
+        ]
+        object_transform = (
+            placement_transform
+            @ entry["gravity_transform"]
+            @ entry["bottom_to_xy_plane_transform"]
+        )
+        packed_scene = _copy_scene_with_transform(entry["scene"], object_transform)
+        packed_object_scenes.append((object_id, packed_scene))
+        object_layout_transforms[object_id] = object_transform
+
+        packed_bounds = np.asarray(
+            _scene_to_mesh(packed_scene, trimesh=trimesh).bounds, dtype=np.float64
+        )
+        footprint_items.append(
+            {
+                "id": object_id,
+                "gravity_status": entry["gravity_status"],
+                "gravity_reason": entry["gravity_reason"],
+                "bottom_to_xy_plane_transform": entry[
+                    "bottom_to_xy_plane_transform"
+                ].tolist(),
+                "mesh_z_height": entry["mesh_z_height"],
+                "gravity_initial_height": entry["gravity_initial_height"],
+                "gravity_transform": entry["gravity_transform"].tolist(),
+                "placement_transform": placement_transform.tolist(),
+                "object_layout_transform": object_transform.tolist(),
+                "settled_xy_size": entry["settled_xy_size"].tolist(),
+                "target_xy_center": target_xy.tolist(),
+                "packed_bounds": packed_bounds.tolist(),
+            }
+        )
+
+    manifest = {
+        "status": "ok",
+        "method": "per_object_gravity_then_geometry_knn_2d_aabb_relaxation",
+        "output_dir": relative_path(str(output_dir), output_root),
+        "internal_up_axis": [0.0, 0.0, 1.0],
+        "gravity_glb_up_axis": [0.0, 1.0, 0.0],
+        "internal_to_gravity_glb_transform": output_axis_transform.tolist(),
+        "gravity_glb_to_internal_transform": output_to_internal_transform.tolist(),
+        "layout_optimization": layout_result["metadata"],
+        "items": footprint_items,
+    }
+    return {
+        "object_scenes": packed_object_scenes,
+        "object_layout_transforms": object_layout_transforms,
+        "manifest": manifest,
+    }
+
+
+
+def _optimize_xy_aabb_footprint_layout(
+    *,
+    object_ids: list[str],
+    xy_sizes: dict[str, np.ndarray],
+    current_centers: dict[str, np.ndarray],
+    padding_ratio: float = 0.08,
+) -> dict[str, Any]:
+    """Relax 2D footprint centers until padded XY AABBs stop overlapping.
+
+    Every iteration runs four steps in order:
+
+    1. push each overlapping pair apart along the axis of smaller overlap,
+    2. pull each k-nearest-neighbour pair back toward its initial offset,
+    3. apply a compactness pull when the union AABB area has grown past
+       ``target_expansion_ratio`` times the initial area,
+    4. re-center the whole layout so the union AABB is centered on the
+       origin.
+
+    Args:
+        object_ids: Objects to lay out; pair iteration follows this order.
+        xy_sizes: Per-object XY footprint size, indexed as ``[0]`` and ``[1]``.
+        current_centers: Per-object starting XY center. Objects missing from
+            this mapping start at the origin.
+        padding_ratio: Fraction of the largest footprint extent used as the
+            clearance added to every pairwise overlap test.
+
+    Returns:
+        A dict with ``"centers"`` (object id -> final XY center) and
+        ``"metadata"`` (solver settings, k-NN edges, final centers and the
+        diagnostics from ``_footprint_layout_diagnostics``). For an empty
+        ``object_ids`` a reduced metadata dict is returned immediately.
+    """
+    if not object_ids:
+        return {
+            "centers": {},
+            "metadata": {
+                "method": "geometry_knn_2d_aabb_relaxation",
+                "iterations": 0,
+                "confidence_score": 1.0,
+            },
+        }
+
+    max_extent = max(
+        float(max(xy_sizes[object_id][0], xy_sizes[object_id][1]))
+        for object_id in object_ids
+    )
+    # Clearance scales with the largest object but never drops below 1e-3.
+    padding = max(max_extent * padding_ratio, 1e-3)
+    max_iterations = 300
+    overlap_strength = 1.0
+    neighbor_strength = 0.04
+    compactness_strength = 0.01
+    target_expansion_ratio = 1.2
+    # At most 3 neighbours per object, and never more than the other objects.
+    knn_k = min(3, max(len(object_ids) - 1, 0))
+    # Copy the inputs so the caller's arrays are never mutated in place.
+    centers = {
+        object_id: np.asarray(
+            current_centers.get(object_id, np.zeros(2, dtype=np.float64)),
+            dtype=np.float64,
+        ).copy()
+        for object_id in object_ids
+    }
+    centers = _center_xy_aabb_layout(
+        centers=centers,
+        xy_sizes=xy_sizes,
+    )
+    # Snapshot of the centered starting layout; used as the reference for
+    # neighbour offsets, displacement diagnostics and the expansion ratio.
+    initial_centers = {
+        object_id: center.copy()
+        for object_id, center in centers.items()
+    }
+    initial_union_bounds = _xy_union_bounds(
+        centers=initial_centers,
+        xy_sizes=xy_sizes,
+    )
+    neighbor_edges = _knn_neighbor_edges(
+        centers=initial_centers,
+        k=knn_k,
+    )
+
+    iterations = 0
+    for iteration in range(max_iterations):
+        iterations = iteration + 1
+        # Largest single move applied in this iteration (convergence measure).
+        max_delta = 0.0
+
+        # Step 1: separate every overlapping pair. Centers are updated in
+        # place, so later pairs in the same sweep see the moved positions.
+        for i, object_id in enumerate(object_ids):
+            for other_id in object_ids[i + 1 :]:
+                overlap = _xy_aabb_overlap(
+                    center_a=centers[object_id],
+                    size_a=xy_sizes[object_id],
+                    center_b=centers[other_id],
+                    size_b=xy_sizes[other_id],
+                    padding=padding,
+                )
+                if overlap is None:
+                    continue
+                overlap_x, overlap_y = overlap
+                # Resolve along the axis with the smaller overlap; on a tie
+                # in position the first object is moved in the negative
+                # direction.
+                if overlap_x <= overlap_y:
+                    axis = 0
+                    sign = (
+                        -1.0
+                        if centers[object_id][0] <= centers[other_id][0]
+                        else 1.0
+                    )
+                    amount = overlap_x
+                else:
+                    axis = 1
+                    sign = (
+                        -1.0
+                        if centers[object_id][1] <= centers[other_id][1]
+                        else 1.0
+                    )
+                    amount = overlap_y
+                # Each object takes half of the overlap (plus a tiny epsilon
+                # so the pair ends up strictly separated).
+                shift = 0.5 * (amount + 1e-6) * overlap_strength
+                centers[object_id][axis] += sign * shift
+                centers[other_id][axis] -= sign * shift
+                max_delta = max(max_delta, shift)
+
+        # Step 2: spring each neighbour pair back toward its initial offset.
+        for edge in neighbor_edges:
+            object_id = edge["object"]
+            neighbor_id = edge["neighbor"]
+            initial_delta = np.asarray(edge["initial_delta"], dtype=np.float64)
+            error = (centers[object_id] - centers[neighbor_id]) - initial_delta
+            correction = 0.5 * neighbor_strength * error
+            centers[object_id] -= correction
+            centers[neighbor_id] += correction
+            max_delta = max(max_delta, float(np.linalg.norm(correction)))
+
+        # Step 3: pull toward the union center when the layout grew too much.
+        max_delta = max(
+            max_delta,
+            _apply_compactness_pull(
+                centers=centers,
+                xy_sizes=xy_sizes,
+                initial_union_bounds=initial_union_bounds,
+                target_expansion_ratio=target_expansion_ratio,
+                strength=compactness_strength,
+            ),
+        )
+
+        # Step 4: keep the union AABB centered on the origin.
+        centers = _center_xy_aabb_layout(
+            centers=centers,
+            xy_sizes=xy_sizes,
+        )
+        # Always run at least 21 iterations before accepting convergence.
+        if iteration >= 20 and max_delta < 1e-5:
+            break
+
+    diagnostics = _footprint_layout_diagnostics(
+        object_ids=object_ids,
+        centers=centers,
+        initial_centers=initial_centers,
+        xy_sizes=xy_sizes,
+        padding=padding,
+        initial_union_bounds=initial_union_bounds,
+    )
+    metadata = {
+        "method": "geometry_knn_2d_aabb_relaxation",
+        "relation_usage": "disabled",
+        "iterations": iterations,
+        "padding": padding,
+        "padding_ratio": padding_ratio,
+        "max_iterations": max_iterations,
+        "overlap_strength": overlap_strength,
+        "neighbor_strength": neighbor_strength,
+        "compactness_strength": compactness_strength,
+        "target_expansion_ratio": target_expansion_ratio,
+        "knn_k": knn_k,
+        "neighbor_edges": neighbor_edges,
+        "final_centers": {
+            object_id: centers[object_id].tolist()
+            for object_id in object_ids
+        },
+        **diagnostics,
+    }
+    return {"centers": centers, "metadata": metadata}
+
+
+
+def _knn_neighbor_edges(
+    *,
+    centers: dict[str, np.ndarray],
+    k: int,
+) -> list[dict[str, Any]]:
+    """Build de-duplicated k-nearest-neighbour edges between object centers.
+
+    Objects are visited in sorted id order. Each one is linked to its ``k``
+    closest peers by Euclidean distance, and an unordered pair is emitted
+    only the first time it is seen.
+
+    Args:
+        centers: Object id -> center position.
+        k: Number of nearest neighbours considered per object.
+
+    Returns:
+        A list of dicts with ``"object"``, ``"neighbor"`` and
+        ``"initial_delta"`` (object center minus neighbour center, as a
+        list). Empty when ``k <= 0`` or fewer than two centers are given.
+    """
+    if len(centers) < 2 or k <= 0:
+        return []
+
+    ordered_ids = sorted(centers)
+    visited_pairs: set[tuple[str, str]] = set()
+    result: list[dict[str, Any]] = []
+    for source_id in ordered_ids:
+        source_center = centers[source_id]
+        # Sorting (distance, id) tuples breaks distance ties by object id.
+        ranked = sorted(
+            (
+                float(np.linalg.norm(source_center - centers[candidate_id])),
+                candidate_id,
+            )
+            for candidate_id in ordered_ids
+            if candidate_id != source_id
+        )
+        for _distance, target_id in ranked[:k]:
+            pair = tuple(sorted((source_id, target_id)))
+            if pair in visited_pairs:
+                continue
+            visited_pairs.add(pair)
+            offset = source_center - centers[target_id]
+            result.append(
+                {
+                    "object": source_id,
+                    "neighbor": target_id,
+                    "initial_delta": offset.tolist(),
+                }
+            )
+    return result
+
+
+
+def _apply_compactness_pull(
+    *,
+    centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+    initial_union_bounds: np.ndarray,
+    target_expansion_ratio: float,
+    strength: float,
+) -> float:
+    """Pull centers toward the union-AABB center when the layout has grown.
+
+    Nothing happens while the current union area stays within
+    ``target_expansion_ratio`` times the initial union area. Beyond that,
+    every center moves toward the middle of the current union AABB by
+    ``strength`` times the relative excess (capped at 1.0).
+
+    Args:
+        centers: Object id -> XY center. Entries are replaced in this dict.
+        xy_sizes: Object id -> XY footprint size.
+        initial_union_bounds: Min/max corners of the initial union AABB.
+        target_expansion_ratio: Area growth tolerated before pulling.
+        strength: Base fraction of the distance to the union center to move.
+
+    Returns:
+        The largest displacement applied to any center (0.0 if none moved).
+    """
+    bounds_now = _xy_union_bounds(centers=centers, xy_sizes=xy_sizes)
+    # Guard the division against a degenerate initial area.
+    baseline_area = max(_xy_union_area(initial_union_bounds), 1.0e-12)
+    growth = _xy_union_area(bounds_now) / baseline_area
+    if growth <= target_expansion_ratio:
+        return 0.0
+
+    overshoot = min(growth / target_expansion_ratio - 1.0, 1.0)
+    pull = strength * overshoot
+    midpoint = 0.5 * (bounds_now[0] + bounds_now[1])
+
+    largest_step = 0.0
+    for object_id in list(centers):
+        step = pull * (midpoint - centers[object_id])
+        centers[object_id] = centers[object_id] + step
+        largest_step = max(largest_step, float(np.linalg.norm(step)))
+    return largest_step
+
+
+
+def _footprint_layout_diagnostics(
+    *,
+    object_ids: list[str],
+    centers: dict[str, np.ndarray],
+    initial_centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+    padding: float,
+    initial_union_bounds: np.ndarray,
+) -> dict[str, Any]:
+    """Summarise the quality of a relaxed footprint layout.
+
+    Args:
+        object_ids: Objects taking part in the layout.
+        centers: Final XY centers.
+        initial_centers: XY centers before relaxation.
+        xy_sizes: Object id -> XY footprint size.
+        padding: Clearance used in the pairwise overlap test.
+        initial_union_bounds: Min/max corners of the initial union AABB.
+
+    Returns:
+        A dict with the pairs that still overlap, the average and maximum
+        center displacement, the union-AABB area expansion ratio and the
+        resulting confidence score.
+    """
+    overlaps_left = _remaining_xy_overlaps(
+        object_ids=object_ids,
+        centers=centers,
+        xy_sizes=xy_sizes,
+        padding=padding,
+    )
+
+    moved = []
+    for object_id in object_ids:
+        travel = centers[object_id] - initial_centers[object_id]
+        moved.append(float(np.linalg.norm(travel)))
+
+    final_bounds = _xy_union_bounds(centers=centers, xy_sizes=xy_sizes)
+    # Guard the division against a degenerate initial area.
+    baseline_area = max(_xy_union_area(initial_union_bounds), 1.0e-12)
+    growth = _xy_union_area(final_bounds) / baseline_area
+
+    if moved:
+        mean_moved = float(np.mean(moved))
+        peak_moved = float(np.max(moved))
+    else:
+        mean_moved = 0.0
+        peak_moved = 0.0
+
+    if object_ids:
+        largest_extent = max(
+            float(max(xy_sizes[object_id][0], xy_sizes[object_id][1]))
+            for object_id in object_ids
+        )
+    else:
+        largest_extent = 1.0
+
+    score = _footprint_confidence_score(
+        remaining_overlap_count=len(overlaps_left),
+        average_displacement=mean_moved,
+        max_extent=largest_extent,
+        expansion_ratio=growth,
+    )
+    return {
+        "remaining_overlaps": overlaps_left,
+        "average_displacement": mean_moved,
+        "max_displacement": peak_moved,
+        "union_aabb_expansion_ratio": growth,
+        "confidence_score": score,
+    }
+
+
+
+def _remaining_xy_overlaps(
+    *,
+    object_ids: list[str],
+    centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+    padding: float,
+) -> list[dict[str, Any]]:
+    """List every unordered object pair whose padded XY AABBs still overlap.
+
+    Args:
+        object_ids: Objects to test; pairs follow this ordering.
+        centers: Object id -> XY center.
+        xy_sizes: Object id -> XY footprint size.
+        padding: Clearance added to each pairwise overlap test.
+
+    Returns:
+        One dict per overlapping pair with ``"object"``, ``"other"``,
+        ``"overlap_x"`` and ``"overlap_y"``.
+    """
+    pairs = [
+        (first_id, second_id)
+        for position, first_id in enumerate(object_ids)
+        for second_id in object_ids[position + 1 :]
+    ]
+
+    found: list[dict[str, Any]] = []
+    for first_id, second_id in pairs:
+        penetration = _xy_aabb_overlap(
+            center_a=centers[first_id],
+            size_a=xy_sizes[first_id],
+            center_b=centers[second_id],
+            size_b=xy_sizes[second_id],
+            padding=padding,
+        )
+        if penetration is not None:
+            found.append(
+                {
+                    "object": first_id,
+                    "other": second_id,
+                    "overlap_x": penetration[0],
+                    "overlap_y": penetration[1],
+                }
+            )
+    return found
+
+
+
+def _footprint_confidence_score(
+    *,
+    remaining_overlap_count: int,
+    average_displacement: float,
+    max_extent: float,
+    expansion_ratio: float,
+) -> float:
+    """Score a footprint layout in ``[0, 1]`` by subtracting capped penalties.
+
+    Penalties:
+        * 0.35 per remaining overlap, capped at 0.7;
+        * 0.1 times the average displacement relative to ``max_extent``,
+          capped at 0.2;
+        * 0.25 times the expansion ratio above 1.2, capped at 0.2.
+
+    Args:
+        remaining_overlap_count: Number of pairs that still overlap.
+        average_displacement: Mean center displacement.
+        max_extent: Largest footprint extent, used to normalise displacement.
+        expansion_ratio: Final union area divided by the initial union area.
+
+    Returns:
+        The clipped confidence score.
+    """
+    scale = max(max_extent, 1.0e-6)
+    penalties = (
+        min(0.35 * remaining_overlap_count, 0.7),
+        min(0.1 * average_displacement / scale, 0.2),
+        min(max(expansion_ratio - 1.2, 0.0) * 0.25, 0.2),
+    )
+    score = 1.0
+    for penalty in penalties:
+        score -= penalty
+    return float(np.clip(score, 0.0, 1.0))
+
+
+
+def _center_xy_aabb_layout(
+    *,
+    centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+) -> dict[str, np.ndarray]:
+    """Translate all centers so the union XY AABB is centered on the origin.
+
+    Args:
+        centers: Object id -> XY center.
+        xy_sizes: Object id -> XY footprint size.
+
+    Returns:
+        A new dict of shifted float64 centers. An empty ``centers`` mapping
+        is returned as-is.
+    """
+    if not centers:
+        return centers
+
+    half_extents = {
+        object_id: 0.5 * np.asarray(xy_sizes[object_id], dtype=np.float64)
+        for object_id in centers
+    }
+    lower = np.vstack(
+        [centers[object_id] - half_extents[object_id] for object_id in centers]
+    ).min(axis=0)
+    upper = np.vstack(
+        [centers[object_id] + half_extents[object_id] for object_id in centers]
+    ).max(axis=0)
+    midpoint = 0.5 * (lower + upper)
+
+    shifted: dict[str, np.ndarray] = {}
+    for object_id, center in centers.items():
+        shifted[object_id] = np.asarray(center, dtype=np.float64) - midpoint
+    return shifted
+
+
+
+def _xy_union_bounds(
+    *,
+    centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+) -> np.ndarray:
+    """Return the union XY AABB of all footprints as ``[min_corner, max_corner]``.
+
+    Args:
+        centers: Object id -> XY center.
+        xy_sizes: Object id -> XY footprint size.
+
+    Returns:
+        A stacked array whose first row is the minimum corner and whose
+        second row is the maximum corner; a 2x2 zero array when ``centers``
+        is empty.
+    """
+    if not centers:
+        return np.zeros((2, 2), dtype=np.float64)
+
+    lower_corners = []
+    upper_corners = []
+    for object_id, raw_center in centers.items():
+        middle = np.asarray(raw_center, dtype=np.float64)
+        half_extent = 0.5 * np.asarray(xy_sizes[object_id], dtype=np.float64)
+        lower_corners.append(middle - half_extent)
+        upper_corners.append(middle + half_extent)
+
+    lower = np.vstack(lower_corners).min(axis=0)
+    upper = np.vstack(upper_corners).max(axis=0)
+    return np.vstack([lower, upper])
+
+
+
+def _xy_union_area(bounds: np.ndarray) -> float:
+    """Return the area of an XY AABB given as ``[min_corner, max_corner]``.
+
+    Each side length is clamped to at least 1e-9, so a degenerate box still
+    yields a small positive area.
+    """
+    corners = np.asarray(bounds, dtype=np.float64)
+    extent = np.maximum(corners[1] - corners[0], 1.0e-9)
+    width, height = extent[0], extent[1]
+    return float(width * height)
+
+
+
+def _xy_aabb_overlap(
+    *,
+    center_a: np.ndarray,
+    size_a: np.ndarray,
+    center_b: np.ndarray,
+    size_b: np.ndarray,
+    padding: float,
+) -> tuple[float, float] | None:
+    """Measure how far two padded XY AABBs penetrate each other per axis.
+
+    Args:
+        center_a: XY center of the first box.
+        size_a: XY size of the first box.
+        center_b: XY center of the second box.
+        size_b: XY size of the second box.
+        padding: Extra clearance added to the combined half-extents.
+
+    Returns:
+        ``(overlap_x, overlap_y)`` when the boxes overlap on both axes, or
+        ``None`` when they are separated (or only touching) on at least one.
+    """
+    separation = np.abs(
+        np.asarray(center_b, dtype=np.float64)
+        - np.asarray(center_a, dtype=np.float64)
+    )
+    reach = (
+        0.5 * np.asarray(size_a, dtype=np.float64)
+        + 0.5 * np.asarray(size_b, dtype=np.float64)
+        + padding
+    )
+    penetration = reach - separation
+    penetration_x = float(penetration[0])
+    penetration_y = float(penetration[1])
+    if penetration_x <= 0.0 or penetration_y <= 0.0:
+        return None
+    return penetration_x, penetration_y
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/__init__.py
new file mode 100644
index 00000000..12ebfd69
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/__init__.py
@@ -0,0 +1,35 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager.manager import (
+    SimreadyManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager.schemas import (
+    MakeAssetSimreadyRequest,
+    MakeAssetSimreadyResult,
+    MakeTableSimreadyRequest,
+    MakeTableSimreadyResult,
+)
+
+# Names exported by ``from ... simready_manager import *``: the manager class
+# plus its request/result schemas imported above.
+__all__ = [
+    "MakeAssetSimreadyRequest",
+    "MakeAssetSimreadyResult",
+    "MakeTableSimreadyRequest",
+    "MakeTableSimreadyResult",
+    "SimreadyManager",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/manager.py
new file mode 100644
index 00000000..6f92e1f8
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/manager.py
@@ -0,0 +1,396 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.manager import (
+    DEFAULT_INPUT_UP_AXIS,
+    DEFAULT_UP_AXIS,
+    GeometryManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.schemas import (
+    AlignToAxisRequest,
+    CenterMeshRequest,
+    ConvertUpAxisRequest,
+    DetectTabletopRequest,
+    ExportMeshRequest,
+    LoadMeshRequest,
+    NormalizeRequest,
+    PlaceAbovePlaneRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.manager import (
+    MatplotlibManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.schemas import (
+    RenderSupportRegionRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager.schemas import (
+    MakeAssetSimreadyRequest,
+    MakeAssetSimreadyResult,
+    MakeTableSimreadyRequest,
+    MakeTableSimreadyResult,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.manager import (
+    SimulationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+    GravityDropRequest,
+)
+
+
+class SimreadyManager:
+    """Prepare generated GLB assets for simulation placement.
+
+    Both public methods load a mesh, re-orient and ground it, settle it with
+    a gravity simulation, normalise it and export the result as a ``.glb``.
+    Alongside the mesh operations they accumulate ``raw_to_simready``, a 4x4
+    matrix built by left-multiplying the transform of every step.
+
+    NOTE(review): the accumulated matrix is computed by local helpers, not
+    read back from the geometry manager. It matches the exported mesh only if
+    the manager's operations apply the same transforms (same rotation for
+    axis conversion/alignment, uniform scaling about the origin for
+    ``normalize``) - confirm against ``GeometryManager``.
+    """
+
+    def __init__(
+        self,
+        *,
+        geometry_manager: GeometryManager | None = None,
+        simulation_manager: SimulationManager | None = None,
+        matplotlib_manager: MatplotlibManager | None = None,
+    ) -> None:
+        """Store the collaborating managers, creating defaults when omitted."""
+        self.geometry_manager = geometry_manager or GeometryManager()
+        self.simulation_manager = simulation_manager or SimulationManager()
+        self.matplotlib_manager = matplotlib_manager or MatplotlibManager()
+
+    def make_asset_simready(
+        self,
+        request: MakeAssetSimreadyRequest,
+    ) -> MakeAssetSimreadyResult:
+        """Ground, settle, normalise and export a general asset GLB.
+
+        Args:
+            request: Input/output paths, optional input up axis and ground
+                clearance. ``request.up_axis`` is not read by this method.
+
+        Returns:
+            The resolved output path and the accumulated raw-to-simready
+            transform as a nested list.
+
+        Raises:
+            ValueError: If ``request.output_path`` does not end in ``.glb``.
+        """
+        input_path = request.input_path.expanduser().resolve()
+        output_path = request.output_path.expanduser().resolve()
+        if output_path.suffix.lower() != ".glb":
+            raise ValueError("Sim-ready asset output_path must be a .glb file.")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        input_up_axis = _request_axis(request.input_up_axis, DEFAULT_INPUT_UP_AXIS)
+        raw_to_simready = np.eye(4, dtype=np.float64)
+        geom = self.geometry_manager
+        sim = self.simulation_manager
+
+        mesh = geom.load_mesh(LoadMeshRequest(mesh_path=input_path)).mesh
+
+        # Re-orient from the input up axis to the internal working up axis.
+        transform = _axis_conversion_transform(input_up_axis, DEFAULT_UP_AXIS)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.convert_up_axis(
+            ConvertUpAxisRequest(
+                mesh=mesh,
+                input_up_axis=input_up_axis,
+                output_up_axis=DEFAULT_UP_AXIS,
+            )
+        ).mesh
+
+        # Center by bounding box; the recorded transform is the translation
+        # by the negated bbox center reported by the geometry manager.
+        center_result = geom.center_by_bbox(CenterMeshRequest(mesh=mesh))
+        mesh = center_result.mesh
+        transform = _translation_transform(-np.asarray(center_result.bbox_center))
+        raw_to_simready = transform @ raw_to_simready
+
+        # The transform must be derived from the mesh *before* it is moved.
+        transform = _place_above_plane_transform(mesh, request.ground_clearance)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.place_above_plane(
+            PlaceAbovePlaneRequest(mesh=mesh, clearance=request.ground_clearance)
+        ).mesh
+
+        # The gravity simulation reads a GLB file, so export a temporary copy
+        # converted back to the default input up axis. This conversion is not
+        # added to raw_to_simready; the settled pose is applied to ``mesh``.
+        pre_gravity_mesh = geom.convert_up_axis(
+            ConvertUpAxisRequest(
+                mesh=mesh,
+                input_up_axis=DEFAULT_UP_AXIS,
+                output_up_axis=DEFAULT_INPUT_UP_AXIS,
+            )
+        ).mesh
+        # Hidden (dot-prefixed) temp file next to the final output.
+        pre_gravity_path = output_path.with_name(f".{output_path.stem}_pre_gravity.glb")
+        geom.export_mesh(
+            ExportMeshRequest(mesh=pre_gravity_mesh, output_path=pre_gravity_path)
+        )
+        try:
+            gravity_result = sim.run_gravity_simulation(
+                GravityDropRequest(glb_path=pre_gravity_path, max_convex_hull_num=32)
+            )
+
+            # NOTE(review): final_pose is applied to the DEFAULT_UP_AXIS mesh,
+            # so it is presumably expressed in that frame - confirm against
+            # SimulationManager.
+            gravity_transform = _as_transform(gravity_result.final_pose)
+            settled_mesh = mesh.copy()
+            settled_mesh.apply_transform(gravity_transform)
+            raw_to_simready = gravity_transform @ raw_to_simready
+            transform = _center_aabb_bottom_xy_at_origin_transform(settled_mesh)
+            settled_mesh.apply_transform(transform)
+            raw_to_simready = transform @ raw_to_simready
+
+            # settled_mesh was centered just above, so this second centering
+            # is expected to be a (near-)identity translation.
+            transform = _center_aabb_bottom_xy_at_origin_transform(settled_mesh)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = _center_aabb_bottom_xy_at_origin(settled_mesh)
+
+            normalize_result = geom.normalize(NormalizeRequest(mesh=final_mesh))
+            final_mesh = normalize_result.mesh
+            transform = _scale_transform(normalize_result.scale_factor)
+            raw_to_simready = transform @ raw_to_simready
+
+            # Scaling moves the lowest point, so re-apply the clearance.
+            transform = _place_above_plane_transform(final_mesh, request.ground_clearance)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = geom.place_above_plane(
+                PlaceAbovePlaneRequest(
+                    mesh=final_mesh,
+                    clearance=request.ground_clearance,
+                )
+            ).mesh
+
+            # Export in the default input up-axis convention.
+            transform = _axis_conversion_transform(DEFAULT_UP_AXIS, DEFAULT_INPUT_UP_AXIS)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = geom.convert_up_axis(
+                ConvertUpAxisRequest(
+                    mesh=final_mesh,
+                    input_up_axis=DEFAULT_UP_AXIS,
+                    output_up_axis=DEFAULT_INPUT_UP_AXIS,
+                )
+            ).mesh
+
+            geom.export_mesh(ExportMeshRequest(mesh=final_mesh, output_path=output_path))
+        finally:
+            # Always remove the temporary GLB, even when a step above fails.
+            pre_gravity_path.unlink(missing_ok=True)
+
+        return MakeAssetSimreadyResult(
+            output_path=output_path,
+            transform_matrix=raw_to_simready.tolist(),
+        )
+
+    def make_table_simready(
+        self,
+        request: MakeTableSimreadyRequest,
+    ) -> MakeTableSimreadyResult:
+        """Align, ground, settle, normalise and export a table GLB.
+
+        Compared with ``make_asset_simready`` this additionally detects the
+        tabletop, rotates its oriented normal onto the requested up axis and
+        renders the support region detected on the settled mesh to
+        ``<output stem>_support_region.png`` next to the output file.
+
+        Args:
+            request: Input/output paths, optional input/target up axes and
+                ground clearance.
+
+        Returns:
+            The resolved output path and the accumulated raw-to-simready
+            transform as a nested list.
+
+        Raises:
+            ValueError: If ``request.output_path`` does not end in ``.glb``.
+        """
+        input_path = request.input_path.expanduser().resolve()
+        output_path = request.output_path.expanduser().resolve()
+        if output_path.suffix.lower() != ".glb":
+            raise ValueError("Sim-ready table output_path must be a .glb file.")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        input_up_axis = _request_axis(request.input_up_axis, DEFAULT_INPUT_UP_AXIS)
+        up_axis = _request_axis(request.up_axis, DEFAULT_UP_AXIS)
+        raw_to_simready = np.eye(4, dtype=np.float64)
+        geom = self.geometry_manager
+        sim = self.simulation_manager
+        mpl = self.matplotlib_manager
+
+        mesh = geom.load_mesh(LoadMeshRequest(mesh_path=input_path)).mesh
+
+        # Re-orient from the input up axis to the internal working up axis.
+        transform = _axis_conversion_transform(input_up_axis, DEFAULT_UP_AXIS)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.convert_up_axis(
+            ConvertUpAxisRequest(
+                mesh=mesh,
+                input_up_axis=input_up_axis,
+                output_up_axis=DEFAULT_UP_AXIS,
+            )
+        ).mesh
+
+        # Center by bounding box; the recorded transform is the translation
+        # by the negated bbox center reported by the geometry manager.
+        center_result = geom.center_by_bbox(CenterMeshRequest(mesh=mesh))
+        mesh = center_result.mesh
+        transform = _translation_transform(-np.asarray(center_result.bbox_center))
+        raw_to_simready = transform @ raw_to_simready
+
+        detect_result = geom.detect_tabletop(DetectTabletopRequest(mesh=mesh))
+
+        # Rotate the detected tabletop normal onto the requested up axis.
+        transform = _axis_conversion_transform(detect_result.oriented_normal, up_axis)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.align_to_axis(
+            AlignToAxisRequest(
+                mesh=mesh,
+                source_axis=detect_result.oriented_normal,
+                target_axis=up_axis,
+            )
+        ).mesh
+
+        # The transform must be derived from the mesh *before* it is moved.
+        transform = _place_above_plane_transform(mesh, request.ground_clearance)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.place_above_plane(
+            PlaceAbovePlaneRequest(mesh=mesh, clearance=request.ground_clearance)
+        ).mesh
+
+        # The gravity simulation reads a GLB file, so export a temporary copy
+        # converted back to the default input up axis. This conversion is not
+        # added to raw_to_simready; the settled pose is applied to ``mesh``.
+        pre_gravity_mesh = geom.convert_up_axis(
+            ConvertUpAxisRequest(
+                mesh=mesh,
+                input_up_axis=DEFAULT_UP_AXIS,
+                output_up_axis=DEFAULT_INPUT_UP_AXIS,
+            )
+        ).mesh
+        # Hidden (dot-prefixed) temp file next to the final output.
+        pre_gravity_path = output_path.with_name(f".{output_path.stem}_pre_gravity.glb")
+        geom.export_mesh(
+            ExportMeshRequest(mesh=pre_gravity_mesh, output_path=pre_gravity_path)
+        )
+        try:
+            gravity_result = sim.run_gravity_simulation(
+                GravityDropRequest(glb_path=pre_gravity_path, max_convex_hull_num=16)
+            )
+
+            # NOTE(review): final_pose is applied to the DEFAULT_UP_AXIS mesh,
+            # so it is presumably expressed in that frame - confirm against
+            # SimulationManager.
+            gravity_transform = _as_transform(gravity_result.final_pose)
+            settled_mesh = mesh.copy()
+            settled_mesh.apply_transform(gravity_transform)
+            raw_to_simready = gravity_transform @ raw_to_simready
+            transform = _center_aabb_bottom_xy_at_origin_transform(settled_mesh)
+            settled_mesh.apply_transform(transform)
+            raw_to_simready = transform @ raw_to_simready
+
+            # Re-detect the tabletop on the settled mesh; the result is only
+            # used for the support-region rendering below.
+            settled_detect = geom.detect_tabletop(
+                DetectTabletopRequest(mesh=settled_mesh)
+            )
+
+            mpl.render_selected_support_region(
+                RenderSupportRegionRequest(
+                    mesh=settled_mesh,
+                    face_indices=settled_detect.selected.face_indices,
+                    output_path=output_path.with_name(
+                        f"{output_path.stem}_support_region.png"
+                    ),
+                )
+            )
+
+            # settled_mesh was centered above, so this second centering is
+            # expected to be a (near-)identity translation, assuming the
+            # detection and rendering calls leave the mesh unchanged.
+            transform = _center_aabb_bottom_xy_at_origin_transform(settled_mesh)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = _center_aabb_bottom_xy_at_origin(settled_mesh)
+
+            normalize_result = geom.normalize(NormalizeRequest(mesh=final_mesh))
+            final_mesh = normalize_result.mesh
+            transform = _scale_transform(normalize_result.scale_factor)
+            raw_to_simready = transform @ raw_to_simready
+
+            # Scaling moves the lowest point, so re-apply the clearance.
+            transform = _place_above_plane_transform(final_mesh, request.ground_clearance)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = geom.place_above_plane(
+                PlaceAbovePlaneRequest(
+                    mesh=final_mesh,
+                    clearance=request.ground_clearance,
+                )
+            ).mesh
+
+            # Export in the default input up-axis convention.
+            transform = _axis_conversion_transform(DEFAULT_UP_AXIS, DEFAULT_INPUT_UP_AXIS)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = geom.convert_up_axis(
+                ConvertUpAxisRequest(
+                    mesh=final_mesh,
+                    input_up_axis=DEFAULT_UP_AXIS,
+                    output_up_axis=DEFAULT_INPUT_UP_AXIS,
+                )
+            ).mesh
+
+            geom.export_mesh(ExportMeshRequest(mesh=final_mesh, output_path=output_path))
+        finally:
+            # Always remove the temporary GLB, even when a step above fails.
+            pre_gravity_path.unlink(missing_ok=True)
+
+        return MakeTableSimreadyResult(
+            output_path=output_path,
+            transform_matrix=raw_to_simready.tolist(),
+        )
+
+
+def _request_axis(value: list[float] | None, default: tuple[float, float, float]) -> list[float]:
+    """Return ``value`` as a new list, or ``default`` as a list when it is ``None``."""
+    chosen = default if value is None else value
+    return list(chosen)
+
+
+def _center_aabb_bottom_xy_at_origin(mesh: Any) -> Any:
+    """Return a copy of ``mesh`` translated so its AABB is centered in X and Y.
+
+    The Z coordinate is left untouched and the input mesh is not modified.
+    """
+    bounds = mesh.bounds
+    lower, upper = bounds[0], bounds[1]
+    mid_x = (float(lower[0]) + float(upper[0])) * 0.5
+    mid_y = (float(lower[1]) + float(upper[1])) * 0.5
+    shifted = mesh.copy()
+    shifted.apply_translation([-mid_x, -mid_y, 0.0])
+    return shifted
+
+
+def _axis_conversion_transform(source_axis: list[float], target_axis: list[float]) -> np.ndarray:
+    """Return the 4x4 rotation that turns ``source_axis`` onto ``target_axis``."""
+    unit_source, unit_target = (
+        _normalize(np.asarray(axis, dtype=np.float64))
+        for axis in (source_axis, target_axis)
+    )
+    return _rotation_between_vectors(unit_source, unit_target)
+
+
+def _place_above_plane_transform(mesh: Any, clearance: float) -> np.ndarray:
+    """Return the Z translation that puts the mesh's lowest bound at ``clearance``."""
+    lowest_z = float(mesh.bounds[0][2])
+    lift = np.array([0.0, 0.0, clearance - lowest_z])
+    return _translation_transform(lift)
+
+
+def _center_aabb_bottom_xy_at_origin_transform(mesh: Any) -> np.ndarray:
+    """Return the 4x4 translation that centers the mesh AABB in X and Y (Z unchanged)."""
+    bounds = mesh.bounds
+    lower, upper = bounds[0], bounds[1]
+    mid_x = (float(lower[0]) + float(upper[0])) * 0.5
+    mid_y = (float(lower[1]) + float(upper[1])) * 0.5
+    offset = np.array([-mid_x, -mid_y, 0.0])
+    return _translation_transform(offset)
+
+
+def _translation_transform(translation: np.ndarray) -> np.ndarray:
+    """Return a 4x4 homogeneous matrix that translates by ``translation``."""
+    matrix = np.identity(4, dtype=np.float64)
+    matrix[0:3, 3] = translation
+    return matrix
+
+
+def _scale_transform(scale: float) -> np.ndarray:
+    """Return a 4x4 homogeneous matrix that scales uniformly by ``scale``."""
+    factor = float(scale)
+    matrix = np.eye(4, dtype=np.float64)
+    matrix[:3, :3] = matrix[:3, :3] * factor
+    return matrix
+
+
+def _as_transform(value: Any) -> np.ndarray:
+    """Coerce ``value`` to a float64 array and require a 4x4 shape.
+
+    Raises:
+        ValueError: If the converted array is not 4x4.
+    """
+    matrix = np.asarray(value, dtype=np.float64)
+    if matrix.shape == (4, 4):
+        return matrix
+    raise ValueError("Expected a 4x4 transform matrix.")
+
+
+def _rotation_between_vectors(source: np.ndarray, target: np.ndarray) -> np.ndarray:
+    """Return the 4x4 homogeneous rotation that turns ``source`` onto ``target``.
+
+    Both vectors are normalised first. Three cases are handled:
+
+    * (nearly) parallel vectors give the identity,
+    * (nearly) anti-parallel vectors give a half-turn about an arbitrary axis
+      orthogonal to ``source`` (the rotation axis is not unique there),
+    * otherwise the rotation is about ``source x target`` by the angle
+      between the vectors.
+
+    Args:
+        source: Direction to rotate from.
+        target: Direction to rotate onto.
+
+    Returns:
+        A 4x4 float64 matrix with the rotation in the upper-left 3x3 block
+        and no translation.
+
+    Raises:
+        ValueError: If either vector has zero or non-finite length. Without
+            this check a zero vector would produce a near-zero, singular 3x3
+            block that silently collapses any mesh it is applied to.
+    """
+    for label, vector in (("source", source), ("target", target)):
+        length = float(np.linalg.norm(vector))
+        if not np.isfinite(length) or length == 0.0:
+            raise ValueError(
+                f"Cannot compute a rotation: {label} axis must be a finite, "
+                "non-zero vector."
+            )
+
+    source = _normalize(source)
+    target = _normalize(target)
+    # Clip guards arccos against dot products slightly outside [-1, 1].
+    dot = float(np.clip(np.dot(source, target), -1.0, 1.0))
+    transform = np.eye(4, dtype=np.float64)
+    if dot > 1.0 - 1e-8:
+        return transform
+    if dot < -1.0 + 1e-8:
+        # The cross product vanishes here, so pick any perpendicular axis.
+        axis = _orthogonal_axis(source)
+        rotation = _axis_angle_rotation(axis, np.pi)
+    else:
+        axis = _normalize(np.cross(source, target))
+        angle = float(np.arccos(dot))
+        rotation = _axis_angle_rotation(axis, angle)
+    transform[:3, :3] = rotation
+    return transform
+
+
+def _axis_angle_rotation(axis: np.ndarray, angle: float) -> np.ndarray:
+    """Return the 3x3 rotation matrix for ``angle`` radians about ``axis``.
+
+    The axis is normalised before use; the entries follow the Rodrigues
+    axis-angle formula.
+    """
+    x, y, z = _normalize(axis)
+    cos_a = float(np.cos(angle))
+    sin_a = float(np.sin(angle))
+    versine = 1.0 - cos_a
+
+    # Symmetric (1 - cos) terms and antisymmetric sin terms.
+    xy = x * y * versine
+    xz = x * z * versine
+    yz = y * z * versine
+    xs = x * sin_a
+    ys = y * sin_a
+    zs = z * sin_a
+
+    rows = [
+        [cos_a + x * x * versine, xy - zs, xz + ys],
+        [xy + zs, cos_a + y * y * versine, yz - xs],
+        [xz - ys, yz + xs, cos_a + z * z * versine],
+    ]
+    return np.array(rows, dtype=np.float64)
+
+
+def _orthogonal_axis(vector: np.ndarray) -> np.ndarray:
+    """Return a normalised axis perpendicular to ``vector``.
+
+    The X axis is used as the helper direction unless ``vector`` is nearly
+    parallel to it (absolute dot product above 0.9), in which case the Y
+    axis is used instead.
+    """
+    x_axis = np.array([1.0, 0.0, 0.0], dtype=np.float64)
+    nearly_parallel_to_x = abs(float(np.dot(vector, x_axis))) > 0.9
+    if nearly_parallel_to_x:
+        helper = np.array([0.0, 1.0, 0.0], dtype=np.float64)
+    else:
+        helper = x_axis
+    return _normalize(np.cross(vector, helper))
+
+
+def _normalize(vector: np.ndarray) -> np.ndarray:
+    """Return ``vector`` scaled to unit length; a zero vector is returned unchanged."""
+    length = float(np.linalg.norm(vector))
+    return vector / length if length != 0.0 else vector
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/schemas.py
new file mode 100644
index 00000000..86ae22b0
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/schemas.py
@@ -0,0 +1,58 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class MakeAssetSimreadyRequest:
+    """Request to prepare a general asset GLB for simulation placement."""
+
+    # Path of the mesh file to load.
+    input_path: Path
+    # Destination of the processed asset.
+    output_path: Path
+    # Up axis of the input mesh; None leaves the choice of default to the consumer.
+    input_up_axis: list[float] | None = None
+    # Requested up axis; None leaves the choice of default to the consumer.
+    up_axis: list[float] | None = None
+    # Clearance kept between the mesh and the ground plane.
+    ground_clearance: float = 0.01
+
+
+@dataclass(frozen=True)
+class MakeAssetSimreadyResult:
+    """Result of making an asset simulation-ready."""
+
+    # Path of the exported asset.
+    output_path: Path
+    # Transform matrix as nested row lists.
+    transform_matrix: list[list[float]]
+
+
+@dataclass(frozen=True)
+class MakeTableSimreadyRequest:
+    """Request to prepare a generated table GLB for simulation placement."""
+
+    # Path of the table mesh file to load.
+    input_path: Path
+    # Destination of the processed table.
+    output_path: Path
+    # Up axis of the input mesh; None leaves the choice of default to the consumer.
+    input_up_axis: list[float] | None = None
+    # Requested up axis; None leaves the choice of default to the consumer.
+    up_axis: list[float] | None = None
+    # Clearance kept between the mesh and the ground plane.
+    ground_clearance: float = 0.01
+
+
+@dataclass(frozen=True)
+class MakeTableSimreadyResult:
+    """Result of making a table simulation-ready."""
+
+    # Path of the exported table.
+    output_path: Path
+    # Transform matrix as nested row lists.
+    transform_matrix: list[list[float]]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/__init__.py
new file mode 100644
index 00000000..9441c6b8
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/__init__.py
@@ -0,0 +1,31 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.manager import (
+    SimulationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+    GravityDropRequest,
+    GravityDropResult,
+)
+
+__all__ = [
+    "GravityDropRequest",
+    "GravityDropResult",
+    "SimulationManager",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/manager.py
new file mode 100644
index 00000000..4a072110
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/manager.py
@@ -0,0 +1,124 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Simulation manager for gravity-based asset placement."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import torch
+import trimesh
+
+from embodichain.lab.sim.cfg import RigidObjectCfg
+from embodichain.lab.sim.shapes import MeshCfg
+from embodichain.lab.sim.sim_manager import (
+    SimulationManager as _EmbodiSimManager,
+    SimulationManagerCfg,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+    GravityDropRequest,
+    GravityDropResult,
+)
+
+__all__ = ["SimulationManager"]
+
+
+class SimulationManager:
+    """Manager for gravity-based asset placement.
+
+    Wraps an EmbodiChain simulation instance with typed request/response
+    methods, following the same pattern as service clients.  A fresh
+    simulation is created for every :meth:`run_gravity_simulation` call and
+    destroyed again before the call returns, including when the call fails.
+    """
+
+    def __init__(
+        self,
+        *,
+        headless: bool = True,
+        physics_dt: float = 0.01,
+        sim_device: str = "cpu",
+        settle_steps: int = 300,
+    ) -> None:
+        """Initialize the simulation manager.
+
+        Args:
+            headless: Whether to run without a GUI.
+            physics_dt: Physics timestep in seconds.
+            sim_device: Device to run the simulation on.
+            settle_steps: Number of steps passed to the simulation update
+                while the dropped object settles.
+        """
+        self._headless = headless
+        self._physics_dt = physics_dt
+        self._sim_device = sim_device
+        self._settle_steps = settle_steps
+
+    def run_gravity_simulation(
+        self, request: GravityDropRequest
+    ) -> GravityDropResult:
+        """Drop one GLB under gravity and return its final pose.
+
+        Args:
+            request: Drop parameters.  ``glb_path`` must point to an existing
+                file.  When ``initial_height`` is None the start height is
+                derived from the GLB bounding box.
+
+        Returns:
+            A result whose ``final_pose`` is a float NumPy array copied from
+            the first pose matrix reported for the dropped object.
+
+        Raises:
+            FileNotFoundError: If the resolved GLB path is not a file.
+            ValueError: If the drop height has to be derived and the GLB
+                contains no geometry.
+        """
+        glb_path = request.glb_path.expanduser().resolve()
+        if not glb_path.is_file():
+            raise FileNotFoundError(f"GLB file not found: {glb_path}")
+
+        if request.initial_height is not None:
+            initial_height = float(request.initial_height)
+        else:
+            initial_height = self._compute_adaptive_drop_height(glb_path)
+
+        sim = _EmbodiSimManager(
+            SimulationManagerCfg(
+                headless=self._headless,
+                physics_dt=self._physics_dt,
+                sim_device=self._sim_device,
+            )
+        )
+        try:
+            obj = sim.add_rigid_object(
+                RigidObjectCfg(
+                    uid="dropped_asset",
+                    shape=MeshCfg(fpath=str(glb_path)),
+                    init_pos=(0.0, 0.0, initial_height),
+                    init_rot=(0.0, 0.0, 0.0),
+                    body_type="dynamic",
+                    max_convex_hull_num=request.max_convex_hull_num,
+                )
+            )
+            sim.update(step=self._settle_steps)
+
+            pose_tensor = obj.get_local_pose(to_matrix=True)[0].detach().cpu()
+            # Copy into an independent array while the simulation is still
+            # alive so the result never aliases simulator-owned memory.
+            final_pose = np.array(pose_tensor.numpy(), dtype=float)
+        finally:
+            # Always release the simulation, even if loading or stepping fails.
+            sim._deferred_destroy()
+        return GravityDropResult(final_pose=final_pose)
+
+    def _compute_adaptive_drop_height(
+        self,
+        glb_path: Path,
+        *,
+        min_clearance: float = 0.2,
+        height_scale: float = 1.25,
+    ) -> float:
+        """Compute an initial drop height from a GLB bounding box.
+
+        The result is the larger of ``height * height_scale`` and
+        ``height + min_clearance``, where ``height`` is the extent of the
+        loaded bounds along the third axis.
+
+        Args:
+            glb_path: GLB file to measure.
+            min_clearance: Minimum amount added to the measured height.
+            height_scale: Multiplier applied to the measured height.
+
+        Returns:
+            The drop height as a float.
+
+        Raises:
+            ValueError: If ``min_clearance`` is negative, ``height_scale`` is
+                not positive, or the GLB has no geometry to measure.
+        """
+        if min_clearance < 0.0:
+            raise ValueError("min_clearance must be non-negative.")
+        if height_scale <= 0.0:
+            raise ValueError("height_scale must be positive.")
+
+        glb_path = glb_path.expanduser().resolve()
+        # Both trimesh.Scene and trimesh.Trimesh expose ``bounds``, so the
+        # loaded object needs no type dispatch.
+        bounds = trimesh.load(glb_path, force=None).bounds
+        if bounds is None:
+            raise ValueError(f"GLB has no geometry to measure: {glb_path}")
+        height = float(bounds[1][2] - bounds[0][2])
+        return max(height * height_scale, height + min_clearance)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/schemas.py
new file mode 100644
index 00000000..c9df4a52
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/schemas.py
@@ -0,0 +1,42 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+__all__ = [
+    "GravityDropRequest",
+    "GravityDropResult",
+]
+
+
+@dataclass(frozen=True)
+class GravityDropRequest:
+    """Request to drop a GLB asset under gravity simulation.
+
+    Immutable value object (``frozen=True``).
+    """
+
+    # GLB file to drop.
+    glb_path: Path
+    # Defaults to 32.  NOTE(review): presumably an upper bound on the number
+    # of convex hulls used for the collision shape - confirm.
+    max_convex_hull_num: int = 32
+    # Explicit start height for the drop.  None (the default) leaves the
+    # choice of height to whoever handles the request.
+    initial_height: float | None = None
+
+
+@dataclass(frozen=True)
+class GravityDropResult:
+    """Result of dropping a GLB asset under gravity.
+
+    Immutable value object (``frozen=True``).
+    """
+
+    # Final pose of the dropped asset; typed as Any.  NOTE(review): presumably
+    # a 4x4 homogeneous matrix held in a NumPy array - confirm with producer.
+    final_pose: Any
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/table_clutter_fit_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/table_clutter_fit_manager/__init__.py
new file mode 100644
index 00000000..0819a0d3
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/table_clutter_fit_manager/__init__.py
@@ -0,0 +1,23 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.table_clutter_fit_manager.manager import (
+    fit_table_to_clutter,
+)
+
+__all__ = ["fit_table_to_clutter"]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/table_clutter_fit_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/table_clutter_fit_manager/manager.py
new file mode 100644
index 00000000..eeb79a18
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/table_clutter_fit_manager/manager.py
@@ -0,0 +1,327 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+import json
+import tempfile
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.utils.io import relative_path
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.scene_geometry import (
+    _copy_scene_with_transform,
+    _scene_to_mesh,
+    _z_up_to_glb_y_up_transform,
+    _detect_table_fit_support_quad,
+    _load_table_fit_scene_internal_z,
+    _table_fit_bounds_xy_manifest,
+    _table_fit_safe_positive_ratio,
+    _table_fit_scene_union_bounds,
+    _table_fit_uniform_xy_scale_transform,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager import (
+    SimulationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+    GravityDropRequest,
+)
+
+__all__ = ["fit_table_to_clutter"]
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Turn a stored path value into a resolved filesystem path.
+
+    Args:
+        value: Path-like value (converted with ``str``).  A falsy value
+            yields an empty ``Path()``.
+        output_root: Base directory for values that are not absolute.
+
+    Returns:
+        The resolved path.  Absolute values are resolved as they are, and
+        relative values are resolved beneath ``output_root``.
+    """
+    if value:
+        candidate = Path(str(value)).expanduser()
+        if not candidate.is_absolute():
+            candidate = output_root.expanduser().resolve() / candidate
+        return candidate.resolve()
+    return Path()
+
+
+def _gravity_settle_table_fit_internal_z_scene(
+    scene: Any,
+    *,
+    z_to_y: np.ndarray,
+    sim_device: str,
+) -> Any:
+    """Return a copy of ``scene`` moved by a gravity-drop simulation pose.
+
+    The scene is exported through ``z_to_y`` to a temporary GLB, dropped with
+    a headless ``SimulationManager``, and the reported final pose is applied
+    to a copy of the input scene.  The input scene is left untouched.
+
+    NOTE(review): the pose is applied to the untransformed scene although the
+    simulated file was the ``z_to_y`` export - confirm that the simulator
+    reports poses in the scene's own frame.
+
+    Args:
+        scene: Scene object offering ``copy`` and ``apply_transform``.
+        z_to_y: Transform applied to the scene before the GLB export.
+        sim_device: Device name forwarded to the simulation manager.
+
+    Returns:
+        The transformed copy of ``scene``.
+    """
+    manager = SimulationManager(headless=True, sim_device=sim_device)
+    # The temporary directory only needs to outlive the simulation run.
+    with tempfile.TemporaryDirectory(prefix="p2s_table_fit_gravity_") as scratch:
+        glb_file = Path(scratch) / "table_pre_gravity.glb"
+        exported_scene = _copy_scene_with_transform(scene, z_to_y)
+        exported_scene.export(glb_file)
+        drop_request = GravityDropRequest(
+            glb_path=glb_file,
+            max_convex_hull_num=16,
+            initial_height=0.05,
+        )
+        drop_result = manager.run_gravity_simulation(drop_request)
+
+    settled_scene = scene.copy()
+    settle_pose = np.asarray(drop_result.final_pose, dtype=np.float64)
+    settled_scene.apply_transform(settle_pose)
+    return settled_scene
+
+
+def _write_table_fit_json(path: Path, data: dict[str, Any]) -> None:
+    """Write ``data`` to ``path`` as indented UTF-8 JSON plus a newline.
+
+    Missing parent directories are created first.  Non-ASCII characters are
+    written as they are rather than escaped.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    payload = json.dumps(data, ensure_ascii=False, indent=2)
+    path.write_text(f"{payload}\n", encoding="utf-8")
+
+
+def fit_table_to_clutter(
+    *,
+    table_result: dict[str, Any],
+    clutter_result: dict[str, Any],
+    output_root: Path,
+    output_dir: Path,
+    margin_cm: float = 10.0,
+    support_occupancy_ratio: float = 0.80,
+    object_coverage_percent: int | None = None,
+    gravity_settle_table: bool = True,
+    sim_device: str = "cpu",
+) -> dict[str, Any]:
+    """Fit a table mesh to an already laid-out clutter result.
+
+    The table is scaled uniformly in XY so that its detected support surface
+    can hold the clutter footprint, optionally settled under gravity, and
+    shifted so that the support centre sits at the XY origin and the table
+    bottom at Z = 0.  Every clutter object is then recentred on the clutter
+    footprint and lifted so that its lowest point is 1 cm above the highest
+    point of the table.  The fitted table, the placed objects and a JSON
+    manifest are written to ``output_dir``.
+
+    Args:
+        table_result: Mapping read for ``"simready_geometry_path"`` and,
+            as a fallback, ``"mesh_path"``.
+        clutter_result: Mapping read for ``"objects"`` and
+            ``"clutter_2d_aabb_cm"``.  Only dict entries of ``"objects"``
+            whose ``"status"`` is ``"ok"`` are used.  Each is read for
+            ``"laid_out_glb_path"`` (fallback ``"settled_glb_path"``) and
+            ``"id"``.  Entries whose GLB file does not exist are skipped.
+        output_root: Base directory for resolving relative paths found in
+            the inputs, and the base of the returned manifest path.
+        output_dir: Directory receiving the outputs; created if missing.
+        margin_cm: Extra support size added on each side of the clutter
+            footprint, in centimetres.
+        support_occupancy_ratio: Share of the support surface that the
+            clutter footprint may occupy.  Clipped to [0.1, 1.0].
+        object_coverage_percent: If set, overrides
+            ``support_occupancy_ratio`` by converting the percentage to a
+            ratio (e.g. 30 -> 0.30), clipped to [0.1, 1.0].  When None, the
+            given ``support_occupancy_ratio`` is used.
+        gravity_settle_table: Whether to run the scaled table through the
+            gravity-settle helper before it is repositioned.
+        sim_device: Device name forwarded to the gravity-settle helper.
+
+    Returns:
+        A dict with ``"status": "ok"`` and ``"manifest_path"``, the manifest
+        location as returned by ``relative_path(manifest_path, output_root)``.
+
+    Raises:
+        RuntimeError: If trimesh cannot be imported.
+        FileNotFoundError: If the resolved table GLB is not a file.
+        ValueError: If no object has status ``"ok"``, or none of those has
+            an existing GLB file.
+
+    Note:
+        The required support size per axis is
+        ``clutter_size / ratio + 2 * margin_cm``.  Sizes are multiplied by
+        100 to obtain centimetres; NOTE(review): this presumes the scene
+        unit is the metre - confirm.
+    """
+    try:
+        import trimesh
+    except ImportError as exc:
+        raise RuntimeError("Table fitting requires trimesh.") from exc
+
+    output_root = output_root.expanduser().resolve()
+    output_dir = output_dir.expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Resolve the table geometry.
+    table_simready_path = _resolve_generated_path(
+        table_result.get("simready_geometry_path") or table_result.get("mesh_path"),
+        output_root,
+    )
+    if not table_simready_path.is_file():
+        raise FileNotFoundError(f"Table simready GLB not found: {table_simready_path}")
+
+    # Resolve the clutter object geometries.
+    settled_objects = [
+        item
+        for item in clutter_result.get("objects", [])
+        if isinstance(item, dict) and item.get("status") == "ok"
+    ]
+    if not settled_objects:
+        raise ValueError("No successfully settled objects for table fitting.")
+
+    object_glb_paths: list[tuple[str, Path]] = []
+    for item in settled_objects:
+        glb_path = _resolve_generated_path(
+            item.get("laid_out_glb_path") or item.get("settled_glb_path"),
+            output_root,
+        )
+        # Objects whose GLB is missing are dropped without an error.
+        if glb_path.is_file():
+            object_glb_paths.append((str(item["id"]), glb_path))
+
+    if not object_glb_paths:
+        raise ValueError("No valid settled object GLBs for table fitting.")
+
+    # Scenes are processed Z-up internally; GLB files on disk are Y-up.
+    z_to_y = _z_up_to_glb_y_up_transform()
+    y_to_z = np.linalg.inv(z_to_y)
+
+    # Load the table and detect its support surface.
+    table_scene = _load_table_fit_scene_internal_z(
+        table_simready_path,
+        trimesh=trimesh,
+        y_to_z=y_to_z,
+    )
+    table_mesh = _scene_to_mesh(table_scene, trimesh=trimesh)
+    clutter_aabb = clutter_result.get("clutter_2d_aabb_cm") or {}
+    # A missing footprint falls back to a square (aspect ratio 1).
+    clutter_size = clutter_aabb.get("size_xy", [1.0, 1.0])
+    target_aspect = float(clutter_size[0]) / max(float(clutter_size[1]), 1.0e-6)
+    initial_support = _detect_table_fit_support_quad(
+        table_mesh,
+        target_aspect=target_aspect,
+    )
+
+    # Load the clutter scenes.
+    clutter_scenes = [
+        (oid, _load_table_fit_scene_internal_z(path, trimesh=trimesh, y_to_z=y_to_z))
+        for oid, path in object_glb_paths
+    ]
+    clutter_bounds = _table_fit_scene_union_bounds(
+        [scene for _, scene in clutter_scenes],
+        trimesh=trimesh,
+    )
+
+    # Compute the required table size and uniform scale.
+    clutter_size_cm = (clutter_bounds[1, :2] - clutter_bounds[0, :2]) * 100.0
+    if object_coverage_percent is not None:
+        support_occupancy_ratio = float(
+            np.clip(object_coverage_percent / 100.0, 0.1, 1.0)
+        )
+    occupancy = float(np.clip(support_occupancy_ratio, 0.1, 1.0))
+    required_size_cm = clutter_size_cm / occupancy + 2.0 * float(margin_cm)
+    support_size_cm = np.asarray(initial_support["size_xy"], dtype=np.float64) * 100.0
+    scale_x = _table_fit_safe_positive_ratio(required_size_cm[0], support_size_cm[0])
+    scale_y = _table_fit_safe_positive_ratio(required_size_cm[1], support_size_cm[1])
+    # One scale for both axes: the larger ratio satisfies both requirements.
+    uniform_scale = max(scale_x, scale_y)
+    table_scale_transform = _table_fit_uniform_xy_scale_transform(
+        center_xy=np.asarray(initial_support["center_xy"], dtype=np.float64),
+        scale=uniform_scale,
+    )
+    table_scene.apply_transform(table_scale_transform)
+
+    # Settle the table under gravity.
+    if gravity_settle_table:
+        table_scene = _gravity_settle_table_fit_internal_z_scene(
+            table_scene,
+            z_to_y=z_to_y,
+            sim_device=sim_device,
+        )
+
+    # Reposition the table at the origin.
+    final_table_mesh = _scene_to_mesh(table_scene, trimesh=trimesh)
+    final_support = _detect_table_fit_support_quad(
+        final_table_mesh,
+        target_aspect=float(required_size_cm[0] / max(required_size_cm[1], 1.0e-6)),
+    )
+    support_center = np.asarray(final_support["center"], dtype=np.float64)
+    table_bounds = np.asarray(final_table_mesh.bounds, dtype=np.float64)
+    table_bottom_z = float(table_bounds[0, 2])
+
+    # Support centre goes to XY = (0, 0); the table bottom goes to Z = 0.
+    table_shift = np.eye(4, dtype=np.float64)
+    table_shift[:3, 3] = [-support_center[0], -support_center[1], -table_bottom_z]
+    table_scene.apply_transform(table_shift)
+    support_z_after = float((support_center + table_shift[:3, 3])[2])
+
+    # Measure the table surface height.
+    # Use the highest point of the table mesh (after scaling + gravity + shift)
+    # rather than the support-plane mean Z, so that thin objects sit above the
+    # actual geometry even when the tabletop has slight unevenness.
+    _table_mesh_after_shift = _scene_to_mesh(table_scene, trimesh=trimesh)
+    _table_max_z = float(
+        np.asarray(_table_mesh_after_shift.bounds, dtype=np.float64)[1, 2]
+    )
+    _surface_z_margin = 0.01  # 1 cm above the highest table point
+
+    # Place the objects on the table.
+    placed_objects: list[dict[str, Any]] = []
+    shifted_clutter: list[tuple[str, Any]] = []
+    # The clutter scenes have not been transformed since ``clutter_bounds``
+    # was computed, so this covers the same, still unshifted, geometry.
+    clutter_after = _table_fit_scene_union_bounds(
+        [scene for _, scene in clutter_scenes],
+        trimesh=trimesh,
+    )
+    clutter_center_xy = 0.5 * (clutter_after[0, :2] + clutter_after[1, :2])
+    for oid, scene in clutter_scenes:
+        obj_mesh = _scene_to_mesh(scene, trimesh=trimesh)
+        obj_bounds = np.asarray(obj_mesh.bounds, dtype=np.float64)
+        obj_bottom_z = float(obj_bounds[0, 2])
+        # All objects share one XY shift, which keeps their relative layout;
+        # the Z shift is per object so each bottom lands at the same height.
+        obj_shift = np.eye(4, dtype=np.float64)
+        obj_shift[:3, 3] = [
+            -float(clutter_center_xy[0]),
+            -float(clutter_center_xy[1]),
+            _table_max_z - obj_bottom_z + _surface_z_margin,
+        ]
+        scene.apply_transform(obj_shift)
+        shifted_clutter.append((oid, scene))
+
+    # Export the fitted table and placed objects.
+    final_table_path = output_dir / "table_fit_to_clutter.glb"
+    _copy_scene_with_transform(table_scene, z_to_y).export(final_table_path)
+
+    for oid, scene in shifted_clutter:
+        object_path = output_dir / f"{oid}_on_table.glb"
+        _copy_scene_with_transform(scene, z_to_y).export(object_path)
+        # Compute world-space AABB bottom-centre (sim Z-up coords) before
+        # the scene is converted to GLB Y-up for export.  This is the
+        # reference position that gym_export uses to derive ``init_pos``.
+        _placed_mesh = _scene_to_mesh(scene, trimesh=trimesh)
+        _placed_b = np.asarray(_placed_mesh.bounds, dtype=np.float64)
+        world_aabb_bottom_center = [
+            float(0.5 * (_placed_b[0, 0] + _placed_b[1, 0])),
+            float(0.5 * (_placed_b[0, 1] + _placed_b[1, 1])),
+            float(_placed_b[0, 2]),
+        ]
+        placed_objects.append(
+            {
+                "id": oid,
+                "path": str(object_path),
+                "world_aabb_bottom_center": world_aabb_bottom_center,
+            }
+        )
+
+    # Write the fit manifest.
+    final_clutter_bounds = _table_fit_scene_union_bounds(
+        [scene for _, scene in shifted_clutter],
+        trimesh=trimesh,
+    )
+    final_clutter_aabb_cm = _table_fit_bounds_xy_manifest(
+        final_clutter_bounds,
+        unit_scale=100.0,
+    )
+    # The support quad was detected before ``table_shift`` was applied, so
+    # its coordinates are moved by the same offset here.
+    final_support_centered = {
+        **final_support,
+        "center": (support_center + table_shift[:3, 3]).tolist(),
+        "center_xy": (
+            np.asarray(final_support["center_xy"], dtype=np.float64)
+            - support_center[:2]
+        ).tolist(),
+        "corners_xy": (
+            np.asarray(final_support["corners_xy"], dtype=np.float64)
+            - support_center[:2]
+        ).tolist(),
+    }
+    manifest = {
+        "status": "ok",
+        "output_dir": str(output_dir),
+        "table_simready_path": str(table_simready_path),
+        "table_output_path": str(final_table_path),
+        "objects": placed_objects,
+        "margin_cm": margin_cm,
+        "support_occupancy_ratio": occupancy,
+        "gravity_settle_table": gravity_settle_table,
+        "table_bottom_z_after_shift": 0.0,
+        "support_z_after_shift": support_z_after,
+        "initial_support_quad": initial_support,
+        "final_support_quad_centered": final_support_centered,
+        "clutter_2d_aabb_cm": final_clutter_aabb_cm,
+        "required_support_size_cm": required_size_cm.tolist(),
+        "table_xy_scale": {
+            "uniform_scale": uniform_scale,
+            "scale_x_raw": scale_x,
+            "scale_y_raw": scale_y,
+            "support_size_before_scale_cm": support_size_cm.tolist(),
+        },
+        # Compares the clutter footprint with the support size, both in cm.
+        "fit_check": {
+            "fits_width": float(final_clutter_aabb_cm["size_xy"][0])
+            <= float(np.asarray(final_support_centered["size_xy"])[0] * 100.0),
+            "fits_depth": float(final_clutter_aabb_cm["size_xy"][1])
+            <= float(np.asarray(final_support_centered["size_xy"])[1] * 100.0),
+        },
+    }
+    manifest_path = output_dir / "table_fit_to_clutter_manifest.json"
+    _write_table_fit_json(manifest_path, manifest)
+    return {
+        "status": "ok",
+        "manifest_path": relative_path(manifest_path, output_root),
+    }
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/__init__.py
new file mode 100644
index 00000000..ce221532
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/__init__.py
@@ -0,0 +1,33 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.text_layout_manager.layout import (
+    _layout_text_objects_grid,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.text_layout_manager.optimization import (
+    _optimize_text_layout_slp,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.text_layout_manager.settle import (
+    settle_text_objects_to_ground,
+)
+
+__all__ = [
+    "_layout_text_objects_grid",
+    "_optimize_text_layout_slp",
+    "settle_text_objects_to_ground",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/layout.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/layout.py
new file mode 100644
index 00000000..7b94a852
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/layout.py
@@ -0,0 +1,383 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.optimization_manager import (
+    _center_xy_aabb_layout,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.text_layout_manager.optimization import (
+    _optimize_text_layout_slp,
+)
+__all__ = [
+    "_layout_text_objects_grid",
+]
+
+def _transitive_closure(
+    nodes: list[str],
+    edges: list[tuple[str, str]],
+) -> list[tuple[str, str]]:
+    """Return the transitive closure of ``edges`` restricted to ``nodes``.
+
+    Uses Warshall's boolean reachability update over an adjacency matrix.
+    Edges naming a node outside ``nodes`` are ignored.  Pairs come back in
+    row-major order of ``nodes``.  If either argument is empty, a shallow
+    copy of ``edges`` is returned unchanged.
+    """
+    if not (nodes and edges):
+        return list(edges)
+
+    position = {name: k for k, name in enumerate(nodes)}
+    size = len(nodes)
+    reach = [[False] * size for _ in range(size)]
+    for head, tail in edges:
+        a = position.get(head)
+        b = position.get(tail)
+        if a is not None and b is not None:
+            reach[a][b] = True
+
+    # Anything reachable from ``via`` is reachable from every row reaching it.
+    for via in range(size):
+        via_row = reach[via]
+        for row in reach:
+            if not row[via]:
+                continue
+            for col, hit in enumerate(via_row):
+                if hit:
+                    row[col] = True
+
+    return [
+        (nodes[r], nodes[c])
+        for r in range(size)
+        for c in range(size)
+        if reach[r][c]
+    ]
+
+
+
+def _longest_path_ranks(
+    nodes: list[str],
+    edges: list[tuple[str, str]],
+) -> dict[str, int]:
+    """Assign integer ranks satisfying ``(A,B)`` → rank[A] < rank[B].
+
+    Runs Kahn's topological sort and relaxes longest-path ranks as each node
+    is emitted, so a node's rank is final by the time it is emitted.
+
+    Args:
+        nodes: Node names; every one receives an entry in the result.
+        edges: Directed ``(src, dst)`` pairs.  Pairs naming a node that is
+            not in *nodes* are ignored.
+
+    Returns:
+        Mapping of node to rank.  Source and isolated nodes get 0.  The
+        ordering guarantee holds for acyclic input.  Nodes on a cycle are
+        never emitted by the sort: they stay at 0 unless an emitted
+        predecessor raises them, and they do not pass a rank on to their
+        own successors.
+    """
+    from collections import deque
+
+    ranks: dict[str, int] = {n: 0 for n in nodes}
+    if not edges:
+        return ranks
+
+    # Build adjacency and in-degree, skipping edges with unknown endpoints.
+    adj: dict[str, list[str]] = {n: [] for n in nodes}
+    in_deg: dict[str, int] = {n: 0 for n in nodes}
+    for src, dst in edges:
+        if src not in adj or dst not in adj:
+            continue
+        adj[src].append(dst)
+        in_deg[dst] += 1
+
+    # deque gives O(1) pops from the front; list.pop(0) is O(n).
+    ready = deque(n for n in nodes if in_deg[n] == 0)
+    while ready:
+        u = ready.popleft()
+        candidate = ranks[u] + 1
+        for v in adj[u]:
+            if ranks[v] < candidate:
+                ranks[v] = candidate
+            in_deg[v] -= 1
+            if in_deg[v] == 0:
+                ready.append(v)
+    return ranks
+
+
+
+def _layout_text_objects_grid(
+    *,
+    object_ids: list[str],
+    xy_sizes: dict[str, np.ndarray],
+    spatial_relations: list[dict[str, Any]],
+    table_constraints: list[dict[str, Any]] | None = None,
+    grid_spacing: float = 0.02,
+    padding_ratio: float = 0.08,
+) -> dict[str, Any]:
+    """Lay out text-scene objects — transitive closure + longest-path ranks.
+
+    1. Transitive closure of left_of / front_of.
+    2. Pick centre: explicit 9‑grid ʻcenterʼ, else highest-degree node.
+    3. Longest-path rank assignment (left_of→X, front_of→Y).
+    4. Shift 9‑grid anchors to their grid positions.
+    5. Free objects auto‑wrap below.
+    6. Convert ranks→XY using per‑column/row max sizes + gaps.
+    7. SA point optimisation + mesh AABB collision cleanup.
+    """
+    if not object_ids:
+        return {
+            "centers": {},
+            "initial_centers": {},
+            "metadata": {
+                "method": "transitive_closure_longest_path_with_9grid",
+                "iterations": 0,
+            },
+        }
+
+    # Parse spatial relations.
+    left_of_edges: list[tuple[str, str]] = []
+    front_of_edges: list[tuple[str, str]] = []
+    seen: set[tuple[str, str, str]] = set()
+    for rel in spatial_relations:
+        subject = str(rel.get("subject") or "")
+        obj = str(rel.get("object") or "")
+        relation = str(rel.get("relation") or "")
+        if not subject or not obj or subject == obj:
+            continue
+        key = (subject, relation, obj)
+        if key in seen:
+            continue
+        seen.add(key)
+        if relation == "left_of":
+            left_of_edges.append((subject, obj))
+        elif relation == "front_of":
+            front_of_edges.append((subject, obj))
+
+    # Compute transitive closures.
+    left_of_closed = _transitive_closure(object_ids, left_of_edges)
+    front_of_closed = _transitive_closure(object_ids, front_of_edges)
+
+    # Parse nine-grid constraints.
+    # −Y = front, so front row = 0, back row = 2
+    _GRID_TO_RC: dict[str, tuple[int, int]] = {
+        "left_front": (0, 0), "center_front": (1, 0), "right_front": (2, 0),
+        "left_center": (0, 1), "center": (1, 1), "right_center": (2, 1),
+        "left_back": (0, 2), "center_back": (1, 2), "right_back": (2, 2),
+        "front": (1, 0), "back": (1, 2),
+        "left": (0, 1), "right": (2, 1),
+    }
+    grid_targets: dict[str, tuple[int, int]] = {}
+    for tc in (table_constraints or []):
+        asset = str(tc.get("asset") or "")
+        grid_name = str(tc.get("grid") or "").strip()
+        if asset in object_ids and grid_name in _GRID_TO_RC:
+            grid_targets[asset] = _GRID_TO_RC[grid_name]
+
+    # Select a center object when none is explicit.
+    auto_center_oid: str | None = None
+    has_explicit_center = any(
+        tc.get("grid") == "center" for tc in (table_constraints or [])
+    )
+    if not has_explicit_center:
+        # Degree = appearances in left_of + front_of (subject or object)
+        degree: dict[str, int] = {oid: 0 for oid in object_ids}
+        for src, dst in left_of_closed + front_of_closed:
+            if src in degree:
+                degree[src] += 1
+            if dst in degree:
+                degree[dst] += 1
+        max_deg = max(degree.values()) if degree else 0
+        if max_deg > 0:
+            candidates = [oid for oid, d in degree.items() if d == max_deg]
+            # Tie-breaker: largest AABB area
+            centre_oid = max(
+                candidates,
+                key=lambda oid: float(xy_sizes[oid][0]) * float(xy_sizes[oid][1]),
+            )
+            grid_targets[centre_oid] = (1, 1)  # 9‑grid centre
+            auto_center_oid = centre_oid
+
+    # Derive ranks from the transitive closures.
+    x_rank = _longest_path_ranks(object_ids, left_of_closed)
+    # −Y = front:  A front_of B  →  A.y < B.y  →  row[A] < row[B].
+    # _longest_path_ranks gives rank[src] < rank[dst]; edges are
+    # already (A,B) for "A front_of B", so NO reversal needed.
+    y_rank = _longest_path_ranks(object_ids, front_of_closed)
+
+    # Apply nine-grid shifts.
+    # Pin 9‑grid objects to their target ranks; shift all connected
+    # objects (both upstream and downstream) to preserve topology.
+    if grid_targets:
+        # Build undirected connected-components via relation edges
+        all_edges = left_of_closed + front_of_closed
+        neighbours: dict[str, set[str]] = {oid: set() for oid in object_ids}
+        for src, dst in all_edges:
+            if src in neighbours and dst in neighbours:
+                neighbours[src].add(dst)
+                neighbours[dst].add(src)
+        for oid in grid_targets:
+            neighbours.setdefault(oid, set())
+
+        # For each 9‑grid object, BFS the component and shift uniformly
+        shifted: set[str] = set()
+        for oid, (target_col, target_row) in grid_targets.items():
+            if oid in shifted:
+                continue
+            dx = target_col - x_rank.get(oid, 0)
+            dy = target_row - y_rank.get(oid, 0)
+
+            # BFS to collect the full connected component
+            component: set[str] = {oid}
+            queue = [oid]
+            while queue:
+                u = queue.pop(0)
+                for v in neighbours.get(u, set()):
+                    if v not in component:
+                        component.add(v)
+                        queue.append(v)
+
+            for oid2 in component:
+                if oid2 not in grid_targets:  # only shift non‑anchored objects
+                    x_rank[oid2] = x_rank.get(oid2, 0) + dx
+                    y_rank[oid2] = y_rank.get(oid2, 0) + dy
+            shifted.update(component)
+
+    # Propagate row and column alignment.
+    # left_of A B  →  same row  (y_rank[A] = y_rank[B])
+    # front_of A B →  same col  (x_rank[A] = x_rank[B])
+    # Priority (higher wins): 9‑grid > higher degree > larger area.
+    _prio = {
+        oid: (
+            oid in grid_targets,
+            sum(1 for e in left_of_closed + front_of_closed if oid in e),
+            float(xy_sizes[oid][0]) * float(xy_sizes[oid][1]),
+        )
+        for oid in object_ids
+    }
+    for src, dst in left_of_closed:
+        if _prio[src] >= _prio[dst]:
+            y_rank[dst] = y_rank.get(src, 0)
+        else:
+            y_rank[src] = y_rank.get(dst, 0)
+    for src, dst in front_of_closed:
+        if _prio[src] >= _prio[dst]:
+            x_rank[dst] = x_rank.get(src, 0)
+        else:
+            x_rank[src] = x_rank.get(dst, 0)
+
+    # Normalise to >= 0
+    min_x = min(x_rank.values()) if x_rank else 0
+    min_y = min(y_rank.values()) if y_rank else 0
+    for oid in object_ids:
+        x_rank[oid] = x_rank.get(oid, 0) - min_x
+        y_rank[oid] = y_rank.get(oid, 0) - min_y
+
+    # Resolve cell collisions: spread objects sharing the same (col, row)
+    cell_occupants: dict[tuple[int, int], list[str]] = {}
+    for oid in object_ids:
+        cell = (x_rank[oid], y_rank[oid])
+        cell_occupants.setdefault(cell, []).append(oid)
+    for (col, row), occupants in cell_occupants.items():
+        if len(occupants) > 1:
+            for offset, oid in enumerate(occupants[1:], start=1):
+                x_rank[oid] = col + offset
+
+    # Place unconstrained objects in wrapped rows.
+    constrained = set()
+    for src, dst in left_of_closed + front_of_closed:
+        constrained.update([src, dst])
+    constrained.update(grid_targets)
+    free_objects = [oid for oid in object_ids if oid not in constrained]
+
+    if free_objects:
+        free_row = max(y_rank.values()) + 1 if y_rank else 0
+        # Max row width = existing column span (at least 3 cols)
+        col_keys = list(x_rank.values())
+        existing_cols = max(col_keys) - min(col_keys) + 1 if col_keys else 1
+        max_cols_per_row = max(existing_cols, 3)
+        free_sorted = sorted(
+            free_objects,
+            key=lambda oid: float(xy_sizes[oid][0]),
+            reverse=True,
+        )
+        col = 0
+        row_offset = 0
+        for oid in free_sorted:
+            x_rank[oid] = col
+            y_rank[oid] = free_row + row_offset
+            col += 1
+            if col >= max_cols_per_row:
+                col = 0
+                row_offset += 1
+
+    # Convert ranks to XY positions.
+    col_widths: dict[int, float] = {}
+    row_heights: dict[int, float] = {}
+    for oid in object_ids:
+        c = x_rank[oid]
+        r = y_rank[oid]
+        col_widths[c] = max(col_widths.get(c, 0.0), float(xy_sizes[oid][0]))
+        row_heights[r] = max(row_heights.get(r, 0.0), float(xy_sizes[oid][1]))
+
+    x_cumsum: dict[int, float] = {}
+    cumulative = 0.0
+    for c in sorted(col_widths):
+        x_cumsum[c] = cumulative
+        cumulative += col_widths[c] + grid_spacing
+
+    y_cumsum: dict[int, float] = {}
+    cumulative = 0.0
+    for r in sorted(row_heights):
+        y_cumsum[r] = cumulative
+        cumulative += row_heights[r] + grid_spacing
+
+    centers: dict[str, np.ndarray] = {}
+    for oid in object_ids:
+        c = x_rank[oid]
+        r = y_rank[oid]
+        cx = x_cumsum[c] + 0.5 * float(xy_sizes[oid][0])
+        cy = y_cumsum[r] + 0.5 * float(xy_sizes[oid][1])
+        centers[oid] = np.array([cx, cy], dtype=np.float64)
+
+    centers = _center_xy_aabb_layout(centers=centers, xy_sizes=xy_sizes)
+
+    initial_centers = {oid: c.copy() for oid, c in centers.items()}
+
+    # Snap initial grid positions as 9‑grid spring targets
+    grid_spring_targets: dict[str, np.ndarray] = {
+        oid: initial_centers[oid].copy()
+        for oid in grid_targets
+        if oid in initial_centers
+    }
+
+    # Optimize positions and remove mesh AABB collisions.
+    optimized = _optimize_text_layout_slp(
+        object_ids=object_ids,
+        xy_sizes=xy_sizes,
+        initial_centers=initial_centers,
+        left_of_edges=left_of_closed,
+        front_of_edges=front_of_closed,
+        grid_spring_targets=grid_spring_targets,
+        padding_ratio=padding_ratio,
+    )
+    centers = optimized["centers"]
+    optimization_metadata = optimized["metadata"]
+
+    # Collect layout metadata.
+    metadata = {
+        "method": "transitive_closure_longest_path_with_9grid_and_sa",
+        "grid_spacing": grid_spacing,
+        "auto_center_oid": auto_center_oid,
+        "has_explicit_center": has_explicit_center,
+        "table_constraint_count": len(grid_targets),
+        "left_of_count": len(left_of_edges),
+        "left_of_closed_count": len(left_of_closed),
+        "front_of_count": len(front_of_edges),
+        "front_of_closed_count": len(front_of_closed),
+        "free_object_count": len(free_objects),
+        "x_ranks": {oid: x_rank.get(oid, 0) for oid in object_ids},
+        "y_ranks": {oid: y_rank.get(oid, 0) for oid in object_ids},
+        "optimization": optimization_metadata,
+    }
+    return {
+        "centers": centers,
+        "initial_centers": initial_centers,
+        "metadata": metadata,
+    }
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/optimization.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/optimization.py
new file mode 100644
index 00000000..b8915fc4
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/optimization.py
@@ -0,0 +1,404 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+from scipy.optimize import minimize
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.optimization_manager import (
+    _center_xy_aabb_layout,
+    _footprint_layout_diagnostics,
+    _xy_aabb_overlap,
+    _xy_union_bounds,
+)
+
+__all__ = ["_optimize_text_layout_slp"]
+
+# SLSQP solve options — matching the original example_optimization SA pipeline.
+_SLSQP_OPTIONS: dict[str, Any] = {"maxiter": 500, "ftol": 1e-6, "disp": False}
+
+# Objective weights (relations are hard constraints, not in the objective).
+_WEIGHTS: dict[str, float] = {
+    "seed": 1.0,  # squared distance of each centre from its initial position
+    "overlap": 200.0,  # AABB overlap area of each object pair
+    "grid": 100.0,  # squared distance of each centre from its 9-grid target
+}
+
+
+def _optimize_text_layout_slp(
+    *,
+    object_ids: list[str],
+    xy_sizes: dict[str, np.ndarray],
+    initial_centers: dict[str, np.ndarray],
+    left_of_edges: list[tuple[str, str]],
+    front_of_edges: list[tuple[str, str]],
+    grid_spring_targets: dict[str, np.ndarray],
+    padding_ratio: float,
+) -> dict[str, Any]:
+    """Optimize 2D centres with scipy SLSQP, then remove mesh AABB overlap.
+
+    Mirroring the original example_optimization/SA pipeline:
+    - left_of / front_of → linear inequality constraints
+    - bounding box → variable bounds (2× initial union)
+    - seed / overlap / grid → soft penalties in the objective
+    - post‑solve collision cleanup on actual footprint AABBs
+
+    Args:
+        object_ids: Objects to place; fixes the order of the packed vector.
+        xy_sizes: Per-object XY footprint extents.
+        initial_centers: Seed centre for every id in ``object_ids``.
+        left_of_edges: ``(A, B)`` pairs constrained along x.
+        front_of_edges: ``(A, B)`` pairs constrained along y.
+        grid_spring_targets: Target centres pulled towards with the ``"grid"``
+            weight; ids that are not in ``object_ids`` are ignored.
+        padding_ratio: Clearance as a fraction of the largest footprint side
+            (the resulting padding is never below 1e-3).
+
+    Returns:
+        ``{"centers": ..., "metadata": ...}``. When SLSQP raises or does not
+        report success the seed positions are kept; a raised exception is
+        summarised in ``metadata["slsqp"]["error"]``.
+    """
+    if not object_ids:
+        return {
+            "centers": {},
+            "metadata": {
+                "method": "text_slsqp_then_mesh_aabb_collision_removal",
+                "slsqp_iterations": 0,
+                "collision_iterations": 0,
+            },
+        }
+
+    max_extent = max(
+        float(max(xy_sizes[oid][0], xy_sizes[oid][1])) for oid in object_ids
+    )
+    padding = max(max_extent * padding_ratio, 1e-3)
+
+    initial_centers = {
+        oid: np.asarray(initial_centers[oid], dtype=np.float64).copy()
+        for oid in object_ids
+    }
+    initial_union_bounds = _xy_union_bounds(centers=initial_centers, xy_sizes=xy_sizes)
+
+    index_by_id = {oid: i for i, oid in enumerate(object_ids)}
+    x0 = _pack_centers(object_ids, initial_centers)
+
+    # Build linear inequality constraints for left_of and front_of.
+    constraints: list[dict[str, Any]] = []
+    _build_relation_constraints(
+        constraints=constraints,
+        object_ids=object_ids,
+        index_by_id=index_by_id,
+        xy_sizes=xy_sizes,
+        left_of_edges=left_of_edges,
+        front_of_edges=front_of_edges,
+        padding=padding,
+    )
+
+    # Bound variables to twice the initial union size.
+    init_size = initial_union_bounds[1] - initial_union_bounds[0]
+    margin = init_size * 0.5  # 50 % each side → 2× total
+    lower = initial_union_bounds[0] - margin
+    upper = initial_union_bounds[1] + margin
+    # All objects share one box: build its (x, y) bounds once and repeat them.
+    xy_bounds = [(float(lower[axis]), float(upper[axis])) for axis in (0, 1)]
+    bounds = xy_bounds * len(object_ids)
+
+    # Define the optimization objective.
+    def _objective(xvec: np.ndarray) -> float:
+        centers = _unpack_centers(object_ids, xvec)
+        loss = 0.0
+
+        # seed: stay close to initial positions
+        for oid in object_ids:
+            delta = centers[oid] - initial_centers[oid]
+            loss += _WEIGHTS["seed"] * float(np.dot(delta, delta))
+
+        # overlap: AABB overlap area penalty
+        for i, oid in enumerate(object_ids):
+            for other_id in object_ids[i + 1 :]:
+                ov = _xy_aabb_overlap(
+                    center_a=centers[oid],
+                    size_a=xy_sizes[oid],
+                    center_b=centers[other_id],
+                    size_b=xy_sizes[other_id],
+                    padding=padding,
+                )
+                if ov is not None:
+                    loss += _WEIGHTS["overlap"] * float(ov[0] * ov[1])
+
+        # grid: spring toward 9‑grid targets
+        for oid, target in grid_spring_targets.items():
+            if oid not in centers:
+                continue
+            delta = centers[oid] - target
+            loss += _WEIGHTS["grid"] * float(np.dot(delta, delta))
+
+        return float(loss)
+
+    # Solve the constrained optimization problem.
+    slsqp_result: dict[str, Any] = {"success": False, "nit": 0, "message": ""}
+    try:
+        result = minimize(
+            _objective,
+            x0,
+            method="SLSQP",
+            bounds=bounds,
+            constraints=constraints,
+            options=_SLSQP_OPTIONS,
+        )
+        slsqp_result = {
+            "success": bool(result.success),
+            "nit": int(getattr(result, "nit", 0)),
+            "message": str(result.message),
+            "fun": float(result.fun) if result.fun is not None else None,
+        }
+        x_opt = result.x if result.success else x0.copy()  # seed fallback on failure
+    except Exception as exc:
+        # Best effort: keep the seed layout, but record why the solver failed.
+        x_opt = x0.copy()
+        slsqp_result["message"] = "SLSQP raised an exception; using seed positions."
+        slsqp_result["error"] = f"{type(exc).__name__}: {exc}"
+
+    centers = _unpack_centers(object_ids, x_opt)
+    centers = _center_xy_aabb_layout(centers=centers, xy_sizes=xy_sizes)
+
+    # Remove residual collisions.
+    centers, collision_metadata = _remove_mesh_aabb_collisions(
+        object_ids=object_ids,
+        xy_sizes=xy_sizes,
+        centers=centers,
+        initial_centers=initial_centers,
+        left_of_edges=left_of_edges,
+        front_of_edges=front_of_edges,
+        padding=padding,
+    )
+    centers = _center_xy_aabb_layout(centers=centers, xy_sizes=xy_sizes)
+
+    # Collect optimization metadata.
+    diagnostics = _footprint_layout_diagnostics(
+        object_ids=object_ids,
+        centers=centers,
+        initial_centers=initial_centers,
+        xy_sizes=xy_sizes,
+        padding=padding,
+        initial_union_bounds=initial_union_bounds,
+    )
+    metadata: dict[str, Any] = {
+        "method": "text_slsqp_then_mesh_aabb_collision_removal",
+        "relation_usage": "left_of_front_of_hard_constraints",
+        "padding": float(padding),
+        "padding_ratio": float(padding_ratio),
+        "weights": dict(_WEIGHTS),
+        "slsqp": slsqp_result,
+        "bounds_expansion": 2.0,
+        "initial_union_size": init_size.tolist(),
+        **collision_metadata,
+        "final_centers": {oid: centers[oid].tolist() for oid in object_ids},
+        **diagnostics,
+    }
+    return {"centers": centers, "metadata": metadata}
+
+
+# Build relation constraints.
+
+
+def _build_relation_constraints(
+    *,
+    constraints: list[dict[str, Any]],
+    object_ids: list[str],
+    index_by_id: dict[str, int],
+    xy_sizes: dict[str, np.ndarray],
+    left_of_edges: list[tuple[str, str]],
+    front_of_edges: list[tuple[str, str]],
+    padding: float,
+) -> None:
+    """Append one SLSQP ``"ineq"`` constraint per usable relation edge.
+
+    For an edge ``(A, B)`` the constraint value is
+    ``B[axis] - A[axis] - (size_A[axis] / 2 + size_B[axis] / 2 + padding)``,
+    with ``axis = 0`` (x) for ``left_of_edges`` and ``axis = 1`` (y) for
+    ``front_of_edges``. Object ``i`` occupies ``x[2 * i]`` and ``x[2 * i + 1]``
+    of the packed vector.
+
+    Edges that name an id missing from ``index_by_id`` are ignored.
+
+    Args:
+        constraints: Output list, extended in place (left_of constraints
+            first, then front_of constraints).
+        object_ids: Not read here; kept for the keyword interface.
+        index_by_id: Position of each object in the packed vector.
+        xy_sizes: Per-object extents; index 0 is used for x, index 1 for y.
+        left_of_edges: ``(A, B)`` pairs constrained along x.
+        front_of_edges: ``(A, B)`` pairs constrained along y.
+        padding: Extra clearance added to every gap.
+    """
+
+    def _separation(lo: int, hi: int, gap: float):
+        # Factory so that every constraint captures its own slots and gap.
+        return lambda x: float(x[hi] - x[lo] - gap)
+
+    # A + gap ≤ B  →  B - A - gap ≥ 0
+    for axis, edges in ((0, left_of_edges), (1, front_of_edges)):
+        for subject, obj in edges:
+            if not (subject in index_by_id and obj in index_by_id):
+                continue
+            half_a = 0.5 * float(xy_sizes[subject][axis])
+            half_b = 0.5 * float(xy_sizes[obj][axis])
+            # Slot of the constrained coordinate inside the packed vector.
+            lo = 2 * index_by_id[subject] + axis
+            hi = 2 * index_by_id[obj] + axis
+            constraints.append(
+                {
+                    "type": "ineq",
+                    "fun": _separation(lo, hi, half_a + half_b + padding),
+                }
+            )
+
+
+# Remove AABB collisions.
+
+
+def _remove_mesh_aabb_collisions(
+    *,
+    object_ids: list[str],
+    xy_sizes: dict[str, np.ndarray],
+    centers: dict[str, np.ndarray],
+    initial_centers: dict[str, np.ndarray],
+    left_of_edges: list[tuple[str, str]],
+    front_of_edges: list[tuple[str, str]],
+    padding: float,
+) -> tuple[dict[str, np.ndarray], dict[str, Any]]:
+    """Push colliding footprints apart, one axis per pair, for up to 80 rounds.
+
+    Every round lists the colliding pairs, moves both objects of each pair in
+    opposite directions along the reported axis, then re-centres the layout
+    with ``_center_xy_aabb_layout``. Pairs linked by a relation edge split the
+    push evenly; otherwise the object that drifted less from its initial
+    centre takes 1.25x of the half-overlap and the other one 0.75x.
+
+    Args:
+        object_ids: Objects to test pairwise.
+        xy_sizes: Per-object XY extents.
+        centers: Starting centres; they are copied, not modified in place.
+        initial_centers: Reference centres used to measure drift.
+        left_of_edges: Relation pairs; either order counts as related.
+        front_of_edges: Relation pairs; either order counts as related.
+        padding: Clearance forwarded to the collision test.
+
+    Returns:
+        ``(centers, metadata)``. ``collision_remaining`` always describes the
+        returned centres, including when the round budget is exhausted.
+    """
+    edges = left_of_edges + front_of_edges
+    relation_pairs = set(edges) | {(b, a) for a, b in edges}
+    current = {k: np.asarray(c, dtype=np.float64).copy() for k, c in centers.items()}
+    max_rounds = 80
+    total_pushes = 0
+    rounds_used = 0
+
+    while True:
+        overlaps = _mesh_aabb_collision_pairs(
+            object_ids=object_ids, xy_sizes=xy_sizes, centers=current, padding=padding
+        )
+        # Scanning before the budget check keeps the reported count up to date.
+        if not overlaps or rounds_used == max_rounds:
+            break
+        for item in overlaps:
+            object_a, object_b = item["object"], item["other"]
+            axis = int(item["axis"])
+            sign = -1.0 if current[object_a][axis] <= current[object_b][axis] else 1.0
+            amount = 0.5 * (float(item["overlap"]) + 1.0e-6)
+            share_a = share_b = 1.0  # related pairs split the push evenly
+            if (object_a, object_b) not in relation_pairs:
+                drift_a = np.linalg.norm(current[object_a] - initial_centers[object_a])
+                drift_b = np.linalg.norm(current[object_b] - initial_centers[object_b])
+                share_a, share_b = (1.25, 0.75) if drift_a <= drift_b else (0.75, 1.25)
+            current[object_a][axis] += sign * amount * share_a
+            current[object_b][axis] -= sign * amount * share_b
+            total_pushes += 1
+        current = _center_xy_aabb_layout(centers=current, xy_sizes=xy_sizes)
+        rounds_used += 1
+    return current, {
+        "collision_iterations": rounds_used,
+        "collision_pushes": total_pushes,
+        "collision_remaining": len(overlaps),
+        "collision_removal": "iterative_mesh_aabb_push",
+    }
+
+
+def _mesh_aabb_collision_pairs(
+    *,
+    object_ids: list[str],
+    xy_sizes: dict[str, np.ndarray],
+    centers: dict[str, np.ndarray],
+    padding: float,
+) -> list[dict[str, Any]]:
+    """Collect overlapping object pairs, sorted by overlap (largest first).
+
+    Each unordered pair is checked once with ``_xy_aabb_overlap``; pairs for which
+    it returns ``None`` are skipped. The reported axis is the one with the
+    smaller overlap value, with ties going to axis 0.
+    """
+    found: list[dict[str, Any]] = []
+    for pos, first in enumerate(object_ids):
+        for second in object_ids[pos + 1 :]:
+            extent = _xy_aabb_overlap(
+                center_a=centers[first],
+                size_a=xy_sizes[first],
+                center_b=centers[second],
+                size_b=xy_sizes[second],
+                padding=padding,
+            )
+            if extent is not None:
+                push_axis = 0 if extent[0] <= extent[1] else 1
+                record: dict[str, Any] = {"object": first, "other": second}
+                record["axis"] = push_axis
+                record["overlap"] = float(extent[push_axis])
+                record["overlap_x"] = float(extent[0])
+                record["overlap_y"] = float(extent[1])
+                found.append(record)
+    return sorted(found, key=lambda record: record["overlap"], reverse=True)
+
+
+# Pack and unpack center coordinates.
+
+
+def _pack_centers(object_ids: list[str], centers: dict[str, np.ndarray]) -> np.ndarray:
+    """Flatten centres to ``[x0, y0, x1, y1, ...]`` in ``object_ids`` order."""
+    flat: list[float] = []
+    for key in object_ids:
+        # Coerce first so that only the x and y components are read as floats.
+        xy = np.asarray(centers[key], dtype=np.float64)
+        flat.append(float(xy[0]))
+        flat.append(float(xy[1]))
+    return np.asarray(flat, dtype=np.float64)
+
+
+def _unpack_centers(
+    object_ids: list[str],
+    xvec: np.ndarray,
+) -> dict[str, np.ndarray]:
+    """Split a flat ``[x0, y0, x1, y1, ...]`` vector into per-object XY arrays."""
+    unpacked: dict[str, np.ndarray] = {}
+    for slot, key in enumerate(object_ids):
+        base = 2 * slot  # each object owns two consecutive entries of ``xvec``
+        pair = [xvec[base], xvec[base + 1]]
+        unpacked[key] = np.asarray(pair, dtype=np.float64)
+    return unpacked
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/settle.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/settle.py
new file mode 100644
index 00000000..da3cdde6
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/text_layout_manager/settle.py
@@ -0,0 +1,429 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+import tempfile
+import traceback
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager import (
+    SimulationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+    GravityDropRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.optimization_manager import (
+    _object_scenes_xy_aabb_manifest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.scene_geometry import (
+    _aabb_bottom_to_xy_plane_transform,
+    _copy_scene_with_transform,
+    _matrix_from_json,
+    _scale_transform,
+    _scene_to_mesh,
+    _xy_aabb_center,
+    _xy_aabb_size,
+    _z_up_to_glb_y_up_transform,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import (
+    relative_path,
+    write_json,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_warning
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager import (
+    MatplotlibManager,
+    RenderFootprintLayoutRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.text_layout_manager.layout import (
+    _layout_text_objects_grid,
+)
+
+__all__ = ["settle_text_objects_to_ground"]
+
+
+def settle_text_objects_to_ground(
+    *,
+    objects: list[dict[str, Any]],
+    spatial_relations: list[dict[str, Any]] | None = None,
+    table_constraints: list[dict[str, Any]] | None = None,
+    output_dir: Path,
+    output_root: Path,
+    sim_device: str = "cpu",
+) -> dict[str, Any]:
+    """Scale simready objects to real-world size, gravity-settle, layout on table.
+
+    For each text-input object:
+    1. Load simready GLB (GLB Y-up) → convert to internal Z-up
+    2. Apply scene-level metric scale_factor → real-world size
+    3. Gravity simulation to settle on ground plane
+    4. Move AABB bottom centre to XY origin at Z=0
+    5. Build grid/rank initialization from left_of/front_of and table constraints
+    6. Run SA-based 2D point optimization and mesh AABB collision cleanup
+    7. Apply layout positions
+
+    Returns laid-out scenes and per-object metadata.
+    """
+    try:
+        import trimesh
+    except ImportError as exc:
+        raise RuntimeError("Text object gravity settling requires trimesh.") from exc
+
+    output_dir = output_dir.expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    sim = SimulationManager(headless=True, sim_device=sim_device)
+    z_to_y = _z_up_to_glb_y_up_transform()
+    y_to_z = np.linalg.inv(z_to_y)
+
+    settled_objects: list[dict[str, Any]] = []
+    object_scenes: list[tuple[str, Any]] = []
+
+    with tempfile.TemporaryDirectory(prefix="p2s_text_settle_") as tmp_dir:
+        tmp_path = Path(tmp_dir)
+        for obj in objects:
+            obj_id = str(obj.get("id", ""))
+            obj_name = str(obj.get("name", ""))
+
+            # Validate the metric scale.
+            metric_scale = obj.get("metric_scale")
+            if not isinstance(metric_scale, dict):
+                settled_objects.append(
+                    {
+                        "id": obj_id,
+                        "name": obj_name,
+                        "status": "skipped",
+                        "reason": "missing_metric_scale",
+                    }
+                )
+                continue
+            scale_factor = float(metric_scale.get("scale_factor", 1.0))
+            if not np.isfinite(scale_factor) or scale_factor <= 0.0:
+                settled_objects.append(
+                    {
+                        "id": obj_id,
+                        "name": obj_name,
+                        "status": "skipped",
+                        "reason": "invalid_scale_factor",
+                    }
+                )
+                continue
+
+            # Load the simulation-ready GLB.
+            simready_path = _resolve_generated_path(
+                obj.get("simready_geometry_path") or obj.get("mesh_path"),
+                output_root,
+            )
+            if not simready_path.is_file():
+                settled_objects.append(
+                    {
+                        "id": obj_id,
+                        "name": obj_name,
+                        "status": "skipped",
+                        "reason": "missing_simready_glb",
+                    }
+                )
+                continue
+
+            try:
+                # Load simready (GLB Y-up) → convert to internal Z-up
+                scene_yup = trimesh.load(simready_path, force="scene")
+                scene = _copy_scene_with_transform(scene_yup, y_to_z)
+
+                # Apply real-world scale
+                scale_transform = _scale_transform(scale_factor)
+                scene.apply_transform(scale_transform)
+
+                # Settle the object under gravity.
+                mesh = _scene_to_mesh(scene, trimesh=trimesh)
+                mesh_bounds = np.asarray(mesh.bounds, dtype=np.float64)
+                mesh_z_height = max(float(mesh_bounds[1][2] - mesh_bounds[0][2]), 0.0)
+                bottom_to_xy = _aabb_bottom_to_xy_plane_transform(mesh_bounds)
+                normalized_scene = _copy_scene_with_transform(scene, bottom_to_xy)
+
+                # Export to Y-up GLB for gravity
+                pre_gravity_scene = _copy_scene_with_transform(normalized_scene, z_to_y)
+                pre_gravity_path = tmp_path / f"{obj_id}_pre_gravity.glb"
+                pre_gravity_scene.export(pre_gravity_path)
+                gravity_initial_height = mesh_z_height * 0.1
+
+                gravity_status = "ok"
+                gravity_transform = np.eye(4, dtype=np.float64)
+                gravity_reason = ""
+                try:
+                    gravity_result = sim.run_gravity_simulation(
+                        GravityDropRequest(
+                            glb_path=pre_gravity_path,
+                            max_convex_hull_num=32,
+                            initial_height=gravity_initial_height,
+                        )
+                    )
+                    gravity_transform = _matrix_from_json(
+                        gravity_result.final_pose,
+                        name=f"{obj_id}.gravity_final_pose",
+                    )
+                except Exception:
+                    gravity_status = "failed"
+                    gravity_reason = traceback.format_exc()
+
+                # Apply gravity result (in internal Z-up space)
+                settled_scene = _copy_scene_with_transform(
+                    normalized_scene,
+                    gravity_transform,
+                )
+
+                # Center the bottom of the AABB at the XY origin.
+                settled_mesh = _scene_to_mesh(settled_scene, trimesh=trimesh)
+                settled_bounds = np.asarray(settled_mesh.bounds, dtype=np.float64)
+                settled_xy_center = _xy_aabb_center(settled_bounds)
+                settled_xy_size = _xy_aabb_size(settled_bounds)
+                settled_bottom_z = float(settled_bounds[0, 2])
+
+                centre_transform = np.eye(4, dtype=np.float64)
+                centre_transform[:3, 3] = [
+                    -float(settled_xy_center[0]),
+                    -float(settled_xy_center[1]),
+                    -settled_bottom_z,
+                ]
+                centred_scene = _copy_scene_with_transform(
+                    settled_scene,
+                    centre_transform,
+                )
+
+                # Verify final bounds
+                centred_mesh = _scene_to_mesh(centred_scene, trimesh=trimesh)
+                centred_bounds = np.asarray(centred_mesh.bounds, dtype=np.float64)
+                centred_xy_size = _xy_aabb_size(centred_bounds)
+
+                # Export settled GLB (Z-up → Y-up for GLB output)
+                settled_glb_path = output_dir / f"{obj_id}_settled.glb"
+                _copy_scene_with_transform(centred_scene, z_to_y).export(
+                    settled_glb_path
+                )
+
+                item = {
+                    "id": obj_id,
+                    "name": obj_name,
+                    "status": "ok",
+                    "gravity_status": gravity_status,
+                    "gravity_reason": gravity_reason,
+                    "scale_factor": scale_factor,
+                    "settled_glb_path": relative_path(
+                        str(settled_glb_path),
+                        output_root,
+                    ),
+                    "settled_xy_size_m": centred_xy_size.tolist(),
+                    "settled_xy_size_cm": (centred_xy_size * 100.0).tolist(),
+                    "settled_bounds_m": centred_bounds.tolist(),
+                    "mesh_z_height_m": mesh_z_height,
+                    "bottom_to_xy_transform": bottom_to_xy.tolist(),
+                    "gravity_transform": gravity_transform.tolist(),
+                    "centre_transform": centre_transform.tolist(),
+                    "composed_settle_transform": (
+                        centre_transform
+                        @ gravity_transform
+                        @ bottom_to_xy
+                        @ scale_transform
+                        @ y_to_z
+                    ).tolist(),
+                }
+                settled_objects.append(item)
+                object_scenes.append((obj_id, centred_scene))
+
+            except Exception:
+                settled_objects.append(
+                    {
+                        "id": obj_id,
+                        "name": obj_name,
+                        "status": "failed",
+                        "reason": traceback.format_exc(),
+                    }
+                )
+
+    # Optimize the spatial layout.
+    layout_result = None
+    if object_scenes:
+        xy_sizes = {
+            oid: np.asarray(
+                _xy_aabb_size(_scene_to_mesh(scene, trimesh=trimesh).bounds),
+                dtype=np.float64,
+            )
+            for oid, scene in object_scenes
+        }
+        relations = list(spatial_relations or [])
+        layout_result = _layout_text_objects_grid(
+            object_ids=[oid for oid, _ in object_scenes],
+            xy_sizes=xy_sizes,
+            spatial_relations=relations,
+            table_constraints=list(table_constraints or []),
+        )
+        target_centers = layout_result["centers"]
+        initial_centers = layout_result.get("initial_centers", {})
+
+        # Render footprint layout diagnostics.
+        debug_dir = output_dir / "debug"
+        debug_dir.mkdir(parents=True, exist_ok=True)
+        debug_object_ids = [oid for oid, _ in object_scenes]
+        debug_before_centers = {
+            oid: np.zeros(2, dtype=np.float64) for oid in debug_object_ids
+        }
+        debug_renders = (
+            (
+                "footprint_layout_xy_before.png",
+                "Before Layout (all at origin)",
+                debug_before_centers,
+            ),
+            (
+                "footprint_layout_xy_grid_init.png",
+                "After Grid Initialisation",
+                initial_centers,
+            ),
+            (
+                "footprint_layout_xy_after.png",
+                "After SA Optimisation",
+                target_centers,
+            ),
+        )
+        for filename, title, debug_centers in debug_renders:
+            try:
+                MatplotlibManager(figsize=(8, 8), dpi=180).render_footprint_layout(
+                    RenderFootprintLayoutRequest(
+                        object_ids=debug_object_ids,
+                        centers=debug_centers,
+                        xy_sizes=xy_sizes,
+                        output_path=debug_dir / filename,
+                        title=title,
+                    )
+                )
+            except Exception as exc:
+                log_warning(
+                    f"text clutter debug render failed file={filename} error={exc}"
+                )
+
+        # Apply layout positions to centred scenes
+        laid_out_scenes: list[tuple[str, Any]] = []
+        for oid, scene in object_scenes:
+            target_xy = target_centers[oid]
+            settled_mesh = _scene_to_mesh(scene, trimesh=trimesh)
+            settled_bounds = np.asarray(settled_mesh.bounds, dtype=np.float64)
+            current_xy = _xy_aabb_center(settled_bounds)
+            placement = np.eye(4, dtype=np.float64)
+            placement[:3, 3] = [
+                float(target_xy[0] - current_xy[0]),
+                float(target_xy[1] - current_xy[1]),
+                0.0,
+            ]
+            laid_out_scene = _copy_scene_with_transform(scene, placement)
+            laid_out_scenes.append((oid, laid_out_scene))
+
+            # Export laid-out GLB (replaces the origin-centred one)
+            laid_out_glb_path = output_dir / f"{oid}_laid_out.glb"
+            _copy_scene_with_transform(laid_out_scene, z_to_y).export(laid_out_glb_path)
+
+            # Update per-object metadata with layout position
+            for item in settled_objects:
+                if item.get("id") == oid:
+                    item["layout_target_xy"] = target_xy.tolist()
+                    item["layout_placement_transform"] = placement.tolist()
+                    item["laid_out_glb_path"] = relative_path(
+                        str(laid_out_glb_path), output_root
+                    )
+                    laid_out_bounds = np.asarray(
+                        _scene_to_mesh(laid_out_scene, trimesh=trimesh).bounds,
+                        dtype=np.float64,
+                    )
+                    item["laid_out_xy_size_cm"] = (
+                        _xy_aabb_size(laid_out_bounds) * 100.0
+                    ).tolist()
+                    break
+
+        object_scenes = laid_out_scenes
+
+    clutter_2d_aabb_cm = _object_scenes_xy_aabb_manifest(
+        object_scenes=object_scenes,
+        trimesh=trimesh,
+        unit_scale=100.0,
+        unit="cm",
+    )
+
+    debug_manifest = {
+        "status": "ok",
+        "output_dir": relative_path(str(output_dir), output_root),
+        "object_count": len(objects),
+        "settled_count": len(object_scenes),
+        "clutter_2d_aabb_cm": clutter_2d_aabb_cm,
+        "debug_image_before_path": (
+            relative_path(
+                str(debug_dir / "footprint_layout_xy_before.png"),
+                output_root,
+            )
+            if object_scenes
+            else ""
+        ),
+        "debug_image_grid_init_path": (
+            relative_path(
+                str(debug_dir / "footprint_layout_xy_grid_init.png"),
+                output_root,
+            )
+            if object_scenes
+            else ""
+        ),
+        "debug_image_after_path": (
+            relative_path(
+                str(debug_dir / "footprint_layout_xy_after.png"),
+                output_root,
+            )
+            if object_scenes
+            else ""
+        ),
+        "layout_optimization": layout_result["metadata"] if layout_result else None,
+        "objects": settled_objects,
+    }
+    debug_manifest_path = output_dir / "debug" / "settle_diagnostics.json"
+    write_json(debug_manifest_path, debug_manifest)
+
+    # Keep workflow state limited to the contract consumed by table fitting.
+    workflow_objects = [
+        {
+            key: item[key]
+            for key in (
+                "id",
+                "name",
+                "status",
+                "reason",
+                "settled_glb_path",
+                "laid_out_glb_path",
+            )
+            if key in item
+        }
+        for item in settled_objects
+    ]
+    return {
+        "status": "ok",
+        "clutter_2d_aabb_cm": clutter_2d_aabb_cm,
+        "objects": workflow_objects,
+        "debug_manifest_path": relative_path(str(debug_manifest_path), output_root),
+    }
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Resolve *value* to an absolute path, anchoring relative ones at *output_root*."""
+    candidate = Path(str(value or "")).expanduser()
+    anchored = candidate if candidate.is_absolute() else output_root / candidate
+    return anchored.resolve()
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/servers/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/servers/__init__.py
new file mode 100644
index 00000000..e50272ef
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/servers/__init__.py
@@ -0,0 +1,16 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+"""External servers, ignored by git, for testing or demo purposes."""
\ No newline at end of file
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/__init__.py
new file mode 100644
index 00000000..015c4151
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/__init__.py
@@ -0,0 +1,19 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/gym_export.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/gym_export.py
new file mode 100644
index 00000000..d26a1484
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/gym_export.py
@@ -0,0 +1,450 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+import math
+import shutil
+import time
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    STEP_RESULT_FILENAME,
+    UNIFIED_SCENE_GEN_STEP,
+)
+
+__all__ = ["export_gym_config"]
+
+_DEFAULT_OBJECT_ATTRS: dict[str, Any] = {  # copied into each rigid object's "attrs"
+    "mass": 0.01,
+    "contact_offset": 0.003,
+    "rest_offset": 0.001,
+    "restitution": 0.01,
+    "max_depenetration_velocity": 10.0,
+    "min_position_iters": 32,
+    "min_velocity_iters": 8,
+}
+
+_DEFAULT_TABLE_ATTRS: dict[str, Any] = {  # copied into the table entry's "attrs"
+    "mass": 10.0,
+    "static_friction": 0.95,
+    "dynamic_friction": 0.9,
+    "restitution": 0.01,
+}
+
+_DEFAULT_MAX_CONVEX_HULL_NUM = 32  # "max_convex_hull_num" of every rigid object
+
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+
+def _resolve_path(value: str | None, output_root: Path) -> Path:
+    """Resolve *value* against *output_root*; ``None``/empty yields *output_root*."""
+    # ``value or ""`` keeps a JSON null from raising TypeError inside Path().
+    path = Path(str(value or "")).expanduser()
+    return (path if path.is_absolute() else output_root / path).resolve()
+
+
+def _read_json(path: Path) -> dict[str, Any]:
+    """Load *path* as UTF-8 JSON; raise ``ValueError`` unless the root is an object."""
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    if isinstance(payload, dict):
+        return payload
+    raise ValueError(f"Expected JSON object at {path}")
+
+
+def _matrix_to_euler_xyz_deg(matrix: list[list[float]]) -> list[float]:
+    """Return XYZ Euler angles (degrees) of a 3×3 or 4×4 rotation matrix."""
+    rot = np.asarray(matrix, dtype=np.float64)[:3, :3]
+    r00, r10, r20 = float(rot[0, 0]), float(rot[1, 0]), float(rot[2, 0])
+    col0_xy_norm = math.sqrt(r00**2 + r10**2)
+    pitch = math.atan2(-r20, col0_xy_norm)
+    if col0_xy_norm > 1e-6:
+        roll = math.atan2(float(rot[2, 1]), float(rot[2, 2]))
+        yaw = math.atan2(r10, r00)
+    else:
+        # Degenerate case: yaw is pinned to zero and roll is read from row 1.
+        roll = math.atan2(-float(rot[1, 2]), float(rot[1, 1]))
+        yaw = 0.0
+    return [math.degrees(angle) for angle in (roll, pitch, yaw)]
+
+
+def _glb_to_sim_rotation() -> np.ndarray:
+    """Build the 3×3 basis change from GLB Y-up axes to sim Z-up axes."""
+    # Each row is a sim axis expressed in GLB components:
+    #   sim X = +GLB X
+    #   sim Y = -GLB Z,  sim Z = +GLB Y
+    basis = np.zeros((3, 3), dtype=np.float64)
+    basis[0, 0] = 1.0
+    basis[1, 2] = -1.0
+    basis[2, 1] = 1.0
+    return basis
+
+
+def _glb_rotation_to_sim(rotation_matrix: list[list[float]]) -> list[list[float]]:
+    """Re-express a GLB-space rotation in sim axes via the conjugation B·R·Bᵀ."""
+    glb_rot = np.asarray(rotation_matrix, dtype=np.float64)
+    # A homogeneous 4×4 input contributes only its upper-left rotation block.
+    glb_rot = glb_rot[:3, :3] if glb_rot.shape == (4, 4) else glb_rot
+    to_sim = _glb_to_sim_rotation()
+    return (to_sim @ glb_rot @ to_sim.T).tolist()
+
+
+def _glb_scale_to_sim(scale: Sequence[float]) -> list[float]:
+    """Reorder a GLB ``(x, y, z)`` scale triple into sim order ``(x, z, y)``."""
+    components = tuple(float(part) for part in scale)
+    if len(components) != 3:
+        raise ValueError("scale must have three components")
+    return [components[axis] for axis in (0, 2, 1)]
+
+
+def _glb_max_z(glb_path: Path) -> float:
+    """Return the top of the mesh: the maximum GLB Y, which is sim Z."""
+    import trimesh
+
+    loaded = trimesh.load(glb_path, force="scene")
+    if isinstance(loaded, trimesh.Trimesh):
+        merged = loaded
+    else:
+        # Flatten the scene graph; concatenate by hand when ``dump`` hands
+        # back a collection of geometries instead of a single mesh.
+        merged = loaded.dump(concatenate=True)
+        if not isinstance(merged, trimesh.Trimesh):
+            parts = [g for g in merged if isinstance(g, trimesh.Trimesh)]
+            merged = trimesh.util.concatenate(parts)
+    upper_corner = np.asarray(merged.bounds, dtype=np.float64)[1]
+    # Row 1 of ``bounds`` holds the per-axis maxima; component 1 is Y.
+    return float(upper_corner[1])
+
+
+def _rotated_aabb_offsets(
+    glb_path: Path,
+    rotation_matrix: list[list[float]] | None,
+    scale: float | Sequence[float] = 1.0,
+) -> tuple[float, float, float]:
+    """Compute the AABB shift caused by scale + rotation alone.
+
+    Loads the simready GLB, applies *scale* and then *rotation_matrix*
+    about the local origin, and returns the resulting AABB's horizontal
+    centre and bottom as sim-axis values ``(x, y, z)``.  The caller
+    subtracts these offsets from the fitted AABB bottom-centre to recover
+    the world-space position of the simready local origin (``init_pos``).
+    *scale* may be a scalar or any 3-component array-like, including a
+    NumPy array; other shapes raise ``ValueError``.
+    """
+    import trimesh
+
+    scene = trimesh.load(glb_path, force="scene")
+    if isinstance(scene, trimesh.Trimesh):
+        mesh = scene
+    else:
+        dumped = scene.dump(concatenate=True)
+        mesh = (
+            dumped
+            if isinstance(dumped, trimesh.Trimesh)
+            else trimesh.util.concatenate(
+                [m for m in dumped if isinstance(m, trimesh.Trimesh)]
+            )
+        )
+    # Normalising through NumPy accepts lists, tuples and ndarrays alike; a
+    # 0-d result is a uniform scale and broadcasts across all three axes.
+    scale_array = np.asarray(scale, dtype=np.float64)
+    if scale_array.ndim != 0 and scale_array.shape != (3,):
+        raise ValueError("scale must be a scalar or a 3-vector")
+    verts = np.asarray(mesh.vertices, dtype=np.float64) * scale_array
+    if verts.size == 0:
+        raise ValueError(f"mesh has no vertices: {glb_path}")
+    if rotation_matrix is not None:
+        rot = np.asarray(rotation_matrix, dtype=np.float64)
+        if rot.shape == (4, 4):
+            rot = rot[:3, :3]
+        verts = (rot @ verts.T).T
+    # Per-axis extremes of the transformed vertices, still in GLB axes.
+    lo, hi = verts.min(axis=0), verts.max(axis=0)
+    return (
+        float(0.5 * (lo[0] + hi[0])),  # AABB centre X → sim X
+        float(-0.5 * (lo[2] + hi[2])),  # -centre Z → sim Y
+        float(lo[1]),  # min Y → sim Z
+    )
+
+
+# ---------------------------------------------------------------------------
+# consolidated object manifest
+# ---------------------------------------------------------------------------
+
+
+def _build_object_manifest(
+    output_root: Path,
+    step_result: dict[str, Any],
+    table_fit_manifest: dict[str, Any],
+    aligned_by_id: dict[str, dict[str, Any]],
+) -> dict[str, Any]:
+    """Merge world_bc, rotation, scale into one per-object record.
+
+    Returns a dict keyed by object id, each value containing everything
+    needed to compute ``init_pos`` / ``init_rot`` / ``body_scale``.
+    Objects whose simready GLB is missing are omitted; they, and table-fit
+    ids absent from *step_result*, are reported with a printed warning.
+    """
+    # Skip non-dict entries so one malformed record cannot abort the export.
+    objects_info = [
+        obj for obj in (step_result.get("objects") or []) if isinstance(obj, dict)
+    ]
+
+    # index world_aabb_bottom_center from table-fit manifest
+    world_bc_by_id: dict[str, list[float]] = {}
+    for e in table_fit_manifest.get("objects") or []:
+        eid = str(e.get("id", "")) if isinstance(e, dict) else ""
+        wbc = e.get("world_aabb_bottom_center") if isinstance(e, dict) else None
+        if eid and isinstance(wbc, list) and len(wbc) == 3:
+            world_bc_by_id[eid] = [float(v) for v in wbc]
+
+    consolidated: dict[str, Any] = {}
+    known_ids: set[str] = set()
+    skipped_no_glb: list[str] = []
+    for obj in objects_info:
+        oid = str(obj.get("id", ""))
+        if not oid:
+            continue
+        known_ids.add(oid)
+
+        source = obj.get("simready_geometry_path") or obj.get("mesh_path")
+        simready_path = _resolve_path(source or "", output_root)
+        if not simready_path.is_file():
+            skipped_no_glb.append(oid)
+            continue
+
+        description = str(obj.get("description") or obj.get("name") or "").strip()
+        ms = obj.get("metric_scale")
+        scale_factor = 1.0
+        if isinstance(ms, dict):
+            scale_factor = float(ms.get("scale_factor", 1.0))
+
+        aligned = aligned_by_id.get(oid)
+        rot_matrix: list[list[float]] | None = None
+        transform_scale: list[float] | None = None
+        if aligned:
+            raw = aligned.get("rotation_matrix")
+            if raw and isinstance(raw, list):
+                rot_matrix = raw
+            raw_scale = aligned.get("scale")
+            if isinstance(raw_scale, list) and len(raw_scale) == 3:
+                transform_scale = [float(v) for v in raw_scale]
+
+        consolidated[oid] = {
+            "id": oid,
+            "description": description,
+            "simready_path": simready_path,
+            "scale_factor": scale_factor,
+            "transform_scale": transform_scale,
+            "rotation_matrix": rot_matrix,
+            "world_aabb_bottom_center": world_bc_by_id.get(oid),
+        }
+
+    if skipped_no_glb:
+        print(
+            "  [WARN] object(s) skipped (simready GLB not found): "
+            + ", ".join(skipped_no_glb)
+        )
+    # Compare against every id seen in step_result (not only the exported
+    # ones) so an object already reported above is not mislabelled here.
+    extra_in_manifest = set(world_bc_by_id) - known_ids
+    if extra_in_manifest:
+        print(
+            "  [WARN] object(s) in table-fit manifest but not in step_result: "
+            + ", ".join(sorted(extra_in_manifest))
+        )
+
+    return consolidated
+
+
+# ---------------------------------------------------------------------------
+# main export
+# ---------------------------------------------------------------------------
+
+
+def export_gym_config(
+    output_root: Path,
+    *,
+    export_dir: Path | None = None,
+) -> Path:
+    """Export the unified-scene-gen result as a gym_config.json bundle.
+
+    Uses **simready** GLBs — transforms are written explicitly as
+    ``body_scale``, ``init_pos``, and ``init_rot``.  The bundle is written
+    to *export_dir* (default ``output_root / "gym_export"``) and the path
+    of the resulting config file is returned.  Raises ``FileNotFoundError``
+    when the table-fit manifest or the table GLB cannot be found.
+    """
+    output_root = output_root.expanduser().resolve()
+    if export_dir is None:
+        export_dir = output_root / "gym_export"
+    else:
+        export_dir = export_dir.expanduser().resolve()
+    export_dir.mkdir(parents=True, exist_ok=True)
+
+    # ── data sources ────────────────────────────────────────────────────
+    step_dir = output_root / UNIFIED_SCENE_GEN_STEP
+    step_result = _read_json(step_dir / STEP_RESULT_FILENAME)
+    table_fit = step_result.get("table_fit_to_clutter") or {}
+    # A missing manifest_path would resolve to output_root itself; fail with a
+    # clear message rather than an obscure error from opening a directory.
+    manifest_path = _resolve_path(table_fit.get("manifest_path") or "", output_root)
+    if not manifest_path.is_file():
+        raise FileNotFoundError(f"Table-fit manifest not found: {manifest_path}")
+    table_fit_manifest = _read_json(manifest_path)
+
+    aligned_by_id: dict[str, dict[str, Any]] = {}
+    aligned_manifest_path = step_dir / "glb_gen" / "simready_to_aligned_manifest.json"
+    if aligned_manifest_path.is_file():
+        for item in _read_json(aligned_manifest_path).get("items", []) or []:
+            if isinstance(item, dict) and item.get("id"):
+                aligned_by_id[str(item["id"])] = item
+
+    # ── consolidated per-object manifest ─────────────────────────────────
+    object_manifest = _build_object_manifest(
+        output_root, step_result, table_fit_manifest, aligned_by_id
+    )
+
+    # ── table ────────────────────────────────────────────────────────────
+    table_info = step_result.get("table") or {}
+    # Trailing ``or ""`` keeps a JSON null from being exported as "None".
+    table_desc = str(
+        table_info.get("complete_table_description")
+        or table_info.get("description")
+        or ""
+    ).strip()
+
+    mesh_assets_dir = export_dir / "mesh_assets"
+    mesh_assets_dir.mkdir(parents=True, exist_ok=True)
+
+    table_simready = _resolve_path(
+        table_info.get("simready_geometry_path")
+        or table_info.get("mesh_path")
+        or "",
+        output_root,
+    )
+    if not table_simready.is_file():
+        raise FileNotFoundError(f"Table simready GLB not found: {table_simready}")
+    table_dst = mesh_assets_dir / "table" / "table_0.glb"
+    table_dst.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(table_simready, table_dst)
+
+    table_surface_z = _glb_max_z(table_simready)
+
+    uniform_scale = 1.0
+    ts = table_fit_manifest.get("table_xy_scale")
+    if isinstance(ts, dict):
+        uniform_scale = float(ts.get("uniform_scale", 1.0))
+
+    # ── objects ──────────────────────────────────────────────────────────
+    rigid_objects: list[dict[str, Any]] = []
+
+    total = len(object_manifest)
+    for idx, (oid, om) in enumerate(object_manifest.items()):
+        # Copy simready GLB
+        safe_name = oid.replace("interact_", "").strip("_") or "object"
+        obj_dir = mesh_assets_dir / safe_name / oid
+        obj_dir.mkdir(parents=True, exist_ok=True)
+        object_dst = obj_dir / f"{oid}.glb"
+        shutil.copy2(om["simready_path"], object_dst)
+
+        # body_scale.  Image-scene alignment may contain a full simready→aligned
+        # scale; text-scene layout only has the per-object metric scale.
+        sf = om["scale_factor"]
+        scale_glb = om.get("transform_scale") or [sf, sf, sf]
+        body_scale = _glb_scale_to_sim(scale_glb)
+
+        # init_rot
+        rot_glb = om["rotation_matrix"]
+        init_rot: list[float] = [0.0, 0.0, 0.0]
+        if rot_glb is not None:
+            init_rot = _matrix_to_euler_xyz_deg(_glb_rotation_to_sim(rot_glb))
+
+        # init_pos = world_bc - rotated_aabb_offset
+        ro = _rotated_aabb_offsets(om["simready_path"], rot_glb, scale_glb)
+        wbc = om["world_aabb_bottom_center"]
+        if wbc is not None:
+            init_pos = [wbc[0] - ro[0], wbc[1] - ro[1], wbc[2] - ro[2]]
+        else:
+            # No fitted position: rest the object on the table top at the origin.
+            init_pos = [-ro[0], -ro[1], table_surface_z - ro[2]]
+
+        rigid_objects.append(
+            {
+                "uid": oid,
+                "description": om["description"],
+                "shape": {
+                    "shape_type": "Mesh",
+                    "fpath": str(object_dst.relative_to(export_dir)),
+                    "compute_uv": False,
+                },
+                "attrs": dict(_DEFAULT_OBJECT_ATTRS),
+                "body_type": "dynamic",
+                "init_pos": init_pos,
+                "init_rot": init_rot,
+                "body_scale": body_scale,
+                "max_convex_hull_num": _DEFAULT_MAX_CONVEX_HULL_NUM,
+            }
+        )
+        wbc_flag = "wbc" if wbc is not None else "fallback"
+        print(
+            f"  [{idx+1}/{total}] [{oid}] {om['description']}"
+            f"  pos={init_pos}  rot={init_rot}  scale={body_scale}  src={wbc_flag}"
+        )
+
+    # ── write gym config ─────────────────────────────────────────────────
+    config = {
+        "id": f"Prompt2Scene-{int(time.time() * 1000)}-v0",
+        "max_episodes": 10,
+        "max_episode_steps": 300,
+        "env": {"events": {}, "observations": {}, "dataset": {}},
+        "robot": {},
+        "sensor": [],
+        "light": {},
+        "background": [
+            {
+                "uid": "table",
+                "description": table_desc,
+                "shape": {
+                    "shape_type": "Mesh",
+                    "fpath": str(table_dst.relative_to(export_dir)),
+                    "compute_uv": False,
+                },
+                "attrs": dict(_DEFAULT_TABLE_ATTRS),
+                "body_scale": [uniform_scale, uniform_scale, 1.0],
+                "body_type": "kinematic",
+                "init_pos": [0.0, 0.0, 0.0],
+                "init_rot": [0.0, 0.0, 0.0],
+            }
+        ],
+        "rigid_object": rigid_objects,
+    }
+
+    config_path = export_dir / "gym_config.json"
+    config_path.write_text(
+        json.dumps(config, indent=4, ensure_ascii=False) + "\n",
+        encoding="utf-8",
+    )
+
+    return config_path
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_scene_asset_generation.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_scene_asset_generation.py
new file mode 100644
index 00000000..9d3e42f1
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_scene_asset_generation.py
@@ -0,0 +1,638 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+import shutil
+import traceback
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.utils.log import log_info, log_warning
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+    decode_rle_mask,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager import (
+    GeometryGenerationManager,
+    RgbaImageToGeometryRequest,
+    RgbaImagesToGeometriesRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager import (
+    ImageGenerationManager,
+    TextToAssetImageRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager import (
+    AssetImageToRgbaRequest,
+    ImageSegmentationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_scene_manager import (
+    _export_support_aligned_layout_glbs,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager import (
+    MakeAssetSimreadyRequest,
+    MakeTableSimreadyRequest,
+    SimreadyManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.metric_scale_manager import (
+    METRIC_SCALE_ENABLED,
+    EstimateMetricScalesRequest,
+    MetricScaleManager,
+    MetricScaleObjectInput,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.scene_geometry import (
+    _compose_sam3d_multi_object_transform,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_scene_manager import (
+    _write_multi_object_layout_manifests,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_scene_manager.prompts import (
+    build_image_metric_scale_messages,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_scene_manager.schemas import (
+    IMAGE_METRIC_SCALE_JSON_SCHEMA,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import (
+    relative_path,
+)
+
+__all__ = ["generate_image_scene_assets"]
+
+UNIFIED_SCENE_STEP = "unified_scene"
+
+
+def generate_image_scene_assets(
+    object_specs: list[dict[str, Any]],
+    table_spec: dict[str, Any],
+    spatial_relations: list[dict[str, Any]],
+    segments_data: dict[str, Any],
+    image_gen_dir: Path,
+    glb_gen_dir: Path,
+    debug_dir: Path,
+    output_root: Path,
+    llm: Any | None = None,
+) -> dict[str, Any]:
+    """Run layout-aware table/support and object generation from image masks."""
+    log_info(f"image object layout generation started count={len(object_specs)}")
+    status = "ok"
+    failure_reason = ""
+    original_image_path = str(segments_data.get("image_path", ""))
+    segment_by_id: dict[str, dict[str, Any]] = {
+        str(seg["asset_id"]): seg
+        for seg in segments_data.get("asset_segments", [])
+        if seg.get("asset_id")
+    }
+    table_segment = segments_data.get("table_segment")
+    if not isinstance(table_segment, dict):
+        table_segment = None
+    debug_subdir = debug_dir / "multi_object_masks"
+    masks_dir = debug_subdir / "masks"
+    raw_download_dir = glb_gen_dir / "raw_downloads"
+    simready_dir = glb_gen_dir / "multi_object_layouts_simready"
+    aligned_dir = glb_gen_dir / "multi_object_layouts_aligned"
+    masks_dir.mkdir(parents=True, exist_ok=True)
+    raw_download_dir.mkdir(parents=True, exist_ok=True)
+    simready_dir.mkdir(parents=True, exist_ok=True)
+    aligned_dir.mkdir(parents=True, exist_ok=True)
+
+    requested_items: list[dict[str, Any]] = []
+    mask_paths: list[Path] = []
+
+    table_id = str(table_spec.get("id", "table")).strip() or "table"
+    table_name = str(table_spec.get("name", "table")).strip() or "table"
+    is_complete_visible_table = bool(
+        table_spec.get("is_complete_visible_table", False)
+    )
+    skipped_table: dict[str, Any] | None = None
+    if table_segment is None:
+        skipped_table = {
+            "id": table_id,
+            "name": table_name,
+            "reason": "missing_table_segment",
+        }
+    else:
+        table_mask_rle = table_segment.get("mask_rle")
+        if table_mask_rle is None:
+            skipped_table = {
+                "id": table_id,
+                "name": table_name,
+                "reason": "missing_table_mask_rle",
+            }
+        else:
+            mask_path = masks_dir / f"{len(requested_items):04d}_{table_id}_mask.png"
+            decode_rle_mask(table_mask_rle).save(mask_path)
+            mask_paths.append(mask_path)
+            requested_items.append(
+                {
+                    "id": table_id,
+                    "name": table_name,
+                    "kind": "table",
+                    "mask_path": str(mask_path),
+                }
+            )
+
+    for obj_spec in object_specs:
+        obj_id = str(obj_spec.get("id", "")).strip()
+        obj_name = str(obj_spec.get("name", "")).strip()
+        if not obj_id:
+            continue
+        segment = segment_by_id.get(obj_id)
+        if segment is None:
+            continue
+        mask_rle = segment.get("mask_rle")
+        if mask_rle is None:
+            continue
+
+        mask_path = masks_dir / f"{len(requested_items):04d}_{obj_id}_mask.png"
+        decode_rle_mask(mask_rle).save(mask_path)
+        mask_paths.append(mask_path)
+        requested_items.append(
+            {
+                "id": obj_id,
+                "name": obj_name,
+                "description": str(obj_spec.get("description", "")),
+                "kind": "object",
+                "mask_path": str(mask_path),
+            }
+        )
+
+    generated_objects: list[dict[str, Any]] = []
+    generated_table: dict[str, Any] | None = None
+    image_manager = ImageGenerationManager()
+    segmentation_manager = ImageSegmentationManager()
+    geometry_manager = GeometryGenerationManager()
+    simready_manager = SimreadyManager()
+    try:
+        if skipped_table is not None:
+            raise ValueError(
+                "No valid table/support mask found for image multi-object "
+                f"layout generation: {skipped_table['reason']}"
+            )
+        if not mask_paths:
+            raise ValueError(
+                "No valid masks found for image multi-object layout generation."
+            )
+
+        result = geometry_manager.convert_rgba_images_to_geometries(
+            RgbaImagesToGeometriesRequest(
+                image_path=Path(original_image_path),
+                mask_paths=mask_paths,
+                output_dir=raw_download_dir,
+            )
+        )
+        if len(result.objects) != len(requested_items):
+            raise RuntimeError(
+                "Multi-object SAM3D result count mismatch: "
+                f"requested {len(requested_items)}, got {len(result.objects)}"
+            )
+        for requested, generated in zip(requested_items, result.objects):
+            expected_sam3d_name = Path(requested["mask_path"]).stem
+            if generated.name != expected_sam3d_name:
+                raise RuntimeError(
+                    "Multi-object SAM3D result order mismatch: "
+                    f"expected {expected_sam3d_name!r}, got {generated.name!r}"
+                )
+            downloaded_raw_path = Path(generated.geometry_path).expanduser().resolve()
+            raw_geometry_path = str(downloaded_raw_path)
+            status_parts: list[str] = []
+            transform_matrix: list[list[float]] = []
+            try:
+                transform = _compose_sam3d_multi_object_transform(
+                    rotation_quaternion_wxyz=generated.rotation_quaternion_wxyz,
+                    translation=generated.translation,
+                    scale=generated.scale,
+                )
+                transform_matrix = transform.tolist()
+            except Exception:
+                status_parts.append(
+                    f"transform_matrix_failed: {traceback.format_exc()}"
+                )
+
+            simready_geometry_path = ""
+            raw_to_simready_glb_matrix: list[list[float]] = []
+            metric_scale: dict[str, Any] | None = None
+            try:
+                if requested["kind"] == "table":
+                    if is_complete_visible_table:
+                        table_result = simready_manager.make_table_simready(
+                            MakeTableSimreadyRequest(
+                                input_path=Path(raw_geometry_path),
+                                output_path=simready_dir
+                                / f"{requested['id']}_simready.glb",
+                            )
+                        )
+                        simready_geometry_path = str(table_result.output_path)
+                        raw_to_simready_glb_matrix = table_result.transform_matrix
+                else:
+                    asset_result = simready_manager.make_asset_simready(
+                        MakeAssetSimreadyRequest(
+                            input_path=Path(raw_geometry_path),
+                            output_path=simready_dir
+                            / f"{requested['id']}_simready.glb",
+                        )
+                    )
+                    simready_geometry_path = str(asset_result.output_path)
+                    raw_to_simready_glb_matrix = asset_result.transform_matrix
+            except Exception:
+                status_parts.append(f"simready_failed: {traceback.format_exc()}")
+            item_status = "ok" if not status_parts else "; ".join(status_parts)
+            generated_item = {
+                "id": requested["id"],
+                "name": requested["name"],
+                "kind": requested["kind"],
+                "description": str(table_spec.get("description", ""))
+                if requested["kind"] == "table"
+                else str(requested.get("description", "")),
+                "complete_table_description": str(
+                    table_spec.get("complete_table_description")
+                    or table_spec.get("description", "")
+                ).strip()
+                if requested["kind"] == "table"
+                else "",
+                "is_complete_visible_table": is_complete_visible_table
+                if requested["kind"] == "table"
+                else False,
+                "status": item_status,
+                "mask_path": relative_path(requested["mask_path"], output_root),
+                "raw_geometry_path": relative_path(raw_geometry_path, output_root),
+                "simready_geometry_path": relative_path(
+                    simready_geometry_path, output_root
+                )
+                if simready_geometry_path
+                else "",
+                "mesh_path": relative_path(simready_geometry_path, output_root)
+                if simready_geometry_path
+                else "",
+                "sam3d_name": generated.name,
+                "downloaded_raw_geometry_path": relative_path(
+                    str(downloaded_raw_path), output_root
+                ),
+                "rotation_quaternion_wxyz": generated.rotation_quaternion_wxyz,
+                "translation": generated.translation,
+                "scale": generated.scale,
+                "transform_matrix": transform_matrix,
+                "raw_to_simready_glb_matrix": raw_to_simready_glb_matrix,
+                "metric_scale": metric_scale,
+            }
+            if requested["kind"] == "table":
+                support_reference_path = raw_download_dir / "support_surface_raw.glb"
+                table_raw_path = raw_download_dir / "table_raw.glb"
+                shutil.copy2(downloaded_raw_path, support_reference_path)
+                if is_complete_visible_table:
+                    shutil.copy2(downloaded_raw_path, table_raw_path)
+                    generated_item["raw_geometry_path"] = relative_path(
+                        str(table_raw_path),
+                        output_root,
+                    )
+                generated_item["support_reference_geometry_path"] = relative_path(
+                    str(support_reference_path),
+                    output_root,
+                )
+                generated_item["support_reference_transform_matrix"] = transform_matrix
+                generated_item["support_normal_source"] = "segmented_table"
+                generated_item["table_asset_source"] = "segmented_table"
+                if not is_complete_visible_table:
+                    # Replace partial image table with description-generated table.
+                    incomplete_table_id = str(
+                        generated_item.get("id")
+                        or table_spec.get("id")
+                        or "table"
+                    )
+                    incomplete_table_desc = str(
+                        table_spec.get("complete_table_description")
+                        or table_spec.get("description", "")
+                    ).strip()
+                    incomplete_debug_dir = (
+                        debug_dir / incomplete_table_id / "description_generated"
+                    )
+                    incomplete_debug_dir.mkdir(parents=True, exist_ok=True)
+                    incomplete_raw_download_dir = glb_gen_dir / "raw_downloads"
+                    incomplete_raw_download_dir.mkdir(parents=True, exist_ok=True)
+                    incomplete_raw_image = str(
+                        image_manager.generate_asset_image_from_text(
+                            TextToAssetImageRequest(
+                                prompt=incomplete_table_desc,
+                                output_path=incomplete_debug_dir
+                                / f"{incomplete_table_id}_complete.png",
+                            )
+                        )
+                    )
+                    incomplete_rgba = str(
+                        segmentation_manager.convert_asset_image_to_rgba(
+                            AssetImageToRgbaRequest(
+                                image_path=Path(incomplete_raw_image),
+                                prompt=incomplete_table_desc
+                                if incomplete_table_desc.strip()
+                                else "whole table",
+                                output_path=image_gen_dir
+                                / f"{incomplete_table_id}_complete.png",
+                            )
+                        )
+                    )
+                    incomplete_raw_glb = str(
+                        geometry_manager.convert_rgba_image_to_geometry(
+                            RgbaImageToGeometryRequest(
+                                image_path=Path(incomplete_rgba),
+                                output_path=incomplete_debug_dir
+                                / f"{incomplete_table_id}_complete_raw.glb",
+                            )
+                        )
+                    )
+                    incomplete_table_raw_path = (
+                        incomplete_raw_download_dir / "table_raw.glb"
+                    )
+                    shutil.copy2(incomplete_raw_glb, incomplete_table_raw_path)
+                    incomplete_simready = simready_manager.make_table_simready(
+                        MakeTableSimreadyRequest(
+                            input_path=incomplete_table_raw_path,
+                            output_path=glb_gen_dir
+                            / "multi_object_layouts_simready"
+                            / f"{incomplete_table_id}_simready.glb",
+                        )
+                    )
+                    generated_item.update(
+                        {
+                            "image_path": relative_path(
+                                incomplete_rgba, output_root
+                            ),
+                            "raw_geometry_path": relative_path(
+                                str(incomplete_table_raw_path), output_root
+                            ),
+                            "generated_table_raw_geometry_path": relative_path(
+                                incomplete_raw_glb, output_root
+                            ),
+                            "simready_geometry_path": relative_path(
+                                str(incomplete_simready.output_path),
+                                output_root,
+                            ),
+                            "mesh_path": relative_path(
+                                str(incomplete_simready.output_path),
+                                output_root,
+                            ),
+                            "raw_to_simready_glb_matrix": (
+                                incomplete_simready.transform_matrix
+                            ),
+                            "transform_matrix": np.eye(
+                                4, dtype=np.float64
+                            ).tolist(),
+                            "table_asset_source": "description_generated",
+                            "complete_table_description": incomplete_table_desc,
+                        }
+                    )
+                generated_table = generated_item
+            else:
+                generated_objects.append(generated_item)
+    except Exception as exc:
+        status = "failed"
+        failure_reason = traceback.format_exc()
+        log_warning(f"image object geometry generation failed error={exc}")
+
+    if generated_objects:
+        _estimate_image_scene_metric_scales(
+            objects=generated_objects,
+            bbox_name_image_path=segments_data.get("bbox_name_image_path"),
+            output_dir=glb_gen_dir,
+            output_root=output_root,
+            llm=llm,
+        )
+
+    alignment_result: dict[str, Any] | None = None
+    if generated_table is not None and generated_objects:
+        try:
+            alignment_result = _export_support_aligned_layout_glbs(
+                table=generated_table,
+                objects=generated_objects,
+                spatial_relations=spatial_relations,
+                original_image_path=Path(original_image_path)
+                if original_image_path
+                else None,
+                llm=llm,
+                output_dir=aligned_dir,
+                output_root=output_root,
+            )
+            aligned_object_by_id = {
+                item["id"]: item for item in alignment_result["objects"]
+            }
+            for generated_object in generated_objects:
+                aligned_object = aligned_object_by_id.get(generated_object["id"])
+                if aligned_object is not None:
+                    generated_object["aligned_geometry_path"] = aligned_object[
+                        "aligned_geometry_path"
+                    ]
+        except Exception as exc:
+            status = "failed"
+            failure_reason = traceback.format_exc()
+            log_warning(f"image object alignment failed error={exc}")
+            alignment_result = {
+                "status": "failed",
+                "reason": failure_reason,
+            }
+
+    manifest_paths = _write_multi_object_layout_manifests(
+        glb_gen_dir=glb_gen_dir,
+        output_root=output_root,
+        table=generated_table,
+        objects=generated_objects,
+        alignment=alignment_result,
+    )
+    table_fields = (
+        "id",
+        "name",
+        "status",
+        "is_complete_visible_table",
+        "complete_table_description",
+        "object_coverage_percent",
+        "table_asset_source",
+        "support_normal_source",
+        "image_path",
+        "raw_geometry_path",
+        "support_reference_geometry_path",
+        "generated_table_raw_geometry_path",
+        "transformed_geometry_path",
+        "simready_geometry_path",
+        "aligned_geometry_path",
+        "mesh_path",
+    )
+    object_fields = (
+        "id",
+        "name",
+        "description",
+        "status",
+        "image_path",
+        "mesh_path",
+        "aligned_geometry_path",
+        "metric_scale",
+    )
+    workflow_table = (
+        {key: generated_table[key] for key in table_fields if key in generated_table}
+        if generated_table is not None
+        else None
+    )
+    workflow_objects = [
+        {key: item[key] for key in object_fields if key in item}
+        for item in generated_objects
+    ]
+    if workflow_table is not None and workflow_table.get("status") != "ok":
+        workflow_table["status"] = "failed"
+    for item in workflow_objects:
+        if item.get("status") != "ok":
+            item["status"] = "failed"
+    workflow_alignment = (
+        {
+            key: alignment_result[key]
+            for key in ("status", "final_clutter_2d_aabb_cm")
+            if key in alignment_result
+        }
+        if alignment_result is not None
+        else None
+    )
+    result = {
+        "status": status,
+        "table": workflow_table,
+        "objects": workflow_objects,
+        "alignment": workflow_alignment,
+        "manifests": manifest_paths,
+    }
+    if failure_reason:
+        result["reason"] = failure_reason
+    log_info(
+        "image object layout generation completed "
+        f"status={status} generated={len(generated_objects)}"
+    )
+    return result
+
+
+def _estimate_image_scene_metric_scales(
+    *,
+    objects: list[dict[str, Any]],
+    bbox_name_image_path: Any,
+    output_dir: Path,
+    output_root: Path,
+    llm: Any | None,
+) -> dict[str, Any]:
+    """Estimate metric scales for the objects of an image-origin scene.
+
+    Skipped when metric scaling is disabled, ``llm`` is ``None``, or the
+    bbox-name image is not a file. Exceptions raised during the estimate are
+    captured as ``status="failed"`` with the formatted traceback as ``reason``.
+
+    Args:
+        objects: Generated object records handed to ``MetricScaleManager``.
+        bbox_name_image_path: Bbox-name image, absolute or under ``output_root``.
+        output_dir: Directory for the estimator's ``raw_output_path`` file.
+        output_root: Base directory for resolving relative generated paths.
+        llm: Forwarded to the estimator request; ``None`` skips the step.
+
+    Returns:
+        Summary dict with ``status`` and ``method``, plus ``reason`` unless ok.
+    """
+    method = "image_scene_bbox_name_vlm_candidate_shape_ratio_median_scale"
+    summary: dict[str, Any] = {
+        "status": "skipped",
+        "method": method,
+        "bbox_name_image_path": str(bbox_name_image_path or ""),
+        "objects": [],
+    }
+
+    # Shared skip path: tag the summary and every object with the same reason.
+    def _skip(reason: str) -> dict[str, Any]:
+        summary["reason"] = reason
+        MetricScaleManager.set_for_all_objects(
+            objects=objects, status="skipped", reason=reason, method=method
+        )
+        return summary
+
+    try:
+        if not METRIC_SCALE_ENABLED:
+            return _skip("metric_scale_disabled")
+        if llm is None:
+            return _skip("missing_llm")
+        bbox_image = _resolve_generated_path(bbox_name_image_path, output_root)
+        if not bbox_image.is_file():
+            return _skip("missing_bbox_name_image")
+
+        scale_inputs = _build_metric_scale_inputs(
+            objects=objects, output_root=output_root
+        )
+        prompt_payload = MetricScaleManager.object_prompt_payload(scale_inputs)
+        summary["objects"] = prompt_payload
+        request = EstimateMetricScalesRequest(
+            objects=scale_inputs,
+            messages=build_image_metric_scale_messages(
+                bbox_name_image_path=bbox_image,
+                objects_json=prompt_payload,
+            ),
+            schema=IMAGE_METRIC_SCALE_JSON_SCHEMA,
+            llm=llm,
+            context="Image scene metric scale estimate",
+            method=method,
+            step_name=UNIFIED_SCENE_STEP,
+            raw_output_path=output_dir / "image_metric_scale_raw_model_output.json",
+        )
+        estimates = MetricScaleManager.estimate_metric_scales(request).object_scales
+        MetricScaleManager.apply_to_objects(objects=objects, object_scales=estimates)
+        summary.update(
+            status="ok",
+            object_scales=estimates,
+            unit_note=(
+                "Per-object scale_factor is not baked into simready GLBs. "
+                "Image alignment later computes one clamped global clutter "
+                "scale from these per-object estimates, on top of SAM3D "
+                "per-object layout scale."
+            ),
+        )
+    except Exception:
+        summary.update(status="failed", reason=traceback.format_exc())
+        MetricScaleManager.set_for_all_objects(
+            objects=objects,
+            status="failed",
+            reason="image_scene_metric_scale_failed",
+            method=method,
+        )
+    return summary
+
+
+def _build_metric_scale_inputs(
+    *,
+    objects: list[dict[str, Any]],
+    output_root: Path,
+) -> list[MetricScaleObjectInput]:
+    """Map generated object records to ``MetricScaleObjectInput`` entries."""
+    collected: list[MetricScaleObjectInput] = []
+    for record in objects:
+        # Prefer the simready GLB path; fall back to the generic mesh path.
+        raw_path = record.get("simready_geometry_path") or record.get("mesh_path")
+        glb_path = _resolve_generated_path(raw_path, output_root)
+        # Fail fast: the first record whose resolved mesh is not a file aborts.
+        if not glb_path.is_file():
+            raise FileNotFoundError(f"Simready object GLB not found: {glb_path}")
+        entry = MetricScaleObjectInput(
+            object_id=str(record.get("id", "")),
+            object_name=str(record.get("name", "")),
+            object_description=str(record.get("description", "")),
+            mesh_path=glb_path,
+        )
+        collected.append(entry)
+    return collected
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Absolutize ``value``, anchoring relative paths at ``output_root``."""
+    candidate = Path(str(value or "")).expanduser()  # falsy value -> output_root
+    anchored = candidate if candidate.is_absolute() else output_root / candidate
+    return anchored.resolve()
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/table_fit_scene.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/table_fit_scene.py
new file mode 100644
index 00000000..273f15a6
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/table_fit_scene.py
@@ -0,0 +1,107 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import traceback
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.table_clutter_fit_manager import (
+    fit_table_to_clutter,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info, log_warning
+
+__all__ = ["fit_image_scene_table", "fit_text_scene_table"]
+
+
+def fit_text_scene_table(
+    *,
+    table_result: dict[str, Any],
+    clutter_layout_result: dict[str, Any],
+    output_root: Path,
+    output_dir: Path,
+) -> dict[str, Any]:
+    """Fit the text-scene table to the clutter layout, reporting errors as data.
+
+    On any exception the return value is a ``failed`` dict with the traceback.
+    """
+    try:
+        fitted = fit_table_to_clutter(
+            table_result=table_result,
+            clutter_result=clutter_layout_result,
+            output_root=output_root,
+            output_dir=output_dir,
+            object_coverage_percent=table_result.get("object_coverage_percent"),
+        )
+        log_info(f"text table fit completed status={fitted.get('status')}")
+    except Exception as exc:
+        log_warning(f"text table fit failed error={exc}")
+        return {"status": "failed", "reason": traceback.format_exc()}
+    return fitted
+
+
+def fit_image_scene_table(
+    *,
+    layout_result: dict[str, Any],
+    fallback_table_result: dict[str, Any] | None,
+    output_root: Path,
+    output_dir: Path,
+) -> dict[str, Any]:
+    """Fit the image-scene table, or report why the fit was skipped or failed.
+
+    The table is ``layout_result["table"]`` or else ``fallback_table_result``.
+    The fit is skipped when that table is ``None``, when
+    ``layout_result["objects"]`` is missing or empty, or the alignment is not a dict.
+
+    Returns:
+        The ``fit_table_to_clutter`` result; a ``skipped`` dict with reason
+        ``missing_table_objects_or_alignment``; or, on any exception, a
+        ``failed`` dict whose ``reason`` is the formatted traceback.
+    """
+    table = layout_result.get("table") or fallback_table_result
+    layout_objects = layout_result.get("objects") or []
+    alignment = layout_result.get("alignment")
+    if table is None or not layout_objects or not isinstance(alignment, dict):
+        return {"status": "skipped", "reason": "missing_table_objects_or_alignment"}
+
+    try:
+        clutter_aabb = alignment.get("final_clutter_2d_aabb_cm")
+        aligned_entries = []
+        # Only objects carrying both an id and an aligned GLB join the clutter.
+        for item in layout_objects:
+            if not (item.get("id") and item.get("aligned_geometry_path")):
+                continue
+            aligned_entries.append(
+                {
+                    "id": item["id"],
+                    "status": "ok",
+                    "laid_out_glb_path": item["aligned_geometry_path"],
+                }
+            )
+        clutter = {"clutter_2d_aabb_cm": clutter_aabb, "objects": aligned_entries}
+        fitted = fit_table_to_clutter(
+            table_result=table,
+            clutter_result=clutter,
+            output_root=output_root,
+            output_dir=output_dir,
+            object_coverage_percent=table.get("object_coverage_percent"),
+        )
+        log_info(f"image table fit completed status={fitted.get('status')}")
+    except Exception as exc:
+        log_warning(f"image table fit failed error={exc}")
+        return {"status": "failed", "reason": traceback.format_exc()}
+    return fitted
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_asset_generation.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_asset_generation.py
new file mode 100644
index 00000000..ada7ad78
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_asset_generation.py
@@ -0,0 +1,296 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import shutil
+import traceback
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager import (
+    GeometryGenerationManager,
+    RgbaImageToGeometryRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager import (
+    ImageGenerationManager,
+    TextToAssetImageRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager import (
+    AssetImageToRgbaRequest,
+    ImageSegmentationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager import (
+    MakeAssetSimreadyRequest,
+    MakeTableSimreadyRequest,
+    SimreadyManager,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info, log_warning
+
+__all__ = [
+    "generate_text_object_asset",
+    "generate_text_object_assets",
+    "generate_text_table_asset",
+]
+
+
+def generate_text_object_asset(
+    *,
+    object_spec: dict[str, Any],
+    image_gen_dir: Path,
+    glb_gen_dir: Path,
+    debug_dir: Path,
+) -> dict[str, Any]:
+    """Generate one object asset from a text-origin object spec.
+
+    Chains text-to-image generation, RGBA conversion (one attempt per
+    segmentation prompt until one succeeds), geometry generation and simready
+    conversion. An exception inside that chain is captured in ``status`` as
+    ``"failed: <traceback>"``; paths not produced yet stay empty.
+
+    ``object_spec["class_candidate"]`` is normally a list of class names; a
+    missing or ``None`` value means no candidates and a bare string is treated
+    as a single candidate.
+    """
+    object_id = str(object_spec.get("id", "object"))
+    object_name = str(object_spec.get("name", ""))
+    description = str(object_spec.get("description", ""))
+    # ``dict.get`` defaults only apply to absent keys, so normalize ``None`` too.
+    raw_candidates = object_spec.get("class_candidate") or []
+    if isinstance(raw_candidates, str):
+        raw_candidates = [raw_candidates]
+    class_candidates = [
+        str(candidate).replace("_", " ")
+        for candidate in raw_candidates
+        if isinstance(candidate, str) and candidate.strip()
+    ]
+    status = "ok"
+    image_path = raw_geometry_path = mesh_path = ""
+    raw_to_simready_matrix: list[list[float]] = []
+
+    debug_subdir = debug_dir / object_id
+    debug_subdir.mkdir(parents=True, exist_ok=True)
+    log_info(f"text object generation started id={object_id} name={object_name}")
+
+    image_manager = ImageGenerationManager()
+    segmentation_manager = ImageSegmentationManager()
+    geometry_manager = GeometryGenerationManager()
+    simready_manager = SimreadyManager()
+
+    try:
+        image_request = TextToAssetImageRequest(
+            prompt=f"{object_name}, {description}".strip(", "),
+            output_path=debug_subdir / f"{object_id}.png",
+        )
+        raw_image = str(image_manager.generate_asset_image_from_text(image_request))
+        # Prompt priority: description, then class candidates, then a generic one.
+        rgba_prompts: list[str] = []
+        if description.strip():
+            rgba_prompts.append(description.strip())
+        for candidate in class_candidates:
+            candidate_prompt = f"The entire {candidate} on the center of the image"
+            if candidate_prompt not in rgba_prompts:
+                rgba_prompts.append(candidate_prompt)
+        if not rgba_prompts:
+            name_suffix = f" {object_name}" if object_name else ""
+            rgba_prompts.append(f"the entire single isolated object{name_suffix}")
+
+        rgba_path = ""
+        last_rgba_error: Exception | None = None
+        for prompt in rgba_prompts:
+            try:
+                rgba_request = AssetImageToRgbaRequest(
+                    image_path=Path(raw_image),
+                    prompt=prompt,
+                    output_path=image_gen_dir / f"{object_id}.png",
+                )
+                rgba_path = str(
+                    segmentation_manager.convert_asset_image_to_rgba(rgba_request)
+                )
+                break
+            except Exception as exc:
+                last_rgba_error = exc
+                log_warning(
+                    "text object segmentation prompt failed "
+                    f"id={object_id} prompt={prompt!r} error={exc}"
+                )
+        if not rgba_path:
+            raise last_rgba_error or RuntimeError(
+                f"No RGBA prompt succeeded for {object_id}"
+            )
+
+        glb_request = RgbaImageToGeometryRequest(
+            image_path=Path(rgba_path),
+            output_path=debug_subdir / f"{object_id}_raw.glb",
+        )
+        raw_glb = str(geometry_manager.convert_rgba_image_to_geometry(glb_request))
+        raw_geometry_dir = glb_gen_dir / "raw_downloads"
+        raw_geometry_dir.mkdir(parents=True, exist_ok=True)
+        object_raw_path = raw_geometry_dir / f"{object_id}_raw.glb"
+        shutil.copy2(raw_glb, object_raw_path)
+        raw_geometry_path = str(object_raw_path)
+
+        simready_result = simready_manager.make_asset_simready(
+            MakeAssetSimreadyRequest(
+                input_path=Path(raw_glb),
+                output_path=glb_gen_dir
+                / "text_objects_simready"
+                / f"{object_id}_simready.glb",
+            )
+        )
+        mesh_path = str(simready_result.output_path)
+        raw_to_simready_matrix = simready_result.transform_matrix
+        image_path = rgba_path
+        log_info(f"text object generation completed id={object_id} mesh={mesh_path}")
+    except Exception as exc:
+        status = f"failed: {traceback.format_exc()}"
+        log_warning(f"text object generation failed id={object_id} error={exc}")
+
+    return {
+        "id": object_id,
+        "name": object_name,
+        "description": description,
+        "status": status,
+        "image_path": image_path,
+        "raw_geometry_path": raw_geometry_path,
+        "mesh_path": mesh_path,
+        "simready_geometry_path": mesh_path,
+        "raw_to_simready_glb_matrix": raw_to_simready_matrix,
+        "metric_scale": None,
+    }
+
+
+def generate_text_object_assets(
+    *,
+    object_specs: list[dict[str, Any]],
+    image_gen_dir: Path,
+    glb_gen_dir: Path,
+    debug_dir: Path,
+) -> list[dict[str, Any]]:
+    """Generate every object asset of a text-origin scene, in spec order."""
+    log_info(f"text object batch generation started count={len(object_specs)}")
+    assets: list[dict[str, Any]] = []
+    # The same output directories are shared by every object in the batch.
+    for spec in object_specs:
+        asset = generate_text_object_asset(
+            object_spec=spec,
+            image_gen_dir=image_gen_dir,
+            glb_gen_dir=glb_gen_dir,
+            debug_dir=debug_dir,
+        )
+        assets.append(asset)
+    # A result counts as succeeded only when its status is exactly "ok".
+    ok_count = sum(1 for asset in assets if asset.get("status") == "ok")
+    tally = f"succeeded={ok_count} failed={len(assets) - ok_count}"
+    log_info(f"text object batch generation completed {tally}")
+    return assets
+
+
+def generate_text_table_asset(
+    *,
+    table_spec: dict[str, Any],
+    image_gen_dir: Path,
+    glb_gen_dir: Path,
+    debug_dir: Path,
+) -> dict[str, Any]:
+    """Generate the table asset of a text-origin scene from its description.
+
+    Chains text-to-image generation, RGBA conversion, geometry generation and
+    table simready conversion. An exception inside that chain is captured:
+    ``status`` becomes ``"failed: <traceback>"`` and only the paths produced
+    before the failure are filled in.
+
+    Returns:
+        Table record with identity, description, status and path fields;
+        ``mesh_path`` and ``simready_geometry_path`` hold the same value.
+    """
+    table_id = str(table_spec.get("id", "table"))
+    # Prefer the complete-table description; fall back to the plain one.
+    prompt = str(
+        table_spec.get("complete_table_description")
+        or table_spec.get("description", "")
+    ).strip()
+    # Start from an all-empty record; each pipeline stage fills in its path.
+    asset: dict[str, Any] = {
+        "id": table_id,
+        "name": str(table_spec.get("name", "table")),
+        "description": str(table_spec.get("description", "")),
+        "complete_table_description": prompt,
+        "is_complete_visible_table": bool(
+            table_spec.get("is_complete_visible_table", False)
+        ),
+        "object_coverage_percent": table_spec.get("object_coverage_percent"),
+        "status": "ok",
+        "image_path": "",
+        "raw_geometry_path": "",
+        "generated_table_raw_geometry_path": "",
+        "support_reference_geometry_path": "",
+        "table_asset_source": "description_generated",
+        "support_normal_source": "",
+        "mesh_path": "",
+        "simready_geometry_path": "",
+    }
+
+    scratch_dir = debug_dir / table_id
+    scratch_dir.mkdir(parents=True, exist_ok=True)
+    log_info(f"text table generation started id={table_id}")
+
+    image_manager = ImageGenerationManager()
+    segmentation_manager = ImageSegmentationManager()
+    geometry_manager = GeometryGenerationManager()
+    simready_manager = SimreadyManager()
+
+    try:
+        image_request = TextToAssetImageRequest(
+            prompt=prompt, output_path=scratch_dir / f"{table_id}.png"
+        )
+        raw_image = str(image_manager.generate_asset_image_from_text(image_request))
+        rgba_request = AssetImageToRgbaRequest(
+            image_path=Path(raw_image),
+            # An empty description falls back to a generic segmentation prompt.
+            prompt=prompt or "whole table",
+            output_path=image_gen_dir / f"{table_id}.png",
+        )
+        rgba_path = str(segmentation_manager.convert_asset_image_to_rgba(rgba_request))
+        glb_request = RgbaImageToGeometryRequest(
+            image_path=Path(rgba_path),
+            output_path=scratch_dir / f"{table_id}_raw.glb",
+        )
+        raw_glb = str(geometry_manager.convert_rgba_image_to_geometry(glb_request))
+        asset["generated_table_raw_geometry_path"] = raw_glb
+
+        # Keep a copy of the raw GLB under ``raw_downloads/table_raw.glb``.
+        downloads_dir = glb_gen_dir / "raw_downloads"
+        downloads_dir.mkdir(parents=True, exist_ok=True)
+        table_raw_path = downloads_dir / "table_raw.glb"
+        shutil.copy2(raw_glb, table_raw_path)
+        asset["raw_geometry_path"] = str(table_raw_path)
+        simready_request = MakeTableSimreadyRequest(
+            input_path=Path(asset["raw_geometry_path"]),
+            output_path=glb_gen_dir
+            / "text_objects_simready"
+            / f"{table_id}_simready.glb",
+        )
+        simready = simready_manager.make_table_simready(simready_request)
+        mesh = str(simready.output_path)
+        asset["mesh_path"] = mesh
+        asset["simready_geometry_path"] = mesh
+        asset["image_path"] = rgba_path
+        log_info(f"text table generation completed id={table_id} mesh={mesh}")
+    except Exception as exc:
+        asset["status"] = f"failed: {traceback.format_exc()}"
+        log_warning(f"text table generation failed id={table_id} error={exc}")
+    return asset
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_clutter_layout.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_clutter_layout.py
new file mode 100644
index 00000000..80bc3210
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_clutter_layout.py
@@ -0,0 +1,62 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import traceback
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.log import log_info, log_warning
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.text_layout_manager import (
+    settle_text_objects_to_ground,
+)
+
+__all__ = ["generate_text_clutter_layout"]
+
+
+def generate_text_clutter_layout(
+    *,
+    object_results: list[dict[str, Any]],
+    spatial_relations: list[dict[str, Any]],
+    table_constraints: list[dict[str, Any]],
+    output_dir: Path,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Run the ground-settling layout over the generated text-scene objects.
+
+    Returns a ``skipped`` status dict when ``object_results`` is empty, the
+    layout manager's result on success, and a ``failed`` status dict carrying
+    the formatted traceback when an exception is raised while running it.
+    """
+    if not object_results:
+        return {"status": "skipped", "reason": "no_text_objects"}
+
+    layout_kwargs: dict[str, Any] = {
+        "objects": object_results,
+        "spatial_relations": spatial_relations,
+        "table_constraints": table_constraints,
+        "output_dir": output_dir,
+        "output_root": output_root,
+    }
+    try:
+        log_info(f"text clutter layout started count={len(object_results)}")
+        layout = settle_text_objects_to_ground(**layout_kwargs)
+        log_info(f"text clutter layout completed status={layout.get('status')}")
+        return layout
+    except Exception as failure:
+        log_warning(f"text clutter layout failed error={failure}")
+        return {"status": "failed", "reason": traceback.format_exc()}
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_scene_metric_scale.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_scene_metric_scale.py
new file mode 100644
index 00000000..fd0b1383
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_scene_metric_scale.py
@@ -0,0 +1,161 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import traceback
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.metric_scale_manager import (
+    METRIC_SCALE_ENABLED,
+    EstimateMetricScalesRequest,
+    MetricScaleManager,
+    MetricScaleObjectInput,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.utils.log import log_info, log_warning
+
+__all__ = ["build_metric_scale_inputs", "estimate_text_scene_metric_scale"]
+
+
+def estimate_text_scene_metric_scale(
+    *,
+    object_results: list[dict[str, Any]],
+    user_text: str,
+    messages: list[dict[str, Any]],
+    schema: dict[str, Any],
+    output_dir: Path,
+    output_root: Path,
+    llm: Any | None,
+    step_name: str,
+) -> dict[str, Any]:
+    """Estimate real-world scales for the generated text-scene objects.
+
+    The returned dict always carries ``status`` (``skipped``, ``ok`` or
+    ``failed``), ``method``, ``user_text`` and ``objects``. ``reason`` is added
+    when the step is skipped or fails; ``object_scales`` and ``unit_note`` are
+    added on success. Unless ``object_results`` is empty, the outcome is also
+    forwarded to ``MetricScaleManager`` together with ``object_results``.
+    """
+    method_name = "text_scene_vlm_candidate_shape_ratio_median_scale"
+    result: dict[str, Any] = {
+        "status": "skipped",
+        "method": method_name,
+        "user_text": user_text,
+        "objects": [],
+    }
+
+    def _mark_all(status: str, reason: str) -> None:
+        # Report one shared status/reason for every object in the request.
+        MetricScaleManager.set_for_all_objects(
+            objects=object_results,
+            status=status,
+            reason=reason,
+            method=method_name,
+        )
+
+    try:
+        if not object_results:
+            result["reason"] = "missing_objects"
+            log_warning("text scene metric scale skipped reason=missing_objects")
+            return result
+        if not METRIC_SCALE_ENABLED:
+            result["reason"] = "metric_scale_disabled"
+            _mark_all("skipped", "metric_scale_disabled")
+            log_info("text scene metric scale skipped reason=metric_scale_disabled")
+            return result
+        if llm is None:
+            result["reason"] = "missing_llm"
+            _mark_all("skipped", "missing_llm")
+            log_warning("text scene metric scale skipped reason=missing_llm")
+            return result
+
+        log_info(f"text scene metric scale started count={len(object_results)}")
+        metric_objects = build_metric_scale_inputs(
+            objects=object_results,
+            output_root=output_root,
+        )
+        result["objects"] = MetricScaleManager.object_prompt_payload(metric_objects)
+        raw_output_path = output_dir / "raw_model_output.json"
+        request = EstimateMetricScalesRequest(
+            objects=metric_objects,
+            messages=messages,
+            schema=schema,
+            llm=llm,
+            context="Text scene metric scale estimate",
+            method=method_name,
+            step_name=step_name,
+            raw_output_path=raw_output_path,
+        )
+        metric_result = MetricScaleManager.estimate_metric_scales(request)
+        raw_model_output = metric_result.raw_model_output or {}
+        if not raw_output_path.is_file():
+            # Best-effort dump: a write failure is logged, not propagated.
+            try:
+                write_json(raw_output_path, raw_model_output)
+            except Exception as err:
+                log_warning(f"metric scale raw output write failed error={err}")
+
+        estimates = metric_result.object_scales
+        MetricScaleManager.apply_to_objects(
+            objects=object_results,
+            object_scales=estimates,
+        )
+        result["status"] = "ok"
+        result["object_scales"] = estimates
+        result["unit_note"] = (
+            "Per-object scale_factor is not baked into simready GLBs. "
+            "For text input, simready_geometry_path multiplied by this "
+            "scale_factor gives the estimated real-world size."
+        )
+        log_info(f"text scene metric scale completed count={len(estimates)}")
+    except Exception as failure:
+        result.update({"status": "failed", "reason": traceback.format_exc()})
+        _mark_all("failed", "text_scene_metric_scale_failed")
+        log_warning(f"text scene metric scale failed error={failure}")
+    return result
+
+
+def build_metric_scale_inputs(
+    *,
+    objects: list[dict[str, Any]],
+    output_root: Path,
+) -> list[MetricScaleObjectInput]:
+    """Convert generated object records into metric-scale estimator inputs."""
+    collected: list[MetricScaleObjectInput] = []
+    for record in objects:
+        # Prefer the simready GLB path; fall back to the plain mesh path.
+        raw_path = record.get("simready_geometry_path") or record.get("mesh_path")
+        glb_path = _resolve_generated_path(raw_path, output_root)
+        # Abort on the first object whose resolved GLB is not an existing file.
+        if not glb_path.is_file():
+            raise FileNotFoundError(f"Simready object GLB not found: {glb_path}")
+        entry = MetricScaleObjectInput(
+            object_id=str(record.get("id", "")),
+            object_name=str(record.get("name", "")),
+            object_description=str(record.get("description", "")),
+            mesh_path=glb_path,
+        )
+        collected.append(entry)
+    return collected
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Resolve ``value``; relative paths are anchored at ``output_root``."""
+    candidate = Path(str(value or "")).expanduser()  # falsy value -> Path("")
+    anchored = candidate if candidate.is_absolute() else output_root / candidate
+    return anchored.resolve()
diff --git a/embodichain/gen_sim/prompt2scene/cli/__init__.py b/embodichain/gen_sim/prompt2scene/cli/__init__.py
new file mode 100644
index 00000000..015c4151
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/cli/__init__.py
@@ -0,0 +1,19 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/embodichain/gen_sim/prompt2scene/cli/start.py b/embodichain/gen_sim/prompt2scene/cli/start.py
new file mode 100644
index 00000000..fdc3a27b
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/cli/start.py
@@ -0,0 +1,90 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.pipeline.runner import run_prompt2scene
+from embodichain.gen_sim.prompt2scene.llms import load_llm_config
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+
+__all__ = ["cli_prompt2scene", "main"]
+
+
+def cli_prompt2scene(
+    image_path: str | None,
+    text: str | None,
+    output_root: str,
+    llm_config_path: str | None = None,
+) -> None:
+    """Build the request and LLM config from raw CLI values, then run the pipeline.
+
+    Args:
+        image_path: Input image path for image mode, otherwise ``None``.
+        text: Scene prompt for text mode, otherwise ``None``.
+        output_root: Directory that receives the prompt2scene outputs.
+        llm_config_path: LLM config JSON path; ``None`` is handed to
+            ``load_llm_config`` as-is.
+    """
+    image = None if image_path is None else Path(image_path)
+    request = Prompt2SceneInput.from_cli_args(
+        image_path=image,
+        text=text,
+        output_root=Path(output_root),
+    )
+    config_file = None if llm_config_path is None else Path(llm_config_path)
+    run_prompt2scene(request, llm_cfg=load_llm_config(config_file))
+
+
+def main() -> None:
+    """Parse command line arguments and launch the prompt2scene pipeline.
+
+    Exactly one of ``--image`` / ``--text`` must be given; ``--output_root`` is
+    required and ``--llm_config`` is optional.
+    """
+    parser = argparse.ArgumentParser(
+        description="embodichain.gen_sim.prompt2scene Prompt-to-Scene Pipeline"
+    )
+
+    # The two input modes exclude each other, and one of them is mandatory.
+    source = parser.add_mutually_exclusive_group(required=True)
+    source.add_argument(
+        "--image", type=str, help="Path to the input image file (.jpg, .jpeg, or .png)"
+    )
+    source.add_argument(
+        "--text", type=str, help="Text prompt describing the target scene"
+    )
+
+    parser.add_argument(
+        "--output_root", type=str, required=True, help="Path to the output directory"
+    )
+    parser.add_argument(
+        "--llm_config",
+        type=str,
+        default=None,
+        help="Path to the LLM config JSON file",
+    )
+
+    options = parser.parse_args()
+    cli_prompt2scene(
+        options.image, options.text, options.output_root, options.llm_config
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/embodichain/gen_sim/prompt2scene/configs/client_config.json b/embodichain/gen_sim/prompt2scene/configs/client_config.json
new file mode 100644
index 00000000..b8662eaf
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/configs/client_config.json
@@ -0,0 +1,21 @@
+{
+    "sam3_segmentation": {
+      "base_url": "http://192.168.3.23:5014",
+      "timeout_s": 1200,
+      "health_path": "/health",
+      "segment_single_object_path": "/predict"
+    },
+    "sam3d_generation": {
+      "base_url": "http://10.7.7.32:5019",
+      "timeout_s": 1800,
+      "health_path": "/health",
+      "generate_multiple_objects_path": "/generate_multiple_objects",
+      "generate_single_object_path": "/generate_single_object"
+    },
+    "zimage": {
+       "base_url": "http://192.168.3.23:5013",
+       "timeout_s": 120,
+       "health_path": "/health",
+       "generate_single_object_path": "/generate.png"
+    }
+}
diff --git a/embodichain/gen_sim/prompt2scene/configs/llm_config.json b/embodichain/gen_sim/prompt2scene/configs/llm_config.json
new file mode 100644
index 00000000..9dd82514
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/configs/llm_config.json
@@ -0,0 +1,11 @@
+{
+  "llm": {
+    "openai_compatible": {
+      "api_key": "",
+      "model": "",
+      "base_url": "",
+      "default_query": {},
+      "max_attempts": 5
+    }
+  }
+}
diff --git a/embodichain/gen_sim/prompt2scene/llms/__init__.py b/embodichain/gen_sim/prompt2scene/llms/__init__.py
new file mode 100644
index 00000000..8412eff4
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/llms/__init__.py
@@ -0,0 +1,31 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.llms.config import OpenAICompatibleLLMCfg
+from embodichain.gen_sim.prompt2scene.llms.openai_compatible import (
+    DEFAULT_LLM_CONFIG_PATH,
+    build_chat_model,
+    load_llm_config,
+)
+
+__all__ = [
+    "DEFAULT_LLM_CONFIG_PATH",
+    "OpenAICompatibleLLMCfg",
+    "build_chat_model",
+    "load_llm_config",
+]
diff --git a/embodichain/gen_sim/prompt2scene/llms/config.py b/embodichain/gen_sim/prompt2scene/llms/config.py
new file mode 100644
index 00000000..f84c4fcf
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/llms/config.py
@@ -0,0 +1,49 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+__all__ = [
+    "OpenAICompatibleLLMCfg",
+]
+
+
+@dataclass(frozen=True)
+class OpenAICompatibleLLMCfg:
+    """Frozen settings for one OpenAI-compatible chat endpoint."""
+
+    api_key: str
+    model: str
+    base_url: str
+    default_query: dict[str, str] = field(default_factory=dict)
+    max_attempts: int = 3
+
+    def to_manifest(self) -> dict[str, object]:
+        """Return a manifest dict that omits the API key itself.
+
+        The key is reduced to a ``has_api_key`` flag, and ``default_query`` is
+        copied so that edits to the manifest never leak back into the config.
+        """
+        return {
+            "provider": "openai_compatible",
+            "model": self.model,
+            "base_url": self.base_url,
+            "has_api_key": bool(self.api_key),
+            "default_query": dict(self.default_query),
+            "max_attempts": self.max_attempts,
+        }
diff --git a/embodichain/gen_sim/prompt2scene/llms/openai_compatible.py b/embodichain/gen_sim/prompt2scene/llms/openai_compatible.py
new file mode 100644
index 00000000..91e94a59
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/llms/openai_compatible.py
@@ -0,0 +1,115 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+from langchain_openai import ChatOpenAI
+
+from embodichain.gen_sim.prompt2scene.llms.config import OpenAICompatibleLLMCfg
+
+__all__ = ["DEFAULT_LLM_CONFIG_PATH", "build_chat_model", "load_llm_config"]
+
+DEFAULT_LLM_CONFIG_PATH = (
+    Path(__file__).resolve().parents[1] / "configs" / "llm_config.json"
+)
+
+
+def load_llm_config(config_path: Path | None = None) -> OpenAICompatibleLLMCfg:
+    """Read the OpenAI-compatible LLM settings used by prompt2scene.
+
+    ``OPENAI_API_KEY``, ``OPENAI_MODEL``, ``OPENAI_BASE_URL`` and
+    ``OPENAI_MAX_ATTEMPTS`` override the file values when set and non-empty.
+
+    Args:
+        config_path: JSON config file; ``DEFAULT_LLM_CONFIG_PATH`` when omitted.
+
+    Returns:
+        The validated config, with trailing slashes stripped from ``base_url``.
+
+    Raises:
+        FileNotFoundError: If the config file does not exist.
+        ValueError: If ``api_key``, ``model`` or ``base_url`` is empty, if
+            ``max_attempts`` is not an integer >= 1, or if ``default_query``
+            is not a dict.
+    """
+    source_path = config_path or DEFAULT_LLM_CONFIG_PATH
+    resolved = source_path.expanduser().resolve()
+    if not resolved.exists():
+        raise FileNotFoundError(f"LLM config not found: {resolved}")
+
+    with resolved.open("r", encoding="utf-8") as handle:
+        raw_cfg: dict[str, Any] = json.load(handle)
+    section = raw_cfg.get("llm", {}).get("openai_compatible", {})
+
+    # Environment variables win over the file for the three required settings.
+    required = {
+        "api_key": os.getenv("OPENAI_API_KEY") or section.get("api_key", ""),
+        "model": os.getenv("OPENAI_MODEL") or section.get("model", ""),
+        "base_url": os.getenv("OPENAI_BASE_URL") or section.get("base_url", ""),
+    }
+    default_query = section.get("default_query", {})
+    max_attempts = _load_positive_int(
+        os.getenv("OPENAI_MAX_ATTEMPTS") or section.get("max_attempts", 3),
+        key="max_attempts",
+    )
+
+    if required["base_url"]:
+        required["base_url"] = required["base_url"].rstrip("/")
+
+    # Report every empty required setting at once rather than one per run.
+    missing = [name for name, value in required.items() if not value]
+    if missing:
+        raise ValueError(f"Missing required LLM config keys: {missing}")
+
+    if not isinstance(default_query, dict):
+        raise ValueError("LLM config key default_query must be a dict.")
+
+    return OpenAICompatibleLLMCfg(
+        api_key=required["api_key"],
+        model=required["model"],
+        base_url=required["base_url"],
+        default_query=default_query,
+        max_attempts=max_attempts,
+    )
+
+
+def _load_positive_int(value: object, *, key: str) -> int:
+    try:
+        number = int(value)  # anything ``int()`` can coerce is accepted
+    except (TypeError, ValueError) as cause:
+        raise ValueError(f"LLM config key {key} must be an integer.") from cause
+    if number >= 1:
+        return number
+    raise ValueError(f"LLM config key {key} must be >= 1.")
+
+
+def build_chat_model(cfg: OpenAICompatibleLLMCfg) -> Any:
+    """Build a LangChain ``ChatOpenAI`` client from ``cfg`` with temperature 0."""
+    # ``default_query`` is forwarded only when the config provides a non-empty
+    # one; the remaining arguments are always passed.
+    extra = {"default_query": cfg.default_query} if cfg.default_query else {}
+    return ChatOpenAI(
+        api_key=cfg.api_key,
+        base_url=cfg.base_url,
+        model=cfg.model,
+        temperature=0,
+        **extra,
+    )
diff --git a/embodichain/gen_sim/prompt2scene/pipeline/__init__.py b/embodichain/gen_sim/prompt2scene/pipeline/__init__.py
new file mode 100644
index 00000000..a1450f03
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/pipeline/__init__.py
@@ -0,0 +1,25 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.pipeline.runner import (
+    Prompt2SceneRunResult,
+    run_prompt2scene,
+)
+
+__all__ = ["Prompt2SceneRunResult", "run_prompt2scene"]
+
diff --git a/embodichain/gen_sim/prompt2scene/pipeline/runner.py b/embodichain/gen_sim/prompt2scene/pipeline/runner.py
new file mode 100644
index 00000000..7931f00b
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/pipeline/runner.py
@@ -0,0 +1,239 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.llms import OpenAICompatibleLLMCfg
+from embodichain.gen_sim.prompt2scene.workflows.request import (
+    InputKind,
+    Prompt2SceneInput,
+)
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    IMAGE_SEGMENTS_STEP,
+    IMAGE_SPATIAL_RELATIONS_STEP,
+    SCENE_INTAKE_STEP,
+    STEP_RESULT_FILENAME,
+    step_result_path,
+    write_step_result,
+    TEXT_RELATIONS_STEP,
+    UNIFIED_SCENE_STEP,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.graph import (
+    run_unified_scene,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.graph import (
+    run_unified_scene_gen,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.gym_export import (
+    export_gym_config,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.utils import log
+from embodichain.gen_sim.prompt2scene.workflows.image_relations import (
+    run_image_relations,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake import run_scene_intake
+from embodichain.gen_sim.prompt2scene.workflows.text_relations import (
+    run_text_relations,
+)
+
+__all__ = [
+    "IMAGE_SEGMENTS_DIRNAME",
+    "IMAGE_SPATIAL_RELATIONS_DIRNAME",
+    "INPUT_MANIFEST_FILENAME",
+    "SCENE_INTAKE_DIRNAME",
+    "STEP_RESULT_FILENAME",
+    "TEXT_RELATIONS_DIRNAME",
+    "UNIFIED_SCENE_DIRNAME",
+    "Prompt2SceneRunResult",
+    "run_prompt2scene",
+]
+
+INPUT_MANIFEST_FILENAME = "input_manifest.json"
+SCENE_INTAKE_DIRNAME = SCENE_INTAKE_STEP
+IMAGE_SEGMENTS_DIRNAME = IMAGE_SEGMENTS_STEP
+IMAGE_SPATIAL_RELATIONS_DIRNAME = IMAGE_SPATIAL_RELATIONS_STEP
+TEXT_RELATIONS_DIRNAME = TEXT_RELATIONS_STEP
+UNIFIED_SCENE_DIRNAME = UNIFIED_SCENE_STEP
+
+
+@dataclass(frozen=True)
+class Prompt2SceneRunResult:
+    """Result returned by the prompt2scene runner.
+
+    Attributes:
+        output_root: Directory where prompt2scene outputs were written.
+        manifest_path: Path to the serialized input manifest.
+        scene_intake_path: Path to the serialized scene intake output.
+        image_segments_path: Path to serialized image segment alignment output.
+        image_spatial_relations_path: Path to serialized image spatial relations.
+        text_relations_path: Path to serialized text spatial relations.
+        unified_scene_path: Path to serialized unified scene output.
+    """
+
+    output_root: Path
+    manifest_path: Path
+    scene_intake_path: Path | None = None
+    image_segments_path: Path | None = None
+    image_spatial_relations_path: Path | None = None
+    text_relations_path: Path | None = None
+    unified_scene_path: Path | None = None
+    gym_config_path: Path | None = None  # value returned by the gym export step
+
+
+def run_prompt2scene(
+    request: Prompt2SceneInput,
+    llm_cfg: OpenAICompatibleLLMCfg | None = None,
+) -> Prompt2SceneRunResult:
+    """Run the prompt2scene pipeline.
+
+    Always writes the input manifest. With an LLM config it also runs scene
+    intake, image/text relations, unified scene, scene generation, gym export.
+
+    Args:
+        request: Parsed prompt2scene input.
+        llm_cfg: LLM config; when ``None`` only the manifest is written.
+
+    Returns:
+        Paths of the outputs; steps that did not run leave theirs as ``None``.
+    """
+    log.log_info(
+        "run start "
+        f"input_kind={request.input_kind.value} output_root={request.output_root}"
+    )
+    request.output_root.mkdir(parents=True, exist_ok=True)
+    manifest_path = request.output_root / INPUT_MANIFEST_FILENAME
+    manifest = request.to_manifest()
+    if llm_cfg is not None:
+        manifest["llm"] = llm_cfg.to_manifest()
+    write_json(manifest_path, manifest)
+
+    scene_intake_path = None
+    image_segments_path = None
+    image_spatial_relations_path = None
+    text_relations_path = None
+    unified_scene_path = None
+    gym_config_path = None
+    if llm_cfg is not None:  # every pipeline step below needs the LLM config
+        log.log_info("step start scene_intake")
+        scene_intake = run_scene_intake(request, llm_cfg=llm_cfg)
+        scene_intake_path = write_step_result(
+            request.output_root,
+            SCENE_INTAKE_STEP,
+            scene_intake.to_manifest(),
+        )
+        log.log_info(
+            f"step end scene_intake status=ok output={scene_intake_path}"
+        )
+        if request.input_kind == InputKind.IMAGE:
+            log.log_info("step start image_relations")
+            image_relations = run_image_relations(
+                request,
+                scene_intake=scene_intake,
+                llm_cfg=llm_cfg,
+                output_root=request.output_root,
+            )
+            image_segments_path = step_result_path(
+                request.output_root,
+                IMAGE_SEGMENTS_STEP,
+            )
+            if not image_segments_path.is_file():  # write only when absent
+                write_step_result(
+                    request.output_root,
+                    IMAGE_SEGMENTS_STEP,
+                    image_relations.to_segmentation_manifest(),
+                )
+            image_spatial_relations_path = step_result_path(
+                request.output_root,
+                IMAGE_SPATIAL_RELATIONS_STEP,
+            )
+            if not image_spatial_relations_path.is_file():  # write only when absent
+                write_step_result(
+                    request.output_root,
+                    IMAGE_SPATIAL_RELATIONS_STEP,
+                    image_relations.to_spatial_manifest(),
+                )
+            log.log_info(
+                "step end image_relations "
+                f"status=ok output={image_spatial_relations_path}"
+            )
+            log.log_info("step start unified_scene")
+            unified_scene = run_unified_scene(  # NOTE(review): result unused here
+                request,
+                scene_intake=scene_intake,
+                image_relations=image_relations,
+                output_root=request.output_root,
+            )
+            unified_scene_path = step_result_path(
+                request.output_root,
+                UNIFIED_SCENE_STEP,
+            )
+        else:
+            log.log_info("step start text_relations")
+            text_relations = run_text_relations(
+                request,
+                scene_intake=scene_intake,
+                llm_cfg=llm_cfg,
+                output_root=request.output_root,
+            )
+            text_relations_path = step_result_path(
+                request.output_root,
+                TEXT_RELATIONS_STEP,
+            )
+            log.log_info(
+                f"step end text_relations status=ok output={text_relations_path}"
+            )
+            log.log_info("step start unified_scene")
+            unified_scene = run_unified_scene(  # NOTE(review): result unused here
+                request,
+                scene_intake=scene_intake,
+                text_relations=text_relations,
+                output_root=request.output_root,
+            )
+            unified_scene_path = step_result_path(
+                request.output_root,
+                UNIFIED_SCENE_STEP,
+            )
+        log.log_info(
+            f"step end unified_scene status=ok output={unified_scene_path}"
+        )
+        log.log_info("step start unified_scene_gen")
+        run_unified_scene_gen(
+            request.output_root,
+            unified_scene_result_path=unified_scene_path,
+            llm_cfg=llm_cfg,
+        )
+        log.log_info("step end unified_scene_gen status=ok")
+
+        log.log_info("step start gym_export")
+        gym_config_path = export_gym_config(request.output_root)
+        log.log_info(f"step end gym_export status=ok output={gym_config_path}")
+
+    log.log_info(f"run end output_root={request.output_root}")
+
+    return Prompt2SceneRunResult(
+        output_root=request.output_root,
+        manifest_path=manifest_path,
+        scene_intake_path=scene_intake_path,
+        image_segments_path=image_segments_path,
+        image_spatial_relations_path=image_spatial_relations_path,
+        text_relations_path=text_relations_path,
+        unified_scene_path=unified_scene_path,
+        gym_config_path=gym_config_path,
+    )
diff --git a/embodichain/gen_sim/prompt2scene/prompts/__init__.py b/embodichain/gen_sim/prompt2scene/prompts/__init__.py
new file mode 100644
index 00000000..f72a97f6
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/__init__.py
@@ -0,0 +1,48 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from . import data
+from .base import PromptRenderer
+
+default_prompt_renderer = PromptRenderer(data)
+# Shared renderer bound to the bundled ``data`` package; the helpers below use it.
+__all__ = ["load_prompt", "load_prompt_data", "render_prompt", "default_prompt_renderer"]
+
+
+def load_prompt(prompt_name: str) -> str:
+    """Return a bundled prompt file's text via ``default_prompt_renderer``."""
+    return default_prompt_renderer.load_prompt(prompt_name)
+
+
+def load_prompt_data(prompt_name: str) -> dict[str, object]:
+    """Return a bundled YAML prompt file's mapping via ``default_prompt_renderer``."""
+    return default_prompt_renderer.load_prompt_data(prompt_name)
+
+
+def render_prompt(
+    prompt_name: str,
+    values: dict[str, object] | None = None,
+    *,
+    prompt_key: str | None = None,
+) -> str:
+    """Render a bundled prompt, or its ``prompt_key`` YAML entry, filling ``values``."""
+    return default_prompt_renderer.render_prompt(
+        prompt_name,
+        values,
+        prompt_key=prompt_key,
+    )
diff --git a/embodichain/gen_sim/prompt2scene/prompts/base.py b/embodichain/gen_sim/prompt2scene/prompts/base.py
new file mode 100644
index 00000000..a145735c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/base.py
@@ -0,0 +1,79 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from functools import lru_cache
+from importlib import resources
+from pathlib import Path
+from string import Template
+from typing import Any, Mapping
+
+import yaml
+
+__all__ = ["PromptRenderer"]
+
+
+class PromptRenderer:
+    """Read prompt files bundled in a package and fill ``$`` placeholders."""
+
+    def __init__(self, package: Any) -> None:
+        self._package = package
+
+    @lru_cache(maxsize=None)
+    def load_prompt(self, prompt_name: str) -> str:
+        """Return the whitespace-stripped UTF-8 text of ``prompt_name``."""
+        source = self._get_prompt_path(prompt_name)
+        if source.is_file():
+            return source.read_text(encoding="utf-8").strip()
+        raise FileNotFoundError(f"Prompt data file not found: {prompt_name}")
+
+    @lru_cache(maxsize=None)
+    def load_prompt_data(self, prompt_name: str) -> dict[str, Any]:
+        """Parse the YAML file ``prompt_name``; its top level must be a mapping."""
+        source = self._get_prompt_path(prompt_name)
+        if not source.is_file():
+            raise FileNotFoundError(f"Prompt data file not found: {prompt_name}")
+        raw_yaml = source.read_text(encoding="utf-8")
+        parsed = yaml.safe_load(raw_yaml)
+        if isinstance(parsed, dict):
+            return parsed
+        raise ValueError(f"Prompt YAML must contain a mapping: {prompt_name}")
+
+    def render_prompt(
+        self,
+        prompt_name: str,
+        values: Mapping[str, object] | None = None,
+        *,
+        prompt_key: str | None = None,
+    ) -> str:
+        """Render a whole-file template, or the ``prompt_key`` entry of a YAML file."""
+        if prompt_key is not None:
+            template = self.load_prompt_data(prompt_name).get(prompt_key)
+            if not isinstance(template, str):
+                raise KeyError(f"Prompt key {prompt_key!r} not found in {prompt_name}")
+        else:
+            template = self.load_prompt(prompt_name)
+        # safe_substitute leaves placeholders missing from ``values`` untouched.
+        if values is not None:
+            return Template(template).safe_substitute(values)
+        return template
+
+    def _get_prompt_path(self, prompt_name: str) -> Path:
+        """Resolve ``prompt_name`` inside the package; reject path separators."""
+        if any(separator in prompt_name for separator in ("/", "\\")):
+            raise ValueError(f"Prompt name must be a file name: {prompt_name}")
+        return resources.files(self._package).joinpath(prompt_name)
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/__init__.py b/embodichain/gen_sim/prompt2scene/prompts/data/__init__.py
new file mode 100644
index 00000000..96d64212
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/__init__.py
@@ -0,0 +1,21 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Bundled prompt template data files."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/image_relations.yaml b/embodichain/gen_sim/prompt2scene/prompts/data/image_relations.yaml
new file mode 100644
index 00000000..50ed6964
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/image_relations.yaml
@@ -0,0 +1,238 @@
+name: image_relations
+version: 1
+
+filter_extra_instances_system: |
+  <role>
+  You are a careful image segmentation verification assistant for tabletop scenes.
+  </role>
+
+  <task>
+  You will receive:
+  - One target object class name.
+  - One target object description.
+  - The expected number of target instances.
+  - A short candidate class list for that target object.
+  - One image with numbered colored masks drawn over candidate segmentation
+    results for that target object.
+
+  Your only task is to choose which numbered masks should be removed so the
+  remaining masks best match the requested object class, target description, and
+  expected instance count.
+
+  This is not a scene-description task and not a spatial-relation task.
+  Do not describe the scene. Do not infer object-object relations. Do not rename
+  the requested object class. Do not add new masks.
+  </task>
+
+  <decision_rules>
+  - Use the target object class name as the primary class.
+  - Use the target description to distinguish visually similar objects from the
+    same broad category.
+  - Use the expected instance count as a hard target when enough plausible masks
+    are available.
+  - Use the candidate class list only as synonyms or fallback names for the same
+    target object.
+  - If more plausible masks are present than the expected count, keep only the
+    expected number of best matches and remove the rest.
+  - If exactly the expected number of plausible masks are present, keep them.
+  - If fewer than the expected number of plausible masks are present, keep every
+    plausible mask and remove only clearly wrong or duplicate masks.
+  - Remove a numbered mask if it clearly covers a different object class.
+  - Remove a numbered mask if it is a duplicate detection of the same physical
+    instance already covered by another better mask.
+  - Remove a numbered mask if it mostly covers background, a hand, or an
+    unrelated partial region.
+  - Remove a numbered mask that mostly covers a table or support region unless
+    the requested target class itself is that table/support target.
+  - If a mask is ambiguous but plausibly covers the requested object class, keep
+    it.
+  </decision_rules>
+
+  <output_schema>
+  {
+    "extra_instance_numbers": [3],
+    "reason": "Mask 3 covers a different object, not the requested class."
+  }
+  </output_schema>
+
+  <examples>
+  Example 1:
+  Target object class: soccer_ball
+  Target description: A round soccer ball with black-and-white panels.
+  Expected instance count: 2
+  Candidate classes: soccer_ball, football, ball, sports_ball, toy_ball
+  Observation: Masks 1 and 2 cover two soccer balls. Mask 3 covers a paper cup.
+  Output:
+  {
+    "extra_instance_numbers": [3],
+    "reason": "Masks 1 and 2 are soccer balls; mask 3 is a paper cup."
+  }
+
+  Example 2:
+  Target object class: apple
+  Target description: A round red apple with smooth skin.
+  Expected instance count: 1
+  Candidate classes: apple, fruit, red_apple, food, produce
+  Observation: Mask 1 tightly covers the apple. Mask 2 overlaps the same apple and
+  is a duplicate looser detection.
+  Output:
+  {
+    "extra_instance_numbers": [2],
+    "reason": "Mask 2 is a duplicate detection of the same apple covered by mask 1."
+  }
+
+  Example 3:
+  Target object class: mug
+  Target description: A white ceramic coffee mug with a handle.
+  Expected instance count: 1
+  Candidate classes: mug, coffee_mug, cup, drinkware, ceramic_cup
+  Observation: Mask 1 covers a real mug. Mask 2 covers a bowl.
+  Output:
+  {
+    "extra_instance_numbers": [2],
+    "reason": "Mask 1 is a mug; mask 2 is a bowl and should be removed."
+  }
+
+  Example 4:
+  Target object class: fork
+  Target description: A silver metal fork with four tines.
+  Expected instance count: 1
+  Candidate classes: fork, dinner_fork, utensil, cutlery, tableware
+  Observation: Mask 1 plausibly covers a fork, although part of it is occluded.
+  Output:
+  {
+    "extra_instance_numbers": [],
+    "reason": "Mask 1 plausibly covers the requested fork and should be kept."
+  }
+  </examples>
+
+  <notes>
+  - extra_instance_numbers must contain 1-based mask numbers exactly as shown in
+    the numbered-mask image.
+  - If no masks should be removed, output an empty list.
+  - Output JSON only. Do not include markdown or explanations outside JSON.
+  </notes>
+
+filter_extra_instances_user: |
+  Verify the numbered segmentation masks for this object class:
+
+  <input>
+  Target object class: $name
+  Target description: $description
+  Expected instance count: $expected_count
+  Candidate classes: $class_candidate
+  </input>
+
+  <instruction>
+  Inspect the numbered-mask image.
+  Return the 1-based numbers of masks that should be removed so the remaining
+  masks best match the target description and expected instance count.
+  </instruction>
+
+spatial_layout_system: |
+  <role>
+  You are a careful tabletop spatial-layout verifier.
+  </role>
+
+  <task>
+  You will receive one tabletop image with final bounding boxes and labels for
+  every detected object instance. Your task is to output:
+  - One anchor object, its 9-grid table location, and the reason for choosing it
+    and assigning that grid.
+  - Object groups ordered from left to right.
+  - Object groups ordered from front to back.
+  - Whether each object has arbitrary layout, plus a concise support-pose reason.
+
+  Do not output pairwise left/right/front/behind relations. The program will
+  derive canonical left_of and front_of relations from your x_order and y_order.
+  Use ordered groups conservatively. Prefer fewer relations over a wrong
+  relation.
+  </task>
+
+  <axis_definitions>
+  - x_order must be ordered from image/table left to image/table right.
+  - y_order must be ordered from table front to table back.
+  - Split x_order groups when the left/right order is reasonably clear from the
+    bbox-name image.
+  - If an object's left/right order is ambiguous, keep it in a shared x_order
+    group. Never omit it.
+  - Front/back is especially hard to judge. Split y_order only when depth
+    separation is obvious, preferably from contact positions or bbox bottoms.
+  - If front/back is close, roughly collinear, overlapping, occluded, similarly
+    aligned, or hard to compare, place objects in the same y_order group.
+  - Ordered groups are interpreted as monotonic DAG ranks. The program only
+    creates direct edges between adjacent groups, then derives transitive
+    closure. For example, G1 < G2 < G3 creates direct edges G1 -> G2 and
+    G2 -> G3; G1 -> G3 is implicit.
+  </axis_definitions>
+
+  <anchor_rules>
+  - Choose one clearly visible object as anchor.
+  - Prefer a large, unoccluded object whose 9-grid location is easy to judge.
+  - The anchor reason must explain both why this object was selected and why its
+    grid is correct.
+  - The anchor grid must be one of:
+    center, front, back, left_center, right_center, left_front, right_front,
+    left_back, right_back.
+  </anchor_rules>
+
+  <state_rules>
+  - is_arbitrary_layout is true when the object does not need a specified
+    support pose before physics simulation, such as balls, round fruits, loose
+    natural objects, or objects that will naturally settle by gravity.
+  - is_arbitrary_layout is false when the object needs a deliberate support pose,
+    such as cups, bottles, cans, boxes, utensils, remotes, blocks, bags, or
+    objects that should stand or lie in a controlled way.
+  - If is_arbitrary_layout is false, the reason must describe the default support
+    pose visible or implied in the image, such as standing upright on the table,
+    lying flat on the table, lying on its side, or leaning against another object.
+  - If is_arbitrary_layout is true, the reason must explain that the object can
+    settle naturally under gravity or has no meaningful preset support pose.
+  </state_rules>
+
+  <output_schema>
+  {
+    "anchor": {
+      "asset_id": "interact_paper_cup_0",
+      "grid": "center",
+      "reason": "The paper cup is clearly visible and near the table center, so it is a reliable anchor for the center grid."
+    },
+    "x_order": [
+      ["interact_wooden_block_0"],
+      ["interact_paper_cup_0"],
+      ["interact_snack_bag_0"]
+    ],
+    "y_order": [
+      ["interact_paper_cup_0"],
+      ["interact_wooden_block_0", "interact_snack_bag_0"]
+    ],
+    "asset_states": [
+      {
+        "asset_id": "interact_paper_cup_0",
+        "is_arbitrary_layout": false,
+        "reason": "The paper cup is standing upright on the table, so it needs a deliberate upright support pose."
+      }
+    ]
+  }
+  </output_schema>
+
+  <notes>
+  - Every provided asset_id must appear exactly once in x_order.
+  - Every provided asset_id must appear exactly once in y_order.
+  - Every provided asset_id must appear exactly once in asset_states.
+  - Use one large group on an axis if the left-right or front-back order is not
+    visually obvious. Do not omit uncertain objects.
+  - anchor.asset_id must be one of the provided asset_ids.
+  - anchor.reason and every asset state reason must be concise but explicit.
+  - Only the anchor may have a grid. Do not add grid to asset_states.
+  - Output JSON only. Do not include markdown or explanations outside JSON.
+  </notes>
+
+spatial_layout_user: |
+  Infer spatial order, anchor grid, and object states for these detected object instances:
+
+  <asset_ids>
+  $asset_ids
+  </asset_ids>
+
+  Inspect the attached bbox-name image and return the JSON object.
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/scene_intake.yaml b/embodichain/gen_sim/prompt2scene/prompts/data/scene_intake.yaml
new file mode 100644
index 00000000..bbdbbc8b
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/scene_intake.yaml
@@ -0,0 +1,503 @@
+name: scene_intake
+version: 1
+
+text_system: |
+  <role>
+  You are a careful 3D tabletop scene intake assistant for TEXT input.
+  </role>
+
+  <task>
+  You will receive a text description of a tabletop scene.
+  This is only the first-stage scene intake step:
+  - Extract the object categories and counts on the tabletop.
+  - Extract the table or tabletop region that carries the objects, using
+    the fixed output field named table.
+
+  Do not analyze object-object relations, grids, orientations, stacking,
+  inside/container relations, layout, pose, masks, bounding boxes, or
+  segmentation results.
+  </task>
+
+  <object_rules>
+  - CRITICAL: Include EVERY object described in the text without omission. Do
+    not skip, ignore, or drop any object, no matter how small, briefly
+    mentioned, or unfamiliar it is. An incomplete assets list is the most
+    severe error you can make.
+  - Output only real physical objects that can become 3D asset generation targets.
+  - Do not include the table or tabletop region in assets.
+  - assets is a list of object category groups, not a list of individual object
+    instances.
+  - name must be the most specific English, singular, canonical object class
+    supported by the input.
+  - Prefer a concrete small category over a broad category. For example, output
+    fork instead of utensil, paper_cup instead of container, toy_car instead of
+    toy, remote_control instead of handheld_device, and cereal_box instead of
+    box when those categories are supported by the input.
+  - Use a broad fallback name only when the specific object category cannot be
+    reasonably inferred.
+  - Prefer snake_case names, such as apple, banana, soccer_ball, coffee_mug.
+  - Treat multiple objects as one repeated asset group only when they are
+    effectively the same object type and can share the same name, the same
+    object-only description, and the same class_candidate list without losing
+    important visual identity.
+  - Never output two asset rows with the same name. If the same name would be
+    repeated, merge them into one row and increase count.
+  - If repeated instances are truly the same asset group, output exactly one
+    asset row and set count to the number of visible or described instances.
+  - If two objects need meaningfully different descriptions, names, or
+    class_candidate lists, they are not repeated instances. Output separate
+    asset rows with specific different names.
+  - Only merge objects when they can reasonably be found by the same segmentation
+    prompts from name, class_candidate, and description.
+  - Do not merge visually different subtypes under a broad name. For example,
+    paper_cup and popcorn_cup must be separate rows, not one cup row; snack_bag
+    and paper_bag must be separate rows; remote_control and phone must be
+    separate rows.
+  - Do not output instance IDs such as apple_0 or banana_0. Instance IDs will be
+    generated by code from name and count.
+  - Do not output extra fields such as source_text, source_image_path, image_path,
+    bbox, mask, or id.
+  - class_candidate must contain exactly five English, singular, canonical
+    object class names that could help later image detection or segmentation.
+  - class_candidate must prioritize specific small categories. The first item
+    must equal name. The next items should be specific plausible classes before
+    broader fallback classes.
+  - Do not replace a known small category with a broad category. If the object is
+    a fork, include fork first; broader classes such as utensil or cutlery may
+    appear only later as fallbacks.
+  - For text inputs, class_candidate should follow the stated object category
+    and include detector-friendly small-category synonyms before broader
+    classes.
+  </object_rules>
+
+  <description_rules>
+  - table.name, table.description, table.complete_table_description,
+    table.class_candidate, and every asset.description must be non-empty.
+  - Descriptions are used to generate images and then 3D geometry.
+  - Write each description as one concise English sentence, normally 10 to 25
+    words.
+  - Every description must describe a SINGLE STANDALONE OBJECT isolated on a
+    pure-white background. Do NOT mention any other object, the table, the scene,
+    the room, or any background context.
+  - Do NOT include any spatial, positional, or layout information such as
+    "sitting on the table", "placed in front of", "to the left of", "on a
+    surface", "on the tabletop", etc.
+  - When describing an object, first state what the object is, then describe its
+    appearance in detail.
+  - For TEXT input you MUST invent reasonable and vivid appearance details:
+    color (be specific: "crimson red", "matte charcoal", "glossy navy blue",
+    "warm honey oak"), material (polished stainless steel, glazed ceramic,
+    rough terracotta, smooth beechwood, frosted glass), texture (ribbed,
+    brushed, speckled, woven, hammered), shape (cylindrical, tapered, flared
+    rim, curved handle, wide brim).
+  - Vary colors across objects — do not make everything white or neutral.
+    A tabletop scene naturally has diverse materials and hues.
+  - table.description must describe the actual table as a standalone target:
+    include type, color, shape, material, and legs/base when applicable.
+  - table.complete_table_description must describe a complete standalone table
+    asset for generation. It must always include a complete physical table-like
+    object, with a tabletop and a plausible support structure such as legs,
+    pedestal, frame, or tray body. It must not describe only a surface plane,
+    tabletop patch, texture, or support region.
+  - Do not write generic phrases such as "support surface", "tabletop", or
+    "surface" when table.name is a concrete object such as table, desk, tray,
+    counter, shelf, or floor. Use the concrete class in the description.
+  - For repeated instances, write one object-only description for the shared
+    category. Do not mention instance positions.
+  - If two objects require different descriptions, they must be separate asset
+    rows with distinct names.
+  </description_rules>
+
+  <table_rules>
+  - Do not output a table id. The code will set table.id to "table".
+  - The table field represents the scene table or tabletop target. table.name
+    must be the best class name for that target, such as table, desk,
+    dining_table, coffee_table, workbench, or tabletop.
+  - table.class_candidate must contain exactly five English, singular,
+    canonical class names for segmenting the support target. The first item must
+    equal table.name.
+  - For text inputs, set table.is_complete_visible_table to false.
+  </table_rules>
+
+  <output_schema>
+  {
+    "table": {
+      "name": "table",
+      "description": "A rectangular wooden table with a brown top and four straight legs.",
+      "complete_table_description": "A complete rectangular wooden table with a brown top and four straight legs.",
+      "is_complete_visible_table": false,
+      "class_candidate": ["table", "dining_table", "desk", "wooden_table", "furniture"]
+    },
+    "assets": [
+      {
+        "name": "apple",
+        "description": "A shiny deep-red apple with a smooth curved shape and a small brown stem on top.",
+        "class_candidate": ["apple", "fruit", "red_apple", "food", "produce"],
+        "count": 1
+      },
+      {
+        "name": "coffee_mug",
+        "description": "A glossy navy blue ceramic coffee mug with a curved handle and a slightly flared rim.",
+        "class_candidate": ["coffee_mug", "ceramic_mug", "mug", "cup", "drinkware"],
+        "count": 2
+      }
+    ]
+  }
+  </output_schema>
+
+  <notes>
+  - The top-level object must contain only table and assets.
+  - table must contain only name, description, complete_table_description,
+    is_complete_visible_table, and class_candidate.
+  - Each asset must contain only name, description, class_candidate, and count.
+  - table.name must be a non-empty string.
+  - table.description must be a non-empty string.
+  - table.complete_table_description must be a non-empty string.
+  - table.is_complete_visible_table must be a boolean.
+  - table.class_candidate must be a list of exactly five non-empty strings, and
+    the first item must equal table.name.
+  - assets must be a list.
+  - Each asset.name must be a non-empty string.
+  - Each asset.description must be a non-empty string.
+  - Each asset.class_candidate must be a list of exactly five non-empty strings.
+  - Each asset.count must be an integer greater than or equal to 1.
+  - Output JSON only. Do not include markdown or explanations outside JSON.
+  </notes>
+
+image_system: |
+  <role>
+  You are a careful 3D tabletop scene intake assistant for IMAGE input.
+  </role>
+
+  <task>
+  You will receive one image of a tabletop scene.
+  This is only the first-stage scene intake step:
+  - Extract the object categories and counts on the tabletop.
+  - Extract the visible table or tabletop region that carries the objects, using
+    the fixed output field named table.
+
+  Do not analyze object-object relations, grids, orientations, stacking,
+  inside/container relations, layout, pose, masks, bounding boxes, or
+  segmentation results.
+  </task>
+
+  <object_rules>
+  - CRITICAL: Include EVERY visible object on the tabletop without omission. Do
+    not skip, ignore, or drop any object, no matter how small, blurry, partially
+    occluded, or unfamiliar it appears. An incomplete assets list is the most
+    severe error you can make.
+  - Output only real physical objects that can become 3D asset generation targets.
+  - Do not include the table or tabletop region in assets.
+  - assets is a list of object category groups, not a list of individual object
+    instances.
+  - name must be the most specific English, singular, canonical object class
+    supported by the input.
+  - Prefer a concrete small category over a broad category. For example, output
+    fork instead of utensil, paper_cup instead of container, toy_car instead of
+    toy, remote_control instead of handheld_device, and cereal_box instead of
+    box when those categories are supported by the input.
+  - Use a broad fallback name only when the specific object category cannot be
+    reasonably inferred.
+  - Prefer snake_case names, such as apple, banana, soccer_ball, coffee_mug.
+  - Treat multiple objects as one repeated asset group only when they are
+    effectively the same object type and can share the same name, the same
+    object-only description, and the same class_candidate list without losing
+    important visual identity.
+  - Never output two asset rows with the same name. If the same name would be
+    repeated, merge them into one row and increase count.
+  - If repeated instances are truly the same asset group, output exactly one
+    asset row and set count to the number of visible or described instances.
+  - If two objects need meaningfully different descriptions, names, or
+    class_candidate lists, they are not repeated instances. Output separate
+    asset rows with specific different names.
+  - Only merge objects when they can reasonably be found by the same segmentation
+    prompts from name, class_candidate, and description.
+  - Do not merge visually different subtypes under a broad name. For example,
+    paper_cup and popcorn_cup must be separate rows, not one cup row; snack_bag
+    and paper_bag must be separate rows; remote_control and phone must be
+    separate rows.
+  - Do not output instance IDs such as apple_0 or banana_0. Instance IDs will be
+    generated by code from name and count.
+  - Do not output extra fields such as source_text, source_image_path, image_path,
+    bbox, mask, or id.
+  - class_candidate must contain exactly five English, singular, canonical
+    object class names that could help later image detection or segmentation.
+  - class_candidate must prioritize specific small categories. The first item
+    must equal name. The next items should be specific plausible classes before
+    broader fallback classes.
+  - Do not replace a known small category with a broad category. If the object is
+    a fork, include fork first; broader classes such as utensil or cutlery may
+    appear only later as fallbacks.
+  - For image inputs, if the exact object category is uncertain, use
+    class_candidate to list likely categories from specific to broader, such as
+    remote_control, handheld_device, electronic_device, gadget, tool.
+  </object_rules>
+
+  <description_rules>
+  - table.name, table.description, table.complete_table_description,
+    table.class_candidate, and every asset.description must be non-empty.
+  - Descriptions are used to generate images and then 3D geometry.
+  - Write each description as one concise English sentence, normally 8 to 20
+    words.
+  - Every description must describe a SINGLE STANDALONE OBJECT isolated on a
+    pure-white background. Do NOT mention any other object, the table, the scene,
+    the room, or any background context.
+  - Do NOT include any spatial, positional, or layout information such as
+    "sitting on the table", "placed in front of", "to the left of", "on a
+    surface", "on the tabletop", etc.
+  - When describing an object, first state what the object is, then mention
+    visible texture, color, shape, material, and similar appearance details.
+  - Keep descriptions simple. Focus only on what the object looks like, not
+    where it is or how it relates to anything else.
+  - For IMAGE inputs, include ONLY information supported by the image.
+    Do NOT invent or embellish details not visible in the image. If a color
+    is ambiguous, use a reasonable neutral description ("light-colored",
+    "dark-toned", "metallic").
+  - table.description must describe the actual visible table or tabletop region
+    as a standalone target. If the complete table is visible, describe that
+    physical table directly, including type, color, shape, material, and legs
+    when visible. If only a partial tabletop is visible, describe that visible
+    tabletop area directly.
+  - table.complete_table_description must describe a complete standalone table
+    asset for generation. If only a partial tabletop is visible, convert that
+    partial surface into a complete table description with matching color,
+    material, and texture.
+  - table.complete_table_description must always include a complete physical
+    table-like object, with a tabletop and a plausible support structure such as
+    legs, pedestal, frame, or tray body. It must not describe only a surface
+    plane, tabletop patch, texture, or support region.
+  - Do not write generic phrases such as "support surface", "tabletop", or
+    "surface" when table.name is a concrete object such as table, desk, tray,
+    counter, shelf, or floor. Use the concrete class in the description.
+  - For repeated instances, write one object-only description for the shared
+    category. Do not mention instance positions.
+  - If two objects require different descriptions, they must be separate asset
+    rows with distinct names.
+  </description_rules>
+
+  <table_rules>
+  - Do not output a table id. The code will set table.id to "table".
+  - The table field represents the scene table or tabletop target. table.name
+    must be the best visible class name for that target, such as table, desk,
+    dining_table, coffee_table, workbench, or tabletop.
+  - table.class_candidate must contain exactly five English, singular,
+    canonical class names for segmenting the support target. The first item must
+    equal table.name.
+  - For image inputs, set table.is_complete_visible_table to true only when a
+    mostly complete table or desk is visible and suitable as the final table
+    geometry source. "Mostly complete" means both the tabletop outline/shape is
+    mostly visible and the table/desk legs or support structure are mostly
+    visible.
+  - Set table.is_complete_visible_table to false when only a cropped tabletop
+    patch, partial table surface, or heavily occluded table is visible.
+  - Set table.is_complete_visible_table to false when the tabletop shape is not
+    mostly visible, when the legs/support structure are not visible or only
+    barely visible, or when the image only shows a surface plane.
+  - If table.is_complete_visible_table is false, table.description may describe
+    the visible partial tabletop, but table.complete_table_description must
+    describe a complete table with matching tabletop color, material, and
+    texture.
+  - If table.description describes only a visible surface or tabletop patch,
+    table.complete_table_description must rewrite it as a full table-like asset
+    with matching tabletop appearance plus plausible legs, pedestal, frame, or
+    support body.
+  - For image input with is_complete_visible_table=true ONLY: choose
+    table.object_coverage_percent from exactly one of these four values.
+    Think in terms of SPATIAL SPREAD, not pixel area: imagine drawing the
+    smallest rectangle that encloses ALL objects on the tabletop, then ask
+    what fraction of the table surface that rectangle covers. Even sparse
+    small objects can score high if they are spread across the whole table.
+    10 (objects clustered in one small region, most of the table is bare),
+    30 (objects spread across a noticeable portion but large bare areas remain),
+    50 (objects reach roughly half the table extent in at least one direction),
+    70 (objects span most of the table, even if gaps exist between them).
+    Do not output any other value.
+  - For text input, or when is_complete_visible_table=false: OMIT the
+    object_coverage_percent field entirely. Do not include it in the output.
+  </table_rules>
+
+  <output_schema>
+  {
+    "table": {
+      "name": "table",
+      "description": "A rectangular wooden table with a brown top and four straight legs.",
+      "complete_table_description": "A complete rectangular wooden table with a brown top and four straight legs.",
+      "is_complete_visible_table": true,
+      "class_candidate": ["table", "dining_table", "desk", "wooden_table", "furniture"],
+      "object_coverage_percent": 30
+    },
+    "assets": [
+      {
+        "name": "apple",
+        "description": "A round apple with smooth red skin.",
+        "class_candidate": ["apple", "fruit", "red_apple", "food", "produce"],
+        "count": 1
+      },
+      {
+        "name": "coffee_mug",
+        "description": "A white ceramic coffee mug with a curved handle.",
+        "class_candidate": ["coffee_mug", "ceramic_mug", "mug", "cup", "drinkware"],
+        "count": 2
+      }
+    ]
+  }
+  </output_schema>
+
+  <notes>
+  - The top-level object must contain only table and assets.
+  - table must contain only name, description, complete_table_description,
+    is_complete_visible_table, class_candidate, and optionally object_coverage_percent (only for image input with is_complete_visible_table=true).
+  - Each asset must contain only name, description, class_candidate, and count.
+  - table.name must be a non-empty string.
+  - table.description must be a non-empty string.
+  - table.complete_table_description must be a non-empty string.
+  - table.is_complete_visible_table must be a boolean.
+  - table.class_candidate must be a list of exactly five non-empty strings, and
+    the first item must equal table.name.
+  - assets must be a list.
+  - Each asset.name must be a non-empty string.
+  - Each asset.description must be a non-empty string.
+  - Each asset.class_candidate must be a list of exactly five non-empty strings.
+  - Each asset.count must be an integer greater than or equal to 1.
+  - Output JSON only. Do not include markdown or explanations outside JSON.
+  </notes>
+
+text_user: |
+  Extract the objects and support target from this text:
+  $text
+
+image_user: |
+  Extract tabletop objects and the visible support target from this image.
+
+verifier_system: |
+  <role>
+  You are a strict scene-intake verifier for tabletop object grouping.
+  </role>
+
+  <task>
+  You will receive an original tabletop input and a draft scene_intake JSON.
+  Verify and correct the draft so it follows the same scene_intake schema.
+
+  Your main job is to check:
+  - Whether asset groups are correctly merged or split.
+  - Whether each asset count matches the visible or described instance count.
+  - Whether each name is specific enough for later image segmentation.
+  - Whether table.name, table.description, table.complete_table_description,
+    table.is_complete_visible_table, and table.class_candidate describe the
+    actual table/tabletop target.
+  - For image inputs, independently re-check table.is_complete_visible_table
+    against the original image.
+  - Independently re-check that table.complete_table_description describes a
+    complete standalone table/desk/workbench/tray-like asset, not only a surface
+    plane, tabletop patch, texture, or support region.
+
+  Return the corrected scene_intake JSON. Do not return comments, diffs, or
+  explanations.
+  </task>
+
+  <verification_rules>
+  - CRITICAL: Do NOT drop any object from the draft assets list. Your job is to check and
+    correct counts, names, and class_candidate values — not to drop objects. If an object exists in
+    the draft, it must remain represented in the corrected output. Merging rows that are truly
+    repeated instances into one row with a higher count is allowed. Only add new rows if objects were clearly missed.
+  - assets is a list of object category groups, not individual instances.
+  - Use count to represent repeated instances only when they can share the same
+    name, object-only description, and class_candidate list.
+  - If two objects need different descriptions, names, or class_candidate lists,
+    split them into separate asset rows with specific names.
+  - Never keep two asset rows with the same name. If they are truly repeated
+    instances, merge them and increase count. If they are not truly the same,
+    rename them into more specific different names.
+  - Do not merge visually different subtypes under a broad name. For example,
+    paper_cup and popcorn_cup must be separate rows, not one cup row.
+  - Prefer small, visually segmentable names such as fork, paper_cup,
+    popcorn_cup, soccer_ball, snack_bag, wooden_block.
+  - Avoid broad names such as object, item, utensil, container, cup, bag, toy,
+    box, or device when the input supports a more specific category.
+  - class_candidate must contain exactly five names; the first item must equal
+    name.
+  - table.class_candidate must contain exactly five names; the first item must
+    equal table.name.
+  - Preserve the fixed table field as the table/tabletop target.
+  - For text inputs, table.is_complete_visible_table must be false.
+  - For image inputs, do not trust the draft value of
+    table.is_complete_visible_table. Judge it again from the attached original
+    image.
+  - For image inputs, table.is_complete_visible_table is true only if a mostly
+    complete table is visible and suitable as final table geometry. "Mostly
+    complete" means both the tabletop outline/shape is mostly visible and the
+    table/desk legs or support structure are mostly visible.
+  - If only a partial tabletop is visible, table.is_complete_visible_table must
+    be false and table.complete_table_description must describe a complete table
+    with matching tabletop color, material, and texture.
+  - If the table/desk legs or support structure are not visible, or if the
+    tabletop outline/shape is not mostly visible, table.is_complete_visible_table
+    must be false.
+  - table.complete_table_description must always be a complete physical
+    table-like asset description, including a tabletop and a plausible support
+    structure such as legs, pedestal, frame, or tray body. It must not describe
+    only "a surface", "a tabletop surface", "a plane", "a patch", or only a
+    material/texture.
+  - If the draft table.complete_table_description describes only a visible
+    partial surface, rewrite it into a complete table-like object with matching
+    tabletop color, material, and texture plus a plausible support structure.
+  - For image inputs, only count clearly visible target instances. If uncertain,
+    use the most conservative count supported by the image.
+  - For text inputs, count only objects explicitly stated or strongly implied by
+    the text.
+  - For image input with is_complete_visible_table=true: independently
+    re-assess the tabletop coverage against the original image and pick
+    table.object_coverage_percent from exactly one of 10, 30, 50, 70.
+    Correct the draft value if the bucket does not match the visible
+    spatial spread of the objects across the tabletop (not their pixel area).
+  - For text input or when is_complete_visible_table is false: remove
+    object_coverage_percent from table entirely if it is present in the draft.
+  </verification_rules>
+
+  <output_schema>
+  {
+    "table": {
+      "name": "table",
+      "description": "A rectangular wooden table with a brown top and four straight legs.",
+      "complete_table_description": "A complete rectangular wooden table with a brown top and four straight legs.",
+      "is_complete_visible_table": true,
+      "class_candidate": ["table", "dining_table", "desk", "wooden_table", "furniture"],
+      "object_coverage_percent": 30
+    },
+    "assets": [
+      {
+        "name": "paper_cup",
+        "description": "A small white paper cup with blue printed details.",
+        "class_candidate": ["paper_cup", "disposable_cup", "cup", "drinkware", "container"],
+        "count": 1
+      }
+    ]
+  }
+  </output_schema>
+
+  <notes>
+  - The top-level object must contain only table and assets.
+  - table must contain only name, description, complete_table_description,
+    is_complete_visible_table, class_candidate, and optionally
+    object_coverage_percent (only when is_complete_visible_table is true).
+  - Each asset must contain only name, description, class_candidate, and count.
+  - Output JSON only. Do not include markdown or explanations outside JSON.
+  </notes>
+
+verifier_text_user: |
+  Verify and correct this draft scene_intake JSON against the original text.
+
+  <original_text>
+  $text
+  </original_text>
+
+  <draft_scene_intake>
+  $scene_intake_json
+  </draft_scene_intake>
+
+verifier_image_user: |
+  Verify and correct this draft scene_intake JSON against the attached tabletop image.
+
+  <draft_scene_intake>
+  $scene_intake_json
+  </draft_scene_intake>
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/text_relations.yaml b/embodichain/gen_sim/prompt2scene/prompts/data/text_relations.yaml
new file mode 100644
index 00000000..7a267d09
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/text_relations.yaml
@@ -0,0 +1,110 @@
+name: text_relations
+version: 1
+
+system: |
+  <role>
+  You are a strict tabletop text spatial-relation extractor.
+  </role>
+
+  <task>
+  Extract only spatial constraints that are explicitly stated or strongly and
+  directly implied by the user's text. Do not complete the full scene layout.
+  Do not infer unstated object positions. Output only canonical left_of and
+  front_of relations. Do not add inverse or transitive relations; the program
+  will derive transitive closure later.
+  </task>
+
+  <allowed_outputs>
+  - object_relations: direct object-object relations stated in text.
+  - table_constraints: direct object-to-table 9-grid locations stated in text.
+  - object_layouts: direct object support-pose constraints stated in text.
+  </allowed_outputs>
+
+  <object_relation_rules>
+  - Only use these relation values: left_of, front_of.
+  - If the text says "A is left of B", output exactly A left_of B.
+  - If the text says "A is right of B", output exactly B left_of A.
+  - If the text says "A is in front of B", output exactly A front_of B.
+  - If the text says "A is behind B", output exactly B front_of A.
+  - Do not output right_of or behind.
+  - Do not output transitive relations.
+  - Use only asset names from the provided scene-intake assets.
+  </object_relation_rules>
+
+  <table_grid_rules>
+  - Only output table_constraints when the original text explicitly states an
+    object-to-table region.
+  - Valid grid values are:
+    center, front, back, left_center, right_center, left_front, right_front,
+    left_back, right_back.
+  - Map natural language table regions directly:
+    center -> center; front -> front; back -> back; left side -> left_center;
+    right side -> right_center; front-left -> left_front; front-right ->
+    right_front; back-left -> left_back; back-right -> right_back.
+  - If the text does not explicitly state a table region for an object, do not
+    create a table constraint for that object.
+  - Do not infer table grid locations from object-object relations.
+  - If no explicit table grid constraints are stated, output table_constraints
+    as an empty list.
+  </table_grid_rules>
+
+  <layout_rules>
+  - Output object_layouts only when the text explicitly describes an object's
+    support pose or when the object category itself strongly implies arbitrary
+    layout, such as a ball or round fruit.
+  - is_arbitrary_layout is true when the object does not need a specified support
+    pose before physics simulation and can settle naturally under gravity.
+  - is_arbitrary_layout is false when the object needs a stated/default support
+    pose from the text.
+  - For non-arbitrary objects, reason must describe the support pose, such as
+    standing upright on the table, lying flat on the table, lying on its side, or
+    leaning against another object.
+  </layout_rules>
+
+  <output_schema>
+  {
+    "object_relations": [
+      {
+        "subject": "paper_cup",
+        "relation": "left_of",
+        "object": "plate",
+        "evidence": "The text says the paper cup is left of the plate."
+      }
+    ],
+    "table_constraints": [
+      {
+        "asset": "paper_cup",
+        "grid": "left_front",
+        "evidence": "The text says the paper cup is at the front-left of the table."
+      }
+    ],
+    "object_layouts": [
+      {
+        "asset": "water_bottle",
+        "is_arbitrary_layout": false,
+        "reason": "The text says the water bottle is standing upright on the table."
+      }
+    ]
+  }
+  </output_schema>
+
+  <notes>
+  - If no relation of a type is stated, output an empty list for that field.
+  - Every subject, object, and asset must be one of the provided scene-intake
+    asset names.
+  - The top-level object must contain only object_relations, table_constraints,
+    and object_layouts.
+  - Do not output anchor or inferred table-region fields.
+  - Output JSON only. Do not include markdown or explanations outside JSON.
+  </notes>
+
+user: |
+  Extract explicit text spatial constraints from this prompt.
+
+  <scene_intake_assets>
+  $asset_names
+  </scene_intake_assets>
+
+  <text>
+  $text
+  </text>
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/unified_scene_gen.yaml b/embodichain/gen_sim/prompt2scene/prompts/data/unified_scene_gen.yaml
new file mode 100644
index 00000000..22d33af3
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/unified_scene_gen.yaml
@@ -0,0 +1,225 @@
+name: unified_scene_gen
+version: 1
+
+up_down_flip_check_system: |
+  <role>
+  You are a careful 3D tabletop geometry orientation verifier.
+  </role>
+
+  <task>
+  You will receive:
+  - Image A: the original tabletop scene photo.
+  - Image B: one comparison image containing two fixed front-oblique
+    orthographic renders of generated 3D objects only. Each render has a
+    visible numeric label.
+
+  Your task is to choose the numbered generated render that has the correct
+  up/down orientation relative to the original photo.
+  </task>
+
+  <decision_rules>
+  - Choose selected_number=1 when candidate 1 better matches the original
+    photo's visible object tops and support-facing sides.
+  - Choose selected_number=2 when candidate 2 better matches the original
+    photo's visible object tops and support-facing sides.
+  - Do not request a yaw rotation around the vertical axis. This task is not
+    about left-right ordering or rotating the layout in the image plane; both
+    candidates have already been yaw-aligned by geometric scoring.
+  - The generated renders are not strict top views. They are slightly
+    front-oblique views so object tops and front/side faces may both be visible.
+  - Ignore the missing table/support in the candidate renders; it is
+    intentionally omitted.
+  - If the renders are ambiguous, symmetric, low quality, or insufficient to
+    distinguish up/down orientation, choose selected_number=1.
+  - confidence must be a number from 0 to 1.
+  - reason must be concise and explain the visual evidence.
+  </decision_rules>
+
+  <output_schema>
+  {
+    "selected_number": 1,
+    "confidence": 0.72,
+    "reason": "Candidate 1 shows the visible tops of the objects more consistently with the original image."
+  }
+  </output_schema>
+
+  <notes>
+  - Output JSON only. Do not include markdown or explanations outside JSON.
+  - The JSON object must include all required keys: selected_number,
+    confidence, reason.
+  - selected_number must be exactly 1 or 2.
+  </notes>
+
+up_down_flip_check_user: |
+  Compare the original scene photo with the numbered generated object-only
+  front-oblique comparison image.
+
+  <instruction>
+  Choose which generated render has the correct up/down orientation. Return
+  exactly one JSON object with:
+  - selected_number: 1 or 2
+  - confidence: number from 0 to 1
+  - reason: short string
+  </instruction>
+
+asset_metric_scale_system: |
+  <role>
+  You estimate plausible real-world tabletop object bounding-box dimensions
+  from semantic descriptions.
+  </role>
+
+  <task>
+  Given an object name and description, output one plausible real-world
+  bounding-box dimension in centimeters.
+  </task>
+
+  <rules>
+  - The dimensions must be in centimeters.
+  - The order of the three dimensions does not matter; the program will match
+    shape proportions.
+  - Estimate the full real-world object bbox, not only the visible part.
+  - Use common tabletop object sizes when the description is generic.
+  - Prefer a slightly larger but still plausible tabletop size when uncertain.
+  - Use confidence to express semantic certainty, not visual certainty.
+  - Output JSON only. Do not include markdown or text outside JSON.
+  </rules>
+
+  <output_schema>
+  {
+    "bbox_dims_cm": [18.0, 8.0, 5.0],
+    "confidence": 0.72,
+    "reason": "Typical compact tabletop item size."
+  }
+  </output_schema>
+
+asset_metric_scale_user: |
+  Estimate plausible real-world bounding-box dimensions for this object.
+
+  <object_name>
+  $object_name
+  </object_name>
+
+  <object_description>
+  $object_description
+  </object_description>
+
+  Return exactly one JSON object with:
+  - bbox_dims_cm: one slightly generous plausible size, three positive numbers in centimeters
+  - confidence: number from 0 to 1
+  - reason: short string
+
+image_metric_scale_system: |
+  <role>
+  You estimate plausible real-world tabletop object bounding-box dimensions
+  from a labeled scene image and object descriptions.
+  </role>
+
+  <task>
+  You will receive:
+  - One image with each object marked by a bounding box and its object name.
+  - One JSON list containing object_id, object_name, and object_description
+    for all objects.
+
+  For each object in the JSON list, output one plausible real-world
+  bounding-box dimension in centimeters.
+  </task>
+
+  <rules>
+  - Output one entry for every object_id in the input JSON.
+  - Use the labeled image to understand the object category and relative
+    visible scale in the scene.
+  - Use object_name and object_description as semantic anchors.
+  - The dimensions must be in centimeters.
+  - The order of the three dimensions does not matter.
+  - Prefer a slightly larger but still plausible tabletop size when uncertain.
+  - Use confidence to express semantic certainty.
+  - Output JSON only. Do not include markdown or text outside JSON.
+  </rules>
+
+  <output_schema>
+  {
+    "object_scales": [
+      {
+        "object_id": "interact_cup_0",
+        "bbox_dims_cm": [8.0, 8.0, 12.0],
+        "confidence": 0.78,
+        "reason": "Typical tabletop cup size."
+      }
+    ]
+  }
+  </output_schema>
+
+image_metric_scale_user: |
+  Estimate real-world dimensions for every object in the JSON below.
+
+  <objects_json>
+  $objects_json
+  </objects_json>
+
+  The attached image has bbox + name labels matching object_name. Return exactly
+  one JSON object with:
+  - object_scales: list of objects, one for every input object_id
+    - object_id: copied exactly from input
+    - bbox_dims_cm: one slightly generous plausible size, three positive numbers in centimeters
+    - confidence: number from 0 to 1
+    - reason: short string
+
+text_metric_scale_system: |
+  <role>
+  You estimate plausible real-world tabletop object bounding-box dimensions
+  from a full text scene prompt and object descriptions.
+  </role>
+
+  <task>
+  You will receive:
+  - The user's original scene text.
+  - One JSON list containing object_id, object_name, and object_description
+    for all objects.
+
+  For each object in the JSON list, output one plausible real-world
+  bounding-box dimension in centimeters.
+  </task>
+
+  <rules>
+  - Output one entry for every object_id in the input JSON.
+  - Use the full scene text to infer intended object scale and context. For
+    example, a "small soccer ball on a table" should not be treated as a full
+    regulation soccer ball.
+  - Use object_name and object_description as semantic anchors.
+  - The dimensions must be in centimeters.
+  - The order of the three dimensions does not matter.
+  - Prefer a slightly larger but still plausible tabletop size when uncertain.
+  - Use confidence to express semantic certainty.
+  - Output JSON only. Do not include markdown or text outside JSON.
+  </rules>
+
+  <output_schema>
+  {
+    "object_scales": [
+      {
+        "object_id": "interact_small_soccer_ball_0",
+        "bbox_dims_cm": [6.0, 6.0, 6.0],
+        "confidence": 0.74,
+        "reason": "The scene text describes a small tabletop soccer ball."
+      }
+    ]
+  }
+  </output_schema>
+
+text_metric_scale_user: |
+  Estimate real-world dimensions for every object in the JSON below.
+
+  <user_text>
+  $user_text
+  </user_text>
+
+  <objects_json>
+  $objects_json
+  </objects_json>
+
+  Return exactly one JSON object with:
+  - object_scales: list of objects, one for every input object_id
+    - object_id: copied exactly from input
+    - bbox_dims_cm: one slightly generous plausible size, three positive numbers in centimeters
+    - confidence: number from 0 to 1
+    - reason: short string
diff --git a/embodichain/gen_sim/prompt2scene/utils/__init__.py b/embodichain/gen_sim/prompt2scene/utils/__init__.py
new file mode 100644
index 00000000..8378c49a
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/utils/__init__.py
@@ -0,0 +1,39 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from . import log
+from embodichain.gen_sim.prompt2scene.utils.io import (
+    image_to_data_url,
+    relative_path,
+    write_json,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import (
+    log_api_request_start,
+    log_info,
+    log_warning,
+)
+
+__all__ = [
+    "log",
+    "log_api_request_start",
+    "log_info",
+    "log_warning",
+    "image_to_data_url",
+    "relative_path",
+    "write_json",
+]
diff --git a/embodichain/gen_sim/prompt2scene/utils/io.py b/embodichain/gen_sim/prompt2scene/utils/io.py
new file mode 100644
index 00000000..6057d198
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/utils/io.py
@@ -0,0 +1,66 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import base64
+import json
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+
+__all__ = ["image_to_data_url", "relative_path", "write_json"]
+
+
+def relative_path(path: str | Path, root: Path) -> str:
+    """Give ``path`` relative to ``root``, or ``str(path)`` when it is not under it."""
+    try:
+        inside_root = Path(path).relative_to(root)
+    except ValueError:  # ``relative_to`` rejects paths that are not under ``root``
+        return str(path)
+    return str(inside_root)
+
+
+def write_json(path: Path, payload: dict[str, Any]) -> None:
+    """Serialize ``payload`` to ``path`` as 2-space-indented UTF-8 JSON.
+
+    Args:
+        path: Destination file; missing parent directories are created.
+        payload: Dictionary that ``json.dumps`` can serialize.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # Non-ASCII characters are kept verbatim; a trailing newline ends the file.
+    serialized = json.dumps(payload, indent=2, ensure_ascii=False) + "\n"
+    path.write_text(serialized, encoding="utf-8")
+    # Defensive post-check: fail loudly if no regular file exists afterwards.
+    if not path.is_file():
+        raise FileNotFoundError(f"JSON output was not written: {path}")
+    log_info(f"Wrote JSON: {path}")
+
+
+def image_to_data_url(image_path: Path) -> str:
+    """Encode the file at ``image_path`` as a ``data:<mime>;base64,<payload>`` URL."""
+    extension = image_path.suffix.lower()
+    media_type = "image/png"  # default; also used for ".png" and unknown suffixes
+    if extension in (".jpg", ".jpeg"):
+        media_type = "image/jpeg"
+    elif extension == ".webp":
+        media_type = "image/webp"
+    elif extension == ".gif":
+        media_type = "image/gif"
+    payload = base64.b64encode(image_path.read_bytes()).decode("ascii")
+    return f"data:{media_type};base64,{payload}"
diff --git a/embodichain/gen_sim/prompt2scene/utils/log.py b/embodichain/gen_sim/prompt2scene/utils/log.py
new file mode 100644
index 00000000..47bdfa44
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/utils/log.py
@@ -0,0 +1,62 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+__all__ = ["log_api_request_start", "log_info", "log_warning"]
+
+logging.basicConfig(  # runs at import time and targets the root logger
+    level=logging.INFO,
+    format="%(asctime)s [EmbodiChain %(levelname)s]: %(message)s",
+    datefmt="%H:%M:%S",  # time of day only, no date
+)
+
+_LOGGER = logging.getLogger(__name__)  # module-named logger used by the helpers below
+_LOGGER.setLevel(logging.INFO)
+
+
+def _format_message(level: str, message: str) -> str:
+    """Prefix ``message`` with ``Prompt2Scene: ``; ``level`` is accepted but unused."""
+    return f"Prompt2Scene: {message}"
+
+
+def log_info(message: str) -> None:
+    """Emit ``message`` at INFO level with the ``Prompt2Scene: `` prefix applied."""
+    _LOGGER.log(logging.INFO, _format_message("INFO", message))
+
+
+def log_warning(message: str) -> None:
+    """Emit ``message`` at WARNING level with the ``Prompt2Scene: `` prefix applied."""
+    _LOGGER.log(logging.WARNING, _format_message("WARNING", message))
+
+
+def log_api_request_start(
+    *,
+    step: str,
+    request: str,
+    attempt: int | None = None,
+    **details: Any,
+) -> None:
+    """Log ``api request start`` plus ``key=value`` pairs in a fixed order."""
+    pairs = [("step", step), ("request", request)]
+    if attempt is not None:
+        pairs.append(("attempt", attempt))
+    pairs.extend(details.items())
+    rendered = " ".join(f"{name}={value}" for name, value in pairs)
+    log_info("api request start " + rendered)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/__init__.py
new file mode 100644
index 00000000..393b0022
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/__init__.py
@@ -0,0 +1,41 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    DEBUG_DIRNAME,
+    IMAGE_SEGMENTS_STEP,
+    IMAGE_SPATIAL_RELATIONS_STEP,
+    RAW_MODEL_OUTPUT_FILENAME,
+    SCENE_INTAKE_STEP,
+    STEP_RESULT_FILENAME,
+    TEXT_RELATIONS_STEP,
+    UNIFIED_SCENE_STEP,
+    WorkflowArtifactWriter,
+)
+
+__all__ = [
+    "DEBUG_DIRNAME",
+    "IMAGE_SEGMENTS_STEP",
+    "IMAGE_SPATIAL_RELATIONS_STEP",
+    "RAW_MODEL_OUTPUT_FILENAME",
+    "SCENE_INTAKE_STEP",
+    "STEP_RESULT_FILENAME",
+    "TEXT_RELATIONS_STEP",
+    "UNIFIED_SCENE_STEP",
+    "WorkflowArtifactWriter",
+]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/artifact_writer.py b/embodichain/gen_sim/prompt2scene/workflows/artifact_writer.py
new file mode 100644
index 00000000..6587ccbb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/artifact_writer.py
@@ -0,0 +1,271 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+import re
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+
+__all__ = [
+    "DEBUG_DIRNAME",
+    "IMAGE_SEGMENTS_STEP",
+    "IMAGE_SPATIAL_RELATIONS_STEP",
+    "RAW_MODEL_OUTPUT_FILENAME",
+    "SCENE_INTAKE_STEP",
+    "STEP_RESULT_FILENAME",
+    "TEXT_RELATIONS_STEP",
+    "UNIFIED_SCENE_GEN_STEP",
+    "UNIFIED_SCENE_STEP",
+    "WorkflowArtifactWriter",
+    "debug_dir_path",
+    "debug_round_dir_path",
+    "next_debug_round_dir_path",
+    "next_debug_round_name",
+    "step_dir_path",
+    "step_result_path",
+    "write_debug_json",
+    "write_debug_round_json",
+    "write_next_raw_model_output",
+    "write_raw_model_output",
+    "write_step_result",
+]
+
+STEP_RESULT_FILENAME = "result.json"  # file name of a step's final result
+DEBUG_DIRNAME = "debug"  # name of the debug directory inside a step directory
+RAW_MODEL_OUTPUT_FILENAME = "raw_model_output.json"  # file name used per debug round
+
+SCENE_INTAKE_STEP = "scene_intake"  # step names; a step's directory is <output_root>/<name>
+IMAGE_SEGMENTS_STEP = "image_segments"
+IMAGE_SPATIAL_RELATIONS_STEP = "image_spatial_relations"
+TEXT_RELATIONS_STEP = "text_relations"
+UNIFIED_SCENE_STEP = "unified_scene"
+UNIFIED_SCENE_GEN_STEP = "unified_scene_gen"
+
+DEBUG_ROUND_PATTERN = re.compile(r"^round_(\d+)(?:_|$)")  # group 1 = round index
+
+
+def step_dir_path(output_root: Path, step_name: str) -> Path:
+    """Build ``output_root/<step_name>``, the directory holding one step's artifacts."""
+    return output_root.joinpath(step_name)
+
+
+def step_result_path(output_root: Path, step_name: str) -> Path:
+    """Build the path of the final result JSON inside a step's directory."""
+    return step_dir_path(output_root, step_name).joinpath(STEP_RESULT_FILENAME)
+
+
+def debug_dir_path(output_root: Path, step_name: str) -> Path:
+    """Build the path of the debug directory inside a step's directory."""
+    return step_dir_path(output_root, step_name).joinpath(DEBUG_DIRNAME)
+
+
+def debug_round_dir_path(
+    output_root: Path,
+    step_name: str,
+    round_name: str,
+) -> Path:
+    """Build the debug subdirectory path for a single model/tool round of a step."""
+    return debug_dir_path(output_root, step_name).joinpath(round_name)
+
+
+def next_debug_round_name(
+    output_root: Path,
+    step_name: str,
+    label: str | None = None,
+) -> str:
+    """Return ``round_NNN`` (optionally ``round_NNN_<label>``) one past the highest existing."""
+    round_root = debug_dir_path(output_root, step_name)
+    used_indices = [0]  # sentinel so the first round becomes round_001
+    if round_root.is_dir():
+        for entry in round_root.iterdir():
+            # Only existing round directories count; stray files are ignored.
+            found = DEBUG_ROUND_PATTERN.match(entry.name) if entry.is_dir() else None
+            if found is not None:
+                used_indices.append(int(found.group(1)))
+    next_name = f"round_{max(used_indices) + 1:03d}"
+    if not label:
+        return next_name
+    # A label is reduced to a path-safe token and appended after the index.
+    return f"{next_name}_{_path_token(label)}"
+
+
+def next_debug_round_dir_path(
+    output_root: Path,
+    step_name: str,
+    label: str | None = None,
+) -> Path:
+    """Build the path of the next debug round directory (the directory is not created)."""
+    # The round name is derived from the directories that already exist, so
+    # resolve it first and then hand the concrete name to the path helper.
+    upcoming_round = next_debug_round_name(output_root, step_name, label)
+    round_dir = debug_round_dir_path(output_root, step_name, upcoming_round)
+    return round_dir
+
+
+def write_step_result(
+    output_root: Path,
+    step_name: str,
+    payload: dict[str, Any],
+) -> Path:
+    """Serialize ``payload`` to the step's result JSON file and return that file's path."""
+    result_file = step_result_path(output_root, step_name)
+    write_json(result_file, payload)
+    return result_file
+
+
+def write_debug_json(
+    output_root: Path,
+    step_name: str,
+    round_name: str,
+    filename: str,
+    payload: dict[str, Any],
+) -> Path:
+    """Write ``payload`` as ``filename`` inside a step's named debug round; return the path."""
+    debug_file = debug_round_dir_path(output_root, step_name, round_name) / filename
+    write_json(debug_file, payload)
+    return debug_file
+
+
+def write_debug_round_json(
+    debug_round_dir: Path,
+    filename: str,
+    payload: dict[str, Any],
+) -> Path:
+    """Write ``payload`` as ``filename`` inside a caller-chosen round directory; return the path."""
+    target = debug_round_dir.joinpath(filename)
+    write_json(target, payload)
+    return target
+
+
+def write_raw_model_output(
+    output_root: Path,
+    step_name: str,
+    round_name: str,
+    payload: dict[str, Any],
+) -> Path:
+    """Store one raw structured model output in a step's named debug round; return the path."""
+    # Raw model dumps always use the fixed RAW_MODEL_OUTPUT_FILENAME so every
+    # debug round exposes the file under the same name.
+    fixed_name = RAW_MODEL_OUTPUT_FILENAME
+    raw_output_file = write_debug_json(
+        output_root, step_name, round_name, fixed_name, payload
+    )
+    return raw_output_file
+
+
+def write_next_raw_model_output(
+    output_root: Path,
+    step_name: str,
+    payload: dict[str, Any],
+    label: str | None = None,
+) -> Path:
+    """Store raw model output in the next debug round of the step; return the written path."""
+    upcoming_round = next_debug_round_name(output_root, step_name, label)
+    return write_raw_model_output(output_root, step_name, upcoming_round, payload)
+
+
+class WorkflowArtifactWriter:
+    """Write workflow artifacts for one step under a fixed output root."""
+
+    def __init__(self, output_root: Path, step_name: str) -> None:
+        self._output_root = output_root
+        self._step_name = step_name
+
+    @property
+    def output_root(self) -> Path:  # root directory given to the constructor
+        return self._output_root
+
+    @property
+    def step_name(self) -> str:  # step name given to the constructor
+        return self._step_name
+
+    @property
+    def step_dir(self) -> Path:  # <output_root>/<step_name>
+        return step_dir_path(self._output_root, self._step_name)
+
+    @property
+    def debug_dir(self) -> Path:  # <step_dir>/<DEBUG_DIRNAME>
+        return debug_dir_path(self._output_root, self._step_name)
+
+    @property
+    def result_path(self) -> Path:  # <step_dir>/<STEP_RESULT_FILENAME>
+        return step_result_path(self._output_root, self._step_name)
+
+    def next_debug_round_name(self, label: str | None = None) -> str:
+        """Return the next debug round name for this step, optionally suffixed by ``label``."""
+        return next_debug_round_name(self._output_root, self._step_name, label)
+
+    def next_debug_round_dir(self, label: str | None = None) -> Path:
+        """Return the path of the next debug round directory for this step."""
+        return next_debug_round_dir_path(self._output_root, self._step_name, label)
+
+    def debug_round_dir(self, round_name: str) -> Path:
+        """Return the path of the debug round directory called ``round_name``."""
+        return debug_round_dir_path(self._output_root, self._step_name, round_name)
+
+    def write_step_result(self, payload: dict[str, Any]) -> Path:
+        """Write ``payload`` as the step's final result JSON and return its path."""
+        return write_step_result(self._output_root, self._step_name, payload)
+
+    def write_debug_round_json(
+        self,
+        *,
+        round_name: str,
+        filename: str,
+        payload: dict[str, Any],
+    ) -> Path:
+        """Write ``payload`` as ``filename`` inside the named debug round; return its path."""
+        return write_debug_round_json(
+            self.debug_round_dir(round_name),
+            filename=filename,
+            payload=payload,
+        )
+
+    def write_raw_model_output(
+        self,
+        *,
+        round_name: str,
+        payload: dict[str, Any],
+    ) -> Path:
+        """Write a raw model output into the named debug round; return its path."""
+        return write_raw_model_output(
+            self._output_root,
+            self._step_name,
+            round_name,
+            payload,
+        )
+
+    def write_next_raw_model_output(
+        self,
+        *,
+        payload: dict[str, Any],
+        label: str | None = None,
+    ) -> Path:
+        """Write a raw model output into the next debug round; return its path."""
+        return write_next_raw_model_output(
+            self._output_root,
+            self._step_name,
+            payload,
+            label=label,
+        )
+
+
+def _path_token(value: str) -> str:
+    """Return a path-safe token of at most 80 chars with no edge "_" (fallback "round")."""
+    return re.sub(r"\W", "_", value).strip("_")[:80].rstrip("_") or "round"
diff --git a/embodichain/gen_sim/prompt2scene/workflows/attempt_state.py b/embodichain/gen_sim/prompt2scene/workflows/attempt_state.py
new file mode 100644
index 00000000..15407e78
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/attempt_state.py
@@ -0,0 +1,30 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import TypedDict
+
+__all__ = ["AttemptState"]
+
+
+class AttemptState(TypedDict):
+    """Common retry/error fields for one model-call stage."""
+
+    attempt_count: int  # presumably the attempts made so far — confirm against the updating nodes
+    max_attempts: int  # attempt budget; image_relations retries while attempt_count < max_attempts
+    last_error: str | None  # None is treated as success by the image_relations routing functions
+    errors: list[str]  # presumably the error messages accumulated across attempts — TODO confirm
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/__init__.py
new file mode 100644
index 00000000..ab49ab72
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/__init__.py
@@ -0,0 +1,24 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.graph import (
+    build_image_relations_graph,
+    run_image_relations,
+)
+
+__all__ = ["build_image_relations_graph", "run_image_relations"]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/graph.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/graph.py
new file mode 100644
index 00000000..ff67f3a0
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/graph.py
@@ -0,0 +1,189 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from langgraph.graph import END, StateGraph
+
+from embodichain.gen_sim.prompt2scene.llms import (
+    OpenAICompatibleLLMCfg,
+    build_chat_model,
+)
+from embodichain.gen_sim.prompt2scene.utils import log
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+    format_result_missing_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+    ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.nodes import (
+    call_vlm_filter_initial_segments_node,
+    call_vlm_spatial_layout_node,
+    normalize_asset_segments_node,
+    prepare_segmentation_input_node,
+    retry_missing_by_candidates_node,
+    segment_table_node,
+    segment_by_name_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.state import (
+    ImageRelationsState,
+)
+
+__all__ = ["build_image_relations_graph", "run_image_relations"]
+
+
+def route_after_filter_extra_instances(state: ImageRelationsState) -> str:
+    """Pick "retry" only for a failed filter call with attempts left, else "continue"."""
+    should_retry = (
+        state["last_error"] is not None
+        and state["attempt_count"] < state["max_attempts"]
+    )
+    return "retry" if should_retry else "continue"
+
+
+def route_after_spatial_layout(state: ImageRelationsState) -> str:
+    """Pick "retry" only for a failed layout call with attempts left, else "end"."""
+    should_retry = (
+        state["last_error"] is not None
+        and state["attempt_count"] < state["max_attempts"]
+    )
+    return "retry" if should_retry else "end"
+
+
+def build_image_relations_graph(llm: Any) -> Any:
+    """Build and compile the fixed LangGraph workflow for image segmentation and layout."""
+    graph = StateGraph(ImageRelationsState)
+    graph.add_node("prepare_segmentation_input", prepare_segmentation_input_node)
+    graph.add_node("segment_by_name", segment_by_name_node)
+    graph.add_node(
+        "call_vlm_filter_initial_segments",
+        lambda state: call_vlm_filter_initial_segments_node(state, llm=llm),  # binds llm
+    )
+    graph.add_node(
+        "retry_missing_by_candidates",
+        lambda state: retry_missing_by_candidates_node(state, llm=llm),  # binds llm
+    )
+    graph.add_node("normalize_asset_segments", normalize_asset_segments_node)
+    graph.add_node(
+        "segment_table",
+        lambda state: segment_table_node(state, llm=llm),  # binds llm
+    )
+    graph.add_node(
+        "call_vlm_spatial_layout",
+        lambda state: call_vlm_spatial_layout_node(state, llm=llm),  # binds llm
+    )
+
+    graph.set_entry_point("prepare_segmentation_input")  # linear pipeline with two self-loops
+    graph.add_edge("prepare_segmentation_input", "segment_by_name")
+    graph.add_edge("segment_by_name", "call_vlm_filter_initial_segments")
+    graph.add_conditional_edges(
+        "call_vlm_filter_initial_segments",
+        route_after_filter_extra_instances,
+        {
+            "retry": "call_vlm_filter_initial_segments",  # self-loop while attempts remain
+            "continue": "retry_missing_by_candidates",  # taken on success and on exhaustion
+        },
+    )
+    graph.add_edge("retry_missing_by_candidates", "normalize_asset_segments")
+    graph.add_edge("normalize_asset_segments", "segment_table")
+    graph.add_edge("segment_table", "call_vlm_spatial_layout")
+    graph.add_conditional_edges(  # NOTE(review): same attempt_count keys as the filter loop — confirm reset
+        "call_vlm_spatial_layout",
+        route_after_spatial_layout,
+        {
+            "retry": "call_vlm_spatial_layout",  # self-loop while attempts remain
+            "end": END,  # taken on success and on exhaustion
+        },
+    )
+    return graph.compile()
+
+
+def run_image_relations(
+    request: Prompt2SceneInput,
+    *,
+    scene_intake: SceneIntakeSpec,
+    llm_cfg: OpenAICompatibleLLMCfg,
+    output_root: Path,
+) -> ImageRelationSpec:
+    """Run the image-relations graph and return its spec, raising RuntimeError on failure."""
+    llm = build_chat_model(llm_cfg)
+    graph = build_image_relations_graph(llm)
+    result = graph.invoke(  # initial state: no groups, no result, empty retry bookkeeping
+        {
+            "request": request,
+            "scene_intake": scene_intake,
+            "output_root": output_root,
+            "segment_groups": [],
+            "raw_model_output": None,
+            "image_relations": None,
+            "attempt_count": 0,
+            "max_attempts": llm_cfg.max_attempts,
+            "last_error": None,
+            "errors": [],
+        }
+    )
+
+    image_relations = result.get("image_relations")
+    if (  # success: status is "ok" and an anchor is present
+        image_relations is not None
+        and image_relations.status == "ok"
+        and image_relations.anchor is not None
+    ):
+        return image_relations
+    if image_relations is not None and image_relations.status == "ok":  # ok, but no anchor
+        error = format_result_missing_error(
+            "Image relations",
+            "spatial layout",
+            attempt_count=result.get("attempt_count", 0),
+            last_error=result.get("last_error"),
+            errors=result.get("errors", []),
+        )
+        log.log_warning(error)
+        raise RuntimeError(error)
+    if image_relations is not None:  # status is not "ok": report every non-ok group
+        failed_groups = [
+            group.to_manifest()
+            for group in image_relations.groups
+            if group.status != "ok"
+        ]
+        if (  # the table group is tracked separately from the object groups
+            image_relations.table_group is not None
+            and image_relations.table_group.status != "ok"
+        ):
+            failed_groups.append(image_relations.table_group.to_manifest())
+        error = (
+            "Image relations failed to align all image segments. "
+            f"Failed groups: {failed_groups}"
+        )
+        log.log_warning(error)
+        raise RuntimeError(error)
+
+    error = format_result_missing_error(  # the graph produced no ImageRelationSpec at all
+        "Image relations",
+        "ImageRelationSpec",
+        attempt_count=result.get("attempt_count", 0),
+        last_error=result.get("last_error"),
+        errors=result.get("errors", []),
+    )
+    log.log_warning(error)
+    raise RuntimeError(error)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/nodes.py
new file mode 100644
index 00000000..ab8b6952
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/nodes.py
@@ -0,0 +1,511 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+    decode_rle_mask,
+    draw_numbered_masks,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+    ImageAssetSegment,
+    ImageRelationGroup,
+    ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import InputKind
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+    FILTER_EXTRA_INSTANCES_JSON_SCHEMA,
+    SPATIAL_LAYOUT_JSON_SCHEMA,
+)
+from embodichain.gen_sim.prompt2scene.utils import (
+    log_api_request_start,
+    log,
+)
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    IMAGE_SEGMENTS_STEP,
+    IMAGE_SPATIAL_RELATIONS_STEP,
+    WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.utils import (
+    append_unique,
+    apply_spatial_layout_output,
+    asset_bbox_label,
+    draw_labeled_bboxes,
+    expand_asset_ids,
+    filter_group_segments_with_vlm,
+    filter_segments_with_vlm,
+    merge_non_overlapping_segments,
+    prompt_text,
+    path_token,
+    require_image_path,
+    segment_prompt,
+    segments_from_response,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.prompts import (
+    build_filter_extra_instances_messages,
+    build_spatial_layout_messages,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.state import (
+    ImageRelationsState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.llm_output import (
+    call_structured_json_model_step,
+    is_model_output_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+    format_attempt_error,
+)
+
+__all__ = [
+    "call_vlm_filter_extra_instances_node",
+    "call_vlm_filter_initial_segments_node",
+    "call_vlm_spatial_layout_node",
+    "normalize_asset_segments_node",
+    "prepare_segmentation_input_node",
+    "retry_missing_by_candidates_node",
+    "segment_table_node",
+    "segment_by_name_node",
+]
+
+def prepare_segmentation_input_node(state: ImageRelationsState) -> dict[str, object]:
+    """Seed one empty, pending segmentation group per scene-intake asset entry."""
+    request = state["request"]
+    if request.input_kind != InputKind.IMAGE or request.image_path is None:
+        raise ValueError("Image relations requires an image input.")
+
+    pending_groups = [
+        {
+            "name": asset.name,
+            "description": asset.description,
+            "asset_ids": expand_asset_ids(asset.id, asset.count),
+            "class_candidate": list(asset.class_candidate),
+            "segments": [],
+            "tried_prompts": [],
+            "debug_images": [],
+            "status": "pending",
+            "error": None,
+            "expected_count": asset.count,
+        }
+        for asset in state["scene_intake"].assets
+    ]
+    return {"segment_groups": pending_groups}
+
+
+def segment_by_name_node(state: ImageRelationsState) -> dict[str, object]:
+    """Issue one segmentation request per group, prompted with the group's name."""
+    image_path = require_image_path(state)
+    updated_groups = []
+    for original_group in state["segment_groups"]:
+        name_prompt = prompt_text(original_group["name"])
+        response = segment_prompt(image_path=image_path, prompt=name_prompt)
+        updated = dict(original_group)  # shallow copy: key assignments stay off the state dict
+        updated["tried_prompts"] = append_unique(updated["tried_prompts"], name_prompt)
+        updated["segments"] = segments_from_response(
+            group=updated,
+            response=response,
+            source_prompt=name_prompt,
+        )
+        updated_groups.append(updated)
+    return {"segment_groups": updated_groups}
+
+
+def call_vlm_filter_extra_instances_node(
+    state: ImageRelationsState,
+    *,
+    llm: Any,
+) -> dict[str, object]:
+    """Alias kept for compatibility; forwards to call_vlm_filter_initial_segments_node."""
+    return call_vlm_filter_initial_segments_node(state=state, llm=llm)
+
+
+def call_vlm_filter_initial_segments_node(
+    state: ImageRelationsState,
+    *,
+    llm: Any,
+) -> dict[str, object]:
+    """Run the VLM segment filter in its "initial" stage over the name-based masks."""
+    return filter_segments_with_vlm(stage="initial", state=state, llm=llm)
+def retry_missing_by_candidates_node(
+    state: ImageRelationsState,
+    *,
+    llm: Any,
+) -> dict[str, object]:
+    """Top up under-filled groups using later class candidates, then the description."""
+    image_path = require_image_path(state)
+    artifact_writer = WorkflowArtifactWriter(state["output_root"], IMAGE_SEGMENTS_STEP)
+    segment_groups = []
+    for group in state["segment_groups"]:
+        group = dict(group)  # shallow copy: key assignments stay off the state's dict
+        segments = group["segments"]
+        expected_count = group["expected_count"]
+        for candidate_name in group["class_candidate"][1:]:  # candidate 0 is skipped
+            if len(segments) >= expected_count:
+                break  # the group already has enough segments
+            prompt = prompt_text(candidate_name)
+            if prompt in group["tried_prompts"]:
+                continue  # never re-run a prompt that was already tried
+            response = segment_prompt(image_path=image_path, prompt=prompt)
+            group["tried_prompts"] = append_unique(group["tried_prompts"], prompt)
+            new_segments = segments_from_response(
+                group=group,
+                response=response,
+                source_prompt=prompt,
+            )
+            new_segments = filter_group_segments_with_vlm(  # VLM check on the new masks only
+                llm=llm,
+                image_path=image_path,
+                artifact_writer=artifact_writer,
+                group=group,
+                segments=new_segments,
+                stage=f"fallback_{path_token(prompt)}",
+            )
+            segments = merge_non_overlapping_segments(
+                existing=segments,
+                incoming=new_segments,
+                limit=expected_count,
+            )
+        if len(segments) < expected_count:  # last resort: the free-text description
+            description_prompt = str(group.get("description") or "").strip()
+            if description_prompt and description_prompt not in group["tried_prompts"]:
+                response = segment_prompt(
+                    image_path=image_path,
+                    prompt=description_prompt,
+                )
+                group["tried_prompts"] = append_unique(
+                    group["tried_prompts"],
+                    description_prompt,
+                )
+                new_segments = segments_from_response(
+                    group=group,
+                    response=response,
+                    source_prompt=description_prompt,
+                )
+                new_segments = filter_group_segments_with_vlm(
+                    llm=llm,
+                    image_path=image_path,
+                    artifact_writer=artifact_writer,
+                    group=group,
+                    segments=new_segments,
+                    stage="fallback_description",
+                )
+                segments = merge_non_overlapping_segments(
+                    existing=segments,
+                    incoming=new_segments,
+                    limit=expected_count,
+                )
+        group["segments"] = segments  # may still be short; the count is checked by a later node
+        segment_groups.append(group)
+    return {"segment_groups": segment_groups}
+
+
+def normalize_asset_segments_node(state: ImageRelationsState) -> dict[str, object]:
+    """Check per-group segment counts, map segments to asset IDs, and write the result."""
+    image_path = require_image_path(state)
+    asset_segments: list[ImageAssetSegment] = []
+    relation_groups: list[ImageRelationGroup] = []
+    status = "ok"  # becomes "failed" as soon as any group's count mismatches
+
+    for group in state["segment_groups"]:
+        expected_count = group["expected_count"]
+        segments = group["segments"]
+        group_status = "ok"
+        error = None
+        if len(segments) < expected_count:
+            group_status = "failed"
+            error = "missing_segments"
+            status = "failed"
+        elif len(segments) > expected_count:
+            group_status = "failed"
+            error = "extra_segments"
+            status = "failed"
+
+        relation_groups.append(  # recorded for every group, ok or failed
+            ImageRelationGroup(
+                name=group["name"],
+                expected_count=expected_count,
+                detected_count=len(segments),
+                status=group_status,
+                tried_prompts=list(group["tried_prompts"]),
+                asset_ids=list(group["asset_ids"]),
+                debug_images=list(group["debug_images"]),
+                error=error,
+            )
+        )
+
+        if group_status != "ok":
+            continue  # failed groups contribute no asset segments
+        for asset_id, segment in zip(group["asset_ids"], segments):  # paired in list order
+            asset_segments.append(
+                ImageAssetSegment(
+                    asset_id=asset_id,
+                    name=group["name"],
+                    segment_id=segment["segment_id"],
+                    bbox_xyxy=list(segment["bbox_xyxy"]),
+                    score=float(segment["score"]),
+                    source_prompt=segment["source_prompt"],
+                    mask_rle=segment.get("mask_rle"),
+                )
+            )
+
+    bbox_name_image_path = None
+    if status == "ok":  # the labeled bbox image is drawn only when every group matched
+        artifact_writer = WorkflowArtifactWriter(
+            state["output_root"],
+            IMAGE_SEGMENTS_STEP,
+        )
+        bbox_name_image_path = str(
+            draw_labeled_bboxes(
+                image_path=image_path,
+                boxes=[
+                    {
+                        "bbox_xyxy": segment.bbox_xyxy,
+                        "label": asset_bbox_label(segment.asset_id),
+                    }
+                    for segment in asset_segments
+                ],
+                output_path=artifact_writer.step_dir / "asset_segments_bbox_name.png",
+            )
+        )
+
+    image_relations = ImageRelationSpec(
+        status=status,
+        image_path=str(image_path),
+        asset_segments=asset_segments,
+        groups=relation_groups,
+        bbox_name_image_path=bbox_name_image_path,
+    )
+    WorkflowArtifactWriter(  # the step result is written for ok and failed outcomes alike
+        state["output_root"],
+        IMAGE_SEGMENTS_STEP,
+    ).write_step_result(image_relations.to_segmentation_manifest())
+    return {"image_relations": image_relations}
+
+
+def segment_table_node(
+    state: ImageRelationsState,
+    *,
+    llm: Any,  # NOTE(review): not referenced anywhere in this function body
+) -> dict[str, object]:
+    """Segment the table/support target after object segmentation is complete."""
+    image_relations = state["image_relations"]
+    if image_relations is None or image_relations.status != "ok":
+        return {}  # no state update unless object segmentation succeeded
+
+    image_path = require_image_path(state)
+    table = state["scene_intake"].table
+    artifact_writer = WorkflowArtifactWriter(state["output_root"], IMAGE_SEGMENTS_STEP)
+    group = {  # same keys as the object groups, with exactly one expected instance
+        "name": table.name,
+        "description": table.description,
+        "asset_ids": [table.id],
+        "class_candidate": list(table.class_candidate),
+        "segments": [],
+        "tried_prompts": [],
+        "debug_images": [],
+        "status": "pending",
+        "error": None,
+        "expected_count": 1,
+    }
+    segments: list[dict[str, Any]] = []
+
+    for prompt in _table_segmentation_prompts(group):
+        if len(segments) >= 1:
+            break  # stop at the first prompt that yielded a selected segment
+        response = segment_prompt(image_path=image_path, prompt=prompt)
+        group["tried_prompts"] = append_unique(group["tried_prompts"], prompt)
+        new_segments = segments_from_response(
+            group=group,
+            response=response,
+            source_prompt=prompt,
+        )
+        _write_table_candidate_debug_image(
+            image_path=image_path,
+            artifact_writer=artifact_writer,
+            group=group,
+            segments=new_segments,
+            stage=f"table_{path_token(prompt)}",
+        )
+        selected_segment = _select_largest_table_segment(new_segments)
+        if selected_segment is not None:
+            segments = [selected_segment]  # at most one table segment is kept
+
+    group_status = "ok" if len(segments) == 1 else "failed"
+    error = None if group_status == "ok" else "missing_table_segment"
+    table_group = ImageRelationGroup(
+        name=group["name"],
+        expected_count=1,
+        detected_count=len(segments),
+        status=group_status,
+        tried_prompts=list(group["tried_prompts"]),
+        asset_ids=[table.id],
+        debug_images=list(group["debug_images"]),
+        error=error,
+    )
+    table_segment = None
+    if group_status == "ok":
+        segment = segments[0]
+        table_segment = ImageAssetSegment(
+            asset_id=table.id,
+            name=table.name,
+            segment_id=segment["segment_id"],
+            bbox_xyxy=list(segment["bbox_xyxy"]),
+            score=float(segment["score"]),
+            source_prompt=segment["source_prompt"],
+            mask_rle=segment.get("mask_rle"),
+        )
+
+    updated_image_relations = ImageRelationSpec(  # copy of the spec plus the table fields
+        status="ok" if group_status == "ok" else "failed",  # now follows the table outcome
+        image_path=image_relations.image_path,
+        asset_segments=image_relations.asset_segments,
+        groups=image_relations.groups,
+        table_segment=table_segment,
+        table_group=table_group,
+        bbox_name_image_path=image_relations.bbox_name_image_path,
+        anchor=image_relations.anchor,
+        x_order=image_relations.x_order,
+        y_order=image_relations.y_order,
+        asset_layouts=image_relations.asset_layouts,
+    )
+    artifact_writer.write_step_result(updated_image_relations.to_segmentation_manifest())
+    return {"image_relations": updated_image_relations}
+
+
+def _table_segmentation_prompts(group: dict[str, Any]) -> list[str]:
+    """Build the ordered, de-duplicated prompt list for table segmentation.
+
+    The asset name comes first, then every class candidate after the first
+    one, then the stripped free-text description. Empty prompts are dropped
+    and only the first occurrence of a repeated prompt is kept.
+    """
+    candidates = [
+        prompt_text(raw_name)
+        for raw_name in (group["name"], *group["class_candidate"][1:])
+    ]
+    candidates.append(str(group.get("description") or "").strip())
+    # dict.fromkeys keeps first-seen order while discarding repeats.
+    return list(dict.fromkeys(text for text in candidates if text))
+
+
+def _write_table_candidate_debug_image(
+    *,
+    image_path: Path,
+    artifact_writer: WorkflowArtifactWriter,
+    group: dict[str, Any],
+    segments: list[dict[str, Any]],
+    stage: str,
+) -> None:
+    """Render the numbered candidate masks for one table prompt to disk.
+
+    Does nothing when ``segments`` is empty. Otherwise ``mask.png`` is drawn
+    in the directory of the next debug round and its path is recorded in
+    ``group["debug_images"]`` (mutated in place); no VLM filtering happens.
+    """
+    if segments:
+        label = f"{stage}_{group['name']}"
+        round_name = artifact_writer.next_debug_round_name(label=label)
+        output_path = artifact_writer.debug_round_dir(round_name) / "mask.png"
+        rendered = draw_numbered_masks(
+            image_path=image_path, segments=segments, output_path=output_path
+        )
+        group["debug_images"] = append_unique(group["debug_images"], str(rendered))
+
+
+def _select_largest_table_segment(
+    segments: list[dict[str, Any]],
+) -> dict[str, Any] | None:
+    """Return the candidate with the greatest ``_segment_area``, if any.
+
+    Ties keep the earliest candidate; an empty list yields ``None``."""
+    return max(segments, key=_segment_area) if segments else None
+
+
+def _segment_area(segment: dict[str, Any]) -> float:
+    """Area of a segment: non-zero mask pixels, else the clamped bbox area."""
+    rle = segment.get("mask_rle")
+    if rle is not None:
+        try:
+            counts = decode_rle_mask(rle).convert("L").histogram()
+            return float(sum(counts[1:]))  # bin 0 holds the zero-valued pixels
+        except Exception:
+            pass  # best effort: fall back to the bounding-box estimate
+    x_min, y_min, x_max, y_max = (float(value) for value in segment["bbox_xyxy"])
+    return max(0.0, x_max - x_min) * max(0.0, y_max - y_min)
+
+
+def call_vlm_spatial_layout_node(
+    state: ImageRelationsState,
+    *,
+    llm: Any,
+) -> dict[str, object]:
+    """Query the VLM for object ordering, anchor grid, and layout states.
+
+    Returns an empty patch when ``image_relations`` is missing or its status
+    is not ``"ok"``. On success the patch carries the updated
+    ``image_relations`` and clears ``last_error``. Model-output errors and
+    ``ValueError`` raised inside the model step are logged and reported via
+    ``last_error``/``errors``; any other exception propagates.
+
+    Raises:
+        ValueError: If ``image_relations.bbox_name_image_path`` is ``None``.
+    """
+    relations = state["image_relations"]
+    if relations is None or relations.status != "ok":
+        return {}
+    if relations.bbox_name_image_path is None:
+        raise ValueError("Image spatial layout requires bbox_name_image_path.")
+
+    attempt = state["attempt_count"] + 1
+    asset_ids = [segment.asset_id for segment in relations.asset_segments]
+    writer = WorkflowArtifactWriter(state["output_root"], IMAGE_SPATIAL_RELATIONS_STEP)
+    messages = build_spatial_layout_messages(
+        bbox_name_image_path=Path(relations.bbox_name_image_path),
+        asset_ids=asset_ids,
+    )
+
+    try:
+        log_api_request_start(
+            step=IMAGE_SPATIAL_RELATIONS_STEP, request="spatial_layout", attempt=attempt
+        )
+        raw_model_output = call_structured_json_model_step(
+            llm=llm,
+            schema=SPATIAL_LAYOUT_JSON_SCHEMA,
+            messages=messages,
+            context="Image spatial layout",
+            step_name=IMAGE_SPATIAL_RELATIONS_STEP,
+            output_root=None,
+            attempt_count=attempt,
+            raw_output_label="spatial_layout",
+            artifact_writer=writer,
+        )
+        updated = apply_spatial_layout_output(
+            image_relations=relations, raw_model_output=raw_model_output
+        )
+        writer.write_step_result(updated.to_spatial_manifest())
+    except Exception as exc:
+        if not (is_model_output_error(exc) or isinstance(exc, ValueError)):
+            raise  # not a recoverable model-output problem
+        message = format_attempt_error("Image relations spatial layout", attempt, exc)
+        log.log_warning(message)
+        return {
+            "attempt_count": attempt,
+            "last_error": message,
+            "errors": state["errors"] + [message],
+        }
+    return {"attempt_count": attempt, "image_relations": updated, "last_error": None}
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/prompts.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/prompts.py
new file mode 100644
index 00000000..f974f442
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/prompts.py
@@ -0,0 +1,113 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.prompts import render_prompt
+from embodichain.gen_sim.prompt2scene.utils.io import image_to_data_url
+
+__all__ = [
+    "build_filter_extra_instances_messages",
+    "build_spatial_layout_messages",
+]
+
+IMAGE_RELATIONS_PROMPT_NAME = "image_relations.yaml"  # first argument of every render_prompt call in this module
+
+
+def build_filter_extra_instances_messages(
+    *,
+    debug_image_path: Path,
+    name: str,
+    description: str,
+    expected_count: int,
+    class_candidate: list[str],
+) -> list[dict[str, Any]]:
+    """Assemble the system/user chat messages for VLM extra-mask filtering.
+
+    The user turn pairs the rendered ``filter_extra_instances_user`` prompt
+    with the image at ``debug_image_path`` embedded via ``image_to_data_url``.
+    Underscores in ``name`` and in each class candidate are replaced by
+    spaces before rendering; ``expected_count`` is rendered as a string.
+    """
+    system_prompt = render_prompt(
+        IMAGE_RELATIONS_PROMPT_NAME,
+        prompt_key="filter_extra_instances_system",
+    )
+    template_values = {
+        "name": name.replace("_", " "),
+        "description": description,
+        "expected_count": str(expected_count),
+        "class_candidate": ", ".join(
+            candidate.replace("_", " ") for candidate in class_candidate
+        ),
+    }
+    user_prompt = render_prompt(
+        IMAGE_RELATIONS_PROMPT_NAME,
+        template_values,
+        prompt_key="filter_extra_instances_user",
+    )
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": image_to_data_url(debug_image_path)},
+    }
+    return [
+        {"role": "system", "content": system_prompt},
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": user_prompt}, image_part],
+        },
+    ]
+
+
+def build_spatial_layout_messages(
+    *,
+    bbox_name_image_path: Path,
+    asset_ids: list[str],
+) -> list[dict[str, Any]]:
+    """Assemble the system/user chat messages for VLM spatial layout.
+
+    The user turn contains the rendered ``spatial_layout_user`` prompt, which
+    receives ``asset_ids`` as one ``- <id>`` bullet per line, followed by the
+    image at ``bbox_name_image_path`` embedded via ``image_to_data_url``.
+    """
+    system_prompt = render_prompt(
+        IMAGE_RELATIONS_PROMPT_NAME,
+        prompt_key="spatial_layout_system",
+    )
+    bullet_list = "\n".join(f"- {asset_id}" for asset_id in asset_ids)
+    user_prompt = render_prompt(
+        IMAGE_RELATIONS_PROMPT_NAME,
+        {"asset_ids": bullet_list},
+        prompt_key="spatial_layout_user",
+    )
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": image_to_data_url(bbox_name_image_path)},
+    }
+    # Text instructions first, then the annotated image.
+    return [
+        {"role": "system", "content": system_prompt},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                image_part,
+            ],
+        },
+    ]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/schema.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/schema.py
new file mode 100644
index 00000000..500f7c70
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/schema.py
@@ -0,0 +1,250 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.spatial import GRID_VALUE_LIST
+
+__all__ = [
+    "FILTER_EXTRA_INSTANCES_JSON_SCHEMA",
+    "ImageAnchor",
+    "ImageAssetLayout",
+    "ImageAssetSegment",
+    "ImageRelationGroup",
+    "ImageRelationSpec",
+    "SPATIAL_LAYOUT_JSON_SCHEMA",
+]
+
+FILTER_EXTRA_INSTANCES_JSON_SCHEMA: dict[str, Any] = {  # JSON schema for the VLM extra-mask filter reply
+    "title": "FilterExtraImageInstancesOutput",
+    "type": "object",
+    "additionalProperties": False,  # only the two properties below are accepted
+    "properties": {
+        "extra_instance_numbers": {
+            "type": "array",  # may be empty: no minItems is set
+            "description": "1-based mask numbers that should be removed.",
+            "items": {"type": "integer", "minimum": 1},
+        },
+        "reason": {
+            "type": "string",
+            "description": "Brief reason for the removal decision.",
+        },
+    },
+    "required": ["extra_instance_numbers", "reason"],
+}
+
+SPATIAL_LAYOUT_JSON_SCHEMA: dict[str, Any] = {  # JSON schema for the VLM spatial-layout reply
+    "title": "ImageSpatialLayoutOutput",
+    "type": "object",
+    "additionalProperties": False,
+    "properties": {
+        "anchor": {
+            "type": "object",
+            "additionalProperties": False,
+            "properties": {
+                "asset_id": {"type": "string", "minLength": 1},
+                "grid": {
+                    "type": "string",
+                    "enum": GRID_VALUE_LIST,  # allowed values are imported from workflows.spatial
+                },
+                "reason": {"type": "string"},  # may be empty: no minLength here
+            },
+            "required": ["asset_id", "grid", "reason"],
+        },
+        "x_order": {  # same item shape as y_order below
+            "type": "array",
+            "description": "Asset-id groups ordered from left to right.",
+            "items": {
+                "type": "array",
+                "items": {"type": "string", "minLength": 1},
+                "minItems": 1,
+            },
+            "minItems": 1,
+        },
+        "y_order": {
+            "type": "array",
+            "description": "Asset-id groups ordered from front to back.",
+            "items": {
+                "type": "array",
+                "items": {"type": "string", "minLength": 1},
+                "minItems": 1,
+            },
+            "minItems": 1,
+        },
+        "asset_states": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "additionalProperties": True,  # NOTE(review): the only object here that allows extra keys — confirm intended
+                "properties": {
+                    "asset_id": {"type": "string", "minLength": 1},
+                    "is_arbitrary_layout": {"type": "boolean"},
+                    "reason": {"type": "string", "minLength": 1},
+                },
+                "required": [
+                    "asset_id",
+                    "is_arbitrary_layout",
+                    "reason",
+                ],
+            },
+        },
+    },
+    "required": ["anchor", "x_order", "y_order", "asset_states"],
+}
+
+
+@dataclass(frozen=True)
+class ImageAssetSegment:
+    """One segmentation candidate matched to a single scene-intake asset."""
+
+    asset_id: str
+    name: str
+    segment_id: str
+    bbox_xyxy: list[float]
+    score: float
+    source_prompt: str
+    mask_rle: dict[str, Any] | None = None
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the fields as a plain dict in declaration order.
+
+        ``bbox_xyxy`` is copied into a new list; ``mask_rle`` is not copied.
+        """
+        manifest: dict[str, Any] = {}
+        for key in ("asset_id", "name", "segment_id", "bbox_xyxy", "score"):
+            manifest[key] = getattr(self, key)
+        manifest["bbox_xyxy"] = list(self.bbox_xyxy)
+        manifest.update(source_prompt=self.source_prompt, mask_rle=self.mask_rle)
+        return manifest
+
+
+@dataclass(frozen=True)
+class ImageRelationGroup:
+    """Segmentation alignment outcome for the assets that share one name.
+
+    Field values are stored as given; nothing in this class validates
+    ``status``, ``error`` or the two counts.
+    """
+
+    name: str
+    expected_count: int
+    detected_count: int
+    status: str
+    tried_prompts: list[str] = field(default_factory=list)
+    asset_ids: list[str] = field(default_factory=list)
+    debug_images: list[str] = field(default_factory=list)
+    error: str | None = None
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the group as a plain dict; list fields are shallow-copied."""
+        scalar_keys = ("name", "expected_count", "detected_count", "status")
+        manifest: dict[str, Any] = {key: getattr(self, key) for key in scalar_keys}
+        for list_key in ("tried_prompts", "asset_ids", "debug_images"):
+            manifest[list_key] = list(getattr(self, list_key))
+        manifest["error"] = self.error
+        return manifest
+
+
+@dataclass(frozen=True)
+class ImageAnchor:
+    """Anchor asset that ties the relative ordering to the table grid.
+
+    ``reason`` is free text and defaults to an empty string.
+    """
+
+    asset_id: str
+    grid: str
+    reason: str = ""
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the three fields as a plain dict in declaration order."""
+        keys = ("asset_id", "grid", "reason")
+        return {key: getattr(self, key) for key in keys}
+
+
+@dataclass(frozen=True)
+class ImageAssetLayout:
+    """Layout state reported for one image asset instance.
+
+    ``reason`` is free text and defaults to an empty string.
+    """
+
+    asset_id: str
+    is_arbitrary_layout: bool
+    reason: str = ""
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the three fields as a plain dict in declaration order."""
+        keys = ("asset_id", "is_arbitrary_layout", "reason")
+        return {key: getattr(self, key) for key in keys}
+
+
+@dataclass(frozen=True)
+class ImageRelationSpec:
+    """Segmentation alignment plus spatial relations for one input image.
+
+    ``status`` is carried on the instance only; none of the manifest
+    methods below write it into their output.
+    """
+
+    status: str
+    image_path: str
+    asset_segments: list[ImageAssetSegment]
+    groups: list[ImageRelationGroup]
+    table_segment: ImageAssetSegment | None = None
+    table_group: ImageRelationGroup | None = None
+    bbox_name_image_path: str | None = None
+    anchor: ImageAnchor | None = None
+    x_order: list[list[str]] = field(default_factory=list)
+    y_order: list[list[str]] = field(default_factory=list)
+    asset_layouts: list[ImageAssetLayout] = field(default_factory=list)
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the segmentation manifest overlaid with the spatial one."""
+        return {**self.to_segmentation_manifest(), **self.to_spatial_manifest()}
+
+    def to_segmentation_manifest(self) -> dict[str, Any]:
+        """Return only the segmentation alignment result as plain data."""
+        segments = [item.to_manifest() for item in self.asset_segments]
+        groups = [item.to_manifest() for item in self.groups]
+        table_seg = self.table_segment.to_manifest() if self.table_segment else None
+        table_grp = self.table_group.to_manifest() if self.table_group else None
+        return {
+            "image_path": self.image_path,
+            "bbox_name_image_path": self.bbox_name_image_path,
+            "asset_segments": segments,
+            "groups": groups,
+            "table_segment": table_seg,
+            "table_group": table_grp,
+        }
+
+    def to_spatial_manifest(self) -> dict[str, Any]:
+        """Return only the anchor, the two orderings, and the layout states."""
+        anchor = self.anchor.to_manifest() if self.anchor else None
+        spatial_order = {
+            "left_to_right": [list(ids) for ids in self.x_order],
+            "front_to_back": [list(ids) for ids in self.y_order],
+        }
+        return {
+            "image_path": self.image_path,
+            "bbox_name_image_path": self.bbox_name_image_path,
+            "anchor": anchor,
+            "spatial_order": spatial_order,
+            "objects": [item.to_manifest() for item in self.asset_layouts],
+        }
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/state.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/state.py
new file mode 100644
index 00000000..59853005
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/state.py
@@ -0,0 +1,42 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+    ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.attempt_state import AttemptState
+
+__all__ = ["ImageRelationsState"]
+
+
+class ImageRelationsState(AttemptState):
+    """LangGraph state for image asset segmentation alignment."""
+
+    request: Prompt2SceneInput  # require_image_path() reads request.image_path
+    scene_intake: SceneIntakeSpec  # the table segmentation node reads scene_intake.table
+    output_root: Path  # handed to WorkflowArtifactWriter together with a step name
+    segment_groups: list[dict[str, Any]]  # per-group dicts; the VLM filter reads each group's "segments"
+    raw_model_output: dict[str, Any] | None
+    image_relations: ImageRelationSpec | None  # None or status != "ok" makes the spatial-layout node return {}
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/utils.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/utils.py
new file mode 100644
index 00000000..27e3b1b3
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/utils.py
@@ -0,0 +1,435 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+    ImageSegmentationClient,
+    ImageSegmentationError,
+    ImageSegmentationServerRequest,
+    ImageSegmentationServerResponse,
+    bbox_iou,
+    draw_labeled_bboxes,
+    draw_numbered_masks,
+    is_usable_segmentation_candidate,
+    sort_segments_by_bbox,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+    FILTER_EXTRA_INSTANCES_JSON_SCHEMA,
+    ImageAnchor,
+    ImageAssetLayout,
+    ImageAssetSegment,
+    ImageRelationGroup,
+    ImageRelationSpec,
+    SPATIAL_LAYOUT_JSON_SCHEMA,
+)
+from embodichain.gen_sim.prompt2scene.workflows.spatial import (
+    GRID_VALUES,
+    validate_exact_asset_id_coverage,
+)
+from embodichain.gen_sim.prompt2scene.utils import log_api_request_start, log
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    IMAGE_SEGMENTS_STEP,
+    IMAGE_SPATIAL_RELATIONS_STEP,
+    RAW_MODEL_OUTPUT_FILENAME,
+    WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.prompts import (
+    build_filter_extra_instances_messages,
+    build_spatial_layout_messages,
+)
+from embodichain.gen_sim.prompt2scene.workflows.llm_output import (
+    call_structured_json_model_step,
+    is_model_output_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+    format_attempt_error,
+)
+
+__all__ = [
+    "MAX_SEGMENT_RETRIES",
+    "OVERLAP_IOU_THRESHOLD",
+    "append_unique",
+    "apply_spatial_layout_output",
+    "asset_bbox_label",
+    "expand_asset_ids",
+    "filter_group_segments_with_vlm",
+    "filter_segments_with_vlm",
+    "merge_non_overlapping_segments",
+    "draw_labeled_bboxes",
+    "parse_anchor",
+    "parse_asset_states",
+    "parse_order_groups",
+    "path_token",
+    "prompt_text",
+    "remove_extra_numbered_segments",
+    "require_image_path",
+    "segment_prompt",
+    "segments_from_response",
+    "sort_segments_by_bbox",
+]
+
+MAX_SEGMENT_RETRIES = 1  # forwarded as max_retries to ImageSegmentationClient.segment
+OVERLAP_IOU_THRESHOLD = 0.5  # NOTE(review): presumably the IoU cutoff used when merging overlapping segments — confirm
+
+
+def require_image_path(state: dict[str, Any]) -> Path:
+    """Fetch ``state["request"].image_path``; raise ``ValueError`` if it is None."""
+    candidate = state["request"].image_path
+    if candidate is not None:
+        return candidate
+    raise ValueError("Image relations requires request.image_path.")
+
+
+def prompt_text(name: str) -> str:
+    """Turn an asset name into prompt wording by swapping "_" for spaces."""
+    return " ".join(name.split("_"))
+
+
+def asset_bbox_label(asset_id: str) -> str:
+    """Return the display label: ``asset_id`` minus a leading ``interact_``."""
+    head, marker, tail = asset_id.partition("interact_")
+    return tail if marker and not head else asset_id
+
+
+def expand_asset_ids(asset_id: str, count: int) -> list[str]:
+    """Return ``count`` instance ids: ``<asset_id>_0``, ``<asset_id>_1``, ..."""
+    return [f"{asset_id}_{suffix}" for suffix in range(count)]
+
+
+def path_token(value: str) -> str:
+    """Convert a label into a filesystem-safe token of at most 80 characters."""
+    token = "".join(char if char.isalnum() else "_" for char in value).strip("_")
+    return token[:80].rstrip("_") or "prompt"  # the 80-char cut can expose a new trailing "_"
+
+
+def append_unique(values: list[str], value: str) -> list[str]:
+    """Return ``values`` itself if it has ``value``, else a new extended list."""
+    return [*values, value] if value not in values else values
+
+
+def segment_prompt(
+    *,
+    image_path: Path,
+    prompt: str,
+) -> ImageSegmentationServerResponse:
+    """Run one text prompt through the segmentation server.
+
+    Raises:
+        RuntimeError: If the client returns an ``ImageSegmentationError``.
+    """
+    client = ImageSegmentationClient()
+    log_api_request_start(
+        step=IMAGE_SEGMENTS_STEP, request="sam3_segment", prompt=prompt
+    )
+    request = ImageSegmentationServerRequest(prompt=prompt, image_path=image_path)
+    outcome = client.segment(request, max_retries=MAX_SEGMENT_RETRIES)
+    if not isinstance(outcome, ImageSegmentationError):
+        return outcome
+    log.log_warning(outcome.error_message)
+    raise RuntimeError(outcome.error_message)
+
+
+def segments_from_response(
+    *,
+    group: dict[str, Any],
+    response: ImageSegmentationServerResponse,
+    source_prompt: str,
+) -> list[dict[str, Any]]:
+    """Turn the usable server candidates into bbox-sorted segment dicts."""
+    candidates = response.result.candidates
+    usable = (item for item in candidates if is_usable_segmentation_candidate(item))
+    # Ids are numbered in server order, before the bbox sort below.
+    segments = [
+        {
+            "segment_id": f"{group['name']}_{index}",
+            "bbox_xyxy": list(candidate.bbox_xyxy),
+            "score": float(candidate.score),
+            "mask_rle": candidate.mask_rle,
+            "source_prompt": source_prompt,
+        }
+        for index, candidate in enumerate(usable)
+    ]
+    return sort_segments_by_bbox(segments)
+
+
+def apply_spatial_layout_output(
+    *,
+    image_relations: ImageRelationSpec,
+    raw_model_output: dict[str, Any],
+) -> ImageRelationSpec:
+    """Return a copy of ``image_relations`` enriched with parsed VLM layout.
+
+    Anchor, orderings, and per-asset states are parsed against the asset ids
+    of ``image_relations.asset_segments``; the parse helpers reject malformed
+    entries with ``ValueError``. Every other field is carried over unchanged.
+    """
+    known_ids = [segment.asset_id for segment in image_relations.asset_segments]
+    anchor = parse_anchor(
+        raw_model_output.get("anchor"), asset_id_set=set(known_ids)
+    )
+    orders = {
+        axis: parse_order_groups(
+            raw_model_output.get(axis), asset_ids=known_ids, field_name=axis
+        )
+        for axis in ("x_order", "y_order")
+    }
+    states = parse_asset_states(
+        raw_model_output.get("asset_states"), asset_ids=known_ids
+    )
+    layouts = [
+        ImageAssetLayout(
+            asset_id=asset_id,
+            is_arbitrary_layout=states[asset_id]["is_arbitrary_layout"],
+            reason=states[asset_id]["reason"],
+        )
+        for asset_id in known_ids
+    ]
+    return ImageRelationSpec(
+        status=image_relations.status,
+        image_path=image_relations.image_path,
+        asset_segments=image_relations.asset_segments,
+        groups=image_relations.groups,
+        table_segment=image_relations.table_segment,
+        table_group=image_relations.table_group,
+        bbox_name_image_path=image_relations.bbox_name_image_path,
+        anchor=anchor,
+        x_order=orders["x_order"],
+        y_order=orders["y_order"],
+        asset_layouts=layouts,
+    )
+
+
+def parse_anchor(raw_anchor: Any, *, asset_id_set: set[str]) -> ImageAnchor:
+    """Validate the VLM ``anchor`` object and build an ``ImageAnchor`` from it."""
+    if not isinstance(raw_anchor, dict):
+        raise ValueError("anchor must be an object.")
+    asset_id, grid, reason = (
+        str(raw_anchor.get(k) or "").strip() for k in ("asset_id", "grid", "reason")
+    )
+    if asset_id not in asset_id_set:
+        raise ValueError(f"anchor.asset_id is not a known asset: {asset_id!r}.")
+    if grid not in GRID_VALUES:
+        raise ValueError(f"anchor.grid is not valid: {grid!r}.")
+    return ImageAnchor(asset_id=asset_id, grid=grid, reason=reason)
+
+
+def parse_order_groups(
+    raw_order: Any,
+    *,
+    asset_ids: list[str],
+    field_name: str,
+) -> list[list[str]]:
+    """Normalize one VLM ordering into lists of stripped asset-id strings.
+
+    Raises ``ValueError`` when ``raw_order`` or any of its groups is not a
+    non-empty list. The flattened ids are then handed to
+    ``validate_exact_asset_id_coverage`` together with ``asset_ids``.
+    """
+    if not (isinstance(raw_order, list) and raw_order):
+        raise ValueError(f"{field_name} must be a non-empty list.")
+
+    groups: list[list[str]] = []
+    for position, raw_group in enumerate(raw_order):
+        if not (isinstance(raw_group, list) and raw_group):
+            raise ValueError(f"{field_name}[{position}] must be a non-empty list.")
+        groups.append([str(raw_id).strip() for raw_id in raw_group])
+
+    # Coverage is checked on the flattened ids, in group order.
+    validate_exact_asset_id_coverage(
+        values=[asset_id for group in groups for asset_id in group],
+        expected_asset_ids=asset_ids,
+        context=field_name,
+    )
+    return groups
+
+
+def parse_asset_states(
+    raw_asset_states: Any,
+    *,
+    asset_ids: list[str],
+) -> dict[str, dict[str, Any]]:
+    """Index the VLM ``asset_states`` entries by stripped asset id.
+
+    Raises:
+        ValueError: For a non-list input, a non-dict entry, a non-boolean
+            flag, an empty reason, or a repeated asset id.
+    """
+    if not isinstance(raw_asset_states, list):
+        raise ValueError("asset_states must be a list.")
+
+    parsed: dict[str, dict[str, Any]] = {}
+    for position, entry in enumerate(raw_asset_states):
+        where = f"asset_states[{position}]"
+        if not isinstance(entry, dict):
+            raise ValueError(f"{where} must be an object.")
+        asset_id = str(entry.get("asset_id") or "").strip()
+        flag = entry.get("is_arbitrary_layout")
+        reason = str(entry.get("reason") or "").strip()
+        # bool is checked explicitly, so 0/1 or "true" are rejected.
+        if not isinstance(flag, bool):
+            raise ValueError(f"{where}.is_arbitrary_layout must be boolean.")
+        if not reason:
+            raise ValueError(f"{where}.reason must be non-empty.")
+        if asset_id in parsed:
+            raise ValueError(f"asset_states has duplicate asset_id: {asset_id!r}.")
+        parsed[asset_id] = {"is_arbitrary_layout": flag, "reason": reason}
+
+    validate_exact_asset_id_coverage(
+        values=list(parsed), expected_asset_ids=asset_ids, context="asset_states"
+    )
+    return parsed
+
+
+def filter_group_segments_with_vlm(
+    *,
+    llm: Any,
+    image_path: Path,
+    artifact_writer: WorkflowArtifactWriter,
+    group: dict[str, Any],
+    segments: list[dict[str, Any]],
+    stage: str,
+) -> list[dict[str, Any]]:
+    """Let the VLM prune wrong or duplicate masks from one SAM3 result.
+
+    The segments are sorted by bbox and drawn as numbered masks into
+    ``mask.png`` of a new debug round. That image is sent to the model, and
+    its answer plus the sorted segments go to
+    ``remove_extra_numbered_segments``, whose result is returned. An empty
+    input returns before any image is drawn or model call is made.
+    """
+    ordered = sort_segments_by_bbox(segments)
+    if not ordered:
+        return ordered
+
+    round_name = artifact_writer.next_debug_round_name(label=f"{stage}_{group['name']}")
+    mask_path = artifact_writer.debug_round_dir(round_name) / "mask.png"
+    debug_image_path = draw_numbered_masks(
+        image_path=image_path, segments=ordered, output_path=mask_path
+    )
+    # The caller's group dict is mutated here, not copied.
+    group["debug_images"] = append_unique(group["debug_images"], str(debug_image_path))
+    log_api_request_start(
+        step=IMAGE_SEGMENTS_STEP,
+        request=f"vlm_filter_{stage}",
+        debug_image=str(debug_image_path),
+    )
+
+    verdict = call_structured_json_model_step(
+        llm=llm,
+        schema=FILTER_EXTRA_INSTANCES_JSON_SCHEMA,
+        messages=build_filter_extra_instances_messages(
+            debug_image_path=debug_image_path,
+            name=group["name"],
+            description=group["description"],
+            expected_count=group["expected_count"],
+            class_candidate=group["class_candidate"],
+        ),
+        context=f"Image relation {stage} segmentation filtering",
+        step_name=IMAGE_SEGMENTS_STEP,
+        output_root=None,
+        attempt_count=0,
+        raw_output_writer=lambda payload: artifact_writer.write_debug_round_json(
+            round_name=round_name,
+            filename=RAW_MODEL_OUTPUT_FILENAME,
+            payload=payload,
+        ),
+    )
+    return remove_extra_numbered_segments(segments=ordered, raw_model_output=verdict)
+
+
+def filter_segments_with_vlm(
+    *,
+    state: dict[str, Any],
+    llm: Any,
+    stage: str,
+) -> dict[str, object]:
+    """Run the VLM instance filter over a shallow copy of every segment group.
+
+    Returns a state patch: the filtered ``segment_groups`` on success, or the
+    formatted error for a model-output error or ``ValueError``; others propagate.
+    """
+    attempt_count = state["attempt_count"] + 1
+    image_path = require_image_path(state)
+    artifact_writer = WorkflowArtifactWriter(state["output_root"], IMAGE_SEGMENTS_STEP)
+    kept: list[dict[str, Any]] = []
+
+    try:
+        for source_group in state["segment_groups"]:
+            group = dict(source_group)
+            group["segments"] = filter_group_segments_with_vlm(
+                llm=llm,
+                image_path=image_path,
+                artifact_writer=artifact_writer,
+                group=group,
+                segments=group["segments"],
+                stage=stage,
+            )
+            kept.append(group)
+    except Exception as exc:
+        if not (is_model_output_error(exc) or isinstance(exc, ValueError)):
+            raise
+        error = format_attempt_error("Image relations VLM filter", attempt_count, exc)
+        log.log_warning(error)
+        return {
+            "attempt_count": attempt_count,
+            "last_error": error,
+            "errors": state["errors"] + [error],
+        }
+
+    return {"attempt_count": attempt_count, "segment_groups": kept, "last_error": None}
+
+
+def remove_extra_numbered_segments(
+    *,
+    segments: list[dict[str, Any]],
+    raw_model_output: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Return segments without the 1-based mask numbers the VLM flagged as extra."""
+    numbers = raw_model_output.get("extra_instance_numbers")
+    if not isinstance(numbers, list):
+        raise ValueError("extra_instance_numbers must be a list.")
+    # Validate types explicitly: int() truncates floats and raises TypeError on None.
+    if any(isinstance(n, bool) or not isinstance(n, int) for n in numbers):
+        raise ValueError("extra_instance_numbers must contain only integers.")
+    if any(not 1 <= n <= len(segments) for n in numbers):
+        raise ValueError("VLM returned an out-of-range extra mask number.")
+    flagged = set(numbers)
+    return [seg for number, seg in enumerate(segments, 1) if number not in flagged]
+
+
+def merge_non_overlapping_segments(
+    *,
+    existing: list[dict[str, Any]],
+    incoming: list[dict[str, Any]],
+    limit: int,
+) -> list[dict[str, Any]]:
+    """Add incoming segments, best score first, until ``limit`` is reached.
+
+    A segment is added only if its bbox IoU with each merged one is below threshold.
+    """
+    merged = [*existing]
+    ranked = sorted(incoming, key=lambda item: float(item["score"]), reverse=True)
+    for seg in ranked:
+        if len(merged) >= limit:
+            break
+        overlaps = (bbox_iou(seg["bbox_xyxy"], kept["bbox_xyxy"]) for kept in merged)
+        if all(iou < OVERLAP_IOU_THRESHOLD for iou in overlaps):
+            merged.append(seg)
+    return sort_segments_by_bbox(merged)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/llm_output.py b/embodichain/gen_sim/prompt2scene/workflows/llm_output.py
new file mode 100644
index 00000000..bcc98bcb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/llm_output.py
@@ -0,0 +1,285 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Callable
+
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    WorkflowArtifactWriter,
+    write_next_raw_model_output,
+)
+
+__all__ = [
+    "bind_structured_output",
+    "coerce_json_object_output",
+    "is_model_output_error",
+    "call_structured_json_model_step",
+    "StructuredModelCallError",
+    "validate_json_schema",
+]
+
+
+class StructuredModelCallError(Exception):
+    """Retryable failure of a structured-model call.
+
+    Keeps the call ``context``, the ``attempt_count`` and the ``original_exc``;
+    the exception message is ``str(original_exc)``.
+    """
+
+    def __init__(
+        self, *, context: str, attempt_count: int, original_exc: Exception
+    ) -> None:
+        super().__init__(str(original_exc))
+        self.context = context
+        self.attempt_count = attempt_count
+        self.original_exc = original_exc
+
+
+def bind_structured_output(llm: Any, schema: dict[str, Any]) -> Any:
+    """Return ``llm`` bound to ``schema`` when it supports structured output."""
+    if not hasattr(llm, "with_structured_output"):
+        return llm
+    return llm.with_structured_output(schema)
+
+
+def coerce_json_object_output(response: Any, *, context: str) -> dict[str, Any]:
+    """Coerce a model response into a JSON object.
+
+    List content may mix plain strings with ``{"type": "text"}`` blocks; both are
+    joined with newlines before parsing. Unsupported types raise ``ValueError``.
+    """
+    if isinstance(response, dict):
+        return response
+    content = getattr(response, "content", response)
+    if isinstance(content, dict):
+        return content
+    if isinstance(content, list):
+        content = "\n".join(
+            part if isinstance(part, str) else part.get("text", "")
+            for part in content
+            if isinstance(part, str)
+            or (isinstance(part, dict) and part.get("type") == "text")
+        )
+    if isinstance(content, str):
+        return _parse_json_text(content, context=context)
+    raise ValueError(f"{context} model output has unsupported type: {type(response)!r}")
+
+
+def is_model_output_error(exc: Exception) -> bool:
+    """Return whether an exception is a retryable model output formatting error.
+
+    The exception class is matched by name, or by a defining module whose name
+    starts with ``pydantic``; the class hierarchy is not consulted.
+    """
+    names = (
+        "JSONDecodeError",
+        "OutputParserException",
+        "SchemaValidationError",
+        "ValidationError",
+        "StructuredModelCallError",
+    )
+    cls = exc.__class__
+    return cls.__name__ in names or cls.__module__.startswith("pydantic")
+
+
+def validate_json_schema(value: Any, schema: dict[str, Any], *, context: str) -> None:
+    """Validate model output against the subset of JSON Schema used locally.
+
+    Raises:
+        ValueError: On the first violation; ``context`` prefixes the reported path.
+    """
+    root_path = context
+    _validate_schema_value(value, schema, path=root_path)
+
+
+def call_structured_json_model_step(
+    *,
+    llm: Any,
+    schema: dict[str, Any],
+    messages: list[dict[str, Any]],
+    context: str,
+    step_name: str,
+    output_root: Path | None,
+    attempt_count: int,
+    raw_output_label: str | None = None,
+    artifact_writer: WorkflowArtifactWriter | None = None,
+    raw_output_writer: Callable[[dict[str, Any]], None] | None = None,
+) -> dict[str, Any]:
+    """Call a structured-output model, validate its JSON, and persist the raw output.
+
+    The validated output is written through the first sink that is provided:
+    ``raw_output_writer``, then ``artifact_writer``, then ``output_root``.
+
+    Raises:
+        StructuredModelCallError: If invoking, parsing or validating fails with a
+            model-output error or ``ValueError``; other exceptions propagate.
+    """
+    model = bind_structured_output(llm, schema)
+    try:
+        payload = coerce_json_object_output(model.invoke(messages), context=context)
+        validate_json_schema(payload, schema, context=f"{context} output")
+    except Exception as exc:
+        if not (is_model_output_error(exc) or isinstance(exc, ValueError)):
+            raise
+        raise StructuredModelCallError(
+            context=context, attempt_count=attempt_count, original_exc=exc
+        ) from exc
+
+    if raw_output_writer is not None:
+        raw_output_writer(payload)
+    elif artifact_writer is not None:
+        artifact_writer.write_next_raw_model_output(
+            payload=payload, label=raw_output_label
+        )
+    elif output_root is not None:
+        write_next_raw_model_output(
+            output_root=output_root,
+            step_name=step_name,
+            payload=payload,
+            label=raw_output_label,
+        )
+    return payload
+
+
+def _parse_json_text(content: str, *, context: str) -> dict[str, Any]:
+    """Parse text, optionally wrapped in a Markdown code fence, into a JSON object."""
+    text = content.strip()
+    if text.startswith("```"):
+        # Drop the opening fence line and, when present, the closing fence line.
+        body = text.splitlines()[1:]
+        if body and body[-1].startswith("```"):
+            body.pop()
+        text = "\n".join(body).strip()
+    parsed = json.loads(text)
+    if isinstance(parsed, dict):
+        return parsed
+    raise ValueError(f"{context} model output must be a JSON object.")
+
+
+def _validate_schema_value(value: Any, schema: dict[str, Any], *, path: str) -> None:
+    """Validate ``value`` against ``schema``: type, enum, then per-type keywords."""
+    expected_type = schema.get("type")
+    if expected_type is not None:
+        _validate_type(value, expected_type, path=path)
+    enum_values = schema.get("enum")
+    if isinstance(enum_values, list) and value not in enum_values:
+        raise ValueError(f"{path} must be one of {enum_values}.")
+    # "type" may be a list (unhashable), so dispatch on the value, not on the keyword.
+    if isinstance(value, dict):
+        _validate_object(value, schema, path=path)
+    elif isinstance(value, list):
+        _validate_array(value, schema, path=path)
+    elif isinstance(value, str):
+        _validate_string(value, schema, path=path)
+    elif isinstance(value, int | float):
+        _validate_number(value, schema, path=path)
+
+
+def _validate_type(value: Any, expected_type: Any, *, path: str) -> None:
+    """Raise ``ValueError`` unless ``value`` matches the type (or any listed type)."""
+    if not isinstance(expected_type, list):
+        if not _matches_type(value, expected_type):
+            raise ValueError(f"{path} must be {expected_type}.")
+        return
+    if not any(_matches_type(value, candidate) for candidate in expected_type):
+        raise ValueError(f"{path} must match one of these types: {expected_type}.")
+
+
+def _matches_type(value: Any, expected_type: str) -> bool:
+    """Return whether ``value`` is of the JSON Schema type; unknown names match."""
+    not_bool = not isinstance(value, bool)  # bool is an int subclass in Python
+    predicates = (
+        ("object", lambda: isinstance(value, dict)),
+        ("array", lambda: isinstance(value, list)),
+        ("string", lambda: isinstance(value, str)),
+        ("integer", lambda: isinstance(value, int) and not_bool),
+        ("number", lambda: isinstance(value, int | float) and not_bool),
+        ("boolean", lambda: isinstance(value, bool)),
+        ("null", lambda: value is None),
+    )
+    for type_name, predicate in predicates:
+        if expected_type == type_name:
+            return predicate()
+    return True
+
+
+def _validate_object(value: Any, schema: dict[str, Any], *, path: str) -> None:
+    """Check ``required``, ``additionalProperties`` and ``properties`` of a dict.
+
+    Non-dict values are ignored; malformed keyword values are treated as absent.
+    """
+    if not isinstance(value, dict):
+        return
+    declared = schema.get("properties")
+    if not isinstance(declared, dict):
+        declared = {}
+    required = schema.get("required", [])
+    if isinstance(required, list):
+        missing = [name for name in required if name not in value]
+        if missing:
+            raise ValueError(f"{path} missing required keys: {missing}.")
+    if schema.get("additionalProperties") is False:
+        unexpected = sorted(value.keys() - declared.keys())
+        if unexpected:
+            raise ValueError(f"{path} has unexpected keys: {unexpected}.")
+    for name, child_schema in declared.items():
+        if name in value and isinstance(child_schema, dict):
+            _validate_schema_value(value[name], child_schema, path=f"{path}.{name}")
+
+
+def _validate_array(value: Any, schema: dict[str, Any], *, path: str) -> None:
+    """Check ``minItems``/``maxItems`` and validate every element against ``items``.
+
+    Non-list values are ignored, as are bounds that are not integers.
+    """
+    if not isinstance(value, list):
+        return
+    size = len(value)
+    lower, upper = schema.get("minItems"), schema.get("maxItems")
+    if isinstance(lower, int) and size < lower:
+        raise ValueError(f"{path} must contain at least {lower} items.")
+    if isinstance(upper, int) and size > upper:
+        raise ValueError(f"{path} must contain at most {upper} items.")
+
+    element_schema = schema.get("items")
+    if isinstance(element_schema, dict):
+        for index, element in enumerate(value):
+            _validate_schema_value(element, element_schema, path=f"{path}[{index}]")
+
+
+def _validate_string(value: Any, schema: dict[str, Any], *, path: str) -> None:
+    """Check ``minLength``/``maxLength`` of a string; other values are ignored."""
+    if not isinstance(value, str):
+        return
+
+    length = len(value)
+    shortest, longest = schema.get("minLength"), schema.get("maxLength")
+    if isinstance(shortest, int) and length < shortest:
+        raise ValueError(f"{path} must contain at least {shortest} characters.")
+    if isinstance(longest, int) and length > longest:
+        raise ValueError(f"{path} must contain at most {longest} characters.")
+
+
+def _validate_number(value: Any, schema: dict[str, Any], *, path: str) -> None:
+    """Check ``minimum`` for a non-bool number; other values are ignored."""
+    # NOTE(review): only ``minimum`` is handled here; ``maximum`` is not checked.
+    is_number = isinstance(value, int | float) and not isinstance(value, bool)
+    lower_bound = schema.get("minimum")
+    if is_number and isinstance(lower_bound, int | float) and value < lower_bound:
+        raise ValueError(f"{path} must be greater than or equal to {lower_bound}.")
diff --git a/embodichain/gen_sim/prompt2scene/workflows/request.py b/embodichain/gen_sim/prompt2scene/workflows/request.py
new file mode 100644
index 00000000..8cd01c30
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/request.py
@@ -0,0 +1,110 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+
+__all__ = ["InputKind", "Prompt2SceneInput"]
+
+SUPPORTED_IMAGE_SUFFIXES: frozenset[str] = frozenset({".jpg", ".jpeg", ".png"})
+
+
+class InputKind(str, Enum):
+    """Supported prompt2scene input kinds (``str`` members equal their raw values)."""
+
+    IMAGE = "image"
+    TEXT = "text"
+
+
+@dataclass(frozen=True)
+class Prompt2SceneInput:
+    """Normalized, immutable prompt2scene input.
+
+    ``from_cli_args`` fills ``image_path`` in image mode and the stripped
+    ``text`` in text mode; the other of the two fields keeps its ``None``
+    default.
+    """
+
+    input_kind: InputKind
+    output_root: Path
+    image_path: Path | None = None
+    text: str | None = None
+
+    @classmethod
+    def from_cli_args(
+        cls, *, image_path: Path | None, text: str | None, output_root: Path
+    ) -> "Prompt2SceneInput":
+        """Create a prompt2scene input from CLI arguments.
+
+        Image mode is selected whenever ``image_path`` is given, in which case
+        ``text`` is ignored. All paths are user-expanded and resolved.
+
+        Args:
+            image_path: Input image path, if image mode is selected.
+            text: Text prompt, if text mode is selected.
+            output_root: Directory where prompt2scene outputs are written.
+
+        Returns:
+            Normalized prompt2scene input.
+
+        Raises:
+            FileNotFoundError: If the image input path does not exist.
+            ValueError: If the image path is invalid or text input is empty.
+        """
+        root = output_root.expanduser().resolve()
+        if image_path is None:
+            prompt = "" if text is None else text.strip()
+            if not prompt:
+                raise ValueError("Text input must be non-empty.")
+            return cls(input_kind=InputKind.TEXT, text=prompt, output_root=root)
+
+        resolved_image = image_path.expanduser().resolve()
+        cls._validate_image_path(resolved_image)
+        return cls(
+            input_kind=InputKind.IMAGE,
+            image_path=resolved_image,
+            output_root=root,
+        )
+
+    def to_manifest(self) -> dict[str, str]:
+        """Convert the input to a JSON-serializable manifest.
+
+        The manifest holds ``input_kind``, ``output_root`` and either
+        ``image_path`` or ``text`` (an unset text becomes an empty string).
+        """
+        if self.input_kind == InputKind.IMAGE:
+            extra_key, extra_value = "image_path", str(self.image_path)
+        else:
+            extra_key, extra_value = "text", ("" if self.text is None else self.text)
+        return {
+            "input_kind": self.input_kind.value,
+            "output_root": str(self.output_root),
+            extra_key: extra_value,
+        }
+
+    @staticmethod
+    def _validate_image_path(image_path: Path) -> None:
+        """Raise unless the path is an existing file with a supported suffix."""
+        if not image_path.exists():
+            raise FileNotFoundError(f"Image input not found: {image_path}")
+        if not image_path.is_file():
+            raise ValueError(f"Image input is not a file: {image_path}")
+        if image_path.suffix.lower() not in SUPPORTED_IMAGE_SUFFIXES:
+            message = "Image input must have one of these extensions: .jpg, .jpeg, .png"
+            raise ValueError(message)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/__init__.py
new file mode 100644
index 00000000..ac862308
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/__init__.py
@@ -0,0 +1,24 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.graph import (
+    build_scene_intake_graph,
+    run_scene_intake,
+)
+
+__all__ = ["build_scene_intake_graph", "run_scene_intake"]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/graph.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/graph.py
new file mode 100644
index 00000000..77874b15
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/graph.py
@@ -0,0 +1,142 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from langgraph.graph import END, StateGraph
+
+from embodichain.gen_sim.prompt2scene.llms import (
+    OpenAICompatibleLLMCfg,
+    build_chat_model,
+)
+from embodichain.gen_sim.prompt2scene.utils import log
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+    format_result_missing_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.nodes import (
+    call_vlm_scene_intake_node,
+    call_vlm_verify_scene_intake_node,
+    normalize_scene_intake_node,
+    normalize_verified_scene_intake_node,
+    prepare_input_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.state import (
+    SceneIntakeState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+
+__all__ = ["build_scene_intake_graph", "run_scene_intake"]
+
+
+def route_after_normalize(state: SceneIntakeState) -> str:
+    """Pick ``verify`` for a draft, else ``retry`` if attempts remain, else ``end``."""
+    has_draft = state["draft_scene_intake"] is not None
+    if has_draft:
+        return "verify"
+    can_retry = state["attempt_count"] < state["max_attempts"]
+    return "retry" if can_retry else "end"
+
+
+def route_after_verified_normalize(state: SceneIntakeState) -> str:
+    """Route to ``retry`` only when no result exists and attempts remain."""
+    if state["scene_intake"] is None:
+        can_retry = state["attempt_count"] < state["max_attempts"]
+        if can_retry:
+            return "retry"
+    return "end"
+
+
+def build_scene_intake_graph(llm: Any) -> Any:
+    """Build and compile the fixed LangGraph scene intake workflow around ``llm``."""
+    graph = StateGraph(SceneIntakeState)
+    graph.add_node("prepare_input", prepare_input_node)
+    graph.add_node(
+        "call_vlm_scene_intake",
+        lambda state: call_vlm_scene_intake_node(state, llm=llm),  # closes over llm
+    )
+    graph.add_node("normalize_scene_intake", normalize_scene_intake_node)
+    graph.add_node(
+        "call_vlm_verify_scene_intake",
+        lambda state: call_vlm_verify_scene_intake_node(state, llm=llm),  # same llm
+    )
+    graph.add_node(
+        "normalize_verified_scene_intake",
+        normalize_verified_scene_intake_node,
+    )
+
+    graph.set_entry_point("prepare_input")
+    graph.add_edge("prepare_input", "call_vlm_scene_intake")
+    graph.add_edge("call_vlm_scene_intake", "normalize_scene_intake")
+    graph.add_conditional_edges(
+        "normalize_scene_intake",
+        route_after_normalize,  # returns one of the keys mapped below
+        {
+            "retry": "call_vlm_scene_intake",  # loop back to the extraction call
+            "verify": "call_vlm_verify_scene_intake",  # a draft is available
+            "end": END,
+        },
+    )
+    graph.add_edge("call_vlm_verify_scene_intake", "normalize_verified_scene_intake")
+    graph.add_conditional_edges(
+        "normalize_verified_scene_intake",
+        route_after_verified_normalize,  # returns one of the keys mapped below
+        {
+            "retry": "call_vlm_verify_scene_intake",  # loop back to the verifier call
+            "end": END,
+        },
+    )
+    return graph.compile()
+
+
+def run_scene_intake(
+    request: Prompt2SceneInput,
+    llm_cfg: OpenAICompatibleLLMCfg,
+) -> SceneIntakeSpec:
+    """Run fixed VLM-based scene intake for one prompt2scene request.
+
+    Raises:
+        RuntimeError: If the graph ends without producing a ``scene_intake``.
+    """
+    graph = build_scene_intake_graph(build_chat_model(llm_cfg))
+    initial_state = {
+        "request": request,
+        "messages": [],
+        "raw_model_output": None,
+        "draft_scene_intake": None,
+        "scene_intake": None,
+        "attempt_count": 0,
+        "max_attempts": llm_cfg.max_attempts,
+        "last_error": None,
+        "errors": [],
+    }
+    result = graph.invoke(initial_state)
+
+    if (scene_intake := result.get("scene_intake")) is None:
+        error = format_result_missing_error(
+            "Scene intake",
+            "SceneIntakeSpec",
+            attempt_count=result.get("attempt_count", 0),
+            last_error=result.get("last_error"),
+            errors=result.get("errors", []),
+        )
+        log.log_warning(error)
+        raise RuntimeError(error)
+    return scene_intake
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/nodes.py
new file mode 100644
index 00000000..8c7baf55
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/nodes.py
@@ -0,0 +1,211 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SCENE_INTAKE_JSON_SCHEMA,
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.utils import (
+    log_api_request_start,
+    log,
+)
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    SCENE_INTAKE_STEP,
+    WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.workflows.llm_output import (
+    StructuredModelCallError,
+    call_structured_json_model_step,
+)
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+    format_attempt_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.prompts import (
+    build_scene_intake_messages,
+    build_scene_intake_verifier_messages,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.state import (
+    SceneIntakeState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.utils import (
+    build_scene_intake_spec,
+)
+
+__all__ = [
+    "call_vlm_scene_intake_node",
+    "call_vlm_verify_scene_intake_node",
+    "normalize_scene_intake_node",
+    "normalize_verified_scene_intake_node",
+    "prepare_input_node",
+]
+
+
+def prepare_input_node(state: SceneIntakeState) -> dict[str, object]:
+    """Return a state patch with the chat messages built from ``state["request"]``."""
+    return {"messages": build_scene_intake_messages(state["request"])}
+
+
+def call_vlm_scene_intake_node(
+    state: SceneIntakeState,
+    *,
+    llm: Any,
+) -> dict[str, object]:
+    """Call the configured VLM for fixed scene intake extraction.
+
+    Returns a state patch with the incremented ``attempt_count`` and either the
+    raw model output or, on ``StructuredModelCallError``, the recorded error.
+    """
+    attempt_count = state["attempt_count"] + 1
+    log_api_request_start(
+        step=SCENE_INTAKE_STEP,
+        request="extract",
+        attempt=attempt_count,
+    )
+    writer = WorkflowArtifactWriter(state["request"].output_root, SCENE_INTAKE_STEP)
+    try:
+        raw_model_output = call_structured_json_model_step(
+            llm=llm,
+            schema=SCENE_INTAKE_JSON_SCHEMA,
+            messages=state["messages"],
+            context="Scene intake",
+            step_name=SCENE_INTAKE_STEP,
+            output_root=None,
+            attempt_count=attempt_count,
+            raw_output_label="extract",
+            artifact_writer=writer,
+        )
+    except StructuredModelCallError as exc:
+        error = format_attempt_error("Scene intake", attempt_count, exc)
+        log.log_warning(error)
+        return {
+            "attempt_count": attempt_count,
+            "raw_model_output": None,
+            "last_error": error,
+            "errors": state["errors"] + [error],
+        }
+
+    return {
+        "attempt_count": attempt_count,
+        "raw_model_output": raw_model_output,
+        "last_error": None,
+    }
+
+
+def normalize_scene_intake_node(state: SceneIntakeState) -> dict[str, object]:
+    """Normalize raw VLM JSON into a draft scene intake schema.
+
+    Returns an empty patch when there is no raw output, the draft on success, or
+    the recorded (and logged) error when the output cannot be normalized.
+    """
+    raw_model_output = state["raw_model_output"]
+    if raw_model_output is None:
+        return {}
+    try:
+        draft = build_scene_intake_spec(
+            request=state["request"],
+            model_output=raw_model_output,
+        )
+    except ValueError as exc:
+        error = format_attempt_error("Scene intake", state["attempt_count"], exc)
+        log.log_warning(error)  # consistent with the other scene-intake error paths
+        errors = state["errors"] + [error]
+        return {"draft_scene_intake": None, "last_error": error, "errors": errors}
+    return {"draft_scene_intake": draft, "scene_intake": None}
+
+
+def call_vlm_verify_scene_intake_node(
+    state: SceneIntakeState,
+    *,
+    llm: Any,
+) -> dict[str, object]:
+    """Ask VLM to verify and correct scene-intake grouping and counts.
+
+    Returns an empty patch when there is no draft. Otherwise the patch resets
+    ``scene_intake`` and carries the incremented ``attempt_count`` plus either
+    the raw verifier output or the error of a ``StructuredModelCallError``.
+    """
+    draft = state["draft_scene_intake"]
+    if draft is None:
+        return {}
+
+    attempt_count = state["attempt_count"] + 1
+    messages = build_scene_intake_verifier_messages(
+        request=state["request"],
+        scene_intake=draft,
+    )
+    log_api_request_start(
+        step=SCENE_INTAKE_STEP,
+        request="verify",
+        attempt=attempt_count,
+    )
+    writer = WorkflowArtifactWriter(state["request"].output_root, SCENE_INTAKE_STEP)
+    try:
+        raw_model_output = call_structured_json_model_step(
+            llm=llm,
+            schema=SCENE_INTAKE_JSON_SCHEMA,
+            messages=messages,
+            context="Scene intake verifier",
+            step_name=SCENE_INTAKE_STEP,
+            output_root=None,
+            attempt_count=attempt_count,
+            raw_output_label="verify",
+            artifact_writer=writer,
+        )
+    except StructuredModelCallError as exc:
+        error = format_attempt_error("Scene intake verifier", attempt_count, exc)
+        log.log_warning(error)
+        return {
+            "attempt_count": attempt_count,
+            "raw_model_output": None,
+            "scene_intake": None,
+            "last_error": error,
+            "errors": state["errors"] + [error],
+        }
+    return {
+        "attempt_count": attempt_count,
+        "raw_model_output": raw_model_output,
+        "scene_intake": None,
+        "last_error": None,
+    }
+
+
+def normalize_verified_scene_intake_node(
+    state: SceneIntakeState,
+) -> dict[str, object]:
+    """Normalize verifier output into the final scene intake schema.
+
+    Returns an empty patch when there is no raw output, the final
+    ``scene_intake`` on success, or the logged error on ``ValueError``.
+    """
+    raw_model_output = state["raw_model_output"]
+    if raw_model_output is None:
+        return {}
+    try:
+        verified = build_scene_intake_spec(
+            request=state["request"],
+            model_output=raw_model_output,
+        )
+    except ValueError as exc:
+        attempt = state["attempt_count"]
+        error = format_attempt_error("Scene intake verifier", attempt, exc)
+        log.log_warning(error)
+        errors = state["errors"] + [error]
+        return {"scene_intake": None, "last_error": error, "errors": errors}
+    return {"scene_intake": verified, "last_error": None}
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/prompts.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/prompts.py
new file mode 100644
index 00000000..421ec979
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/prompts.py
@@ -0,0 +1,202 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.prompts import render_prompt
+from embodichain.gen_sim.prompt2scene.workflows.request import (
+    InputKind,
+    Prompt2SceneInput,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import image_to_data_url
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+
+__all__ = [
+    "build_scene_intake_messages",
+    "build_scene_intake_verifier_messages",
+]
+
+# Prompt name (a YAML file name) passed as the first argument to every
+# ``render_prompt`` call in this module.
+SCENE_INTAKE_PROMPT_NAME = "scene_intake.yaml"
+
+
+def build_scene_intake_messages(request: Prompt2SceneInput) -> list[dict[str, Any]]:
+    """Return the intake chat messages, using the text builder for text input."""
+    is_text = request.input_kind == InputKind.TEXT
+    builder = _build_text_messages if is_text else _build_image_messages
+    return builder(request)
+
+
+def build_scene_intake_verifier_messages(
+    *,
+    request: Prompt2SceneInput,
+    scene_intake: SceneIntakeSpec,
+) -> list[dict[str, Any]]:
+    """Build verifier messages that embed the draft intake as JSON.
+
+    The draft keeps only the table and asset fields listed below (no ids).
+    ``object_coverage_percent`` is included only when it is not ``None``.
+
+    Args:
+        request: Original input; ``input_kind`` picks the text or image builder.
+        scene_intake: Draft intake spec to serialize into the prompt.
+
+    Returns:
+        The message list produced by the selected verifier builder.
+    """
+    table = scene_intake.table
+    table_payload: dict[str, object] = {
+        "name": table.name,
+        "description": table.description,
+        "complete_table_description": table.complete_table_description,
+        "is_complete_visible_table": table.is_complete_visible_table,
+        "class_candidate": list(table.class_candidate),
+    }
+    coverage = table.object_coverage_percent
+    if coverage is not None:
+        table_payload["object_coverage_percent"] = coverage
+
+    asset_payloads = []
+    for item in scene_intake.assets:
+        asset_payloads.append(
+            {
+                "name": item.name,
+                "description": item.description,
+                "class_candidate": list(item.class_candidate),
+                "count": item.count,
+            }
+        )
+    draft = {"table": table_payload, "assets": asset_payloads}
+    serialized = json.dumps(draft, ensure_ascii=False, indent=2)
+
+    if request.input_kind == InputKind.TEXT:
+        build = _build_text_verifier_messages
+    else:
+        build = _build_image_verifier_messages
+    return build(request=request, scene_intake_json=serialized)
+
+
+def _build_text_messages(request: Prompt2SceneInput) -> list[dict[str, Any]]:
+    """Render the system and user messages for a text-only intake request.
+
+    A falsy ``request.text`` is rendered as an empty string.
+    """
+    system_text = render_prompt(SCENE_INTAKE_PROMPT_NAME, prompt_key="text_system")
+    user_text = render_prompt(
+        SCENE_INTAKE_PROMPT_NAME,
+        {"text": request.text or ""},
+        prompt_key="text_user",
+    )
+    return [
+        {"role": "system", "content": system_text},
+        {"role": "user", "content": user_text},
+    ]
+
+
+def _build_image_messages(request: Prompt2SceneInput) -> list[dict[str, Any]]:
+    """Render the system and multimodal user messages for an image request.
+
+    The user message carries the rendered prompt text followed by the image
+    encoded through ``image_to_data_url``.
+
+    Raises:
+        ValueError: If ``request.image_path`` is ``None``.
+    """
+    source_image = request.image_path
+    if source_image is None:
+        raise ValueError("Image input requires image_path.")
+
+    system_text = render_prompt(SCENE_INTAKE_PROMPT_NAME, prompt_key="image_system")
+    user_text = render_prompt(SCENE_INTAKE_PROMPT_NAME, prompt_key="image_user")
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": image_to_data_url(source_image)},
+    }
+
+    return [
+        {"role": "system", "content": system_text},
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": user_text}, image_part],
+        },
+    ]
+
+
+def _build_text_verifier_messages(
+    *,
+    request: Prompt2SceneInput,
+    scene_intake_json: str,
+) -> list[dict[str, Any]]:
+    """Render verifier messages for a text request.
+
+    The user prompt receives the request text (empty string when falsy) and
+    the serialized draft intake under ``scene_intake_json``.
+    """
+    system_text = render_prompt(SCENE_INTAKE_PROMPT_NAME, prompt_key="verifier_system")
+    template_values = {
+        "text": request.text or "",
+        "scene_intake_json": scene_intake_json,
+    }
+    user_text = render_prompt(
+        SCENE_INTAKE_PROMPT_NAME,
+        template_values,
+        prompt_key="verifier_text_user",
+    )
+
+    return [
+        {"role": "system", "content": system_text},
+        {"role": "user", "content": user_text},
+    ]
+
+
+def _build_image_verifier_messages(
+    *,
+    request: Prompt2SceneInput,
+    scene_intake_json: str,
+) -> list[dict[str, Any]]:
+    """Render verifier messages for an image request.
+
+    The user message pairs the rendered verifier prompt (which receives the
+    serialized draft under ``scene_intake_json``) with the input image
+    encoded through ``image_to_data_url``.
+
+    Raises:
+        ValueError: If ``request.image_path`` is ``None``.
+    """
+    source_image = request.image_path
+    if source_image is None:
+        raise ValueError("Image input requires image_path.")
+
+    system_text = render_prompt(SCENE_INTAKE_PROMPT_NAME, prompt_key="verifier_system")
+    user_text = render_prompt(
+        SCENE_INTAKE_PROMPT_NAME,
+        {"scene_intake_json": scene_intake_json},
+        prompt_key="verifier_image_user",
+    )
+
+    text_part = {"type": "text", "text": user_text}
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": image_to_data_url(source_image)},
+    }
+
+    return [
+        {"role": "system", "content": system_text},
+        {"role": "user", "content": [text_part, image_part]},
+    ]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/schema.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/schema.py
new file mode 100644
index 00000000..31b55e6d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/schema.py
@@ -0,0 +1,260 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.request import (
+    InputKind,
+    Prompt2SceneInput,
+)
+
+__all__ = [
+    "SCENE_INTAKE_JSON_SCHEMA",
+    "SceneIntakeAsset",
+    "SceneIntakeInputRecord",
+    "SceneIntakeSpec",
+    "SceneIntakeTable",
+]
+
+# JSON Schema (as a Python dict) for the raw model output of scene intake:
+# one ``table`` object plus a list of ``assets`` groups. The five-entry
+# ``class_candidate`` rule and the coverage enum are checked again at parse
+# time by ``_parse_class_candidate`` and ``_parse_table`` in ``utils.py``; the
+# string length bounds below are not re-checked there (only non-emptiness is).
+SCENE_INTAKE_JSON_SCHEMA: dict[str, Any] = {
+    "title": "SceneIntakeModelOutput",
+    "description": (
+        "Objects and table information extracted from a text or image input."
+    ),
+    "type": "object",
+    "additionalProperties": False,
+    "properties": {
+        "table": {
+            "type": "object",
+            "additionalProperties": False,
+            "properties": {
+                "name": {
+                    "type": "string",
+                    "description": (
+                        "Canonical English class name for the visible table "
+                        "or tabletop target, such as table, desk, dining_table, "
+                        "coffee_table, workbench, or tabletop."
+                    ),
+                },
+                "description": {
+                    "type": "string",
+                    "minLength": 20,
+                    "maxLength": 180,
+                    "description": (
+                        "One concise standalone appearance description of the "
+                        "visible table or tabletop region."
+                    ),
+                },
+                "complete_table_description": {
+                    "type": "string",
+                    "minLength": 20,
+                    "maxLength": 220,
+                    "description": (
+                        "One concise standalone description of a complete table "
+                        "asset for text-to-3D generation, matching the visible "
+                        "tabletop color, material, and texture."
+                    ),
+                },
+                "is_complete_visible_table": {
+                    "type": "boolean",
+                    "description": (
+                        "For image input, whether a mostly complete table is "
+                        "visible and suitable as the final table geometry source. "
+                        "For text input, this should be false."
+                    ),
+                },
+                "class_candidate": {
+                    "type": "array",
+                    "minItems": 5,
+                    "maxItems": 5,
+                    "description": (
+                        "Exactly five likely class names for segmenting the "
+                        "visible table or tabletop target."
+                    ),
+                    "items": {
+                        "type": "string",
+                        "minLength": 1,
+                    },
+                },
+                "object_coverage_percent": {
+                    "type": "integer",
+                    "enum": [10, 30, 50, 70],
+                    "description": (
+                        "For image input with a complete visible table ONLY: "
+                        "choose the closest coverage bucket for objects on the "
+                        "tabletop: 10 (mostly empty, a few small objects), "
+                        "30 (lightly cluttered), 50 (moderately cluttered), "
+                        "70 (densely packed). Omit this field entirely for "
+                        "text input or when is_complete_visible_table is false."
+                    ),
+                },
+            },
+            "required": [
+                "name",
+                "description",
+                "complete_table_description",
+                "is_complete_visible_table",
+                "class_candidate",
+            ],
+        },
+        "assets": {
+            "type": "array",
+            "description": (
+                "Object category groups on or intended for the tabletop scene."
+            ),
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    "name": {
+                        "type": "string",
+                        "description": (
+                            "Canonical English object name, singular, "
+                            "snake_case preferred."
+                        ),
+                    },
+                    "description": {
+                        "type": "string",
+                        "minLength": 20,
+                        "maxLength": 180,
+                        "description": (
+                            "One concise appearance description of the object for "
+                            "image and 3D geometry generation."
+                        ),
+                    },
+                    "class_candidate": {
+                        "type": "array",
+                        "minItems": 5,
+                        "maxItems": 5,
+                        "description": (
+                            "Exactly five likely object class names for later "
+                            "image detection or segmentation."
+                        ),
+                        "items": {
+                            "type": "string",
+                            "minLength": 1,
+                        },
+                    },
+                    "count": {
+                        "type": "integer",
+                        "description": (
+                            "Number of repeated instances in this object category "
+                            "group. Only group objects that can share the same name, "
+                            "description, and class_candidate list."
+                        ),
+                        "minimum": 1,
+                    },
+                },
+                "required": ["name", "description", "class_candidate", "count"],
+            },
+        },
+    },
+    "required": ["table", "assets"],
+}
+
+
+@dataclass(frozen=True)
+class SceneIntakeInputRecord:
+    """Immutable record of the input a scene intake run was given.
+
+    ``image_path`` is held as a string.
+    """
+
+    input_kind: InputKind
+    text: str | None = None
+    image_path: str | None = None
+
+    @classmethod
+    def from_request(cls, request: Prompt2SceneInput) -> "SceneIntakeInputRecord":
+        """Build a record from *request*, stringifying a truthy ``image_path``."""
+        kind, text, path = request.input_kind, request.text, request.image_path
+        image_path = str(path) if path else None
+        return cls(input_kind=kind, text=text, image_path=image_path)
+
+    def to_manifest(self) -> dict[str, str | None]:
+        """Return the record as a JSON-safe mapping."""
+        manifest: dict[str, str | None] = {"input_kind": self.input_kind.value}
+        manifest["text"] = self.text
+        manifest["image_path"] = self.image_path
+        return manifest
+
+
+@dataclass(frozen=True)
+class SceneIntakeTable:
+    """Frozen description of the table (support surface) found by scene intake.
+
+    ``object_coverage_percent`` is optional and defaults to ``None``.
+    """
+
+    id: str = "table"
+    name: str = "table"
+    description: str = ""
+    complete_table_description: str = ""
+    is_complete_visible_table: bool = False
+    class_candidate: list[str] = field(default_factory=list)
+    object_coverage_percent: int | None = None
+
+    def to_manifest(self) -> dict[str, object]:
+        """Return a JSON-safe dict; the coverage key is omitted when ``None``."""
+        required = ("id", "name", "description", "complete_table_description")
+        manifest: dict[str, object] = {key: getattr(self, key) for key in required}
+        manifest["is_complete_visible_table"] = self.is_complete_visible_table
+        manifest["class_candidate"] = list(self.class_candidate)
+        coverage = self.object_coverage_percent
+        if coverage is not None:
+            manifest["object_coverage_percent"] = coverage
+        return manifest
+
+
+@dataclass(frozen=True)
+class SceneIntakeAsset:
+    """Frozen record for one object category group found by scene intake.
+
+    ``count`` is the number of instances in the group and defaults to 1.
+    """
+
+    id: str
+    name: str
+    count: int = 1
+    description: str = ""
+    class_candidate: list[str] = field(default_factory=list)
+
+    def to_manifest(self) -> dict[str, object]:
+        """Return the asset as a JSON-safe dict with a copied candidate list."""
+        scalar_keys = ("id", "name", "count", "description")
+        manifest: dict[str, object] = {key: getattr(self, key) for key in scalar_keys}
+        manifest["class_candidate"] = list(self.class_candidate)
+        return manifest
+
+
+@dataclass(frozen=True)
+class SceneIntakeSpec:
+    """Frozen result of scene intake: the input record, table, and assets."""
+
+    input: SceneIntakeInputRecord
+    table: SceneIntakeTable
+    assets: list[SceneIntakeAsset]
+
+    def to_manifest(self) -> dict[str, object]:
+        """Return a JSON-safe dict built from each part's ``to_manifest``."""
+        input_manifest = self.input.to_manifest()
+        table_manifest = self.table.to_manifest()
+        manifest: dict[str, object] = {"input": input_manifest, "table": table_manifest}
+        manifest["assets"] = [item.to_manifest() for item in self.assets]
+        return manifest
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/state.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/state.py
new file mode 100644
index 00000000..7a96619f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/state.py
@@ -0,0 +1,37 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.attempt_state import AttemptState
+
+__all__ = ["SceneIntakeState"]
+
+
+class SceneIntakeState(AttemptState):
+    """LangGraph state for the fixed scene intake workflow.
+
+    Extends ``AttemptState``; node functions read these keys by subscript
+    (for example ``state["raw_model_output"]``) and return partial updates.
+    """
+
+    # Original user request (text or image input).
+    request: Prompt2SceneInput
+    # NOTE(review): presumably the chat messages sent to the model — confirm.
+    messages: list[Any]
+    # Latest raw model JSON; nodes set it to None when a model call fails.
+    raw_model_output: dict[str, Any] | None
+    # NOTE(review): presumably the pre-verification draft spec — confirm.
+    draft_scene_intake: SceneIntakeSpec | None
+    # Final normalized spec; None until normalization succeeds.
+    scene_intake: SceneIntakeSpec | None
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/utils.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/utils.py
new file mode 100644
index 00000000..da084f55
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/utils.py
@@ -0,0 +1,256 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.log import log_warning
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeAsset,
+    SceneIntakeInputRecord,
+    SceneIntakeSpec,
+    SceneIntakeTable,
+)
+
+__all__ = ["build_scene_intake_spec", "normalize_asset_name"]
+
+
+def normalize_asset_name(name: str) -> str:
+    """Return a lowercase snake_case slug of *name*, or ``"object"`` if empty.
+
+    Hyphens and slashes become separators, characters outside ``a-z``, ``0-9``,
+    whitespace and ``_`` are dropped, and runs of separators collapse to one
+    underscore with none left at either end.
+    """
+    slug = name.strip().lower()
+    for separator in ("-", "/"):
+        slug = slug.replace(separator, " ")
+    slug = re.sub(r"[^a-z0-9\s_]", "", slug)
+    slug = re.sub(r"_+", "_", re.sub(r"\s+", "_", slug))
+    return slug.strip("_") or "object"
+
+
+def build_scene_intake_spec(
+    *,
+    request: Prompt2SceneInput,
+    model_output: dict[str, Any],
+) -> SceneIntakeSpec:
+    """Parse raw model JSON into a ``SceneIntakeSpec``.
+
+    Unexpected top-level keys are only logged. ``table`` must be an object and
+    ``assets`` a list; otherwise ``ValueError`` is raised, as it is by the
+    nested table and asset parsers.
+    """
+    context = "Scene intake model output"
+    _validate_exact_keys(model_output, allowed_keys={"table", "assets"}, context=context)
+    record = SceneIntakeInputRecord.from_request(request)
+    # The table is fully parsed before ``assets`` is even type-checked.
+    raw_table = _require_mapping(model_output.get("table"), "table")
+    table = _parse_table(raw_table)
+    raw_assets = _require_list(model_output.get("assets"), "assets")
+    assets = _parse_assets(raw_assets)
+    return SceneIntakeSpec(input=record, table=table, assets=assets)
+
+
+def _parse_table(raw_table: dict[str, Any]) -> SceneIntakeTable:
+    """Validate the raw ``table`` mapping and build a ``SceneIntakeTable``.
+
+    ``name`` is normalized with ``normalize_asset_name`` and both descriptions
+    are stripped. Unexpected keys are handled by ``_validate_exact_keys``.
+    ``object_coverage_percent`` is optional; when present it must be one of
+    10, 30, 50 or 70.
+
+    Args:
+        raw_table: The ``table`` object taken from the model output.
+
+    Returns:
+        The parsed table record.
+
+    Raises:
+        ValueError: If a required field is missing, empty or mistyped, or if
+            ``object_coverage_percent`` is not one of the allowed values.
+    """
+    _validate_exact_keys(
+        raw_table,
+        allowed_keys={
+            "name",
+            "description",
+            "complete_table_description",
+            "is_complete_visible_table",
+            "class_candidate",
+            "object_coverage_percent",
+        },
+        context="Scene intake table",
+    )
+
+    texts: dict[str, str] = {}
+    for key in ("name", "description", "complete_table_description"):
+        if key not in raw_table:
+            raise ValueError(f"Scene intake table.{key} is required.")
+        value = raw_table[key]
+        # A JSON null would otherwise be stringified to the literal "None".
+        text = "" if value is None else str(value).strip()
+        if not text:
+            raise ValueError(f"Scene intake table.{key} must be non-empty.")
+        texts[key] = text
+    name = normalize_asset_name(texts["name"])
+
+    if "is_complete_visible_table" not in raw_table:
+        raise ValueError("Scene intake table.is_complete_visible_table is required.")
+    is_complete_visible_table = raw_table["is_complete_visible_table"]
+    if not isinstance(is_complete_visible_table, bool):
+        raise ValueError(
+            "Scene intake table.is_complete_visible_table must be a boolean."
+        )
+
+    class_candidate = _parse_class_candidate(
+        raw_table.get("class_candidate"),
+        asset_index="table",
+        raw_name=name,
+    )
+
+    object_coverage_percent: int | None = None
+    raw_percent = raw_table.get("object_coverage_percent")
+    if raw_percent is not None:
+        if isinstance(raw_percent, bool):
+            raise ValueError(
+                "Scene intake table.object_coverage_percent must be an integer, "
+                "not a boolean."
+            )
+        invalid = (
+            "Scene intake table.object_coverage_percent must be one of "
+            f"10, 30, 50, 70, got {raw_percent!r}."
+        )
+        try:
+            object_coverage_percent = int(raw_percent)
+        except (TypeError, ValueError, OverflowError) as exc:
+            # OverflowError covers float infinity, which int() cannot convert.
+            raise ValueError(invalid) from exc
+        # int() truncates floats, so 10.9 must not slip through as 10.
+        truncated = (
+            isinstance(raw_percent, float) and raw_percent != object_coverage_percent
+        )
+        if truncated or object_coverage_percent not in (10, 30, 50, 70):
+            raise ValueError(invalid)
+
+    return SceneIntakeTable(
+        name=name,
+        description=texts["description"],
+        complete_table_description=texts["complete_table_description"],
+        is_complete_visible_table=is_complete_visible_table,
+        class_candidate=class_candidate,
+        object_coverage_percent=object_coverage_percent,
+    )
+
+
+def _parse_assets(raw_assets: list[Any]) -> list[SceneIntakeAsset]:
+    """Validate raw asset entries and build ``SceneIntakeAsset`` records.
+
+    Names are normalized with ``normalize_asset_name``. When a normalized
+    name repeats, later entries get ``_2``, ``_3``, ... appended. Ids are the
+    final name prefixed with ``interact_``. A ``null`` name or description is
+    treated as empty instead of being stringified to "None".
+
+    Args:
+        raw_assets: The ``assets`` list taken from the model output.
+
+    Returns:
+        One record per entry, in input order, with unique names.
+
+    Raises:
+        ValueError: If an entry is not an object, or a field is missing,
+            empty or invalid.
+    """
+    assets: list[SceneIntakeAsset] = []
+    used_names: set[str] = set()
+
+    for asset_index, raw_asset in enumerate(raw_assets):
+        label = f"Scene intake asset {asset_index}"
+        if not isinstance(raw_asset, dict):
+            raise ValueError(f"{label} must be an object.")
+        allowed = {"name", "description", "class_candidate", "count"}
+        _validate_exact_keys(raw_asset, allowed_keys=allowed, context=label)
+
+        texts: dict[str, str] = {}
+        for key in ("name", "description"):
+            if key not in raw_asset:
+                raise ValueError(f"{label}.{key} is required.")
+            value = raw_asset[key]
+            # A JSON null would otherwise be stringified to the literal "None".
+            texts[key] = "" if value is None else str(value).strip()
+            if not texts[key]:
+                raise ValueError(f"{label}.{key} must be non-empty.")
+
+        class_candidate = _parse_class_candidate(
+            raw_asset.get("class_candidate"),
+            asset_index=asset_index,
+            raw_name=texts["name"],
+        )
+        count = _parse_count(raw_asset.get("count"), asset_index=asset_index)
+
+        # De-duplicate names so every asset id stays unique.
+        base_name = normalize_asset_name(texts["name"])
+        name = base_name
+        suffix = 2
+        while name in used_names:
+            name = f"{base_name}_{suffix}"
+            suffix += 1
+        used_names.add(name)
+        assets.append(
+            SceneIntakeAsset(
+                id=f"interact_{name}",
+                name=name,
+                count=count,
+                description=texts["description"],
+                class_candidate=class_candidate,
+            )
+        )
+    return assets
+
+
+def _parse_class_candidate(
+    raw_class_candidate: Any,
+    *,
+    asset_index: int | str,
+    raw_name: str,
+) -> list[str]:
+    """Validate and normalize a five-entry ``class_candidate`` list.
+
+    Blank or ``null`` entries are rejected before normalization, because
+    ``normalize_asset_name`` would otherwise turn them into "object" and the
+    emptiness check could never fire.
+
+    Args:
+        raw_class_candidate: The raw value taken from the model output.
+        asset_index: Asset position for error messages; a string (the table
+            passes ``"table"``) is reported without the "asset" prefix.
+        raw_name: Owner name; its normalized form must equal the first entry.
+
+    Returns:
+        The five normalized class names.
+
+    Raises:
+        ValueError: If the value is not a list of five non-blank entries whose
+            first normalized entry equals the normalized ``raw_name``.
+    """
+    prefix = "Scene intake" if isinstance(asset_index, str) else "Scene intake asset"
+    field_name = f"{prefix} {asset_index}.class_candidate"
+    if not isinstance(raw_class_candidate, list):
+        raise ValueError(f"{field_name} must be a list.")
+    if len(raw_class_candidate) != 5:
+        raise ValueError(f"{field_name} must contain exactly five entries.")
+    if any(item is None or not str(item).strip() for item in raw_class_candidate):
+        raise ValueError(f"{field_name} has empty entries.")
+    class_candidate = [normalize_asset_name(str(item)) for item in raw_class_candidate]
+    if class_candidate[0] != normalize_asset_name(raw_name):
+        raise ValueError(f"{field_name}[0] must equal name.")
+    return class_candidate
+
+
+def _parse_count(raw_count: Any, *, asset_index: int) -> int:
+    """Return *raw_count* when it is a non-boolean integer of at least 1.
+
+    Raises:
+        ValueError: If *raw_count* is not an ``int`` (booleans are rejected
+            too) or is smaller than 1.
+    """
+    field_name = f"Scene intake asset {asset_index}.count"
+    if isinstance(raw_count, bool) or not isinstance(raw_count, int):
+        raise ValueError(f"{field_name} must be an integer.")
+    if raw_count < 1:
+        raise ValueError(f"{field_name} must be >= 1.")
+    return raw_count
+
+
+def _validate_exact_keys(
+    value: dict[str, Any],
+    *,
+    allowed_keys: set[str],
+    context: str,
+) -> None:
+    """Log a warning listing keys of *value* that are not in *allowed_keys*.
+
+    Despite the name, nothing is raised: unexpected keys are reported (sorted)
+    through ``log_warning`` and *value* is left untouched.
+    """
+    unexpected = sorted(set(value).difference(allowed_keys))
+    if not unexpected:
+        return
+    log_warning(
+        f"{context} has unexpected keys: {unexpected}. These fields will be ignored."
+    )
+
+
+def _require_mapping(value: Any, context: str) -> dict[str, Any]:
+    """Return *value* unchanged when it is a ``dict``.
+
+    Raises:
+        ValueError: If *value* is not a ``dict``; *context* names the field.
+    """
+    if isinstance(value, dict):
+        return value
+    raise ValueError(f"{context} must be an object.")
+
+
+def _require_list(value: Any, context: str) -> list[Any]:
+    """Return *value* unchanged when it is a ``list``.
+
+    Raises:
+        ValueError: If *value* is not a ``list``; *context* names the field.
+    """
+    if isinstance(value, list):
+        return value
+    raise ValueError(f"{context} must be a list.")
diff --git a/embodichain/gen_sim/prompt2scene/workflows/spatial.py b/embodichain/gen_sim/prompt2scene/workflows/spatial.py
new file mode 100644
index 00000000..b5f93868
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/spatial.py
@@ -0,0 +1,309 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+__all__ = [
+    "GRID_VALUE_LIST",
+    "GRID_VALUES",
+    "RELATION_VALUE_LIST",
+    "RELATION_VALUES",
+    "assign_grids_from_anchor_and_orders",
+    "derive_relations_from_orders",
+    "invert_relation",
+    "normalize_relation",
+    "transitive_relation_closure",
+    "validate_exact_asset_id_coverage",
+]
+
+RELATION_VALUE_LIST = ["left_of", "front_of"]  # canonical directions only
+RELATION_VALUES = frozenset(RELATION_VALUE_LIST)
+INVERSE_RELATIONS = {  # also covers the non-canonical "right_of" / "behind"
+    "left_of": "right_of",
+    "right_of": "left_of",
+    "front_of": "behind",
+    "behind": "front_of",
+}
+
+GRID_VALUE_LIST = [  # 3x3 table grid; a centered axis is left out of the label
+    "center",
+    "front",
+    "back",
+    "left_center",
+    "right_center",
+    "left_front",
+    "right_front",
+    "left_back",
+    "right_back",
+]
+GRID_VALUES = frozenset(GRID_VALUE_LIST)
+
+
+def validate_exact_asset_id_coverage(
+    *, values: list[str], expected_asset_ids: list[str], context: str
+) -> None:
+    """Raise ValueError unless values holds each expected asset id exactly once."""
+    from collections import Counter  # function-scope: adds no file-level import
+
+    # Single counting pass; the previous list.count() per element was O(n^2).
+    counts = Counter(values)
+    expected = set(expected_asset_ids)
+    duplicates = sorted(asset_id for asset_id, n in counts.items() if n > 1)
+    missing = sorted(expected.difference(counts))
+    unknown = sorted(set(counts) - expected)
+    if duplicates:
+        raise ValueError(f"{context} has duplicate asset ids: {duplicates}.")
+    if missing:
+        raise ValueError(f"{context} is missing asset ids: {missing}.")
+    if unknown:
+        raise ValueError(f"{context} has unknown asset ids: {unknown}.")
+
+
+def assign_grids_from_anchor_and_orders(
+    *,
+    anchor_asset_id: str,
+    anchor_grid: str,
+    x_order: list[list[str]],
+    y_order: list[list[str]],
+    asset_ids: list[str],
+) -> dict[str, str]:
+    """Label every asset with a 9-grid cell relative to one anchored asset.
+
+    Raises KeyError if the anchor or an asset id is missing from either order.
+    """
+    anchor_x, anchor_y = _split_grid(anchor_grid)
+    x_rank = _order_indices(x_order)
+    y_rank = _order_indices(y_order)
+    x_origin, y_origin = x_rank[anchor_asset_id], y_rank[anchor_asset_id]
+
+    def _label(rank: int, origin: int, same: str, before: str, after: str) -> str:
+        return _axis_label_from_anchor(
+            index=rank,
+            anchor_index=origin,
+            anchor_label=same,
+            before_label=before,
+            after_label=after,
+        )
+
+    return {
+        asset_id: _join_grid(
+            x_label=_label(x_rank[asset_id], x_origin, anchor_x, "left", "right"),
+            y_label=_label(y_rank[asset_id], y_origin, anchor_y, "front", "back"),
+        )
+        for asset_id in asset_ids
+    }
+
+
+def invert_relation(relation: str) -> str:
+    """Return the opposite direction of ``relation`` (e.g. left_of -> right_of)."""
+    if relation in INVERSE_RELATIONS:
+        return INVERSE_RELATIONS[relation]
+    raise ValueError(f"Unsupported spatial relation: {relation!r}.")
+
+
+def normalize_relation(
+    *,
+    subject: str,
+    relation: str,
+    object_id: str,
+) -> tuple[str, str, str]:
+    """Rewrite a relation as an equivalent ``left_of`` / ``front_of`` edge.
+
+    ``right_of`` and ``behind`` are expressed by swapping subject and object.
+    """
+    for forward, backward in (("left_of", "right_of"), ("front_of", "behind")):
+        if relation == forward:
+            return subject, forward, object_id
+        if relation == backward:
+            return object_id, forward, subject
+    raise ValueError(f"Unsupported spatial relation: {relation!r}.")
+
+
+def transitive_relation_closure(
+    relations: list[dict[str, str]],
+) -> list[dict[str, str]]:
+    """Expand canonical left/front relations with transitive closure.
+
+    Inputs are normalized first; an output edge has source "input" when its
+    normalized form was given directly and "closure" when it was only implied.
+    Raises ValueError for self, unsupported, contradictory or cyclic relations.
+    """
+    direct_edges: dict[str, set[tuple[str, str]]] = {
+        "left_of": set(),
+        "front_of": set(),
+    }
+    for relation_record in relations:
+        subject, canonical_relation, object_id = normalize_relation(
+            subject=relation_record["subject"],
+            relation=relation_record["relation"],
+            object_id=relation_record["object"],
+        )
+        if subject == object_id:
+            raise ValueError("Spatial relation cannot reference the same object.")
+        if (object_id, subject) in direct_edges[canonical_relation]:
+            raise ValueError(
+                "Conflicting spatial relations: "
+                f"{subject!r} {canonical_relation} {object_id!r}."
+            )
+        direct_edges[canonical_relation].add((subject, object_id))
+
+    output: list[dict[str, str]] = []
+    seen: set[tuple[str, str, str]] = set()
+    for canonical_relation, edges in direct_edges.items():
+        for subject, object_id in sorted(_transitive_edges(edges)):
+            if subject == object_id:
+                # A node that reaches itself means the inputs form a cycle.
+                raise ValueError(
+                    f"Conflicting spatial relations: cycle through {subject!r}."
+                )
+            _append_relation(
+                output=output,
+                seen=seen,
+                subject=subject,
+                relation=canonical_relation,
+                object_id=object_id,
+                # Match on the normalized edge so "right_of"/"behind" inputs count.
+                source="input" if (subject, object_id) in edges else "closure",
+            )
+    return output
+
+
+def derive_relations_from_orders(
+    *,
+    x_order: list[list[str]],
+    y_order: list[list[str]],
+) -> list[dict[str, str]]:
+    """Derive transitively closed left_of/front_of relations from both orders."""
+    seeds = [
+        *_relations_from_order_groups(x_order, relation="left_of"),
+        *_relations_from_order_groups(y_order, relation="front_of"),
+    ]
+    # Seeded edges come back tagged "input"; report them as "order" instead.
+    # Any other tag (the transitively implied edges) is passed through as is.
+    retagged: list[dict[str, str]] = []
+    for record in transitive_relation_closure(seeds):
+        source = record["source"]
+        retagged.append({**record, "source": "order" if source == "input" else source})
+    return retagged
+
+
+def _order_indices(order: list[list[str]]) -> dict[str, int]:
+    """Map each asset id to the index of the order group that contains it."""
+    indices: dict[str, int] = {}
+    for group_index, group in enumerate(order):
+        indices.update(dict.fromkeys(group, group_index))
+    return indices
+
+
+def _split_grid(grid: str) -> tuple[str, str]:
+    """Split a grid label into (x, y) axis labels; a missing axis is "center"."""
+    if grid == "center":
+        return "center", "center"
+    if grid in {"front", "back"}:
+        return "center", grid
+    # Remaining labels are "<x>_<y>"; this also covers "left_center"/"right_center".
+    x_label, y_label = grid.split("_", maxsplit=1)
+    return x_label, y_label
+
+
+def _axis_label_from_anchor(
+    *,
+    index: int,
+    anchor_index: int,
+    anchor_label: str,
+    before_label: str,
+    after_label: str,
+) -> str:
+    """Pick the axis label for ``index`` relative to the anchor's position."""
+    if index < anchor_index:
+        return before_label
+    # Not before: either strictly after, or at the anchor's own position.
+    return after_label if index > anchor_index else anchor_label
+
+
+def _join_grid(*, x_label: str, y_label: str) -> str:
+    """Combine per-axis labels into a single grid label."""
+    if x_label != "center":
+        # Covers "left_center"-style labels too, since y_label is "center" there.
+        return f"{x_label}_{y_label}"
+    # A centered x axis is left implicit, so the label is the y label alone
+    # (for example "front", or "center" when both axes are centered).
+    return y_label
+
+
+def _relations_from_order_groups(
+    order_groups: list[list[str]],
+    *,
+    relation: str,
+) -> list[dict[str, str]]:
+    """Relate every member of each group to every member of the next group."""
+    # Only adjacent groups are paired; records are tagged source="input".
+    return [
+        {
+            "subject": earlier_id,
+            "relation": relation,
+            "object": later_id,
+            "source": "input",
+        }
+        for earlier_group, later_group in zip(order_groups, order_groups[1:])
+        for earlier_id in earlier_group
+        for later_id in later_group
+    ]
+
+
+def _transitive_edges(
+    edges: set[tuple[str, str]],
+) -> set[tuple[str, str]]:
+    """Return ``edges`` plus every (start, node) pair reachable by following them."""
+    successors: dict[str, set[str]] = {}
+    for source_id, target_id in edges:
+        successors.setdefault(source_id, set()).add(target_id)
+        successors.setdefault(target_id, set())
+
+    reachable_pairs: set[tuple[str, str]] = set(edges)
+    for start, first_hops in successors.items():
+        pending = list(first_hops)
+        reached: set[str] = set()
+        while pending:
+            node = pending.pop()
+            if node not in reached:
+                reached.add(node)
+                reachable_pairs.add((start, node))
+                pending.extend(successors.get(node, ()))
+    return reachable_pairs
+
+
+def _append_relation(
+    *,
+    output: list[dict[str, str]],
+    seen: set[tuple[str, str, str]],
+    subject: str,
+    relation: str,
+    object_id: str,
+    source: str,
+) -> None:
+    """Append a relation record to ``output`` unless its triple is in ``seen``."""
+    # ``source`` is not part of the key: the first record for a triple wins.
+    key = (subject, relation, object_id)
+    if key not in seen:
+        seen.add(key)
+        record = {
+            "subject": subject,
+            "relation": relation,
+            "object": object_id,
+            "source": source,
+        }
+        output.append(record)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/stage_errors.py b/embodichain/gen_sim/prompt2scene/workflows/stage_errors.py
new file mode 100644
index 00000000..f8d8c230
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/stage_errors.py
@@ -0,0 +1,40 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+__all__ = ["format_attempt_error", "format_result_missing_error"]
+
+
+def format_attempt_error(stage_name: str, attempt_count: int, exc: Exception) -> str:
+    """Format the message for one failed, retryable attempt of a stage."""
+    return f"{stage_name} attempt {attempt_count} failed: {exc}"
+
+
+def format_result_missing_error(
+    stage_name: str,
+    result_name: str,
+    *,
+    attempt_count: int,
+    last_error: str | None,
+    errors: list[str],
+) -> str:
+    """Format the error for a stage that finished without its final result."""
+    return (
+        f"{stage_name} failed to produce a {result_name} after "
+        f"{attempt_count} attempts. Last error: {last_error}. "
+        f"All retryable errors: {errors}"  # rendered as the list's repr
+    )
diff --git a/embodichain/gen_sim/prompt2scene/workflows/text_relations/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/text_relations/__init__.py
new file mode 100644
index 00000000..e2c03539
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/text_relations/__init__.py
@@ -0,0 +1,24 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.graph import (
+    build_text_relations_graph,
+    run_text_relations,
+)
+
+__all__ = ["build_text_relations_graph", "run_text_relations"]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/text_relations/graph.py b/embodichain/gen_sim/prompt2scene/workflows/text_relations/graph.py
new file mode 100644
index 00000000..f6aa6078
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/text_relations/graph.py
@@ -0,0 +1,124 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from langgraph.graph import END, StateGraph
+
+from embodichain.gen_sim.prompt2scene.llms import (
+    OpenAICompatibleLLMCfg,
+    build_chat_model,
+)
+from embodichain.gen_sim.prompt2scene.utils import log
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+    format_result_missing_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.nodes import (
+    call_llm_text_relations_node,
+    normalize_text_relations_node,
+    prepare_text_relation_messages_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.schema import (
+    TextRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.state import (
+    TextRelationsState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+
+__all__ = ["build_text_relations_graph", "run_text_relations"]
+
+
+def route_after_text_relation_normalize(state: TextRelationsState) -> str:
+    """Return "retry" while no result exists and attempts remain, else "end"."""
+    # Retry only while no result exists and the attempt budget is not used up.
+    pending = state["text_relations"] is None
+    if pending and state["attempt_count"] < state["max_attempts"]:
+        return "retry"
+    return "end"
+
+
+def build_text_relations_graph(llm: Any) -> Any:
+    """Build and compile the prepare -> call LLM -> normalize workflow graph."""
+    graph = StateGraph(TextRelationsState)
+    graph.add_node(
+        "prepare_text_relation_messages",
+        prepare_text_relation_messages_node,
+    )
+    graph.add_node(
+        "call_llm_text_relations",
+        lambda state: call_llm_text_relations_node(state, llm=llm),  # binds llm
+    )
+    graph.add_node("normalize_text_relations", normalize_text_relations_node)
+
+    graph.set_entry_point("prepare_text_relation_messages")
+    graph.add_edge("prepare_text_relation_messages", "call_llm_text_relations")
+    graph.add_edge("call_llm_text_relations", "normalize_text_relations")
+    graph.add_conditional_edges(
+        "normalize_text_relations",
+        route_after_text_relation_normalize,
+        {
+            "retry": "call_llm_text_relations",  # skips the prepare node on retry
+            "end": END,
+        },
+    )
+    return graph.compile()
+
+
+def run_text_relations(
+    request: Prompt2SceneInput,
+    *,
+    scene_intake: SceneIntakeSpec,
+    llm_cfg: OpenAICompatibleLLMCfg,
+    output_root: Path,
+) -> TextRelationSpec:
+    """Run text relation extraction for one request; raise RuntimeError on failure."""
+    llm = build_chat_model(llm_cfg)
+    graph = build_text_relations_graph(llm)
+    result = graph.invoke(
+        {
+            "request": request,
+            "scene_intake": scene_intake,
+            "output_root": output_root,
+            "messages": [],
+            "raw_model_output": None,
+            "text_relations": None,
+            "attempt_count": 0,  # incremented by the LLM call node
+            "max_attempts": llm_cfg.max_attempts,
+            "last_error": None,
+            "errors": [],
+        }
+    )
+
+    text_relations = result.get("text_relations")
+    if text_relations is not None:
+        return text_relations
+
+    error = format_result_missing_error(
+        "Text relations",
+        "TextRelationSpec",
+        attempt_count=result.get("attempt_count", 0),
+        last_error=result.get("last_error"),
+        errors=result.get("errors", []),
+    )
+    log.log_warning(error)  # logged before raising so the summary reaches the log
+    raise RuntimeError(error)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/text_relations/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/text_relations/nodes.py
new file mode 100644
index 00000000..67b1fc3c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/text_relations/nodes.py
@@ -0,0 +1,144 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.request import InputKind
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.schema import (
+    TEXT_RELATIONS_JSON_SCHEMA,
+    TextRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.utils import (
+    log_api_request_start,
+    log,
+)
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    TEXT_RELATIONS_STEP,
+    WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.workflows.llm_output import (
+    StructuredModelCallError,
+    call_structured_json_model_step,
+)
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+    format_attempt_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.prompts import (
+    build_text_relation_messages,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.state import (
+    TextRelationsState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.utils import (
+    build_text_relation_spec,
+)
+
+__all__ = [
+    "call_llm_text_relations_node",
+    "normalize_text_relations_node",
+    "prepare_text_relation_messages_node",
+]
+
+
+def prepare_text_relation_messages_node(
+    state: TextRelationsState,
+) -> dict[str, object]:
+    """Build the LLM messages for a text request; other input kinds are rejected."""
+    request = state["request"]
+    if request.input_kind != InputKind.TEXT:
+        raise ValueError("Text relations requires a text input.")
+    # The scene intake is only read after the input kind has been accepted.
+    messages = build_text_relation_messages(
+        request=request,
+        scene_intake=state["scene_intake"],
+    )
+    return {"messages": messages}
+
+
+def call_llm_text_relations_node(
+    state: TextRelationsState,
+    *,
+    llm: Any,
+) -> dict[str, object]:
+    """Call the LLM once; record either the raw output or the attempt's error."""
+    attempt_count = state["attempt_count"] + 1  # failed calls are counted too
+    artifact_writer = WorkflowArtifactWriter(
+        state["output_root"],
+        TEXT_RELATIONS_STEP,
+    )
+
+    try:
+        log_api_request_start(
+            step=TEXT_RELATIONS_STEP,
+            request="extract",
+            attempt=attempt_count,
+        )
+        raw_model_output = call_structured_json_model_step(
+            llm=llm,
+            schema=TEXT_RELATIONS_JSON_SCHEMA,
+            messages=state["messages"],
+            context="Text relations",
+            step_name=TEXT_RELATIONS_STEP,
+            output_root=None,  # NOTE(review): presumably superseded by the writer
+            attempt_count=attempt_count,
+            raw_output_label="extract",
+            artifact_writer=artifact_writer,
+        )
+    except StructuredModelCallError as exc:
+        error = format_attempt_error("Text relations", attempt_count, exc)
+        log.log_warning(error)
+        return {
+            "attempt_count": attempt_count,
+            "raw_model_output": None,  # makes the normalize node a no-op
+            "last_error": error,
+            "errors": state["errors"] + [error],  # new list; state is not mutated
+        }
+
+    return {
+        "attempt_count": attempt_count,
+        "raw_model_output": raw_model_output,
+        "last_error": None,
+    }
+
+
+def normalize_text_relations_node(state: TextRelationsState) -> dict[str, object]:
+    """Turn the raw LLM output into a TextRelationSpec and persist its manifest."""
+    raw_model_output = state["raw_model_output"]
+    if raw_model_output is None:
+        return {}  # no raw output to normalize; state is left unchanged
+
+    try:
+        text_relations = build_text_relation_spec(
+            scene_intake=state["scene_intake"],
+            model_output=raw_model_output,
+        )
+    except ValueError as exc:
+        error = format_attempt_error("Text relations", state["attempt_count"], exc)
+        log.log_warning(error)
+        return {
+            "text_relations": None,
+            "last_error": error,
+            "errors": state["errors"] + [error],  # new list; state is not mutated
+        }
+
+    artifact_writer = WorkflowArtifactWriter(
+        state["output_root"],
+        TEXT_RELATIONS_STEP,
+    )
+    artifact_writer.write_step_result(text_relations.to_manifest())
+    return {"text_relations": text_relations, "last_error": None}
diff --git a/embodichain/gen_sim/prompt2scene/workflows/text_relations/prompts.py b/embodichain/gen_sim/prompt2scene/workflows/text_relations/prompts.py
new file mode 100644
index 00000000..a6f02e4f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/text_relations/prompts.py
@@ -0,0 +1,55 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.prompts import render_prompt
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+
+__all__ = ["build_text_relation_messages"]
+
+TEXT_RELATIONS_PROMPT_NAME = "text_relations.yaml"
+
+
+def build_text_relation_messages(
+    *,
+    request: Prompt2SceneInput,
+    scene_intake: SceneIntakeSpec,
+) -> list[dict[str, Any]]:
+    """Build the system and user messages for text spatial-relation extraction."""
+    asset_names = "\n".join(f"- {asset.name}" for asset in scene_intake.assets)
+    return [
+        {
+            "role": "system",
+            "content": render_prompt(TEXT_RELATIONS_PROMPT_NAME, prompt_key="system"),
+        },
+        {
+            "role": "user",
+            "content": render_prompt(
+                TEXT_RELATIONS_PROMPT_NAME,
+                {
+                    "asset_names": asset_names,  # one "- <name>" line per asset
+                    "text": request.text or "",  # a missing text is passed as ""
+                },
+                prompt_key="user",
+            ),
+        },
+    ]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/text_relations/schema.py b/embodichain/gen_sim/prompt2scene/workflows/text_relations/schema.py
new file mode 100644
index 00000000..db2e513f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/text_relations/schema.py
@@ -0,0 +1,164 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.spatial import (
+    GRID_VALUE_LIST,
+    RELATION_VALUE_LIST,
+)
+
+__all__ = [
+    "TEXT_RELATIONS_JSON_SCHEMA",
+    "TextObjectLayout",
+    "TextObjectRelation",
+    "TextRelationSpec",
+    "TextTableConstraint",
+]
+
+TEXT_RELATIONS_JSON_SCHEMA: dict[str, Any] = {
+    "title": "TextRelationsOutput",
+    "type": "object",
+    "additionalProperties": False,
+    "properties": {
+        "object_relations": {  # may be empty: no minItems is set
+            "type": "array",
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    "subject": {"type": "string", "minLength": 1},
+                    "relation": {
+                        "type": "string",
+                        "enum": RELATION_VALUE_LIST,  # from workflows.spatial
+                    },
+                    "object": {"type": "string", "minLength": 1},
+                    "evidence": {"type": "string", "minLength": 1},
+                },
+                "required": ["subject", "relation", "object", "evidence"],
+            },
+        },
+        "table_constraints": {  # may be empty: no minItems is set
+            "type": "array",
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    "asset": {"type": "string", "minLength": 1},
+                    "grid": {
+                        "type": "string",
+                        "enum": GRID_VALUE_LIST,  # from workflows.spatial
+                    },
+                    "evidence": {"type": "string", "minLength": 1},
+                },
+                "required": ["asset", "grid", "evidence"],
+            },
+        },
+        "object_layouts": {  # may be empty: no minItems is set
+            "type": "array",
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    "asset": {"type": "string", "minLength": 1},
+                    "is_arbitrary_layout": {"type": "boolean"},
+                    "reason": {"type": "string", "minLength": 1},
+                },
+                "required": ["asset", "is_arbitrary_layout", "reason"],
+            },
+        },
+    },
+    "required": ["object_relations", "table_constraints", "object_layouts"],
+}
+
+
+@dataclass(frozen=True)
+class TextObjectRelation:
+    """Relation between two scene-intake asset groups, as stated in the text."""
+
+    # Fields mirror the "object_relations" items of TEXT_RELATIONS_JSON_SCHEMA.
+    subject: str
+    relation: str
+    object: str  # shadows the builtin only inside this class body
+    evidence: str
+
+    def to_manifest(self) -> dict[str, str]:
+        """Return the relation as a plain dict in field order."""
+        manifest: dict[str, str] = {}
+        # Keys are spelled out so the manifest layout stays explicit.
+        for name in ("subject", "relation", "object", "evidence"):
+            manifest[name] = getattr(self, name)
+        return manifest
+
+
+@dataclass(frozen=True)
+class TextTableConstraint:
+    """Table grid placement of one asset group, as stated in the text."""
+
+    # Fields mirror the "table_constraints" items of TEXT_RELATIONS_JSON_SCHEMA.
+    asset: str
+    grid: str
+    evidence: str
+
+    def to_manifest(self) -> dict[str, str]:
+        """Return the constraint as a plain dict in field order."""
+        manifest: dict[str, str] = {}
+        for name in ("asset", "grid", "evidence"):
+            manifest[name] = getattr(self, name)
+        return manifest
+
+
+@dataclass(frozen=True)
+class TextObjectLayout:
+    """Support-pose (layout) constraint for one object, as stated in the text."""
+
+    # Fields mirror the "object_layouts" items of TEXT_RELATIONS_JSON_SCHEMA.
+    asset: str
+    is_arbitrary_layout: bool
+    reason: str
+
+    def to_manifest(self) -> dict[str, object]:
+        """Return the layout constraint as a plain dict in field order."""
+        manifest: dict[str, object] = {}
+        for name in ("asset", "is_arbitrary_layout", "reason"):
+            manifest[name] = getattr(self, name)
+        return manifest
+
+
+@dataclass(frozen=True)
+class TextRelationSpec:
+    """Explicit spatial constraints extracted from one text prompt."""
+
+    source_text: str
+    object_relations: list[TextObjectRelation] = field(default_factory=list)
+    table_constraints: list[TextTableConstraint] = field(default_factory=list)
+    object_layouts: list[TextObjectLayout] = field(default_factory=list)
+
+    def to_manifest(self) -> dict[str, object]:
+        """Return the spec as JSON-safe data, one list per constraint kind."""
+        # Nested records are converted through their own to_manifest().
+        relations = [relation.to_manifest() for relation in self.object_relations]
+        constraints = [item.to_manifest() for item in self.table_constraints]
+        layouts = [layout.to_manifest() for layout in self.object_layouts]
+        return {
+            "source_text": self.source_text,
+            "object_relations": relations,
+            "table_constraints": constraints,
+            "object_layouts": layouts,
+        }
diff --git a/embodichain/gen_sim/prompt2scene/workflows/text_relations/state.py b/embodichain/gen_sim/prompt2scene/workflows/text_relations/state.py
new file mode 100644
index 00000000..b8dfa4c9
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/text_relations/state.py
@@ -0,0 +1,42 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.schema import (
+    TextRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.attempt_state import AttemptState
+
+__all__ = ["TextRelationsState"]
+
+
+class TextRelationsState(AttemptState):
+    """LangGraph state carried through the text spatial-relation workflow."""
+
+    request: Prompt2SceneInput
+    scene_intake: SceneIntakeSpec
+    output_root: Path
+    messages: list[Any]
+    raw_model_output: dict[str, Any] | None  # presumably the parsed LLM JSON
+    text_relations: TextRelationSpec | None  # presumably None until built
diff --git a/embodichain/gen_sim/prompt2scene/workflows/text_relations/utils.py b/embodichain/gen_sim/prompt2scene/workflows/text_relations/utils.py
new file mode 100644
index 00000000..58002713
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/text_relations/utils.py
@@ -0,0 +1,191 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.spatial import (
+    GRID_VALUES,
+    RELATION_VALUES,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.utils import (
+    normalize_asset_name,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.schema import (
+    TextObjectLayout,
+    TextObjectRelation,
+    TextRelationSpec,
+    TextTableConstraint,
+)
+
+__all__ = [
+    "build_text_relation_spec",
+]
+
+
+def build_text_relation_spec(
+    *,
+    scene_intake: SceneIntakeSpec,
+    model_output: dict[str, Any],
+) -> TextRelationSpec:
+    """Validate the raw model JSON and wrap it in a ``TextRelationSpec``.
+
+    Asset references are checked against the names in ``scene_intake``.
+    """
+    known_names = {asset.name for asset in scene_intake.assets}
+    relations = _parse_object_relations(
+        model_output.get("object_relations"), asset_names=known_names
+    )
+    constraints = _parse_table_constraints(
+        model_output.get("table_constraints"), asset_names=known_names
+    )
+    layouts = _parse_object_layouts(
+        model_output.get("object_layouts"), asset_names=known_names
+    )
+    return TextRelationSpec(
+        source_text=scene_intake.input.text or "",
+        object_relations=relations,
+        table_constraints=constraints,
+        object_layouts=layouts,
+    )
+
+
+def _parse_object_relations(
+    raw_relations: Any,
+    *,
+    asset_names: set[str],
+) -> list[TextObjectRelation]:
+    """Validate and de-duplicate the ``object_relations`` section.
+
+    Raises ``ValueError`` when the section or one of its entries is malformed.
+    Repeated (subject, relation, object) triples keep only the first entry.
+    """
+    if not isinstance(raw_relations, list):
+        raise ValueError("text_relations.object_relations must be a list.")
+    parsed: dict[tuple[str, str, str], TextObjectRelation] = {}
+    for index, entry in enumerate(raw_relations):
+        if not isinstance(entry, dict):
+            raise ValueError(
+                f"text_relations.object_relations[{index}] must be an object."
+            )
+        subject = _parse_asset_name(entry.get("subject"), asset_names, index)
+        relation = str(entry.get("relation") or "").strip()
+        target = _parse_asset_name(entry.get("object"), asset_names, index)
+        evidence = str(entry.get("evidence") or "").strip()
+        if relation not in RELATION_VALUES:
+            raise ValueError(
+                f"text_relations.object_relations[{index}].relation is invalid."
+            )
+        if not evidence:
+            raise ValueError(
+                f"text_relations.object_relations[{index}].evidence is required."
+            )
+        key = (subject, relation, target)
+        if key not in parsed:
+            parsed[key] = TextObjectRelation(
+                subject=subject,
+                relation=relation,
+                object=target,
+                evidence=evidence,
+            )
+    return list(parsed.values())
+
+
+def _parse_table_constraints(
+    raw_constraints: Any,
+    *,
+    asset_names: set[str],
+) -> list[TextTableConstraint]:
+    """Validate and de-duplicate the ``table_constraints`` section.
+
+    Raises ``ValueError`` on malformed input; repeated pairs keep the first.
+    """
+    if not isinstance(raw_constraints, list):
+        raise ValueError("text_relations.table_constraints must be a list.")
+    parsed: dict[tuple[str, str], TextTableConstraint] = {}
+    for index, entry in enumerate(raw_constraints):
+        if not isinstance(entry, dict):
+            raise ValueError(
+                f"text_relations.table_constraints[{index}] must be an object."
+            )
+        asset = _parse_asset_name(entry.get("asset"), asset_names, index)
+        grid = str(entry.get("grid") or "").strip()
+        evidence = str(entry.get("evidence") or "").strip()
+        if grid not in GRID_VALUES:
+            raise ValueError(
+                f"text_relations.table_constraints[{index}].grid is invalid."
+            )
+        if not evidence:
+            raise ValueError(
+                f"text_relations.table_constraints[{index}].evidence is required."
+            )
+        parsed.setdefault(
+            (asset, grid),
+            TextTableConstraint(asset=asset, grid=grid, evidence=evidence),
+        )
+    return list(parsed.values())
+
+
+def _parse_object_layouts(
+    raw_layouts: Any,
+    *,
+    asset_names: set[str],
+) -> list[TextObjectLayout]:
+    """Validate and de-duplicate the ``object_layouts`` section.
+
+    Raises ``ValueError`` on malformed input; the first entry per asset wins.
+    """
+    if not isinstance(raw_layouts, list):
+        raise ValueError("text_relations.object_layouts must be a list.")
+    parsed: dict[str, TextObjectLayout] = {}
+    for index, entry in enumerate(raw_layouts):
+        if not isinstance(entry, dict):
+            raise ValueError(
+                f"text_relations.object_layouts[{index}] must be an object."
+            )
+        asset = _parse_asset_name(entry.get("asset"), asset_names, index)
+        flag = entry.get("is_arbitrary_layout")
+        reason = str(entry.get("reason") or "").strip()
+        # The flag must be a real bool: 0/1 or the string "true" are rejected.
+        if not isinstance(flag, bool):
+            raise ValueError(
+                "text_relations.object_layouts"
+                f"[{index}].is_arbitrary_layout must be boolean."
+            )
+        if not reason:
+            raise ValueError(
+                f"text_relations.object_layouts[{index}].reason is required."
+            )
+        if asset not in parsed:
+            parsed[asset] = TextObjectLayout(
+                asset=asset,
+                is_arbitrary_layout=flag,
+                reason=reason,
+            )
+    return list(parsed.values())
+
+
+def _parse_asset_name(raw_name: Any, asset_names: set[str], index: int) -> str:
+    """Normalize ``raw_name`` and require it to be one of ``asset_names``."""
+    name = normalize_asset_name(str(raw_name or ""))
+    if name in asset_names:
+        return name
+    detail = f"references unknown scene asset: {name!r}."
+    raise ValueError(f"text_relations item {index} {detail}")
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/__init__.py
new file mode 100644
index 00000000..015c4151
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/__init__.py
@@ -0,0 +1,19 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/graph.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/graph.py
new file mode 100644
index 00000000..7431f0c0
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/graph.py
@@ -0,0 +1,97 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from langgraph.graph import END, StateGraph
+
+from embodichain.gen_sim.prompt2scene.utils import log
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+    format_result_missing_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+    ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.schema import (
+    TextRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.schema import (
+    UnifiedSceneSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.nodes import (
+    build_unified_scene_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.state import (
+    UnifiedSceneState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+
+__all__ = ["build_unified_scene_graph", "run_unified_scene"]
+
+
+def build_unified_scene_graph() -> Any:
+    """Compile the one-node graph that runs ``build_unified_scene_node``."""
+    workflow = StateGraph(UnifiedSceneState)
+    workflow.add_node("build_unified_scene", build_unified_scene_node)
+    workflow.set_entry_point("build_unified_scene")
+    workflow.add_edge("build_unified_scene", END)
+    return workflow.compile()
+
+
+def run_unified_scene(
+    request: Prompt2SceneInput,
+    *,
+    scene_intake: SceneIntakeSpec,
+    image_relations: ImageRelationSpec | None = None,
+    text_relations: TextRelationSpec | None = None,
+    output_root: Path,
+) -> UnifiedSceneSpec:
+    """Assemble the unified scene for one request and return it.
+
+    Logs a warning and raises ``RuntimeError`` if the graph yields no scene.
+    """
+    initial_state = {
+        "request": request,
+        "scene_intake": scene_intake,
+        "output_root": output_root,
+        "image_relations": image_relations,
+        "text_relations": text_relations,
+        "unified_scene": None,
+        "attempt_count": 0,
+        "max_attempts": 1,
+        "last_error": None,
+        "errors": [],
+    }
+    final_state = build_unified_scene_graph().invoke(initial_state)
+
+    unified_scene = final_state.get("unified_scene")
+    if unified_scene is None:
+        message = format_result_missing_error(
+            "Unified scene",
+            "UnifiedSceneSpec",
+            attempt_count=final_state.get("attempt_count", 0),
+            last_error=final_state.get("last_error"),
+            errors=final_state.get("errors", []),
+        )
+        log.log_warning(message)
+        raise RuntimeError(message)
+    return unified_scene
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/nodes.py
new file mode 100644
index 00000000..5d65a737
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/nodes.py
@@ -0,0 +1,57 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    UNIFIED_SCENE_STEP,
+    WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.state import (
+    UnifiedSceneState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.utils import (
+    build_unified_scene_from_image_relations,
+    build_unified_scene_from_text_relations,
+)
+
+__all__ = ["build_unified_scene_node"]
+
+
+def build_unified_scene_node(state: UnifiedSceneState) -> dict[str, object]:
+    """Build the unified scene, write its manifest, and return it as state."""
+    scene_intake = state["scene_intake"]
+    image_relations = state.get("image_relations")
+    text_relations = state.get("text_relations")
+    # Image relations win whenever they are present with status "ok".
+    use_image = image_relations is not None and image_relations.status == "ok"
+
+    if not use_image and text_relations is None:
+        raise ValueError("Unified scene requires image_relations or text_relations.")
+    if use_image:
+        unified_scene = build_unified_scene_from_image_relations(
+            scene_intake=scene_intake,
+            image_relations=image_relations,
+        )
+    else:
+        unified_scene = build_unified_scene_from_text_relations(
+            scene_intake=scene_intake,
+            text_relations=text_relations,
+        )
+
+    writer = WorkflowArtifactWriter(state["output_root"], UNIFIED_SCENE_STEP)
+    writer.write_step_result(unified_scene.to_manifest())
+    return {"unified_scene": unified_scene}
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/schema.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/schema.py
new file mode 100644
index 00000000..baca2beb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/schema.py
@@ -0,0 +1,161 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+__all__ = [
+    "UnifiedObject",
+    "UnifiedSceneSpec",
+    "UnifiedSpatial",
+    "UnifiedSpatialAnchor",
+    "UnifiedSpatialRelation",
+    "UnifiedTable",
+]
+
+
+@dataclass(frozen=True)
+class UnifiedTable:
+    """Table (support surface) record of the unified scene."""
+
+    id: str
+    name: str
+    description: str
+    complete_table_description: str
+    is_complete_visible_table: bool
+    class_candidate: list[str]
+    image_path: str | None = None
+    mesh_path: str | None = None
+    grid_cells: dict[str, list[str]] | None = None
+    object_coverage_percent: int | None = None
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the table as a dict; coverage is included only when set."""
+        coverage = self.object_coverage_percent
+        optional = {} if coverage is None else {"object_coverage_percent": coverage}
+        return {
+            "id": self.id,
+            "name": self.name,
+            "description": self.description,
+            "complete_table_description": self.complete_table_description,
+            "is_complete_visible_table": self.is_complete_visible_table,
+            "class_candidate": list(self.class_candidate),
+            "image_path": self.image_path,
+            "mesh_path": self.mesh_path,
+            "grid_cells": self.grid_cells,
+            **optional,
+        }
+
+
+@dataclass(frozen=True)
+class UnifiedObject:
+    """One object instance of the unified scene."""
+
+    id: str
+    name: str
+    description: str
+    class_candidate: list[str]
+    grid: str | None = None
+    is_arbitrary_layout: bool = False
+    layout_reason: str = ""
+    image_path: str | None = None
+    mesh_path: str | None = None
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the object's fields as a dict with a copied candidate list."""
+        manifest: dict[str, Any] = {}
+        manifest["id"] = self.id
+        manifest["name"] = self.name
+        manifest["description"] = self.description
+        manifest["class_candidate"] = list(self.class_candidate)
+        manifest["grid"] = self.grid
+        manifest["is_arbitrary_layout"] = self.is_arbitrary_layout
+        manifest["layout_reason"] = self.layout_reason
+        manifest["image_path"] = self.image_path
+        manifest["mesh_path"] = self.mesh_path
+        return manifest
+
+
+@dataclass(frozen=True)
+class UnifiedSpatialAnchor:
+    """Anchor object and the grid cell it is pinned to."""
+
+    object_id: str
+    grid: str
+    reason: str = ""
+
+    def to_manifest(self) -> dict[str, str]:
+        """Return the anchor's three fields as a plain dict."""
+        return dict(
+            object_id=self.object_id,
+            grid=self.grid,
+            reason=self.reason,
+        )
+
+
+@dataclass(frozen=True)
+class UnifiedSpatialRelation:
+    """Pairwise spatial relation between two objects of the unified scene."""
+
+    subject: str
+    relation: str
+    object: str
+    source: str
+
+    def to_manifest(self) -> dict[str, str]:
+        """Return the relation as a plain dict.
+
+        Keys are ``subject``, ``relation``, ``object`` and ``source``, in
+        that order.
+        """
+        keys = ("subject", "relation", "object", "source")
+        return {key: getattr(self, key) for key in keys}
+
+
+@dataclass(frozen=True)
+class UnifiedSpatial:
+    """Optional anchor plus the pairwise relations of a scene."""
+
+    anchor: UnifiedSpatialAnchor | None = None
+    relations: list[UnifiedSpatialRelation] = field(default_factory=list)
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Serialise the optional anchor and every relation."""
+        # A missing anchor is emitted as None rather than omitted.
+        anchor = self.anchor.to_manifest() if self.anchor else None
+        relations = [item.to_manifest() for item in self.relations]
+        return {"anchor": anchor, "relations": relations}
+
+
+@dataclass(frozen=True)
+class UnifiedSceneSpec:
+    """Unified scene: input record, table, objects and spatial relations."""
+
+    input: dict[str, Any]
+    table: UnifiedTable
+    objects: list[UnifiedObject]
+    spatial: UnifiedSpatial
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Serialise the scene; ``input`` is shallow-copied."""
+        manifest: dict[str, Any] = {}
+        manifest["input"] = dict(self.input)
+        manifest["table"] = self.table.to_manifest()
+        manifest["objects"] = [item.to_manifest() for item in self.objects]
+        manifest["spatial"] = self.spatial.to_manifest()
+        return manifest
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/state.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/state.py
new file mode 100644
index 00000000..8152a6bf
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/state.py
@@ -0,0 +1,45 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+    ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.schema import (
+    TextRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.attempt_state import AttemptState
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+
+__all__ = ["UnifiedSceneState"]
+
+
+class UnifiedSceneState(AttemptState):
+    """LangGraph state carried through the unified-scene assembly graph."""
+
+    request: Prompt2SceneInput
+    scene_intake: SceneIntakeSpec
+    output_root: Path  # handed to WorkflowArtifactWriter by the build node
+    image_relations: ImageRelationSpec | None
+    text_relations: TextRelationSpec | None
+    unified_scene: Any | None  # a UnifiedSceneSpec once the build node has run
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/utils.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/utils.py
new file mode 100644
index 00000000..49e4a70c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/utils.py
@@ -0,0 +1,337 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+    ImageAnchor,
+    ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.spatial import (
+    assign_grids_from_anchor_and_orders,
+    derive_relations_from_orders,
+    transitive_relation_closure,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.schema import (
+    UnifiedObject,
+    UnifiedSceneSpec,
+    UnifiedSpatialAnchor,
+    UnifiedSpatialRelation,
+    UnifiedSpatial,
+    UnifiedTable,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+    SceneIntakeAsset,
+    SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.text_relations.schema import (
+    TextObjectLayout,
+    TextRelationSpec,
+)
+
+__all__ = [
+    "build_unified_object",
+    "build_unified_object_specs",
+    "build_unified_scene_from_image_relations",
+    "build_unified_scene_from_text_relations",
+    "build_unified_spatial_anchor",
+    "build_unified_table",
+    "grid_cells_from_objects",
+    "object_ids_by_name",
+    "relations_by_object_id",
+    "resolve_image_layout",
+    "resolve_text_layout",
+    "text_grids_by_object_id",
+]
+
+
+def build_unified_object_specs(
+    assets: list[SceneIntakeAsset],
+) -> list[dict[str, Any]]:
+    """Expand each asset into ``asset.count`` per-instance spec dicts."""
+    # Ids are "<asset.id>_<n>" with n counting from zero; each spec gets its
+    # own copy of the asset's class_candidate list.
+    return [
+        {
+            "id": f"{asset.id}_{copy_index}",
+            "name": asset.name,
+            "description": asset.description,
+            "class_candidate": list(asset.class_candidate),
+        }
+        for asset in assets
+        for copy_index in range(asset.count)
+    ]
+
+
+def object_ids_by_name(object_specs: list[dict[str, Any]]) -> dict[str, list[str]]:
+    """Map each object name to the ids of its instances, in input order."""
+    ids_by_name: dict[str, list[str]] = {}
+    for entry in object_specs:
+        ids_by_name.setdefault(str(entry["name"]), []).append(str(entry["id"]))
+    return ids_by_name
+
+
+def build_unified_table(
+    scene_intake: SceneIntakeSpec,
+    *,
+    grid_cells: dict[str, list[str]] | None = None,
+) -> dict[str, Any]:
+    """Build the unified table record from the scene-intake table."""
+    source = scene_intake.table
+    # image_path and mesh_path are always emitted as None here.
+    record: dict[str, Any] = {
+        "id": source.id,
+        "name": source.name,
+        "description": source.description,
+        "complete_table_description": source.complete_table_description,
+        "is_complete_visible_table": source.is_complete_visible_table,
+        "class_candidate": list(source.class_candidate),
+        "image_path": None,
+        "mesh_path": None,
+        "grid_cells": grid_cells,
+    }
+    # The coverage key is optional and omitted when the intake has no value.
+    coverage = source.object_coverage_percent
+    if coverage is not None:
+        record["object_coverage_percent"] = coverage
+    return record
+
+
+def build_unified_spatial_anchor(anchor: ImageAnchor | None) -> dict[str, Any] | None:
+    """Turn an image anchor into a unified anchor record; ``None`` stays ``None``."""
+    if anchor is None:
+        return None
+    record: dict[str, Any] = {}
+    record["object_id"] = anchor.asset_id
+    record["grid"] = anchor.grid
+    record["reason"] = anchor.reason
+    return record
+
+
+def build_unified_object(
+    *,
+    spec: dict[str, Any],
+    grid: str | None,
+    is_arbitrary_layout: bool,
+    layout_reason: str,
+) -> dict[str, Any]:
+    """Combine an expanded object spec with its grid and layout state."""
+    # image_path and mesh_path are always emitted as None here.
+    identity = {key: spec[key] for key in ("id", "name", "description")}
+    return {
+        **identity,
+        "class_candidate": list(spec["class_candidate"]),
+        "grid": grid,
+        "is_arbitrary_layout": is_arbitrary_layout,
+        "layout_reason": layout_reason,
+        "image_path": None,
+        "mesh_path": None,
+    }
+
+
+def resolve_image_layout(
+    asset_id: str,
+    layout_by_id: dict[str, Any],
+) -> tuple[bool, str]:
+    """Look up an asset's layout flag and reason, defaulting to ``(False, "")``."""
+    entry = layout_by_id.get(asset_id)
+    if entry is not None:
+        return bool(entry.is_arbitrary_layout), str(entry.reason)
+    return False, ""
+
+
+def resolve_text_layout(
+    name: str,
+    layout_by_name: dict[str, TextObjectLayout],
+) -> tuple[bool, str]:
+    """Look up a name's layout flag and reason, defaulting to ``(False, "")``."""
+    entry = layout_by_name.get(name)
+    if entry is not None:
+        return bool(entry.is_arbitrary_layout), str(entry.reason)
+    return False, ""
+
+
+def text_grids_by_object_id(
+    *,
+    text_relations: TextRelationSpec,
+    ids_by_name: dict[str, list[str]],
+) -> dict[str, str | None]:
+    """Map every object id to its text-stated grid, or ``None`` without one."""
+    all_ids = [object_id for ids in ids_by_name.values() for object_id in ids]
+    grid_by_id: dict[str, str | None] = dict.fromkeys(all_ids)
+    for item in text_relations.table_constraints:
+        grid_by_id.update((key, item.grid) for key in ids_by_name.get(item.asset, []))
+    return grid_by_id
+
+
+def grid_cells_from_objects(objects: list[dict[str, Any]]) -> dict[str, list[str]] | None:
+    """Group object ids by grid cell; return ``None`` when no object has a grid."""
+    cell_names = (
+        "center",
+        "front",
+        "back",
+        "left_center",
+        "right_center",
+        "left_front",
+        "right_front",
+        "left_back",
+        "right_back",
+    )
+    grid_cells: dict[str, list[str]] = {cell: [] for cell in cell_names}
+    placed = False
+    for obj in objects:
+        grid = obj.get("grid")
+        if grid:
+            placed = True
+            grid_cells.setdefault(str(grid), []).append(str(obj["id"]))
+    return grid_cells if placed else None
+
+
+def relations_by_object_id(
+    *,
+    text_relations: TextRelationSpec,
+    ids_by_name: dict[str, list[str]],
+) -> list[dict[str, str]]:
+    """Expand name-level text relations into relations between object ids."""
+    expanded: list[dict[str, str]] = []
+    for relation in text_relations.object_relations:
+        subject_ids = ids_by_name.get(relation.subject, [])
+        object_ids = ids_by_name.get(relation.object, [])
+        for subject_id in subject_ids:
+            # An instance is never related to itself.
+            expanded.extend(
+                {
+                    "subject": subject_id,
+                    "relation": relation.relation,
+                    "object": object_id,
+                    "source": "input",
+                }
+                for object_id in object_ids
+                if object_id != subject_id
+            )
+    return expanded
+
+
+def build_unified_scene_from_image_relations(
+    *,
+    scene_intake: SceneIntakeSpec,
+    image_relations: ImageRelationSpec,
+) -> UnifiedSceneSpec:
+    """Assemble a ``UnifiedSceneSpec`` from image-derived relations.
+
+    Only the anchor object receives a grid here; every other object keeps
+    ``grid=None``. Pairwise relations come from ``derive_relations_from_orders``
+    applied to the image x/y orders.
+
+    Raises:
+        ValueError: If ``image_relations`` carries no anchor.
+    """
+    object_specs = build_unified_object_specs(scene_intake.assets)
+    anchor = build_unified_spatial_anchor(image_relations.anchor)
+    if anchor is None:
+        raise ValueError("Image unified scene requires an anchor.")
+    layout_by_id = {item.asset_id: item for item in image_relations.asset_layouts}
+
+    objects = []
+    for spec in object_specs:
+        arbitrary, reason = resolve_image_layout(spec["id"], layout_by_id)
+        is_anchor = spec["id"] == anchor["object_id"]
+        record = build_unified_object(
+            spec=spec,
+            grid=anchor["grid"] if is_anchor else None,
+            is_arbitrary_layout=arbitrary,
+            layout_reason=reason,
+        )
+        objects.append(UnifiedObject(**record))
+
+    derived = derive_relations_from_orders(
+        x_order=image_relations.x_order,
+        y_order=image_relations.y_order,
+    )
+    relations = [UnifiedSpatialRelation(**item) for item in derived]
+    scene_input = scene_intake.input.to_manifest()
+    grid_cells = grid_cells_from_objects([obj.to_manifest() for obj in objects])
+    table = UnifiedTable(
+        **build_unified_table(scene_intake, grid_cells=grid_cells)
+    )
+    spatial = UnifiedSpatial(
+        anchor=UnifiedSpatialAnchor(**anchor),
+        relations=relations,
+    )
+    return UnifiedSceneSpec(
+        input=scene_input,
+        table=table,
+        objects=objects,
+        spatial=spatial,
+    )
+
+
+def build_unified_scene_from_text_relations(
+    *,
+    scene_intake: SceneIntakeSpec,
+    text_relations: TextRelationSpec,
+) -> UnifiedSceneSpec:
+    """Assemble a ``UnifiedSceneSpec`` from text-derived relations.
+
+    Grids and layout flags are looked up per asset name, so all instances of
+    an asset share them. The resulting scene has no spatial anchor; name-level
+    relations are expanded to object ids and then passed through
+    ``transitive_relation_closure``.
+    """
+    object_specs = build_unified_object_specs(scene_intake.assets)
+    ids_by_name = object_ids_by_name(object_specs)
+    grid_by_id = text_grids_by_object_id(
+        text_relations=text_relations,
+        ids_by_name=ids_by_name,
+    )
+    # Layout entries are keyed by asset name, not by instance id.
+    layout_by_name = {item.asset: item for item in text_relations.object_layouts}
+
+    objects = []
+    for spec in object_specs:
+        arbitrary, reason = resolve_text_layout(spec["name"], layout_by_name)
+        record = build_unified_object(
+            spec=spec,
+            grid=grid_by_id.get(spec["id"]),
+            is_arbitrary_layout=arbitrary,
+            layout_reason=reason,
+        )
+        objects.append(UnifiedObject(**record))
+
+    # Expand name-level relations to ids before running the closure helper.
+    id_relations = relations_by_object_id(
+        text_relations=text_relations,
+        ids_by_name=ids_by_name,
+    )
+    closed = transitive_relation_closure(id_relations)
+    relations = [UnifiedSpatialRelation(**item) for item in closed]
+
+    scene_input = scene_intake.input.to_manifest()
+    grid_cells = grid_cells_from_objects([obj.to_manifest() for obj in objects])
+    table = UnifiedTable(
+        **build_unified_table(scene_intake, grid_cells=grid_cells)
+    )
+    # Text-only scenes carry no anchor.
+    spatial = UnifiedSpatial(anchor=None, relations=relations)
+    return UnifiedSceneSpec(
+        input=scene_input,
+        table=table,
+        objects=objects,
+        spatial=spatial,
+    )
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/__init__.py
new file mode 100644
index 00000000..ac849443
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/__init__.py
@@ -0,0 +1,27 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.graph import (
+    build_unified_scene_gen_graph,
+    run_unified_scene_gen,
+)
+
+__all__ = [
+    "build_unified_scene_gen_graph",
+    "run_unified_scene_gen",
+]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/graph.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/graph.py
new file mode 100644
index 00000000..5d542b39
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/graph.py
@@ -0,0 +1,106 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from langgraph.graph import END, StateGraph
+
+from embodichain.gen_sim.prompt2scene.llms import build_chat_model
+from embodichain.gen_sim.prompt2scene.llms.config import OpenAICompatibleLLMCfg
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.nodes import (
+    fit_image_table_to_clutter_node,
+    fit_text_table_to_clutter_node,
+    generate_image_assets_node,
+    generate_text_assets_node,
+    generate_text_clutter_layout_node,
+    load_unified_scene_input_kind_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.state import (
+    UnifiedSceneGenState,
+)
+__all__ = [
+    "build_unified_scene_gen_graph",
+    "route_after_load_input_kind",
+    "run_unified_scene_gen",
+]
+
+
+def route_after_load_input_kind(state: UnifiedSceneGenState) -> str:
+    """Map ``state["input_kind"]`` to its asset-generation node name."""
+    kind = state["input_kind"]
+    routes = (("text", "generate_text_assets"), ("image", "generate_image_assets"))
+    for supported_kind, node_name in routes:
+        if kind == supported_kind:
+            return node_name
+    raise ValueError(f"Unsupported unified-scene input_kind: {kind!r}.")
+
+
+def build_unified_scene_gen_graph() -> Any:
+    """Assemble and compile the unified-scene generation graph."""
+    entry = "load_unified_scene_input_kind"
+    text_entry, image_entry = "generate_text_assets", "generate_image_assets"
+    builder = StateGraph(UnifiedSceneGenState)
+    for node_name, node_fn in (
+        (entry, load_unified_scene_input_kind_node),
+        (text_entry, generate_text_assets_node),
+        ("generate_text_clutter_layout", generate_text_clutter_layout_node),
+        ("fit_text_table_to_clutter", fit_text_table_to_clutter_node),
+        (image_entry, generate_image_assets_node),
+        ("fit_image_table_to_clutter", fit_image_table_to_clutter_node),
+    ):
+        builder.add_node(node_name, node_fn)
+    builder.set_entry_point(entry)
+    routes = {text_entry: text_entry, image_entry: image_entry}
+    builder.add_conditional_edges(entry, route_after_load_input_kind, routes)
+    # Text branch: assets -> clutter layout -> table fit.
+    builder.add_edge(text_entry, "generate_text_clutter_layout")
+    builder.add_edge("generate_text_clutter_layout", "fit_text_table_to_clutter")
+    builder.add_edge("fit_text_table_to_clutter", END)
+    # Image branch: assets -> table fit.
+    builder.add_edge(image_entry, "fit_image_table_to_clutter")
+    builder.add_edge("fit_image_table_to_clutter", END)
+    return builder.compile()
+
+
+def run_unified_scene_gen(
+    output_root: Path,
+    *,
+    unified_scene_result_path: Path | None = None,
+    llm_cfg: OpenAICompatibleLLMCfg | None = None,
+) -> UnifiedSceneGenState:
+    """Run the generation graph from a fresh state (LLM built only if configured)."""
+    seed_state: UnifiedSceneGenState = {
+        "output_root": output_root,
+        "unified_scene_result_path": unified_scene_result_path,
+        "llm": None if llm_cfg is None else build_chat_model(llm_cfg),
+        "unified_scene": None,
+        "input_kind": None,
+        "table_result": None,
+        "text_object_results": [],
+        "text_clutter_settle_result": None,
+        "image_objects_layout_result": None,
+        "table_fit_result": None,
+        "generation_status": None,
+        "attempt_count": 0,
+        "max_attempts": 1,
+        "last_error": None,
+        "errors": [],
+    }
+    workflow = build_unified_scene_gen_graph()
+    return workflow.invoke(seed_state)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/nodes.py
new file mode 100644
index 00000000..e12e41f1
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/nodes.py
@@ -0,0 +1,392 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.state import (
+    UnifiedSceneGenState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    UNIFIED_SCENE_GEN_STEP,
+    UNIFIED_SCENE_STEP,
+    WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.text_asset_generation import (
+    generate_text_object_assets,
+    generate_text_table_asset,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.text_scene_metric_scale import (
+    estimate_text_scene_metric_scale,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.text_clutter_layout import (
+    generate_text_clutter_layout,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.table_fit_scene import (
+    fit_image_scene_table,
+    fit_text_scene_table,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.image_scene_asset_generation import (
+    generate_image_scene_assets,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.paths import (
+    UnifiedScenePaths,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.prompts import (
+    build_text_metric_scale_messages,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.schema import (
+    IMAGE_METRIC_SCALE_JSON_SCHEMA,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.scene_update import (
+    update_unified_scene,
+)
+
+__all__ = [
+    "fit_image_table_to_clutter_node",
+    "fit_text_table_to_clutter_node",
+    "generate_image_assets_node",
+    "generate_text_assets_node",
+    "generate_text_clutter_layout_node",
+    "load_unified_scene_input_kind_node",
+]
+
+
+def load_unified_scene_input_kind_node(
+    state: UnifiedSceneGenState,
+) -> dict[str, object]:
+    """Read the unified-scene JSON and extract its ``input.input_kind``.
+
+    Raises ``FileNotFoundError`` for a missing file, ``ValueError`` for bad data.
+    """
+    scene_paths = UnifiedScenePaths(state["output_root"])
+    scene_file = scene_paths.resolve_scene_result(state["unified_scene_result_path"])
+    if not scene_file.is_file():
+        raise FileNotFoundError(f"Unified scene result not found: {scene_file}")
+
+    payload = json.loads(scene_file.read_text(encoding="utf-8"))
+    if not isinstance(payload, dict):
+        raise ValueError("Unified scene result must be a JSON object.")
+    source = payload.get("input")
+    if not isinstance(source, dict):
+        raise ValueError("Unified scene result requires input object.")
+    # Missing or falsy kinds collapse to "" and are rejected below.
+    kind = str(source.get("input_kind") or "").strip()
+    if kind != "text" and kind != "image":
+        raise ValueError(
+            "Unified scene input.input_kind must be 'text' or 'image', "
+            f"got {kind!r}."
+        )
+    return {
+        "unified_scene_result_path": scene_file,
+        "unified_scene": payload,
+        "input_kind": kind,
+    }
+
+
+def generate_text_assets_node(
+    state: UnifiedSceneGenState,
+) -> dict[str, object]:
+    """Generate images, RGBA cutouts, geometry, and sim-ready GLBs for a
+    text-origin unified scene, then write the updated scene and step result.
+    """
+    unified_scene = state["unified_scene"]
+    if unified_scene is None:
+        return {"generation_status": "no_unified_scene"}  # nothing to generate from
+
+    paths = UnifiedScenePaths(state["output_root"])
+    output_root = paths.output_root
+    image_gen_dir, glb_gen_dir, debug_dir = paths.prepare_generation_dirs()
+    log_info(
+        "generate_text_assets started "
+        f"output_dir={output_root / UNIFIED_SCENE_GEN_STEP}"
+    )
+
+    table_spec = unified_scene.get("table") or {}
+    table_result = generate_text_table_asset(
+        table_spec=table_spec,
+        image_gen_dir=image_gen_dir,
+        glb_gen_dir=glb_gen_dir,
+        debug_dir=debug_dir,
+    )
+
+    object_specs = unified_scene.get("objects") or []
+    object_results = generate_text_object_assets(
+        object_specs=object_specs,
+        image_gen_dir=image_gen_dir,
+        glb_gen_dir=glb_gen_dir,
+        debug_dir=debug_dir,
+    )
+    metric_prompt_objects = [  # id/name/description only, for the scale prompt
+        {
+            "object_id": str(obj.get("id", "")),
+            "object_name": str(obj.get("name", "")),
+            "object_description": str(obj.get("description", "")),
+        }
+        for obj in object_results
+    ]
+    user_text = str((unified_scene.get("input") or {}).get("text") or "")
+    text_metric_scale_result = estimate_text_scene_metric_scale(
+        object_results=object_results,  # NOTE(review): mutated in place? confirm
+        user_text=user_text,
+        messages=build_text_metric_scale_messages(
+            user_text=user_text,
+            objects_json=metric_prompt_objects,
+        ),
+        schema=IMAGE_METRIC_SCALE_JSON_SCHEMA,  # NOTE(review): image schema reused?
+        output_dir=glb_gen_dir / "metric_scale",
+        output_root=output_root,
+        llm=state.get("llm"),
+        step_name=UNIFIED_SCENE_STEP,  # NOTE(review): not UNIFIED_SCENE_GEN_STEP?
+    )
+
+    result_path = paths.resolve_scene_result(state["unified_scene_result_path"])
+    update_unified_scene(unified_scene, table_result, object_results, output_root)
+    write_json(result_path, unified_scene)  # scene now carries the asset references
+    WorkflowArtifactWriter(output_root, UNIFIED_SCENE_GEN_STEP).write_step_result(
+        {
+            "table": table_result,
+            "objects": object_results,
+            "text_metric_scale": text_metric_scale_result,
+            "generation_status": "ok",
+        }
+    )
+    log_info(
+        "generate_text_assets completed "
+        f"table_status={table_result.get('status')} "
+        f"object_count={len(object_results)}"
+    )
+
+    return {
+        "unified_scene": unified_scene,
+        "table_result": table_result,
+        "text_object_results": object_results,
+        "generation_status": "ok",
+    }
+
+
+def generate_image_assets_node(state: UnifiedSceneGenState) -> dict[str, object]:
+    """Generate table assets and layout-aware object GLBs for image input.
+
+    Table/support and objects are generated in one multi-object call from the
+    original image and existing segmentation masks.
+    """
+    unified_scene = state["unified_scene"]
+    if unified_scene is None:
+        return {"generation_status": "no_unified_scene"}  # nothing to generate from
+
+    paths = UnifiedScenePaths(state["output_root"])
+    output_root = paths.output_root
+    image_gen_dir, glb_gen_dir, debug_dir = paths.prepare_generation_dirs()
+    log_info(
+        "generate_image_assets started "
+        f"output_dir={output_root / UNIFIED_SCENE_GEN_STEP}"
+    )
+
+    segments_path = paths.image_segments_result
+    if not segments_path.is_file():
+        raise FileNotFoundError(
+            f"Image segments result not found: {segments_path}"
+        )
+    with segments_path.open("r", encoding="utf-8") as _f:
+        segments_data = json.load(_f)
+    if not isinstance(segments_data, dict):
+        raise ValueError("Image segments result must be a JSON object.")
+
+    table_spec = unified_scene.get("table") or {}
+    # Image input uses the segmented table/support mask in the multi-object
+    # SAM3D call below. Text table generation belongs to the text branch.
+    object_specs = unified_scene.get("objects") or []
+    object_layout_result = generate_image_scene_assets(
+        object_specs=object_specs,
+        table_spec=table_spec,
+        spatial_relations=(unified_scene.get("spatial") or {}).get("relations", []),
+        segments_data=segments_data,
+        image_gen_dir=image_gen_dir,
+        glb_gen_dir=glb_gen_dir,
+        debug_dir=debug_dir,
+        output_root=output_root,
+        llm=state.get("llm"),
+    )
+    table_result = object_layout_result.get("table") or {  # placeholder if no table
+        "id": str(table_spec.get("id", "table")),
+        "name": str(table_spec.get("name", "table")),
+        "status": "missing_table_generation",
+    }
+    object_results = object_layout_result.get("objects") or []
+    generation_status = str(object_layout_result.get("status", "failed"))
+    if table_result.get("status") != "ok":  # a non-ok table overrides the status
+        generation_status = str(table_result.get("status") or generation_status)
+    result_path = paths.resolve_scene_result(state["unified_scene_result_path"])
+    update_unified_scene(unified_scene, table_result, object_results, output_root)
+    write_json(result_path, unified_scene)  # scene now carries the asset references
+    WorkflowArtifactWriter(output_root, UNIFIED_SCENE_GEN_STEP).write_step_result(
+        {
+            "table": table_result,
+            "objects_layout": object_layout_result,
+            "objects": object_results,
+            "table_fit_to_clutter": None,  # set by fit_image_table_to_clutter_node
+            "generation_status": generation_status,
+        }
+    )
+    log_info(f"generate_image_assets completed status={generation_status}")
+
+    return {
+        "unified_scene": unified_scene,
+        "table_result": table_result,
+        "text_object_results": object_results,  # image objects share this state key
+        "image_objects_layout_result": object_layout_result,
+        "generation_status": generation_status,
+    }
+
+
+def fit_image_table_to_clutter_node(state: UnifiedSceneGenState) -> dict[str, object]:
+    """Resize the table to fit the image-branch object clutter and record it.
+
+    Returns an empty update unless ``state["input_kind"]`` is ``"image"``.
+    """
+    if state.get("input_kind") != "image":
+        return {}
+    scene_paths = UnifiedScenePaths(state["output_root"])
+    root = scene_paths.output_root
+    fit_dir = scene_paths.table_fit_dir
+    fit_dir.mkdir(parents=True, exist_ok=True)
+    log_info(f"fit_image_table_to_clutter started output_dir={fit_dir}")
+    # Shallow-copy so the fit result is attached without mutating state's dict.
+    layout = dict(state.get("image_objects_layout_result") or {})
+    fit_result = fit_image_scene_table(
+        layout_result=layout,
+        fallback_table_result=state.get("table_result"),
+        output_root=root,
+        output_dir=fit_dir,
+    )
+    layout["table_fit_to_clutter"] = fit_result
+    step_writer = WorkflowArtifactWriter(root, UNIFIED_SCENE_GEN_STEP)
+    step_writer.write_step_result(
+        {
+            "table": state.get("table_result"),
+            "objects_layout": layout,
+            "objects": state.get("text_object_results") or [],
+            "table_fit_to_clutter": fit_result,
+            "generation_status": state.get("generation_status"),
+        }
+    )
+    fit_status = fit_result.get("status")
+    log_info(f"fit_image_table_to_clutter completed status={fit_status}")
+    return {"image_objects_layout_result": layout, "table_fit_result": fit_result}
+
+
+def generate_text_clutter_layout_node(
+    state: UnifiedSceneGenState,
+) -> dict[str, object]:
+    """Run the clutter-layout tool on the generated text objects.
+
+    The tool is documented to scale objects to real-world size, gravity-settle
+    them, and centre them at the origin, yielding settled GLBs and 2D AABB
+    footprints for downstream layout optimisation and table fitting.
+
+    Returns an empty update unless ``state["input_kind"]`` is ``"text"``.
+    """
+    if state.get("input_kind") != "text":
+        return {}
+
+    scene_paths = UnifiedScenePaths(state["output_root"])
+    root = scene_paths.output_root
+    clutter_dir = scene_paths.text_clutter_dir
+    # Directory creation and the start log precede the empty-input check.
+    clutter_dir.mkdir(parents=True, exist_ok=True)
+    log_info(f"generate_text_clutter_layout started output_dir={clutter_dir}")
+
+    generated_objects = state.get("text_object_results") or []
+    if not generated_objects:
+        # Nothing to lay out: report the skip without calling the tool.
+        skipped = {"status": "skipped", "reason": "no_text_objects"}
+        return {"text_clutter_settle_result": skipped}
+
+    # Relations and table constraints are both read from the "spatial" entry.
+    scene = state.get("unified_scene") or {}
+    spatial = scene.get("spatial") or {}
+    layout = generate_text_clutter_layout(
+        object_results=generated_objects,
+        spatial_relations=spatial.get("relations", []),
+        table_constraints=spatial.get("table_constraints", []),
+        output_dir=clutter_dir,
+        output_root=root,
+    )
+
+    # Record the layout alongside the table and object results already in state.
+    step_writer = WorkflowArtifactWriter(root, UNIFIED_SCENE_GEN_STEP)
+    step_writer.write_step_result(
+        {
+            "table": state.get("table_result"),
+            "objects": generated_objects,
+            "text_clutter_settle": layout,
+            "generation_status": state.get("generation_status"),
+        }
+    )
+
+    layout_status = layout.get("status")
+    log_info(f"generate_text_clutter_layout completed status={layout_status}")
+    return {"text_clutter_settle_result": layout}
+
+
+def fit_text_table_to_clutter_node(
+    state: UnifiedSceneGenState,
+) -> dict[str, object]:
+    """Resize the text-scene table to fit the settled clutter layout.
+
+    Returns an empty update for non-text input, and a ``"skipped"`` fit result
+    when either the table result or the clutter settle result is missing.
+    """
+    if state.get("input_kind") != "text":
+        return {}
+
+    scene_paths = UnifiedScenePaths(state["output_root"])
+    root = scene_paths.output_root
+    table = state.get("table_result")
+    clutter = state.get("text_clutter_settle_result")
+    if table is None or clutter is None:
+        # Both inputs are passed to the fit tool; skip when either is absent.
+        skipped = {"status": "skipped", "reason": "missing_table_or_settle_result"}
+        return {"table_fit_result": skipped}
+
+    fit_dir = scene_paths.table_fit_dir
+    fit_dir.mkdir(parents=True, exist_ok=True)
+    log_info(f"fit_text_table_to_clutter started output_dir={fit_dir}")
+    fit_result = fit_text_scene_table(
+        table_result=table,
+        clutter_layout_result=clutter,
+        output_root=root,
+        output_dir=fit_dir,
+    )
+
+    step_writer = WorkflowArtifactWriter(root, UNIFIED_SCENE_GEN_STEP)
+    step_writer.write_step_result(
+        {
+            "table": table,
+            "objects": state.get("text_object_results") or [],
+            "text_clutter_settle": clutter,
+            "table_fit_to_clutter": fit_result,
+            "generation_status": state.get("generation_status"),
+        }
+    )
+
+    fit_status = fit_result.get("status")
+    log_info(f"fit_text_table_to_clutter completed status={fit_status}")
+    return {"table_fit_result": fit_result}
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/paths.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/paths.py
new file mode 100644
index 00000000..c4af8054
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/paths.py
@@ -0,0 +1,102 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+    IMAGE_SEGMENTS_STEP,
+    STEP_RESULT_FILENAME,
+    UNIFIED_SCENE_GEN_STEP,
+    UNIFIED_SCENE_STEP,
+)
+
+__all__ = ["UnifiedScenePaths", "resolve_generated_path"]
+
+
+def resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Return ``value`` as a resolved path; relative values join ``output_root``."""
+    if not value:
+        return Path()  # falsy input maps to the empty relative path
+    candidate = Path(str(value)).expanduser()
+    if not candidate.is_absolute():
+        candidate = output_root.expanduser().resolve() / candidate
+    return candidate.resolve()
+
+
+@dataclass(frozen=True)
+class UnifiedScenePaths:
+    """Filesystem layout of the unified-scene generation workflow."""
+
+    output_root: Path
+
+    def __post_init__(self) -> None:
+        # Frozen dataclass: go through object.__setattr__ to store the resolved root.
+        resolved_root = self.output_root.expanduser().resolve()
+        object.__setattr__(self, "output_root", resolved_root)
+
+    @property
+    def workflow_root(self) -> Path:
+        return self.output_root / UNIFIED_SCENE_GEN_STEP
+
+    @property
+    def image_gen_dir(self) -> Path:
+        return self.workflow_root.joinpath("image_gen")
+
+    @property
+    def glb_gen_dir(self) -> Path:
+        return self.workflow_root.joinpath("glb_gen")
+
+    @property
+    def debug_dir(self) -> Path:
+        return self.workflow_root.joinpath("debug")
+
+    @property
+    def text_clutter_dir(self) -> Path:
+        return self.glb_gen_dir.joinpath("text_clutter_settled")
+
+    @property
+    def table_fit_dir(self) -> Path:
+        return self.glb_gen_dir.joinpath("table_fit_to_clutter")
+
+    @property
+    def image_segments_result(self) -> Path:
+        return self.output_root / IMAGE_SEGMENTS_STEP / STEP_RESULT_FILENAME
+
+    def prepare_generation_dirs(self) -> tuple[Path, Path, Path]:
+        """Create the image, GLB, and debug directories; return them in that order."""
+        created = (self.image_gen_dir, self.glb_gen_dir, self.debug_dir)
+        for folder in created:
+            folder.mkdir(parents=True, exist_ok=True)
+        return created
+
+    def resolve_scene_result(self, explicit_path: Path | None) -> Path:
+        """Locate the unified-scene result file.
+
+        An explicit path wins. Otherwise the step-result file is returned,
+        except that a legacy ``results.json`` is used when only it exists.
+        """
+        if explicit_path is not None:
+            return explicit_path.expanduser().resolve()
+        step_dir = self.output_root / UNIFIED_SCENE_STEP
+        current = step_dir / STEP_RESULT_FILENAME
+        legacy = step_dir / "results.json"
+        if not current.is_file() and legacy.is_file():
+            return legacy
+        return current
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/prompts.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/prompts.py
new file mode 100644
index 00000000..1543acfb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/prompts.py
@@ -0,0 +1,141 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.prompts import render_prompt
+from embodichain.gen_sim.prompt2scene.utils.io import image_to_data_url
+
+__all__ = [
+    "build_image_metric_scale_messages",
+    "build_text_metric_scale_messages",
+    "build_up_down_flip_check_messages",
+]
+
+UNIFIED_SCENE_GEN_PROMPT_NAME = "unified_scene_gen.yaml"
+
+
+def build_image_metric_scale_messages(
+    *,
+    bbox_name_image_path: Path,
+    objects_json: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Build the system and user messages for image metric-scale estimation.
+
+    The user message carries the rendered prompt text followed by the image,
+    which is embedded as a data URL.
+
+    Args:
+        bbox_name_image_path: Image attached to the user message.
+        objects_json: Object records serialised into the user prompt as JSON.
+    """
+    system_prompt = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        prompt_key="image_metric_scale_system",
+    )
+
+    # ensure_ascii=False leaves non-ASCII text unescaped in the prompt.
+    serialised_objects = json.dumps(objects_json, ensure_ascii=False, indent=2)
+    user_prompt = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        {"objects_json": serialised_objects},
+        prompt_key="image_metric_scale_user",
+    )
+
+    image_url = image_to_data_url(bbox_name_image_path)
+
+    system_message = {"role": "system", "content": system_prompt}
+    user_message = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": user_prompt},
+            {"type": "image_url", "image_url": {"url": image_url}},
+        ],
+    }
+    return [system_message, user_message]
+
+
+def build_text_metric_scale_messages(
+    *,
+    user_text: str,
+    objects_json: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Build the system and user messages for text metric-scale estimation.
+
+    Args:
+        user_text: Text passed to the user prompt template as ``user_text``.
+        objects_json: Object records serialised into the user prompt as JSON.
+    """
+    system_prompt = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        prompt_key="text_metric_scale_system",
+    )
+
+    user_variables = {
+        "user_text": user_text,
+        "objects_json": json.dumps(objects_json, ensure_ascii=False, indent=2),
+    }
+    user_prompt = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        user_variables,
+        prompt_key="text_metric_scale_user",
+    )
+
+    return [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
+
+
+def build_up_down_flip_check_messages(
+    *,
+    original_image_path: Path,
+    comparison_image_path: Path,
+) -> list[dict[str, Any]]:
+    """Build messages for the VLM support-normal up/down flip check.
+
+    The user message holds the rendered prompt text followed by the original
+    image and then the comparison image, each embedded as a data URL.
+
+    Args:
+        original_image_path: First image attached to the user message.
+        comparison_image_path: Second image attached to the user message.
+    """
+    system_prompt = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        prompt_key="up_down_flip_check_system",
+    )
+    user_prompt = render_prompt(
+        UNIFIED_SCENE_GEN_PROMPT_NAME,
+        prompt_key="up_down_flip_check_user",
+    )
+
+    user_parts: list[dict[str, Any]] = [{"type": "text", "text": user_prompt}]
+    # Preserve the image order: original first, comparison second.
+    for image_path in (original_image_path, comparison_image_path):
+        image_url = image_to_data_url(image_path)
+        user_parts.append(
+            {"type": "image_url", "image_url": {"url": image_url}}
+        )
+
+    return [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_parts},
+    ]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/scene_update.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/scene_update.py
new file mode 100644
index 00000000..2276e559
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/scene_update.py
@@ -0,0 +1,76 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.io import relative_path
+
+__all__ = ["update_unified_scene"]
+
+
+def update_unified_scene(
+    unified_scene: dict[str, Any],
+    table_result: dict[str, Any],
+    object_results: list[dict[str, Any]],
+    output_root: Path,
+) -> None:
+    """Write generated asset references back into ``unified_scene`` in place."""
+    table = unified_scene.get("table")
+    if table is None:  # absent or explicit null: start from an empty table
+        table = unified_scene["table"] = {}
+    entries = unified_scene.get("objects")
+    if entries is None:  # absent or explicit null: no objects to match
+        entries = unified_scene["objects"] = []
+    for key in (  # metadata is copied verbatim whenever the key is present
+        "table_asset_source",
+        "support_normal_source",
+        "is_complete_visible_table",
+        "complete_table_description",
+    ):
+        if key in table_result:
+            table[key] = table_result[key]
+    for key in (  # truthy paths go through ``relative_path`` with output_root
+        "image_path",
+        "raw_geometry_path",
+        "support_reference_geometry_path",
+        "generated_table_raw_geometry_path",
+        "transformed_geometry_path",
+        "simready_geometry_path",
+        "aligned_geometry_path",
+        "mesh_path",
+    ):
+        if table_result.get(key):
+            table[key] = relative_path(table_result[key], output_root)
+    objects_by_id = {
+        str(item.get("id", "")): item for item in entries if isinstance(item, dict)
+    }
+    for result in object_results:
+        target = objects_by_id.get(str(result.get("id", "")))
+        if target is None:
+            continue  # no scene object with this stringified id
+        for key in ("image_path", "mesh_path", "aligned_geometry_path"):
+            if result.get(key):
+                target[key] = relative_path(result[key], output_root)
+        metric_scale = result.get("metric_scale")
+        if isinstance(metric_scale, dict):
+            target["metric_scale"] = {
+                key: value
+                for key, value in metric_scale.items()
+                if key not in {"result_path", "raw_model_output_path"}
+            }
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/schema.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/schema.py
new file mode 100644
index 00000000..b22fcebb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/schema.py
@@ -0,0 +1,71 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+__all__ = [
+    "IMAGE_METRIC_SCALE_JSON_SCHEMA",
+    "UP_DOWN_FLIP_CHECK_JSON_SCHEMA",
+]
+
+UP_DOWN_FLIP_CHECK_JSON_SCHEMA: dict[str, Any] = {  # JSON Schema for an up/down flip-check result object
+    "title": "AlignedUpDownFlipCheckOutput",
+    "type": "object",
+    "additionalProperties": False,  # keys outside "properties" are rejected
+    "properties": {
+        "selected_number": {"type": "integer", "enum": [1, 2]},  # exactly 1 or 2; presumably the index of the chosen candidate - confirm with caller
+        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},  # inclusive range [0, 1]
+        "reason": {"type": "string"},  # free-form text; no length constraint declared
+    },
+    "required": ["selected_number", "confidence", "reason"],  # all three properties are mandatory
+}
+
+IMAGE_METRIC_SCALE_JSON_SCHEMA: dict[str, Any] = {  # JSON Schema for a batch of per-object size estimates
+    "title": "ImageMetricScaleEstimate",
+    "type": "object",
+    "additionalProperties": False,  # top level may only contain "object_scales"
+    "properties": {
+        "object_scales": {  # array of estimate records; no minItems/maxItems declared, so an empty list validates
+            "type": "array",
+            "items": {
+                "type": "object",
+                "additionalProperties": False,  # each record is limited to the four keys below
+                "properties": {
+                    "object_id": {"type": "string"},  # uniqueness across records is not enforced by this schema
+                    "bbox_dims_cm": {  # NOTE(review): unit presumably centimetres per the key name; axis order is not specified here
+                        "type": "array",
+                        "minItems": 3,
+                        "maxItems": 3,  # together with minItems: exactly three entries
+                        "items": {
+                            "type": "number",
+                            "minimum": 1.0e-6,  # inclusive lower bound, so zero and negative dimensions are rejected
+                        },
+                    },
+                    "confidence": {
+                        "type": "number",
+                        "minimum": 0.0,
+                        "maximum": 1.0,  # inclusive range [0, 1]
+                    },
+                    "reason": {"type": "string"},  # free-form text; no length constraint declared
+                },
+                "required": ["object_id", "bbox_dims_cm", "confidence", "reason"],  # every record field is mandatory
+            },
+        },
+    },
+    "required": ["object_scales"],  # the list itself must be present
+}
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/state.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/state.py
new file mode 100644
index 00000000..12283516
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/state.py
@@ -0,0 +1,40 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.attempt_state import AttemptState
+
+__all__ = ["UnifiedSceneGenState"]
+
+
+class UnifiedSceneGenState(AttemptState):
+    """LangGraph state for downstream unified-scene generation."""
+
+    output_root: Path  # no None alternative; presumably the base directory for generated outputs - confirm
+    unified_scene_result_path: Path | None  # may be None
+    llm: Any | None  # typed as Any, so the model handle's interface is not checked here
+    unified_scene: dict[str, Any] | None  # presumably the unified-scene payload with "table"/"objects" keys - verify against producer
+    input_kind: str | None  # NOTE(review): allowed values are not declared in this file
+    table_result: dict[str, Any] | None  # may be None
+    text_object_results: list[dict[str, Any]]  # no None alternative, unlike the other result fields
+    text_clutter_settle_result: dict[str, Any] | None  # may be None
+    image_objects_layout_result: dict[str, Any] | None  # may be None
+    table_fit_result: dict[str, Any] | None  # may be None
+    generation_status: str | None  # NOTE(review): allowed values are not declared in this file