From 3faa90aad902fafc945c4f84ec80bde814ee8616 Mon Sep 17 00:00:00 2001
From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com>
Date: Sun, 28 Jun 2026 15:34:51 +0800
Subject: [PATCH 1/3] fix franka fpath

---
 .../overview/sim/planners/neural_planner.md   | 16 +++-
 embodichain/data/assets/planner_assets.py     | 69 +++++++++++++++-
 examples/sim/planners/neural_planner.py       | 13 ++-
 tests/data/assets/test_planner_assets.py      | 81 +++++++++++++++++++
 4 files changed, 173 insertions(+), 6 deletions(-)
 create mode 100644 tests/data/assets/test_planner_assets.py

diff --git a/docs/source/overview/sim/planners/neural_planner.md b/docs/source/overview/sim/planners/neural_planner.md
index 632e2c48..344d2df1 100644
--- a/docs/source/overview/sim/planners/neural_planner.md
+++ b/docs/source/overview/sim/planners/neural_planner.md
@@ -15,6 +15,13 @@ trained APG checkpoint through `MotionGenerator` to reach Cartesian targets.
 
 Pre-trained checkpoints are hosted on HuggingFace and can be downloaded with
 `download_neural_planner_checkpoint()` (requires `HF_TOKEN` environment variable).
+If you already have `franka.pt`, pass it explicitly or place it at:
+
+```text
+~/.cache/embodichain_data/checkpoints/dexforce/neural_motion_generator/franka/franka.pt
+```
+
+You can also set `EMBODICHAIN_NEURAL_PLANNER_CHECKPOINT=/path/to/franka.pt`.
 
 ```python
 from embodichain.data.assets.planner_assets import download_neural_planner_checkpoint
@@ -60,4 +67,11 @@ result = motion_generator.generate(
 python examples/sim/planners/neural_planner.py --headless --device cuda
 ```
 
-The example downloads the checkpoint automatically on first run.
+Use a local checkpoint without HuggingFace:
+
+```bash
+python examples/sim/planners/neural_planner.py --headless --device cuda --checkpoint-path /path/to/franka.pt
+```
+
+The example downloads the checkpoint automatically on first run only when no
+local checkpoint is provided or found in the default cache path.
diff --git a/embodichain/data/assets/planner_assets.py b/embodichain/data/assets/planner_assets.py
index b362d809..816f339b 100644
--- a/embodichain/data/assets/planner_assets.py
+++ b/embodichain/data/assets/planner_assets.py
@@ -16,15 +16,60 @@
 from __future__ import annotations
 
 import os
+from pathlib import Path
 
 from huggingface_hub import hf_hub_download
 
+from embodichain.data.constants import EMBODICHAIN_DEFAULT_DATA_ROOT
+
 # HuggingFace endpoint. Mirrors (e.g. hf-mirror.com) often redirect to the
 # real hub without forwarding the required commit-hash response headers, so we
 # default to the canonical endpoint and rely on the system proxy when needed.
 _HF_ENDPOINT = "https://huggingface.co"
+_NEURAL_PLANNER_LOCAL_CHECKPOINT = Path(
+    "checkpoints/dexforce/neural_motion_generator/franka/franka.pt"
+)
+NEURAL_PLANNER_CHECKPOINT_ENV = "EMBODICHAIN_NEURAL_PLANNER_CHECKPOINT"
+
+__all__ = [
+    "NEURAL_PLANNER_CHECKPOINT_ENV",
+    "download_neural_planner_checkpoint",
+    "get_default_neural_planner_checkpoint_path",
+]
+
+
+def get_default_neural_planner_checkpoint_path(
+    data_root: str | os.PathLike[str] | None = None,
+) -> str:
+    """Return the default local NeuralPlanner checkpoint path."""
+    root = EMBODICHAIN_DEFAULT_DATA_ROOT if data_root is None else data_root
+    return str(Path(root).expanduser() / _NEURAL_PLANNER_LOCAL_CHECKPOINT)
+
+
+def _normalize_existing_checkpoint_path(
+    checkpoint_path: str | os.PathLike[str],
+) -> str:
+    path = os.path.abspath(os.path.expanduser(os.fspath(checkpoint_path)))
+    if not os.path.isfile(path):
+        raise FileNotFoundError(f"NeuralPlanner checkpoint not found: {path}")
+    return path
 
-__all__ = ["download_neural_planner_checkpoint"]
+
+def _resolve_local_neural_planner_checkpoint(
+    checkpoint_path: str | os.PathLike[str] | None,
+) -> str | None:
+    if checkpoint_path is not None:
+        return _normalize_existing_checkpoint_path(checkpoint_path)
+
+    env_checkpoint_path = os.environ.get(NEURAL_PLANNER_CHECKPOINT_ENV)
+    if env_checkpoint_path:
+        return _normalize_existing_checkpoint_path(env_checkpoint_path)
+
+    default_checkpoint_path = get_default_neural_planner_checkpoint_path()
+    if os.path.isfile(default_checkpoint_path):
+        return default_checkpoint_path
+
+    return None
 
 
 def download_neural_planner_checkpoint(
@@ -32,11 +77,21 @@ def download_neural_planner_checkpoint(
     filename: str = "franka/franka.pt",
     token: str | None = None,
     endpoint: str = _HF_ENDPOINT,
+    checkpoint_path: str | os.PathLike[str] | None = None,
 ) -> str:
-    """Download a neural planner checkpoint from HuggingFace.
+    """Resolve or download a neural planner checkpoint.
 
-    The repository is gated. Either set the ``HF_TOKEN`` environment variable or
-    run ``huggingface-cli login`` before calling this function.
+    Local checkpoint resolution is tried first, in this order:
+
+    1. The explicit ``checkpoint_path`` argument.
+    2. The ``EMBODICHAIN_NEURAL_PLANNER_CHECKPOINT`` environment variable.
+    3. ``~/.cache/embodichain_data/checkpoints/dexforce/neural_motion_generator/franka/franka.pt``
+       unless ``EMBODICHAIN_DATA_ROOT`` overrides the data root.
+
+    If no local checkpoint is found, the checkpoint is downloaded from
+    HuggingFace. The repository is gated. Either set the ``HF_TOKEN``
+    environment variable or run ``huggingface-cli login`` before calling this
+    function.
 
     If your network requires an HTTP proxy, set ``HTTPS_PROXY`` or
     ``https_proxy`` in the environment before launching Python.
@@ -50,6 +105,8 @@ def download_neural_planner_checkpoint(
         endpoint: HuggingFace-compatible endpoint URL. Defaults to
             ``https://huggingface.co``. Mirrors that merely redirect to the
             real hub are not supported.
+        checkpoint_path: Optional local checkpoint path. If provided, this path
+            must exist and HuggingFace is not used.
 
     Returns:
         str: Local path to the downloaded checkpoint file.
@@ -57,6 +114,10 @@ def download_neural_planner_checkpoint(
     Raises:
         RuntimeError: If the download fails, with authentication instructions.
     """
+    local_checkpoint_path = _resolve_local_neural_planner_checkpoint(checkpoint_path)
+    if local_checkpoint_path is not None:
+        return local_checkpoint_path
+
     # Normalize proxy env vars: the ``requests`` library on Linux requires the
     # lowercase form (``https_proxy``), but users typically export the uppercase
     # form (``HTTPS_PROXY``).
diff --git a/examples/sim/planners/neural_planner.py b/examples/sim/planners/neural_planner.py
index 2cf4b0bc..bc154d5f 100644
--- a/examples/sim/planners/neural_planner.py
+++ b/examples/sim/planners/neural_planner.py
@@ -56,6 +56,15 @@ def parse_args() -> argparse.Namespace:
         default=5,
         help="Number of EEF waypoints to send to the neural planner.",
     )
+    parser.add_argument(
+        "--checkpoint-path",
+        type=str,
+        default=None,
+        help=(
+            "Local NeuralPlanner checkpoint path. If omitted, local cache and "
+            "HuggingFace are used."
+        ),
+    )
     parser.add_argument(
         "--headless",
         action="store_true",
@@ -188,7 +197,9 @@ def play_trajectory(
 
 def main() -> None:
     args = parse_args()
-    checkpoint_path = download_neural_planner_checkpoint()
+    checkpoint_path = download_neural_planner_checkpoint(
+        checkpoint_path=args.checkpoint_path
+    )
 
     sim_device = _resolve_device(args.device)
     sim = SimulationManager(
diff --git a/tests/data/assets/test_planner_assets.py b/tests/data/assets/test_planner_assets.py
new file mode 100644
index 00000000..58de2d99
--- /dev/null
+++ b/tests/data/assets/test_planner_assets.py
@@ -0,0 +1,81 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+from __future__ import annotations
+
+import os
+
+import pytest
+
+from embodichain.data.assets import planner_assets
+
+
+def _raise_if_downloaded(*args, **kwargs):
+    raise AssertionError("hf_hub_download should not be called")
+
+
+def test_neural_planner_checkpoint_uses_explicit_local_path(tmp_path, monkeypatch):
+    checkpoint_path = tmp_path / "franka.pt"
+    checkpoint_path.write_bytes(b"checkpoint")
+    monkeypatch.setattr(planner_assets, "hf_hub_download", _raise_if_downloaded)
+
+    resolved_path = planner_assets.download_neural_planner_checkpoint(
+        checkpoint_path=checkpoint_path
+    )
+
+    assert resolved_path == os.path.abspath(str(checkpoint_path))
+
+
+def test_neural_planner_checkpoint_uses_env_local_path(tmp_path, monkeypatch):
+    checkpoint_path = tmp_path / "franka.pt"
+    checkpoint_path.write_bytes(b"checkpoint")
+    monkeypatch.setenv(
+        planner_assets.NEURAL_PLANNER_CHECKPOINT_ENV,
+        str(checkpoint_path),
+    )
+    monkeypatch.setattr(planner_assets, "hf_hub_download", _raise_if_downloaded)
+
+    resolved_path = planner_assets.download_neural_planner_checkpoint()
+
+    assert resolved_path == os.path.abspath(str(checkpoint_path))
+
+
+def test_neural_planner_checkpoint_uses_default_data_root(tmp_path, monkeypatch):
+    checkpoint_path = (
+        tmp_path
+        / "checkpoints"
+        / "dexforce"
+        / "neural_motion_generator"
+        / "franka"
+        / "franka.pt"
+    )
+    checkpoint_path.parent.mkdir(parents=True)
+    checkpoint_path.write_bytes(b"checkpoint")
+    monkeypatch.delenv(planner_assets.NEURAL_PLANNER_CHECKPOINT_ENV, raising=False)
+    monkeypatch.setattr(planner_assets, "EMBODICHAIN_DEFAULT_DATA_ROOT", str(tmp_path))
+    monkeypatch.setattr(planner_assets, "hf_hub_download", _raise_if_downloaded)
+
+    resolved_path = planner_assets.download_neural_planner_checkpoint()
+
+    assert resolved_path == str(checkpoint_path)
+
+
+def test_neural_planner_checkpoint_rejects_missing_explicit_path(tmp_path):
+    checkpoint_path = tmp_path / "missing.pt"
+
+    with pytest.raises(FileNotFoundError, match="NeuralPlanner checkpoint not found"):
+        planner_assets.download_neural_planner_checkpoint(
+            checkpoint_path=checkpoint_path
+        )

From 68d3838c8813005ebebd367dd4e17efcf76bdeb0 Mon Sep 17 00:00:00 2001
From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com>
Date: Sun, 28 Jun 2026 17:15:19 +0800
Subject: [PATCH 2/3] temp add benchmark and demo

---
 docs/source/overview/sim/planners/index.rst   |    2 +-
 .../overview/sim/planners/motion_generator.md |   44 +-
 .../overview/sim/planners/neural_planner.md   |   94 +-
 .../lab/sim/atomic_actions/trajectory.py      |  148 +-
 embodichain/lab/sim/planners/base_planner.py  |    4 +
 .../lab/sim/planners/motion_generator.py      |   33 +-
 .../lab/sim/planners/neural_planner.py        |  142 +-
 .../lab/sim/planners/toppra_planner.py        |    3 +
 embodichain/lab/sim/solvers/pytorch_solver.py |    7 +-
 pyproject.toml                                |    1 +
 scripts/benchmark/robotics/__init__.py        |   17 +
 scripts/benchmark/robotics/nmg/__init__.py    |   17 +
 .../robotics/nmg/franka_pick_place.py         | 3018 +++++++++++++++++
 .../benchmark/robotics/nmg/franka_planner.py  |  981 ++++++
 .../robotics/nmg/test_franka_pick_place.py    |  590 ++++
 .../robotics/nmg/test_franka_planner.py       |  189 ++
 tests/sim/atomic_actions/test_trajectory.py   |  190 +-
 tests/sim/planners/test_neural_planner.py     |  149 +-
 tests/sim/solvers/test_pytorch_solver.py      |   75 +-
 19 files changed, 5580 insertions(+), 124 deletions(-)
 create mode 100644 scripts/benchmark/robotics/__init__.py
 create mode 100644 scripts/benchmark/robotics/nmg/__init__.py
 create mode 100644 scripts/benchmark/robotics/nmg/franka_pick_place.py
 create mode 100644 scripts/benchmark/robotics/nmg/franka_planner.py
 create mode 100644 tests/benchmark/robotics/nmg/test_franka_pick_place.py
 create mode 100644 tests/benchmark/robotics/nmg/test_franka_planner.py

diff --git a/docs/source/overview/sim/planners/index.rst b/docs/source/overview/sim/planners/index.rst
index d1320be1..bf0ebdf7 100644
--- a/docs/source/overview/sim/planners/index.rst
+++ b/docs/source/overview/sim/planners/index.rst
@@ -21,7 +21,7 @@ The `embodichain` project provides a unified interface for robot trajectory plan
 
 - **MotionGenerator**: A unified trajectory planning interface that supports joint/Cartesian interpolation, automatic constraint handling, flexible planner selection, and is easily extensible for collision checking and additional planners.
 - **ToppraPlanner**: A time-optimal trajectory planner based on the TOPPRA library, supporting joint trajectory generation under velocity and acceleration constraints.
-- **NeuralPlanner** (experimental): A learning-based EEF waypoint planner for Franka Panda.
+- **NeuralPlanner**: An experimental neural motion generation backend for end-effector waypoint trajectories, backed by a trained waypoint-conditioned policy.
 - **TrajectorySampleMethod**: An enumeration for trajectory sampling strategies, supporting sampling by time, quantity, or distance.
 
 These tools can be used to generate smooth and dynamically feasible robot trajectories, and are extensible for future collision checking and various sampling requirements.
diff --git a/docs/source/overview/sim/planners/motion_generator.md b/docs/source/overview/sim/planners/motion_generator.md
index 4d8b53db..ffe97372 100644
--- a/docs/source/overview/sim/planners/motion_generator.md
+++ b/docs/source/overview/sim/planners/motion_generator.md
@@ -1,11 +1,11 @@
 # MotionGenerator
 
-`MotionGenerator` provides a unified interface for robot trajectory planning, supporting both joint space and Cartesian space interpolation. It is designed to work with different planners (such as ToppraPlanner) and can be extended to support collision checking in the future.
+`MotionGenerator` provides a unified interface for robot trajectory planning, supporting both joint space and Cartesian space interpolation. It selects a backend planner from `MotionGenCfg.planner_cfg`, such as `ToppraPlanner` for joint waypoint time-parameterization or `NeuralPlanner` for experimental end-effector waypoint motion generation.
 
 ## Features
 
 * **Unified planning interface**: Supports trajectory planning with or without collision checking (collision checking is reserved for future implementation).
-* **Flexible planner selection**: Supports TOPPRA and NeuralPlanner (experimental).
+* **Flexible planner selection**: Allows selection of different planners through `planner_cfg.planner_type`.
 * **Automatic constraint handling**: Retrieves velocity and acceleration limits from the robot or uses user-specified/default values.
 * **Supports both joint and Cartesian interpolation**: Generates discrete trajectories using either joint space or Cartesian space interpolation.
 * **Convenient sampling**: Supports various sampling strategies via `TrajectorySampleMethod`.
@@ -110,6 +110,43 @@ result = motion_gen.generate(
 )
 ```
 
+#### Neural EEF Waypoint Planning
+
+```python
+from embodichain.lab.sim.planners import (
+    MotionGenCfg,
+    MotionGenOptions,
+    MotionGenerator,
+    MoveType,
+    NeuralPlannerCfg,
+    PlanState,
+)
+
+motion_gen = MotionGenerator(
+    cfg=MotionGenCfg(
+        planner_cfg=NeuralPlannerCfg(
+            robot_uid="Franka",
+            checkpoint_path="/path/to/franka.pt",
+            control_part="main_arm",
+        )
+    )
+)
+
+target_states = [
+    PlanState(move_type=MoveType.EEF_MOVE, xpos=target_pose),
+]
+
+result = motion_gen.generate(
+    target_states=target_states,
+    options=MotionGenOptions(
+        control_part="main_arm",
+        start_qpos=start_qpos,
+    ),
+)
+```
+
+The neural backend only supports `EEF_MOVE` waypoint inputs and currently assumes a compatible 7-DoF checkpoint. It returns a policy rollout rather than a TOPPRA-constrained trajectory.
+
 #### Cartesian Space Planning
 
 ```python
@@ -178,8 +215,9 @@ print(f"Estimated sample count: {sample_count}")
 
 ## Notes
 
-* The planner type can be specified as a string or `PlannerType` enum.
+* The planner type is selected by `planner_cfg.planner_type`.
 * If the robot provides its own joint limits, those will be used; otherwise, default or user-specified limits are applied.
 * For Cartesian interpolation, inverse kinematics (IK) is used to compute joint configurations for each interpolated pose.
+* In atomic actions, the non-neural EEF path is IK plus joint interpolation. The neural planner is used only when the active planner type is `neural`.
 * The class is designed to be extensible for additional planners and collision checking in the future.
 * The sample count estimation is useful for predicting computational load and memory requirements.
diff --git a/docs/source/overview/sim/planners/neural_planner.md b/docs/source/overview/sim/planners/neural_planner.md
index 344d2df1..eef2d33e 100644
--- a/docs/source/overview/sim/planners/neural_planner.md
+++ b/docs/source/overview/sim/planners/neural_planner.md
@@ -3,18 +3,40 @@
 ````{admonition} Experimental
 :class: warning
 
-`NeuralPlanner` is an **experimental** feature. The API, checkpoint format,
-and default parameters may change without a deprecation cycle. It is currently
-only validated on the **Franka Panda** robot.
+`NeuralPlanner` is an experimental feature. The API, checkpoint format, and
+default parameters may change without a deprecation cycle. It is currently
+validated only on the Franka Panda checkpoint shipped for neural motion
+generation.
 ````
 
-`NeuralPlanner` is a learning-based EEF waypoint planner. It rolls out a
-trained APG checkpoint through `MotionGenerator` to reach Cartesian targets.
+`NeuralPlanner` is a learning-based motion generation backend for end-effector
+waypoint trajectories. It rolls out a trained waypoint-conditioned APG policy
+from the current arm joint state toward one or more target TCP poses.
 
-## Configuration
+Unlike `ToppraPlanner`, this planner is not a time-parameterization solver for
+joint waypoints. It is intended for `MoveType.EEF_MOVE` inputs and returns a
+policy rollout as joint positions plus the corresponding forward-kinematics
+poses.
+
+## Features
+
+- **End-effector waypoint planning**: Accepts
+  `PlanState(move_type=MoveType.EEF_MOVE, xpos=...)` waypoints.
+- **MotionGenerator integration**: Select it with
+  `MotionGenCfg(planner_cfg=NeuralPlannerCfg(...))`.
+- **Atomic action integration**: EEF-based atomic actions route through
+  `TrajectoryBuilder.plan_arm_traj()` and use the neural backend when the active
+  planner type is `neural` or `neural_refine`.
+- **Checkpoint-backed inference**: Loads a trained transformer waypoint
+  checkpoint and runs it in evaluation mode.
+
+## Usage
 
 Pre-trained checkpoints are hosted on HuggingFace and can be downloaded with
-`download_neural_planner_checkpoint()` (requires `HF_TOKEN` environment variable).
+`download_neural_planner_checkpoint()`. The repository is gated, so the process
+must have access to an authenticated token through `HF_TOKEN` or
+`huggingface-cli login` when no local checkpoint is provided.
+
 If you already have `franka.pt`, pass it explicitly or place it at:
 
 ```text
@@ -24,7 +46,7 @@ If you already have `franka.pt`, pass it explicitly or place it at:
 You can also set `EMBODICHAIN_NEURAL_PLANNER_CHECKPOINT=/path/to/franka.pt`.
 
 ```python
-from embodichain.data.assets.planner_assets import download_neural_planner_checkpoint
+from embodichain.data.assets import download_neural_planner_checkpoint
 from embodichain.lab.sim.planners import (
     MotionGenCfg,
     MotionGenOptions,
@@ -33,11 +55,10 @@ from embodichain.lab.sim.planners import (
     NeuralPlannerCfg,
     PlanState,
 )
-from embodichain.lab.sim.planners.neural_planner import NeuralPlanOptions
 
 checkpoint_path = download_neural_planner_checkpoint()
 
-motion_generator = MotionGenerator(
+motion_gen = MotionGenerator(
     cfg=MotionGenCfg(
         planner_cfg=NeuralPlannerCfg(
             robot_uid=robot.uid,
@@ -47,21 +68,49 @@ motion_generator = MotionGenerator(
     )
 )
 
-result = motion_generator.generate(
+result = motion_gen.generate(
     target_states=[
         PlanState(move_type=MoveType.EEF_MOVE, xpos=waypoint)
         for waypoint in waypoints
     ],
     options=MotionGenOptions(
-        plan_opts=NeuralPlanOptions(
-            control_part="main_arm",
-            start_qpos=start_qpos,
-        ),
+        control_part="main_arm",
+        start_qpos=start_qpos,
     ),
 )
 ```
 
-## Example
+## Atomic Actions
+
+When the active planner is `neural`, EEF-based atomic actions such as
+`MoveEndEffector`, `PickUp`, `MoveHeldObject`, `Place`, and the down phase of
+`Press` call the neural planner through `MotionGenerator.generate()`. The raw
+policy rollout is then resampled to the fixed waypoint count requested by the
+action config.
+
+When the active planner is `neural_refine`, the atomic-action trajectory builder
+uses the neural rollout and appends a final IK-refined waypoint before
+resampling. Joint-space actions and joint-space return phases, such as
+`MoveJoints` and the return phase of `Press`, continue to use joint
+interpolation.
+
+## Limitations
+
+- Only `MoveType.EEF_MOVE` waypoint inputs are supported.
+- Current transformer checkpoints assume a compatible arm/checkpoint pair; the
+  default checkpoint targets Franka Panda 7-DoF.
+- Multi-env atomic actions are handled by calling the neural planner once per
+  environment, not by a batched policy rollout.
+- The planner does not enforce TOPPRA-style velocity or acceleration
+  constraints.
+- Collision checking and self-collision constraints are not provided by this
+  backend.
+- Planning failure is explicit; the neural planner does not silently fall back
+  to IK interpolation or TOPPRA.
+
+## Examples
+
+Run the standalone planner example:
 
 ```bash
 python examples/sim/planners/neural_planner.py --headless --device cuda
@@ -75,3 +124,16 @@ python examples/sim/planners/neural_planner.py --headless --device cuda --checkp
 
 The example downloads the checkpoint automatically on first run only when no
 local checkpoint is provided or found in the default cache path.
+
+For deciding whether NMG can replace the IK interpolation path in atomic
+actions, compare several success notions:
+
+- `action_success`: the atomic action produced a trajectory.
+- `strict_pose_success`: the final TCP pose is within the script's strict
+  threshold, defaulting to 1 mm and 0.05 rad.
+- `nmg_threshold_success`: the final TCP pose is within the NMG waypoint
+  threshold, defaulting to 5 cm and 0.3 rad.
+- downstream task success: the simulated task outcome after trajectory replay.
+
+Use `strict_pose_success`, `final_pos_error`, `final_rot_error`, and the
+downstream task outcome rather than `action_success` alone.
diff --git a/embodichain/lab/sim/atomic_actions/trajectory.py b/embodichain/lab/sim/atomic_actions/trajectory.py
index be584cbc..301b570b 100644
--- a/embodichain/lab/sim/atomic_actions/trajectory.py
+++ b/embodichain/lab/sim/atomic_actions/trajectory.py
@@ -23,8 +23,9 @@
 import numpy as np
 import torch
 
-from embodichain.lab.sim.planners import PlanState
+from embodichain.lab.sim.planners import MoveType, PlanState
 from embodichain.lab.sim.planners.motion_generator import MotionGenOptions
+from embodichain.lab.sim.planners.neural_planner import NeuralPlanOptions
 from embodichain.lab.sim.planners.toppra_planner import ToppraPlanOptions
 from embodichain.lab.sim.utility.action_utils import interpolate_with_distance
 from embodichain.utils import logger
@@ -255,6 +256,15 @@ def plan_arm_traj(
         arm_dof: int,
     ) -> tuple[bool, torch.Tensor]:
         """Plan batched arm trajectories for all environments."""
+        if self._uses_neural_planner():
+            return self._plan_arm_traj_with_neural_planner(
+                target_states_list,
+                start_qpos,
+                n_waypoints,
+                control_part=control_part,
+                arm_dof=arm_dof,
+            )
+
         n_envs = start_qpos.shape[0]
         n_state = len(target_states_list[0])
         xpos_traj = torch.zeros(
@@ -285,6 +295,142 @@ def plan_arm_traj(
         )
         return True, interp
 
+    def _uses_neural_planner(self) -> bool:
+        return self._active_planner_type() in ("neural", "neural_refine")
+
+    def _uses_neural_refine(self) -> bool:
+        return self._active_planner_type() == "neural_refine"
+
+    def _active_planner_type(self) -> str | None:
+        planner = getattr(self.motion_generator, "planner", None)
+        cfg = getattr(planner, "cfg", None)
+        return getattr(cfg, "planner_type", None)
+
+    def _empty_arm_traj(
+        self, n_envs: int, n_waypoints: int, arm_dof: int
+    ) -> torch.Tensor:
+        return torch.zeros(
+            (n_envs, n_waypoints, arm_dof),
+            dtype=torch.float32,
+            device=self.device,
+        )
+
+    def _append_final_ik_refine(
+        self,
+        positions: torch.Tensor,
+        target_states: list[PlanState],
+        *,
+        env_idx: int,
+        control_part: str,
+        arm_dof: int,
+    ) -> tuple[bool, torch.Tensor]:
+        """Append an IK-snapped final waypoint for the last EEF target pose."""
+        final_target = target_states[-1]
+        if final_target.move_type != MoveType.EEF_MOVE or final_target.xpos is None:
+            logger.log_warning(
+                "Neural refine requires the final target state to be an EEF pose."
+            )
+            return False, positions
+
+        seed = positions[-1, :arm_dof].unsqueeze(0)
+        success, refined_qpos = self.robot.compute_ik(
+            pose=final_target.xpos.unsqueeze(0),
+            name=control_part,
+            joint_seed=seed,
+            env_ids=[env_idx],
+        )
+        if not self.all_envs_success(success):
+            logger.log_warning(
+                f"Neural refine failed to compute final IK for env {env_idx}."
+            )
+            return False, positions
+
+        refined_qpos = refined_qpos.to(device=self.device, dtype=torch.float32)
+        return True, torch.cat([positions, refined_qpos[:, :arm_dof]], dim=0)
+
+    def _plan_arm_traj_with_neural_planner(
+        self,
+        target_states_list: list[list[PlanState]],
+        start_qpos: torch.Tensor,
+        n_waypoints: int,
+        *,
+        control_part: str,
+        arm_dof: int,
+    ) -> tuple[bool, torch.Tensor]:
+        """Plan each env with NeuralPlanner, then resample to action waypoints."""
+        per_env_trajs = []
+        for env_idx, target_states in enumerate(target_states_list):
+            result = self.motion_generator.generate(
+                target_states=target_states,
+                options=MotionGenOptions(
+                    control_part=control_part,
+                    start_qpos=start_qpos[env_idx],
+                    plan_opts=NeuralPlanOptions(
+                        control_part=control_part,
+                        start_qpos=start_qpos[env_idx],
+                        env_id=env_idx,
+                    ),
+                ),
+            )
+            if not self.all_envs_success(result.success) or result.positions is None:
+                logger.log_warning(
+                    f"NeuralPlanner failed to plan arm trajectory for env {env_idx}."
+                )
+                return (
+                    False,
+                    self._empty_arm_traj(start_qpos.shape[0], n_waypoints, arm_dof),
+                )
+
+            positions = result.positions.to(device=self.device, dtype=torch.float32)
+            if positions.dim() == 3:
+                if positions.shape[0] != 1:
+                    logger.log_warning(
+                        "NeuralPlanner returned a batched trajectory for a single env "
+                        f"request: {positions.shape}."
+                    )
+                    return (
+                        False,
+                        self._empty_arm_traj(start_qpos.shape[0], n_waypoints, arm_dof),
+                    )
+                positions = positions.squeeze(0)
+            if positions.shape[-1] < arm_dof:
+                logger.log_warning(
+                    f"NeuralPlanner returned {positions.shape[-1]} joints, "
+                    f"but '{control_part}' expects {arm_dof}."
+                )
+                return (
+                    False,
+                    self._empty_arm_traj(start_qpos.shape[0], n_waypoints, arm_dof),
+                )
+            positions = positions[:, :arm_dof]
+            if self._uses_neural_refine():
+                ok, positions = self._append_final_ik_refine(
+                    positions,
+                    target_states,
+                    env_idx=env_idx,
+                    control_part=control_part,
+                    arm_dof=arm_dof,
+                )
+                if not ok:
+                    return (
+                        False,
+                        self._empty_arm_traj(start_qpos.shape[0], n_waypoints, arm_dof),
+                    )
+            per_env_trajs.append(positions)
+
+        raw_traj = torch.nn.utils.rnn.pad_sequence(
+            per_env_trajs, batch_first=True, padding_value=0.0
+        )
+        for env_idx, traj in enumerate(per_env_trajs):
+            raw_traj[env_idx, traj.shape[0] :] = traj[-1]
+
+        interp = interpolate_with_distance(
+            trajectory=raw_traj,
+            interp_num=n_waypoints,
+            device=self.device,
+        )
+        return True, interp
+
     def plan_joint_traj(
         self,
         start_qpos: torch.Tensor,
diff --git a/embodichain/lab/sim/planners/base_planner.py b/embodichain/lab/sim/planners/base_planner.py
index 5eff14cf..a23564f2 100644
--- a/embodichain/lab/sim/planners/base_planner.py
+++ b/embodichain/lab/sim/planners/base_planner.py
@@ -110,6 +110,10 @@ def __init__(self, cfg: BasePlannerCfg):
 
         self.device = self.robot.device
 
+    def default_plan_options(self) -> PlanOptions:
+        """Return planner-specific default runtime options."""
+        return PlanOptions()
+
     @validate_plan_options
     @abstractmethod
     def plan(
diff --git a/embodichain/lab/sim/planners/motion_generator.py b/embodichain/lab/sim/planners/motion_generator.py
index ba09efc2..e21ab3b9 100644
--- a/embodichain/lab/sim/planners/motion_generator.py
+++ b/embodichain/lab/sim/planners/motion_generator.py
@@ -96,6 +96,7 @@ class MotionGenerator:
     _support_planner_dict = {
         "toppra": (ToppraPlanner, ToppraPlannerCfg),
         "neural": (NeuralPlanner, NeuralPlannerCfg),
+        "neural_refine": (NeuralPlanner, NeuralPlannerCfg),
     }
 
     def __init__(self, cfg: MotionGenCfg) -> None:
@@ -220,18 +221,30 @@ def generate(
         else:
             target_plan_states = target_states
 
-        if options.plan_opts is None:
-            if hasattr(self.planner, "default_plan_options"):
-                options.plan_opts = self.planner.default_plan_options()
-            else:
-                options.plan_opts = PlanOptions()
-        # Planner-specific options (e.g. NeuralPlanOptions) must be set on
-        # plan_opts explicitly, same as ToppraPlanOptions.
-        result = self.planner.plan(
-            target_states=target_plan_states, options=options.plan_opts
-        )
+        plan_opts = self._resolve_plan_options(options)
+        result = self.planner.plan(target_states=target_plan_states, options=plan_opts)
         return result
 
+    def _resolve_plan_options(self, options: MotionGenOptions) -> PlanOptions:
+        """Resolve planner-specific options for the active planner."""
+        plan_opts = options.plan_opts
+        if plan_opts is None:
+            plan_opts = self.planner.default_plan_options()
+        elif hasattr(plan_opts, "copy"):
+            plan_opts = plan_opts.copy()
+
+        if (
+            hasattr(plan_opts, "control_part")
+            and getattr(plan_opts, "control_part") is None
+        ):
+            plan_opts.control_part = options.control_part
+        if (
+            hasattr(plan_opts, "start_qpos")
+            and getattr(plan_opts, "start_qpos") is None
+        ):
+            plan_opts.start_qpos = options.start_qpos
+        return plan_opts
+
     def estimate_trajectory_sample_count(
         self,
         xpos_list: torch.Tensor | list[torch.Tensor] | None = None,
diff --git a/embodichain/lab/sim/planners/neural_planner.py b/embodichain/lab/sim/planners/neural_planner.py
index 62adb0e6..a2b48c17 100644
--- a/embodichain/lab/sim/planners/neural_planner.py
+++ b/embodichain/lab/sim/planners/neural_planner.py
@@ -55,7 +55,20 @@ def normalize(self, obs: torch.Tensor) -> torch.Tensor:
 
 
 class _WaypointTransformerActor(nn.Module):
-    """APG waypoint actor runtime copied in lightweight form for inference."""
+    """Lightweight APG waypoint actor used for runtime inference."""
+
+    @staticmethod
+    def expected_obs_dim(
+        action_dim: int, num_waypoints: int, use_relative_obs: bool
+    ) -> int:
+        """Return the observation size implied by the checkpoint metadata."""
+        action_dim = int(action_dim)
+        num_waypoints = int(num_waypoints)
+        obs_dim = action_dim + 7 + num_waypoints * 3
+        obs_dim += num_waypoints * 4 + num_waypoints * 2 + action_dim
+        if use_relative_obs:
+            obs_dim += 7
+        return obs_dim
 
     def __init__(
         self,
@@ -69,20 +82,19 @@ def __init__(
         transformer_ff_dim: int | None = None,
     ):
         super().__init__()
+        self.action_dim = int(action_dim)
         self.num_waypoints = int(num_waypoints)
         self.use_relative_obs = bool(use_relative_obs)
-        if int(action_dim) != 7:
-            raise ValueError(
-                "Waypoint transformer checkpoints currently assume a 7-DoF arm. "
-                f"Got action_dim={action_dim}."
-            )
-        self.state_dim = 7 + 7 + 7 + (7 if self.use_relative_obs else 0)
+        self.state_dim = self.action_dim + 7 + self.action_dim
+        if self.use_relative_obs:
+            self.state_dim += 7
         self.waypoint_token_dim = 3 + 4 + 1 + 1
 
-        expected_obs_dim = 7 + 7 + self.num_waypoints * 3
-        expected_obs_dim += self.num_waypoints * 4 + self.num_waypoints * 2 + 7
-        if self.use_relative_obs:
-            expected_obs_dim += 7
+        expected_obs_dim = self.expected_obs_dim(
+            self.action_dim,
+            self.num_waypoints,
+            self.use_relative_obs,
+        )
         if int(obs_dim) != expected_obs_dim:
             raise ValueError(
                 "Waypoint transformer expected obs_dim "
@@ -117,14 +129,15 @@ def __init__(
             nn.LayerNorm(hidden_dim),
             _layer_init(nn.Linear(hidden_dim, hidden_dim)),
             nn.Tanh(),
-            _layer_init(nn.Linear(hidden_dim, action_dim), std=0.01),
+            _layer_init(nn.Linear(hidden_dim, self.action_dim), std=0.01),
         )
 
     def _parse_obs(self, x: torch.Tensor):
         n = self.num_waypoints
+        d = self.action_dim
         cursor = 0
-        joint = x[:, cursor : cursor + 7]
-        cursor += 7
+        joint = x[:, cursor : cursor + d]
+        cursor += d
         eef_pose = x[:, cursor : cursor + 7]
         cursor += 7
         waypoint_pos = x[:, cursor : cursor + 3 * n].reshape(-1, n, 3)
@@ -135,8 +148,8 @@ def _parse_obs(self, x: torch.Tensor):
         cursor += n
         valid_mask = x[:, cursor : cursor + n].reshape(-1, n, 1)
         cursor += n
-        last_action = x[:, cursor : cursor + 7]
-        cursor += 7
+        last_action = x[:, cursor : cursor + d]
+        cursor += d
 
         state_parts = [joint, eef_pose, last_action]
         if self.use_relative_obs:
@@ -168,16 +181,18 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 @configclass
 class NeuralPlannerCfg(BasePlannerCfg):
+    """Configuration for the neural motion generation planner."""
+
     planner_type: str = "neural"
 
     checkpoint_path: str = MISSING
-    """Path to an APG waypoint checkpoint (.pt), e.g. from ``download_neural_planner_checkpoint()``."""
+    """Path to an APG waypoint checkpoint."""
 
     control_part: str | None = None
-    """Robot control part used for FK and qpos, e.g. 'left_arm'."""
+    """Robot control part used for FK and qpos, e.g. ``main_arm``."""
 
     max_steps: int | None = None
-    """Maximum rollout steps. If None, uses checkpoint max_episode_steps."""
+    """Maximum rollout steps. If None, uses checkpoint ``max_episode_steps``."""
 
     action_scale: float = 0.2
     """Delta joint scaling factor in radians."""
@@ -186,32 +201,31 @@ class NeuralPlannerCfg(BasePlannerCfg):
     """Number of arm joints controlled by the APG policy."""
 
     pos_eps: float | None = None
-    """Waypoint position threshold. If None, uses checkpoint waypoint_pos_threshold."""
+    """Waypoint position threshold. If None, uses the checkpoint value."""
 
     rot_eps: float | None = None
-    """Waypoint rotation threshold. If None, uses checkpoint waypoint_rot_threshold."""
+    """Waypoint rotation threshold. If None, uses the checkpoint value."""
 
     dt: float = 0.01
-    """Nominal timestep reported in PlanResult."""
+    """Nominal timestep reported in :class:`PlanResult`."""
 
 
 @configclass
 class NeuralPlanOptions(PlanOptions):
+    """Runtime options for :class:`NeuralPlanner`."""
+
     control_part: str | None = None
     start_qpos: torch.Tensor | None = None
     max_steps: int | None = None
+    env_id: int | None = None
 
 
 class NeuralPlanner(BasePlanner):
+    """Learning-based EEF waypoint planner backed by a trained APG policy."""
+
     def __init__(self, cfg: NeuralPlannerCfg):
         super().__init__(cfg)
 
-        if self.robot.num_instances > 1:
-            logger.log_error(
-                "NeuralPlanner currently supports one robot instance",
-                NotImplementedError,
-            )
-
         self.cfg: NeuralPlannerCfg = cfg
         if cfg.checkpoint_path is MISSING or not str(cfg.checkpoint_path):
             logger.log_error("checkpoint_path is required", ValueError)
@@ -274,6 +288,19 @@ def _load_checkpoint(self, checkpoint_path: Path) -> None:
         self._intermediate_orientation = bool(
             self._ckpt_args.get("waypoint_intermediate_orientation", True)
         )
+        expected_obs_dim = _WaypointTransformerActor.expected_obs_dim(
+            self._action_dim,
+            self._num_waypoints,
+            self._use_relative_obs,
+        )
+        if self._obs_dim != expected_obs_dim:
+            raise ValueError(
+                f"Checkpoint obs_dim {self._obs_dim} does not match "
+                f"num_arm_joints={self._action_dim} for "
+                f"waypoint_max={self._num_waypoints}. Expected obs_dim "
+                f"{expected_obs_dim}. Use a checkpoint trained for this robot "
+                "and control_part."
+            )
 
         self._normalizer = _RunningObsNormalizer(
             ckpt["obs_normalizer"]["mean"].to(self.device),
@@ -313,33 +340,34 @@ def plan(
         target_states: list[PlanState],
         options: NeuralPlanOptions = NeuralPlanOptions(),
     ) -> PlanResult:
+        """Plan an EEF waypoint trajectory by rolling out the neural policy."""
         if not target_states:
             return PlanResult(success=False, positions=None)
 
         control_part = options.control_part or self.cfg.control_part
         if control_part is None:
-            logger.log_error(
-                "control_part is required for NeuralPlanner",
-                ValueError,
-            )
+            logger.log_error("control_part is required for NeuralPlanner", ValueError)
+        env_id = self._resolve_env_id(options.env_id)
 
         waypoints_pos, waypoints_quat, valid_mask, episode_k = self._parse_waypoints(
             target_states
         )
-        qpos = self._initial_qpos(control_part, options.start_qpos)
-        limits = self.robot.get_qpos_limits(name=control_part)[0].to(self.device)
+        qpos = self._initial_qpos(control_part, options.start_qpos, env_id=env_id)
+        limits = self.robot.get_qpos_limits(name=control_part, env_ids=[env_id])[0].to(
+            self.device
+        )
         lower = limits[: self._action_dim, 0]
         upper = limits[: self._action_dim, 1]
 
         last_action = torch.zeros(1, self._action_dim, device=self.device)
         active_idx = 0
         positions = [qpos.squeeze(0).clone()]
-        xpos_list = [self._fk_matrix(qpos, control_part).squeeze(0)]
+        xpos_list = [self._fk_matrix(qpos, control_part, env_id=env_id).squeeze(0)]
         max_steps = int(options.max_steps or self._max_steps)
 
         with torch.no_grad():
             for _ in range(max_steps):
-                ee_pose = self._fk_pose_xyzw(qpos, control_part)
+                ee_pose = self._fk_pose_xyzw(qpos, control_part, env_id=env_id)
                 obs = self._build_obs(
                     qpos[:, : self._action_dim],
                     ee_pose,
@@ -357,9 +385,11 @@ def plan(
                 last_action = action
 
                 positions.append(qpos.squeeze(0).clone())
-                xpos_list.append(self._fk_matrix(qpos, control_part).squeeze(0))
+                xpos_list.append(
+                    self._fk_matrix(qpos, control_part, env_id=env_id).squeeze(0)
+                )
 
-                ee_pose = self._fk_pose_xyzw(qpos, control_part)
+                ee_pose = self._fk_pose_xyzw(qpos, control_part, env_id=env_id)
                 if self._is_active_reached(
                     ee_pose, waypoints_pos, waypoints_quat, active_idx, episode_k
                 ):
@@ -413,11 +443,27 @@ def _parse_waypoints(
 
         return waypoint_pos, waypoint_quat, valid_mask, len(target_states)
 
+    def _resolve_env_id(self, env_id: int | None) -> int:
+        if env_id is None:
+            if self.robot.num_instances > 1:
+                logger.log_error(
+                    "env_id is required for NeuralPlanner when the robot has "
+                    f"{self.robot.num_instances} instances.",
+                    ValueError,
+                )
+            return 0
+        if env_id < 0 or env_id >= self.robot.num_instances:
+            logger.log_error(
+                f"env_id must be in [0, {self.robot.num_instances}), got {env_id}.",
+                ValueError,
+            )
+        return int(env_id)
+
     def _initial_qpos(
-        self, control_part: str, start_qpos: torch.Tensor | None
+        self, control_part: str, start_qpos: torch.Tensor | None, *, env_id: int
     ) -> torch.Tensor:
         if start_qpos is None:
-            qpos = self.robot.get_qpos(name=control_part)[0]
+            qpos = self.robot.get_qpos(name=control_part)[env_id]
         else:
             qpos = torch.as_tensor(start_qpos, dtype=torch.float32, device=self.device)
         if qpos.dim() == 1:
@@ -430,11 +476,19 @@ def _initial_qpos(
             )
         return qpos.to(self.device).clone()
 
-    def _fk_matrix(self, qpos: torch.Tensor, control_part: str) -> torch.Tensor:
-        return self.robot.compute_fk(qpos=qpos, name=control_part, to_matrix=True)
+    def _fk_matrix(
+        self, qpos: torch.Tensor, control_part: str, *, env_id: int
+    ) -> torch.Tensor:
+        return self.robot.compute_fk(
+            qpos=qpos, name=control_part, env_ids=[env_id], to_matrix=True
+        )
 
-    def _fk_pose_xyzw(self, qpos: torch.Tensor, control_part: str) -> torch.Tensor:
-        fk = self.robot.compute_fk(qpos=qpos, name=control_part, to_matrix=False)
+    def _fk_pose_xyzw(
+        self, qpos: torch.Tensor, control_part: str, *, env_id: int
+    ) -> torch.Tensor:
+        fk = self.robot.compute_fk(
+            qpos=qpos, name=control_part, env_ids=[env_id], to_matrix=False
+        )
         pos = fk[:, :3]
         quat_xyzw = convert_quat(fk[:, 3:7], to="xyzw")
         return torch.cat([pos, quat_xyzw], dim=-1)
diff --git a/embodichain/lab/sim/planners/toppra_planner.py b/embodichain/lab/sim/planners/toppra_planner.py
index 218d17ed..81585805 100644
--- a/embodichain/lab/sim/planners/toppra_planner.py
+++ b/embodichain/lab/sim/planners/toppra_planner.py
@@ -91,6 +91,9 @@ def __init__(self, cfg: ToppraPlannerCfg):
                 NotImplementedError,
             )
 
+    def default_plan_options(self) -> ToppraPlanOptions:
+        return ToppraPlanOptions()
+
     @validate_plan_options(options_cls=ToppraPlanOptions)
     def plan(
         self,
diff --git a/embodichain/lab/sim/solvers/pytorch_solver.py b/embodichain/lab/sim/solvers/pytorch_solver.py
index 277eaf87..6dff8d76 100644
--- a/embodichain/lab/sim/solvers/pytorch_solver.py
+++ b/embodichain/lab/sim/solvers/pytorch_solver.py
@@ -382,9 +382,10 @@ def get_ik(
         tcp_xpos = torch.as_tensor(
             self.tcp_xpos, device=self.device, dtype=torch.float32
         )
-        tcp_xpos_inv = tcp_xpos.clone()
-        tcp_xpos_inv[:3, :3] = tcp_xpos_inv[:3, :3].T
-        tcp_xpos_inv[:3, 3] = -tcp_xpos_inv[:3, :3] @ tcp_xpos_inv[:3, 3]
+        tcp_xpos_inv = torch.eye(4, device=self.device, dtype=torch.float32)
+        tcp_rotation_inv = tcp_xpos[:3, :3].T
+        tcp_xpos_inv[:3, :3] = tcp_rotation_inv
+        tcp_xpos_inv[:3, 3] = -tcp_rotation_inv @ tcp_xpos[:3, 3]
         target_xpos = target_xpos @ tcp_xpos_inv
 
         # Get joint limits and ensure shape matches dof
diff --git a/pyproject.toml b/pyproject.toml
index f5506b0d..f87ec38b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ dependencies = [
   "black==26.3.1",
   "fvcore",
   "h5py",
+  "huggingface-hub",
   "tensordict",
   "viser==1.0.21",
   "lerobot>=0.4.4"
diff --git a/scripts/benchmark/robotics/__init__.py b/scripts/benchmark/robotics/__init__.py
new file mode 100644
index 00000000..1b84ff8b
--- /dev/null
+++ b/scripts/benchmark/robotics/__init__.py
@@ -0,0 +1,17 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Robotics benchmark scripts."""
diff --git a/scripts/benchmark/robotics/nmg/__init__.py b/scripts/benchmark/robotics/nmg/__init__.py
new file mode 100644
index 00000000..2edc730e
--- /dev/null
+++ b/scripts/benchmark/robotics/nmg/__init__.py
@@ -0,0 +1,17 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Neural motion generation benchmark scripts."""
diff --git a/scripts/benchmark/robotics/nmg/franka_pick_place.py b/scripts/benchmark/robotics/nmg/franka_pick_place.py
new file mode 100644
index 00000000..7b21a66d
--- /dev/null
+++ b/scripts/benchmark/robotics/nmg/franka_pick_place.py
@@ -0,0 +1,3018 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Benchmark Franka pick-place atomic actions with IK and NMG planners.
+
+The benchmark separates planning-only trajectory quality from real physics
+replay. It uses the original Franka PandaWithHand asset and reports gripper vs.
+object feasibility instead of modifying the asset or forcing object motion.
+Run: python -m scripts.benchmark.robotics.nmg.franka_pick_place
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import os
+import struct
+import time
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import psutil
+import torch
+
+from embodichain.data import get_data_path
+from embodichain.data.assets.planner_assets import download_neural_planner_checkpoint
+from embodichain.lab.gym.utils.gym_utils import add_env_launcher_args_to_parser
+from embodichain.lab.sim import SimulationManager, SimulationManagerCfg
+from embodichain.lab.sim.atomic_actions import (
+    AntipodalAffordance,
+    AtomicActionEngine,
+    EndEffectorPoseTarget,
+    GraspTarget,
+    HeldObjectState,
+    MoveEndEffector,
+    MoveEndEffectorCfg,
+    ObjectSemantics,
+    PickUp,
+    PickUpCfg,
+    Place,
+    PlaceCfg,
+)
+from embodichain.lab.sim.cfg import (
+    JointDrivePropertiesCfg,
+    LightCfg,
+    RenderCfg,
+    RigidBodyAttributesCfg,
+    RigidObjectCfg,
+    RobotCfg,
+)
+from embodichain.lab.sim.objects import RigidObject, Robot
+from embodichain.lab.sim.planners import (
+    MotionGenCfg,
+    MotionGenerator,
+    NeuralPlannerCfg,
+    ToppraPlannerCfg,
+)
+from embodichain.lab.sim.sensors import (
+    ArticulationContactFilterCfg,
+    CameraCfg,
+    ContactSensor,
+    ContactSensorCfg,
+)
+from embodichain.lab.sim.shapes import CubeCfg, MeshCfg
+from embodichain.lab.sim.solvers import PytorchSolverCfg
+from embodichain.toolkits.graspkit.pg_grasp.antipodal_generator import (
+    AntipodalSamplerCfg,
+    GraspGeneratorCfg,
+)
+from embodichain.toolkits.graspkit.pg_grasp.gripper_collision_checker import (
+    GripperCollisionCfg,
+)
+from embodichain.utils import logger
+from embodichain.utils.math import quat_error_magnitude, quat_from_matrix
+
+SCRIPT_NAME = "franka_pick_place_nmg"
+ARM_NAME = "main_arm"
+HAND_NAME = "hand"
+ROBOT_UID = "FrankaPanda"
+TABLE_UID = "support_table"
+TABLE_COLLIDER_UID = "support_table_collider"
+TABLE_MESH_PATH = "ShopTableSimple/shop_table_simple.ply"
+TABLE_MESH_TOP_Z = 0.8265
+TABLE_TOP_Z = 0.0
+TABLE_INIT_POS = (0.0, 0.0, TABLE_TOP_Z - TABLE_MESH_TOP_Z)
+TABLE_SCALE = (1.0, 1.0, 1.0)
+TABLE_COLLIDER_SIZE = (1.0, 1.0, 0.04)
+TABLE_COLLIDER_INIT_POS = (
+    0.0,
+    0.0,
+    TABLE_TOP_Z - TABLE_COLLIDER_SIZE[2] / 2.0,
+)
+OBJECT_SUPPORT_MARGIN = 0.002
+
+PICK_SAMPLE_INTERVAL = 120
+PLACE_SAMPLE_INTERVAL = 120
+MOVE_SAMPLE_INTERVAL = 80
+HAND_INTERP_STEPS = 12
+POST_REPLAY_HOLD_STEPS = 80
+POST_GRASP_HOLD_STEPS = 80
+
+DEFAULT_POS_THRESHOLD = 1e-3
+DEFAULT_ROT_THRESHOLD = 0.05
+DEFAULT_OBJECT_POS_THRESHOLD = 0.05
+DEFAULT_NMG_POS_THRESHOLD = 0.05
+DEFAULT_NMG_ROT_THRESHOLD = 0.3
+DEFAULT_PRE_PICK_Z = 0.36
+DEFAULT_SEED = 0
+DEFAULT_GRIPPER_CLOSE_MARGIN = 0.012
+DEFAULT_GRASP_HOLD_STEPS = 80
+DEFAULT_OBJECT_REPLAY_MODE = "physics"
+DEFAULT_GRASP_AXIS = "auto"
+DEFAULT_FRANKA_TCP_Z = 0.1034
+DEFAULT_FRANKA_TCP_YAW = -math.pi / 4.0
+DEFAULT_GRASP_DEPTH_OFFSET = 0.0
+DEFAULT_MIN_PHYSICAL_GPU_FREE_MB = 2048.0
+
+FRANKA_FINGER_LINK_NAMES = ("leftfinger", "rightfinger")
+FRANKA_GRIPPER_CLOSING_AXIS_INDEX = 1
+FRANKA_FINGER_LENGTH = 0.058
+FRANKA_ROOT_Z_WIDTH = 0.060
+FRANKA_Y_THICKNESS = 0.030
+FRANKA_START_QPOS = (
+    0.0,
+    -math.pi / 4.0,
+    0.0,
+    -3.0 * math.pi / 4.0,
+    0.0,
+    math.pi / 2.0,
+    math.pi / 4.0,
+)
+FRANKA_REST_QPOS = (0.10, -0.65, 0.00, -2.10, 0.00, 1.55, 0.85)
+
+OBJECT_PRESETS: dict[str, dict[str, Any]] = {
+    "cube": {
+        "label": "cube",
+        "shape": "cube",
+        "size": (0.045, 0.045, 0.045),
+        "init_pos": (0.31, 0.00, 0.0),
+        "init_rot": (0.0, 0.0, 0.0),
+        "body_scale": (1.0, 1.0, 1.0),
+        "mass": 0.03,
+        "max_convex_hull_num": 1,
+        "use_usd_properties": False,
+        "mesh_min_z": -0.0225,
+    },
+    "sugar_box": {
+        "label": "sugar_box",
+        "mesh_path": "SugarBox/sugar_box_usd/sugar_box.usda",
+        "init_pos": (0.31, 0.00, 0.0),
+        "init_rot": (0.0, 0.0, 0.0),
+        # Default benchmark scale. Use --object_scale to sweep strict-physics
+        # grasp feasibility without editing this preset.
+        "body_scale": (1.0, 1.0, 1.0),
+        "mass": 0.05,
+        "max_convex_hull_num": 16,
+        "use_usd_properties": False,
+        "mesh_min_z": -0.02256747,
+    },
+    "cup": {
+        "label": "cup",
+        "mesh_path": "CoffeeCup/cup.ply",
+        "init_pos": (0.31, 0.00, 0.0),
+        "init_rot": (0.0, 0.0, -90.0),
+        "body_scale": (1.0, 1.0, 1.0),
+        "mass": 0.01,
+        "max_convex_hull_num": 16,
+        "use_usd_properties": False,
+    },
+}
+
+APPROACH_DIRECTIONS = {
+    "top": (0.0, 0.0, -1.0),
+    "side": (0.0, 1.0, 0.0),
+    "side_y": (0.0, -1.0, 0.0),
+}
+
+
+@dataclass(frozen=True)
+class GraspPreflight:
+    """Result of object-vs-gripper feasibility checks."""
+
+    status: str
+    object_grasp_width_m: float
+    gripper_min_opening_m: float
+    gripper_max_opening_m: float
+    reason: str
+
+    @property
+    def failed(self) -> bool:
+        """Return whether the preflight indicates an infeasible grasp."""
+        return self.status != "ok"
+
+
+@dataclass(frozen=True)
+class FrankaHandOpeningSpec:
+    """URDF-derived Franka finger geometry used to compute jaw opening."""
+
+    left_origin_y: float
+    left_axis_y: float
+    left_inner_y: float
+    right_origin_y: float
+    right_axis_y: float
+    right_inner_y: float
+
+
+@dataclass(frozen=True)
+class PlanOutcome:
+    """Result returned by the action planner for one trial."""
+
+    success: bool
+    trajectory: torch.Tensor
+    held_object: HeldObjectState | None
+    place_eef_pose: torch.Tensor | None
+    rest_eef_pose: torch.Tensor
+
+
+@dataclass(frozen=True)
+class ContactStats:
+    """Summary of current finger-object contacts for one replay instant."""
+
+    count: int
+    max_impulse: float
+    total_impulse: float
+    min_distance: float | None
+
+
+@dataclass(frozen=True)
+class ReplayOutcome:
+    """Physical replay metrics for one planned trajectory."""
+
+    replay_time_sec: float
+    replay_success: bool
+    physical_success: bool
+    object_lifted: bool
+    object_moved: bool
+    object_pos_error: float | None
+    object_rot_error: float | None
+    object_max_z: float | None
+    object_displacement_m: float | None
+    object_replay_mode: str
+    close_end_index: int | None
+    release_start_index: int | None
+    min_hand_opening_m: float | None
+    final_hand_opening_m: float | None
+    actual_min_hand_opening_m: float | None
+    actual_close_hand_opening_m: float | None
+    actual_final_hand_opening_m: float | None
+    close_contact_count: int | None
+    path_contact_count: int | None
+    hold_contact_count: int | None
+    max_contact_count: int | None
+    max_contact_impulse: float | None
+    max_total_contact_impulse: float | None
+    min_contact_distance_m: float | None
+    close_tcp_object_delta_m: list[float] | None
+    planned_grasp_object_delta_m: list[float] | None
+    close_leftfinger_object_delta_m: list[float] | None
+    close_rightfinger_object_delta_m: list[float] | None
+    close_tcp_x_axis: list[float] | None
+    close_tcp_y_axis: list[float] | None
+    close_tcp_z_axis: list[float] | None
+
+
+@dataclass(frozen=True)
+class SummaryRow:
+    """Aggregate measured rows for one planner and mode."""
+
+    planner: str
+    mode: str
+    rows: list[dict[str, object]]
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Benchmark Franka PickUp -> Place with IK and NMG planners."
+    )
+    add_env_launcher_args_to_parser(parser)
+    parser.set_defaults(headless=True)
+    parser.add_argument(
+        "--planner",
+        choices=["ik_interpolate", "neural", "neural_refine", "all"],
+        default="all",
+        help="Planner backend to evaluate. 'all' evaluates all supported planners.",
+    )
+    parser.add_argument(
+        "--neural_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Local Franka NMG checkpoint path. If omitted and a neural planner is "
+            "selected, the project checkpoint downloader is used."
+        ),
+    )
+    parser.add_argument(
+        "--mode",
+        choices=["planner", "physical", "both"],
+        default="planner",
+        help="Evaluation mode. 'both' runs separate planner and physical rows.",
+    )
+    parser.add_argument(
+        "--object",
+        choices=sorted(OBJECT_PRESETS),
+        default="sugar_box",
+        help="Object preset used for the pick-place scene.",
+    )
+    parser.add_argument(
+        "--object_scale",
+        type=float,
+        nargs=3,
+        default=None,
+        metavar=("SX", "SY", "SZ"),
+        help="Optional object body scale override for strict-physics experiments.",
+    )
+    parser.add_argument(
+        "--num_repeats",
+        type=int,
+        default=3,
+        help="Measured repeats per planner and mode.",
+    )
+    parser.add_argument(
+        "--warmup_repeats",
+        type=int,
+        default=1,
+        help="Warmup repeats per planner and mode; excluded from summaries.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=DEFAULT_SEED,
+        help="Seed used before trial generation.",
+    )
+    parser.add_argument(
+        "--n_sample",
+        type=int,
+        default=10000,
+        help="Number of antipodal samples used by grasp generation.",
+    )
+    parser.add_argument(
+        "--n_deviated_approach_directions",
+        type=int,
+        default=1,
+        help=(
+            "Number of approach directions sampled around the requested approach. "
+            "Use 1 for deterministic strict-physics checks."
+        ),
+    )
+    parser.add_argument(
+        "--force_reannotate",
+        action="store_true",
+        help="Force grasp region re-annotation instead of using cached data.",
+    )
+    parser.add_argument(
+        "--report_path",
+        type=str,
+        default=None,
+        help="Optional Markdown report path. Defaults to outputs/benchmarks/.",
+    )
+    parser.add_argument(
+        "--save_raw_jsonl",
+        type=str,
+        default=None,
+        help="Optional JSONL path for raw per-trial rows.",
+    )
+    parser.add_argument(
+        "--pre_pick_z",
+        type=float,
+        default=DEFAULT_PRE_PICK_Z,
+        help="World z position used to seed the pre-pick TCP pose.",
+    )
+    parser.add_argument(
+        "--tcp_z",
+        type=float,
+        default=DEFAULT_FRANKA_TCP_Z,
+        help=(
+            "Franka solver TCP z offset in ee_link frame. The default preserves "
+            "the existing NMG comparator convention."
+        ),
+    )
+    parser.add_argument(
+        "--tcp_yaw",
+        type=float,
+        default=DEFAULT_FRANKA_TCP_YAW,
+        help=(
+            "Franka solver TCP yaw offset in radians. The default preserves "
+            "the existing NMG comparator convention."
+        ),
+    )
+    parser.add_argument(
+        "--grasp_depth_offset",
+        type=float,
+        default=DEFAULT_GRASP_DEPTH_OFFSET,
+        help=(
+            "Meters to move the resolved grasp TCP backwards along its local z "
+            "axis before planning. Negative values push the Franka fingers "
+            "deeper into the object for physical replay experiments."
+        ),
+    )
+    parser.add_argument(
+        "--grasp_axis",
+        choices=["auto", "short", "long"],
+        default=DEFAULT_GRASP_AXIS,
+        help=(
+            "Preferred object horizontal axis for the Franka finger centerline. "
+            "'short' favors closing across the object's short side."
+        ),
+    )
+    parser.add_argument(
+        "--approach",
+        choices=["top", "side", "side_y", "custom"],
+        default="top",
+        help="Pick approach direction preset.",
+    )
+    parser.add_argument(
+        "--custom_approach_direction",
+        type=float,
+        nargs=3,
+        default=None,
+        metavar=("X", "Y", "Z"),
+        help="World-frame approach direction used when --approach custom.",
+    )
+    parser.add_argument(
+        "--step_repeat",
+        type=int,
+        default=4,
+        help="Simulation update steps per trajectory waypoint during physical replay.",
+    )
+    parser.add_argument(
+        "--replay_control",
+        choices=["target", "direct"],
+        default="target",
+        help=(
+            "Replay mode. 'target' drives joint targets through physics; 'direct' "
+            "sets current qpos at every waypoint."
+        ),
+    )
+    parser.add_argument(
+        "--object_replay_mode",
+        choices=["physics", "attached"],
+        default=DEFAULT_OBJECT_REPLAY_MODE,
+        help=(
+            "'physics' evaluates real object contact. 'attached' binds the object "
+            "to the planned held-object transform after grasp for demo inspection."
+        ),
+    )
+    parser.add_argument(
+        "--grasp_hold_steps",
+        type=int,
+        default=DEFAULT_GRASP_HOLD_STEPS,
+        help="Extra physics steps inserted after gripper close before lifting.",
+    )
+    parser.add_argument(
+        "--gripper_close_margin",
+        type=float,
+        default=DEFAULT_GRIPPER_CLOSE_MARGIN,
+        help="Meters subtracted from estimated object width when closing Franka fingers.",
+    )
+    parser.add_argument(
+        "--screenshot_dir",
+        type=str,
+        default=None,
+        help="Optional directory for diagnostic replay screenshots.",
+    )
+    parser.add_argument(
+        "--hold_steps",
+        type=int,
+        default=40,
+        help="Simulation update steps before planning and after replay.",
+    )
+    parser.add_argument(
+        "--pos_success_threshold",
+        type=float,
+        default=DEFAULT_POS_THRESHOLD,
+        help="Strict final TCP position threshold in meters.",
+    )
+    parser.add_argument(
+        "--rot_success_threshold",
+        type=float,
+        default=DEFAULT_ROT_THRESHOLD,
+        help="Strict final TCP rotation threshold in radians.",
+    )
+    parser.add_argument(
+        "--object_pos_success_threshold",
+        type=float,
+        default=DEFAULT_OBJECT_POS_THRESHOLD,
+        help="Placed-object position threshold in meters for physical success.",
+    )
+    parser.add_argument(
+        "--nmg_pos_success_threshold",
+        type=float,
+        default=DEFAULT_NMG_POS_THRESHOLD,
+        help="Loose position threshold passed to NMG planners.",
+    )
+    parser.add_argument(
+        "--nmg_rot_success_threshold",
+        type=float,
+        default=DEFAULT_NMG_ROT_THRESHOLD,
+        help="Loose rotation threshold passed to NMG planners.",
+    )
+    parser.add_argument(
+        "--min_physical_gpu_free_mb",
+        type=float,
+        default=DEFAULT_MIN_PHYSICAL_GPU_FREE_MB,
+        help=(
+            "Minimum free GPU memory required before launching physical replay. "
+            "Set <= 0 to disable the preflight skip."
+        ),
+    )
+    return parser.parse_args(argv)
+
+
+def validate_args(args: argparse.Namespace) -> None:
+    """Validate benchmark arguments."""
+    if args.num_repeats < 1:
+        raise ValueError("--num_repeats must be >= 1.")
+    if args.warmup_repeats < 0:
+        raise ValueError("--warmup_repeats must be >= 0.")
+    if args.n_sample < 1:
+        raise ValueError("--n_sample must be >= 1.")
+    if args.n_deviated_approach_directions < 1:
+        raise ValueError("--n_deviated_approach_directions must be >= 1.")
+    if args.object_scale is not None and any(v <= 0.0 for v in args.object_scale):
+        raise ValueError("--object_scale values must be > 0.")
+    if args.pre_pick_z <= 0.0:
+        raise ValueError("--pre_pick_z must be > 0.")
+    if args.tcp_z <= 0.0:
+        raise ValueError("--tcp_z must be > 0.")
+    if args.step_repeat < 1:
+        raise ValueError("--step_repeat must be >= 1.")
+    if args.grasp_hold_steps < 0:
+        raise ValueError("--grasp_hold_steps must be >= 0.")
+    if args.gripper_close_margin < 0.0:
+        raise ValueError("--gripper_close_margin must be >= 0.")
+    if args.hold_steps < 0:
+        raise ValueError("--hold_steps must be >= 0.")
+    if args.min_physical_gpu_free_mb < 0.0:
+        raise ValueError("--min_physical_gpu_free_mb must be >= 0.")
+
+
+def simulation_requires_cuda(args: argparse.Namespace) -> bool:
+    """Return whether the selected simulation settings require CUDA."""
+    if str(args.device).startswith("cuda"):
+        return True
+    return args.renderer in ("hybrid", "fast-rt", "rt")
+
+
+def expand_planner_selection(planner: str) -> list[str]:
+    """Expand planner aliases into concrete planner names."""
+    if planner == "all":
+        return ["ik_interpolate", "neural", "neural_refine"]
+    return [planner]
+
+
+def expand_mode_selection(mode: str) -> list[str]:
+    """Expand mode aliases into concrete benchmark modes."""
+    if mode == "both":
+        return ["planner", "physical"]
+    return [mode]
+
+
+def free_gpu_memory_mb(gpu_id: int) -> float | None:
+    """Return free GPU memory in MiB, or None when CUDA is unavailable."""
+    if not torch.cuda.is_available():
+        return None
+    free_bytes, _ = torch.cuda.mem_get_info(int(gpu_id))
+    return free_bytes / 1024**2
+
+
+def physical_gpu_memory_skip_reason(
+    args: argparse.Namespace,
+    modes: list[str],
+) -> str | None:
+    """Return a graceful-skip reason when physical replay is likely to OOM."""
+    if "physical" not in modes or args.min_physical_gpu_free_mb <= 0.0:
+        return None
+    free_mb = free_gpu_memory_mb(args.gpu_id)
+    if free_mb is None or free_mb >= args.min_physical_gpu_free_mb:
+        return None
+    return (
+        "Physical replay was skipped before SimulationManager initialization "
+        f"because free GPU memory on gpu_id={args.gpu_id} is {free_mb:.1f} MiB, "
+        f"below --min_physical_gpu_free_mb={args.min_physical_gpu_free_mb:.1f} MiB. "
+        "DexSim/Vulkan may otherwise terminate the process before Python can write "
+        "benchmark reports."
+    )
+
+
+def _franka_tcp(
+    tcp_z: float = DEFAULT_FRANKA_TCP_Z,
+    tcp_yaw: float = DEFAULT_FRANKA_TCP_YAW,
+) -> list[list[float]]:
+    """Return the Franka TCP convention used by the existing NMG comparator."""
+    c = math.cos(float(tcp_yaw))
+    s = math.sin(float(tcp_yaw))
+    return [
+        [c, -s, 0.0, 0.0],
+        [s, c, 0.0, 0.0],
+        [0.0, 0.0, 1.0, float(tcp_z)],
+        [0.0, 0.0, 0.0, 1.0],
+    ]
+
+
+def resolve_object_body_scale(
+    object_name: str,
+    object_scale: tuple[float, float, float] | list[float] | None = None,
+) -> tuple[float, float, float]:
+    """Return the benchmark scale for an object preset or CLI override."""
+    if object_scale is not None:
+        if len(object_scale) != 3:
+            raise ValueError(f"object_scale must have 3 values, got {object_scale}")
+        scale = tuple(float(v) for v in object_scale)
+    else:
+        scale = tuple(float(v) for v in OBJECT_PRESETS[object_name]["body_scale"])
+    if any(v <= 0.0 for v in scale):
+        raise ValueError(f"object scale values must be > 0, got {scale}")
+    return scale
+
+
+def object_body_scale(object_name: str) -> tuple[float, float, float]:
+    """Return the fixed benchmark scale for an object preset."""
+    return resolve_object_body_scale(object_name)
+
+
+def object_supported_z(
+    object_name: str,
+    object_scale: tuple[float, float, float] | list[float] | None = None,
+) -> float:
+    """Return the object origin z that places its scaled bottom on the table."""
+    preset = OBJECT_PRESETS[object_name]
+    scale = resolve_object_body_scale(object_name, object_scale)
+    if preset.get("shape") == "cube":
+        size_z = float(preset["size"][2]) * scale[2]
+        return TABLE_TOP_Z + size_z / 2.0 + OBJECT_SUPPORT_MARGIN
+    mesh_min_z = float(preset.get("mesh_min_z", 0.0))
+    return TABLE_TOP_Z - mesh_min_z * scale[2] + OBJECT_SUPPORT_MARGIN
+
+
+def object_initial_position(
+    object_name: str,
+    object_scale: tuple[float, float, float] | list[float] | None = None,
+) -> tuple[float, float, float]:
+    """Return the benchmark object position with support-plane z compensation."""
+    pos = OBJECT_PRESETS[object_name]["init_pos"]
+    return (float(pos[0]), float(pos[1]), object_supported_z(object_name, object_scale))
+
+
+def object_initial_pose(
+    object_name: str,
+    device: torch.device,
+    object_scale: tuple[float, float, float] | list[float] | None = None,
+) -> torch.Tensor:
+    """Return the initial object pose as an unbatched 7D wxyz pose tensor."""
+    preset = OBJECT_PRESETS[object_name]
+    pos = object_initial_position(object_name, object_scale)
+    yaw = math.radians(float(preset["init_rot"][2]))
+    return torch.tensor(
+        [
+            pos[0],
+            pos[1],
+            pos[2],
+            math.cos(yaw / 2.0),
+            0.0,
+            0.0,
+            math.sin(yaw / 2.0),
+        ],
+        dtype=torch.float32,
+        device=device,
+    )
+
+
+def resolve_approach_direction(
+    args: argparse.Namespace, device: torch.device
+) -> torch.Tensor:
+    """Resolve and normalize the pick approach direction."""
+    if args.approach == "custom":
+        if args.custom_approach_direction is None:
+            raise ValueError(
+                "--custom_approach_direction is required when --approach custom."
+            )
+        direction = args.custom_approach_direction
+    else:
+        direction = APPROACH_DIRECTIONS[args.approach]
+
+    approach_direction = torch.tensor(direction, dtype=torch.float32, device=device)
+    norm = torch.linalg.norm(approach_direction)
+    if norm < 1e-6:
+        raise ValueError("approach_direction must be non-zero.")
+    return approach_direction / norm
+
+
+def resolve_hand_open_close_qpos(
+    limits: torch.Tensor,
+    *,
+    open_qpos: float = 0.04,
+    close_qpos: float = 0.0,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Clamp scalar Panda finger qpos values to per-joint limits."""
+    limits = limits.to(dtype=torch.float32)
+    lower = limits[:, 0]
+    upper = limits[:, 1]
+    hand_open = torch.full_like(lower, float(open_qpos)).clamp(min=lower, max=upper)
+    hand_close = torch.full_like(lower, float(close_qpos)).clamp(min=lower, max=upper)
+    return hand_open, hand_close
+
+
+def resolve_adaptive_hand_close_qpos(
+    limits: torch.Tensor,
+    *,
+    object_width_m: float,
+    margin_m: float,
+    franka_urdf_path: str | None = None,
+) -> torch.Tensor:
+    """Return a Franka close target that fits the object without over-closing."""
+    limits = limits.to(dtype=torch.float32)
+    lower = limits[:, 0]
+    upper = limits[:, 1]
+    target_total_opening = max(float(object_width_m) - float(margin_m), 0.0)
+    if franka_urdf_path is not None:
+        spec = load_franka_hand_opening_spec(franka_urdf_path)
+        closed_opening = float(
+            franka_hand_opening_from_finger_qpos(
+                torch.zeros(2, dtype=torch.float32),
+                spec,
+            ).item()
+        )
+        axis_gap = abs(spec.left_axis_y) + abs(spec.right_axis_y)
+        per_finger = (target_total_opening - closed_opening) / max(axis_gap, 1e-6)
+    else:
+        per_finger = target_total_opening / float(limits.shape[0])
+    return torch.full_like(lower, per_finger).clamp(min=lower, max=upper)
+
+
+def get_hand_open_close_qpos(
+    robot: Robot,
+    preflight: GraspPreflight | None = None,
+    *,
+    close_margin_m: float = DEFAULT_GRIPPER_CLOSE_MARGIN,
+    franka_urdf_path: str | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Return Franka hand open and adaptive close qpos values."""
+    limits = robot.get_qpos_limits(name=HAND_NAME)[0].to(
+        device=robot.device, dtype=torch.float32
+    )
+    hand_open, hand_close = resolve_hand_open_close_qpos(limits)
+    if preflight is not None and preflight.object_grasp_width_m > 0.0:
+        hand_close = resolve_adaptive_hand_close_qpos(
+            limits,
+            object_width_m=preflight.object_grasp_width_m,
+            margin_m=close_margin_m,
+            franka_urdf_path=franka_urdf_path,
+        ).to(device=robot.device)
+    return hand_open, hand_close
+
+
+def _finger_joint_spec(urdf_root: ET.Element, joint_name: str) -> tuple[float, float]:
+    """Return origin-y and axis-y for a Franka finger joint."""
+    joint = urdf_root.find(f".//joint[@name='{joint_name}']")
+    if joint is None:
+        raise ValueError(f"Franka hand joint not found in URDF: {joint_name}")
+    origin = joint.find("origin")
+    axis = joint.find("axis")
+    if origin is None or axis is None:
+        raise ValueError(f"Franka hand joint is missing origin or axis: {joint_name}")
+    origin_xyz = [float(v) for v in origin.attrib.get("xyz", "0 0 0").split()]
+    axis_xyz = [float(v) for v in axis.attrib.get("xyz", "0 0 0").split()]
+    if len(origin_xyz) != 3 or len(axis_xyz) != 3:
+        raise ValueError(f"Invalid Franka finger joint fields: {joint_name}")
+    return origin_xyz[1], axis_xyz[1]
+
+
+def _mesh_filename(urdf_root: ET.Element, link_name: str) -> str:
+    """Return the first collision mesh filename for a URDF link."""
+    link = urdf_root.find(f".//link[@name='{link_name}']")
+    if link is None:
+        raise ValueError(f"Franka link not found in URDF: {link_name}")
+    mesh = link.find("./collision/geometry/mesh")
+    if mesh is None or "filename" not in mesh.attrib:
+        raise ValueError(f"Franka link has no collision mesh: {link_name}")
+    return mesh.attrib["filename"]
+
+
+def _binary_stl_bounds(path: Path) -> tuple[list[float], list[float]]:
+    """Return axis-aligned bounds for a binary STL file."""
+    data = path.read_bytes()
+    if len(data) < 84:
+        raise ValueError(f"Invalid binary STL file: {path}")
+    n_triangles = struct.unpack_from("<I", data, 80)[0]
+    expected_len = 84 + n_triangles * 50
+    if len(data) < expected_len:
+        raise ValueError(f"Truncated binary STL file: {path}")
+
+    mins = [math.inf, math.inf, math.inf]
+    maxs = [-math.inf, -math.inf, -math.inf]
+    offset = 84
+    for _ in range(n_triangles):
+        # normal(12 bytes), 3 vertices(36 bytes), attribute count(2 bytes)
+        vertices = struct.unpack_from("<9f", data, offset + 12)
+        offset += 50
+        for axis in range(3):
+            coords = vertices[axis::3]
+            mins[axis] = min(mins[axis], *coords)
+            maxs[axis] = max(maxs[axis], *coords)
+    return mins, maxs
+
+
+def _finger_inner_y_from_mesh(
+    urdf_path: str,
+    urdf_root: ET.Element,
+    *,
+    link_name: str,
+) -> float:
+    """Estimate the inner finger collision surface y in the link frame."""
+    mesh_path = Path(urdf_path).parent / _mesh_filename(urdf_root, link_name)
+    mins, maxs = _binary_stl_bounds(mesh_path)
+    if link_name == "leftfinger":
+        return mins[1]
+    if link_name == "rightfinger":
+        # The right finger collision mesh is rotated by pi around z. Its mesh
+        # minimum-y face maps to positive link-y, which is the inner side.
+        return -mins[1]
+    raise ValueError(f"Unsupported Franka finger link: {link_name}")
+
+
+def load_franka_hand_opening_spec(urdf_path: str) -> FrankaHandOpeningSpec:
+    """Load Franka finger joint geometry needed for jaw opening estimates."""
+    root = ET.parse(urdf_path).getroot()
+    left_origin_y, left_axis_y = _finger_joint_spec(root, "finger_joint1")
+    right_origin_y, right_axis_y = _finger_joint_spec(root, "finger_joint2")
+    return FrankaHandOpeningSpec(
+        left_origin_y=left_origin_y,
+        left_axis_y=left_axis_y,
+        left_inner_y=_finger_inner_y_from_mesh(
+            urdf_path,
+            root,
+            link_name="leftfinger",
+        ),
+        right_origin_y=right_origin_y,
+        right_axis_y=right_axis_y,
+        right_inner_y=_finger_inner_y_from_mesh(
+            urdf_path,
+            root,
+            link_name="rightfinger",
+        ),
+    )
+
+
+def franka_hand_opening_from_finger_qpos(
+    finger_qpos: torch.Tensor,
+    spec: FrankaHandOpeningSpec,
+) -> torch.Tensor:
+    """Return physical jaw opening from Franka finger qpos values."""
+    if finger_qpos.dim() == 1:
+        finger_qpos = finger_qpos.unsqueeze(0)
+    if finger_qpos.shape[-1] != 2:
+        raise ValueError(f"finger_qpos must end with 2 values, got {finger_qpos.shape}")
+    left_y = spec.left_origin_y + spec.left_axis_y * finger_qpos[..., 0]
+    left_y = left_y + spec.left_inner_y
+    right_y = spec.right_origin_y + spec.right_axis_y * finger_qpos[..., 1]
+    right_y = right_y + spec.right_inner_y
+    return torch.abs(left_y - right_y)
+
+
+def estimate_gripper_opening_range(
+    urdf_path: str, hand_limits: torch.Tensor
+) -> tuple[float, float]:
+    """Estimate min and max Franka finger opening from joint travel.
+
+    Panda finger joints are prismatic and move symmetrically. The physical
+    opening is the distance between the two child-link joint origins after
+    applying each prismatic displacement.
+    """
+    spec = load_franka_hand_opening_spec(urdf_path)
+    limits = torch.as_tensor(hand_limits, dtype=torch.float32).detach().cpu()
+    if limits.shape != (2, 2):
+        raise ValueError(
+            f"Franka hand_limits must have shape (2, 2), got {limits.shape}"
+        )
+
+    openings = []
+    for left_q in (float(limits[0, 0]), float(limits[0, 1])):
+        for right_q in (float(limits[1, 0]), float(limits[1, 1])):
+            opening = franka_hand_opening_from_finger_qpos(
+                torch.tensor([left_q, right_q], dtype=torch.float32),
+                spec,
+            )
+            openings.append(float(opening.item()))
+    return min(openings), max(openings)
+
+
+def estimate_object_grasp_width(vertices: torch.Tensor) -> float:
+    """Estimate the narrowest horizontal grasp width from scaled object vertices."""
+    verts = torch.as_tensor(vertices, dtype=torch.float32)
+    if verts.numel() == 0:
+        return 0.0
+    if verts.dim() != 2 or verts.shape[1] < 3:
+        raise ValueError(f"vertices must have shape (N, 3), got {verts.shape}")
+    extents = verts[:, :2].max(dim=0).values - verts[:, :2].min(dim=0).values
+    positive = extents[extents > 1e-6]
+    if positive.numel() == 0:
+        return 0.0
+    return float(torch.min(positive).item())
+
+
+def horizontal_bbox_axis(vertices: torch.Tensor, axis_preference: str) -> torch.Tensor:
+    """Return the world horizontal short or long bbox axis for object vertices."""
+    if axis_preference not in ("short", "long"):
+        raise ValueError(f"Unsupported axis_preference: {axis_preference}")
+    verts = torch.as_tensor(vertices, dtype=torch.float32)
+    if verts.dim() != 2 or verts.shape[1] < 3:
+        raise ValueError(f"vertices must have shape (N, 3), got {verts.shape}")
+    extents = verts[:, :2].max(dim=0).values - verts[:, :2].min(dim=0).values
+    axis_idx = int(torch.argmin(extents).item())
+    if axis_preference == "long":
+        axis_idx = 1 - axis_idx
+    axis = torch.zeros(3, dtype=torch.float32, device=verts.device)
+    axis[axis_idx] = 1.0
+    return axis
+
+
+def grasp_axis_alignment_cost(
+    candidate_axis: torch.Tensor, preferred_axis: torch.Tensor
+) -> torch.Tensor:
+    """Return 0 for candidate axes aligned with the preferred object axis."""
+    candidate_axis = torch.as_tensor(candidate_axis, dtype=torch.float32)
+    preferred_axis = torch.as_tensor(
+        preferred_axis,
+        dtype=torch.float32,
+        device=candidate_axis.device,
+    )
+    preferred_axis = preferred_axis / torch.clamp(
+        torch.linalg.norm(preferred_axis),
+        min=1e-6,
+    )
+    candidate_axis = candidate_axis / torch.clamp(
+        torch.linalg.norm(candidate_axis, dim=-1, keepdim=True), min=1e-6
+    )
+    return 1.0 - torch.abs((candidate_axis * preferred_axis).sum(dim=-1))
+
+
+def rerank_grasp_costs_by_axis(
+    costs: torch.Tensor,
+    grasp_poses: torch.Tensor,
+    preferred_axis: torch.Tensor,
+    *,
+    weight: float = 2.0,
+    axis_index: int = FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
+) -> torch.Tensor:
+    """Add an axis-alignment penalty to grasp candidate costs."""
+    if grasp_poses.dim() != 3 or grasp_poses.shape[-2:] != (4, 4):
+        raise ValueError(
+            f"grasp_poses must have shape (N, 4, 4), got {grasp_poses.shape}"
+        )
+    if axis_index not in (0, 1, 2):
+        raise ValueError(f"axis_index must be 0, 1, or 2, got {axis_index}")
+    costs = torch.as_tensor(costs, dtype=torch.float32, device=grasp_poses.device)
+    return costs + float(weight) * grasp_axis_alignment_cost(
+        grasp_poses[:, :3, axis_index],
+        preferred_axis,
+    )
+
+
+def evaluate_grasp_preflight(
+    *,
+    object_grasp_width_m: float,
+    gripper_min_opening_m: float,
+    gripper_max_opening_m: float,
+    margin_m: float = 0.002,
+) -> GraspPreflight:
+    """Check whether the object width fits the gripper opening range."""
+    if object_grasp_width_m <= 0.0:
+        return GraspPreflight(
+            status="unknown_width",
+            object_grasp_width_m=object_grasp_width_m,
+            gripper_min_opening_m=gripper_min_opening_m,
+            gripper_max_opening_m=gripper_max_opening_m,
+            reason="object_grasp_width_m is not positive",
+        )
+    if object_grasp_width_m < gripper_min_opening_m - margin_m:
+        return GraspPreflight(
+            status="too_small",
+            object_grasp_width_m=object_grasp_width_m,
+            gripper_min_opening_m=gripper_min_opening_m,
+            gripper_max_opening_m=gripper_max_opening_m,
+            reason="object narrower than gripper minimum opening",
+        )
+    if object_grasp_width_m > gripper_max_opening_m + margin_m:
+        return GraspPreflight(
+            status="too_large",
+            object_grasp_width_m=object_grasp_width_m,
+            gripper_min_opening_m=gripper_min_opening_m,
+            gripper_max_opening_m=gripper_max_opening_m,
+            reason="object wider than gripper maximum opening",
+        )
+    return GraspPreflight(
+        status="ok",
+        object_grasp_width_m=object_grasp_width_m,
+        gripper_min_opening_m=gripper_min_opening_m,
+        gripper_max_opening_m=gripper_max_opening_m,
+        reason="object width lies inside gripper opening range",
+    )
+
+
+def build_grasp_preflight(
+    obj: RigidObject,
+    robot: Robot,
+    *,
+    franka_urdf_path: str,
+) -> GraspPreflight:
+    """Build a preflight result from live object vertices and robot limits."""
+    vertices = obj.get_vertices(env_ids=[0], scale=True)[0]
+    hand_limits = robot.get_qpos_limits(name=HAND_NAME)[0]
+    min_opening, max_opening = estimate_gripper_opening_range(
+        franka_urdf_path, hand_limits
+    )
+    return evaluate_grasp_preflight(
+        object_grasp_width_m=estimate_object_grasp_width(vertices),
+        gripper_min_opening_m=min_opening,
+        gripper_max_opening_m=max_opening,
+    )
+
+
+def make_sim(args: argparse.Namespace) -> SimulationManager:
+    """Create a simulation manager for benchmark trials."""
+    sim = SimulationManager(
+        SimulationManagerCfg(
+            width=1280,
+            height=720,
+            headless=args.headless,
+            sim_device=args.device,
+            gpu_id=args.gpu_id,
+            render_cfg=RenderCfg(renderer=args.renderer),
+            physics_dt=1.0 / 100.0,
+            num_envs=1,
+            arena_space=args.arena_space,
+        )
+    )
+    sim.add_light(
+        cfg=LightCfg(
+            uid="main_light",
+            color=(0.6, 0.6, 0.6),
+            intensity=30.0,
+            init_pos=(1.0, 0.0, 3.0),
+        )
+    )
+    return sim
+
+
+def create_franka(
+    sim: SimulationManager,
+    *,
+    tcp_z: float = DEFAULT_FRANKA_TCP_Z,
+    tcp_yaw: float = DEFAULT_FRANKA_TCP_YAW,
+) -> tuple[Robot, str]:
+    """Create the Franka PandaWithHand robot from the original asset."""
+    urdf = get_data_path("Franka/Panda/PandaWithHand.urdf")
+    if not os.path.isfile(urdf):
+        raise FileNotFoundError(f"Franka URDF not found: {urdf}")
+
+    cfg = RobotCfg(
+        uid=ROBOT_UID,
+        fpath=urdf,
+        drive_pros=JointDrivePropertiesCfg(
+            stiffness={"Joint[1-7]": 1e4, "finger_joint[1-2]": 1e3},
+            damping={"Joint[1-7]": 1e3, "finger_joint[1-2]": 1e2},
+            max_effort={"Joint[1-7]": 1e5, "finger_joint[1-2]": 1e3},
+            drive_type="force",
+        ),
+        control_parts={
+            ARM_NAME: [
+                "Joint1",
+                "Joint2",
+                "Joint3",
+                "Joint4",
+                "Joint5",
+                "Joint6",
+                "Joint7",
+            ],
+            HAND_NAME: ["finger_joint1", "finger_joint2"],
+        },
+        solver_cfg={
+            ARM_NAME: PytorchSolverCfg(
+                end_link_name="ee_link",
+                root_link_name="base_link",
+                tcp=_franka_tcp(tcp_z, tcp_yaw),
+                num_samples=30,
+            ),
+        },
+        init_qpos=[*FRANKA_START_QPOS, 0.04, 0.04],
+    )
+    return sim.add_robot(cfg=cfg), urdf
+
+
+def create_support_table(sim: SimulationManager) -> RigidObject:
+    """Create a stable static support collider for the benchmark object."""
+    table_cfg = RigidObjectCfg(
+        uid=TABLE_COLLIDER_UID,
+        shape=CubeCfg(size=list(TABLE_COLLIDER_SIZE)),
+        attrs=RigidBodyAttributesCfg(
+            mass=10.0,
+            dynamic_friction=0.97,
+            static_friction=0.99,
+        ),
+        body_type="static",
+        init_pos=TABLE_COLLIDER_INIT_POS,
+    )
+    table = sim.add_rigid_object(cfg=table_cfg)
+    if hasattr(table, "set_visible"):
+        table.set_visible(False)
+    return table
+
+
+def create_visual_support_table(sim: SimulationManager) -> RigidObject:
+    """Create the visual table mesh below the hidden support collider."""
+    table_cfg = RigidObjectCfg(
+        uid=TABLE_UID,
+        shape=MeshCfg(fpath=get_data_path(TABLE_MESH_PATH)),
+        attrs=RigidBodyAttributesCfg(
+            mass=10.0,
+            dynamic_friction=0.97,
+            static_friction=0.99,
+        ),
+        body_scale=TABLE_SCALE,
+        body_type="static",
+        init_pos=TABLE_INIT_POS,
+    )
+    return sim.add_rigid_object(cfg=table_cfg)
+
+
+def create_object(
+    sim: SimulationManager,
+    object_name: str,
+    object_scale: tuple[float, float, float] | list[float] | None = None,
+) -> RigidObject:
+    """Create the benchmark object."""
+    preset = OBJECT_PRESETS[object_name]
+    scale = resolve_object_body_scale(object_name, object_scale)
+    shape = (
+        CubeCfg(size=list(preset["size"]))
+        if preset.get("shape") == "cube"
+        else MeshCfg(fpath=get_data_path(preset["mesh_path"]))
+    )
+    cfg = RigidObjectCfg(
+        uid=preset["label"],
+        shape=shape,
+        attrs=RigidBodyAttributesCfg(
+            mass=preset["mass"],
+            dynamic_friction=0.97,
+            static_friction=0.99,
+            enable_ccd=True,
+            contact_offset=0.004,
+        ),
+        max_convex_hull_num=preset["max_convex_hull_num"],
+        init_pos=object_initial_position(object_name, scale),
+        init_rot=preset["init_rot"],
+        body_scale=scale,
+        use_usd_properties=preset["use_usd_properties"],
+    )
+    obj = sim.add_rigid_object(cfg=cfg)
+    sim.update(step=20)
+    return obj
+
+
+def reset_object_pose(
+    obj: RigidObject,
+    object_name: str,
+    object_scale: tuple[float, float, float] | list[float] | None = None,
+) -> torch.Tensor:
+    """Reset the benchmark object to its initial pose and return it as a matrix."""
+    pose = object_initial_pose(object_name, obj.device, object_scale).unsqueeze(0)
+    obj.set_local_pose(pose, env_ids=[0])
+    obj.clear_dynamics(env_ids=[0])
+    return obj.get_local_pose(to_matrix=True)[0].clone()
+
+
+def make_arm_qpos(values: tuple[float, ...], device: torch.device) -> torch.Tensor:
+    """Create a float32 arm qpos tensor."""
+    return torch.tensor(values, dtype=torch.float32, device=device)
+
+
+def set_robot_start_qpos(robot: Robot, hand_open: torch.Tensor) -> None:
+    """Reset arm and hand joints to the benchmark start configuration."""
+    start_arm = make_arm_qpos(FRANKA_START_QPOS, robot.device).unsqueeze(0)
+    hand_open = hand_open.unsqueeze(0)
+    for target in (False, True):
+        robot.set_qpos(start_arm, name=ARM_NAME, target=target)
+        robot.set_qpos(hand_open, name=HAND_NAME, target=target)
+    robot.clear_dynamics()
+
+
+def compute_fk_pose(robot: Robot, qpos_values: tuple[float, ...]) -> torch.Tensor:
+    """Compute an unbatched TCP pose from a known arm qpos."""
+    return robot.compute_fk(
+        qpos=make_arm_qpos(qpos_values, robot.device).unsqueeze(0),
+        name=ARM_NAME,
+        to_matrix=True,
+    )[0]
+
+
+def make_rest_pose(robot: Robot) -> torch.Tensor:
+    """Return a reachable final rest EEF pose generated through Franka FK."""
+    return compute_fk_pose(robot, FRANKA_REST_QPOS)
+
+
+def make_object_target_pose(obj: RigidObject) -> torch.Tensor:
+    """Return a desired object placement pose derived from the live object pose."""
+    target = obj.get_local_pose(to_matrix=True)[0].clone()
+    target[:3, 3] += torch.tensor(
+        [-0.10, 0.24, 0.0],
+        dtype=torch.float32,
+        device=target.device,
+    )
+    return target
+
+
+def make_pre_pick_eef_pose(robot: Robot, position: torch.Tensor) -> torch.Tensor:
+    """Return a pre-pick TCP pose using the current TCP orientation."""
+    pose = robot.compute_fk(
+        qpos=robot.get_qpos(name=ARM_NAME),
+        name=ARM_NAME,
+        to_matrix=True,
+    ).clone()
+    pose[:, :3, 3] = position
+    return pose
+
+
+def initialize_pre_pick_robot_pose(
+    robot: Robot,
+    obj: RigidObject,
+    hand_open: torch.Tensor,
+    *,
+    pre_pick_z: float,
+    reference_pose: torch.Tensor | None = None,
+) -> None:
+    """Move Franka to a pre-pick pose above the object."""
+    obj_pose = (
+        obj.get_local_pose(to_matrix=True)
+        if reference_pose is None
+        else reference_pose.to(device=robot.device, dtype=torch.float32).unsqueeze(0)
+    )
+    move_position = obj_pose[:, :3, 3].clone()
+    move_position[:, 2] = pre_pick_z
+    pre_pick_pose = make_pre_pick_eef_pose(robot, move_position)
+    ik_success, arm_qpos = robot.compute_ik(
+        pose=pre_pick_pose,
+        joint_seed=robot.get_qpos(name=ARM_NAME),
+        name=ARM_NAME,
+    )
+    if not torch.all(ik_success):
+        obj_xyz = obj_pose[0, :3, 3].detach().cpu().tolist()
+        target_xyz = move_position[0].detach().cpu().tolist()
+        raise RuntimeError(
+            "Failed to initialize Franka at the pre-pick pose "
+            f"(object_xyz={obj_xyz}, target_xyz={target_xyz})."
+        )
+
+    n_envs = robot.get_qpos().shape[0]
+    hand_qpos = hand_open.unsqueeze(0).repeat(n_envs, 1)
+    for target in (False, True):
+        robot.set_qpos(arm_qpos, name=ARM_NAME, target=target)
+        robot.set_qpos(hand_qpos, name=HAND_NAME, target=target)
+    robot.clear_dynamics()
+
+
+def build_gripper_collision_cfg(preflight: GraspPreflight) -> GripperCollisionCfg:
+    """Build collision geometry approximating the original Franka hand."""
+    return GripperCollisionCfg(
+        max_open_length=preflight.gripper_max_opening_m,
+        finger_length=FRANKA_FINGER_LENGTH,
+        y_thickness=FRANKA_Y_THICKNESS,
+        root_z_width=FRANKA_ROOT_Z_WIDTH,
+        open_check_margin=0.002,
+        point_sample_dense=0.012,
+    )
+
+
+def build_grasp_generator_cfg(
+    args: argparse.Namespace, preflight: GraspPreflight
+) -> GraspGeneratorCfg:
+    """Build grasp annotation config."""
+    return GraspGeneratorCfg(
+        viser_port=11801,
+        antipodal_sampler_cfg=AntipodalSamplerCfg(
+            n_sample=args.n_sample,
+            max_length=preflight.gripper_max_opening_m,
+            min_length=max(0.003, preflight.gripper_min_opening_m),
+        ),
+        is_partial_annotate=False,
+        is_filter_ground_collision=True,
+        n_deviated_approach_directions=args.n_deviated_approach_directions,
+        n_top_grasps=30,
+    )
+
+
+def create_object_semantics(
+    obj: RigidObject,
+    args: argparse.Namespace,
+    preflight: GraspPreflight,
+) -> ObjectSemantics:
+    """Create grasp semantics for the benchmark object."""
+    label = OBJECT_PRESETS[args.object]["label"]
+    vertices = obj.get_vertices(env_ids=[0], scale=True)[0]
+    triangles = obj.get_triangles(env_ids=[0])[0]
+    return ObjectSemantics(
+        label=label,
+        geometry={
+            "mesh_vertices": vertices,
+            "mesh_triangles": triangles,
+        },
+        affordance=AntipodalAffordance(
+            mesh_vertices=vertices,
+            mesh_triangles=triangles,
+            gripper_collision_cfg=build_gripper_collision_cfg(preflight),
+            generator_cfg=build_grasp_generator_cfg(args, preflight),
+            force_reannotate=args.force_reannotate,
+        ),
+        entity=obj,
+    )
+
+
+def resolve_checkpoint(args: argparse.Namespace, planners: list[str]) -> str | None:
+    """Resolve the NMG checkpoint path if any selected planner needs it."""
+    if not any(planner in ("neural", "neural_refine") for planner in planners):
+        return None
+    if args.neural_checkpoint:
+        return args.neural_checkpoint
+    return download_neural_planner_checkpoint()
+
+
+def build_motion_generator(
+    robot: Robot,
+    planner_name: str,
+    checkpoint_path: str | None,
+    args: argparse.Namespace,
+) -> MotionGenerator:
+    """Create a motion generator for one planner backend."""
+    if planner_name == "ik_interpolate":
+        planner_cfg = ToppraPlannerCfg(robot_uid=robot.uid)
+    elif planner_name in ("neural", "neural_refine"):
+        if checkpoint_path is None:
+            raise ValueError("checkpoint_path is required for neural planners.")
+        planner_cfg = NeuralPlannerCfg(
+            robot_uid=robot.uid,
+            planner_type=planner_name,
+            checkpoint_path=checkpoint_path,
+            control_part=ARM_NAME,
+            num_arm_joints=len(robot.get_joint_ids(ARM_NAME)),
+            pos_eps=args.nmg_pos_success_threshold,
+            rot_eps=args.nmg_rot_success_threshold,
+        )
+    else:
+        raise ValueError(f"Unsupported planner: {planner_name}")
+    return MotionGenerator(cfg=MotionGenCfg(planner_cfg=planner_cfg))
+
+
+class PhysicalPickUp(PickUp):
+    """Benchmark-local PickUp variant with an optional grasp-depth offset."""
+
+    def __init__(
+        self,
+        motion_generator: MotionGenerator,
+        cfg: PickUpCfg,
+        *,
+        grasp_depth_offset: float,
+        grasp_axis: str,
+    ) -> None:
+        super().__init__(motion_generator, cfg=cfg)
+        self.grasp_depth_offset = float(grasp_depth_offset)
+        self.grasp_axis = str(grasp_axis)
+
+    def _resolve_grasp_pose(
+        self,
+        semantics: ObjectSemantics,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.grasp_axis == "auto":
+            is_success, grasp_xpos = super()._resolve_grasp_pose(semantics)
+        else:
+            is_success, grasp_xpos = self._resolve_axis_preferred_grasp_pose(
+                semantics,
+            )
+        if abs(self.grasp_depth_offset) < 1e-9:
+            return is_success, grasp_xpos
+        grasp_xpos = self.builder.apply_local_offset(
+            grasp_xpos,
+            grasp_xpos[:, :3, 2] * self.grasp_depth_offset,
+        )
+        return is_success, grasp_xpos
+
+    def _resolve_axis_preferred_grasp_pose(
+        self,
+        semantics: ObjectSemantics,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        obj_poses = semantics.entity.get_local_pose(to_matrix=True)
+        grasp_poses_result = semantics.affordance.get_valid_grasp_poses(
+            obj_poses=obj_poses,
+            approach_direction=self.approach_direction,
+        )
+        init_qpos = self.robot.get_qpos(name=self.cfg.control_part)
+        is_success_list = []
+        best_grasp_xpos_list = []
+        vertices = semantics.entity.get_vertices(env_ids=[0], scale=True)[0]
+        preferred_axis = horizontal_bbox_axis(vertices, self.grasp_axis)
+        for env_id, (candidate_xpos, candidate_costs) in enumerate(grasp_poses_result):
+            if candidate_xpos.shape == (4, 4):
+                candidate_xpos = candidate_xpos.unsqueeze(0)
+            if candidate_costs.dim() == 0:
+                candidate_costs = candidate_costs.unsqueeze(0)
+            candidate_xpos = candidate_xpos.to(self.device)
+            candidate_costs = candidate_costs.to(self.device)
+            joint_seed = init_qpos[env_id : env_id + 1, None, :].repeat(
+                1, candidate_xpos.shape[0], 1
+            )
+            ik_success, _ = self.robot.compute_batch_ik(
+                pose=candidate_xpos.unsqueeze(0),
+                name=self.cfg.control_part,
+                joint_seed=joint_seed,
+            )
+            reranked_costs = rerank_grasp_costs_by_axis(
+                candidate_costs,
+                candidate_xpos,
+                preferred_axis.to(candidate_xpos.device),
+                axis_index=FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
+            )
+            masked_costs = torch.where(
+                ik_success[0],
+                reranked_costs,
+                torch.full_like(reranked_costs, 10000.0),
+            )
+            best_cost, best_idx = masked_costs.min(dim=0)
+            is_success_list.append(best_cost < 9999.0)
+            best_grasp_xpos_list.append(candidate_xpos[best_idx].unsqueeze(0))
+        is_success = torch.stack(is_success_list).to(device=self.device)
+        best_grasp_xpos = torch.cat(best_grasp_xpos_list, dim=0).to(self.device)
+        return is_success, best_grasp_xpos
+
+
+def build_engine(
+    robot: Robot,
+    motion_gen: MotionGenerator,
+    args: argparse.Namespace,
+    preflight: GraspPreflight | None = None,
+    franka_urdf_path: str | None = None,
+) -> AtomicActionEngine:
+    """Register the complete benchmark action sequence."""
+    hand_open, hand_close = get_hand_open_close_qpos(
+        robot,
+        preflight,
+        close_margin_m=args.gripper_close_margin,
+        franka_urdf_path=franka_urdf_path,
+    )
+    pickup_cfg = PickUpCfg(
+        control_part=ARM_NAME,
+        hand_control_part=HAND_NAME,
+        hand_open_qpos=hand_open,
+        hand_close_qpos=hand_close,
+        approach_direction=resolve_approach_direction(args, robot.device),
+        pre_grasp_distance=0.12,
+        lift_height=0.14,
+        sample_interval=PICK_SAMPLE_INTERVAL,
+        hand_interp_steps=HAND_INTERP_STEPS,
+    )
+    place_cfg = PlaceCfg(
+        control_part=ARM_NAME,
+        hand_control_part=HAND_NAME,
+        hand_open_qpos=hand_open,
+        hand_close_qpos=hand_close,
+        lift_height=0.14,
+        sample_interval=PLACE_SAMPLE_INTERVAL,
+        hand_interp_steps=HAND_INTERP_STEPS,
+    )
+    move_cfg = MoveEndEffectorCfg(
+        control_part=ARM_NAME,
+        sample_interval=MOVE_SAMPLE_INTERVAL,
+    )
+
+    engine = AtomicActionEngine(motion_generator=motion_gen)
+    engine.register(
+        PhysicalPickUp(
+            motion_gen,
+            cfg=pickup_cfg,
+            grasp_depth_offset=args.grasp_depth_offset,
+            grasp_axis=args.grasp_axis,
+        )
+    )
+    engine.register(Place(motion_gen, cfg=place_cfg))
+    engine.register(MoveEndEffector(motion_gen, cfg=move_cfg))
+    return engine
+
+
+def plan_sequence(
+    engine: AtomicActionEngine,
+    semantics: ObjectSemantics,
+    object_target_pose: torch.Tensor,
+    rest_pose: torch.Tensor,
+) -> PlanOutcome:
+    """Plan the full PickUp -> Place -> MoveEndEffector sequence."""
+    pick_success, pick_traj, pick_state = engine.run(
+        steps=[("pick_up", GraspTarget(semantics=semantics))]
+    )
+    if not pick_success or pick_state.held_object is None:
+        return PlanOutcome(
+            success=False,
+            trajectory=pick_traj,
+            held_object=pick_state.held_object,
+            place_eef_pose=None,
+            rest_eef_pose=rest_pose,
+        )
+
+    object_to_eef = pick_state.held_object.object_to_eef.to(
+        device=object_target_pose.device, dtype=torch.float32
+    )
+    if object_to_eef.shape == (4, 4):
+        object_to_eef = object_to_eef.unsqueeze(0)
+    place_pose = torch.bmm(object_target_pose.unsqueeze(0), object_to_eef)[0]
+
+    finish_success, finish_traj, _ = engine.run(
+        steps=[
+            ("place", EndEffectorPoseTarget(xpos=place_pose)),
+            ("move_end_effector", EndEffectorPoseTarget(xpos=rest_pose)),
+        ],
+        state=pick_state,
+    )
+    trajectory = torch.cat([pick_traj, finish_traj], dim=1)
+    return PlanOutcome(
+        success=finish_success,
+        trajectory=trajectory,
+        held_object=pick_state.held_object,
+        place_eef_pose=place_pose,
+        rest_eef_pose=rest_pose,
+    )
+
+
+def _sync_cuda() -> None:
+    """Synchronize CUDA stream when available."""
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+
+def _reset_peak_gpu_memory() -> None:
+    """Reset PyTorch peak GPU memory stats when CUDA is available."""
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+
+
+def _peak_gpu_memory_mb() -> float:
+    """Return peak GPU memory allocated by PyTorch in MB."""
+    if not torch.cuda.is_available():
+        return 0.0
+    return torch.cuda.max_memory_allocated() / 1024**2
+
+
+def _memory_snapshot() -> dict[str, float]:
+    """Return current process memory usage in MB."""
+    process = psutil.Process(os.getpid())
+    cpu_mb = process.memory_info().rss / 1024**2
+    gpu_mb = (
+        torch.cuda.memory_allocated() / 1024**2 if torch.cuda.is_available() else 0.0
+    )
+    return {"cpu_mb": cpu_mb, "gpu_mb": gpu_mb}
+
+
+def timed_plan_sequence(
+    engine: AtomicActionEngine,
+    semantics: ObjectSemantics,
+    object_target_pose: torch.Tensor,
+    rest_pose: torch.Tensor,
+) -> tuple[PlanOutcome, float, dict[str, float], float]:
+    """Plan once and return time, memory delta, and peak GPU memory."""
+    _reset_peak_gpu_memory()
+    before = _memory_snapshot()
+    _sync_cuda()
+    start = time.perf_counter()
+    outcome = plan_sequence(engine, semantics, object_target_pose, rest_pose)
+    _sync_cuda()
+    elapsed = time.perf_counter() - start
+    after = _memory_snapshot()
+    mem_delta = {
+        "cpu_mb": after["cpu_mb"] - before["cpu_mb"],
+        "gpu_mb": after["gpu_mb"] - before["gpu_mb"],
+    }
+    return outcome, elapsed, mem_delta, _peak_gpu_memory_mb()
+
+
+def _set_replay_qpos(robot: Robot, qpos: torch.Tensor, *, mode: str) -> None:
+    """Apply a replay qpos either as a target or as a direct state reset."""
+    if mode == "target":
+        robot.set_qpos(qpos, target=True)
+    elif mode == "direct":
+        robot.set_qpos(qpos, target=False)
+        robot.set_qpos(qpos, target=True)
+    else:
+        raise ValueError(f"Unsupported replay control mode: {mode}")
+
+
+def make_finger_object_contact_sensor(
+    sim: SimulationManager,
+    obj: RigidObject,
+    *,
+    uid_suffix: str,
+) -> ContactSensor:
+    """Create a sensor that only reports Franka finger contacts with the object."""
+    contact_cfg = ContactSensorCfg(
+        uid=f"franka_pick_contact_{uid_suffix}_{time.time_ns()}",
+        rigid_uid_list=[obj.uid],
+        articulation_cfg_list=[
+            ArticulationContactFilterCfg(
+                articulation_uid=ROBOT_UID,
+                link_name_list=list(FRANKA_FINGER_LINK_NAMES),
+            )
+        ],
+        filter_need_both_actor=True,
+        max_contacts_per_env=128,
+    )
+    return sim.add_sensor(sensor_cfg=contact_cfg)
+
+
+def summarize_contact_sensor(contact_sensor: ContactSensor | None) -> ContactStats:
+    """Return current filtered contact count and impulse summary."""
+    if contact_sensor is None:
+        return ContactStats(0, 0.0, 0.0, None)
+    contact_sensor.update()
+    report = contact_sensor.get_data()
+    valid = report["is_valid"][0]
+    count = int(valid.sum().item())
+    if count == 0:
+        return ContactStats(0, 0.0, 0.0, None)
+    impulses = report["impulse"][0][valid]
+    distances = report["distance"][0][valid]
+    return ContactStats(
+        count=count,
+        max_impulse=float(impulses.max().item()),
+        total_impulse=float(impulses.sum().item()),
+        min_distance=float(distances.min().item()),
+    )
+
+
+def _max_optional_float(current: float | None, candidate: float | None) -> float | None:
+    """Return the max of two optional floats."""
+    if candidate is None:
+        return current
+    if current is None:
+        return candidate
+    return max(current, candidate)
+
+
+def _min_optional_float(current: float | None, candidate: float | None) -> float | None:
+    """Return the min of two optional floats."""
+    if candidate is None:
+        return current
+    if current is None:
+        return candidate
+    return min(current, candidate)
+
+
+def _record_contact_stats(
+    stats: ContactStats,
+    *,
+    max_count: int,
+    max_impulse: float | None,
+    max_total_impulse: float | None,
+    min_distance: float | None,
+) -> tuple[int, float | None, float | None, float | None]:
+    """Update aggregate contact diagnostics with one sample."""
+    return (
+        max(max_count, stats.count),
+        _max_optional_float(max_impulse, stats.max_impulse),
+        _max_optional_float(max_total_impulse, stats.total_impulse),
+        _min_optional_float(min_distance, stats.min_distance),
+    )
+
+
+def hand_opening_from_qpos(
+    robot: Robot,
+    qpos: torch.Tensor,
+    spec: FrankaHandOpeningSpec,
+) -> torch.Tensor:
+    """Return physical Franka jaw opening from a full-DoF qpos tensor."""
+    if qpos.dim() == 1:
+        qpos = qpos.unsqueeze(0)
+    hand_joint_ids = robot.get_joint_ids(HAND_NAME)
+    return franka_hand_opening_from_finger_qpos(qpos[:, hand_joint_ids], spec)
+
+
+def current_hand_opening(robot: Robot, spec: FrankaHandOpeningSpec) -> float:
+    """Return the actual current Franka jaw opening in simulation."""
+    opening = franka_hand_opening_from_finger_qpos(
+        robot.get_qpos(name=HAND_NAME)[0],
+        spec,
+    )
+    return float(opening.item())
+
+
+def object_delta_from_tcp(
+    robot: Robot,
+    obj: RigidObject,
+    qpos: torch.Tensor,
+) -> list[float]:
+    """Return object-position minus TCP-position for one full-DoF qpos."""
+    if qpos.dim() == 2:
+        qpos = qpos[0]
+    arm_joint_ids = robot.get_joint_ids(ARM_NAME)
+    tcp_pose = compute_tcp_pose(robot, qpos[arm_joint_ids], control_part=ARM_NAME)
+    obj_pos = obj.get_local_pose(to_matrix=True)[0, :3, 3]
+    return [float(v) for v in (obj_pos - tcp_pose[:3, 3]).detach().cpu().tolist()]
+
+
+def link_delta_from_object(
+    robot: Robot, obj: RigidObject, link_name: str
+) -> list[float]:
+    """Return link-position minus object-position for one live link pose."""
+    link_pos = robot.get_link_pose(link_name, to_matrix=True)[0, :3, 3]
+    obj_pos = obj.get_local_pose(to_matrix=True)[0, :3, 3]
+    return [float(v) for v in (link_pos - obj_pos).detach().cpu().tolist()]
+
+
+def tcp_axes_from_qpos(robot: Robot, qpos: torch.Tensor) -> tuple[list[float], ...]:
+    """Return TCP x/y/z axes for one full-DoF qpos."""
+    if qpos.dim() == 2:
+        qpos = qpos[0]
+    arm_joint_ids = robot.get_joint_ids(ARM_NAME)
+    tcp_pose = compute_tcp_pose(robot, qpos[arm_joint_ids], control_part=ARM_NAME)
+    return tuple(
+        [float(v) for v in tcp_pose[:3, axis].detach().cpu().tolist()]
+        for axis in range(3)
+    )
+
+
+def _find_hand_close_end_index(
+    robot: Robot,
+    traj: torch.Tensor,
+    spec: FrankaHandOpeningSpec,
+) -> int | None:
+    """Return the first index where the gripper reaches its minimum opening."""
+    if traj.numel() == 0:
+        return None
+    opening = hand_opening_from_qpos(robot, traj[0], spec).reshape(-1)
+    min_opening = torch.min(opening)
+    close_indices = torch.nonzero(opening <= min_opening + 1e-4, as_tuple=False)
+    if close_indices.numel() == 0:
+        return None
+    return int(close_indices[0, 0].item())
+
+
+def _find_hand_release_start_index(
+    robot: Robot,
+    traj: torch.Tensor,
+    spec: FrankaHandOpeningSpec,
+) -> int | None:
+    """Return the first index after closing where the gripper opens again."""
+    close_end_idx = _find_hand_close_end_index(robot, traj, spec)
+    if close_end_idx is None:
+        return None
+    opening = hand_opening_from_qpos(robot, traj[0], spec).reshape(-1)
+    min_opening = torch.min(opening)
+    release_indices = torch.nonzero(
+        opening[close_end_idx:] > min_opening + 1e-4,
+        as_tuple=False,
+    )
+    if release_indices.numel() == 0:
+        return None
+    return close_end_idx + int(release_indices[0, 0].item())
+
+
+def compute_attached_object_pose(
+    eef_pose: torch.Tensor,
+    object_to_eef: torch.Tensor,
+) -> torch.Tensor:
+    """Return object pose from an EEF pose and object-to-EEF transform."""
+    eef_pose = _as_unbatched_pose(eef_pose).to(dtype=torch.float32)
+    object_to_eef = _as_unbatched_pose(object_to_eef).to(
+        device=eef_pose.device, dtype=torch.float32
+    )
+    return torch.mm(eef_pose, torch.inverse(object_to_eef))
+
+
+def update_attached_object_pose(
+    robot: Robot,
+    obj: RigidObject,
+    qpos: torch.Tensor,
+    object_to_eef: torch.Tensor | None,
+) -> None:
+    """Move the object with the planned TCP pose in attached replay mode."""
+    if object_to_eef is None:
+        return
+    arm_joint_ids = robot.get_joint_ids(ARM_NAME)
+    eef_pose = compute_tcp_pose(robot, qpos[0, arm_joint_ids], control_part=ARM_NAME)
+    obj_pose = compute_attached_object_pose(eef_pose, object_to_eef)
+    obj.set_local_pose(obj_pose.unsqueeze(0), env_ids=[0])
+    obj.clear_dynamics(env_ids=[0])
+
+
+def save_replay_screenshot(
+    sim: SimulationManager,
+    *,
+    screenshot_dir: str | None,
+    planner: str,
+    mode: str,
+    repeat_id: int,
+    label: str,
+) -> None:
+    """Save a diagnostic screenshot if a screenshot directory is configured."""
+    if not screenshot_dir:
+        return
+    from PIL import Image
+    import numpy as np
+
+    screenshot_path = Path(screenshot_dir)
+    screenshot_path.mkdir(parents=True, exist_ok=True)
+    camera = sim.add_sensor(
+        sensor_cfg=CameraCfg(
+            uid=f"debug_camera_{planner}_{mode}_{repeat_id}_{label}",
+            width=640,
+            height=480,
+            intrinsics=(520.0, 520.0, 320.0, 240.0),
+            extrinsics=CameraCfg.ExtrinsicsCfg(
+                eye=(0.82, -0.72, 0.58),
+                target=(0.24, 0.04, 0.10),
+                up=(0.0, 0.0, 1.0),
+            ),
+            near=0.01,
+            far=4.0,
+            enable_color=True,
+        )
+    )
+    sim.update(step=1)
+    camera.update()
+    color = camera.get_data()["color"][0, :, :, :3].detach().cpu().numpy()
+    out_path = (
+        screenshot_path / f"{SCRIPT_NAME}_{planner}_{mode}_{repeat_id}_{label}.png"
+    )
+    Image.fromarray(np.ascontiguousarray(color)).save(out_path)
+
+
+def replay_trajectory(
+    sim: SimulationManager,
+    robot: Robot,
+    obj: RigidObject,
+    traj: torch.Tensor,
+    object_target_pose: torch.Tensor,
+    args: argparse.Namespace,
+    *,
+    held_object: HeldObjectState | None,
+    planner: str,
+    mode: str,
+    repeat_id: int,
+    hand_opening_spec: FrankaHandOpeningSpec,
+) -> ReplayOutcome:
+    """Replay a planned full-DoF trajectory with real object physics."""
+    if traj.numel() == 0:
+        return ReplayOutcome(
+            0.0,
+            False,
+            False,
+            False,
+            False,
+            None,
+            None,
+            None,
+            None,
+            args.object_replay_mode,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+    start_pose = obj.get_local_pose(to_matrix=True)[0].clone()
+    start_pos = start_pose[:3, 3].clone()
+    max_z = float(start_pos[2].item())
+    start = time.perf_counter()
+    close_end_idx = _find_hand_close_end_index(robot, traj, hand_opening_spec)
+    release_start_idx = _find_hand_release_start_index(
+        robot,
+        traj,
+        hand_opening_spec,
+    )
+    object_to_eef = (
+        None
+        if held_object is None
+        else _as_unbatched_pose(held_object.object_to_eef).to(robot.device)
+    )
+    opening = hand_opening_from_qpos(robot, traj[0], hand_opening_spec).reshape(-1)
+    min_hand_opening = float(torch.min(opening).item())
+    final_hand_opening = float(opening[-1].item())
+    contact_sensor = (
+        None
+        if args.object_replay_mode != "physics"
+        else make_finger_object_contact_sensor(
+            sim,
+            obj,
+            uid_suffix=f"{planner}_{mode}_{repeat_id}",
+        )
+    )
+    actual_min_hand_opening = current_hand_opening(robot, hand_opening_spec)
+    actual_close_hand_opening = None
+    close_contact_count = None
+    path_contact_count = None
+    hold_contact_count = None
+    max_contact_count = 0
+    max_contact_impulse = None
+    max_total_contact_impulse = None
+    min_contact_distance = None
+    close_tcp_object_delta = None
+    close_leftfinger_object_delta = None
+    close_rightfinger_object_delta = None
+    close_tcp_x_axis = None
+    close_tcp_y_axis = None
+    close_tcp_z_axis = None
+    planned_grasp_object_delta = (
+        None
+        if close_end_idx is None
+        else object_delta_from_tcp(robot, obj, traj[:, close_end_idx, :])
+    )
+    try:
+        for idx in range(traj.shape[1]):
+            _set_replay_qpos(robot, traj[:, idx, :], mode=args.replay_control)
+            if (
+                args.object_replay_mode == "attached"
+                and close_end_idx is not None
+                and idx >= close_end_idx
+                and (release_start_idx is None or idx < release_start_idx)
+            ):
+                update_attached_object_pose(robot, obj, traj[:, idx, :], object_to_eef)
+            sim.update(step=max(args.step_repeat, 1))
+            actual_min_hand_opening = min(
+                actual_min_hand_opening,
+                current_hand_opening(robot, hand_opening_spec),
+            )
+            contact_stats = summarize_contact_sensor(contact_sensor)
+            if contact_sensor is not None:
+                (
+                    max_contact_count,
+                    max_contact_impulse,
+                    max_total_contact_impulse,
+                    min_contact_distance,
+                ) = _record_contact_stats(
+                    contact_stats,
+                    max_count=max_contact_count,
+                    max_impulse=max_contact_impulse,
+                    max_total_impulse=max_total_contact_impulse,
+                    min_distance=min_contact_distance,
+                )
+            if close_end_idx is not None and idx == close_end_idx:
+                close_contact_count = (
+                    None if contact_sensor is None else contact_stats.count
+                )
+                actual_close_hand_opening = current_hand_opening(
+                    robot,
+                    hand_opening_spec,
+                )
+                close_tcp_object_delta = object_delta_from_tcp(
+                    robot,
+                    obj,
+                    traj[:, idx, :],
+                )
+                close_leftfinger_object_delta = link_delta_from_object(
+                    robot,
+                    obj,
+                    "leftfinger",
+                )
+                close_rightfinger_object_delta = link_delta_from_object(
+                    robot,
+                    obj,
+                    "rightfinger",
+                )
+                (
+                    close_tcp_x_axis,
+                    close_tcp_y_axis,
+                    close_tcp_z_axis,
+                ) = tcp_axes_from_qpos(robot, traj[:, idx, :])
+                save_replay_screenshot(
+                    sim,
+                    screenshot_dir=args.screenshot_dir,
+                    planner=planner,
+                    mode=mode,
+                    repeat_id=repeat_id,
+                    label="after_close",
+                )
+                sim.update(step=args.grasp_hold_steps + POST_GRASP_HOLD_STEPS)
+                if args.object_replay_mode == "attached":
+                    update_attached_object_pose(
+                        robot, obj, traj[:, idx, :], object_to_eef
+                    )
+                actual_min_hand_opening = min(
+                    actual_min_hand_opening,
+                    current_hand_opening(robot, hand_opening_spec),
+                )
+                contact_stats = summarize_contact_sensor(contact_sensor)
+                if contact_sensor is not None:
+                    close_contact_count = max(
+                        close_contact_count or 0,
+                        contact_stats.count,
+                    )
+                    (
+                        max_contact_count,
+                        max_contact_impulse,
+                        max_total_contact_impulse,
+                        min_contact_distance,
+                    ) = _record_contact_stats(
+                        contact_stats,
+                        max_count=max_contact_count,
+                        max_impulse=max_contact_impulse,
+                        max_total_impulse=max_total_contact_impulse,
+                        min_distance=min_contact_distance,
+                    )
+            obj_pos = obj.get_local_pose(to_matrix=True)[0, :3, 3]
+            max_z = max(max_z, float(obj_pos[2].item()))
+
+        path_contact_stats = summarize_contact_sensor(contact_sensor)
+        path_contact_count = (
+            None if contact_sensor is None else path_contact_stats.count
+        )
+        if contact_sensor is not None:
+            (
+                max_contact_count,
+                max_contact_impulse,
+                max_total_contact_impulse,
+                min_contact_distance,
+            ) = _record_contact_stats(
+                path_contact_stats,
+                max_count=max_contact_count,
+                max_impulse=max_contact_impulse,
+                max_total_impulse=max_total_contact_impulse,
+                min_distance=min_contact_distance,
+            )
+        save_replay_screenshot(
+            sim,
+            screenshot_dir=args.screenshot_dir,
+            planner=planner,
+            mode=mode,
+            repeat_id=repeat_id,
+            label="after_path",
+        )
+        _set_replay_qpos(robot, traj[:, -1, :], mode=args.replay_control)
+        sim.update(step=args.hold_steps + POST_REPLAY_HOLD_STEPS)
+        actual_min_hand_opening = min(
+            actual_min_hand_opening,
+            current_hand_opening(robot, hand_opening_spec),
+        )
+        actual_final_hand_opening = current_hand_opening(robot, hand_opening_spec)
+        hold_contact_stats = summarize_contact_sensor(contact_sensor)
+        hold_contact_count = (
+            None if contact_sensor is None else hold_contact_stats.count
+        )
+        if contact_sensor is not None:
+            (
+                max_contact_count,
+                max_contact_impulse,
+                max_total_contact_impulse,
+                min_contact_distance,
+            ) = _record_contact_stats(
+                hold_contact_stats,
+                max_count=max_contact_count,
+                max_impulse=max_contact_impulse,
+                max_total_impulse=max_total_contact_impulse,
+                min_distance=min_contact_distance,
+            )
+        save_replay_screenshot(
+            sim,
+            screenshot_dir=args.screenshot_dir,
+            planner=planner,
+            mode=mode,
+            repeat_id=repeat_id,
+            label="after_hold",
+        )
+    finally:
+        # SimulationManager currently keeps sensor assets alive for the lifetime
+        # of the simulation, so each replay uses a unique contact sensor UID.
+        pass
+    elapsed = time.perf_counter() - start
+    object_pos_error, object_rot_error = compute_object_pose_error(
+        obj, object_target_pose
+    )
+    final_pos = obj.get_local_pose(to_matrix=True)[0, :3, 3]
+    displacement = float(torch.linalg.norm(final_pos - start_pos).item())
+    object_lifted = max_z - float(start_pos[2].item()) >= 0.03
+    object_moved = displacement >= 0.03
+    replay_success = (
+        object_lifted
+        and object_moved
+        and object_pos_error <= args.object_pos_success_threshold
+    )
+    physical_success = replay_success and args.object_replay_mode == "physics"
+    return ReplayOutcome(
+        replay_time_sec=elapsed,
+        replay_success=replay_success,
+        physical_success=physical_success,
+        object_lifted=object_lifted,
+        object_moved=object_moved,
+        object_pos_error=object_pos_error,
+        object_rot_error=object_rot_error,
+        object_max_z=max_z,
+        object_displacement_m=displacement,
+        object_replay_mode=args.object_replay_mode,
+        close_end_index=close_end_idx,
+        release_start_index=release_start_idx,
+        min_hand_opening_m=min_hand_opening,
+        final_hand_opening_m=final_hand_opening,
+        actual_min_hand_opening_m=actual_min_hand_opening,
+        actual_close_hand_opening_m=actual_close_hand_opening,
+        actual_final_hand_opening_m=actual_final_hand_opening,
+        close_contact_count=close_contact_count,
+        path_contact_count=path_contact_count,
+        hold_contact_count=hold_contact_count,
+        max_contact_count=None if contact_sensor is None else max_contact_count,
+        max_contact_impulse=max_contact_impulse,
+        max_total_contact_impulse=max_total_contact_impulse,
+        min_contact_distance_m=min_contact_distance,
+        close_tcp_object_delta_m=close_tcp_object_delta,
+        planned_grasp_object_delta_m=planned_grasp_object_delta,
+        close_leftfinger_object_delta_m=close_leftfinger_object_delta,
+        close_rightfinger_object_delta_m=close_rightfinger_object_delta,
+        close_tcp_x_axis=close_tcp_x_axis,
+        close_tcp_y_axis=close_tcp_y_axis,
+        close_tcp_z_axis=close_tcp_z_axis,
+    )
+
+
+def trajectory_stats(robot: Robot, traj: torch.Tensor) -> dict[str, object]:
+    """Compute joint path statistics for a full-DoF trajectory."""
+    if traj.numel() == 0:
+        return {
+            "trajectory_steps": 0,
+            "joint_path_length": 0.0,
+            "max_joint_step": 0.0,
+            "final_qpos": None,
+        }
+    arm_joint_ids = robot.get_joint_ids(ARM_NAME)
+    arm_traj = traj[0, :, arm_joint_ids]
+    deltas = torch.diff(arm_traj, dim=0)
+    step_norms = (
+        torch.linalg.norm(deltas, dim=-1)
+        if deltas.numel()
+        else torch.zeros(1, device=robot.device)
+    )
+    return {
+        "trajectory_steps": int(traj.shape[1]),
+        "joint_path_length": float(step_norms.sum().item()),
+        "max_joint_step": float(step_norms.max().item()),
+        "final_qpos": [float(v) for v in traj[0, -1, :].detach().cpu().tolist()],
+    }
+
+
+def _as_unbatched_pose(pose: torch.Tensor) -> torch.Tensor:
+    """Return the first pose when a batched pose tensor is provided."""
+    if pose.dim() == 3:
+        return pose[0]
+    return pose
+
+
+def pose_error(
+    actual_pose: torch.Tensor, target_pose: torch.Tensor
+) -> tuple[float, float]:
+    """Return position and rotation error between two pose matrices."""
+    actual_pose = _as_unbatched_pose(actual_pose)
+    target_pose = _as_unbatched_pose(target_pose)
+    actual_quat = quat_from_matrix(actual_pose[:3, :3].unsqueeze(0))[0]
+    target_quat = quat_from_matrix(target_pose[:3, :3].unsqueeze(0))[0]
+    pos_error = float(torch.linalg.norm(actual_pose[:3, 3] - target_pose[:3, 3]).item())
+    rot_error = float(
+        quat_error_magnitude(
+            target_quat.unsqueeze(0),
+            actual_quat.unsqueeze(0),
+        )[0].item()
+    )
+    return pos_error, rot_error
+
+
+def compute_tcp_pose(
+    robot: Robot, arm_qpos: torch.Tensor, *, control_part: str
+) -> torch.Tensor:
+    """Compute one unbatched TCP pose matrix from arm qpos."""
+    return robot.compute_fk(
+        qpos=arm_qpos.unsqueeze(0),
+        name=control_part,
+        to_matrix=True,
+    )[0]
+
+
+def compute_object_pose_error(
+    obj: RigidObject,
+    target_pose: torch.Tensor,
+) -> tuple[float, float]:
+    """Return object pose error to a target pose for env 0."""
+    object_pose = obj.get_local_pose(to_matrix=True)[0]
+    return pose_error(object_pose, target_pose)
+
+
+def build_trial_row(
+    *,
+    planner: str,
+    mode: str,
+    repeat_id: int,
+    warmup: bool,
+    outcome: PlanOutcome,
+    planning_time_sec: float,
+    mem_delta: dict[str, float],
+    peak_gpu_mb: float,
+    robot: Robot,
+    preflight: GraspPreflight,
+    replay: ReplayOutcome | None,
+    args: argparse.Namespace,
+) -> dict[str, object]:
+    """Build one raw metrics row."""
+    stats = trajectory_stats(robot, outcome.trajectory)
+    final_tcp_pos_error = None
+    final_tcp_rot_error = None
+    strict_tcp_success = False
+    if outcome.trajectory.numel() > 0:
+        arm_joint_ids = robot.get_joint_ids(ARM_NAME)
+        final_arm_qpos = outcome.trajectory[0, -1, arm_joint_ids]
+        final_pose = compute_tcp_pose(robot, final_arm_qpos, control_part=ARM_NAME)
+        final_tcp_pos_error, final_tcp_rot_error = pose_error(
+            final_pose, outcome.rest_eef_pose
+        )
+        strict_tcp_success = (
+            final_tcp_pos_error <= args.pos_success_threshold
+            and final_tcp_rot_error <= args.rot_success_threshold
+        )
+
+    planner_success = bool(outcome.success and strict_tcp_success)
+    physical_evaluated = replay is not None
+    physical_success = bool(replay.physical_success) if replay is not None else None
+    replay_success = bool(replay.replay_success) if replay is not None else None
+    task_success = physical_success if mode == "physical" else planner_success
+
+    row: dict[str, object] = {
+        "script": SCRIPT_NAME,
+        "planner": planner,
+        "mode": mode,
+        "object": args.object,
+        "object_scale": list(resolve_object_body_scale(args.object, args.object_scale)),
+        "gripper_closing_axis_index": FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
+        "tcp_z": float(args.tcp_z),
+        "tcp_yaw": float(args.tcp_yaw),
+        "grasp_depth_offset": float(args.grasp_depth_offset),
+        "gripper_close_margin": float(args.gripper_close_margin),
+        "grasp_axis": args.grasp_axis,
+        "repeat_id": repeat_id,
+        "warmup": warmup,
+        "action_success": bool(outcome.success),
+        "planner_success": planner_success,
+        "physical_evaluated": physical_evaluated,
+        "physical_success": physical_success,
+        "replay_success": replay_success,
+        "task_success": bool(task_success),
+        "planning_time_sec": float(planning_time_sec),
+        "replay_time_sec": float(replay.replay_time_sec) if replay else 0.0,
+        "object_replay_mode": replay.object_replay_mode if replay else None,
+        "replay_control": args.replay_control if replay else None,
+        "cpu_delta_mb": float(mem_delta["cpu_mb"]),
+        "gpu_delta_mb": float(mem_delta["gpu_mb"]),
+        "peak_gpu_mb": float(peak_gpu_mb),
+        "final_tcp_pos_error": final_tcp_pos_error,
+        "final_tcp_rot_error": final_tcp_rot_error,
+        "strict_tcp_success": strict_tcp_success,
+        "preflight_status": preflight.status,
+        "preflight_failed": preflight.failed,
+        "preflight_reason": preflight.reason,
+        "object_grasp_width_m": preflight.object_grasp_width_m,
+        "gripper_min_opening_m": preflight.gripper_min_opening_m,
+        "gripper_max_opening_m": preflight.gripper_max_opening_m,
+        "object_lifted": replay.object_lifted if replay else None,
+        "object_moved": replay.object_moved if replay else None,
+        "object_pos_error": replay.object_pos_error if replay else None,
+        "object_rot_error": replay.object_rot_error if replay else None,
+        "object_max_z": replay.object_max_z if replay else None,
+        "object_displacement_m": replay.object_displacement_m if replay else None,
+        "close_end_index": replay.close_end_index if replay else None,
+        "release_start_index": replay.release_start_index if replay else None,
+        "min_hand_opening_m": replay.min_hand_opening_m if replay else None,
+        "final_hand_opening_m": replay.final_hand_opening_m if replay else None,
+        "actual_min_hand_opening_m": (
+            replay.actual_min_hand_opening_m if replay else None
+        ),
+        "actual_close_hand_opening_m": (
+            replay.actual_close_hand_opening_m if replay else None
+        ),
+        "actual_final_hand_opening_m": (
+            replay.actual_final_hand_opening_m if replay else None
+        ),
+        "close_contact_count": replay.close_contact_count if replay else None,
+        "path_contact_count": replay.path_contact_count if replay else None,
+        "hold_contact_count": replay.hold_contact_count if replay else None,
+        "max_contact_count": replay.max_contact_count if replay else None,
+        "max_contact_impulse": replay.max_contact_impulse if replay else None,
+        "max_total_contact_impulse": (
+            replay.max_total_contact_impulse if replay else None
+        ),
+        "min_contact_distance_m": replay.min_contact_distance_m if replay else None,
+        "close_tcp_object_delta_m": replay.close_tcp_object_delta_m if replay else None,
+        "planned_grasp_object_delta_m": (
+            replay.planned_grasp_object_delta_m if replay else None
+        ),
+        "close_leftfinger_object_delta_m": (
+            replay.close_leftfinger_object_delta_m if replay else None
+        ),
+        "close_rightfinger_object_delta_m": (
+            replay.close_rightfinger_object_delta_m if replay else None
+        ),
+        "close_tcp_x_axis": replay.close_tcp_x_axis if replay else None,
+        "close_tcp_y_axis": replay.close_tcp_y_axis if replay else None,
+        "close_tcp_z_axis": replay.close_tcp_z_axis if replay else None,
+    }
+    row.update(stats)
+    return row
+
+
+def run_one_trial(
+    *,
+    sim: SimulationManager,
+    robot: Robot,
+    obj: RigidObject,
+    franka_urdf_path: str,
+    planner: str,
+    mode: str,
+    repeat_id: int,
+    warmup: bool,
+    checkpoint_path: str | None,
+    args: argparse.Namespace,
+) -> dict[str, object]:
+    """Run one independent benchmark trial."""
+    object_start_pose = reset_object_pose(obj, args.object, args.object_scale)
+    hand_open, _ = get_hand_open_close_qpos(robot)
+    set_robot_start_qpos(robot, hand_open)
+    try:
+        initialize_pre_pick_robot_pose(
+            robot,
+            obj,
+            hand_open,
+            pre_pick_z=args.pre_pick_z,
+            reference_pose=object_start_pose,
+        )
+    except RuntimeError as exc:
+        reason = str(exc)
+        logger.log_warning(
+            f"{SCRIPT_NAME}: planner={planner} mode={mode} repeat={repeat_id} "
+            f"warmup={warmup} setup_failed={reason}"
+        )
+        return make_failed_trial_row(
+            planner=planner,
+            mode=mode,
+            object_name=args.object,
+            repeat_id=repeat_id,
+            warmup=warmup,
+            reason=reason,
+        )
+    sim.update(step=max(args.hold_steps, 1))
+
+    preflight = build_grasp_preflight(obj, robot, franka_urdf_path=franka_urdf_path)
+    object_target_pose = make_object_target_pose(obj)
+    rest_pose = make_rest_pose(robot)
+    motion_gen = build_motion_generator(robot, planner, checkpoint_path, args)
+    engine = build_engine(
+        robot,
+        motion_gen,
+        args,
+        preflight,
+        franka_urdf_path=franka_urdf_path,
+    )
+    semantics = create_object_semantics(obj, args, preflight)
+
+    outcome, planning_time_sec, mem_delta, peak_gpu_mb = timed_plan_sequence(
+        engine,
+        semantics,
+        object_target_pose,
+        rest_pose,
+    )
+
+    replay = None
+    can_replay = (
+        mode == "physical"
+        and outcome.success
+        and (not preflight.failed or args.object_replay_mode == "attached")
+    )
+    if can_replay:
+        hand_opening_spec = load_franka_hand_opening_spec(franka_urdf_path)
+        replay = replay_trajectory(
+            sim,
+            robot,
+            obj,
+            outcome.trajectory,
+            object_target_pose,
+            args,
+            held_object=outcome.held_object,
+            planner=planner,
+            mode=mode,
+            repeat_id=repeat_id,
+            hand_opening_spec=hand_opening_spec,
+        )
+
+    row = build_trial_row(
+        planner=planner,
+        mode=mode,
+        repeat_id=repeat_id,
+        warmup=warmup,
+        outcome=outcome,
+        planning_time_sec=planning_time_sec,
+        mem_delta=mem_delta,
+        peak_gpu_mb=peak_gpu_mb,
+        robot=robot,
+        preflight=preflight,
+        replay=replay,
+        args=args,
+    )
+
+    logger.log_info(
+        f"{SCRIPT_NAME}: planner={planner} mode={mode} repeat={repeat_id} "
+        f"warmup={warmup} planner_success={row['planner_success']} "
+        f"physical_success={row['physical_success']} preflight={preflight.status}"
+    )
+    return row
+
+
+def _jsonable(value: Any) -> Any:
+    """Convert tensors and dataclasses to JSON-serializable values."""
+    if isinstance(value, torch.Tensor):
+        if value.numel() == 1:
+            return float(value.detach().cpu().item())
+        return value.detach().cpu().reshape(-1).tolist()
+    if isinstance(value, dict):
+        return {key: _jsonable(val) for key, val in value.items()}
+    if isinstance(value, (list, tuple)):
+        return [_jsonable(item) for item in value]
+    return value
+
+
+def empty_replay_diagnostics() -> dict[str, object]:
+    """Return replay diagnostic fields for rows without physical replay."""
+    return {
+        "actual_min_hand_opening_m": None,
+        "actual_close_hand_opening_m": None,
+        "actual_final_hand_opening_m": None,
+        "close_contact_count": None,
+        "path_contact_count": None,
+        "hold_contact_count": None,
+        "max_contact_count": None,
+        "max_contact_impulse": None,
+        "max_total_contact_impulse": None,
+        "min_contact_distance_m": None,
+        "close_tcp_object_delta_m": None,
+        "planned_grasp_object_delta_m": None,
+        "close_leftfinger_object_delta_m": None,
+        "close_rightfinger_object_delta_m": None,
+        "close_tcp_x_axis": None,
+        "close_tcp_y_axis": None,
+        "close_tcp_z_axis": None,
+    }
+
+
+def make_skipped_rows(
+    planners: list[str],
+    modes: list[str],
+    *,
+    object_name: str,
+    reason: str,
+) -> list[dict[str, object]]:
+    """Build measured rows for a gracefully skipped live-simulation benchmark."""
+    rows: list[dict[str, object]] = []
+    for planner in planners:
+        for mode in modes:
+            rows.append(
+                {
+                    "script": SCRIPT_NAME,
+                    "planner": planner,
+                    "mode": mode,
+                    "object": object_name,
+                    "object_scale": None,
+                    "gripper_closing_axis_index": FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
+                    "tcp_z": None,
+                    "tcp_yaw": None,
+                    "grasp_depth_offset": None,
+                    "gripper_close_margin": None,
+                    "grasp_axis": None,
+                    "repeat_id": 0,
+                    "warmup": False,
+                    "action_success": False,
+                    "planner_success": False,
+                    "physical_evaluated": False,
+                    "physical_success": None,
+                    "replay_success": None,
+                    "task_success": False,
+                    "planning_time_sec": 0.0,
+                    "replay_time_sec": 0.0,
+                    "object_replay_mode": None,
+                    "replay_control": None,
+                    "cpu_delta_mb": 0.0,
+                    "gpu_delta_mb": 0.0,
+                    "peak_gpu_mb": 0.0,
+                    "final_tcp_pos_error": None,
+                    "final_tcp_rot_error": None,
+                    "strict_tcp_success": False,
+                    "preflight_status": "skipped",
+                    "preflight_failed": True,
+                    "preflight_reason": reason,
+                    "object_grasp_width_m": 0.0,
+                    "gripper_min_opening_m": 0.0,
+                    "gripper_max_opening_m": 0.0,
+                    "object_lifted": None,
+                    "object_moved": None,
+                    "object_pos_error": None,
+                    "object_rot_error": None,
+                    "object_max_z": None,
+                    "object_displacement_m": None,
+                    "close_end_index": None,
+                    "release_start_index": None,
+                    "min_hand_opening_m": None,
+                    "final_hand_opening_m": None,
+                    **empty_replay_diagnostics(),
+                    "trajectory_steps": 0,
+                    "joint_path_length": 0.0,
+                    "max_joint_step": 0.0,
+                    "final_qpos": None,
+                }
+            )
+    return rows
+
+
+def make_failed_trial_row(
+    *,
+    planner: str,
+    mode: str,
+    object_name: str,
+    repeat_id: int,
+    warmup: bool,
+    reason: str,
+) -> dict[str, object]:
+    """Build one failed row when setup cannot reach the planner call."""
+    return {
+        "script": SCRIPT_NAME,
+        "planner": planner,
+        "mode": mode,
+        "object": object_name,
+        "object_scale": None,
+        "gripper_closing_axis_index": FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
+        "tcp_z": None,
+        "tcp_yaw": None,
+        "grasp_depth_offset": None,
+        "gripper_close_margin": None,
+        "grasp_axis": None,
+        "repeat_id": repeat_id,
+        "warmup": warmup,
+        "action_success": False,
+        "planner_success": False,
+        "physical_evaluated": False,
+        "physical_success": None,
+        "replay_success": None,
+        "task_success": False,
+        "planning_time_sec": 0.0,
+        "replay_time_sec": 0.0,
+        "object_replay_mode": None,
+        "replay_control": None,
+        "cpu_delta_mb": 0.0,
+        "gpu_delta_mb": 0.0,
+        "peak_gpu_mb": 0.0,
+        "final_tcp_pos_error": None,
+        "final_tcp_rot_error": None,
+        "strict_tcp_success": False,
+        "preflight_status": "setup_failed",
+        "preflight_failed": True,
+        "preflight_reason": reason,
+        "object_grasp_width_m": 0.0,
+        "gripper_min_opening_m": 0.0,
+        "gripper_max_opening_m": 0.0,
+        "object_lifted": None,
+        "object_moved": None,
+        "object_pos_error": None,
+        "object_rot_error": None,
+        "object_max_z": None,
+        "object_displacement_m": None,
+        "close_end_index": None,
+        "release_start_index": None,
+        "min_hand_opening_m": None,
+        "final_hand_opening_m": None,
+        **empty_replay_diagnostics(),
+        "trajectory_steps": 0,
+        "joint_path_length": 0.0,
+        "max_joint_step": 0.0,
+        "final_qpos": None,
+    }
+
+
+def write_raw_jsonl(path: str | None, row: dict[str, object]) -> None:
+    """Append one raw metrics row when requested."""
+    if not path:
+        return
+    result_path = Path(path)
+    result_path.parent.mkdir(parents=True, exist_ok=True)
+    with result_path.open("a", encoding="utf-8") as f:
+        f.write(json.dumps(_jsonable(row), sort_keys=True) + "\n")
+
+
+def _numeric_values(rows: list[dict[str, object]], key: str) -> list[float]:
+    """Return numeric values for a row key."""
+    values = []
+    for row in rows:
+        value = row.get(key)
+        if value is None:
+            continue
+        values.append(float(value))
+    return values
+
+
+def _mean(values: list[float]) -> float | None:
+    """Return the arithmetic mean of values."""
+    if not values:
+        return None
+    return sum(values) / len(values)
+
+
+def _quantile(values: list[float], q: float) -> float | None:
+    """Return a linear-interpolated quantile."""
+    if not values:
+        return None
+    ordered = sorted(values)
+    if len(ordered) == 1:
+        return ordered[0]
+    index = (len(ordered) - 1) * q
+    lower = int(math.floor(index))
+    upper = int(math.ceil(index))
+    if lower == upper:
+        return ordered[lower]
+    weight = index - lower
+    return ordered[lower] * (1.0 - weight) + ordered[upper] * weight
+
+
+def _rate(rows: list[dict[str, object]], key: str) -> float | None:
+    """Return the true-rate for a boolean row key."""
+    values = [row.get(key) for row in rows if row.get(key) is not None]
+    if not values:
+        return None
+    return sum(bool(value) for value in values) / len(values)
+
+
+def _fmt_float(value: float | None, digits: int = 3) -> str:
+    """Format a possibly-missing float."""
+    if value is None:
+        return "-"
+    return f"{value:.{digits}f}"
+
+
+def _fmt_rate(value: float | None) -> str:
+    """Format a possibly-missing rate."""
+    if value is None:
+        return "-"
+    return f"{value * 100.0:.1f}%"
+
+
+def summarize_rows(rows: list[dict[str, object]]) -> list[SummaryRow]:
+    """Group measured rows by planner and mode."""
+    measured = [row for row in rows if not row.get("warmup", False)]
+    keys = sorted({(str(row["planner"]), str(row["mode"])) for row in measured})
+    return [
+        SummaryRow(
+            planner=planner,
+            mode=mode,
+            rows=[
+                row
+                for row in measured
+                if row["planner"] == planner and row["mode"] == mode
+            ],
+        )
+        for planner, mode in keys
+    ]
+
+
+def _preflight_status(rows: list[dict[str, object]]) -> str:
+    """Return a compact aggregate preflight status."""
+    statuses = sorted({str(row.get("preflight_status", "-")) for row in rows})
+    if len(statuses) == 1:
+        return statuses[0]
+    return "mixed"
+
+
+def _single_status(rows: list[dict[str, object]], key: str) -> str:
+    """Return a compact aggregate string for a row key."""
+    values = sorted({str(row.get(key, "-")) for row in rows})
+    if len(values) == 1:
+        return values[0]
+    return "mixed"
+
+
+def make_perf_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Build Time & Memory report rows."""
+    perf_rows = []
+    for summary in summarize_rows(rows):
+        times = _numeric_values(summary.rows, "planning_time_sec")
+        perf_rows.append(
+            {
+                "planner": summary.planner,
+                "mode": summary.mode,
+                "repeat_count": len(summary.rows),
+                "cost_time_ms_mean": _fmt_float(
+                    None if _mean(times) is None else _mean(times) * 1000.0, 2
+                ),
+                "cost_time_ms_p50": _fmt_float(
+                    (
+                        None
+                        if _quantile(times, 0.5) is None
+                        else _quantile(times, 0.5) * 1000.0
+                    ),
+                    2,
+                ),
+                "cost_time_ms_p95": _fmt_float(
+                    (
+                        None
+                        if _quantile(times, 0.95) is None
+                        else _quantile(times, 0.95) * 1000.0
+                    ),
+                    2,
+                ),
+                "cpu_delta_mb": _fmt_float(
+                    _mean(_numeric_values(summary.rows, "cpu_delta_mb")), 2
+                ),
+                "gpu_delta_mb": _fmt_float(
+                    _mean(_numeric_values(summary.rows, "gpu_delta_mb")), 2
+                ),
+                "peak_gpu_mb": _fmt_float(
+                    _mean(_numeric_values(summary.rows, "peak_gpu_mb")), 2
+                ),
+            }
+        )
+    return perf_rows
+
+
+def make_metric_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Build Success & Other Metrics report rows."""
+    metric_rows = []
+    for summary in summarize_rows(rows):
+        final_pos = _mean(_numeric_values(summary.rows, "final_tcp_pos_error"))
+        final_rot = _mean(_numeric_values(summary.rows, "final_tcp_rot_error"))
+        width = _mean(_numeric_values(summary.rows, "object_grasp_width_m"))
+        min_open = _mean(_numeric_values(summary.rows, "gripper_min_opening_m"))
+        max_open = _mean(_numeric_values(summary.rows, "gripper_max_opening_m"))
+        replay_displacement = _mean(
+            _numeric_values(summary.rows, "object_displacement_m")
+        )
+        actual_min_open = _mean(
+            _numeric_values(summary.rows, "actual_min_hand_opening_m")
+        )
+        actual_close_open = _mean(
+            _numeric_values(summary.rows, "actual_close_hand_opening_m")
+        )
+        max_contact_count = _mean(_numeric_values(summary.rows, "max_contact_count"))
+        max_contact_impulse = _mean(
+            _numeric_values(summary.rows, "max_contact_impulse")
+        )
+        metric_rows.append(
+            {
+                "planner": summary.planner,
+                "mode": summary.mode,
+                "object_replay_mode": _single_status(
+                    summary.rows, "object_replay_mode"
+                ),
+                "action_success_rate": _fmt_rate(_rate(summary.rows, "action_success")),
+                "planner_success_rate": _fmt_rate(
+                    _rate(summary.rows, "planner_success")
+                ),
+                "replay_success_rate": _fmt_rate(_rate(summary.rows, "replay_success")),
+                "physical_success_rate": _fmt_rate(
+                    _rate(summary.rows, "physical_success")
+                ),
+                "preflight_failed_rate": _fmt_rate(
+                    _rate(summary.rows, "preflight_failed")
+                ),
+                "preflight_status": _preflight_status(summary.rows),
+                "object_grasp_width_m": _fmt_float(width, 4),
+                "gripper_min_opening_m": _fmt_float(min_open, 4),
+                "gripper_max_opening_m": _fmt_float(max_open, 4),
+                "min_hand_opening_m": _fmt_float(
+                    _mean(_numeric_values(summary.rows, "min_hand_opening_m")), 4
+                ),
+                "actual_min_hand_opening_m": _fmt_float(actual_min_open, 4),
+                "actual_close_hand_opening_m": _fmt_float(actual_close_open, 4),
+                "max_contact_count": _fmt_float(max_contact_count, 1),
+                "max_contact_impulse": _fmt_float(max_contact_impulse, 4),
+                "object_displacement_m": _fmt_float(replay_displacement, 4),
+                "final_tcp_pos_err_mm": _fmt_float(
+                    None if final_pos is None else final_pos * 1000.0, 3
+                ),
+                "final_tcp_rot_err_deg": _fmt_float(
+                    None if final_rot is None else final_rot * 180.0 / math.pi, 3
+                ),
+                "joint_path_length": _fmt_float(
+                    _mean(_numeric_values(summary.rows, "joint_path_length")), 4
+                ),
+                "max_joint_step": _fmt_float(
+                    _mean(_numeric_values(summary.rows, "max_joint_step")), 4
+                ),
+            }
+        )
+    return metric_rows
+
+
+def make_leaderboard_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Build leaderboard rows sorted by the mode-specific primary metric."""
+    leaderboard = []
+    for summary in summarize_rows(rows):
+        planner_rate = _rate(summary.rows, "planner_success") or 0.0
+        task_rate = _rate(summary.rows, "task_success") or 0.0
+        physical_rate = _rate(summary.rows, "physical_success")
+        replay_rate = _rate(summary.rows, "replay_success")
+        overall = task_rate if summary.mode == "physical" else planner_rate
+        avg_time = _mean(_numeric_values(summary.rows, "planning_time_sec")) or 0.0
+        avg_pos = _mean(_numeric_values(summary.rows, "final_tcp_pos_error"))
+        leaderboard.append(
+            {
+                "planner": summary.planner,
+                "mode": summary.mode,
+                "overall_success_rate": overall,
+                "planner_success_rate": planner_rate,
+                "physical_success_rate": physical_rate,
+                "replay_success_rate": replay_rate,
+                "avg_cost_time_ms": avg_time * 1000.0,
+                "avg_final_tcp_pos_err_mm": (
+                    None if avg_pos is None else avg_pos * 1000.0
+                ),
+            }
+        )
+    leaderboard.sort(
+        key=lambda row: (
+            -float(row["overall_success_rate"]),
+            float(row["avg_cost_time_ms"]),
+        )
+    )
+    return [
+        {
+            "rank": rank,
+            "planner": row["planner"],
+            "mode": row["mode"],
+            "overall_success_rate": _fmt_rate(float(row["overall_success_rate"])),
+            "planner_success_rate": _fmt_rate(float(row["planner_success_rate"])),
+            "replay_success_rate": _fmt_rate(row["replay_success_rate"]),
+            "physical_success_rate": _fmt_rate(row["physical_success_rate"]),
+            "avg_cost_time_ms": _fmt_float(float(row["avg_cost_time_ms"]), 2),
+            "avg_final_tcp_pos_err_mm": _fmt_float(row["avg_final_tcp_pos_err_mm"], 3),
+        }
+        for rank, row in enumerate(leaderboard, start=1)
+    ]
+
+
+def _markdown_table(rows: list[dict[str, object]]) -> list[str]:
+    """Format rows into a Markdown table."""
+    if not rows:
+        rows = [{"status": "No rows were produced."}]
+    headers = list(rows[0].keys())
+    lines = [
+        "| " + " | ".join(headers) + " |",
+        "| " + " | ".join(["---"] * len(headers)) + " |",
+    ]
+    for row in rows:
+        lines.append("| " + " | ".join(str(row[header]) for header in headers) + " |")
+    return lines
+
+
+def default_report_path() -> Path:
+    """Return the default timestamped Markdown report path."""
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    return Path("outputs/benchmarks") / f"{SCRIPT_NAME}_{timestamp}.md"
+
+
+def write_markdown_report(
+    rows: list[dict[str, object]],
+    report_path: str | None = None,
+) -> Path:
+    """Write the benchmark Markdown report with exactly three tables."""
+    path = Path(report_path) if report_path else default_report_path()
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    lines: list[str] = [
+        f"# {SCRIPT_NAME} Benchmark Report",
+        "",
+        f"Generated at: {datetime.now().isoformat(timespec='seconds')}",
+        "",
+        "## Time & Memory",
+        "",
+        *_markdown_table(make_perf_rows(rows)),
+        "",
+        "## Success & Other Metrics",
+        "",
+        *_markdown_table(make_metric_rows(rows)),
+        "",
+        "## Leaderboard",
+        "",
+        *_markdown_table(make_leaderboard_rows(rows)),
+    ]
+    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+    return path
+
+
+def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
+    """Run the selected planner benchmark trials."""
+    validate_args(args)
+    torch.manual_seed(args.seed)
+    planners = expand_planner_selection(args.planner)
+    modes = expand_mode_selection(args.mode)
+    rows: list[dict[str, object]] = []
+
+    if args.save_raw_jsonl:
+        raw_path = Path(args.save_raw_jsonl)
+        raw_path.parent.mkdir(parents=True, exist_ok=True)
+        raw_path.write_text("", encoding="utf-8")
+
+    if simulation_requires_cuda(args) and not torch.cuda.is_available():
+        reason = (
+            "CUDA is unavailable to PyTorch, so the live DexSim benchmark was "
+            "skipped before SimulationManager initialization."
+        )
+        logger.log_warning(reason)
+        rows = make_skipped_rows(
+            planners,
+            modes,
+            object_name=args.object,
+            reason=reason,
+        )
+        for row in rows:
+            write_raw_jsonl(args.save_raw_jsonl, row)
+        report_path = write_markdown_report(rows, args.report_path)
+        logger.log_info(f"Markdown benchmark report saved: {report_path}")
+        return rows
+
+    skip_reason = physical_gpu_memory_skip_reason(args, modes)
+    if skip_reason is not None:
+        logger.log_warning(skip_reason)
+        rows = make_skipped_rows(
+            planners,
+            modes,
+            object_name=args.object,
+            reason=skip_reason,
+        )
+        for row in rows:
+            write_raw_jsonl(args.save_raw_jsonl, row)
+        report_path = write_markdown_report(rows, args.report_path)
+        logger.log_info(f"Markdown benchmark report saved: {report_path}")
+        return rows
+
+    checkpoint_path = resolve_checkpoint(args, planners)
+
+    sim = make_sim(args)
+    robot, franka_urdf_path = create_franka(
+        sim,
+        tcp_z=args.tcp_z,
+        tcp_yaw=args.tcp_yaw,
+    )
+    create_support_table(sim)
+    create_visual_support_table(sim)
+    obj = create_object(sim, args.object, args.object_scale)
+    if not args.headless:
+        sim.open_window()
+
+    for planner in planners:
+        for mode in modes:
+            for warmup in (True, False):
+                repeat_count = args.warmup_repeats if warmup else args.num_repeats
+                for repeat_id in range(repeat_count):
+                    row = run_one_trial(
+                        sim=sim,
+                        robot=robot,
+                        obj=obj,
+                        franka_urdf_path=franka_urdf_path,
+                        planner=planner,
+                        mode=mode,
+                        repeat_id=repeat_id,
+                        warmup=warmup,
+                        checkpoint_path=checkpoint_path,
+                        args=args,
+                    )
+                    rows.append(row)
+                    write_raw_jsonl(args.save_raw_jsonl, row)
+
+    report_path = write_markdown_report(rows, args.report_path)
+    logger.log_info(f"Markdown benchmark report saved: {report_path}")
+    for row in make_leaderboard_rows(rows):
+        logger.log_info(json.dumps(row, sort_keys=True))
+    return rows
+
+
+def run_all_benchmarks() -> None:
+    """Parse CLI args and run the Franka pick-place NMG benchmark."""
+    run_benchmark(parse_args())
+
+
+if __name__ == "__main__":
+    run_all_benchmarks()
diff --git a/scripts/benchmark/robotics/nmg/franka_planner.py b/scripts/benchmark/robotics/nmg/franka_planner.py
new file mode 100644
index 00000000..49015527
--- /dev/null
+++ b/scripts/benchmark/robotics/nmg/franka_planner.py
@@ -0,0 +1,981 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Benchmark Franka end-effector waypoint planning backends.
+
+This benchmark isolates planner quality from grasp annotation and physics. It
+generates reachable Franka TCP waypoint poses from known joint configurations,
+then compares IK+TOPPRA interpolation, NeuralPlanner, and NeuralPlanner with a
+final IK refinement.
+Run: python -m scripts.benchmark.robotics.nmg.franka_planner
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import os
+import time
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import psutil
+import torch
+
+from embodichain.data.assets.planner_assets import download_neural_planner_checkpoint
+from embodichain.lab.gym.utils.gym_utils import add_env_launcher_args_to_parser
+from embodichain.lab.sim import SimulationManager, SimulationManagerCfg
+from embodichain.lab.sim.cfg import RenderCfg
+from embodichain.lab.sim.objects import Robot
+from embodichain.lab.sim.planners import (
+    MoveType,
+    MotionGenCfg,
+    MotionGenOptions,
+    MotionGenerator,
+    NeuralPlanOptions,
+    NeuralPlannerCfg,
+    PlanResult,
+    PlanState,
+    ToppraPlanOptions,
+    ToppraPlannerCfg,
+)
+from embodichain.lab.sim.utility.action_utils import interpolate_with_distance
+from embodichain.utils.math import quat_error_magnitude, quat_from_matrix
+from scripts.benchmark.robotics.nmg.franka_pick_place import (
+    ARM_NAME,
+    FRANKA_REST_QPOS,
+    FRANKA_START_QPOS,
+    SCRIPT_NAME as PICK_PLACE_SCRIPT_NAME,
+    create_franka,
+)
+
+SCRIPT_NAME = "franka_planner_nmg"
+DEFAULT_SEED = 0
+DEFAULT_NUM_TRIALS = 8
+DEFAULT_WARMUP_TRIALS = 1
+DEFAULT_NUM_WAYPOINTS = 3
+DEFAULT_SAMPLE_INTERVAL = 120
+DEFAULT_POS_THRESHOLD = 1e-3
+DEFAULT_ROT_THRESHOLD = 0.05
+DEFAULT_NMG_POS_THRESHOLD = 0.05
+DEFAULT_NMG_ROT_THRESHOLD = 0.3
+PLANNER_NAMES = ("ik_toppra", "neural", "neural_refine")
+
+
+@dataclass(frozen=True)
+class PlannerTrial:
+    """A reachable waypoint planning trial."""
+
+    trial_id: int
+    start_qpos: torch.Tensor
+    target_qpos: torch.Tensor
+    waypoints: list[torch.Tensor]
+
+
+@dataclass(frozen=True)
+class PlannerOutcome:
+    """Planner result with normalized trajectory fields."""
+
+    action_success: bool
+    positions: torch.Tensor | None
+    planning_time_sec: float
+    cpu_delta_mb: float
+    gpu_delta_mb: float
+    peak_gpu_mb: float
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Benchmark Franka EEF waypoint planners without grasp physics."
+    )
+    add_env_launcher_args_to_parser(parser)
+    parser.add_argument(
+        "--planner",
+        choices=[*PLANNER_NAMES, "all"],
+        default="all",
+        help="Planner backend to evaluate.",
+    )
+    parser.add_argument(
+        "--neural_checkpoint",
+        type=str,
+        default=None,
+        help="Local Franka NMG checkpoint. Required for neural planners unless downloadable.",
+    )
+    parser.add_argument(
+        "--num_trials",
+        type=int,
+        default=DEFAULT_NUM_TRIALS,
+        help="Measured trials per planner.",
+    )
+    parser.add_argument(
+        "--warmup_trials",
+        type=int,
+        default=DEFAULT_WARMUP_TRIALS,
+        help="Warmup trials per planner; excluded from summaries.",
+    )
+    parser.add_argument(
+        "--num_waypoints",
+        type=int,
+        default=DEFAULT_NUM_WAYPOINTS,
+        help="Number of EEF waypoints per trial.",
+    )
+    parser.add_argument(
+        "--sample_interval",
+        type=int,
+        default=DEFAULT_SAMPLE_INTERVAL,
+        help="Target trajectory samples for IK+TOPPRA and resampled neural paths.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=DEFAULT_SEED,
+        help="Random seed for waypoint generation.",
+    )
+    parser.add_argument(
+        "--report_path",
+        type=str,
+        default=None,
+        help="Optional Markdown report path. Defaults to outputs/benchmarks/.",
+    )
+    parser.add_argument(
+        "--save_raw_jsonl",
+        type=str,
+        default=None,
+        help="Optional JSONL path for raw per-trial rows.",
+    )
+    parser.add_argument(
+        "--pos_success_threshold",
+        type=float,
+        default=DEFAULT_POS_THRESHOLD,
+        help="Strict final TCP position threshold in meters.",
+    )
+    parser.add_argument(
+        "--rot_success_threshold",
+        type=float,
+        default=DEFAULT_ROT_THRESHOLD,
+        help="Strict final TCP rotation threshold in radians.",
+    )
+    parser.add_argument(
+        "--nmg_pos_success_threshold",
+        type=float,
+        default=DEFAULT_NMG_POS_THRESHOLD,
+        help="Loose position threshold passed to NMG planners.",
+    )
+    parser.add_argument(
+        "--nmg_rot_success_threshold",
+        type=float,
+        default=DEFAULT_NMG_ROT_THRESHOLD,
+        help="Loose rotation threshold passed to NMG planners.",
+    )
+    return parser.parse_args(argv)
+
+
+def validate_args(args: argparse.Namespace) -> None:
+    """Validate benchmark arguments."""
+    if args.num_trials < 1:
+        raise ValueError("--num_trials must be >= 1.")
+    if args.warmup_trials < 0:
+        raise ValueError("--warmup_trials must be >= 0.")
+    if args.num_waypoints < 1:
+        raise ValueError("--num_waypoints must be >= 1.")
+    if args.sample_interval < 2:
+        raise ValueError("--sample_interval must be >= 2.")
+
+
+def expand_planner_selection(planner: str) -> list[str]:
+    """Expand planner aliases into concrete planner names."""
+    if planner == "all":
+        return list(PLANNER_NAMES)
+    return [planner]
+
+
+def simulation_requires_cuda(args: argparse.Namespace) -> bool:
+    """Return whether the selected simulation settings require CUDA."""
+    if str(args.device).startswith("cuda"):
+        return True
+    return args.renderer in ("hybrid", "fast-rt", "rt")
+
+
+def _sync_cuda() -> None:
+    """Synchronize CUDA stream when available."""
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+
+def _reset_peak_gpu_memory() -> None:
+    """Reset PyTorch peak GPU memory stats when CUDA is available."""
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+
+
+def _peak_gpu_memory_mb() -> float:
+    """Return peak GPU memory allocated by PyTorch in MB."""
+    if not torch.cuda.is_available():
+        return 0.0
+    return torch.cuda.max_memory_allocated() / 1024**2
+
+
+def _memory_snapshot() -> dict[str, float]:
+    """Return current process memory usage in MB."""
+    process = psutil.Process(os.getpid())
+    cpu_mb = process.memory_info().rss / 1024**2
+    gpu_mb = (
+        torch.cuda.memory_allocated() / 1024**2 if torch.cuda.is_available() else 0.0
+    )
+    return {"cpu_mb": cpu_mb, "gpu_mb": gpu_mb}
+
+
+def make_sim(args: argparse.Namespace) -> SimulationManager:
+    """Create a minimal simulation manager for Franka kinematics."""
+    return SimulationManager(
+        SimulationManagerCfg(
+            width=640,
+            height=480,
+            headless=True,
+            sim_device=args.device,
+            gpu_id=args.gpu_id,
+            render_cfg=RenderCfg(renderer=args.renderer),
+            physics_dt=1.0 / 100.0,
+            num_envs=1,
+            arena_space=args.arena_space,
+        )
+    )
+
+
+def resolve_checkpoint(args: argparse.Namespace, planners: list[str]) -> str | None:
+    """Resolve the NMG checkpoint path if a neural planner is selected."""
+    if not any(planner in ("neural", "neural_refine") for planner in planners):
+        return None
+    if args.neural_checkpoint:
+        return args.neural_checkpoint
+    return download_neural_planner_checkpoint()
+
+
+def build_motion_generator(
+    robot: Robot,
+    planner_name: str,
+    checkpoint_path: str | None,
+    args: argparse.Namespace,
+) -> MotionGenerator:
+    """Create a motion generator for a planner backend."""
+    if planner_name == "ik_toppra":
+        planner_cfg = ToppraPlannerCfg(robot_uid=robot.uid)
+    elif planner_name in ("neural", "neural_refine"):
+        if checkpoint_path is None:
+            raise ValueError("checkpoint_path is required for neural planners.")
+        planner_cfg = NeuralPlannerCfg(
+            robot_uid=robot.uid,
+            planner_type=planner_name,
+            checkpoint_path=checkpoint_path,
+            control_part=ARM_NAME,
+            num_arm_joints=len(robot.get_joint_ids(ARM_NAME)),
+            pos_eps=args.nmg_pos_success_threshold,
+            rot_eps=args.nmg_rot_success_threshold,
+        )
+    else:
+        raise ValueError(f"Unsupported planner: {planner_name}")
+    return MotionGenerator(cfg=MotionGenCfg(planner_cfg=planner_cfg))
+
+
+def _franka_lower_upper(robot: Robot) -> tuple[torch.Tensor, torch.Tensor]:
+    """Return Franka arm qpos lower and upper limits."""
+    limits = robot.get_qpos_limits(name=ARM_NAME)[0].to(
+        device=robot.device, dtype=torch.float32
+    )
+    return limits[:, 0], limits[:, 1]
+
+
+def _qpos_from_ratio(
+    lower: torch.Tensor, upper: torch.Tensor, ratio_values: tuple[float, ...]
+) -> torch.Tensor:
+    """Create a qpos inside joint limits from normalized ratios."""
+    ratios = torch.tensor(ratio_values, dtype=torch.float32, device=lower.device)
+    return lower + ratios * (upper - lower)
+
+
+def make_seed_qpos_bank(robot: Robot) -> list[torch.Tensor]:
+    """Create deterministic reachable qpos candidates for waypoint trials."""
+    lower, upper = _franka_lower_upper(robot)
+    nominal = [
+        torch.tensor(FRANKA_START_QPOS, dtype=torch.float32, device=robot.device),
+        torch.tensor(FRANKA_REST_QPOS, dtype=torch.float32, device=robot.device),
+        _qpos_from_ratio(lower, upper, (0.50, 0.38, 0.55, 0.34, 0.52, 0.66, 0.60)),
+        _qpos_from_ratio(lower, upper, (0.46, 0.44, 0.47, 0.40, 0.58, 0.61, 0.45)),
+        _qpos_from_ratio(lower, upper, (0.55, 0.42, 0.43, 0.37, 0.47, 0.64, 0.53)),
+        _qpos_from_ratio(lower, upper, (0.48, 0.50, 0.60, 0.36, 0.44, 0.58, 0.57)),
+        _qpos_from_ratio(lower, upper, (0.53, 0.35, 0.50, 0.43, 0.61, 0.62, 0.49)),
+        _qpos_from_ratio(lower, upper, (0.44, 0.47, 0.54, 0.39, 0.40, 0.69, 0.51)),
+    ]
+    return [qpos.clamp(lower, upper) for qpos in nominal]
+
+
+def fk_pose(robot: Robot, qpos: torch.Tensor) -> torch.Tensor:
+    """Compute an unbatched Franka TCP pose."""
+    return robot.compute_fk(qpos=qpos.unsqueeze(0), name=ARM_NAME, to_matrix=True)[0]
+
+
+def make_trials(
+    robot: Robot,
+    *,
+    total_trials: int,
+    num_waypoints: int,
+    seed: int,
+) -> list[PlannerTrial]:
+    """Build deterministic reachable waypoint trials from FK-generated targets."""
+    bank = make_seed_qpos_bank(robot)
+    generator = torch.Generator(device="cpu")
+    generator.manual_seed(seed)
+    trials: list[PlannerTrial] = []
+    for trial_id in range(total_trials):
+        start_idx = trial_id % len(bank)
+        start_qpos = bank[start_idx]
+        candidates = torch.randperm(len(bank), generator=generator).tolist()
+        indices = [idx for idx in candidates if idx != start_idx]
+        while len(indices) < num_waypoints:
+            indices.extend(idx for idx in range(len(bank)) if idx != start_idx)
+        target_qpos = torch.stack([bank[idx] for idx in indices[:num_waypoints]])
+        waypoints = [fk_pose(robot, qpos) for qpos in target_qpos]
+        trials.append(
+            PlannerTrial(
+                trial_id=trial_id,
+                start_qpos=start_qpos.clone(),
+                target_qpos=target_qpos.clone(),
+                waypoints=waypoints,
+            )
+        )
+    return trials
+
+
+def _all_success(success: bool | torch.Tensor) -> bool:
+    """Return true when all success flags are true."""
+    if isinstance(success, torch.Tensor):
+        return bool(torch.all(success).item())
+    return bool(success)
+
+
+def plan_ik_toppra(
+    motion_gen: MotionGenerator,
+    trial: PlannerTrial,
+    *,
+    sample_interval: int,
+) -> PlanResult:
+    """Plan through EEF waypoints using sequential IK followed by TOPPRA."""
+    robot = motion_gen.robot
+    qpos_seed = trial.start_qpos.unsqueeze(0)
+    qpos_targets = [trial.start_qpos]
+    for waypoint in trial.waypoints:
+        success, qpos = robot.compute_ik(
+            pose=waypoint.unsqueeze(0),
+            joint_seed=qpos_seed,
+            name=ARM_NAME,
+            env_ids=[0],
+        )
+        if not _all_success(success):
+            return PlanResult(success=False, positions=None)
+        qpos_seed = qpos
+        qpos_targets.append(qpos.squeeze(0))
+
+    target_states = [
+        PlanState(move_type=MoveType.JOINT_MOVE, qpos=qpos) for qpos in qpos_targets
+    ]
+    return motion_gen.generate(
+        target_states=target_states,
+        options=ToppraMotionOptions(
+            start_qpos=trial.start_qpos,
+            control_part=ARM_NAME,
+            sample_interval=sample_interval,
+        ).to_motion_options(),
+    )
+
+
+@dataclass(frozen=True)
+class ToppraMotionOptions:
+    """Small helper to keep IK+TOPPRA runtime options explicit."""
+
+    start_qpos: torch.Tensor
+    control_part: str
+    sample_interval: int
+
+    def to_motion_options(self) -> MotionGenOptions:
+        return MotionGenOptions(
+            start_qpos=self.start_qpos,
+            control_part=self.control_part,
+            plan_opts=ToppraPlanOptions(sample_interval=self.sample_interval),
+        )
+
+
+def plan_neural(
+    motion_gen: MotionGenerator,
+    trial: PlannerTrial,
+    *,
+    planner_name: str,
+    sample_interval: int,
+) -> PlanResult:
+    """Plan through EEF waypoints using NeuralPlanner."""
+    target_states = [
+        PlanState(move_type=MoveType.EEF_MOVE, xpos=waypoint)
+        for waypoint in trial.waypoints
+    ]
+    result = motion_gen.generate(
+        target_states=target_states,
+        options=NeuralMotionOptions(
+            start_qpos=trial.start_qpos,
+            control_part=ARM_NAME,
+        ).to_motion_options(),
+    )
+    if (
+        planner_name == "neural_refine"
+        and result.positions is not None
+        and _all_success(result.success)
+    ):
+        seed = result.positions[-1].unsqueeze(0)
+        success, refined = motion_gen.robot.compute_ik(
+            pose=trial.waypoints[-1].unsqueeze(0),
+            joint_seed=seed,
+            name=ARM_NAME,
+            env_ids=[0],
+        )
+        if not _all_success(success):
+            return PlanResult(success=False, positions=result.positions)
+        result.positions = torch.cat([result.positions, refined], dim=0)
+
+    if result.positions is not None:
+        result.positions = interpolate_with_distance(
+            result.positions.unsqueeze(0),
+            interp_num=sample_interval,
+            device=motion_gen.device,
+        ).squeeze(0)
+    return result
+
+
+@dataclass(frozen=True)
+class NeuralMotionOptions:
+    """Small helper to keep neural runtime options explicit."""
+
+    start_qpos: torch.Tensor
+    control_part: str
+
+    def to_motion_options(self) -> MotionGenOptions:
+        return MotionGenOptions(
+            start_qpos=self.start_qpos,
+            control_part=self.control_part,
+            plan_opts=NeuralPlanOptions(
+                control_part=self.control_part,
+                start_qpos=self.start_qpos,
+                env_id=0,
+            ),
+        )
+
+
+def run_planner_once(
+    motion_gen: MotionGenerator,
+    planner_name: str,
+    trial: PlannerTrial,
+    args: argparse.Namespace,
+) -> PlannerOutcome:
+    """Run one timed planner call."""
+    _reset_peak_gpu_memory()
+    before = _memory_snapshot()
+    _sync_cuda()
+    start = time.perf_counter()
+    if planner_name == "ik_toppra":
+        result = plan_ik_toppra(
+            motion_gen,
+            trial,
+            sample_interval=args.sample_interval,
+        )
+    elif planner_name in ("neural", "neural_refine"):
+        result = plan_neural(
+            motion_gen,
+            trial,
+            planner_name=planner_name,
+            sample_interval=args.sample_interval,
+        )
+    else:
+        raise ValueError(f"Unsupported planner: {planner_name}")
+    _sync_cuda()
+    elapsed = time.perf_counter() - start
+    after = _memory_snapshot()
+    return PlannerOutcome(
+        action_success=_all_success(result.success),
+        positions=result.positions,
+        planning_time_sec=elapsed,
+        cpu_delta_mb=after["cpu_mb"] - before["cpu_mb"],
+        gpu_delta_mb=after["gpu_mb"] - before["gpu_mb"],
+        peak_gpu_mb=_peak_gpu_memory_mb(),
+    )
+
+
+def pose_error(
+    actual_pose: torch.Tensor, target_pose: torch.Tensor
+) -> tuple[float, float]:
+    """Return position and rotation error between pose matrices."""
+    actual_quat = quat_from_matrix(actual_pose[:3, :3].unsqueeze(0))[0]
+    target_quat = quat_from_matrix(target_pose[:3, :3].unsqueeze(0))[0]
+    pos_error = float(torch.linalg.norm(actual_pose[:3, 3] - target_pose[:3, 3]).item())
+    rot_error = float(
+        quat_error_magnitude(
+            target_quat.unsqueeze(0),
+            actual_quat.unsqueeze(0),
+        )[0].item()
+    )
+    return pos_error, rot_error
+
+
+def trajectory_quality(
+    robot: Robot, positions: torch.Tensor | None, trial: PlannerTrial
+) -> dict[str, object]:
+    """Compute final pose and joint path quality metrics."""
+    if positions is None or positions.numel() == 0:
+        return {
+            "trajectory_steps": 0,
+            "final_tcp_pos_error": None,
+            "final_tcp_rot_error": None,
+            "joint_path_length": 0.0,
+            "max_joint_step": 0.0,
+            "mean_target_qpos_error": None,
+            "final_qpos": None,
+        }
+    final_pose = fk_pose(robot, positions[-1])
+    final_pos, final_rot = pose_error(final_pose, trial.waypoints[-1])
+    deltas = torch.diff(positions, dim=0)
+    step_norms = (
+        torch.linalg.norm(deltas, dim=-1)
+        if deltas.numel()
+        else torch.zeros(1, device=robot.device)
+    )
+    target_errors = []
+    for target_qpos in trial.target_qpos:
+        dists = torch.linalg.norm(positions - target_qpos.unsqueeze(0), dim=-1)
+        target_errors.append(float(torch.min(dists).item()))
+    return {
+        "trajectory_steps": int(positions.shape[0]),
+        "final_tcp_pos_error": final_pos,
+        "final_tcp_rot_error": final_rot,
+        "joint_path_length": float(step_norms.sum().item()),
+        "max_joint_step": float(step_norms.max().item()),
+        "mean_target_qpos_error": sum(target_errors) / len(target_errors),
+        "final_qpos": [float(v) for v in positions[-1].detach().cpu().tolist()],
+    }
+
+
+def build_trial_row(
+    *,
+    planner: str,
+    trial: PlannerTrial,
+    warmup: bool,
+    outcome: PlannerOutcome,
+    robot: Robot,
+    args: argparse.Namespace,
+) -> dict[str, object]:
+    """Build a raw benchmark row."""
+    quality = trajectory_quality(robot, outcome.positions, trial)
+    final_pos = quality["final_tcp_pos_error"]
+    final_rot = quality["final_tcp_rot_error"]
+    strict_success = (
+        outcome.action_success
+        and final_pos is not None
+        and final_rot is not None
+        and float(final_pos) <= args.pos_success_threshold
+        and float(final_rot) <= args.rot_success_threshold
+    )
+    nmg_threshold_success = (
+        outcome.action_success
+        and final_pos is not None
+        and final_rot is not None
+        and float(final_pos) <= args.nmg_pos_success_threshold
+        and float(final_rot) <= args.nmg_rot_success_threshold
+    )
+    row: dict[str, object] = {
+        "script": SCRIPT_NAME,
+        "planner": planner,
+        "trial_id": trial.trial_id,
+        "warmup": warmup,
+        "num_waypoints": len(trial.waypoints),
+        "action_success": outcome.action_success,
+        "strict_pose_success": bool(strict_success),
+        "nmg_threshold_success": bool(nmg_threshold_success),
+        "planning_time_sec": outcome.planning_time_sec,
+        "cpu_delta_mb": outcome.cpu_delta_mb,
+        "gpu_delta_mb": outcome.gpu_delta_mb,
+        "peak_gpu_mb": outcome.peak_gpu_mb,
+    }
+    row.update(quality)
+    return row
+
+
+def _jsonable(value: Any) -> Any:
+    """Convert tensors and nested values to JSON-serializable values."""
+    if isinstance(value, torch.Tensor):
+        if value.numel() == 1:
+            return float(value.detach().cpu().item())
+        return value.detach().cpu().reshape(-1).tolist()
+    if isinstance(value, dict):
+        return {key: _jsonable(val) for key, val in value.items()}
+    if isinstance(value, (list, tuple)):
+        return [_jsonable(item) for item in value]
+    return value
+
+
+def write_raw_jsonl(path: str | None, row: dict[str, object]) -> None:
+    """Append one raw row when requested."""
+    if not path:
+        return
+    result_path = Path(path)
+    result_path.parent.mkdir(parents=True, exist_ok=True)
+    with result_path.open("a", encoding="utf-8") as f:
+        f.write(json.dumps(_jsonable(row), sort_keys=True) + "\n")
+
+
+def _numeric_values(rows: list[dict[str, object]], key: str) -> list[float]:
+    """Return numeric values for a row key."""
+    values = []
+    for row in rows:
+        value = row.get(key)
+        if value is None:
+            continue
+        values.append(float(value))
+    return values
+
+
+def _mean(values: list[float]) -> float | None:
+    """Return the arithmetic mean of values."""
+    if not values:
+        return None
+    return sum(values) / len(values)
+
+
+def _quantile(values: list[float], q: float) -> float | None:
+    """Return a linear-interpolated quantile."""
+    if not values:
+        return None
+    ordered = sorted(values)
+    if len(ordered) == 1:
+        return ordered[0]
+    index = (len(ordered) - 1) * q
+    lower = int(math.floor(index))
+    upper = int(math.ceil(index))
+    if lower == upper:
+        return ordered[lower]
+    weight = index - lower
+    return ordered[lower] * (1.0 - weight) + ordered[upper] * weight
+
+
+def _rate(rows: list[dict[str, object]], key: str) -> float | None:
+    """Return the true-rate for a boolean row key."""
+    values = [row.get(key) for row in rows if row.get(key) is not None]
+    if not values:
+        return None
+    return sum(bool(value) for value in values) / len(values)
+
+
+def _fmt_float(value: float | None, digits: int = 3) -> str:
+    """Format a possibly-missing float."""
+    if value is None:
+        return "-"
+    return f"{value:.{digits}f}"
+
+
+def _fmt_rate(value: float | None) -> str:
+    """Format a possibly-missing rate."""
+    if value is None:
+        return "-"
+    return f"{value * 100.0:.1f}%"
+
+
+def measured_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Return non-warmup rows."""
+    return [row for row in rows if not row.get("warmup", False)]
+
+
+def summarize_by_planner(
+    rows: list[dict[str, object]],
+) -> list[tuple[str, list[dict[str, object]]]]:
+    """Group measured rows by planner."""
+    measured = measured_rows(rows)
+    planners = sorted({str(row["planner"]) for row in measured})
+    return [
+        (planner, [row for row in measured if row["planner"] == planner])
+        for planner in planners
+    ]
+
+
+def make_perf_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Build Time & Memory report rows."""
+    perf_rows = []
+    for planner, group in summarize_by_planner(rows):
+        times = _numeric_values(group, "planning_time_sec")
+        perf_rows.append(
+            {
+                "planner": planner,
+                "repeat_count": len(group),
+                "cost_time_ms_mean": _fmt_float(
+                    None if _mean(times) is None else _mean(times) * 1000.0, 2
+                ),
+                "cost_time_ms_p50": _fmt_float(
+                    (
+                        None
+                        if _quantile(times, 0.5) is None
+                        else _quantile(times, 0.5) * 1000.0
+                    ),
+                    2,
+                ),
+                "cost_time_ms_p95": _fmt_float(
+                    (
+                        None
+                        if _quantile(times, 0.95) is None
+                        else _quantile(times, 0.95) * 1000.0
+                    ),
+                    2,
+                ),
+                "cpu_delta_mb": _fmt_float(
+                    _mean(_numeric_values(group, "cpu_delta_mb")), 2
+                ),
+                "gpu_delta_mb": _fmt_float(
+                    _mean(_numeric_values(group, "gpu_delta_mb")), 2
+                ),
+                "peak_gpu_mb": _fmt_float(
+                    _mean(_numeric_values(group, "peak_gpu_mb")), 2
+                ),
+            }
+        )
+    return perf_rows
+
+
+def make_metric_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Build Success & Other Metrics report rows."""
+    metric_rows = []
+    for planner, group in summarize_by_planner(rows):
+        final_pos = _mean(_numeric_values(group, "final_tcp_pos_error"))
+        final_rot = _mean(_numeric_values(group, "final_tcp_rot_error"))
+        metric_rows.append(
+            {
+                "planner": planner,
+                "action_success_rate": _fmt_rate(_rate(group, "action_success")),
+                "strict_pose_success_rate": _fmt_rate(
+                    _rate(group, "strict_pose_success")
+                ),
+                "nmg_threshold_success_rate": _fmt_rate(
+                    _rate(group, "nmg_threshold_success")
+                ),
+                "final_tcp_pos_err_mm": _fmt_float(
+                    None if final_pos is None else final_pos * 1000.0, 3
+                ),
+                "final_tcp_rot_err_deg": _fmt_float(
+                    None if final_rot is None else final_rot * 180.0 / math.pi, 3
+                ),
+                "joint_path_length": _fmt_float(
+                    _mean(_numeric_values(group, "joint_path_length")), 4
+                ),
+                "max_joint_step": _fmt_float(
+                    _mean(_numeric_values(group, "max_joint_step")), 4
+                ),
+                "mean_target_qpos_error": _fmt_float(
+                    _mean(_numeric_values(group, "mean_target_qpos_error")), 4
+                ),
+            }
+        )
+    return metric_rows
+
+
+def make_leaderboard_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Build leaderboard rows sorted by strict success then latency."""
+    leaderboard = []
+    for planner, group in summarize_by_planner(rows):
+        strict_rate = _rate(group, "strict_pose_success") or 0.0
+        loose_rate = _rate(group, "nmg_threshold_success") or 0.0
+        avg_time = _mean(_numeric_values(group, "planning_time_sec")) or 0.0
+        avg_pos = _mean(_numeric_values(group, "final_tcp_pos_error"))
+        leaderboard.append(
+            {
+                "planner": planner,
+                "strict_rate": strict_rate,
+                "loose_rate": loose_rate,
+                "avg_time_ms": avg_time * 1000.0,
+                "avg_pos_mm": None if avg_pos is None else avg_pos * 1000.0,
+            }
+        )
+    leaderboard.sort(
+        key=lambda row: (
+            -float(row["strict_rate"]),
+            -float(row["loose_rate"]),
+            float(row["avg_time_ms"]),
+        )
+    )
+    return [
+        {
+            "rank": rank,
+            "planner": row["planner"],
+            "overall_success_rate": _fmt_rate(float(row["strict_rate"])),
+            "nmg_threshold_success_rate": _fmt_rate(float(row["loose_rate"])),
+            "avg_cost_time_ms": _fmt_float(float(row["avg_time_ms"]), 2),
+            "avg_final_tcp_pos_err_mm": _fmt_float(row["avg_pos_mm"], 3),
+        }
+        for rank, row in enumerate(leaderboard, start=1)
+    ]
+
+
+def _markdown_table(rows: list[dict[str, object]]) -> list[str]:
+    """Format rows into a Markdown table."""
+    if not rows:
+        rows = [{"status": "No rows were produced."}]
+    headers = list(rows[0].keys())
+    lines = [
+        "| " + " | ".join(headers) + " |",
+        "| " + " | ".join(["---"] * len(headers)) + " |",
+    ]
+    for row in rows:
+        lines.append("| " + " | ".join(str(row[header]) for header in headers) + " |")
+    return lines
+
+
+def default_report_path() -> Path:
+    """Return the default timestamped Markdown report path."""
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    return Path("outputs/benchmarks") / f"{SCRIPT_NAME}_{timestamp}.md"
+
+
+def write_markdown_report(
+    rows: list[dict[str, object]], report_path: str | None = None
+) -> Path:
+    """Write the benchmark Markdown report with exactly three tables."""
+    path = Path(report_path) if report_path else default_report_path()
+    path.parent.mkdir(parents=True, exist_ok=True)
+    lines: list[str] = [
+        f"# {SCRIPT_NAME} Benchmark Report",
+        "",
+        f"Generated at: {datetime.now().isoformat(timespec='seconds')}",
+        "",
+        f"Downstream demo companion: {PICK_PLACE_SCRIPT_NAME}",
+        "",
+        "## Time & Memory",
+        "",
+        *_markdown_table(make_perf_rows(rows)),
+        "",
+        "## Success & Other Metrics",
+        "",
+        *_markdown_table(make_metric_rows(rows)),
+        "",
+        "## Leaderboard",
+        "",
+        *_markdown_table(make_leaderboard_rows(rows)),
+    ]
+    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+    return path
+
+
+def make_skipped_rows(
+    planners: list[str],
+    *,
+    reason: str,
+) -> list[dict[str, object]]:
+    """Build rows for a gracefully skipped live-simulation benchmark."""
+    rows = []
+    for planner in planners:
+        rows.append(
+            {
+                "script": SCRIPT_NAME,
+                "planner": planner,
+                "trial_id": 0,
+                "warmup": False,
+                "num_waypoints": 0,
+                "action_success": False,
+                "strict_pose_success": False,
+                "nmg_threshold_success": False,
+                "planning_time_sec": 0.0,
+                "cpu_delta_mb": 0.0,
+                "gpu_delta_mb": 0.0,
+                "peak_gpu_mb": 0.0,
+                "trajectory_steps": 0,
+                "final_tcp_pos_error": None,
+                "final_tcp_rot_error": None,
+                "joint_path_length": 0.0,
+                "max_joint_step": 0.0,
+                "mean_target_qpos_error": None,
+                "final_qpos": None,
+                "skip_reason": reason,
+            }
+        )
+    return rows
+
+
+def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
+    """Run the selected planner benchmark."""
+    validate_args(args)
+    torch.manual_seed(args.seed)
+    planners = expand_planner_selection(args.planner)
+
+    if args.save_raw_jsonl:
+        raw_path = Path(args.save_raw_jsonl)
+        raw_path.parent.mkdir(parents=True, exist_ok=True)
+        raw_path.write_text("", encoding="utf-8")
+
+    if simulation_requires_cuda(args) and not torch.cuda.is_available():
+        reason = "CUDA is unavailable, so the Franka planner benchmark was skipped."
+        rows = make_skipped_rows(planners, reason=reason)
+        for row in rows:
+            write_raw_jsonl(args.save_raw_jsonl, row)
+        write_markdown_report(rows, args.report_path)
+        return rows
+
+    checkpoint_path = resolve_checkpoint(args, planners)
+    sim = make_sim(args)
+    robot, _ = create_franka(sim)
+    sim.update(step=1)
+    total_trials = args.warmup_trials + args.num_trials
+    trials = make_trials(
+        robot,
+        total_trials=total_trials,
+        num_waypoints=args.num_waypoints,
+        seed=args.seed,
+    )
+    rows: list[dict[str, object]] = []
+    for planner in planners:
+        motion_gen = build_motion_generator(robot, planner, checkpoint_path, args)
+        for index, trial in enumerate(trials):
+            warmup = index < args.warmup_trials
+            robot.set_qpos(trial.start_qpos.unsqueeze(0), name=ARM_NAME, target=False)
+            robot.set_qpos(trial.start_qpos.unsqueeze(0), name=ARM_NAME, target=True)
+            outcome = run_planner_once(motion_gen, planner, trial, args)
+            row = build_trial_row(
+                planner=planner,
+                trial=trial,
+                warmup=warmup,
+                outcome=outcome,
+                robot=robot,
+                args=args,
+            )
+            rows.append(row)
+            write_raw_jsonl(args.save_raw_jsonl, row)
+
+    report_path = write_markdown_report(rows, args.report_path)
+    for row in make_leaderboard_rows(rows):
+        print(json.dumps(row, sort_keys=True))
+    print(f"Markdown benchmark report saved: {report_path}")
+    return rows
+
+
+def run_all_benchmarks() -> None:
+    """Parse CLI args and run the Franka planner benchmark."""
+    run_benchmark(parse_args())
+
+
+if __name__ == "__main__":
+    run_all_benchmarks()
diff --git a/tests/benchmark/robotics/nmg/test_franka_pick_place.py b/tests/benchmark/robotics/nmg/test_franka_pick_place.py
new file mode 100644
index 00000000..39cad0c9
--- /dev/null
+++ b/tests/benchmark/robotics/nmg/test_franka_pick_place.py
@@ -0,0 +1,590 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Tests for the Franka pick-place NMG benchmark helpers."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+import torch
+
+from embodichain.data import get_data_path
+from scripts.benchmark.robotics.nmg.franka_pick_place import (
+    ContactStats,
+    DEFAULT_FRANKA_TCP_YAW,
+    DEFAULT_FRANKA_TCP_Z,
+    DEFAULT_GRASP_AXIS,
+    DEFAULT_GRASP_DEPTH_OFFSET,
+    DEFAULT_GRIPPER_CLOSE_MARGIN,
+    FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
+    OBJECT_SUPPORT_MARGIN,
+    TABLE_COLLIDER_INIT_POS,
+    TABLE_COLLIDER_SIZE,
+    TABLE_INIT_POS,
+    TABLE_TOP_Z,
+    compute_attached_object_pose,
+    create_support_table,
+    estimate_gripper_opening_range,
+    estimate_object_grasp_width,
+    evaluate_grasp_preflight,
+    expand_mode_selection,
+    expand_planner_selection,
+    franka_hand_opening_from_finger_qpos,
+    grasp_axis_alignment_cost,
+    horizontal_bbox_axis,
+    initialize_pre_pick_robot_pose,
+    load_franka_hand_opening_spec,
+    empty_replay_diagnostics,
+    make_leaderboard_rows,
+    make_failed_trial_row,
+    make_skipped_rows,
+    physical_gpu_memory_skip_reason,
+    _record_contact_stats,
+    object_body_scale,
+    object_initial_position,
+    object_supported_z,
+    parse_args,
+    rerank_grasp_costs_by_axis,
+    resolve_object_body_scale,
+    resolve_hand_open_close_qpos,
+    simulation_requires_cuda,
+    write_markdown_report,
+)
+
+
+class _FakeSim:
+    def __init__(self):
+        self.cfg = None
+
+    def add_rigid_object(self, cfg):
+        self.cfg = cfg
+        return cfg
+
+
+class _FakeObject:
+    def __init__(self):
+        self.live_pose = torch.eye(4).unsqueeze(0)
+        self.live_pose[0, :3, 3] = torch.tensor([0.2, 0.1, -0.3])
+
+    def get_local_pose(self, to_matrix: bool = False):
+        assert to_matrix
+        return self.live_pose.clone()
+
+
+class _FakeRobotForPrePick:
+    device = torch.device("cpu")
+
+    def __init__(self):
+        self.ik_pose = None
+        self.qpos_set = []
+
+    def get_qpos(self, name=None):
+        if name == "main_arm":
+            return torch.zeros(1, 7)
+        return torch.zeros(1, 9)
+
+    def compute_fk(self, qpos, name, to_matrix):
+        pose = torch.eye(4).unsqueeze(0)
+        return pose
+
+    def compute_ik(self, pose, joint_seed, name):
+        self.ik_pose = pose.clone()
+        return torch.tensor([True]), torch.ones(1, 7)
+
+    def set_qpos(self, qpos, name=None, target=False):
+        self.qpos_set.append((name, target, qpos.clone()))
+
+    def clear_dynamics(self):
+        pass
+
+
+def _rows() -> list[dict[str, object]]:
+    return [
+        {
+            "planner": "ik_interpolate",
+            "mode": "planner",
+            "warmup": True,
+            "action_success": False,
+            "planner_success": False,
+            "physical_success": None,
+            "replay_success": None,
+            "preflight_failed": False,
+            "preflight_status": "ok",
+            "planning_time_sec": 10.0,
+            "cpu_delta_mb": 0.0,
+            "gpu_delta_mb": 0.0,
+            "peak_gpu_mb": 0.0,
+            "final_tcp_pos_error": 1.0,
+            "final_tcp_rot_error": 1.0,
+            "joint_path_length": 10.0,
+            "max_joint_step": 1.0,
+            "object_grasp_width_m": 0.05,
+            "gripper_min_opening_m": 0.0,
+            "gripper_max_opening_m": 0.08,
+            **empty_replay_diagnostics(),
+        },
+        {
+            "planner": "ik_interpolate",
+            "mode": "planner",
+            "warmup": False,
+            "action_success": True,
+            "planner_success": True,
+            "physical_success": None,
+            "replay_success": None,
+            "preflight_failed": False,
+            "preflight_status": "ok",
+            "planning_time_sec": 1.0,
+            "cpu_delta_mb": 1.0,
+            "gpu_delta_mb": 0.0,
+            "peak_gpu_mb": 0.0,
+            "final_tcp_pos_error": 0.001,
+            "final_tcp_rot_error": 0.01,
+            "joint_path_length": 0.5,
+            "max_joint_step": 0.1,
+            "object_grasp_width_m": 0.05,
+            "gripper_min_opening_m": 0.0,
+            "gripper_max_opening_m": 0.08,
+            **empty_replay_diagnostics(),
+        },
+        {
+            "planner": "neural",
+            "mode": "planner",
+            "warmup": False,
+            "action_success": False,
+            "planner_success": False,
+            "physical_success": None,
+            "replay_success": None,
+            "preflight_failed": True,
+            "preflight_status": "too_small",
+            "planning_time_sec": 0.2,
+            "cpu_delta_mb": 0.5,
+            "gpu_delta_mb": 2.0,
+            "peak_gpu_mb": 10.0,
+            "final_tcp_pos_error": 0.04,
+            "final_tcp_rot_error": 0.2,
+            "joint_path_length": 0.7,
+            "max_joint_step": 0.2,
+            "object_grasp_width_m": 0.02,
+            "gripper_min_opening_m": 0.08,
+            "gripper_max_opening_m": 0.16,
+            **empty_replay_diagnostics(),
+        },
+        {
+            "planner": "neural_refine",
+            "mode": "physical",
+            "warmup": False,
+            "action_success": True,
+            "planner_success": True,
+            "physical_success": True,
+            "replay_success": True,
+            "preflight_failed": False,
+            "preflight_status": "ok",
+            "planning_time_sec": 0.3,
+            "cpu_delta_mb": 0.5,
+            "gpu_delta_mb": 2.0,
+            "peak_gpu_mb": 10.0,
+            "final_tcp_pos_error": 0.002,
+            "final_tcp_rot_error": 0.02,
+            "joint_path_length": 0.8,
+            "max_joint_step": 0.2,
+            "object_grasp_width_m": 0.05,
+            "gripper_min_opening_m": 0.0,
+            "gripper_max_opening_m": 0.08,
+            "actual_min_hand_opening_m": 0.03,
+            "actual_close_hand_opening_m": 0.031,
+            "actual_final_hand_opening_m": 0.08,
+            "close_contact_count": 2,
+            "path_contact_count": 0,
+            "hold_contact_count": 0,
+            "max_contact_count": 2,
+            "max_contact_impulse": 0.4,
+            "max_total_contact_impulse": 0.7,
+            "min_contact_distance_m": -0.001,
+            "close_tcp_object_delta_m": [0.0, 0.0, -0.02],
+            "planned_grasp_object_delta_m": [0.0, 0.0, -0.02],
+            "close_leftfinger_object_delta_m": [0.0, 0.02, 0.0],
+            "close_rightfinger_object_delta_m": [0.0, -0.02, 0.0],
+            "close_tcp_x_axis": [1.0, 0.0, 0.0],
+            "close_tcp_y_axis": [0.0, 1.0, 0.0],
+            "close_tcp_z_axis": [0.0, 0.0, 1.0],
+        },
+    ]
+
+
+def test_cli_defaults_and_selection_expansion():
+    args = parse_args([])
+
+    assert args.mode == "planner"
+    assert args.object == "sugar_box"
+    assert args.planner == "all"
+    assert args.headless is True
+    assert args.object_replay_mode == "physics"
+    assert args.object_scale is None
+    assert args.grasp_axis == DEFAULT_GRASP_AXIS
+    assert args.tcp_z == pytest.approx(DEFAULT_FRANKA_TCP_Z)
+    assert args.tcp_yaw == pytest.approx(DEFAULT_FRANKA_TCP_YAW)
+    assert args.grasp_depth_offset == pytest.approx(DEFAULT_GRASP_DEPTH_OFFSET)
+    assert args.n_deviated_approach_directions == 1
+    assert args.gripper_close_margin == pytest.approx(DEFAULT_GRIPPER_CLOSE_MARGIN)
+    assert expand_planner_selection("all") == [
+        "ik_interpolate",
+        "neural",
+        "neural_refine",
+    ]
+    assert expand_planner_selection("neural") == ["neural"]
+    assert expand_mode_selection("both") == ["planner", "physical"]
+
+
+def test_cli_accepts_object_scale_and_short_axis_grasp():
+    args = parse_args(
+        [
+            "--object_scale",
+            "0.9",
+            "0.9",
+            "0.9",
+            "--grasp_axis",
+            "short",
+        ]
+    )
+
+    assert args.object_scale == pytest.approx([0.9, 0.9, 0.9])
+    assert args.grasp_axis == "short"
+
+
+def test_object_body_scale_does_not_downscale_cup_by_default():
+    assert object_body_scale("cup") == pytest.approx((1.0, 1.0, 1.0))
+
+
+def test_resolve_object_body_scale_uses_cli_override():
+    assert resolve_object_body_scale("sugar_box", [0.9, 0.8, 1.1]) == pytest.approx(
+        (0.9, 0.8, 1.1)
+    )
+
+
+def test_cube_initial_position_places_bottom_above_support_plane():
+    x, y, z = object_initial_position("cube")
+
+    assert (x, y) == pytest.approx((0.31, 0.0))
+    assert z == pytest.approx(TABLE_TOP_Z + 0.045 / 2.0 + OBJECT_SUPPORT_MARGIN)
+
+
+def test_object_supported_z_respects_scale_override():
+    z = object_supported_z("cube", [1.0, 1.0, 2.0])
+
+    assert z == pytest.approx(TABLE_TOP_Z + 0.045 + OBJECT_SUPPORT_MARGIN)
+
+
+def test_sugar_box_scale_uses_preset_value_by_default():
+    assert object_body_scale("sugar_box") == pytest.approx((1.0, 1.0, 1.0))
+
+
+def test_support_table_is_static_and_below_interaction_plane():
+    sim = _FakeSim()
+
+    cfg = create_support_table(sim)
+
+    assert cfg.body_type == "static"
+    assert cfg.uid == "support_table_collider"
+    assert cfg.shape.size == list(TABLE_COLLIDER_SIZE)
+    assert cfg.init_pos == TABLE_COLLIDER_INIT_POS
+    assert TABLE_TOP_Z == pytest.approx(0.0)
+
+
+def test_sugar_box_initial_position_places_bottom_above_support_plane():
+    x, y, z = object_initial_position("sugar_box")
+
+    assert (x, y) == pytest.approx((0.31, 0.0))
+    assert z > TABLE_TOP_Z
+    assert z == pytest.approx(object_supported_z("sugar_box"), abs=1e-6)
+    assert z - TABLE_TOP_Z > OBJECT_SUPPORT_MARGIN
+
+
+def test_resolve_hand_open_close_qpos_clamps_to_joint_limits():
+    limits = torch.tensor([[0.0, 0.04], [0.0, 0.04]], dtype=torch.float32)
+
+    hand_open, hand_close = resolve_hand_open_close_qpos(
+        limits,
+        open_qpos=0.08,
+        close_qpos=-0.01,
+    )
+
+    assert hand_open.tolist() == pytest.approx([0.04, 0.04])
+    assert hand_close.tolist() == pytest.approx([0.0, 0.0])
+
+
+def test_estimate_franka_opening_range_uses_collision_surfaces():
+    limits = torch.tensor([[0.0, 0.04], [0.0, 0.04]], dtype=torch.float32)
+    expected_min_opening = 0.08 - 2.0 * 0.0005448426818475127
+
+    min_opening, max_opening = estimate_gripper_opening_range(
+        get_data_path("Franka/Panda/PandaWithHand.urdf"),
+        limits,
+    )
+
+    assert min_opening == pytest.approx(expected_min_opening)
+    assert max_opening == pytest.approx(expected_min_opening + 0.08)
+
+
+def test_franka_hand_opening_from_finger_qpos_reports_physical_jaw_gap():
+    spec = load_franka_hand_opening_spec(
+        get_data_path("Franka/Panda/PandaWithHand.urdf")
+    )
+    finger_qpos = torch.tensor([[0.0, 0.0], [0.04, 0.04]], dtype=torch.float32)
+    expected_min_opening = 0.08 - 2.0 * 0.0005448426818475127
+
+    opening = franka_hand_opening_from_finger_qpos(finger_qpos, spec)
+
+    assert opening.tolist() == pytest.approx(
+        [expected_min_opening, expected_min_opening + 0.08]
+    )
+
+
+def test_estimate_object_grasp_width_uses_narrowest_horizontal_extent():
+    vertices = torch.tensor(
+        [
+            [-0.1, -0.03, 0.0],
+            [0.1, 0.04, 0.0],
+            [0.0, 0.02, 0.1],
+        ],
+        dtype=torch.float32,
+    )
+
+    assert estimate_object_grasp_width(vertices) == pytest.approx(0.07)
+
+
+def test_horizontal_bbox_axis_selects_short_and_long_axes():
+    vertices = torch.tensor(
+        [
+            [-0.1, -0.03, 0.0],
+            [0.1, 0.03, 0.0],
+            [0.0, 0.03, 0.1],
+        ],
+        dtype=torch.float32,
+    )
+
+    assert horizontal_bbox_axis(vertices, "short").tolist() == pytest.approx(
+        [0.0, 1.0, 0.0]
+    )
+    assert horizontal_bbox_axis(vertices, "long").tolist() == pytest.approx(
+        [1.0, 0.0, 0.0]
+    )
+
+
+def test_grasp_axis_rerank_supports_explicit_generator_x_axis():
+    poses = torch.eye(4, dtype=torch.float32).unsqueeze(0).repeat(2, 1, 1)
+    poses[0, :3, 0] = torch.tensor([1.0, 0.0, 0.0])
+    poses[1, :3, 0] = torch.tensor([0.0, 1.0, 0.0])
+    costs = torch.zeros(2, dtype=torch.float32)
+
+    reranked = rerank_grasp_costs_by_axis(
+        costs,
+        poses,
+        torch.tensor([1.0, 0.0, 0.0]),
+        axis_index=0,
+    )
+
+    assert grasp_axis_alignment_cost(
+        poses[:, :3, 0],
+        torch.tensor([1.0, 0.0, 0.0]),
+    ).tolist() == pytest.approx([0.0, 1.0])
+    assert reranked[0] < reranked[1]
+
+
+def test_grasp_axis_rerank_defaults_to_franka_closing_axis():
+    poses = torch.eye(4, dtype=torch.float32).unsqueeze(0).repeat(2, 1, 1)
+    poses[0, :3, 0] = torch.tensor([1.0, 0.0, 0.0])
+    poses[0, :3, 1] = torch.tensor([0.0, 1.0, 0.0])
+    poses[1, :3, 0] = torch.tensor([0.0, 1.0, 0.0])
+    poses[1, :3, 1] = torch.tensor([1.0, 0.0, 0.0])
+    costs = torch.zeros(2, dtype=torch.float32)
+
+    reranked = rerank_grasp_costs_by_axis(
+        costs,
+        poses,
+        torch.tensor([0.0, 1.0, 0.0]),
+    )
+
+    assert FRANKA_GRIPPER_CLOSING_AXIS_INDEX == 1
+    assert reranked[0] < reranked[1]
+
+
+def test_compute_attached_object_pose_inverts_object_to_eef_transform():
+    eef_pose = torch.eye(4)
+    eef_pose[:3, 3] = torch.tensor([0.4, -0.2, 0.3])
+    object_to_eef = torch.eye(4)
+    object_to_eef[:3, 3] = torch.tensor([0.1, 0.0, 0.02])
+
+    object_pose = compute_attached_object_pose(eef_pose, object_to_eef)
+
+    assert object_pose[:3, 3].tolist() == pytest.approx([0.3, -0.2, 0.28])
+
+
+def test_contact_stats_aggregation_tracks_impulse_and_distance_extrema():
+    max_count, max_impulse, max_total_impulse, min_distance = _record_contact_stats(
+        ContactStats(count=2, max_impulse=0.3, total_impulse=0.5, min_distance=-0.001),
+        max_count=0,
+        max_impulse=None,
+        max_total_impulse=None,
+        min_distance=None,
+    )
+
+    max_count, max_impulse, max_total_impulse, min_distance = _record_contact_stats(
+        ContactStats(count=1, max_impulse=0.2, total_impulse=0.7, min_distance=0.002),
+        max_count=max_count,
+        max_impulse=max_impulse,
+        max_total_impulse=max_total_impulse,
+        min_distance=min_distance,
+    )
+
+    assert max_count == 2
+    assert max_impulse == pytest.approx(0.3)
+    assert max_total_impulse == pytest.approx(0.7)
+    assert min_distance == pytest.approx(-0.001)
+
+
+def test_evaluate_grasp_preflight_marks_too_small_object():
+    preflight = evaluate_grasp_preflight(
+        object_grasp_width_m=0.03,
+        gripper_min_opening_m=0.08,
+        gripper_max_opening_m=0.16,
+    )
+
+    assert preflight.failed
+    assert preflight.status == "too_small"
+    assert "narrower" in preflight.reason
+
+
+def test_adaptive_franka_close_clamps_objects_below_minimum_opening():
+    from scripts.benchmark.robotics.nmg.franka_pick_place import (
+        resolve_adaptive_hand_close_qpos,
+    )
+
+    limits = torch.tensor([[0.0, 0.04], [0.0, 0.04]], dtype=torch.float32)
+
+    hand_close = resolve_adaptive_hand_close_qpos(
+        limits,
+        object_width_m=0.045,
+        margin_m=0.0,
+        franka_urdf_path=get_data_path("Franka/Panda/PandaWithHand.urdf"),
+    )
+
+    assert hand_close.tolist() == pytest.approx([0.0, 0.0])
+
+
+def test_simulation_cuda_preflight_and_skipped_rows():
+    args = parse_args(["--device", "cpu", "--renderer", "fast-rt"])
+
+    assert simulation_requires_cuda(args)
+    rows = make_skipped_rows(
+        ["ik_interpolate", "neural"],
+        ["planner"],
+        object_name="sugar_box",
+        reason="cuda missing",
+    )
+
+    assert len(rows) == 2
+    assert {row["planner"] for row in rows} == {"ik_interpolate", "neural"}
+    assert all(row["preflight_status"] == "skipped" for row in rows)
+    assert all(row["preflight_failed"] for row in rows)
+
+
+def test_initialize_pre_pick_uses_reference_pose_before_live_physics_drift():
+    robot = _FakeRobotForPrePick()
+    obj = _FakeObject()
+    reference_pose = torch.eye(4)
+    reference_pose[:3, 3] = torch.tensor([0.31, 0.0, 0.025])
+
+    initialize_pre_pick_robot_pose(
+        robot,
+        obj,
+        torch.tensor([0.04, 0.04]),
+        pre_pick_z=0.36,
+        reference_pose=reference_pose,
+    )
+
+    assert robot.ik_pose is not None
+    assert robot.ik_pose[0, :3, 3].tolist() == pytest.approx([0.31, 0.0, 0.36])
+
+
+def test_physical_gpu_memory_preflight_skips_before_dexsim_oom(monkeypatch):
+    from scripts.benchmark.robotics.nmg import franka_pick_place
+
+    monkeypatch.setattr(franka_pick_place, "free_gpu_memory_mb", lambda gpu_id: 512.0)
+    args = parse_args(["--mode", "physical", "--min_physical_gpu_free_mb", "1024"])
+
+    reason = physical_gpu_memory_skip_reason(args, ["physical"])
+
+    assert reason is not None
+    assert "free GPU memory" in reason
+
+
+def test_failed_trial_row_records_setup_failure():
+    row = make_failed_trial_row(
+        planner="ik_interpolate",
+        mode="planner",
+        object_name="sugar_box",
+        repeat_id=0,
+        warmup=False,
+        reason="pre-pick ik failed",
+    )
+
+    assert row["planner_success"] is False
+    assert row["preflight_status"] == "setup_failed"
+    assert row["preflight_failed"] is True
+    assert row["preflight_reason"] == "pre-pick ik failed"
+
+
+def test_write_markdown_report_has_exactly_three_tables(tmp_path):
+    report_path = tmp_path / "report.md"
+
+    written = write_markdown_report(_rows(), str(report_path))
+
+    text = written.read_text(encoding="utf-8")
+    assert written == report_path
+    assert text.count("## Time & Memory") == 1
+    assert text.count("## Success & Other Metrics") == 1
+    assert text.count("## Leaderboard") == 1
+    assert text.count("\n| ---") == 3
+    assert "## Notes" not in text
+
+
+def test_leaderboard_includes_all_planner_mode_pairs():
+    rows = make_leaderboard_rows(_rows())
+    pairs = {(row["planner"], row["mode"]) for row in rows}
+
+    assert pairs == {
+        ("ik_interpolate", "planner"),
+        ("neural", "planner"),
+        ("neural_refine", "physical"),
+    }
+    assert rows[0]["overall_success_rate"] == "100.0%"
+
+
+def test_benchmark_source_does_not_contain_invalid_hacks():
+    source_path = Path("scripts/benchmark/robotics/nmg/franka_pick_place.py")
+    text = source_path.read_text(encoding="utf-8")
+
+    forbidden = [
+        "semantic_held_object",
+        "patched_finger_origins",
+        "prepare_franka_grasp_urdf",
+        "cup_scale",
+        "set_local_pose(torch.bmm",
+    ]
+    for pattern in forbidden:
+        assert pattern not in text
diff --git a/tests/benchmark/robotics/nmg/test_franka_planner.py b/tests/benchmark/robotics/nmg/test_franka_planner.py
new file mode 100644
index 00000000..d1efc9d5
--- /dev/null
+++ b/tests/benchmark/robotics/nmg/test_franka_planner.py
@@ -0,0 +1,189 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Tests for the Franka planner NMG benchmark helpers."""
+
+from __future__ import annotations
+
+import torch
+
+from scripts.benchmark.robotics.nmg.franka_planner import (
+    PlannerOutcome,
+    PlannerTrial,
+    build_trial_row,
+    expand_planner_selection,
+    make_leaderboard_rows,
+    make_metric_rows,
+    make_perf_rows,
+    make_skipped_rows,
+    parse_args,
+    simulation_requires_cuda,
+    write_markdown_report,
+)
+
+
+class _FakeRobot:
+    device = torch.device("cpu")
+
+
+def _trial() -> PlannerTrial:
+    return PlannerTrial(
+        trial_id=0,
+        start_qpos=torch.zeros(7),
+        target_qpos=torch.zeros(1, 7),
+        waypoints=[torch.eye(4)],
+    )
+
+
+def _rows() -> list[dict[str, object]]:
+    return [
+        {
+            "script": "franka_planner_nmg",
+            "planner": "ik_toppra",
+            "trial_id": 0,
+            "warmup": True,
+            "num_waypoints": 1,
+            "action_success": False,
+            "strict_pose_success": False,
+            "nmg_threshold_success": False,
+            "planning_time_sec": 10.0,
+            "cpu_delta_mb": 0.0,
+            "gpu_delta_mb": 0.0,
+            "peak_gpu_mb": 0.0,
+            "trajectory_steps": 0,
+            "final_tcp_pos_error": None,
+            "final_tcp_rot_error": None,
+            "joint_path_length": 0.0,
+            "max_joint_step": 0.0,
+            "mean_target_qpos_error": None,
+            "final_qpos": None,
+        },
+        {
+            "script": "franka_planner_nmg",
+            "planner": "ik_toppra",
+            "trial_id": 1,
+            "warmup": False,
+            "num_waypoints": 1,
+            "action_success": True,
+            "strict_pose_success": True,
+            "nmg_threshold_success": True,
+            "planning_time_sec": 0.8,
+            "cpu_delta_mb": 1.0,
+            "gpu_delta_mb": 0.0,
+            "peak_gpu_mb": 0.0,
+            "trajectory_steps": 120,
+            "final_tcp_pos_error": 0.0005,
+            "final_tcp_rot_error": 0.01,
+            "joint_path_length": 0.5,
+            "max_joint_step": 0.1,
+            "mean_target_qpos_error": 0.0,
+            "final_qpos": [0.0] * 7,
+        },
+        {
+            "script": "franka_planner_nmg",
+            "planner": "neural",
+            "trial_id": 1,
+            "warmup": False,
+            "num_waypoints": 1,
+            "action_success": True,
+            "strict_pose_success": False,
+            "nmg_threshold_success": True,
+            "planning_time_sec": 0.2,
+            "cpu_delta_mb": 0.5,
+            "gpu_delta_mb": 2.0,
+            "peak_gpu_mb": 10.0,
+            "trajectory_steps": 120,
+            "final_tcp_pos_error": 0.02,
+            "final_tcp_rot_error": 0.2,
+            "joint_path_length": 0.7,
+            "max_joint_step": 0.2,
+            "mean_target_qpos_error": 0.1,
+            "final_qpos": [0.1] * 7,
+        },
+    ]
+
+
+def test_cli_defaults_and_selection_expansion():
+    args = parse_args([])
+
+    assert args.planner == "all"
+    assert args.num_trials == 8
+    assert expand_planner_selection("all") == [
+        "ik_toppra",
+        "neural",
+        "neural_refine",
+    ]
+    assert expand_planner_selection("neural") == ["neural"]
+
+
+def test_simulation_cuda_preflight():
+    args = parse_args(["--device", "cpu", "--renderer", "fast-rt"])
+
+    assert simulation_requires_cuda(args)
+    skipped = make_skipped_rows(["ik_toppra", "neural"], reason="cuda missing")
+
+    assert len(skipped) == 2
+    assert {row["planner"] for row in skipped} == {"ik_toppra", "neural"}
+    assert all(row["skip_reason"] == "cuda missing" for row in skipped)
+
+
+def test_report_builders_ignore_warmup_rows():
+    perf_rows = make_perf_rows(_rows())
+    metric_rows = make_metric_rows(_rows())
+    leaderboard_rows = make_leaderboard_rows(_rows())
+
+    assert {row["planner"] for row in perf_rows} == {"ik_toppra", "neural"}
+    assert all(row["repeat_count"] == 1 for row in perf_rows)
+    assert metric_rows[0]["strict_pose_success_rate"] == "100.0%"
+    assert leaderboard_rows[0]["planner"] == "ik_toppra"
+
+
+def test_write_markdown_report_has_exactly_three_tables(tmp_path):
+    report_path = tmp_path / "report.md"
+
+    written = write_markdown_report(_rows(), str(report_path))
+
+    text = written.read_text(encoding="utf-8")
+    assert written == report_path
+    assert text.count("## Time & Memory") == 1
+    assert text.count("## Success & Other Metrics") == 1
+    assert text.count("## Leaderboard") == 1
+    assert text.count("| planner |") == 3
+
+
+def test_build_trial_row_marks_failed_empty_trajectory():
+    outcome = PlannerOutcome(
+        action_success=False,
+        positions=None,
+        planning_time_sec=0.1,
+        cpu_delta_mb=0.0,
+        gpu_delta_mb=0.0,
+        peak_gpu_mb=0.0,
+    )
+
+    row = build_trial_row(
+        planner="neural",
+        trial=_trial(),
+        warmup=False,
+        outcome=outcome,
+        robot=_FakeRobot(),
+        args=parse_args([]),
+    )
+
+    assert row["action_success"] is False
+    assert row["strict_pose_success"] is False
+    assert row["nmg_threshold_success"] is False
+    assert row["trajectory_steps"] == 0
diff --git a/tests/sim/atomic_actions/test_trajectory.py b/tests/sim/atomic_actions/test_trajectory.py
index ef28ab8b..d94ecf20 100644
--- a/tests/sim/atomic_actions/test_trajectory.py
+++ b/tests/sim/atomic_actions/test_trajectory.py
@@ -23,6 +23,7 @@
 from unittest.mock import Mock, patch
 
 from embodichain.lab.sim.atomic_actions.trajectory import TrajectoryBuilder
+from embodichain.lab.sim.planners import MoveType, PlanResult, PlanState
 
 
 def _make_mock_motion_generator(num_envs: int = 2, arm_dof: int = 6) -> Mock:
@@ -35,7 +36,13 @@ def get_qpos(name=None):
 
     robot.get_qpos = get_qpos
 
-    def compute_ik(pose=None, qpos_seed=None, name=None, joint_seed=None):
+    def compute_ik(
+        pose=None,
+        qpos_seed=None,
+        name=None,
+        joint_seed=None,
+        env_ids=None,
+    ):
         seed = joint_seed if joint_seed is not None else qpos_seed
         if seed is None:
             seed = torch.zeros(num_envs, arm_dof)
@@ -52,6 +59,7 @@ def compute_fk(qpos=None, name=None, to_matrix=True):
     mg = Mock()
     mg.robot = robot
     mg.device = torch.device("cpu")
+    mg.planner.cfg.planner_type = "toppra"
     return mg
 
 
@@ -213,6 +221,186 @@ def test_interpolates_start_to_target(self):
         assert torch.equal(kwargs["trajectory"][:, 1, :], target)
 
 
+class TestPlanArmTraj:
+    def setup_method(self):
+        self.mg = _make_mock_motion_generator()
+        self.builder = TrajectoryBuilder(self.mg)
+
+    def test_non_neural_planner_uses_ik_path(self):
+        start = torch.zeros(2, 6)
+        target_states = [
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+        ]
+        expected = torch.ones(2, 5, 6)
+        with patch(
+            "embodichain.lab.sim.atomic_actions.trajectory.interpolate_with_distance",
+            return_value=expected,
+        ):
+            ok, out = self.builder.plan_arm_traj(
+                target_states,
+                start,
+                5,
+                control_part="arm",
+                arm_dof=6,
+            )
+
+        assert ok is True
+        assert out is expected
+        assert not self.mg.generate.called
+
+    def test_neural_planner_calls_motion_generator_per_env(self):
+        self.mg.planner.cfg.planner_type = "neural"
+        start = torch.zeros(2, 6)
+        target_states = [
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+        ]
+
+        def generate(target_states, options):
+            offset = float(self.mg.generate.call_count)
+            return PlanResult(
+                success=True,
+                positions=torch.stack(
+                    [
+                        options.start_qpos,
+                        options.start_qpos + offset,
+                    ]
+                ),
+            )
+
+        self.mg.generate.side_effect = generate
+
+        expected = torch.ones(2, 4, 6)
+        with patch(
+            "embodichain.lab.sim.atomic_actions.trajectory.interpolate_with_distance",
+            return_value=expected,
+        ):
+            ok, out = self.builder.plan_arm_traj(
+                target_states,
+                start,
+                4,
+                control_part="arm",
+                arm_dof=6,
+            )
+
+        assert ok is True
+        assert self.mg.generate.call_count == 2
+        assert out is expected
+
+    def test_neural_planner_failure_returns_false(self):
+        self.mg.planner.cfg.planner_type = "neural"
+        self.mg.generate.return_value = PlanResult(success=False, positions=None)
+        start = torch.zeros(2, 6)
+        target_states = [
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+        ]
+
+        ok, out = self.builder.plan_arm_traj(
+            target_states,
+            start,
+            4,
+            control_part="arm",
+            arm_dof=6,
+        )
+
+        assert ok is False
+        assert out.shape == (2, 4, 6)
+
+    def test_raw_neural_planner_does_not_call_final_ik_refine(self):
+        self.mg.planner.cfg.planner_type = "neural"
+        self.mg.robot.compute_ik = Mock()
+        start = torch.zeros(2, 6)
+        target_states = [
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+        ]
+        self.mg.generate.return_value = PlanResult(
+            success=True,
+            positions=torch.stack([torch.zeros(6), torch.ones(6)]),
+        )
+
+        expected = torch.ones(2, 4, 6)
+        with patch(
+            "embodichain.lab.sim.atomic_actions.trajectory.interpolate_with_distance",
+            return_value=expected,
+        ):
+            ok, out = self.builder.plan_arm_traj(
+                target_states,
+                start,
+                4,
+                control_part="arm",
+                arm_dof=6,
+            )
+
+        assert ok is True
+        assert out is expected
+        self.mg.robot.compute_ik.assert_not_called()
+
+    def test_neural_refine_appends_final_ik_waypoint(self):
+        self.mg.planner.cfg.planner_type = "neural_refine"
+        start = torch.zeros(2, 6)
+        target_states = [
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+        ]
+        self.mg.generate.return_value = PlanResult(
+            success=True,
+            positions=torch.stack([torch.zeros(6), torch.ones(6)]),
+        )
+        refined_qpos = torch.full((1, 6), 2.0)
+        self.mg.robot.compute_ik = Mock(
+            return_value=(torch.ones(1, dtype=torch.bool), refined_qpos)
+        )
+
+        expected = torch.ones(2, 4, 6)
+        with patch(
+            "embodichain.lab.sim.atomic_actions.trajectory.interpolate_with_distance",
+            return_value=expected,
+        ) as interpolate:
+            ok, out = self.builder.plan_arm_traj(
+                target_states,
+                start,
+                4,
+                control_part="arm",
+                arm_dof=6,
+            )
+
+        assert ok is True
+        assert out is expected
+        assert self.mg.robot.compute_ik.call_count == 2
+        _, kwargs = interpolate.call_args
+        assert kwargs["trajectory"].shape == (2, 3, 6)
+        assert torch.allclose(kwargs["trajectory"][:, -1, :], refined_qpos.expand(2, 6))
+
+    def test_neural_refine_ik_failure_returns_false(self):
+        self.mg.planner.cfg.planner_type = "neural_refine"
+        start = torch.zeros(2, 6)
+        target_states = [
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+            [PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+        ]
+        self.mg.generate.return_value = PlanResult(
+            success=True,
+            positions=torch.stack([torch.zeros(6), torch.ones(6)]),
+        )
+        self.mg.robot.compute_ik = Mock(
+            return_value=(torch.zeros(1, dtype=torch.bool), torch.zeros(1, 6))
+        )
+
+        ok, out = self.builder.plan_arm_traj(
+            target_states,
+            start,
+            4,
+            control_part="arm",
+            arm_dof=6,
+        )
+
+        assert ok is False
+        assert out.shape == (2, 4, 6)
+
+
 class TestIkSolve:
     def test_uses_first_env_seed_for_single_pose(self):
         mg = _make_mock_motion_generator(num_envs=3, arm_dof=6)
diff --git a/tests/sim/planners/test_neural_planner.py b/tests/sim/planners/test_neural_planner.py
index 878c54ff..e3cf29f4 100644
--- a/tests/sim/planners/test_neural_planner.py
+++ b/tests/sim/planners/test_neural_planner.py
@@ -81,10 +81,13 @@ class FakeRobot:
     num_instances = 1
 
     def get_qpos(self, name: str | None = None, target: bool = False) -> torch.Tensor:
-        return torch.zeros(1, NUM_ARM_JOINTS)
+        return torch.zeros(self.num_instances, NUM_ARM_JOINTS)
 
-    def get_qpos_limits(self, name: str | None = None) -> torch.Tensor:
-        limits = torch.zeros(1, NUM_ARM_JOINTS, 2)
+    def get_qpos_limits(
+        self, name: str | None = None, env_ids: list[int] | None = None
+    ) -> torch.Tensor:
+        batch = len(env_ids) if env_ids is not None else self.num_instances
+        limits = torch.zeros(batch, NUM_ARM_JOINTS, 2)
         limits[..., 0] = -2.0
         limits[..., 1] = 2.0
         return limits
@@ -93,6 +96,7 @@ def compute_fk(
         self,
         qpos: torch.Tensor,
         name: str | None = None,
+        env_ids: list[int] | None = None,
         to_matrix: bool = False,
         **kwargs,
     ) -> torch.Tensor:
@@ -110,19 +114,17 @@ def get_robot(self, uid: str) -> FakeRobot:
         return self.robot
 
 
-def test_neural_planner_is_registered():
-    assert MotionGenerator._support_planner_dict["neural"][0] is NeuralPlanner
-    assert MotionGenerator._support_planner_dict["neural"][1] is NeuralPlannerCfg
-
-
-def test_neural_planner_generate_with_fake_checkpoint(tmp_path, monkeypatch):
-    checkpoint_path = _create_fake_checkpoint(tmp_path)
+def _patch_sim_manager(monkeypatch) -> None:
     fake_sim = FakeSimulationManager()
     monkeypatch.setattr(
         SimulationManager, "get_instance", classmethod(lambda cls: fake_sim)
     )
 
-    motion_generator = MotionGenerator(
+
+def _make_motion_generator(tmp_path, monkeypatch) -> MotionGenerator:
+    _patch_sim_manager(monkeypatch)
+    checkpoint_path = _create_fake_checkpoint(tmp_path)
+    return MotionGenerator(
         cfg=MotionGenCfg(
             planner_cfg=NeuralPlannerCfg(
                 robot_uid="fake_robot",
@@ -132,9 +134,18 @@ def test_neural_planner_generate_with_fake_checkpoint(tmp_path, monkeypatch):
         )
     )
 
-    target_state = PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))
+
+def test_neural_planner_is_registered():
+    assert MotionGenerator._support_planner_dict["neural"][0] is NeuralPlanner
+    assert MotionGenerator._support_planner_dict["neural"][1] is NeuralPlannerCfg
+    assert MotionGenerator._support_planner_dict["neural_refine"][0] is NeuralPlanner
+    assert MotionGenerator._support_planner_dict["neural_refine"][1] is NeuralPlannerCfg
+
+
+def test_neural_planner_generate_with_fake_checkpoint(tmp_path, monkeypatch):
+    motion_generator = _make_motion_generator(tmp_path, monkeypatch)
     result = motion_generator.generate(
-        target_states=[target_state],
+        target_states=[PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
         options=MotionGenOptions(
             plan_opts=NeuralPlanOptions(
                 control_part="main_arm",
@@ -152,21 +163,7 @@ def test_neural_planner_generate_with_fake_checkpoint(tmp_path, monkeypatch):
 
 
 def test_neural_planner_uses_plan_opts_start_qpos(tmp_path, monkeypatch):
-    checkpoint_path = _create_fake_checkpoint(tmp_path)
-    fake_sim = FakeSimulationManager()
-    monkeypatch.setattr(
-        SimulationManager, "get_instance", classmethod(lambda cls: fake_sim)
-    )
-
-    motion_generator = MotionGenerator(
-        cfg=MotionGenCfg(
-            planner_cfg=NeuralPlannerCfg(
-                robot_uid="fake_robot",
-                checkpoint_path=checkpoint_path,
-                control_part="main_arm",
-            )
-        )
-    )
+    motion_generator = _make_motion_generator(tmp_path, monkeypatch)
     custom_qpos = torch.ones(NUM_ARM_JOINTS)
     result = motion_generator.generate(
         target_states=[PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
@@ -182,23 +179,45 @@ def test_neural_planner_uses_plan_opts_start_qpos(tmp_path, monkeypatch):
     assert torch.allclose(result.positions[0], custom_qpos)
 
 
-def test_neural_planner_rejects_short_start_qpos(tmp_path, monkeypatch):
-    checkpoint_path = _create_fake_checkpoint(tmp_path)
-    fake_sim = FakeSimulationManager()
-    monkeypatch.setattr(
-        SimulationManager, "get_instance", classmethod(lambda cls: fake_sim)
+def test_motion_generator_builds_default_neural_plan_options(tmp_path, monkeypatch):
+    motion_generator = _make_motion_generator(tmp_path, monkeypatch)
+    custom_qpos = torch.full((NUM_ARM_JOINTS,), 0.25)
+
+    result = motion_generator.generate(
+        target_states=[PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+        options=MotionGenOptions(
+            control_part="main_arm",
+            start_qpos=custom_qpos,
+        ),
     )
 
-    motion_generator = MotionGenerator(
-        cfg=MotionGenCfg(
-            planner_cfg=NeuralPlannerCfg(
-                robot_uid="fake_robot",
-                checkpoint_path=checkpoint_path,
-                control_part="main_arm",
-            )
-        )
+    assert result.success is True
+    assert torch.allclose(result.positions[0], custom_qpos)
+
+
+def test_motion_generator_does_not_mutate_user_plan_options(tmp_path, monkeypatch):
+    motion_generator = _make_motion_generator(tmp_path, monkeypatch)
+    custom_qpos = torch.full((NUM_ARM_JOINTS,), 0.5)
+    plan_opts = NeuralPlanOptions()
+
+    result = motion_generator.generate(
+        target_states=[PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+        options=MotionGenOptions(
+            control_part="main_arm",
+            start_qpos=custom_qpos,
+            plan_opts=plan_opts,
+        ),
     )
 
+    assert result.success is True
+    assert torch.allclose(result.positions[0], custom_qpos)
+    assert plan_opts.control_part is None
+    assert plan_opts.start_qpos is None
+
+
+def test_neural_planner_rejects_short_start_qpos(tmp_path, monkeypatch):
+    motion_generator = _make_motion_generator(tmp_path, monkeypatch)
+
     with pytest.raises(ValueError, match="policy expects"):
         motion_generator.generate(
             target_states=[PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
@@ -209,3 +228,51 @@ def test_neural_planner_rejects_short_start_qpos(tmp_path, monkeypatch):
                 ),
             ),
         )
+
+
+def test_neural_planner_rejects_joint_move(tmp_path, monkeypatch):
+    motion_generator = _make_motion_generator(tmp_path, monkeypatch)
+
+    with pytest.raises(ValueError, match="EEF_MOVE"):
+        motion_generator.generate(
+            target_states=[
+                PlanState(move_type=MoveType.JOINT_MOVE, qpos=torch.zeros(7))
+            ],
+            options=MotionGenOptions(
+                plan_opts=NeuralPlanOptions(
+                    control_part="main_arm",
+                    start_qpos=torch.zeros(NUM_ARM_JOINTS),
+                ),
+            ),
+        )
+
+
+def test_neural_planner_requires_env_id_for_multi_instance(tmp_path, monkeypatch):
+    motion_generator = _make_motion_generator(tmp_path, monkeypatch)
+    motion_generator.robot.num_instances = 2
+
+    with pytest.raises(ValueError, match="env_id is required"):
+        motion_generator.generate(
+            target_states=[PlanState(move_type=MoveType.EEF_MOVE, xpos=torch.eye(4))],
+            options=MotionGenOptions(
+                control_part="main_arm",
+                start_qpos=torch.zeros(NUM_ARM_JOINTS),
+            ),
+        )
+
+
+def test_neural_planner_rejects_checkpoint_dof_mismatch(tmp_path, monkeypatch):
+    _patch_sim_manager(monkeypatch)
+    checkpoint_path = _create_fake_checkpoint(tmp_path)
+
+    with pytest.raises(ValueError, match="num_arm_joints=6"):
+        MotionGenerator(
+            cfg=MotionGenCfg(
+                planner_cfg=NeuralPlannerCfg(
+                    robot_uid="fake_robot",
+                    checkpoint_path=checkpoint_path,
+                    control_part="main_arm",
+                    num_arm_joints=6,
+                )
+            )
+        )
diff --git a/tests/sim/solvers/test_pytorch_solver.py b/tests/sim/solvers/test_pytorch_solver.py
index 64bafee8..e337084a 100644
--- a/tests/sim/solvers/test_pytorch_solver.py
+++ b/tests/sim/solvers/test_pytorch_solver.py
@@ -15,14 +15,13 @@
 # ----------------------------------------------------------------------------
 
 import os
+import math
 import torch
 import pytest
 import numpy as np
 
-from embodichain.lab.sim import SimulationManager, SimulationManagerCfg
-from embodichain.lab.sim.objects import Robot
-from embodichain.lab.sim.cfg import RobotCfg, RenderCfg
 from embodichain.data import get_data_path
+from embodichain.utils.math import quat_error_magnitude, quat_from_matrix
 from embodichain.utils.utility import reset_all_seeds
 
 
@@ -65,11 +64,79 @@ def grid_sample_qpos_from_limits(
     return stacked
 
 
+def _franka_tcp() -> list[list[float]]:
+    c = math.cos(-math.pi / 4)
+    s = math.sin(-math.pi / 4)
+    return [
+        [c, -s, 0.0, 0.0],
+        [s, c, 0.0, 0.0],
+        [0.0, 0.0, 1.0, 0.1034],
+        [0.0, 0.0, 0.0, 1.0],
+    ]
+
+
+def _pose_error(
+    target_pose: torch.Tensor, actual_pose: torch.Tensor
+) -> tuple[float, float]:
+    pos_error = float(torch.linalg.norm(actual_pose[:3, 3] - target_pose[:3, 3]))
+    target_quat = quat_from_matrix(target_pose[:3, :3].unsqueeze(0))
+    actual_quat = quat_from_matrix(actual_pose[:3, :3].unsqueeze(0))
+    rot_error = float(quat_error_magnitude(target_quat, actual_quat)[0])
+    return pos_error, rot_error
+
+
+def test_pytorch_solver_ik_respects_rotated_tcp():
+    """FK->IK->FK should remain accurate when TCP contains rotation."""
+    from embodichain.lab.sim.solvers.pytorch_solver import PytorchSolverCfg
+
+    urdf = get_data_path("Franka/Panda/PandaWithHand.urdf")
+    cfg = PytorchSolverCfg(
+        urdf_path=urdf,
+        joint_names=[
+            "Joint1",
+            "Joint2",
+            "Joint3",
+            "Joint4",
+            "Joint5",
+            "Joint6",
+            "Joint7",
+        ],
+        end_link_name="ee_link",
+        root_link_name="base_link",
+        tcp=_franka_tcp(),
+        num_samples=30,
+    )
+    solver = cfg.init_solver(device=torch.device("cpu"))
+    start_qpos = torch.tensor(
+        [0.0, -math.pi / 4, 0.0, -3.0 * math.pi / 4, 0.0, math.pi / 2, math.pi / 4],
+        dtype=torch.float32,
+    )
+    target_qpos = start_qpos + torch.tensor(
+        [0.12, -0.08, 0.10, 0.06, -0.07, 0.08, -0.05],
+        dtype=torch.float32,
+    )
+
+    target_pose = solver.get_fk(target_qpos.unsqueeze(0))[0]
+    success, ik_qpos = solver.get_ik(
+        target_pose.unsqueeze(0),
+        qpos_seed=start_qpos.unsqueeze(0),
+    )
+    final_pose = solver.get_fk(ik_qpos[:, 0, :])[0]
+    pos_error, rot_error = _pose_error(target_pose, final_pose)
+
+    assert success.all()
+    assert pos_error < 1e-3
+    assert rot_error < 5e-3
+
+
 # Base test class for CPU and CUDA
 class BaseSolverTest:
     sim = None  # Define as a class attribute
 
     def setup_simulation(self, solver_type: str):
+        from embodichain.lab.sim import SimulationManager, SimulationManagerCfg
+        from embodichain.lab.sim.cfg import RobotCfg
+
         # Set up simulation with specified device (CPU or CUDA)
         config = SimulationManagerCfg(headless=True, sim_device="cpu")
         self.sim = SimulationManager(config)
@@ -101,7 +168,7 @@ def setup_simulation(self, solver_type: str):
             },
         }
 
-        self.robot: Robot = self.sim.add_robot(cfg=RobotCfg.from_dict(cfg_dict))
+        self.robot = self.sim.add_robot(cfg=RobotCfg.from_dict(cfg_dict))
 
         # Wait for robot to stabilize.
         self.sim.update(step=100)

From b4f5d97554288a27bd8867c3673762950cbeedf5 Mon Sep 17 00:00:00 2001
From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com>
Date: Mon, 29 Jun 2026 16:50:40 +0800
Subject: [PATCH 3/3] fix demo benchmark sugar_box bug

---
 .../overview/sim/planners/neural_planner.md   |  63 ++
 .../robotics/nmg/franka_pick_place.py         | 638 ++++++++++++++++--
 .../benchmark/robotics/nmg/franka_planner.py  | 372 ++++++++--
 .../robotics/nmg/test_franka_pick_place.py    | 242 ++++++-
 .../robotics/nmg/test_franka_planner.py       |  74 +-
 5 files changed, 1266 insertions(+), 123 deletions(-)

diff --git a/docs/source/overview/sim/planners/neural_planner.md b/docs/source/overview/sim/planners/neural_planner.md
index eef2d33e..f95a16aa 100644
--- a/docs/source/overview/sim/planners/neural_planner.md
+++ b/docs/source/overview/sim/planners/neural_planner.md
@@ -131,9 +131,72 @@ actions, compare several success notions:
 - `action_success`: the atomic action produced a trajectory.
 - `strict_pose_success`: the final TCP pose is within the script's strict
   threshold, defaulting to 1 mm and 0.05 rad.
+- `all_waypoint_strict_success`: every target waypoint is reached by some
+  trajectory sample within the strict threshold. Use this as the primary planner
+  quality signal for multi-waypoint NMG comparisons.
 - `nmg_threshold_success`: the final TCP pose is within the NMG waypoint
   threshold, defaulting to 5 cm and 0.3 rad.
 - downstream task success: the simulated task outcome after trajectory replay.
 
 Use `strict_pose_success`, `final_pos_error`, `final_rot_error`, and the
 downstream task outcome rather than `action_success` alone.
+
+## Benchmarks
+
+Use the Franka benchmark in three layers:
+
+1. Demo-matched planner benchmark: mirrors this example's fixed start qpos and
+   compact relative TCP offsets.
+2. Reachable-FK planner benchmark: uses a broader bank of FK-generated reachable
+   target poses.
+3. Atomic-action benchmark: runs the Franka `PickUp -> Place -> MoveEndEffector`
+   integration path with planner-only and optional physical replay modes.
+
+Run the first two layers with `franka_planner.py`:
+
+```bash
+PYTHONPATH="$PWD" conda run -n embodichain040 python -m scripts.benchmark.robotics.nmg.franka_planner \
+  --device cuda \
+  --planner all \
+  --trial_source demo_offsets \
+  --neural_checkpoint franka.pt \
+  --sample_interval 120
+```
+
+```bash
+PYTHONPATH="$PWD" conda run -n embodichain040 python -m scripts.benchmark.robotics.nmg.franka_planner \
+  --device cuda \
+  --planner all \
+  --trial_source fk_bank \
+  --neural_checkpoint franka.pt \
+  --sample_interval 120
+```
+
+Run the downstream atomic-action layer separately:
+
+```bash
+PYTHONPATH="$PWD" conda run -n embodichain040 python -m scripts.benchmark.robotics.nmg.franka_pick_place \
+  --device cuda \
+  --planner all \
+  --mode planner \
+  --neural_checkpoint franka.pt \
+  --object sugar_box \
+  --support_surface ground
+```
+
+For visual inspection, run the physical layer with `--open_window`. The
+`attached` object replay mode is useful when you want to inspect the planned
+held-object transform without requiring a successful contact grasp:
+
+```bash
+PYTHONPATH="$PWD" conda run -n embodichain040 python -m scripts.benchmark.robotics.nmg.franka_pick_place \
+  --device cuda \
+  --planner ik_interpolate \
+  --mode physical \
+  --neural_checkpoint franka.pt \
+  --object sugar_box \
+  --support_surface ground \
+  --object_replay_mode attached \
+  --replay_control target \
+  --open_window
+```
diff --git a/scripts/benchmark/robotics/nmg/franka_pick_place.py b/scripts/benchmark/robotics/nmg/franka_pick_place.py
index 7b21a66d..8eb6f79c 100644
--- a/scripts/benchmark/robotics/nmg/franka_pick_place.py
+++ b/scripts/benchmark/robotics/nmg/franka_pick_place.py
@@ -29,6 +29,7 @@
 import math
 import os
 import struct
+import sys
 import time
 import xml.etree.ElementTree as ET
 from dataclasses import dataclass
@@ -89,6 +90,11 @@
 )
 from embodichain.utils import logger
 from embodichain.utils.math import quat_error_magnitude, quat_from_matrix
+from scripts.tutorials.atomic_action.tutorial_utils import (
+    draw_axis_marker,
+    start_auto_play_recording,
+    stop_auto_play_recording,
+)
 
 SCRIPT_NAME = "franka_pick_place_nmg"
 ARM_NAME = "main_arm"
@@ -98,7 +104,7 @@
 TABLE_COLLIDER_UID = "support_table_collider"
 TABLE_MESH_PATH = "ShopTableSimple/shop_table_simple.ply"
 TABLE_MESH_TOP_Z = 0.8265
-TABLE_TOP_Z = 0.0
+TABLE_TOP_Z = 0.1
 TABLE_INIT_POS = (0.0, 0.0, TABLE_TOP_Z - TABLE_MESH_TOP_Z)
 TABLE_SCALE = (1.0, 1.0, 1.0)
 TABLE_COLLIDER_SIZE = (1.0, 1.0, 0.04)
@@ -108,6 +114,19 @@
     TABLE_TOP_Z - TABLE_COLLIDER_SIZE[2] / 2.0,
 )
 OBJECT_SUPPORT_MARGIN = 0.002
+TUTORIAL_PLACE_PROFILE = "tutorial_place"
+BENCHMARK_PROFILE = "benchmark"
+DEFAULT_DEMO_PROFILE = TUTORIAL_PLACE_PROFILE
+TUTORIAL_OBJECT_NAME = "sugar_box"
+TUTORIAL_OBJECT_XY = (-0.42, -0.08)
+TUTORIAL_OBJECT_INIT_Z = 0.05
+TUTORIAL_OBJECT_SCALE = (0.8, 0.8, 0.8)
+TUTORIAL_PRE_GRASP_DISTANCE = 0.15
+TUTORIAL_PICK_LIFT_HEIGHT = 0.16
+TUTORIAL_PLACE_LIFT_HEIGHT = 0.14
+BENCHMARK_PRE_GRASP_DISTANCE = 0.12
+BENCHMARK_PICK_LIFT_HEIGHT = 0.14
+BENCHMARK_PLACE_LIFT_HEIGHT = 0.14
 
 PICK_SAMPLE_INTERVAL = 120
 PLACE_SAMPLE_INTERVAL = 120
@@ -115,6 +134,7 @@
 HAND_INTERP_STEPS = 12
 POST_REPLAY_HOLD_STEPS = 80
 POST_GRASP_HOLD_STEPS = 80
+TUTORIAL_POST_TRAJECTORY_STEPS = 240
 
 DEFAULT_POS_THRESHOLD = 1e-3
 DEFAULT_ROT_THRESHOLD = 0.05
@@ -131,6 +151,8 @@
 DEFAULT_FRANKA_TCP_YAW = -math.pi / 4.0
 DEFAULT_GRASP_DEPTH_OFFSET = 0.0
 DEFAULT_MIN_PHYSICAL_GPU_FREE_MB = 2048.0
+DEFAULT_ARENA_SPACE = 2.5
+DEFAULT_OBJECT_SETTLE_STEPS = 10
 
 FRANKA_FINGER_LINK_NAMES = ("leftfinger", "rightfinger")
 FRANKA_GRIPPER_CLOSING_AXIS_INDEX = 1
@@ -166,8 +188,6 @@
         "mesh_path": "SugarBox/sugar_box_usd/sugar_box.usda",
         "init_pos": (0.31, 0.00, 0.0),
         "init_rot": (0.0, 0.0, 0.0),
-        # Default benchmark scale. Use --object_scale to sweep strict-physics
-        # grasp feasibility without editing this preset.
         "body_scale": (1.0, 1.0, 1.0),
         "mass": 0.05,
         "max_convex_hull_num": 16,
@@ -294,7 +314,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         description="Benchmark Franka PickUp -> Place with IK and NMG planners."
     )
     add_env_launcher_args_to_parser(parser)
-    parser.set_defaults(headless=True)
+    parser.set_defaults(headless=True, arena_space=DEFAULT_ARENA_SPACE)
     parser.add_argument(
         "--planner",
         choices=["ik_interpolate", "neural", "neural_refine", "all"],
@@ -316,12 +336,32 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         default="planner",
         help="Evaluation mode. 'both' runs separate planner and physical rows.",
     )
+    parser.add_argument(
+        "--run_kind",
+        choices=["auto", "demo", "benchmark"],
+        default="auto",
+        help=(
+            "Execution kind. 'demo' runs one tutorial-style PickUp -> Place "
+            "sequence without the benchmark loop. 'auto' uses demo when "
+            "--open_window is set, otherwise benchmark."
+        ),
+    )
     parser.add_argument(
         "--object",
         choices=sorted(OBJECT_PRESETS),
         default="sugar_box",
         help="Object preset used for the pick-place scene.",
     )
+    parser.add_argument(
+        "--demo_profile",
+        choices=[TUTORIAL_PLACE_PROFILE, BENCHMARK_PROFILE],
+        default=DEFAULT_DEMO_PROFILE,
+        help=(
+            "'tutorial_place' mirrors scripts/tutorials/atomic_action/place.py "
+            "as closely as possible while keeping the Franka robot. 'benchmark' "
+            "uses the original object-target benchmark sequence."
+        ),
+    )
     parser.add_argument(
         "--object_scale",
         type=float,
@@ -330,6 +370,28 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         metavar=("SX", "SY", "SZ"),
         help="Optional object body scale override for strict-physics experiments.",
     )
+    parser.add_argument(
+        "--object_xy",
+        type=float,
+        nargs=2,
+        default=None,
+        metavar=("X", "Y"),
+        help=(
+            "Optional world-frame object XY override. This affects both initial "
+            "creation and per-trial object reset."
+        ),
+    )
+    parser.add_argument(
+        "--support_surface",
+        choices=["ground", "table"],
+        default="ground",
+        help=(
+            "Support surface for the object. 'ground' matches the atomic-action "
+            "pickup tutorial and uses the SimulationManager default plane. "
+            "'table' creates the benchmark's legacy hidden table collider and "
+            "visual table mesh."
+        ),
+    )
     parser.add_argument(
         "--num_repeats",
         type=int,
@@ -368,6 +430,25 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         action="store_true",
         help="Force grasp region re-annotation instead of using cached data.",
     )
+    parser.add_argument(
+        "--open_window",
+        action="store_true",
+        help="Open the simulation viewer window for visual inspection.",
+    )
+    parser.add_argument(
+        "--auto_play",
+        action="store_true",
+        help=(
+            "Run the open-window tutorial profile without waiting for keyboard "
+            "inspection prompts."
+        ),
+    )
+    parser.add_argument(
+        "--inspect_seconds",
+        type=float,
+        default=0.0,
+        help="Optional seconds to keep updating the viewer before starting trials.",
+    )
     parser.add_argument(
         "--report_path",
         type=str,
@@ -461,6 +542,15 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
             "to the planned held-object transform after grasp for demo inspection."
         ),
     )
+    parser.add_argument(
+        "--demo_object_replay_mode",
+        choices=["physics", "attached"],
+        default="attached",
+        help=(
+            "Object replay mode used only by --run_kind demo. The benchmark loop "
+            "continues to use --object_replay_mode."
+        ),
+    )
     parser.add_argument(
         "--grasp_hold_steps",
         type=int,
@@ -524,7 +614,72 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
             "Set <= 0 to disable the preflight skip."
         ),
     )
-    return parser.parse_args(argv)
+    args = parser.parse_args(argv)
+    if args.open_window:
+        args.headless = False
+    if (
+        args.demo_profile == TUTORIAL_PLACE_PROFILE
+        and args.object != TUTORIAL_OBJECT_NAME
+    ):
+        logger.log_warning(
+            f"{TUTORIAL_PLACE_PROFILE} is calibrated for {TUTORIAL_OBJECT_NAME}; "
+            f"received --object {args.object}."
+        )
+    return args
+
+
+def should_pause_for_tutorial_inspection(args: argparse.Namespace) -> bool:
+    """Return whether the open-window tutorial profile should wait for inspection."""
+    return (
+        not args.headless
+        and args.demo_profile == TUTORIAL_PLACE_PROFILE
+        and not args.auto_play
+    )
+
+
+def pause_for_tutorial_inspection(args: argparse.Namespace) -> None:
+    """Pause before planning when the tutorial viewer runs in an interactive shell."""
+    if not should_pause_for_tutorial_inspection(args):
+        return
+    if not sys.stdin.isatty():
+        logger.log_warning(
+            f"{SCRIPT_NAME}: --open_window tutorial inspection prompt skipped "
+            "because stdin is not interactive. Use --auto_play to suppress this "
+            "warning explicitly."
+        )
+        return
+    input("Inspect the object, then press Enter to plan PickUp -> Place...")
+
+
+def inspect_viewer_before_trials(
+    sim: SimulationManager, args: argparse.Namespace
+) -> None:
+    """Keep the viewer alive briefly before benchmark trials start."""
+    if args.inspect_seconds <= 0.0:
+        return
+    steps = max(1, int(round(args.inspect_seconds / (1.0 / 100.0))))
+    sim.update(step=steps)
+
+
+def effective_replay_control(args: argparse.Namespace) -> str:
+    """Return the replay control mode used for this profile."""
+    if args.demo_profile == TUTORIAL_PLACE_PROFILE:
+        return "direct"
+    return args.replay_control
+
+
+def effective_final_hold_steps(args: argparse.Namespace) -> int:
+    """Return the number of final hold iterations used after replay."""
+    if args.demo_profile == TUTORIAL_PLACE_PROFILE:
+        return TUTORIAL_POST_TRAJECTORY_STEPS
+    return args.hold_steps + POST_REPLAY_HOLD_STEPS
+
+
+def effective_pre_plan_hold_steps(args: argparse.Namespace) -> int:
+    """Return physics hold steps inserted before planning."""
+    if args.demo_profile == TUTORIAL_PLACE_PROFILE:
+        return 0
+    return max(args.hold_steps, 1)
 
 
 def validate_args(args: argparse.Namespace) -> None:
@@ -551,6 +706,8 @@ def validate_args(args: argparse.Namespace) -> None:
         raise ValueError("--gripper_close_margin must be >= 0.")
     if args.hold_steps < 0:
         raise ValueError("--hold_steps must be >= 0.")
+    if args.inspect_seconds < 0.0:
+        raise ValueError("--inspect_seconds must be >= 0.")
     if args.min_physical_gpu_free_mb < 0.0:
         raise ValueError("--min_physical_gpu_free_mb must be >= 0.")
 
@@ -621,12 +778,15 @@ def _franka_tcp(
 def resolve_object_body_scale(
     object_name: str,
     object_scale: tuple[float, float, float] | list[float] | None = None,
+    demo_profile: str = DEFAULT_DEMO_PROFILE,
 ) -> tuple[float, float, float]:
     """Return the benchmark scale for an object preset or CLI override."""
     if object_scale is not None:
         if len(object_scale) != 3:
             raise ValueError(f"object_scale must have 3 values, got {object_scale}")
         scale = tuple(float(v) for v in object_scale)
+    elif demo_profile == TUTORIAL_PLACE_PROFILE and object_name == TUTORIAL_OBJECT_NAME:
+        scale = TUTORIAL_OBJECT_SCALE
     else:
         scale = tuple(float(v) for v in OBJECT_PRESETS[object_name]["body_scale"])
     if any(v <= 0.0 for v in scale):
@@ -642,10 +802,11 @@ def object_body_scale(object_name: str) -> tuple[float, float, float]:
 def object_supported_z(
     object_name: str,
     object_scale: tuple[float, float, float] | list[float] | None = None,
+    demo_profile: str = DEFAULT_DEMO_PROFILE,
 ) -> float:
-    """Return the object origin z that places its scaled bottom on the table."""
+    """Return the object origin z that places its scaled bottom on the support plane."""
     preset = OBJECT_PRESETS[object_name]
-    scale = resolve_object_body_scale(object_name, object_scale)
+    scale = resolve_object_body_scale(object_name, object_scale, demo_profile)
     if preset.get("shape") == "cube":
         size_z = float(preset["size"][2]) * scale[2]
         return TABLE_TOP_Z + size_z / 2.0 + OBJECT_SUPPORT_MARGIN
@@ -656,20 +817,50 @@ def object_supported_z(
 def object_initial_position(
     object_name: str,
     object_scale: tuple[float, float, float] | list[float] | None = None,
+    support_surface: str = "ground",
+    demo_profile: str = DEFAULT_DEMO_PROFILE,
+    object_xy: tuple[float, float] | list[float] | None = None,
 ) -> tuple[float, float, float]:
-    """Return the benchmark object position with support-plane z compensation."""
+    """Return the benchmark object position."""
+    if support_surface not in ("ground", "table"):
+        raise ValueError(f"Unsupported support_surface: {support_surface}")
     pos = OBJECT_PRESETS[object_name]["init_pos"]
-    return (float(pos[0]), float(pos[1]), object_supported_z(object_name, object_scale))
+    xy = None if object_xy is None else (float(object_xy[0]), float(object_xy[1]))
+    if demo_profile == TUTORIAL_PLACE_PROFILE and object_name == TUTORIAL_OBJECT_NAME:
+        xy = xy or (float(TUTORIAL_OBJECT_XY[0]), float(TUTORIAL_OBJECT_XY[1]))
+        if support_surface == "ground":
+            return (
+                xy[0],
+                xy[1],
+                TUTORIAL_OBJECT_INIT_Z,
+            )
+        pos = (xy[0], xy[1], pos[2])
+    elif xy is not None:
+        pos = (xy[0], xy[1], pos[2])
+    return (
+        float(pos[0]),
+        float(pos[1]),
+        object_supported_z(object_name, object_scale, demo_profile),
+    )
 
 
 def object_initial_pose(
     object_name: str,
     device: torch.device,
     object_scale: tuple[float, float, float] | list[float] | None = None,
+    support_surface: str = "ground",
+    demo_profile: str = DEFAULT_DEMO_PROFILE,
+    object_xy: tuple[float, float] | list[float] | None = None,
 ) -> torch.Tensor:
     """Return the initial object pose as an unbatched 7D wxyz pose tensor."""
     preset = OBJECT_PRESETS[object_name]
-    pos = object_initial_position(object_name, object_scale)
+    pos = object_initial_position(
+        object_name,
+        object_scale,
+        support_surface,
+        demo_profile,
+        object_xy,
+    )
     yaw = math.radians(float(preset["init_rot"][2]))
     return torch.tensor(
         [
@@ -1148,33 +1339,44 @@ def create_object(
     sim: SimulationManager,
     object_name: str,
     object_scale: tuple[float, float, float] | list[float] | None = None,
+    support_surface: str = "ground",
+    demo_profile: str = DEFAULT_DEMO_PROFILE,
+    object_xy: tuple[float, float] | list[float] | None = None,
 ) -> RigidObject:
     """Create the benchmark object."""
     preset = OBJECT_PRESETS[object_name]
-    scale = resolve_object_body_scale(object_name, object_scale)
+    scale = resolve_object_body_scale(object_name, object_scale, demo_profile)
     shape = (
         CubeCfg(size=list(preset["size"]))
         if preset.get("shape") == "cube"
         else MeshCfg(fpath=get_data_path(preset["mesh_path"]))
     )
+    attrs_kwargs = {
+        "mass": preset["mass"],
+        "dynamic_friction": 0.97,
+        "static_friction": 0.99,
+        "enable_ccd": True,
+    }
+    if demo_profile != TUTORIAL_PLACE_PROFILE:
+        attrs_kwargs["contact_offset"] = 0.004
     cfg = RigidObjectCfg(
         uid=preset["label"],
         shape=shape,
-        attrs=RigidBodyAttributesCfg(
-            mass=preset["mass"],
-            dynamic_friction=0.97,
-            static_friction=0.99,
-            enable_ccd=True,
-            contact_offset=0.004,
-        ),
+        attrs=RigidBodyAttributesCfg(**attrs_kwargs),
         max_convex_hull_num=preset["max_convex_hull_num"],
-        init_pos=object_initial_position(object_name, scale),
+        init_pos=object_initial_position(
+            object_name,
+            scale,
+            support_surface,
+            demo_profile,
+            object_xy,
+        ),
         init_rot=preset["init_rot"],
         body_scale=scale,
         use_usd_properties=preset["use_usd_properties"],
     )
     obj = sim.add_rigid_object(cfg=cfg)
-    sim.update(step=20)
+    sim.update(step=DEFAULT_OBJECT_SETTLE_STEPS)
     return obj
 
 
@@ -1182,9 +1384,19 @@ def reset_object_pose(
     obj: RigidObject,
     object_name: str,
     object_scale: tuple[float, float, float] | list[float] | None = None,
+    support_surface: str = "ground",
+    demo_profile: str = DEFAULT_DEMO_PROFILE,
+    object_xy: tuple[float, float] | list[float] | None = None,
 ) -> torch.Tensor:
     """Reset the benchmark object to its initial pose and return it as a matrix."""
-    pose = object_initial_pose(object_name, obj.device, object_scale).unsqueeze(0)
+    pose = object_initial_pose(
+        object_name,
+        obj.device,
+        object_scale,
+        support_surface,
+        demo_profile,
+        object_xy,
+    ).unsqueeze(0)
     obj.set_local_pose(pose, env_ids=[0])
     obj.clear_dynamics(env_ids=[0])
     return obj.get_local_pose(to_matrix=True)[0].clone()
@@ -1219,6 +1431,22 @@ def make_rest_pose(robot: Robot) -> torch.Tensor:
     return compute_fk_pose(robot, FRANKA_REST_QPOS)
 
 
+def make_tutorial_place_eef_pose(device: torch.device) -> torch.Tensor:
+    """Return the fixed Place target pose used by the atomic-action tutorial."""
+    pose = torch.eye(4, dtype=torch.float32, device=device)
+    pose[:3, :3] = torch.tensor(
+        [
+            [-0.0539, -0.9985, -0.0022],
+            [-0.9977, 0.0540, -0.0401],
+            [0.0401, 0.0000, -0.9992],
+        ],
+        dtype=torch.float32,
+        device=device,
+    )
+    pose[:3, 3] = torch.tensor([-0.20, 0.28, 0.1], dtype=torch.float32, device=device)
+    return pose
+
+
 def make_object_target_pose(obj: RigidObject) -> torch.Tensor:
     """Return a desired object placement pose derived from the live object pose."""
     target = obj.get_local_pose(to_matrix=True)[0].clone()
@@ -1230,6 +1458,17 @@ def make_object_target_pose(obj: RigidObject) -> torch.Tensor:
     return target
 
 
+def make_tutorial_retract_eef_pose(place_eef_pose: torch.Tensor) -> torch.Tensor:
+    """Return the final retract pose produced by the tutorial Place action."""
+    retract_pose = place_eef_pose.clone()
+    retract_pose[:3, 3] += torch.tensor(
+        [0.0, 0.0, TUTORIAL_PLACE_LIFT_HEIGHT],
+        dtype=torch.float32,
+        device=place_eef_pose.device,
+    )
+    return retract_pose
+
+
 def make_pre_pick_eef_pose(robot: Robot, position: torch.Tensor) -> torch.Tensor:
     """Return a pre-pick TCP pose using the current TCP orientation."""
     pose = robot.compute_fk(
@@ -1295,6 +1534,22 @@ def build_grasp_generator_cfg(
     args: argparse.Namespace, preflight: GraspPreflight
 ) -> GraspGeneratorCfg:
     """Build grasp annotation config."""
+    cfg = GraspGeneratorCfg(
+        viser_port=11801,
+        antipodal_sampler_cfg=AntipodalSamplerCfg(
+            n_sample=args.n_sample,
+            max_length=preflight.gripper_max_opening_m,
+            min_length=(
+                0.003
+                if args.demo_profile == TUTORIAL_PLACE_PROFILE
+                else max(0.003, preflight.gripper_min_opening_m)
+            ),
+        ),
+        is_partial_annotate=False,
+        is_filter_ground_collision=args.support_surface == "table",
+    )
+    if args.demo_profile == TUTORIAL_PLACE_PROFILE:
+        return cfg
     return GraspGeneratorCfg(
         viser_port=11801,
         antipodal_sampler_cfg=AntipodalSamplerCfg(
@@ -1303,7 +1558,7 @@ def build_grasp_generator_cfg(
             min_length=max(0.003, preflight.gripper_min_opening_m),
         ),
         is_partial_annotate=False,
-        is_filter_ground_collision=True,
+        is_filter_ground_collision=args.support_surface == "table",
         n_deviated_approach_directions=args.n_deviated_approach_directions,
         n_top_grasps=30,
     )
@@ -1465,14 +1720,23 @@ def build_engine(
         close_margin_m=args.gripper_close_margin,
         franka_urdf_path=franka_urdf_path,
     )
+    tutorial_profile = args.demo_profile == TUTORIAL_PLACE_PROFILE
     pickup_cfg = PickUpCfg(
         control_part=ARM_NAME,
         hand_control_part=HAND_NAME,
         hand_open_qpos=hand_open,
         hand_close_qpos=hand_close,
         approach_direction=resolve_approach_direction(args, robot.device),
-        pre_grasp_distance=0.12,
-        lift_height=0.14,
+        pre_grasp_distance=(
+            TUTORIAL_PRE_GRASP_DISTANCE
+            if tutorial_profile
+            else BENCHMARK_PRE_GRASP_DISTANCE
+        ),
+        lift_height=(
+            TUTORIAL_PICK_LIFT_HEIGHT
+            if tutorial_profile
+            else BENCHMARK_PICK_LIFT_HEIGHT
+        ),
         sample_interval=PICK_SAMPLE_INTERVAL,
         hand_interp_steps=HAND_INTERP_STEPS,
     )
@@ -1481,7 +1745,11 @@ def build_engine(
         hand_control_part=HAND_NAME,
         hand_open_qpos=hand_open,
         hand_close_qpos=hand_close,
-        lift_height=0.14,
+        lift_height=(
+            TUTORIAL_PLACE_LIFT_HEIGHT
+            if tutorial_profile
+            else BENCHMARK_PLACE_LIFT_HEIGHT
+        ),
         sample_interval=PLACE_SAMPLE_INTERVAL,
         hand_interp_steps=HAND_INTERP_STEPS,
     )
@@ -1507,10 +1775,12 @@ def build_engine(
 def plan_sequence(
     engine: AtomicActionEngine,
     semantics: ObjectSemantics,
-    object_target_pose: torch.Tensor,
+    target_pose: torch.Tensor,
     rest_pose: torch.Tensor,
+    *,
+    demo_profile: str,
 ) -> PlanOutcome:
-    """Plan the full PickUp -> Place -> MoveEndEffector sequence."""
+    """Plan the requested PickUp/Place sequence."""
     pick_success, pick_traj, pick_state = engine.run(
         steps=[("pick_up", GraspTarget(semantics=semantics))]
     )
@@ -1523,20 +1793,22 @@ def plan_sequence(
             rest_eef_pose=rest_pose,
         )
 
-    object_to_eef = pick_state.held_object.object_to_eef.to(
-        device=object_target_pose.device, dtype=torch.float32
-    )
-    if object_to_eef.shape == (4, 4):
-        object_to_eef = object_to_eef.unsqueeze(0)
-    place_pose = torch.bmm(object_target_pose.unsqueeze(0), object_to_eef)[0]
-
-    finish_success, finish_traj, _ = engine.run(
-        steps=[
+    if demo_profile == TUTORIAL_PLACE_PROFILE:
+        place_pose = target_pose
+        finish_steps = [("place", EndEffectorPoseTarget(xpos=place_pose))]
+    else:
+        object_to_eef = pick_state.held_object.object_to_eef.to(
+            device=target_pose.device, dtype=torch.float32
+        )
+        if object_to_eef.shape == (4, 4):
+            object_to_eef = object_to_eef.unsqueeze(0)
+        place_pose = torch.bmm(target_pose.unsqueeze(0), object_to_eef)[0]
+        finish_steps = [
             ("place", EndEffectorPoseTarget(xpos=place_pose)),
             ("move_end_effector", EndEffectorPoseTarget(xpos=rest_pose)),
-        ],
-        state=pick_state,
-    )
+        ]
+
+    finish_success, finish_traj, _ = engine.run(steps=finish_steps, state=pick_state)
     trajectory = torch.cat([pick_traj, finish_traj], dim=1)
     return PlanOutcome(
         success=finish_success,
@@ -1579,15 +1851,23 @@ def _memory_snapshot() -> dict[str, float]:
 def timed_plan_sequence(
     engine: AtomicActionEngine,
     semantics: ObjectSemantics,
-    object_target_pose: torch.Tensor,
+    target_pose: torch.Tensor,
     rest_pose: torch.Tensor,
+    *,
+    demo_profile: str,
 ) -> tuple[PlanOutcome, float, dict[str, float], float]:
     """Plan once and return time, memory delta, and peak GPU memory."""
     _reset_peak_gpu_memory()
     before = _memory_snapshot()
     _sync_cuda()
     start = time.perf_counter()
-    outcome = plan_sequence(engine, semantics, object_target_pose, rest_pose)
+    outcome = plan_sequence(
+        engine,
+        semantics,
+        target_pose,
+        rest_pose,
+        demo_profile=demo_profile,
+    )
     _sync_cuda()
     elapsed = time.perf_counter() - start
     after = _memory_snapshot()
@@ -1945,9 +2225,11 @@ def replay_trajectory(
         if close_end_idx is None
         else object_delta_from_tcp(robot, obj, traj[:, close_end_idx, :])
     )
+    should_clear_object_dynamics = args.demo_profile == TUTORIAL_PLACE_PROFILE
+    replay_control = effective_replay_control(args)
     try:
         for idx in range(traj.shape[1]):
-            _set_replay_qpos(robot, traj[:, idx, :], mode=args.replay_control)
+            _set_replay_qpos(robot, traj[:, idx, :], mode=replay_control)
             if (
                 args.object_replay_mode == "attached"
                 and close_end_idx is not None
@@ -2010,7 +2292,16 @@ def replay_trajectory(
                     repeat_id=repeat_id,
                     label="after_close",
                 )
-                sim.update(step=args.grasp_hold_steps + POST_GRASP_HOLD_STEPS)
+                if should_clear_object_dynamics:
+                    obj.clear_dynamics(env_ids=[0])
+                    should_clear_object_dynamics = False
+                close_hold_steps = (
+                    0
+                    if args.demo_profile == TUTORIAL_PLACE_PROFILE
+                    else args.grasp_hold_steps + POST_GRASP_HOLD_STEPS
+                )
+                if close_hold_steps > 0:
+                    sim.update(step=close_hold_steps)
                 if args.object_replay_mode == "attached":
                     update_attached_object_pose(
                         robot, obj, traj[:, idx, :], object_to_eef
@@ -2065,8 +2356,14 @@ def replay_trajectory(
             repeat_id=repeat_id,
             label="after_path",
         )
-        _set_replay_qpos(robot, traj[:, -1, :], mode=args.replay_control)
-        sim.update(step=args.hold_steps + POST_REPLAY_HOLD_STEPS)
+        final_hold_steps = effective_final_hold_steps(args)
+        if args.demo_profile == TUTORIAL_PLACE_PROFILE:
+            for _ in range(final_hold_steps):
+                _set_replay_qpos(robot, traj[:, -1, :], mode=replay_control)
+                sim.update(step=2)
+        else:
+            _set_replay_qpos(robot, traj[:, -1, :], mode=replay_control)
+            sim.update(step=final_hold_steps)
         actual_min_hand_opening = min(
             actual_min_hand_opening,
             current_hand_opening(robot, hand_opening_spec),
@@ -2262,8 +2559,16 @@ def build_trial_row(
         "script": SCRIPT_NAME,
         "planner": planner,
         "mode": mode,
+        "demo_profile": args.demo_profile,
         "object": args.object,
-        "object_scale": list(resolve_object_body_scale(args.object, args.object_scale)),
+        "support_surface": args.support_surface,
+        "object_scale": list(
+            resolve_object_body_scale(
+                args.object,
+                args.object_scale,
+                args.demo_profile,
+            )
+        ),
         "gripper_closing_axis_index": FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
         "tcp_z": float(args.tcp_z),
         "tcp_yaw": float(args.tcp_yaw),
@@ -2281,7 +2586,7 @@ def build_trial_row(
         "planning_time_sec": float(planning_time_sec),
         "replay_time_sec": float(replay.replay_time_sec) if replay else 0.0,
         "object_replay_mode": replay.object_replay_mode if replay else None,
-        "replay_control": args.replay_control if replay else None,
+        "replay_control": effective_replay_control(args) if replay else None,
         "cpu_delta_mb": float(mem_delta["cpu_mb"]),
         "gpu_delta_mb": float(mem_delta["gpu_mb"]),
         "peak_gpu_mb": float(peak_gpu_mb),
@@ -2354,7 +2659,14 @@ def run_one_trial(
     args: argparse.Namespace,
 ) -> dict[str, object]:
     """Run one independent benchmark trial."""
-    object_start_pose = reset_object_pose(obj, args.object, args.object_scale)
+    object_start_pose = reset_object_pose(
+        obj,
+        args.object,
+        args.object_scale,
+        args.support_surface,
+        args.demo_profile,
+        args.object_xy,
+    )
     hand_open, _ = get_hand_open_close_qpos(robot)
     set_robot_start_qpos(robot, hand_open)
     try:
@@ -2378,12 +2690,20 @@ def run_one_trial(
             repeat_id=repeat_id,
             warmup=warmup,
             reason=reason,
+            support_surface=args.support_surface,
+            demo_profile=args.demo_profile,
         )
-    sim.update(step=max(args.hold_steps, 1))
+    pre_plan_hold_steps = effective_pre_plan_hold_steps(args)
+    if pre_plan_hold_steps > 0:
+        sim.update(step=pre_plan_hold_steps)
 
     preflight = build_grasp_preflight(obj, robot, franka_urdf_path=franka_urdf_path)
-    object_target_pose = make_object_target_pose(obj)
-    rest_pose = make_rest_pose(robot)
+    if args.demo_profile == TUTORIAL_PLACE_PROFILE:
+        target_pose = make_tutorial_place_eef_pose(robot.device)
+        rest_pose = make_tutorial_retract_eef_pose(target_pose)
+    else:
+        target_pose = make_object_target_pose(obj)
+        rest_pose = make_rest_pose(robot)
     motion_gen = build_motion_generator(robot, planner, checkpoint_path, args)
     engine = build_engine(
         robot,
@@ -2397,15 +2717,30 @@ def run_one_trial(
     outcome, planning_time_sec, mem_delta, peak_gpu_mb = timed_plan_sequence(
         engine,
         semantics,
-        object_target_pose,
+        target_pose,
         rest_pose,
-    )
+        demo_profile=args.demo_profile,
+    )
+    object_target_pose = target_pose
+    if (
+        args.demo_profile == TUTORIAL_PLACE_PROFILE
+        and outcome.held_object is not None
+        and outcome.place_eef_pose is not None
+    ):
+        object_target_pose = compute_attached_object_pose(
+            outcome.place_eef_pose,
+            outcome.held_object.object_to_eef,
+        )
 
     replay = None
     can_replay = (
         mode == "physical"
         and outcome.success
-        and (not preflight.failed or args.object_replay_mode == "attached")
+        and (
+            not preflight.failed
+            or args.object_replay_mode == "attached"
+            or args.demo_profile == TUTORIAL_PLACE_PROFILE
+        )
     )
     if can_replay:
         hand_opening_spec = load_franka_hand_opening_spec(franka_urdf_path)
@@ -2488,6 +2823,8 @@ def make_skipped_rows(
     *,
     object_name: str,
     reason: str,
+    support_surface: str | None = None,
+    demo_profile: str | None = None,
 ) -> list[dict[str, object]]:
     """Build measured rows for a gracefully skipped live-simulation benchmark."""
     rows: list[dict[str, object]] = []
@@ -2498,7 +2835,9 @@ def make_skipped_rows(
                     "script": SCRIPT_NAME,
                     "planner": planner,
                     "mode": mode,
+                    "demo_profile": demo_profile,
                     "object": object_name,
+                    "support_surface": support_surface,
                     "object_scale": None,
                     "gripper_closing_axis_index": FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
                     "tcp_z": None,
@@ -2558,13 +2897,17 @@ def make_failed_trial_row(
     repeat_id: int,
     warmup: bool,
     reason: str,
+    support_surface: str | None = None,
+    demo_profile: str | None = None,
 ) -> dict[str, object]:
     """Build one failed row when setup cannot reach the planner call."""
     return {
         "script": SCRIPT_NAME,
         "planner": planner,
         "mode": mode,
+        "demo_profile": demo_profile,
         "object": object_name,
+        "support_surface": support_surface,
         "object_scale": None,
         "gripper_closing_axis_index": FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
         "tcp_z": None,
@@ -2783,6 +3126,8 @@ def make_metric_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
             {
                 "planner": summary.planner,
                 "mode": summary.mode,
+                "demo_profile": _single_status(summary.rows, "demo_profile"),
+                "support_surface": _single_status(summary.rows, "support_surface"),
                 "object_replay_mode": _single_status(
                     summary.rows, "object_replay_mode"
                 ),
@@ -2922,6 +3267,163 @@ def write_markdown_report(
     return path
 
 
+def selected_run_kind(args: argparse.Namespace) -> str:
+    """Resolve the effective run kind from CLI args."""
+    if args.run_kind != "auto":
+        return args.run_kind
+    if args.open_window:
+        return "demo"
+    return "benchmark"
+
+
+def run_demo(args: argparse.Namespace) -> dict[str, object]:
+    """Run one tutorial-style PickUp -> Place sequence without benchmark repeats."""
+    validate_args(args)
+    torch.manual_seed(args.seed)
+    planner = expand_planner_selection(args.planner)[0]
+    checkpoint_path = resolve_checkpoint(args, [planner])
+
+    sim = make_sim(args)
+    robot, franka_urdf_path = create_franka(
+        sim,
+        tcp_z=args.tcp_z,
+        tcp_yaw=args.tcp_yaw,
+    )
+    if args.support_surface == "table":
+        create_support_table(sim)
+        create_visual_support_table(sim)
+    obj = create_object(
+        sim,
+        args.object,
+        args.object_scale,
+        args.support_surface,
+        args.demo_profile,
+        args.object_xy,
+    )
+
+    if not args.headless:
+        sim.open_window()
+
+    hand_open, _ = get_hand_open_close_qpos(robot)
+    initialize_pre_pick_robot_pose(
+        robot,
+        obj,
+        hand_open,
+        pre_pick_z=args.pre_pick_z,
+    )
+
+    preflight = build_grasp_preflight(obj, robot, franka_urdf_path=franka_urdf_path)
+    motion_gen = build_motion_generator(robot, planner, checkpoint_path, args)
+    engine = build_engine(
+        robot,
+        motion_gen,
+        args,
+        preflight,
+        franka_urdf_path=franka_urdf_path,
+    )
+    semantics = create_object_semantics(obj, args, preflight)
+
+    if args.demo_profile == TUTORIAL_PLACE_PROFILE:
+        target_pose = make_tutorial_place_eef_pose(robot.device)
+        rest_pose = make_tutorial_retract_eef_pose(target_pose)
+    else:
+        target_pose = make_object_target_pose(obj)
+        rest_pose = make_rest_pose(robot)
+
+    if not args.headless:
+        draw_axis_marker(sim, "franka_pick_place_target_axis", target_pose)
+    pause_for_tutorial_inspection(args)
+    inspect_viewer_before_trials(sim, args)
+
+    logger.log_info(
+        f"{SCRIPT_NAME}: running demo planner={planner} "
+        f"support_surface={args.support_surface} replay_mode={args.demo_object_replay_mode}"
+    )
+    outcome, planning_time_sec, mem_delta, peak_gpu_mb = timed_plan_sequence(
+        engine,
+        semantics,
+        target_pose,
+        rest_pose,
+        demo_profile=args.demo_profile,
+    )
+    if not outcome.success:
+        logger.log_warning(f"{SCRIPT_NAME}: demo planning failed.")
+        return build_trial_row(
+            planner=planner,
+            mode="demo",
+            repeat_id=0,
+            warmup=False,
+            outcome=outcome,
+            planning_time_sec=planning_time_sec,
+            mem_delta=mem_delta,
+            peak_gpu_mb=peak_gpu_mb,
+            robot=robot,
+            preflight=preflight,
+            replay=None,
+            args=args,
+        )
+
+    if not args.auto_play and sys.stdin.isatty():
+        input("Press Enter to replay the Franka PickUp -> Place demo...")
+
+    object_target_pose = target_pose
+    if (
+        args.demo_profile == TUTORIAL_PLACE_PROFILE
+        and outcome.held_object is not None
+        and outcome.place_eef_pose is not None
+    ):
+        object_target_pose = compute_attached_object_pose(
+            outcome.place_eef_pose,
+            outcome.held_object.object_to_eef,
+        )
+
+    replay_args = argparse.Namespace(**vars(args))
+    replay_args.object_replay_mode = args.demo_object_replay_mode
+    recording_started = start_auto_play_recording(
+        sim,
+        args,
+        video_prefix=f"{SCRIPT_NAME}_demo",
+    )
+    try:
+        replay = replay_trajectory(
+            sim,
+            robot,
+            obj,
+            outcome.trajectory,
+            object_target_pose,
+            replay_args,
+            held_object=outcome.held_object,
+            planner=planner,
+            mode="demo",
+            repeat_id=0,
+            hand_opening_spec=load_franka_hand_opening_spec(franka_urdf_path),
+        )
+    finally:
+        stop_auto_play_recording(sim, recording_started)
+
+    row = build_trial_row(
+        planner=planner,
+        mode="demo",
+        repeat_id=0,
+        warmup=False,
+        outcome=outcome,
+        planning_time_sec=planning_time_sec,
+        mem_delta=mem_delta,
+        peak_gpu_mb=peak_gpu_mb,
+        robot=robot,
+        preflight=preflight,
+        replay=replay,
+        args=replay_args,
+    )
+    logger.log_info(
+        f"{SCRIPT_NAME}: demo planner_success={row['planner_success']} "
+        f"replay_success={row['replay_success']} preflight={preflight.status}"
+    )
+    if not args.auto_play and sys.stdin.isatty():
+        input("Press Enter to exit the simulation...")
+    return row
+
+
 def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
     """Run the selected planner benchmark trials."""
     validate_args(args)
@@ -2946,6 +3448,8 @@ def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
             modes,
             object_name=args.object,
             reason=reason,
+            support_surface=args.support_surface,
+            demo_profile=args.demo_profile,
         )
         for row in rows:
             write_raw_jsonl(args.save_raw_jsonl, row)
@@ -2961,6 +3465,8 @@ def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
             modes,
             object_name=args.object,
             reason=skip_reason,
+            support_surface=args.support_surface,
+            demo_profile=args.demo_profile,
         )
         for row in rows:
             write_raw_jsonl(args.save_raw_jsonl, row)
@@ -2976,11 +3482,21 @@ def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
         tcp_z=args.tcp_z,
         tcp_yaw=args.tcp_yaw,
     )
-    create_support_table(sim)
-    create_visual_support_table(sim)
-    obj = create_object(sim, args.object, args.object_scale)
+    if args.support_surface == "table":
+        create_support_table(sim)
+        create_visual_support_table(sim)
+    obj = create_object(
+        sim,
+        args.object,
+        args.object_scale,
+        args.support_surface,
+        args.demo_profile,
+        args.object_xy,
+    )
     if not args.headless:
         sim.open_window()
+    pause_for_tutorial_inspection(args)
+    inspect_viewer_before_trials(sim, args)
 
     for planner in planners:
         for mode in modes:
@@ -3010,8 +3526,12 @@ def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
 
 
 def run_all_benchmarks() -> None:
-    """Parse CLI args and run the Franka pick-place NMG benchmark."""
-    run_benchmark(parse_args())
+    """Parse CLI args and run the selected Franka pick-place path."""
+    args = parse_args()
+    if selected_run_kind(args) == "demo":
+        run_demo(args)
+        return
+    run_benchmark(args)
 
 
 if __name__ == "__main__":
diff --git a/scripts/benchmark/robotics/nmg/franka_planner.py b/scripts/benchmark/robotics/nmg/franka_planner.py
index 49015527..28c8576b 100644
--- a/scripts/benchmark/robotics/nmg/franka_planner.py
+++ b/scripts/benchmark/robotics/nmg/franka_planner.py
@@ -17,9 +17,10 @@
 """Benchmark Franka end-effector waypoint planning backends.
 
 This benchmark isolates planner quality from grasp annotation and physics. It
-generates reachable Franka TCP waypoint poses from known joint configurations,
-then compares IK+TOPPRA interpolation, NeuralPlanner, and NeuralPlanner with a
-final IK refinement.
+supports a demo-matched waypoint source that mirrors
+examples/sim/planners/neural_planner.py, and a broader reachable-FK source that
+samples target poses from known Franka joint configurations. The downstream
+Franka pick-place benchmark remains the third integration layer.
 Run: python -m scripts.benchmark.robotics.nmg.franka_planner
 """
 
@@ -76,6 +77,18 @@
 DEFAULT_NMG_POS_THRESHOLD = 0.05
 DEFAULT_NMG_ROT_THRESHOLD = 0.3
 PLANNER_NAMES = ("ik_toppra", "neural", "neural_refine")
+TRIAL_SOURCE_NAMES = ("demo_offsets", "fk_bank")
+DEFAULT_TRIAL_SOURCE = "fk_bank"
+DEMO_WAYPOINT_OFFSETS = (
+    (0.10, 0.00, 0.00),
+    (0.10, 0.10, 0.00),
+    (0.00, 0.10, -0.08),
+    (-0.10, 0.10, -0.08),
+    (-0.10, 0.00, 0.00),
+    (0.00, -0.10, 0.00),
+    (0.10, -0.10, -0.06),
+    (0.00, 0.00, -0.12),
+)
 
 
 @dataclass(frozen=True)
@@ -83,6 +96,7 @@ class PlannerTrial:
     """A reachable waypoint planning trial."""
 
     trial_id: int
+    trial_source: str
     start_qpos: torch.Tensor
     target_qpos: torch.Tensor
     waypoints: list[torch.Tensor]
@@ -118,6 +132,16 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         default=None,
         help="Local Franka NMG checkpoint. Required for neural planners unless downloadable.",
     )
+    parser.add_argument(
+        "--trial_source",
+        choices=[*TRIAL_SOURCE_NAMES, "all"],
+        default=DEFAULT_TRIAL_SOURCE,
+        help=(
+            "Waypoint source. 'demo_offsets' mirrors examples/sim/planners/"
+            "neural_planner.py, 'fk_bank' uses broader FK-generated targets, "
+            "and 'all' runs both planner layers."
+        ),
+    )
     parser.add_argument(
         "--num_trials",
         type=int,
@@ -206,6 +230,13 @@ def expand_planner_selection(planner: str) -> list[str]:
     return [planner]
 
 
+def expand_trial_source_selection(trial_source: str) -> list[str]:
+    """Expand trial-source aliases into concrete source names."""
+    if trial_source == "all":
+        return list(TRIAL_SOURCE_NAMES)
+    return [trial_source]
+
+
 def simulation_requires_cuda(args: argparse.Namespace) -> bool:
     """Return whether the selected simulation settings require CUDA."""
     if str(args.device).startswith("cuda"):
@@ -331,7 +362,46 @@ def fk_pose(robot: Robot, qpos: torch.Tensor) -> torch.Tensor:
     return robot.compute_fk(qpos=qpos.unsqueeze(0), name=ARM_NAME, to_matrix=True)[0]
 
 
-def make_trials(
+def make_demo_offset_waypoints(
+    start_pose: torch.Tensor, num_waypoints: int
+) -> list[torch.Tensor]:
+    """Create demo-matched compact TCP waypoints around the start pose."""
+    offsets = torch.tensor(
+        DEMO_WAYPOINT_OFFSETS,
+        dtype=start_pose.dtype,
+        device=start_pose.device,
+    )
+    count = max(1, min(int(num_waypoints), offsets.shape[0]))
+    waypoints = start_pose.unsqueeze(0).repeat(count, 1, 1)
+    waypoints[:, :3, 3] += offsets[:count]
+    return [waypoint for waypoint in waypoints]
+
+
+def make_demo_offset_trials(
+    robot: Robot,
+    *,
+    total_trials: int,
+    num_waypoints: int,
+) -> list[PlannerTrial]:
+    """Build demo-matched trials from the NeuralPlanner example path."""
+    start_qpos = torch.tensor(
+        FRANKA_START_QPOS, dtype=torch.float32, device=robot.device
+    )
+    start_pose = fk_pose(robot, start_qpos)
+    waypoints = make_demo_offset_waypoints(start_pose, num_waypoints)
+    return [
+        PlannerTrial(
+            trial_id=trial_id,
+            trial_source="demo_offsets",
+            start_qpos=start_qpos.clone(),
+            target_qpos=torch.empty(0, 7, dtype=torch.float32, device=robot.device),
+            waypoints=[waypoint.clone() for waypoint in waypoints],
+        )
+        for trial_id in range(total_trials)
+    ]
+
+
+def make_fk_bank_trials(
     robot: Robot,
     *,
     total_trials: int,
@@ -355,6 +425,7 @@ def make_trials(
         trials.append(
             PlannerTrial(
                 trial_id=trial_id,
+                trial_source="fk_bank",
                 start_qpos=start_qpos.clone(),
                 target_qpos=target_qpos.clone(),
                 waypoints=waypoints,
@@ -363,6 +434,36 @@ def make_trials(
     return trials
 
 
+def make_trials(
+    robot: Robot,
+    *,
+    trial_sources: list[str],
+    total_trials: int,
+    num_waypoints: int,
+    seed: int,
+) -> list[PlannerTrial]:
+    """Build planner trials for the selected benchmark layers."""
+    trials: list[PlannerTrial] = []
+    for source in trial_sources:
+        if source == "demo_offsets":
+            source_trials = make_demo_offset_trials(
+                robot,
+                total_trials=total_trials,
+                num_waypoints=num_waypoints,
+            )
+        elif source == "fk_bank":
+            source_trials = make_fk_bank_trials(
+                robot,
+                total_trials=total_trials,
+                num_waypoints=num_waypoints,
+                seed=seed,
+            )
+        else:
+            raise ValueError(f"Unsupported trial source: {source}")
+        trials.extend(source_trials)
+    return trials
+
+
 def _all_success(success: bool | torch.Tensor) -> bool:
     """Return true when all success flags are true."""
     if isinstance(success, torch.Tensor):
@@ -539,15 +640,24 @@ def pose_error(
     return pos_error, rot_error
 
 
+def trajectory_fk_poses(robot: Robot, positions: torch.Tensor) -> list[torch.Tensor]:
+    """Compute FK poses for trajectory samples one row at a time."""
+    return [fk_pose(robot, qpos) for qpos in positions]
+
+
 def trajectory_quality(
     robot: Robot, positions: torch.Tensor | None, trial: PlannerTrial
 ) -> dict[str, object]:
-    """Compute final pose and joint path quality metrics."""
+    """Compute final pose, waypoint, and joint path quality metrics."""
     if positions is None or positions.numel() == 0:
         return {
             "trajectory_steps": 0,
             "final_tcp_pos_error": None,
             "final_tcp_rot_error": None,
+            "mean_waypoint_pos_error": None,
+            "max_waypoint_pos_error": None,
+            "mean_waypoint_rot_error": None,
+            "max_waypoint_rot_error": None,
             "joint_path_length": 0.0,
             "max_joint_step": 0.0,
             "mean_target_qpos_error": None,
@@ -555,6 +665,14 @@ def trajectory_quality(
         }
     final_pose = fk_pose(robot, positions[-1])
     final_pos, final_rot = pose_error(final_pose, trial.waypoints[-1])
+    trajectory_poses = trajectory_fk_poses(robot, positions)
+    waypoint_pos_errors = []
+    waypoint_rot_errors = []
+    for waypoint in trial.waypoints:
+        pose_errors = [pose_error(pose, waypoint) for pose in trajectory_poses]
+        best_pos, best_rot = min(pose_errors, key=lambda item: item[0])
+        waypoint_pos_errors.append(best_pos)
+        waypoint_rot_errors.append(best_rot)
     deltas = torch.diff(positions, dim=0)
     step_norms = (
         torch.linalg.norm(deltas, dim=-1)
@@ -565,17 +683,48 @@ def trajectory_quality(
     for target_qpos in trial.target_qpos:
         dists = torch.linalg.norm(positions - target_qpos.unsqueeze(0), dim=-1)
         target_errors.append(float(torch.min(dists).item()))
+    mean_target_qpos_error = (
+        sum(target_errors) / len(target_errors) if target_errors else None
+    )
     return {
         "trajectory_steps": int(positions.shape[0]),
         "final_tcp_pos_error": final_pos,
         "final_tcp_rot_error": final_rot,
+        "mean_waypoint_pos_error": sum(waypoint_pos_errors) / len(waypoint_pos_errors),
+        "max_waypoint_pos_error": max(waypoint_pos_errors),
+        "mean_waypoint_rot_error": sum(waypoint_rot_errors) / len(waypoint_rot_errors),
+        "max_waypoint_rot_error": max(waypoint_rot_errors),
         "joint_path_length": float(step_norms.sum().item()),
         "max_joint_step": float(step_norms.max().item()),
-        "mean_target_qpos_error": sum(target_errors) / len(target_errors),
+        "mean_target_qpos_error": mean_target_qpos_error,
         "final_qpos": [float(v) for v in positions[-1].detach().cpu().tolist()],
     }
 
 
+def all_waypoints_within_threshold(
+    robot: Robot,
+    positions: torch.Tensor | None,
+    trial: PlannerTrial,
+    *,
+    pos_threshold: float,
+    rot_threshold: float,
+) -> bool:
+    """Return whether every target waypoint is hit by some trajectory sample."""
+    if positions is None or positions.numel() == 0:
+        return False
+    trajectory_poses = trajectory_fk_poses(robot, positions)
+    for waypoint in trial.waypoints:
+        waypoint_hit = False
+        for pose in trajectory_poses:
+            pos_error, rot_error = pose_error(pose, waypoint)
+            if pos_error <= pos_threshold and rot_error <= rot_threshold:
+                waypoint_hit = True
+                break
+        if not waypoint_hit:
+            return False
+    return True
+
+
 def build_trial_row(
     *,
     planner: str,
@@ -596,6 +745,16 @@ def build_trial_row(
         and float(final_pos) <= args.pos_success_threshold
         and float(final_rot) <= args.rot_success_threshold
     )
+    all_waypoint_strict_success = (
+        outcome.action_success
+        and all_waypoints_within_threshold(
+            robot,
+            outcome.positions,
+            trial,
+            pos_threshold=args.pos_success_threshold,
+            rot_threshold=args.rot_success_threshold,
+        )
+    )
     nmg_threshold_success = (
         outcome.action_success
         and final_pos is not None
@@ -603,15 +762,28 @@ def build_trial_row(
         and float(final_pos) <= args.nmg_pos_success_threshold
         and float(final_rot) <= args.nmg_rot_success_threshold
     )
+    all_waypoint_nmg_threshold_success = (
+        outcome.action_success
+        and all_waypoints_within_threshold(
+            robot,
+            outcome.positions,
+            trial,
+            pos_threshold=args.nmg_pos_success_threshold,
+            rot_threshold=args.nmg_rot_success_threshold,
+        )
+    )
     row: dict[str, object] = {
         "script": SCRIPT_NAME,
         "planner": planner,
+        "trial_source": trial.trial_source,
         "trial_id": trial.trial_id,
         "warmup": warmup,
         "num_waypoints": len(trial.waypoints),
         "action_success": outcome.action_success,
         "strict_pose_success": bool(strict_success),
+        "all_waypoint_strict_success": bool(all_waypoint_strict_success),
         "nmg_threshold_success": bool(nmg_threshold_success),
+        "all_waypoint_nmg_threshold_success": bool(all_waypoint_nmg_threshold_success),
         "planning_time_sec": outcome.planning_time_sec,
         "cpu_delta_mb": outcome.cpu_delta_mb,
         "gpu_delta_mb": outcome.gpu_delta_mb,
@@ -705,25 +877,43 @@ def measured_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
     return [row for row in rows if not row.get("warmup", False)]
 
 
-def summarize_by_planner(
+def summarize_by_source_planner(
     rows: list[dict[str, object]],
-) -> list[tuple[str, list[dict[str, object]]]]:
-    """Group measured rows by planner."""
+) -> list[tuple[str, str, list[dict[str, object]]]]:
+    """Group measured rows by trial source and planner."""
     measured = measured_rows(rows)
-    planners = sorted({str(row["planner"]) for row in measured})
+    groups = sorted(
+        {
+            (
+                str(row.get("trial_source", DEFAULT_TRIAL_SOURCE)),
+                str(row["planner"]),
+            )
+            for row in measured
+        }
+    )
     return [
-        (planner, [row for row in measured if row["planner"] == planner])
-        for planner in planners
+        (
+            trial_source,
+            planner,
+            [
+                row
+                for row in measured
+                if row.get("trial_source", DEFAULT_TRIAL_SOURCE) == trial_source
+                and row["planner"] == planner
+            ],
+        )
+        for trial_source, planner in groups
     ]
 
 
 def make_perf_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
     """Build Time & Memory report rows."""
     perf_rows = []
-    for planner, group in summarize_by_planner(rows):
+    for trial_source, planner, group in summarize_by_source_planner(rows):
         times = _numeric_values(group, "planning_time_sec")
         perf_rows.append(
             {
+                "trial_source": trial_source,
                 "planner": planner,
                 "repeat_count": len(group),
                 "cost_time_ms_mean": _fmt_float(
@@ -762,25 +952,60 @@ def make_perf_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
 def make_metric_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
     """Build Success & Other Metrics report rows."""
     metric_rows = []
-    for planner, group in summarize_by_planner(rows):
+    for trial_source, planner, group in summarize_by_source_planner(rows):
         final_pos = _mean(_numeric_values(group, "final_tcp_pos_error"))
         final_rot = _mean(_numeric_values(group, "final_tcp_rot_error"))
+        mean_waypoint_pos = _mean(_numeric_values(group, "mean_waypoint_pos_error"))
+        max_waypoint_pos = _mean(_numeric_values(group, "max_waypoint_pos_error"))
+        mean_waypoint_rot = _mean(_numeric_values(group, "mean_waypoint_rot_error"))
+        max_waypoint_rot = _mean(_numeric_values(group, "max_waypoint_rot_error"))
         metric_rows.append(
             {
+                "trial_source": trial_source,
                 "planner": planner,
                 "action_success_rate": _fmt_rate(_rate(group, "action_success")),
                 "strict_pose_success_rate": _fmt_rate(
                     _rate(group, "strict_pose_success")
                 ),
+                "all_waypoint_strict_success_rate": _fmt_rate(
+                    _rate(group, "all_waypoint_strict_success")
+                ),
                 "nmg_threshold_success_rate": _fmt_rate(
                     _rate(group, "nmg_threshold_success")
                 ),
+                "all_waypoint_nmg_threshold_success_rate": _fmt_rate(
+                    _rate(group, "all_waypoint_nmg_threshold_success")
+                ),
                 "final_tcp_pos_err_mm": _fmt_float(
                     None if final_pos is None else final_pos * 1000.0, 3
                 ),
                 "final_tcp_rot_err_deg": _fmt_float(
                     None if final_rot is None else final_rot * 180.0 / math.pi, 3
                 ),
+                "mean_waypoint_pos_err_mm": _fmt_float(
+                    (None if mean_waypoint_pos is None else mean_waypoint_pos * 1000.0),
+                    3,
+                ),
+                "max_waypoint_pos_err_mm": _fmt_float(
+                    None if max_waypoint_pos is None else max_waypoint_pos * 1000.0,
+                    3,
+                ),
+                "mean_waypoint_rot_err_deg": _fmt_float(
+                    (
+                        None
+                        if mean_waypoint_rot is None
+                        else mean_waypoint_rot * 180.0 / math.pi
+                    ),
+                    3,
+                ),
+                "max_waypoint_rot_err_deg": _fmt_float(
+                    (
+                        None
+                        if max_waypoint_rot is None
+                        else max_waypoint_rot * 180.0 / math.pi
+                    ),
+                    3,
+                ),
                 "joint_path_length": _fmt_float(
                     _mean(_numeric_values(group, "joint_path_length")), 4
                 ),
@@ -796,40 +1021,66 @@ def make_metric_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
 
 
 def make_leaderboard_rows(rows: list[dict[str, object]]) -> list[dict[str, object]]:
-    """Build leaderboard rows sorted by strict success then latency."""
+    """Build leaderboard rows sorted by waypoint strict success then latency."""
     leaderboard = []
-    for planner, group in summarize_by_planner(rows):
+    for trial_source, planner, group in summarize_by_source_planner(rows):
         strict_rate = _rate(group, "strict_pose_success") or 0.0
+        waypoint_strict_rate = _rate(group, "all_waypoint_strict_success") or 0.0
         loose_rate = _rate(group, "nmg_threshold_success") or 0.0
+        waypoint_loose_rate = _rate(group, "all_waypoint_nmg_threshold_success") or 0.0
         avg_time = _mean(_numeric_values(group, "planning_time_sec")) or 0.0
         avg_pos = _mean(_numeric_values(group, "final_tcp_pos_error"))
+        avg_waypoint_pos = _mean(_numeric_values(group, "max_waypoint_pos_error"))
         leaderboard.append(
             {
+                "trial_source": trial_source,
                 "planner": planner,
                 "strict_rate": strict_rate,
+                "waypoint_strict_rate": waypoint_strict_rate,
                 "loose_rate": loose_rate,
+                "waypoint_loose_rate": waypoint_loose_rate,
                 "avg_time_ms": avg_time * 1000.0,
                 "avg_pos_mm": None if avg_pos is None else avg_pos * 1000.0,
+                "avg_waypoint_pos_mm": (
+                    None if avg_waypoint_pos is None else avg_waypoint_pos * 1000.0
+                ),
             }
         )
     leaderboard.sort(
         key=lambda row: (
+            str(row["trial_source"]),
+            -float(row["waypoint_strict_rate"]),
             -float(row["strict_rate"]),
+            -float(row["waypoint_loose_rate"]),
             -float(row["loose_rate"]),
             float(row["avg_time_ms"]),
         )
     )
-    return [
-        {
-            "rank": rank,
-            "planner": row["planner"],
-            "overall_success_rate": _fmt_rate(float(row["strict_rate"])),
-            "nmg_threshold_success_rate": _fmt_rate(float(row["loose_rate"])),
-            "avg_cost_time_ms": _fmt_float(float(row["avg_time_ms"]), 2),
-            "avg_final_tcp_pos_err_mm": _fmt_float(row["avg_pos_mm"], 3),
-        }
-        for rank, row in enumerate(leaderboard, start=1)
-    ]
+    rows = []
+    current_source = None
+    source_rank = 0
+    for row in leaderboard:
+        if row["trial_source"] != current_source:
+            current_source = row["trial_source"]
+            source_rank = 1
+        else:
+            source_rank += 1
+        rows.append(
+            {
+                "rank": source_rank,
+                "trial_source": row["trial_source"],
+                "planner": row["planner"],
+                "overall_success_rate": _fmt_rate(float(row["waypoint_strict_rate"])),
+                "final_pose_success_rate": _fmt_rate(float(row["strict_rate"])),
+                "nmg_threshold_success_rate": _fmt_rate(float(row["loose_rate"])),
+                "avg_cost_time_ms": _fmt_float(float(row["avg_time_ms"]), 2),
+                "avg_final_tcp_pos_err_mm": _fmt_float(row["avg_pos_mm"], 3),
+                "avg_max_waypoint_pos_err_mm": _fmt_float(
+                    row["avg_waypoint_pos_mm"], 3
+                ),
+            }
+        )
+    return rows
 
 
 def _markdown_table(rows: list[dict[str, object]]) -> list[str]:
@@ -884,35 +1135,44 @@ def write_markdown_report(
 def make_skipped_rows(
     planners: list[str],
     *,
+    trial_sources: list[str],
     reason: str,
 ) -> list[dict[str, object]]:
     """Build rows for a gracefully skipped live-simulation benchmark."""
     rows = []
-    for planner in planners:
-        rows.append(
-            {
-                "script": SCRIPT_NAME,
-                "planner": planner,
-                "trial_id": 0,
-                "warmup": False,
-                "num_waypoints": 0,
-                "action_success": False,
-                "strict_pose_success": False,
-                "nmg_threshold_success": False,
-                "planning_time_sec": 0.0,
-                "cpu_delta_mb": 0.0,
-                "gpu_delta_mb": 0.0,
-                "peak_gpu_mb": 0.0,
-                "trajectory_steps": 0,
-                "final_tcp_pos_error": None,
-                "final_tcp_rot_error": None,
-                "joint_path_length": 0.0,
-                "max_joint_step": 0.0,
-                "mean_target_qpos_error": None,
-                "final_qpos": None,
-                "skip_reason": reason,
-            }
-        )
+    for trial_source in trial_sources:
+        for planner in planners:
+            rows.append(
+                {
+                    "script": SCRIPT_NAME,
+                    "planner": planner,
+                    "trial_source": trial_source,
+                    "trial_id": 0,
+                    "warmup": False,
+                    "num_waypoints": 0,
+                    "action_success": False,
+                    "strict_pose_success": False,
+                    "all_waypoint_strict_success": False,
+                    "nmg_threshold_success": False,
+                    "all_waypoint_nmg_threshold_success": False,
+                    "planning_time_sec": 0.0,
+                    "cpu_delta_mb": 0.0,
+                    "gpu_delta_mb": 0.0,
+                    "peak_gpu_mb": 0.0,
+                    "trajectory_steps": 0,
+                    "final_tcp_pos_error": None,
+                    "final_tcp_rot_error": None,
+                    "mean_waypoint_pos_error": None,
+                    "max_waypoint_pos_error": None,
+                    "mean_waypoint_rot_error": None,
+                    "max_waypoint_rot_error": None,
+                    "joint_path_length": 0.0,
+                    "max_joint_step": 0.0,
+                    "mean_target_qpos_error": None,
+                    "final_qpos": None,
+                    "skip_reason": reason,
+                }
+            )
     return rows
 
 
@@ -921,6 +1181,7 @@ def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
     validate_args(args)
     torch.manual_seed(args.seed)
     planners = expand_planner_selection(args.planner)
+    trial_sources = expand_trial_source_selection(args.trial_source)
 
     if args.save_raw_jsonl:
         raw_path = Path(args.save_raw_jsonl)
@@ -929,7 +1190,7 @@ def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
 
     if simulation_requires_cuda(args) and not torch.cuda.is_available():
         reason = "CUDA is unavailable, so the Franka planner benchmark was skipped."
-        rows = make_skipped_rows(planners, reason=reason)
+        rows = make_skipped_rows(planners, trial_sources=trial_sources, reason=reason)
         for row in rows:
             write_raw_jsonl(args.save_raw_jsonl, row)
         write_markdown_report(rows, args.report_path)
@@ -942,6 +1203,7 @@ def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
     total_trials = args.warmup_trials + args.num_trials
     trials = make_trials(
         robot,
+        trial_sources=trial_sources,
         total_trials=total_trials,
         num_waypoints=args.num_waypoints,
         seed=args.seed,
@@ -949,8 +1211,8 @@ def run_benchmark(args: argparse.Namespace) -> list[dict[str, object]]:
     rows: list[dict[str, object]] = []
     for planner in planners:
         motion_gen = build_motion_generator(robot, planner, checkpoint_path, args)
-        for index, trial in enumerate(trials):
-            warmup = index < args.warmup_trials
+        for trial in trials:
+            warmup = trial.trial_id < args.warmup_trials
             robot.set_qpos(trial.start_qpos.unsqueeze(0), name=ARM_NAME, target=False)
             robot.set_qpos(trial.start_qpos.unsqueeze(0), name=ARM_NAME, target=True)
             outcome = run_planner_once(motion_gen, planner, trial, args)
diff --git a/tests/benchmark/robotics/nmg/test_franka_pick_place.py b/tests/benchmark/robotics/nmg/test_franka_pick_place.py
index 39cad0c9..d6d4909e 100644
--- a/tests/benchmark/robotics/nmg/test_franka_pick_place.py
+++ b/tests/benchmark/robotics/nmg/test_franka_pick_place.py
@@ -25,43 +25,63 @@
 
 from embodichain.data import get_data_path
 from scripts.benchmark.robotics.nmg.franka_pick_place import (
+    BENCHMARK_PROFILE,
     ContactStats,
+    DEFAULT_ARENA_SPACE,
+    DEFAULT_DEMO_PROFILE,
     DEFAULT_FRANKA_TCP_YAW,
     DEFAULT_FRANKA_TCP_Z,
     DEFAULT_GRASP_AXIS,
     DEFAULT_GRASP_DEPTH_OFFSET,
     DEFAULT_GRIPPER_CLOSE_MARGIN,
+    DEFAULT_OBJECT_SETTLE_STEPS,
     FRANKA_GRIPPER_CLOSING_AXIS_INDEX,
+    GraspPreflight,
     OBJECT_SUPPORT_MARGIN,
     TABLE_COLLIDER_INIT_POS,
     TABLE_COLLIDER_SIZE,
     TABLE_INIT_POS,
     TABLE_TOP_Z,
+    TUTORIAL_OBJECT_INIT_Z,
+    TUTORIAL_OBJECT_SCALE,
+    TUTORIAL_OBJECT_XY,
+    TUTORIAL_POST_TRAJECTORY_STEPS,
+    TUTORIAL_PLACE_PROFILE,
+    TUTORIAL_PLACE_LIFT_HEIGHT,
+    build_grasp_generator_cfg,
     compute_attached_object_pose,
     create_support_table,
     estimate_gripper_opening_range,
     estimate_object_grasp_width,
     evaluate_grasp_preflight,
+    effective_final_hold_steps,
+    effective_pre_plan_hold_steps,
+    effective_replay_control,
     expand_mode_selection,
     expand_planner_selection,
     franka_hand_opening_from_finger_qpos,
     grasp_axis_alignment_cost,
     horizontal_bbox_axis,
     initialize_pre_pick_robot_pose,
+    inspect_viewer_before_trials,
     load_franka_hand_opening_spec,
     empty_replay_diagnostics,
     make_leaderboard_rows,
     make_failed_trial_row,
     make_skipped_rows,
+    make_tutorial_place_eef_pose,
+    make_tutorial_retract_eef_pose,
     physical_gpu_memory_skip_reason,
     _record_contact_stats,
     object_body_scale,
     object_initial_position,
     object_supported_z,
     parse_args,
+    pause_for_tutorial_inspection,
     rerank_grasp_costs_by_axis,
     resolve_object_body_scale,
     resolve_hand_open_close_qpos,
+    should_pause_for_tutorial_inspection,
     simulation_requires_cuda,
     write_markdown_report,
 )
@@ -70,11 +90,23 @@
 class _FakeSim:
     def __init__(self):
         self.cfg = None
+        self.update_steps = []
 
     def add_rigid_object(self, cfg):
         self.cfg = cfg
         return cfg
 
+    def update(self, step: int):
+        self.update_steps.append(step)
+
+
+class _FakeStdin:
+    def __init__(self, interactive: bool):
+        self.interactive = interactive
+
+    def isatty(self):
+        return self.interactive
+
 
 class _FakeObject:
     def __init__(self):
@@ -232,8 +264,16 @@ def test_cli_defaults_and_selection_expansion():
     assert args.mode == "planner"
     assert args.object == "sugar_box"
     assert args.planner == "all"
+    assert args.demo_profile == DEFAULT_DEMO_PROFILE
+    assert args.demo_profile == TUTORIAL_PLACE_PROFILE
     assert args.headless is True
+    assert args.open_window is False
+    assert args.auto_play is False
     assert args.object_replay_mode == "physics"
+    assert args.support_surface == "ground"
+    assert args.object_xy is None
+    assert args.inspect_seconds == pytest.approx(0.0)
+    assert args.arena_space == pytest.approx(DEFAULT_ARENA_SPACE)
     assert args.object_scale is None
     assert args.grasp_axis == DEFAULT_GRASP_AXIS
     assert args.tcp_z == pytest.approx(DEFAULT_FRANKA_TCP_Z)
@@ -250,6 +290,87 @@ def test_cli_defaults_and_selection_expansion():
     assert expand_mode_selection("both") == ["planner", "physical"]
 
 
+def test_open_window_disables_headless_mode():
+    args = parse_args(["--open_window"])
+
+    assert args.open_window is True
+    assert args.headless is False
+    assert should_pause_for_tutorial_inspection(args) is True
+
+
+def test_auto_play_skips_open_window_tutorial_pause():
+    args = parse_args(["--open_window", "--auto_play"])
+
+    assert args.auto_play is True
+    assert should_pause_for_tutorial_inspection(args) is False
+
+
+def test_inspect_seconds_updates_viewer_before_trials():
+    sim = _FakeSim()
+    args = parse_args(["--inspect_seconds", "0.05"])
+
+    inspect_viewer_before_trials(sim, args)
+
+    assert sim.update_steps == [5]
+
+
+def test_benchmark_profile_does_not_use_tutorial_inspection_pause():
+    args = parse_args(["--open_window", "--demo_profile", "benchmark"])
+
+    assert should_pause_for_tutorial_inspection(args) is False
+
+
+def test_tutorial_profile_replay_matches_tutorial_direct_qpos_loop():
+    args = parse_args([])
+
+    assert effective_pre_plan_hold_steps(args) == 0
+    assert effective_replay_control(args) == "direct"
+    assert effective_final_hold_steps(args) == TUTORIAL_POST_TRAJECTORY_STEPS
+
+
+def test_benchmark_profile_replay_uses_requested_control_and_hold_steps():
+    args = parse_args(
+        [
+            "--demo_profile",
+            "benchmark",
+            "--replay_control",
+            "target",
+            "--hold_steps",
+            "7",
+        ]
+    )
+
+    assert effective_pre_plan_hold_steps(args) == 7
+    assert effective_replay_control(args) == "target"
+    assert effective_final_hold_steps(args) == 87
+
+
+def test_noninteractive_tutorial_inspection_pause_does_not_read_input(monkeypatch):
+    from scripts.benchmark.robotics.nmg import franka_pick_place
+
+    args = parse_args(["--open_window"])
+    monkeypatch.setattr(franka_pick_place.sys, "stdin", _FakeStdin(False))
+    monkeypatch.setattr(
+        "builtins.input",
+        lambda _: pytest.fail("input should not be called for noninteractive stdin"),
+    )
+
+    pause_for_tutorial_inspection(args)
+
+
+def test_interactive_tutorial_inspection_pause_reads_input(monkeypatch):
+    from scripts.benchmark.robotics.nmg import franka_pick_place
+
+    calls = []
+    args = parse_args(["--open_window"])
+    monkeypatch.setattr(franka_pick_place.sys, "stdin", _FakeStdin(True))
+    monkeypatch.setattr("builtins.input", lambda prompt: calls.append(prompt))
+
+    pause_for_tutorial_inspection(args)
+
+    assert calls == ["Inspect the object, then press Enter to plan PickUp -> Place..."]
+
+
 def test_cli_accepts_object_scale_and_short_axis_grasp():
     args = parse_args(
         [
@@ -257,12 +378,16 @@ def test_cli_accepts_object_scale_and_short_axis_grasp():
             "0.9",
             "0.9",
             "0.9",
+            "--object_xy",
+            "-0.35",
+            "0.12",
             "--grasp_axis",
             "short",
         ]
     )
 
     assert args.object_scale == pytest.approx([0.9, 0.9, 0.9])
+    assert args.object_xy == pytest.approx([-0.35, 0.12])
     assert args.grasp_axis == "short"
 
 
@@ -289,8 +414,12 @@ def test_object_supported_z_respects_scale_override():
     assert z == pytest.approx(TABLE_TOP_Z + 0.045 + OBJECT_SUPPORT_MARGIN)
 
 
-def test_sugar_box_scale_uses_preset_value_by_default():
-    assert object_body_scale("sugar_box") == pytest.approx((1.0, 1.0, 1.0))
+def test_sugar_box_scale_matches_tutorial_place_profile_by_default():
+    assert object_body_scale("sugar_box") == pytest.approx(TUTORIAL_OBJECT_SCALE)
+    assert resolve_object_body_scale(
+        "sugar_box",
+        demo_profile=BENCHMARK_PROFILE,
+    ) == pytest.approx((1.0, 1.0, 1.0))
 
 
 def test_support_table_is_static_and_below_interaction_plane():
@@ -302,16 +431,105 @@ def test_support_table_is_static_and_below_interaction_plane():
     assert cfg.uid == "support_table_collider"
     assert cfg.shape.size == list(TABLE_COLLIDER_SIZE)
     assert cfg.init_pos == TABLE_COLLIDER_INIT_POS
-    assert TABLE_TOP_Z == pytest.approx(0.0)
+    assert TABLE_TOP_Z == pytest.approx(0.1)
+
 
+def test_cli_accepts_legacy_table_support_surface():
+    args = parse_args(["--support_surface", "table"])
 
-def test_sugar_box_initial_position_places_bottom_above_support_plane():
+    assert args.support_surface == "table"
+
+
+def test_ground_support_surface_matches_tutorial_grasp_filtering():
+    preflight = GraspPreflight(
+        status="ok",
+        object_grasp_width_m=0.09,
+        gripper_min_opening_m=0.07,
+        gripper_max_opening_m=0.16,
+        reason="ok",
+    )
+
+    ground_cfg = build_grasp_generator_cfg(parse_args([]), preflight)
+    table_cfg = build_grasp_generator_cfg(
+        parse_args(["--support_surface", "table"]),
+        preflight,
+    )
+
+    assert ground_cfg.is_filter_ground_collision is False
+    assert table_cfg.is_filter_ground_collision is True
+    assert ground_cfg.antipodal_sampler_cfg.min_length == pytest.approx(0.003)
+
+
+def test_benchmark_profile_grasp_filter_uses_franka_min_opening():
+    preflight = GraspPreflight(
+        status="ok",
+        object_grasp_width_m=0.09,
+        gripper_min_opening_m=0.07,
+        gripper_max_opening_m=0.16,
+        reason="ok",
+    )
+
+    cfg = build_grasp_generator_cfg(
+        parse_args(["--demo_profile", "benchmark"]),
+        preflight,
+    )
+
+    assert cfg.antipodal_sampler_cfg.min_length == pytest.approx(0.07)
+
+
+def test_ground_support_surface_matches_pickup_tutorial_settle_defaults():
+    assert parse_args([]).support_surface == "ground"
+    assert parse_args([]).arena_space == pytest.approx(2.5)
+    assert DEFAULT_OBJECT_SETTLE_STEPS == 10
+
+
+def test_sugar_box_initial_position_matches_pickup_tutorial_settle_pose():
     x, y, z = object_initial_position("sugar_box")
 
+    assert (x, y) == pytest.approx(TUTORIAL_OBJECT_XY)
+    assert z == pytest.approx(TUTORIAL_OBJECT_INIT_Z)
+    assert object_supported_z("sugar_box") > TABLE_TOP_Z
+
+
+def test_object_xy_override_changes_tutorial_initial_position():
+    x, y, z = object_initial_position("sugar_box", object_xy=(-0.35, 0.12))
+
+    assert (x, y) == pytest.approx((-0.35, 0.12))
+    assert z == pytest.approx(TUTORIAL_OBJECT_INIT_Z)
+
+
+def test_benchmark_profile_sugar_box_initial_position_uses_support_plane_z():
+    x, y, z = object_initial_position(
+        "sugar_box",
+        demo_profile=BENCHMARK_PROFILE,
+    )
+
     assert (x, y) == pytest.approx((0.31, 0.0))
-    assert z > TABLE_TOP_Z
+    assert z == pytest.approx(
+        object_supported_z("sugar_box", demo_profile=BENCHMARK_PROFILE),
+        abs=1e-6,
+    )
+
+
+def test_object_xy_override_changes_benchmark_initial_position():
+    x, y, z = object_initial_position(
+        "sugar_box",
+        demo_profile=BENCHMARK_PROFILE,
+        object_xy=(-0.35, 0.12),
+    )
+
+    assert (x, y) == pytest.approx((-0.35, 0.12))
+    assert z == pytest.approx(
+        object_supported_z("sugar_box", demo_profile=BENCHMARK_PROFILE),
+        abs=1e-6,
+    )
+
+
+def test_legacy_table_support_surface_uses_tutorial_xy_and_support_plane_z():
+    _, _, z = object_initial_position("sugar_box", support_surface="table")
+
     assert z == pytest.approx(object_supported_z("sugar_box"), abs=1e-6)
-    assert z - TABLE_TOP_Z > OBJECT_SUPPORT_MARGIN
+    assert z > TABLE_TOP_Z
 
 
 def test_resolve_hand_open_close_qpos_clamps_to_joint_limits():
@@ -434,6 +652,18 @@ def test_compute_attached_object_pose_inverts_object_to_eef_transform():
     assert object_pose[:3, 3].tolist() == pytest.approx([0.3, -0.2, 0.28])
 
 
+def test_tutorial_place_retract_pose_matches_place_lift_height():
+    place_pose = make_tutorial_place_eef_pose(torch.device("cpu"))
+
+    retract_pose = make_tutorial_retract_eef_pose(place_pose)
+
+    assert torch.allclose(retract_pose[:3, :3], place_pose[:3, :3])
+    assert retract_pose[:3, 3].tolist() == pytest.approx([-0.20, 0.28, 0.24])
+    assert retract_pose[2, 3] - place_pose[2, 3] == pytest.approx(
+        TUTORIAL_PLACE_LIFT_HEIGHT
+    )
+
+
 def test_contact_stats_aggregation_tracks_impulse_and_distance_extrema():
     max_count, max_impulse, max_total_impulse, min_distance = _record_contact_stats(
         ContactStats(count=2, max_impulse=0.3, total_impulse=0.5, min_distance=-0.001),
diff --git a/tests/benchmark/robotics/nmg/test_franka_planner.py b/tests/benchmark/robotics/nmg/test_franka_planner.py
index d1efc9d5..6503ab19 100644
--- a/tests/benchmark/robotics/nmg/test_franka_planner.py
+++ b/tests/benchmark/robotics/nmg/test_franka_planner.py
@@ -18,13 +18,17 @@
 
 from __future__ import annotations
 
+import pytest
 import torch
 
 from scripts.benchmark.robotics.nmg.franka_planner import (
+    DEMO_WAYPOINT_OFFSETS,
     PlannerOutcome,
     PlannerTrial,
     build_trial_row,
     expand_planner_selection,
+    expand_trial_source_selection,
+    make_demo_offset_waypoints,
     make_leaderboard_rows,
     make_metric_rows,
     make_perf_rows,
@@ -42,6 +46,7 @@ class _FakeRobot:
 def _trial() -> PlannerTrial:
     return PlannerTrial(
         trial_id=0,
+        trial_source="fk_bank",
         start_qpos=torch.zeros(7),
         target_qpos=torch.zeros(1, 7),
         waypoints=[torch.eye(4)],
@@ -53,12 +58,15 @@ def _rows() -> list[dict[str, object]]:
         {
             "script": "franka_planner_nmg",
             "planner": "ik_toppra",
+            "trial_source": "fk_bank",
             "trial_id": 0,
             "warmup": True,
             "num_waypoints": 1,
             "action_success": False,
             "strict_pose_success": False,
+            "all_waypoint_strict_success": False,
             "nmg_threshold_success": False,
+            "all_waypoint_nmg_threshold_success": False,
             "planning_time_sec": 10.0,
             "cpu_delta_mb": 0.0,
             "gpu_delta_mb": 0.0,
@@ -66,6 +74,10 @@ def _rows() -> list[dict[str, object]]:
             "trajectory_steps": 0,
             "final_tcp_pos_error": None,
             "final_tcp_rot_error": None,
+            "mean_waypoint_pos_error": None,
+            "max_waypoint_pos_error": None,
+            "mean_waypoint_rot_error": None,
+            "max_waypoint_rot_error": None,
             "joint_path_length": 0.0,
             "max_joint_step": 0.0,
             "mean_target_qpos_error": None,
@@ -74,12 +86,15 @@ def _rows() -> list[dict[str, object]]:
         {
             "script": "franka_planner_nmg",
             "planner": "ik_toppra",
+            "trial_source": "fk_bank",
             "trial_id": 1,
             "warmup": False,
             "num_waypoints": 1,
             "action_success": True,
             "strict_pose_success": True,
+            "all_waypoint_strict_success": True,
             "nmg_threshold_success": True,
+            "all_waypoint_nmg_threshold_success": True,
             "planning_time_sec": 0.8,
             "cpu_delta_mb": 1.0,
             "gpu_delta_mb": 0.0,
@@ -87,6 +102,10 @@ def _rows() -> list[dict[str, object]]:
             "trajectory_steps": 120,
             "final_tcp_pos_error": 0.0005,
             "final_tcp_rot_error": 0.01,
+            "mean_waypoint_pos_error": 0.0004,
+            "max_waypoint_pos_error": 0.0005,
+            "mean_waypoint_rot_error": 0.01,
+            "max_waypoint_rot_error": 0.01,
             "joint_path_length": 0.5,
             "max_joint_step": 0.1,
             "mean_target_qpos_error": 0.0,
@@ -95,12 +114,15 @@ def _rows() -> list[dict[str, object]]:
         {
             "script": "franka_planner_nmg",
             "planner": "neural",
+            "trial_source": "fk_bank",
             "trial_id": 1,
             "warmup": False,
             "num_waypoints": 1,
             "action_success": True,
             "strict_pose_success": False,
+            "all_waypoint_strict_success": False,
             "nmg_threshold_success": True,
+            "all_waypoint_nmg_threshold_success": True,
             "planning_time_sec": 0.2,
             "cpu_delta_mb": 0.5,
             "gpu_delta_mb": 2.0,
@@ -108,6 +130,10 @@ def _rows() -> list[dict[str, object]]:
             "trajectory_steps": 120,
             "final_tcp_pos_error": 0.02,
             "final_tcp_rot_error": 0.2,
+            "mean_waypoint_pos_error": 0.02,
+            "max_waypoint_pos_error": 0.02,
+            "mean_waypoint_rot_error": 0.2,
+            "max_waypoint_rot_error": 0.2,
             "joint_path_length": 0.7,
             "max_joint_step": 0.2,
             "mean_target_qpos_error": 0.1,
@@ -120,6 +146,7 @@ def test_cli_defaults_and_selection_expansion():
     args = parse_args([])
 
     assert args.planner == "all"
+    assert args.trial_source == "fk_bank"
     assert args.num_trials == 8
     assert expand_planner_selection("all") == [
         "ik_toppra",
@@ -127,16 +154,23 @@ def test_cli_defaults_and_selection_expansion():
         "neural_refine",
     ]
     assert expand_planner_selection("neural") == ["neural"]
+    assert expand_trial_source_selection("all") == ["demo_offsets", "fk_bank"]
+    assert expand_trial_source_selection("demo_offsets") == ["demo_offsets"]
 
 
 def test_simulation_cuda_preflight():
     args = parse_args(["--device", "cpu", "--renderer", "fast-rt"])
 
     assert simulation_requires_cuda(args)
-    skipped = make_skipped_rows(["ik_toppra", "neural"], reason="cuda missing")
+    skipped = make_skipped_rows(
+        ["ik_toppra", "neural"],
+        trial_sources=["demo_offsets", "fk_bank"],
+        reason="cuda missing",
+    )
 
-    assert len(skipped) == 2
+    assert len(skipped) == 4
     assert {row["planner"] for row in skipped} == {"ik_toppra", "neural"}
+    assert {row["trial_source"] for row in skipped} == {"demo_offsets", "fk_bank"}
     assert all(row["skip_reason"] == "cuda missing" for row in skipped)
 
 
@@ -148,7 +182,30 @@ def test_report_builders_ignore_warmup_rows():
     assert {row["planner"] for row in perf_rows} == {"ik_toppra", "neural"}
     assert all(row["repeat_count"] == 1 for row in perf_rows)
     assert metric_rows[0]["strict_pose_success_rate"] == "100.0%"
+    assert metric_rows[0]["all_waypoint_strict_success_rate"] == "100.0%"
     assert leaderboard_rows[0]["planner"] == "ik_toppra"
+    assert leaderboard_rows[0]["trial_source"] == "fk_bank"
+
+
+def test_report_builders_keep_trial_sources_separate():
+    rows = _rows()
+    demo_row = dict(rows[1])
+    demo_row["trial_source"] = "demo_offsets"
+    demo_row["planning_time_sec"] = 0.1
+    rows.append(demo_row)
+
+    perf_rows = make_perf_rows(rows)
+    leaderboard_rows = make_leaderboard_rows(rows)
+
+    assert {(row["trial_source"], row["planner"]) for row in perf_rows} == {
+        ("demo_offsets", "ik_toppra"),
+        ("fk_bank", "ik_toppra"),
+        ("fk_bank", "neural"),
+    }
+    assert any(
+        row["trial_source"] == "demo_offsets" and row["planner"] == "ik_toppra"
+        for row in leaderboard_rows
+    )
 
 
 def test_write_markdown_report_has_exactly_three_tables(tmp_path):
@@ -161,7 +218,7 @@ def test_write_markdown_report_has_exactly_three_tables(tmp_path):
     assert text.count("## Time & Memory") == 1
     assert text.count("## Success & Other Metrics") == 1
     assert text.count("## Leaderboard") == 1
-    assert text.count("| planner |") == 3
+    assert text.count("| trial_source | planner |") == 3
 
 
 def test_build_trial_row_marks_failed_empty_trajectory():
@@ -185,5 +242,16 @@ def test_build_trial_row_marks_failed_empty_trajectory():
 
     assert row["action_success"] is False
     assert row["strict_pose_success"] is False
+    assert row["all_waypoint_strict_success"] is False
     assert row["nmg_threshold_success"] is False
+    assert row["all_waypoint_nmg_threshold_success"] is False
     assert row["trajectory_steps"] == 0
+
+
+def test_demo_offset_waypoints_match_example_offsets():
+    start_pose = torch.eye(4)
+    waypoints = make_demo_offset_waypoints(start_pose, num_waypoints=99)
+
+    assert len(waypoints) == len(DEMO_WAYPOINT_OFFSETS)
+    for waypoint, offset in zip(waypoints, DEMO_WAYPOINT_OFFSETS):
+        assert waypoint[:3, 3].tolist() == pytest.approx(offset)