From eda68b2860c48941ce0d81a2ea27fea5217d3f04 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sat, 13 Jun 2026 18:09:41 +0800 Subject: [PATCH 01/33] feat: add action agent pipeline baseline --- .../gen_sim/action_agent_pipeline/__init__.py | 21 + .../action_agent_pipeline/agents/__init__.py | 24 + .../agents/agent_base.py | 94 + .../agents/compile_agent.py | 126 + .../action_agent_pipeline/agents/llm.py | 75 + .../agents/task_agent.py | 72 + .../action_agent_pipeline/cli/__init__.py | 19 + .../cli/generate_ur5_basket_config.py | 246 ++ .../cli/pipeline_records.py | 383 ++ .../action_agent_pipeline/cli/run_agent.py | 156 + .../cli/run_agent_pipeline.py | 1334 +++++++ .../env_adapters/__init__.py | 19 + .../env_adapters/tableware/__init__.py | 19 + .../env_adapters/tableware/atomic_actions.py | 54 + .../env_adapters/tableware/base_agent_env.py | 345 ++ .../env_adapters/tableware/success.py | 237 ++ .../generation/__init__.py | 21 + .../generation/coacd_cache.py | 171 + .../generation/prompt_builders.py | 992 ++++++ .../generation/ur5_basket_config.py | 3127 +++++++++++++++++ .../gym_project_api/image2tabletop_client.py | 297 ++ .../prompt2geometry/.gitignore | 4 + .../prompt2geometry/__init__.py | 57 + .../prompt2geometry/config.json | 21 + .../gym_project_api/prompt2geometry/config.py | 109 + .../prompt2geometry/dimensions.py | 128 + .../prompt2geometry/llm_client.py | 134 + .../prompt2geometry/mesh_scaling.py | 225 ++ .../prompt2geometry/pipeline.py | 589 ++++ .../gym_project_api/prompt2geometry/run.py | 135 + .../prompt2geometry/sam3_client.py | 266 ++ .../prompt2geometry/sam3d_client.py | 324 ++ .../prompt2geometry/schemas.py | 46 + .../prompt2geometry/segmentation_outputs.py | 245 ++ .../prompt2geometry/zimage_client.py | 115 + .../action_agent_pipeline/prompts/__init__.py | 21 + .../prompts/atom_actions.txt | 59 + .../prompts/basic_background.txt | 20 + .../prompts/task_prompt.py | 122 + 
.../action_agent_pipeline/runtime/__init__.py | 21 + .../runtime/atom_action_utils.py | 170 + .../runtime/atom_actions.py | 1046 ++++++ .../runtime/graph_compiler.py | 262 ++ .../runtime/task_graph.py | 134 + .../action_agent_pipeline/utils/__init__.py | 18 + .../action_agent_pipeline/utils/llm_config.py | 159 + .../action_agent_pipeline/utils/llm_json.py | 73 + .../action_agent_pipeline/utils/llm_usage.py | 410 +++ .../action_agent_pipeline/utils/mllm.py | 115 + embodichain/lab/sim/objects/articulation.py | 9 +- .../test_backend_atomic_runtime.py | 432 +++ .../test_demo3_semantic_grasp_integration.py | 252 ++ .../test_graph_spec_backend_atomic.py | 112 + .../action_agent_pipeline/test_llm_usage.py | 161 + .../test_ur5_basket_config_generation.py | 1059 ++++++ 55 files changed, 14884 insertions(+), 1 deletion(-) create mode 100644 embodichain/gen_sim/action_agent_pipeline/__init__.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/agents/__init__.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/agents/agent_base.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/agents/compile_agent.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/agents/llm.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/agents/task_agent.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/__init__.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/generate_ur5_basket_config.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/pipeline_records.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/env_adapters/__init__.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/__init__.py create mode 100644 
embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/atomic_actions.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/__init__.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/image2tabletop_client.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/.gitignore create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/__init__.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.json create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/dimensions.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/llm_client.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/mesh_scaling.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/pipeline.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/run.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3_client.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3d_client.py create mode 100644 
embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/schemas.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/segmentation_outputs.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/zimage_client.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/prompts/__init__.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt create mode 100644 embodichain/gen_sim/action_agent_pipeline/prompts/basic_background.txt create mode 100644 embodichain/gen_sim/action_agent_pipeline/prompts/task_prompt.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/runtime/__init__.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/utils/__init__.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/utils/llm_config.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/utils/llm_json.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/utils/llm_usage.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/utils/mllm.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_llm_usage.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py diff --git a/embodichain/gen_sim/action_agent_pipeline/__init__.py 
b/embodichain/gen_sim/action_agent_pipeline/__init__.py new file mode 100644 index 00000000..0517d273 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/__init__.py @@ -0,0 +1,21 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +"""Action-agent graph compilation and atomic-action runtime.""" + +__all__: list[str] = [] diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/__init__.py b/embodichain/gen_sim/action_agent_pipeline/agents/__init__.py new file mode 100644 index 00000000..4f45d84b --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/agents/__init__.py @@ -0,0 +1,24 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +__all__ = [ + "agent_base", + "compile_agent", + "llm", + "task_agent", +] diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/agent_base.py b/embodichain/gen_sim/action_agent_pipeline/agents/agent_base.py new file mode 100644 index 00000000..fc967f65 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/agents/agent_base.py @@ -0,0 +1,94 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from abc import ABCMeta +import os + +from embodichain.utils.utility import load_txt + + +def _resolve_prompt_path(file_name: str, config_dir: str | None = None) -> str: + # If absolute path, use directly + if os.path.isabs(file_name): + if os.path.exists(file_name): + return file_name + raise FileNotFoundError(f"Prompt file not found: {file_name}") + + # Try config directory first (for task-specific prompts) + if config_dir: + config_path = os.path.join(config_dir, file_name) + if os.path.exists(config_path): + return config_path + + # Try action_agent_pipeline/prompts directory for reusable prompts. + agents_prompts_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "prompts" + ) + agents_path = os.path.join(agents_prompts_dir, file_name) + if os.path.exists(agents_path): + return agents_path + + # If still not found, raise error with search paths + searched_paths = [] + if config_dir: + searched_paths.append(f" - {config_dir}/{file_name}") + searched_paths.append(f" - {agents_prompts_dir}/{file_name}") + + raise FileNotFoundError( + f"Prompt file not found: {file_name}\n" + f"Searched in:\n" + "\n".join(searched_paths) + ) + + +class AgentBase(metaclass=ABCMeta): + def __init__(self, **kwargs) -> None: + + assert ( + "prompt_kwargs" in kwargs.keys() + ), "Key prompt_kwargs must exist in config." 
+ + for key, value in kwargs.items(): + setattr(self, key, value) + + # Get config directory if provided + config_dir = kwargs.get("config_dir", None) + if config_dir: + config_dir = os.path.dirname(os.path.abspath(config_dir)) + + # Preload and store prompt contents inside self.prompt_kwargs + for key, val in self.prompt_kwargs.items(): + if val["type"] == "text": + file_path = _resolve_prompt_path(val["name"], config_dir) + val["content"] = load_txt(file_path) + else: + raise ValueError( + f"Now only support `text` type but {val['type']} is given." + ) + + def generate(self, *args, **kwargs): + pass + + def act(self, *args, **kwargs): + pass + + def get_composed_observations(self, **kwargs): + ret = {} + for key, val in self.prompt_kwargs.items(): + ret[key] = val["content"] + ret.update(kwargs) + return ret diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/compile_agent.py b/embodichain/gen_sim/action_agent_pipeline/agents/compile_agent.py new file mode 100644 index 00000000..f1ac95a3 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/agents/compile_agent.py @@ -0,0 +1,126 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import hashlib +import json +from pathlib import Path +from typing import Any + +from embodichain.gen_sim.action_agent_pipeline.agents.agent_base import AgentBase +from embodichain.gen_sim.action_agent_pipeline.utils.llm_json import ( + extract_json_object, + normalize_json_content, +) +from embodichain.data import database_agent_prompt_dir + +__all__ = ["CompileAgent"] + +COMPILED_GRAPH_SCHEMA_VERSION = "nominal_graph_v1" + + +class CompileAgent(AgentBase): + """Compile and execute nominal atomic-action graph specs.""" + + query_prefix = "# " + query_suffix = "." + prompt_kwargs: dict[str, dict[str, Any]] + + def __init__(self, llm, **kwargs) -> None: + for key, value in kwargs.items(): + setattr(self, key, value) + self.prompt_kwargs = kwargs.get("prompt_kwargs", {}) + self.llm = llm + + def generate(self, **kwargs): + if kwargs.get("recovery_enabled") or kwargs.get("recovery_spec"): + raise NotImplementedError("Recovery graph generation has been removed.") + + log_dir = kwargs.get( + "log_dir", Path(database_agent_prompt_dir) / self.task_name + ) + file_path = Path(log_dir) / "agent_compiled_graph.json" + task_graph = extract_json_object(kwargs["task_graph"]) + task_graph_hash = _stable_json_hash(task_graph) + + if not kwargs.get("regenerate", False) and file_path.exists(): + existing_bundle = extract_json_object(file_path.read_text(encoding="utf-8")) + metadata = existing_bundle.get("metadata", {}) + if ( + metadata.get("schema_version") == COMPILED_GRAPH_SCHEMA_VERSION + and metadata.get("task_graph_hash") == task_graph_hash + ): + print(f"Compiled graph artifact already exists at {file_path}.") + return file_path, kwargs, None + + content = normalize_json_content( + { + "task_graph": task_graph, + "metadata": { + "schema_version": COMPILED_GRAPH_SCHEMA_VERSION, + "task_graph_hash": task_graph_hash, + }, + } + ) + + 
file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.write_text(content, encoding="utf-8") + print(f"Compiled graph artifact saved to {file_path}") + return file_path, kwargs, content + + def act(self, graph_file_path, **kwargs): + graph_file_path = Path(graph_file_path) + if graph_file_path.suffix != ".json": + raise ValueError("CompileAgent executes compiled graph JSON artifacts.") + + from embodichain.gen_sim.action_agent_pipeline.runtime.graph_compiler import ( + compile_agent_graph_from_file, + ) + + runtime_kwargs = _runtime_kwargs(kwargs, getattr(self, "prompt_kwargs", {})) + graph = compile_agent_graph_from_file(graph_file_path) + result = graph.run(**runtime_kwargs) + print("Compiled agent graph executed successfully.") + return result + + def get_composed_observations(self, **kwargs): + return dict(kwargs) + + +def _stable_json_hash(content: dict[str, Any]) -> str: + payload = json.dumps( + content, ensure_ascii=False, sort_keys=True, separators=(",", ":") + ) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def _runtime_kwargs( + kwargs: dict[str, Any], + prompt_kwargs: dict[str, dict[str, Any]], +) -> dict[str, Any]: + prompt_only_keys = set(prompt_kwargs) + prompt_only_keys.update( + { + "task_graph", + "recovery_spec", + "recovery_graph", + "recovery_enabled", + "observations", + "regenerate", + } + ) + return {key: value for key, value in kwargs.items() if key not in prompt_only_keys} diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/llm.py b/embodichain/gen_sim/action_agent_pipeline/agents/llm.py new file mode 100644 index 00000000..5ae52b88 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/agents/llm.py @@ -0,0 +1,75 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from embodichain.gen_sim.action_agent_pipeline.utils.mllm import create_chat_openai + +__all__ = ["create_llm", "task_llm", "compile_llm"] + + +# ------------------------------------------------------------------------------ +# LLM factory +# ------------------------------------------------------------------------------ + + +def create_llm(*, temperature=0.0, model=None, usage_stage=None): + return create_chat_openai( + temperature=temperature, + model=model, + usage_stage=usage_stage, + ) + + +# ------------------------------------------------------------------------------ +# LLM instances +# ------------------------------------------------------------------------------ + + +# Initialize LLM instances, but handle errors gracefully for documentation builds +def _create_llm_safe(*, temperature=0.0, model=None, usage_stage=None): + try: + return create_llm( + temperature=temperature, + model=model, + usage_stage=usage_stage, + ) + except Exception: + return None + + +task_llm = _create_llm_safe( + temperature=0.0, + usage_stage="action_agent.task_graph", +) +compile_llm = _create_llm_safe( + temperature=0.0, + usage_stage="action_agent.compile_canonicalize", +) + +if __name__ == "__main__": + + def call_llm(prompt, temperature=0.0, model=None): + llm = create_llm( + temperature=temperature, + model=model, + usage_stage="action_agent.debug", + ) + response = llm.invoke(prompt) + return response.content + + response = call_llm(prompt="Which 
model you are?", temperature=0.0) + print(response) diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/task_agent.py b/embodichain/gen_sim/action_agent_pipeline/agents/task_agent.py new file mode 100644 index 00000000..6efbdc32 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/agents/task_agent.py @@ -0,0 +1,72 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from embodichain.gen_sim.action_agent_pipeline.agents.agent_base import AgentBase +from embodichain.gen_sim.action_agent_pipeline.utils.llm_json import ( + normalize_json_content, +) +from embodichain.gen_sim.action_agent_pipeline.prompts import TaskPrompt +from embodichain.data import database_agent_prompt_dir +from embodichain.utils.utility import load_txt + +__all__ = ["TaskAgent"] + + +class TaskAgent(AgentBase): + """Generate the nominal atomic-action task graph.""" + + prompt_name: str + prompt_kwargs: dict[str, dict[str, Any]] + + def __init__(self, llm, **kwargs) -> None: + super().__init__(**kwargs) + if llm is None: + raise ValueError( + "LLM is None. 
Configure the shared MLLM entry point " + "`embodichain.gen_sim.action_agent_pipeline.utils.mllm` with " + "OPENAI_API_KEY, optional " + "OPENAI_MODEL/OPENAI_BASE_URL, or the gen-sim LLM config." + ) + self.llm = llm + + def generate(self, **kwargs) -> str: + log_dir = kwargs.get( + "log_dir", Path(database_agent_prompt_dir) / self.task_name + ) + file_path = Path(log_dir) / "agent_task_graph.json" + + if not kwargs.get("regenerate", False) and file_path.exists(): + print(f"Task graph already exists at {file_path}.") + return load_txt(file_path) + + prompt = getattr(TaskPrompt, self.prompt_name)(**kwargs) + response = self.llm.invoke(prompt) + print(f"\033[92m\nTask agent output:\n{response.content}\n\033[0m") + + content = normalize_json_content(response.content) + file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.write_text(content, encoding="utf-8") + print(f"Generated task graph saved to {file_path}") + + return content + + def act(self, *args, **kwargs): + return super().act(*args, **kwargs) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/__init__.py b/embodichain/gen_sim/action_agent_pipeline/cli/__init__.py new file mode 100644 index 00000000..015c4151 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/__init__.py @@ -0,0 +1,19 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/generate_ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/cli/generate_ur5_basket_config.py new file mode 100644 index 00000000..3a754e11 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/generate_ur5_basket_config.py @@ -0,0 +1,246 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import argparse +from pathlib import Path + +from embodichain.gen_sim.action_agent_pipeline.generation.ur5_basket_config import ( + TargetReplacementSpec, + generate_ur5_basket_config_from_project, +) + +__all__ = ["cli"] + + +def cli() -> None: + parser = argparse.ArgumentParser( + description=( + "Generate a Dual-UR5 basket-placement action-agent config from an " + "exported tabletop gym project." + ) + ) + parser.add_argument( + "--gym_project", + type=str, + required=True, + help=( + "Path to a project root, formatted tabletop scene folder, or " + "gym_config.json/gym_config_merged.json. Directory inputs prefer " + "gym_config_merged.json when available." 
+ ), + ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Destination directory for generated agent configs.", + ) + parser.add_argument( + "--task_name", + type=str, + default="UR5BreadBasket", + help="Task name passed to run_agent.", + ) + parser.add_argument( + "--task_description", + type=str, + default=None, + help=( + "Simple natural-language relative-placement task. Providing this " + "uses the LLM to generate a constrained config-level prompt/spec." + ), + ) + parser.add_argument( + "--task_file", + type=str, + default=None, + help="Optional text file containing --task_description.", + ) + parser.add_argument( + "--use_llm_roles", + action="store_true", + default=False, + help=( + "Use the shared LLM only to refine object role mapping. The task " + "template and prompts remain deterministic." + ), + ) + parser.add_argument( + "--llm_model", + type=str, + default=None, + help="Optional LLM model override for --use_llm_roles.", + ) + parser.add_argument( + "--target_body_scale", + type=float, + default=0.7, + help=( + "Uniform body_scale for generated target objects. Basket-like " + "containers keep their source body_scale." + ), + ) + parser.add_argument( + "--target_replacement1", + "--target-replacement1", + nargs=2, + metavar=("SOURCE_UID", "PROMPT"), + default=None, + help=( + "Generate /mesh_assets/new1 from PROMPT and use it " + "to replace SOURCE_UID in the generated config." + ), + ) + parser.add_argument( + "--target_replacement2", + "--target-replacement2", + nargs=2, + metavar=("SOURCE_UID", "PROMPT"), + default=None, + help=( + "Generate /mesh_assets/new2 from PROMPT and use it " + "to replace SOURCE_UID in the generated config." + ), + ) + parser.add_argument( + "--sync_replacement_names", + "--sync-replacement-names", + action="store_true", + default=False, + help=( + "Also update replacement target runtime UIDs and generated prompts " + "from the replacement prompts." 
+ ), + ) + parser.add_argument( + "--reuse_target_replacements", + "--reuse-target-replacements", + dest="reuse_target_replacements", + action=argparse.BooleanOptionalAction, + default=True, + help=( + "Reuse existing prompt-generated replacement GLBs when the prompt " + "and expected output name match. Defaults to true." + ), + ) + parser.add_argument( + "--prewarm_coacd_cache", + "--prewarm-coacd-cache", + dest="prewarm_coacd_cache", + action=argparse.BooleanOptionalAction, + default=True, + help=( + "Precompute environment CoACD cache files during config generation. " + "Defaults to true." + ), + ) + parser.add_argument( + "--overwrite", + action="store_true", + default=False, + help="Overwrite generated files if they already exist.", + ) + parser.add_argument( + "--max_episodes", + type=int, + default=1, + help="max_episodes value written to fast_gym_config.json.", + ) + parser.add_argument( + "--max_episode_steps", + type=int, + default=1000, + help="max_episode_steps value written to fast_gym_config.json.", + ) + args = parser.parse_args() + task_description = _resolve_task_description(args) + target_replacements = _resolve_target_replacements(args) + + paths = generate_ur5_basket_config_from_project( + gym_project=args.gym_project, + output_dir=args.output_dir, + task_name=args.task_name, + task_description=task_description, + use_llm_roles=args.use_llm_roles, + llm_model=args.llm_model, + target_body_scale=args.target_body_scale, + target_replacements=target_replacements, + sync_replacement_names=args.sync_replacement_names, + reuse_target_replacements=args.reuse_target_replacements, + prewarm_coacd_cache=args.prewarm_coacd_cache, + overwrite=args.overwrite, + max_episodes=args.max_episodes, + max_episode_steps=args.max_episode_steps, + ) + + print(f"Generated gym config: {paths.gym_config}") + print(f"Generated agent config: {paths.agent_config}") + print(f"Generated task prompt: {paths.task_prompt}") + print(f"Generated basic background: 
{paths.basic_background}") + print(f"Generated atom actions: {paths.atom_actions}") + if paths.summary: + print("Generation summary:") + for key, value in paths.summary.items(): + print(f" {key}: {value}") + print( + "Run with:\n" + "python -m embodichain.gen_sim.action_agent_pipeline.cli.run_agent " + f"--task_name {args.task_name} " + f'--gym_config "{paths.gym_config}" ' + f'--agent_config "{paths.agent_config}" ' + "--regenerate" + ) + + +def _resolve_task_description(args: argparse.Namespace) -> str | None: + if args.task_description and args.task_file: + raise ValueError("Use either --task_description or --task_file, not both.") + if args.task_file: + return Path(args.task_file).expanduser().read_text(encoding="utf-8").strip() + if args.task_description: + return args.task_description.strip() + return None + + +def _resolve_target_replacements( + args: argparse.Namespace, +) -> list[TargetReplacementSpec]: + replacements = [] + if args.target_replacement1: + source_uid, prompt = args.target_replacement1 + replacements.append( + TargetReplacementSpec( + source_uid=source_uid, + prompt=prompt, + output_dir_name="new1", + ) + ) + if args.target_replacement2: + source_uid, prompt = args.target_replacement2 + replacements.append( + TargetReplacementSpec( + source_uid=source_uid, + prompt=prompt, + output_dir_name="new2", + ) + ) + return replacements + + +if __name__ == "__main__": + cli() diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_records.py b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_records.py new file mode 100644 index 00000000..79d5a189 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_records.py @@ -0,0 +1,383 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def pipeline_history_path(args: argparse.Namespace) -> Path:
    """Return the absolute pipeline history path configured in *args*."""
    configured = Path(args.pipeline_history_path)
    return configured.expanduser().resolve()


def read_pipeline_history(
    history_path: Path,
    *,
    schema_version: int,
) -> dict[str, Any]:
    """Load the pipeline history document.

    Args:
        history_path: Location of the history JSON file.
        schema_version: Version reported when the file is missing or does not
            record one itself.

    Returns:
        A dict with ``schema_version`` and ``runs`` keys. A missing file
        yields an empty ``runs`` list.

    Raises:
        ValueError: If the file is not a JSON object or lacks a ``runs`` list.
    """
    if history_path.exists():
        payload = json.loads(history_path.read_text(encoding="utf-8"))
        if not isinstance(payload, dict):
            raise ValueError(f"Pipeline history must be a JSON object: {history_path}")
        recorded_runs = payload.get("runs")
        if not isinstance(recorded_runs, list):
            raise ValueError(
                f"Pipeline history must contain a runs list: {history_path}"
            )
        version = payload.get("schema_version", schema_version)
    else:
        recorded_runs = []
        version = schema_version
    return {"schema_version": version, "runs": recorded_runs}
def find_history_entry_by_index(
    runs: list[Any], history_index: int
) -> dict[str, Any] | None:
    """Return the first dict entry in *runs* whose index is *history_index*.

    Non-dict items are ignored. Returns ``None`` when nothing matches.
    """
    for entry in runs:
        if isinstance(entry, dict) and history_entry_index(entry) == history_index:
            return entry
    return None


def history_entry_index(entry: dict[str, Any]) -> int:
    """Return the integer ``index`` of a history entry, or 0 if unusable.

    A missing, non-numeric, NaN or infinite index is treated as 0 so that a
    hand-edited history file cannot break index lookups.
    """
    try:
        return int(entry.get("index", 0))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: json.loads accepts ``Infinity``/``1e999`` and yields
        # float("inf"), which int() rejects with OverflowError, not ValueError.
        return 0


def history_entry_has_source(entry: dict[str, Any]) -> bool:
    """Tell whether *entry* records a source gym config or project directory."""
    return bool(entry.get("source_gym_config") or entry.get("source_gym_project_dir"))


def path_from_history_entry(entry: dict[str, Any], *, repo_root: Path) -> Path:
    """Resolve the source gym path recorded in a history entry.

    ``source_gym_config`` takes precedence over ``source_gym_project_dir``.

    Args:
        entry: History entry to read.
        repo_root: Base directory for relative recorded paths.

    Returns:
        The absolute, existing source path.

    Raises:
        ValueError: If the entry records no source path.
        FileNotFoundError: If the recorded path does not exist.
    """
    source = entry.get("source_gym_config") or entry.get("source_gym_project_dir")
    if not source:
        raise ValueError(
            f"Pipeline history entry #{entry.get('index')} has no source gym path."
        )
    path = resolve_record_path(str(source), repo_root=repo_root)
    if not path.exists():
        raise FileNotFoundError(f"Pipeline history source path does not exist: {path}")
    return path


def resolve_record_path(value: str | Path, *, repo_root: Path) -> Path:
    """Turn a recorded path into an absolute one.

    Absolute values are resolved as-is; relative values are interpreted
    under *repo_root*.
    """
    path = Path(value).expanduser()
    if path.is_absolute():
        return path.resolve()
    return (repo_root / path).resolve()


def write_pipeline_manifests(
    *,
    args: argparse.Namespace,
    resolution: Any,
    generated_paths: Any,
    target_replacements: Sequence[object],
    repo_root: Path,
    schema_version: int,
    manifest_filename: str,
) -> dict[str, Any]:
    """Record this pipeline run in the history file and a per-run manifest.

    Args:
        args: Parsed pipeline CLI namespace.
        resolution: Object describing the resolved source project; passed
            through to ``build_pipeline_record``.
        generated_paths: Object exposing the generated config paths; its
            ``output_dir`` receives the manifest.
        target_replacements: Replacement specs applied during generation.
        repo_root: Base directory used to relativize recorded paths.
        schema_version: Schema version written to the history and record.
        manifest_filename: File name of the manifest inside ``output_dir``.

    Returns:
        The stored record, including the ``index`` assigned by the history.
    """
    history_path = pipeline_history_path(args)
    record = build_pipeline_record(
        args=args,
        resolution=resolution,
        generated_paths=generated_paths,
        history_path=history_path,
        target_replacements=target_replacements,
        repo_root=repo_root,
        schema_version=schema_version,
    )
    record = append_pipeline_history(
        history_path,
        record,
        schema_version=schema_version,
    )

    manifest_path = Path(generated_paths.output_dir) / manifest_filename
    # The history entry is already persisted at this point, so make sure the
    # manifest write cannot fail merely because its directory is missing.
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(
        json.dumps(record, ensure_ascii=False, indent=4) + "\n",
        encoding="utf-8",
    )
    print(f"Updated pipeline history: {history_path}", flush=True)
    print(f"Wrote pipeline manifest: {manifest_path}", flush=True)
    return record
def build_pipeline_record(
    *,
    args: argparse.Namespace,
    resolution: Any,
    generated_paths: Any,
    history_path: Path,
    target_replacements: Sequence[object],
    repo_root: Path,
    schema_version: int,
) -> dict[str, Any]:
    """Assemble the JSON-serializable record describing one pipeline run.

    Args:
        args: Parsed pipeline CLI namespace.
        resolution: Object exposing ``path`` and ``mode`` for the resolved
            source project.
        generated_paths: Object exposing ``output_dir``, ``summary`` and the
            generated config file paths.
        history_path: Location of the pipeline history file.
        target_replacements: Replacement specs applied during generation.
        repo_root: Base directory used to relativize recorded paths.
        schema_version: Schema version stored in the record.

    Returns:
        The record, without the ``index`` that the history assigns later.
    """

    def recorded(path: Any) -> str:
        return _record_path(Path(path), repo_root)

    input_path = Path(resolution.path)
    source_gym_config = resolve_source_gym_config(
        input_path,
        gym_config_preference=("gym_config_merged.json", "gym_config.json"),
    )
    digest = _file_sha256(source_gym_config)

    # Key insertion order is preserved in the written JSON, so entries are
    # added in the same sequence the manifest is expected to list them.
    record: dict[str, Any] = {
        "schema_version": schema_version,
        "created_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "task_name": args.task_name,
        "source_mode": resolution.mode,
        "source_id": f"gym_config_sha256:{digest}",
        "source_gym_config_sha256": digest,
        "path_base": "repo_root",
        "source_gym_project_dir": recorded(source_gym_config.parent),
        "source_gym_config": recorded(source_gym_config),
        "input_path": recorded(input_path),
        "config_output_dir": recorded(generated_paths.output_dir),
    }
    for artifact in (
        "gym_config",
        "agent_config",
        "task_prompt",
        "basic_background",
        "atom_actions",
    ):
        record[f"generated_{artifact}"] = recorded(getattr(generated_paths, artifact))
    record["pipeline_history_path"] = recorded(history_path)
    record["target_body_scale"] = args.target_body_scale
    record["target_replacements"] = _target_replacement_records(
        args,
        target_replacements,
    )
    record["sync_replacement_names"] = args.sync_replacement_names
    record["reuse_target_replacements"] = args.reuse_target_replacements
    record["prewarm_coacd_cache"] = args.prewarm_coacd_cache
    record["overwrite_config"] = args.overwrite_config
    record["regenerate"] = args.regenerate
    record["skip_run_agent"] = args.skip_run_agent
    record["generation_summary"] = generated_paths.summary

    if args.task_description:
        record["task_description"] = args.task_description
    record.update(_source_request_record(args, resolution, repo_root=repo_root))
    return record
def resolve_source_gym_config(
    input_path: Path,
    *,
    gym_config_preference: Sequence[str],
) -> Path:
    """Locate the gym config file that *input_path* refers to.

    Args:
        input_path: Either a gym config file or a directory containing one,
            directly or in a nested project directory.
        gym_config_preference: Accepted config file names, most preferred
            first.

    Returns:
        The absolute path of the selected gym config file.

    Raises:
        ValueError: If *input_path* is a file with an unexpected name, or if
            config files are found in more than one nested directory.
        FileNotFoundError: If no config file is found.
    """
    input_path = input_path.expanduser().resolve()
    if input_path.is_file():
        if input_path.name not in gym_config_preference:
            expected = ", ".join(gym_config_preference)
            raise ValueError(f"Expected one of {expected}, got: {input_path}")
        return input_path

    for filename in gym_config_preference:
        path = input_path / filename
        if path.is_file():
            return path.resolve()

    unique_matches = sorted(
        {
            path.resolve()
            for filename in gym_config_preference
            for path in input_path.rglob(filename)
        }
    )
    if not unique_matches:
        expected = " or ".join(gym_config_preference)
        raise FileNotFoundError(f"{expected} not found under: {input_path}")
    if len(unique_matches) == 1:
        return unique_matches[0]

    # Several matches inside one nested project directory are not ambiguous:
    # apply the same preference order used for files directly in input_path.
    if len({path.parent for path in unique_matches}) == 1:
        by_name = {path.name: path for path in unique_matches}
        for filename in gym_config_preference:
            if filename in by_name:
                return by_name[filename]

    match_text = ", ".join(path.as_posix() for path in unique_matches)
    raise ValueError(
        f"Multiple gym config files found under {input_path}: {match_text}"
    )


def append_pipeline_history(
    history_path: Path,
    record: dict[str, Any],
    *,
    schema_version: int,
) -> dict[str, Any]:
    """Append *record* to the history file and return the stored copy.

    The stored copy gets an ``index`` one higher than the largest index
    already present (starting at 1). The caller's dict is left untouched.

    Args:
        history_path: Location of the history JSON file; parent directories
            are created as needed.
        record: Run record to append.
        schema_version: Schema version written to the history document.

    Returns:
        A copy of *record* including its assigned ``index``.
    """
    history = read_pipeline_history(history_path, schema_version=schema_version)
    runs = history["runs"]
    next_index = (
        max(
            (history_entry_index(entry) for entry in runs if isinstance(entry, dict)),
            default=0,
        )
        + 1
    )
    record = dict(record)
    record["index"] = next_index

    runs.append(record)
    history["schema_version"] = schema_version
    history_path.parent.mkdir(parents=True, exist_ok=True)
    history_path.write_text(
        json.dumps(history, ensure_ascii=False, indent=4) + "\n",
        encoding="utf-8",
    )
    return record
encoding="utf-8", + ) + return record + + +def _source_request_record( + args: argparse.Namespace, + resolution: Any, + *, + repo_root: Path, +) -> dict[str, Any]: + record: dict[str, Any] = {} + if args.image_name: + record["image_name"] = args.image_name + if args.image: + record["image"] = _record_path(Path(args.image).expanduser(), repo_root) + if args.use_image2scene: + record.update( + { + "server": args.server, + "background": args.background, + "image2scene_root": _record_path( + Path(args.image2scene_root).expanduser(), + repo_root, + ), + "image2scene_download_dir": str(args.image2scene_download_dir), + "image2scene_output_root": str(args.image2scene_output_root), + "image2scene_gen_config": str(args.image2scene_gen_config), + "image2scene_llm_config": str(args.image2scene_llm_config), + } + ) + if args.image2scene_extract_dir is not None: + record["image2scene_extract_dir"] = str(args.image2scene_extract_dir) + if args.image2scene_merged_output is not None: + record["image2scene_merged_output"] = str(args.image2scene_merged_output) + elif resolution.mode == "image2tabletop": + record.update( + { + "server": args.server, + "gym_project_root": _record_path( + Path(args.gym_project_root).expanduser(), + repo_root, + ), + "overwrite_gym_project": args.overwrite_gym_project, + } + ) + elif resolution.mode == "existing_gym_project": + record["gym_project"] = _record_path( + Path(args.gym_project).expanduser(), + repo_root, + ) + elif resolution.mode == "history" and resolution.base_history is not None: + base_source_path = path_from_history_entry( + resolution.base_history, + repo_root=repo_root, + ) + record.update( + { + "base_task_name": args.base_task_name, + "base_history_index": resolution.base_history.get("index"), + "base_history_task_name": resolution.base_history.get("task_name"), + "base_history_source_id": resolution.base_history.get("source_id"), + "base_history_source_gym_config": _record_path( + base_source_path, + repo_root, + ), + } + ) + 
return record + + +def _target_replacement_records( + args: argparse.Namespace, + target_replacements: Sequence[object], +) -> list[dict[str, str]]: + requested_by_output_dir = { + output_dir_name: replacement[0] + for output_dir_name, replacement in ( + ("new1", args.target_replacement1), + ("new2", args.target_replacement2), + ) + if replacement and len(replacement) == 2 + } + records = [] + for replacement in target_replacements: + output_dir_name = str(getattr(replacement, "output_dir_name")) + source_uid = str(getattr(replacement, "source_uid")) + record = { + "source_uid": source_uid, + "prompt": str(getattr(replacement, "prompt")), + "output_dir_name": output_dir_name, + } + requested_source_uid = requested_by_output_dir.get(output_dir_name) + if requested_source_uid and requested_source_uid != source_uid: + record["requested_source_uid"] = requested_source_uid + records.append(record) + return records + + +def _record_path(path: Path, repo_root: Path) -> str: + path = path.expanduser() + if not path.is_absolute(): + path = (Path.cwd() / path).resolve() + else: + path = path.resolve() + repo_root = repo_root.expanduser().resolve() + try: + return path.relative_to(repo_root).as_posix() + except ValueError: + return path.as_posix() + + +def _file_sha256(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as file: + for chunk in iter(lambda: file.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py new file mode 100644 index 00000000..10723c70 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py @@ -0,0 +1,156 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. 
def cli() -> None:
    """Parse the command line, build the agent environment, and run it.

    Only a single environment is supported; any other ``--num_envs`` value is
    logged as an error and exits with status 1.
    """
    # Compact numeric output for numpy arrays and torch tensors.
    np.set_printoptions(precision=5, suppress=True)
    torch.set_printoptions(precision=5, sci_mode=False)

    parser = argparse.ArgumentParser()
    add_env_launcher_args_to_parser(parser)
    for flag, description in (
        ("--task_name", "Name of the task."),
        ("--agent_config", "Path to the agent configuration file."),
    ):
        parser.add_argument(flag, type=str, help=description, required=True)
    parser.add_argument(
        "--regenerate",
        action="store_true",
        help="Whether to regenerate code if already existed.",
        default=False,
    )
    options = parser.parse_args()

    if options.num_envs != 1:
        log_error(f"Currently only support num_envs=1, but got {options.num_envs}.")
        raise SystemExit(1)

    env_cfg, gym_config, _ = build_env_cfg_from_args(options)
    agent_config = load_config(options.agent_config)
    make_kwargs = {
        "cfg": env_cfg,
        "agent_config": agent_config,
        "agent_config_path": options.agent_config,
        "task_name": options.task_name,
    }
    env = gymnasium.make(id=gym_config["id"], **make_kwargs)

    _run_action_agent(options, env, gym_config)

    if options.headless:
        env.reset(options={"final": True})
def _run_action_agent(args: argparse.Namespace, env: gymnasium.Env, gym_config: dict):
    """Run action-agent graphs without relying on the shared run_env runner.

    Generates ``gym_config["max_episodes"]`` trajectories (1 when the key is
    absent) and resets the environment after each one.
    """
    preview_requested = getattr(args, "preview", False)
    if preview_requested:
        log_warning("Preview mode is handled by the shared runner and is skipped here.")

    log_info("Start action-agent data generation.", color="green")
    episode_count = gym_config.get("max_episodes", 1)
    for episode in range(episode_count):
        _generate_action_agent_trajectory(
            args,
            env,
            episode,
        )
        _obs, _info = env.reset()


def _generate_action_agent_trajectory(
    args: argparse.Namespace,
    env: gymnasium.Env,
    trajectory_idx: int,
) -> bool:
    """Generate and execute one action-agent trajectory.

    Args:
        args: Parsed CLI namespace; optional attributes fall back to defaults.
        env: Environment exposing ``create_demo_action_list``.
        trajectory_idx: Index of the trajectory being generated.

    Returns:
        ``False`` when no usable action list was produced, otherwise ``True``.
    """
    _obs, _info = env.reset()

    # Options missing from the namespace fall back to these defaults.
    option_defaults = (
        ("save_path", ""),
        ("save_video", False),
        ("debug_mode", False),
        ("regenerate", False),
        ("recovery", False),
    )
    generation_options = {
        name: getattr(args, name, fallback) for name, fallback in option_defaults
    }
    build_action_list = env.get_wrapper_attr("create_demo_action_list")
    action_list = build_action_list(
        action_sentence=trajectory_idx,
        **generation_options,
    )
    if action_list is None or len(action_list) == 0:
        log_warning("Action is invalid. Skip to next generation.")
        return False

    if getattr(action_list, "already_executed", False):
        log_info("Action list was already executed by the action-agent runtime.")
    else:
        progress = tqdm.tqdm(
            action_list,
            desc=f"Executing action list #{trajectory_idx}",
            unit="step",
        )
        for action in progress:
            env.step(action)
    _log_task_success(env)
    return True
def _log_task_success(env: gymnasium.Env) -> bool | None:
    """Evaluate and log whether the task succeeded after execution.

    Returns:
        ``True`` only if every element of the success result is truthy,
        ``False`` otherwise, or ``None`` when the check itself failed (the
        failure is logged as a warning).
    """
    try:
        if hasattr(env, "get_wrapper_attr"):
            check_success = env.get_wrapper_attr("is_task_success")
        else:
            check_success = env.is_task_success
        outcome = check_success()
    except Exception as exc:
        # Best effort: a failing success check must not abort the run.
        log_warning(f"Failed to evaluate task success after execution: {exc}")
        return None

    if isinstance(outcome, torch.Tensor):
        flags = outcome.detach().cpu().flatten()
        success_value = bool(flags.all().item())
    else:
        flags = np.asarray(outcome).flatten()
        success_value = bool(flags.all())
    log_info(f"Task success after execution: {success_value}", color="green")
    return success_value
+# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +"""Run the Image2Tabletop -> config generation -> action-agent pipeline.""" + +from __future__ import annotations + +import argparse +from collections.abc import Callable +from dataclasses import dataclass +from datetime import datetime +import json +import os +from pathlib import Path +import re +import shlex +import subprocess +import sys +from typing import Any + + +def _repo_root() -> Path: + current = Path(__file__).resolve() + for parent in current.parents: + if (parent / "setup.py").is_file() and (parent / "embodichain").is_dir(): + return parent + return Path.cwd().resolve() + + +__all__ = ["main"] + +_REPO_ROOT = _repo_root() +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_records import ( + find_history_entry_by_index as _records_find_history_entry_by_index, + history_entry_has_source as _records_history_entry_has_source, + history_entry_index as _records_history_entry_index, + path_from_history_entry as _records_path_from_history_entry, + pipeline_history_path as _records_pipeline_history_path, + read_pipeline_history as _records_read_pipeline_history, + resolve_source_gym_config as _records_resolve_source_gym_config, + write_pipeline_manifests as _records_write_pipeline_manifests, +) + +_DEFAULT_SERVER = "http://192.168.3.23:4523" +_DEFAULT_IMAGE = ( + _REPO_ROOT + / "embodichain/gen_sim/action_agent_pipeline/gym_project_api/image/demo5.jpg" +) +_DEFAULT_IMAGE_DIR = _DEFAULT_IMAGE.parent +_DEFAULT_GYM_PROJECT_ROOT = _REPO_ROOT / "gym_project" +_DEFAULT_EXISTING_GYM_PROJECT = _DEFAULT_GYM_PROJECT_ROOT / "1780562837_gym_project" +_DEFAULT_IMAGE2SCENE_ROOT = _REPO_ROOT / "gym_project/environment/image2tabletop" +_DEFAULT_IMAGE2SCENE_IMAGE = "scene_image/robotwin_example.png" 
@dataclass(frozen=True)
class ProjectResolution:
    """Immutable description of the source the pipeline continues from."""

    # Filesystem path of the resolved source.
    # NOTE(review): presumably a gym project directory or gym config file;
    # confirm against the code that constructs this object.
    path: Path
    # Tag naming how the source was obtained.
    # NOTE(review): the record helpers compare a resolution's mode against
    # "image2tabletop", "existing_gym_project" and "history"; confirm the
    # full set of values at the construction sites.
    mode: str
    # Pipeline history entry this run is based on, if any.
    # NOTE(review): presumably only set for history-based runs; confirm.
    base_history: dict[str, Any] | None = None
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the end-to-end pipeline.

    The options cover, in order: input image selection, the image2scene
    first stage, gym project and pipeline history selection, config
    generation, target replacements, run control, and LLM usage recording.

    Returns:
        The configured parser; nothing is parsed here.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Generate a tabletop gym project from one image, generate action-agent "
            "configs from that project, then run the generated task."
        )
    )
    # --image and --image-name are alternative ways to pick the input image,
    # so argparse rejects using both together.
    image_group = parser.add_mutually_exclusive_group()
    image_group.add_argument(
        "--image",
        default=None,
        help=(
            f"Input image path. If omitted, defaults to {_DEFAULT_IMAGE.as_posix()} "
            f"or {_DEFAULT_IMAGE2SCENE_IMAGE} with --use-image2scene."
        ),
    )
    image_group.add_argument(
        "--image-name",
        "--image_name",
        dest="image_name",
        default=None,
        help=(
            "Image file name under the default image directory. The suffix is "
            'optional, e.g. "demo6" resolves to demo6.jpg.'
        ),
    )
    parser.add_argument(
        "--server",
        default=_DEFAULT_SERVER,
        help=f"Image2Tabletop API server. Defaults to {_DEFAULT_SERVER}",
    )
    parser.add_argument(
        "--use-image2scene",
        action="store_true",
        default=False,
        help=(
            "Use gym_project/environment/image2tabletop/demo_api/client/"
            "image2scene_pipeline.py as the first stage and continue from its "
            "gym_config_merged.json output."
        ),
    )
    parser.add_argument(
        "--background",
        default=None,
        help=(
            "Background description passed to image2scene_pipeline.py. Required "
            "with --use-image2scene."
        ),
    )
    parser.add_argument(
        "--image2scene-root",
        default=str(_DEFAULT_IMAGE2SCENE_ROOT),
        help=(
            "Working directory for image2scene_pipeline.py. Defaults to "
            f"{_DEFAULT_IMAGE2SCENE_ROOT.as_posix()}"
        ),
    )
    parser.add_argument(
        "--image2scene-download-dir",
        default=_DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR,
        help=(
            "Download directory passed to image2scene_pipeline.py. Relative "
            "paths are interpreted under --image2scene-root. Defaults to "
            f"{_DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR}."
        ),
    )
    parser.add_argument(
        "--image2scene-output-root",
        default=_DEFAULT_IMAGE2SCENE_OUTPUT_ROOT,
        help=(
            "Generated EC project directory passed to image2scene_pipeline.py. "
            "Relative paths are interpreted under --image2scene-root. Defaults "
            f"to {_DEFAULT_IMAGE2SCENE_OUTPUT_ROOT}."
        ),
    )
    parser.add_argument(
        "--image2scene-gen-config",
        default=_DEFAULT_IMAGE2SCENE_CONFIG,
        help=(
            "Generation config passed to image2scene_pipeline.py. Relative "
            "paths are interpreted under --image2scene-root. Defaults to "
            f"{_DEFAULT_IMAGE2SCENE_CONFIG}."
        ),
    )
    parser.add_argument(
        "--image2scene-llm-config",
        default=_DEFAULT_IMAGE2SCENE_CONFIG,
        help=(
            "LLM config passed to image2scene_pipeline.py. Relative paths are "
            "interpreted under --image2scene-root. Defaults to "
            f"{_DEFAULT_IMAGE2SCENE_CONFIG}."
        ),
    )
    parser.add_argument(
        "--image2scene-extract-dir",
        default=None,
        help=(
            "Optional extract directory passed to image2scene_pipeline.py. "
            "Relative paths are interpreted under --image2scene-root."
        ),
    )
    parser.add_argument(
        "--image2scene-merged-output",
        default=None,
        help=(
            "Optional merged output path passed to image2scene_pipeline.py. "
            "Relative paths are interpreted under --image2scene-root."
        ),
    )
    parser.add_argument(
        "--gym-project-root",
        default=str(_DEFAULT_GYM_PROJECT_ROOT),
        help=(
            "Directory where Image2Tabletop generated gym projects are written. "
            f"Defaults to {_DEFAULT_GYM_PROJECT_ROOT.as_posix()}"
        ),
    )
    parser.add_argument(
        "--use-existing-gym-project",
        action="store_true",
        default=False,
        help=(
            "Skip Image2Tabletop API and start from --gym-project. Defaults to "
            "false."
        ),
    )
    parser.add_argument(
        "--base-task-name",
        "--base_task_name",
        dest="base_task_name",
        default=None,
        help=(
            "Start from the latest pipeline history entry with this task name. "
            "Use this to chain demos, e.g. demo2 based on Demo1_Text."
        ),
    )
    parser.add_argument(
        "--base-history-index",
        "--base_history_index",
        dest="base_history_index",
        type=int,
        default=None,
        help=(
            "Start from a specific pipeline history index. When used with "
            "--base-task-name, the history entry must match that task name."
        ),
    )
    parser.add_argument(
        "--gym-project",
        "--gym_project",
        dest="gym_project",
        default=str(_DEFAULT_EXISTING_GYM_PROJECT),
        help=(
            "Existing gym project used with --use-existing-gym-project. "
            f"Defaults to {_DEFAULT_EXISTING_GYM_PROJECT.as_posix()}"
        ),
    )
    parser.add_argument(
        "--config-output-dir",
        "--output_dir",
        dest="config_output_dir",
        default=str(_DEFAULT_CONFIG_OUTPUT_DIR),
        help=(
            "Destination directory for generated config files. Defaults to "
            f"{_DEFAULT_CONFIG_OUTPUT_DIR.as_posix()}"
        ),
    )
    parser.add_argument(
        "--pipeline-history-path",
        "--pipeline_history_path",
        dest="pipeline_history_path",
        default=str(_DEFAULT_PIPELINE_HISTORY),
        help=(
            "Global pipeline history JSON path. Defaults to "
            f"{_DEFAULT_PIPELINE_HISTORY.as_posix()}"
        ),
    )
    parser.add_argument(
        "--task_name",
        "--task-name",
        dest="task_name",
        default=_DEFAULT_TASK_NAME,
        help=f"Task name passed to run_agent. Defaults to {_DEFAULT_TASK_NAME}",
    )
    parser.add_argument(
        "--task_description",
        "--task-description",
        dest="task_description",
        default="",
        help=(
            'Task description passed to config generation. Defaults to "". '
            "Ignored for default-template tasks such as Demo1_Text."
        ),
    )
    parser.add_argument(
        "--target_body_scale",
        "--target-body-scale",
        dest="target_body_scale",
        type=float,
        default=0.8,
        help=(
            "Uniform body_scale for generated target objects. Basket-like "
            "containers keep their source body_scale. Defaults to 0.8."
        ),
    )
    # NOTE(review): the "/mesh_assets/newN" paths in the two help texts below
    # look like they lost a leading directory placeholder; confirm the
    # intended wording.
    # nargs="+" lets each option take either a lone PROMPT or a
    # SOURCE_UID PROMPT pair, as the help text describes.
    parser.add_argument(
        "--target_replacement1",
        "--target-replacement1",
        nargs="+",
        metavar="SOURCE_OR_PROMPT",
        default=None,
        help=(
            "Generate /mesh_assets/new1 from PROMPT. Accepts either "
            "PROMPT, which auto-selects the lower-y duplicated rigid "
            "object, or SOURCE_UID PROMPT for explicit selection."
        ),
    )
    parser.add_argument(
        "--target_replacement2",
        "--target-replacement2",
        nargs="+",
        metavar="SOURCE_OR_PROMPT",
        default=None,
        help=(
            "Generate /mesh_assets/new2 from PROMPT. Accepts either "
            "PROMPT, which auto-selects the higher-y duplicated rigid "
            "object, or SOURCE_UID PROMPT for explicit selection."
        ),
    )
    parser.add_argument(
        "--sync_replacement_names",
        "--sync-replacement-names",
        action="store_true",
        default=False,
        help=(
            "Also update replacement target runtime UIDs and generated prompts "
            "from the replacement prompts."
        ),
    )
    # BooleanOptionalAction also registers a matching --no-... flag for each
    # option that uses it.
    parser.add_argument(
        "--reuse-target-replacements",
        "--reuse_target_replacements",
        dest="reuse_target_replacements",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Reuse existing prompt-generated replacement GLBs when the prompt "
            "and expected output name match. Defaults to true."
        ),
    )
    parser.add_argument(
        "--prewarm-coacd-cache",
        "--prewarm_coacd_cache",
        dest="prewarm_coacd_cache",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Precompute environment CoACD cache files during config generation. "
            "Defaults to true."
        ),
    )
    parser.add_argument(
        "--poll-interval",
        type=float,
        default=10.0,
        help="Image2Tabletop job polling interval in seconds. Defaults to 10.0.",
    )
    parser.add_argument(
        "--skip-health-check",
        action="store_true",
        default=False,
        help="Skip GET /health before submitting the image.",
    )
    parser.add_argument(
        "--overwrite-gym-project",
        action="store_true",
        default=False,
        help="Replace an existing generated gym project with the same name.",
    )
    parser.add_argument(
        "--overwrite-config",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Overwrite generated config files. Defaults to true.",
    )
    parser.add_argument(
        "--regenerate",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Pass --regenerate to run_agent. Defaults to true.",
    )
    parser.add_argument(
        "--skip-run-agent",
        action="store_true",
        default=False,
        help="Stop after generating config files instead of launching run_agent.",
    )
    # NOTE(review): the default paths in the two help texts below ("/llm_usage
    # .jsonl", "/llm_usage_summary.json") also look like they lost a leading
    # directory placeholder; confirm.
    parser.add_argument(
        "--llm-usage-output",
        default=None,
        help=(
            "JSONL path for local LLM token usage records. Defaults to "
            "/llm_usage.jsonl."
        ),
    )
    parser.add_argument(
        "--llm-usage-summary-output",
        default=None,
        help=(
            "JSON path for the aggregated local LLM token usage summary. "
            "Defaults to /llm_usage_summary.json."
        ),
    )
    parser.add_argument(
        "--llm-usage-run-id",
        default=None,
        help="Optional run id written into local LLM token usage records.",
    )
    # Stored inverted: args.llm_usage stays True unless this flag is given.
    parser.add_argument(
        "--no-llm-usage",
        dest="llm_usage",
        action="store_false",
        default=True,
        help="Disable local LLM token usage recording for this pipeline run.",
    )
    return parser
+ ), + ) + parser.add_argument( + "--llm-usage-run-id", + default=None, + help="Optional run id written into local LLM token usage records.", + ) + parser.add_argument( + "--no-llm-usage", + dest="llm_usage", + action="store_false", + default=True, + help="Disable local LLM token usage recording for this pipeline run.", + ) + return parser + + +def _ensure_repo_on_pythonpath() -> None: + if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + + +def _resolve_single_image( + image_input: str, + collect_image_paths: Callable[[Path], list[Path]], +) -> Path: + image_paths = collect_image_paths(Path(image_input)) + if len(image_paths) != 1: + paths = ", ".join(path.as_posix() for path in image_paths) + raise ValueError( + "This pipeline expects exactly one image, but got " + f"{len(image_paths)}: {paths}" + ) + return image_paths[0] + + +def _resolve_image_input(args: argparse.Namespace) -> Path: + if args.image_name: + return _resolve_image_name(args.image_name) + if args.image: + return Path(args.image) + return _DEFAULT_IMAGE + + +def _resolve_image_name(image_name: str) -> Path: + image_path = Path(image_name) + if image_path.parent != Path("."): + raise ValueError( + "--image-name only accepts a file name under " + f"{_DEFAULT_IMAGE_DIR.as_posix()}. Use --image for a full path." + ) + if image_path.suffix: + return _DEFAULT_IMAGE_DIR / image_path + + matches = [ + _DEFAULT_IMAGE_DIR / f"{image_name}{suffix}" for suffix in _IMAGE_SUFFIXES + ] + existing = [path for path in matches if path.exists()] + if len(existing) == 1: + return existing[0] + if not existing: + candidates = ", ".join(path.name for path in matches) + raise FileNotFoundError( + f"Image name {image_name!r} was not found. Tried: {candidates}" + ) + + matched = ", ".join(path.name for path in existing) + raise ValueError( + f"Image name {image_name!r} is ambiguous. 
Use --image-name with a suffix: " + f"{matched}" + ) + + +def _resolve_under_root(root: Path, path_input: str | None) -> Path | None: + if path_input is None: + return None + path = Path(path_input).expanduser() + if path.is_absolute(): + return path.resolve() + return (root / path).resolve() + + +def _image2scene_subprocess_env() -> dict[str, str]: + from embodichain.gen_sim.action_agent_pipeline.utils.llm_config import ( + get_openai_compatible_llm_config, + ) + from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( + scrub_usage_tracking_env, + ) + + env = scrub_usage_tracking_env() + cfg = get_openai_compatible_llm_config( + required=False, + require_base_url=False, + ) + env_overrides = { + "OPENAI_API_KEY": cfg.get("api_key"), + "OPENAI_MODEL": cfg.get("model"), + "OPENAI_BASE_URL": cfg.get("base_url"), + "EMBODICHAIN_LLM_PROXY": cfg.get("proxy_url"), + } + for name, value in env_overrides.items(): + if value: + env[name] = str(value) + + if cfg.get("model") or cfg.get("base_url"): + print( + "Using shared LLM config for image2scene subprocess: " + f"model={cfg.get('model')!r}, base_url={cfg.get('base_url')!r}", + flush=True, + ) + return env + + +def _resolve_task_description_for_generation(args: argparse.Namespace) -> str | None: + task_description = str(args.task_description or "").strip() + if args.task_name in _DEFAULT_TASK_TEMPLATE_NAMES: + if task_description: + print( + f"Ignoring --task_description for {args.task_name}; " + "using the default basket task template.", + flush=True, + ) + return None + return task_description or None + + +def _collect_merged_gym_configs(download_dir: Path) -> list[Path]: + if not download_dir.exists(): + return [] + return sorted( + path.resolve() for path in download_dir.rglob("gym_config_merged.json") + ) + + +def _latest_path(paths: list[Path]) -> Path: + return max(paths, key=lambda path: path.stat().st_mtime) + + +def _resolve_image2scene_image( + args: argparse.Namespace, image2scene_root: Path +) 
-> Path: + if args.image_name: + image_name = Path(args.image_name) + if image_name.parent != Path("."): + raise ValueError( + "--image-name only accepts a file name under " + f"{_DEFAULT_IMAGE_DIR.as_posix()} with " + "--use-image2scene. Use --image for a full path." + ) + if image_name.suffix: + return (_DEFAULT_IMAGE_DIR / image_name).resolve() + + matches = [ + _DEFAULT_IMAGE_DIR / f"{args.image_name}{suffix}" + for suffix in _IMAGE_SUFFIXES + ] + existing = [path.resolve() for path in matches if path.exists()] + if len(existing) == 1: + return existing[0] + if not existing: + candidates = ", ".join(path.name for path in matches) + raise FileNotFoundError( + f"Image name {args.image_name!r} was not found. Tried: {candidates}" + ) + + matched = ", ".join(path.name for path in existing) + raise ValueError( + f"Image name {args.image_name!r} is ambiguous. Use --image-name " + f"with a suffix: {matched}" + ) + + image_input = args.image or _DEFAULT_IMAGE2SCENE_IMAGE + image_path = Path(image_input).expanduser() + if image_path.is_absolute(): + return image_path.resolve() + return (image2scene_root / image_path).resolve() + + +def _run_image2scene_pipeline(args: argparse.Namespace) -> Path: + if not args.background: + raise ValueError("--background is required with --use-image2scene.") + + image2scene_root = Path(args.image2scene_root).expanduser().resolve() + if not image2scene_root.is_dir(): + raise FileNotFoundError(f"image2scene root not found: {image2scene_root}") + + script_path = image2scene_root / "demo_api/client/image2scene_pipeline.py" + if not script_path.is_file(): + raise FileNotFoundError(f"image2scene pipeline not found: {script_path}") + + image_path = _resolve_image2scene_image(args, image2scene_root) + download_dir = _resolve_under_root(image2scene_root, args.image2scene_download_dir) + output_root = _resolve_under_root(image2scene_root, args.image2scene_output_root) + gen_config = _resolve_under_root(image2scene_root, args.image2scene_gen_config) 
+ llm_config = _resolve_under_root(image2scene_root, args.image2scene_llm_config) + extract_dir = _resolve_under_root(image2scene_root, args.image2scene_extract_dir) + merged_output = _resolve_under_root( + image2scene_root, args.image2scene_merged_output + ) + + if ( + download_dir is None + or output_root is None + or gen_config is None + or llm_config is None + ): + raise ValueError("image2scene paths must not be empty.") + + before_configs = set(_collect_merged_gym_configs(download_dir)) + command = [ + sys.executable, + str(script_path), + "--server", + args.server, + "--image", + str(image_path), + "--download-dir", + str(download_dir), + "--background", + args.background, + "--output-root", + str(output_root), + "--gen-config", + str(gen_config), + "--llm-config", + str(llm_config), + "--poll-interval", + str(args.poll_interval), + ] + if extract_dir is not None: + command.extend(["--extract-dir", str(extract_dir)]) + if merged_output is not None: + command.extend(["--merged-output", str(merged_output)]) + + print("Running image2scene pipeline:") + print(shlex.join(command), flush=True) + completed = subprocess.run( + command, + cwd=image2scene_root, + check=False, + env=_image2scene_subprocess_env(), + ) + if completed.returncode != 0: + raise RuntimeError( + f"image2scene pipeline failed with exit code {completed.returncode}" + ) + + if merged_output is not None: + if not merged_output.is_file(): + raise FileNotFoundError( + f"image2scene merged output not found: {merged_output}" + ) + print(f"Using image2scene merged gym config: {merged_output}", flush=True) + return merged_output + + after_configs = _collect_merged_gym_configs(download_dir) + new_configs = [path for path in after_configs if path not in before_configs] + if new_configs: + merged_config = _latest_path(new_configs) + elif after_configs: + merged_config = _latest_path(after_configs) + else: + raise FileNotFoundError( + f"gym_config_merged.json not found under: {download_dir}" + ) + + 
print(f"Using image2scene merged gym config: {merged_config}", flush=True) + return merged_config + + +def _resolve_gym_project(args: argparse.Namespace) -> ProjectResolution: + use_history = args.base_task_name is not None or args.base_history_index is not None + selected_modes = [ + args.use_image2scene, + args.use_existing_gym_project, + use_history, + ] + if sum(bool(mode) for mode in selected_modes) > 1: + raise ValueError( + "Use only one of --use-image2scene, --use-existing-gym-project, " + "or --base-task-name/--base-history-index." + ) + + if args.use_existing_gym_project: + project_path = Path(args.gym_project).expanduser().resolve() + if not project_path.exists(): + raise FileNotFoundError(f"gym project not found: {project_path}") + print(f"Using existing gym project: {project_path}", flush=True) + return ProjectResolution(path=project_path, mode="existing_gym_project") + + if args.use_image2scene: + return ProjectResolution( + path=_run_image2scene_pipeline(args), mode="image2scene" + ) + + if use_history: + history_entry = _resolve_base_history_entry(args) + project_path = _path_from_history_entry(history_entry) + print( + "Using base history " + f"#{history_entry.get('index')} ({history_entry.get('task_name')}): " + f"{project_path}", + flush=True, + ) + return ProjectResolution( + path=project_path, + mode="history", + base_history=history_entry, + ) + + from embodichain.gen_sim.action_agent_pipeline.gym_project_api.image2tabletop_client import ( + check_health, + collect_image_paths, + process_image, + ) + + image_input = _resolve_image_input(args) + image_path = _resolve_single_image(str(image_input), collect_image_paths) + if not args.skip_health_check: + check_health(args.server) + + return ProjectResolution( + path=process_image( + server=args.server, + image_path=image_path, + output_root=Path(args.gym_project_root), + poll_interval=args.poll_interval, + overwrite=args.overwrite_gym_project, + ), + mode="image2tabletop", + ) + + +def 
_resolve_base_history_entry(args: argparse.Namespace) -> dict[str, Any]: + if args.base_history_index is not None and args.base_history_index <= 0: + raise ValueError("--base-history-index must be a positive integer.") + + history_path = _pipeline_history_path(args) + history = _read_pipeline_history(history_path) + runs = history["runs"] + + if args.base_history_index is not None: + entry = _find_history_entry_by_index(runs, args.base_history_index) + if entry is None: + raise ValueError( + f"Pipeline history index not found: {args.base_history_index}" + ) + if args.base_task_name and entry.get("task_name") != args.base_task_name: + raise ValueError( + "Pipeline history entry " + f"#{args.base_history_index} has task_name={entry.get('task_name')!r}, " + f"expected {args.base_task_name!r}." + ) + return dict(entry) + + if not args.base_task_name: + raise ValueError("--base-task-name is required without --base-history-index.") + + candidates = [ + entry + for entry in runs + if entry.get("task_name") == args.base_task_name + and _history_entry_has_source(entry) + ] + if not candidates: + raise ValueError( + "No pipeline history entry found for task_name=" + f"{args.base_task_name!r} in {history_path}" + ) + return dict(max(candidates, key=_history_entry_index)) + + +def _pipeline_history_path(args: argparse.Namespace) -> Path: + return _records_pipeline_history_path(args) + + +def _read_pipeline_history(history_path: Path) -> dict[str, Any]: + return _records_read_pipeline_history( + history_path, + schema_version=_PIPELINE_HISTORY_SCHEMA_VERSION, + ) + + +def _find_history_entry_by_index( + runs: list[Any], history_index: int +) -> dict[str, Any] | None: + return _records_find_history_entry_by_index(runs, history_index) + + +def _history_entry_index(entry: dict[str, Any]) -> int: + return _records_history_entry_index(entry) + + +def _history_entry_has_source(entry: dict[str, Any]) -> bool: + return _records_history_entry_has_source(entry) + + +def 
_path_from_history_entry(entry: dict[str, Any]) -> Path: + return _records_path_from_history_entry(entry, repo_root=_REPO_ROOT) + + +def _resolve_target_replacements( + args: argparse.Namespace, + target_replacement_spec_cls: Callable[..., object], + gym_project: Path, +) -> list[object]: + replacements = [] + alias_config = None + if args.target_replacement1: + alias_config = alias_config or _load_replacement_alias_config(gym_project) + source_uid, prompt = _resolve_target_replacement_arg( + args.target_replacement1, + alias_config, + option_name="--target_replacement1", + replacement_number=1, + ) + replacements.append( + target_replacement_spec_cls( + source_uid=source_uid, + prompt=prompt, + output_dir_name="new1", + ) + ) + if args.target_replacement2: + alias_config = alias_config or _load_replacement_alias_config(gym_project) + source_uid, prompt = _resolve_target_replacement_arg( + args.target_replacement2, + alias_config, + option_name="--target_replacement2", + replacement_number=2, + ) + replacements.append( + target_replacement_spec_cls( + source_uid=source_uid, + prompt=prompt, + output_dir_name="new2", + ) + ) + return replacements + + +def _resolve_target_replacement_arg( + values: list[str], + gym_config: dict[str, Any], + *, + option_name: str, + replacement_number: int, +) -> tuple[str, str]: + if len(values) == 1: + prompt = str(values[0]).strip() + if not prompt: + raise ValueError(f"{option_name} prompt must be non-empty.") + source_uid = _auto_replacement_source_uid( + gym_config, + replacement_number=replacement_number, + option_name=option_name, + ) + return source_uid, prompt + + if len(values) == 2: + source_uid, prompt = values + prompt = str(prompt).strip() + if not prompt: + raise ValueError(f"{option_name} prompt must be non-empty.") + source_uid = _resolve_replacement_source_uid( + source_uid, + gym_config, + option_name=option_name, + ) + return source_uid, prompt + + raise ValueError( + f"{option_name} expects either PROMPT or 
SOURCE_UID PROMPT, got " + f"{len(values)} values: {values!r}. Quote multi-word prompts." + ) + + +def _load_replacement_alias_config(gym_project: Path) -> dict[str, Any]: + config_path = _resolve_replacement_alias_gym_config(gym_project) + data = json.loads(config_path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError(f"Gym config must be a JSON object: {config_path}") + return data + + +def _resolve_replacement_alias_gym_config(input_path: Path) -> Path: + input_path = input_path.expanduser().resolve() + if input_path.is_file(): + sibling_gym_config = input_path.parent / "gym_config.json" + if sibling_gym_config.is_file(): + return sibling_gym_config.resolve() + return _resolve_source_gym_config(input_path) + + direct_gym_config = input_path / "gym_config.json" + if direct_gym_config.is_file(): + return direct_gym_config.resolve() + + source_config = _resolve_source_gym_config(input_path) + sibling_gym_config = source_config.parent / "gym_config.json" + if sibling_gym_config.is_file(): + return sibling_gym_config.resolve() + return source_config + + +def _auto_replacement_source_uid( + gym_config: dict[str, Any], + *, + replacement_number: int, + option_name: str, +) -> str: + if replacement_number not in {1, 2}: + raise ValueError(f"Unsupported replacement number: {replacement_number}") + + duplicate_groups = _duplicated_numbered_rigid_object_groups(gym_config) + if len(duplicate_groups) != 1: + candidates = _format_duplicate_group_candidates(duplicate_groups) + raise ValueError( + f"{option_name} was given without an explicit source uid, so the " + "pipeline expected exactly one duplicated numbered rigid_object " + f"group in gym_config.json. Found {len(duplicate_groups)} group(s): " + f"{candidates}. Use SOURCE_UID PROMPT to disambiguate." 
+ ) + + base_name, positioned_objects = duplicate_groups[0] + if len(positioned_objects) != 2: + candidates = _format_duplicate_group_candidates(duplicate_groups) + raise ValueError( + f"{option_name} auto-selection requires exactly two objects in the " + f"duplicated group {base_name!r}, found {len(positioned_objects)}: " + f"{candidates}. Use SOURCE_UID PROMPT to disambiguate." + ) + + if ( + abs(float(positioned_objects[0]["y"]) - float(positioned_objects[1]["y"])) + < 1e-9 + ): + candidates = _format_duplicate_group_candidates(duplicate_groups) + raise ValueError( + f"{option_name} auto-selection requires distinct y coordinates in " + f"duplicated group {base_name!r}: {candidates}. Use SOURCE_UID PROMPT " + "to disambiguate." + ) + + selected = positioned_objects[replacement_number - 1] + source_uid = selected["object"]["uid"] + print( + f"Resolved {option_name} auto source -> {source_uid!r} " + f"from duplicated rigid_object group {base_name!r} by y={selected['y']}", + flush=True, + ) + return source_uid + + +def _duplicated_numbered_rigid_object_groups( + gym_config: dict[str, Any], +) -> list[tuple[str, list[dict[str, Any]]]]: + grouped: dict[str, list[dict[str, Any]]] = {} + for obj in _rigid_objects(gym_config): + parsed = _parse_numbered_rigid_object_uid(obj["uid"]) + if parsed is None: + continue + base_name, number = parsed + grouped.setdefault(base_name, []).append( + { + "number": number, + "y": _rigid_object_y_coordinate(obj), + "object": obj, + } + ) + + duplicate_groups = [] + for base_name, entries in grouped.items(): + if len(entries) < 2: + continue + duplicate_groups.append( + ( + base_name, + sorted( + entries, + key=lambda entry: ( + float(entry["y"]), + str(entry["object"]["uid"]), + ), + ), + ) + ) + return sorted(duplicate_groups, key=lambda item: item[0]) + + +def _parse_numbered_rigid_object_uid(uid: str) -> tuple[str, int] | None: + match = re.match(r"^(?P.+?)[_-]?(?P[0-9]+)$", uid) + if match is None: + return None + base_name = 
match.group("base").strip("_-") + if not base_name: + return None + return base_name, int(match.group("number")) + + +def _rigid_object_y_coordinate(obj: dict[str, Any]) -> float: + init_pos = obj.get("init_pos") + if not isinstance(init_pos, (list, tuple)) or len(init_pos) < 2: + raise ValueError( + "Auto replacement source selection requires each duplicated " + f"rigid_object to define init_pos with a y value, got {obj.get('uid')!r}." + ) + try: + return float(init_pos[1]) + except (TypeError, ValueError) as exc: + raise ValueError( + "Auto replacement source selection requires numeric init_pos[1], " + f"got {obj.get('uid')!r}: {init_pos[1]!r}" + ) from exc + + +def _format_duplicate_group_candidates( + groups: list[tuple[str, list[dict[str, Any]]]], +) -> str: + if not groups: + return "" + parts = [] + for base_name, entries in groups: + values = ", ".join( + f"{entry['object']['uid']}#number={entry['number']},y={entry['y']}" + for entry in entries + ) + parts.append(f"{base_name}: {values}") + return "; ".join(parts) + + +def _resolve_replacement_source_uid( + source_input: str, + gym_config: dict[str, Any], + *, + option_name: str, +) -> str: + source_input = str(source_input).strip() + rigid_objects = _rigid_objects(gym_config) + by_uid = {obj["uid"]: obj for obj in rigid_objects} + if source_input in by_uid: + return source_input + + alias = _parse_indexed_replacement_alias(source_input) + if alias is None: + candidates = _format_rigid_object_candidates(rigid_objects) + raise ValueError( + f"{option_name} source {source_input!r} is neither a rigid object uid " + f"nor an indexed alias such as bread1. 
Rigid object candidates: " + f"{candidates}" + ) + + keyword, alias_index = alias + matches = [ + obj for obj in rigid_objects if _rigid_object_matches_keyword(obj, keyword) + ] + if alias_index > len(matches): + candidates = _format_rigid_object_candidates(matches or rigid_objects) + raise ValueError( + f"{option_name} alias {source_input!r} requested match #{alias_index} " + f"for keyword {keyword!r}, but only found {len(matches)} match(es). " + f"Candidates: {candidates}" + ) + + resolved_uid = matches[alias_index - 1]["uid"] + print( + f"Resolved {option_name} source alias {source_input!r} -> {resolved_uid!r}", + flush=True, + ) + return resolved_uid + + +def _rigid_objects(gym_config: dict[str, Any]) -> list[dict[str, Any]]: + value = gym_config.get("rigid_object", []) + if isinstance(value, dict): + value = [value] + if not isinstance(value, list): + raise ValueError("gym config rigid_object must be a list or object.") + + rigid_objects = [] + for obj in value: + if not isinstance(obj, dict): + continue + uid = str(obj.get("uid", "")).strip() + if not uid: + continue + copied = dict(obj) + copied["uid"] = uid + rigid_objects.append(copied) + if not rigid_objects: + raise ValueError("No rigid_object entries found in gym config.") + return rigid_objects + + +def _parse_indexed_replacement_alias(alias: str) -> tuple[str, int] | None: + match = _INDEXED_REPLACEMENT_ALIAS_RE.match(alias.strip()) + if match is None: + return None + keyword = match.group("keyword").strip(" _-") + index = int(match.group("index")) + if not keyword or index < 1: + return None + return keyword, index + + +def _rigid_object_matches_keyword(obj: dict[str, Any], keyword: str) -> bool: + keyword_tokens = _search_tokens(keyword) + if not keyword_tokens: + return False + object_tokens = set(_search_tokens(_rigid_object_search_text(obj))) + return all(token in object_tokens for token in keyword_tokens) + + +def _rigid_object_search_text(obj: dict[str, Any]) -> str: + values = [ + 
obj.get("uid", ""), + obj.get("source_uid", ""), + obj.get("category", ""), + obj.get("semantic_label", ""), + obj.get("name", ""), + obj.get("description", ""), + ] + shape = obj.get("shape", {}) + if isinstance(shape, dict): + values.extend( + [ + shape.get("fpath", ""), + shape.get("file_path", ""), + shape.get("category", ""), + ] + ) + return " ".join(str(value) for value in values if value) + + +def _search_tokens(value: str) -> list[str]: + return re.findall(r"[a-z0-9]+", str(value).lower()) + + +def _format_rigid_object_candidates(rigid_objects: list[dict[str, Any]]) -> str: + if not rigid_objects: + return "" + parts = [] + for obj in rigid_objects: + shape = obj.get("shape", {}) + fpath = shape.get("fpath", "") if isinstance(shape, dict) else "" + parts.append(f"{obj.get('uid')} ({fpath})") + return ", ".join(parts) + + +def _write_pipeline_manifests( + *, + args: argparse.Namespace, + resolution: ProjectResolution, + generated_paths: object, + target_replacements: list[object], +) -> dict[str, Any]: + return _records_write_pipeline_manifests( + args=args, + resolution=resolution, + generated_paths=generated_paths, + target_replacements=target_replacements, + repo_root=_REPO_ROOT, + schema_version=_PIPELINE_HISTORY_SCHEMA_VERSION, + manifest_filename=_PIPELINE_MANIFEST_FILENAME, + ) + + +def _resolve_source_gym_config(input_path: Path) -> Path: + return _records_resolve_source_gym_config( + input_path, + gym_config_preference=_GYM_CONFIG_PREFERENCE, + ) + + +def _configure_llm_usage_tracking( + args: argparse.Namespace, +) -> tuple[Path, Path] | None: + if not args.llm_usage: + from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( + disable_usage_tracking, + ) + + disable_usage_tracking() + return None + + from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( + configure_usage_tracking, + ) + + output_dir = Path(args.config_output_dir).expanduser().resolve() + usage_path = ( + 
def main() -> int:
    """Run the pipeline: resolve a gym project, generate configs, run the agent.

    Returns:
        ``0`` when ``--skip_run_agent`` style skipping is requested via
        ``args.skip_run_agent``; otherwise the return code reported by
        ``_run_agent_command``.
    """
    args = _build_parser().parse_args()

    # The package import below is deliberately deferred until after the
    # repository has been put on the import path.
    _ensure_repo_on_pythonpath()
    from embodichain.gen_sim.action_agent_pipeline.generation.ur5_basket_config import (
        TargetReplacementSpec,
        generate_ur5_basket_config_from_project,
    )

    # Usage tracking is configured only after the gym project is resolved.
    resolution = _resolve_gym_project(args)
    usage_paths = _configure_llm_usage_tracking(args)
    target_replacements = _resolve_target_replacements(
        args,
        TargetReplacementSpec,
        resolution.path,
    )
    task_description = _resolve_task_description_for_generation(args)
    # Store the resolved description (or "") back on args, which is later
    # handed to _write_pipeline_manifests.
    args.task_description = task_description or ""

    paths = generate_ur5_basket_config_from_project(
        gym_project=resolution.path,
        output_dir=args.config_output_dir,
        task_name=args.task_name,
        task_description=task_description,
        target_body_scale=args.target_body_scale,
        target_replacements=target_replacements,
        sync_replacement_names=args.sync_replacement_names,
        reuse_target_replacements=args.reuse_target_replacements,
        prewarm_coacd_cache=args.prewarm_coacd_cache,
        overwrite=args.overwrite_config,
    )
    _write_pipeline_manifests(
        args=args,
        resolution=resolution,
        generated_paths=paths,
        target_replacements=target_replacements,
    )

    print(f"Using gym project/config: {resolution.path}", flush=True)
    print(f"Generated gym config: {paths.gym_config}", flush=True)
    print(f"Generated agent config: {paths.agent_config}", flush=True)
    if args.skip_run_agent:
        # Config generation only: still emit the usage summary before exiting.
        _write_llm_usage_summary(usage_paths)
        return 0

    return_code = _run_agent_command(
        task_name=args.task_name,
        gym_config=paths.gym_config,
        agent_config=paths.agent_config,
        regenerate=args.regenerate,
    )
    # Written after the agent command returns, whatever its return code.
    _write_llm_usage_summary(usage_paths)
    return return_code
---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/__init__.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/__init__.py new file mode 100644 index 00000000..015c4151 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/__init__.py @@ -0,0 +1,19 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/atomic_actions.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/atomic_actions.py new file mode 100644 index 00000000..907af3c6 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/atomic_actions.py @@ -0,0 +1,54 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@register_env("AtomicActionsAgent-v3", max_episode_steps=600)
class AtomicActionsAgentEnv(BaseAgentEnv, EmbodiedEnv):
    """Config-driven agent environment for atomic-action tasks.

    Registered under the id ``AtomicActionsAgent-v3`` with
    ``max_episode_steps=600``. Agent setup and state caching come from
    ``BaseAgentEnv``; success is evaluated by ``evaluate_configured_success``.
    """

    def __init__(self, cfg: EmbodiedEnvCfg | None = None, **kwargs) -> None:
        """Initialize the environment, then create the agents.

        Args:
            cfg: Environment configuration forwarded to the parent initializer.
            **kwargs: Forwarded both to the parent initializer and to
                ``_init_agents``, so they must be accepted by both.
        """
        super().__init__(cfg, **kwargs)
        # Optional opt-in: when the env exposes a truthy
        # ``ignore_terminations_during_agent`` attribute, terminations are
        # ignored via the env config.
        if bool(getattr(self, "ignore_terminations_during_agent", False)):
            self.cfg.ignore_terminations = True
        super()._init_agents(**kwargs)

    def reset(self, seed: int | None = None, options: dict | None = None):
        """Reset the environment and refresh the cached agent state.

        Returns:
            The ``(obs, info)`` pair produced by the parent ``reset``.
        """
        obs, info = super().reset(seed=seed, options=options)
        # Re-capture robot/object state after every reset.
        super().get_states()
        return obs, info

    def is_task_success(self, **kwargs) -> torch.Tensor:
        """Return the success result computed by ``evaluate_configured_success``."""
        return evaluate_configured_success(self)

    def compute_task_state(self, **kwargs) -> tuple[torch.Tensor, torch.Tensor, dict]:
        """Return ``(success, fail, info)`` for the current state.

        ``fail`` is a zero tensor shaped like ``success`` and ``info`` is
        always an empty dict.
        """
        success = self.is_task_success()
        fail = torch.zeros_like(success)
        return success, fail, {}
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from copy import deepcopy + +import torch +from embodichain.utils import logger + +_TASK_PROMPT_KEYS = frozenset({"task_prompt", "basic_background", "atom_actions"}) + + +class BaseAgentEnv: + + def _init_agents(self, agent_config, task_name, agent_config_path=None): + from embodichain.gen_sim.action_agent_pipeline.agents.task_agent import ( + TaskAgent, + ) + from embodichain.gen_sim.action_agent_pipeline.agents.compile_agent import ( + CompileAgent, + ) + from embodichain.gen_sim.action_agent_pipeline.agents.llm import ( + task_llm, + compile_llm, + ) + + task_agent_config = self._agent_config_with_prompt_keys( + agent_config["Agent"], + _TASK_PROMPT_KEYS, + ) + compile_agent_config = self._agent_config_with_prompt_keys( + agent_config["Agent"], + frozenset(), + ) + self.task_agent = TaskAgent( + task_llm, + **task_agent_config, + **agent_config["TaskAgent"], + task_name=task_name, + config_dir=agent_config_path, + ) + self.compile_agent = CompileAgent( + compile_llm, + **compile_agent_config, + **agent_config["CompileAgent"], + task_name=task_name, + config_dir=agent_config_path, + ) + + def _agent_config_with_prompt_keys(self, agent_config, allowed_keys): + filtered = deepcopy(agent_config) + prompt_kwargs = filtered.get("prompt_kwargs", {}) or {} + filtered["prompt_kwargs"] = { + key: value for 
key, value in prompt_kwargs.items() if key in allowed_keys + } + return filtered + + def get_states(self): + # TODO: only support num_env = 1 for now + # store robot states in each env.reset + self.init_qpos = self.robot.get_qpos().squeeze(0) + + self._agent_arm_slots = self._resolve_agent_arm_slots() + for side in ("left", "right"): + self._initialize_agent_arm_slot(side, self._agent_arm_slots.get(side)) + + self.open_state = torch.as_tensor( + getattr( + self, + "agent_open_state", + getattr(self, "gripper_open_state", [0.05]), + ), + dtype=self.init_qpos.dtype, + device=self.init_qpos.device, + ).flatten() + self.close_state = torch.as_tensor( + getattr( + self, + "agent_close_state", + getattr(self, "gripper_close_state", [0.0]), + ), + dtype=self.init_qpos.dtype, + device=self.init_qpos.device, + ).flatten() + self.left_arm_current_gripper_state = self._initial_gripper_state("left") + self.right_arm_current_gripper_state = self._initial_gripper_state("right") + + self.update_obj_info() + + def _resolve_agent_arm_slots(self) -> dict[str, dict[str, str | None] | None]: + configured_slots = getattr(self, "agent_arm_slots", None) + if configured_slots is not None: + return self._normalize_agent_arm_slots(configured_slots) + + if hasattr(self, "single_arm_name") or hasattr(self, "single_eef_name"): + slot = getattr(self, "agent_single_arm_slot", "right") + return self._normalize_agent_arm_slots( + { + slot: { + "arm": getattr(self, "single_arm_name", "right_arm"), + "eef": getattr(self, "single_eef_name", "right_eef"), + } + } + ) + + control_parts = getattr(self.robot, "control_parts", {}) or {} + if "arm" in control_parts and "hand" in control_parts: + slot = getattr(self, "agent_single_arm_slot", "left") + return self._normalize_agent_arm_slots( + {slot: {"arm": "arm", "eef": "hand"}} + ) + + return self._normalize_agent_arm_slots( + { + "left": {"arm": "left_arm", "eef": "left_eef"}, + "right": {"arm": "right_arm", "eef": "right_eef"}, + } + ) + + def 
_normalize_agent_arm_slots( + self, slots + ) -> dict[str, dict[str, str | None] | None]: + normalized = {"left": None, "right": None} + for side in normalized: + slot_cfg = slots.get(side) if isinstance(slots, dict) else None + if slot_cfg is None: + continue + if isinstance(slot_cfg, str): + normalized[side] = {"arm": slot_cfg, "eef": None} + continue + normalized[side] = { + "arm": slot_cfg.get("arm", slot_cfg.get("arm_control_part")), + "eef": slot_cfg.get( + "eef", + slot_cfg.get("hand", slot_cfg.get("eef_control_part")), + ), + } + return normalized + + def _initialize_agent_arm_slot( + self, side: str, slot_cfg: dict[str, str | None] | None + ) -> None: + arm_name = slot_cfg.get("arm") if slot_cfg else None + eef_name = slot_cfg.get("eef") if slot_cfg else None + arm_joints = self._get_control_part_joint_ids(arm_name) + eef_joints = self._get_control_part_joint_ids(eef_name) + + setattr(self, f"{side}_arm_joints", arm_joints) + setattr(self, f"{side}_eef_joints", eef_joints) + + if arm_name is None or not arm_joints: + setattr(self, f"{side}_arm_init_qpos", self.init_qpos.new_empty(0)) + setattr(self, f"{side}_arm_init_xpos", None) + setattr(self, f"{side}_arm_base_pose", None) + setattr(self, f"{side}_arm_current_qpos", self.init_qpos.new_empty(0)) + setattr(self, f"{side}_arm_current_xpos", None) + return + + init_qpos = self.init_qpos[arm_joints] + init_xpos = self.robot.compute_fk( + init_qpos, name=arm_name, to_matrix=True + ).squeeze(0) + base_pose = self.robot.get_control_part_base_pose( + arm_name, to_matrix=True + ).squeeze(0) + + setattr(self, f"{side}_arm_init_qpos", init_qpos) + setattr(self, f"{side}_arm_init_xpos", init_xpos) + setattr(self, f"{side}_arm_base_pose", base_pose) + setattr(self, f"{side}_arm_current_qpos", init_qpos) + setattr(self, f"{side}_arm_current_xpos", init_xpos) + + def _get_control_part_joint_ids(self, control_part: str | None) -> list[int]: + if control_part is None: + return [] + if control_part not in 
(getattr(self.robot, "control_parts", {}) or {}): + return [] + return list(self.robot.get_joint_ids(name=control_part)) + + def _initial_gripper_state(self, side: str) -> torch.Tensor: + if len(getattr(self, f"{side}_eef_joints", []) or []) == 0: + return self.open_state.new_empty(0) + return self.open_state + + def update_obj_info(self): + # store some useful obj information + obj_info = getattr(self, "obj_info", {}) + obj_uids = self.sim.get_rigid_object_uid_list() + for obj_name in obj_uids: + obj = self.sim.get_rigid_object(obj_name) + obj_pose = obj.get_local_pose(to_matrix=True).squeeze(0) + + if obj_name not in obj_info: + obj_height = obj_pose[2, 3] # Extract the height (z-coordinate) + obj_info[obj_name] = { + "pose": obj_pose, # Store the full pose (4x4 matrix) + "height": obj_height, # Store the initial height (z-coordinate) + } + else: + obj_info[obj_name]["pose"] = obj_pose + + self.obj_info = obj_info + + # -------------------- Common getters / setters -------------------- + + def get_obs_for_agent(self): + obs = self.get_obs() + rgb = obs["sensor"]["cam_high"]["color"].squeeze(0) + + # Get validation camera data + camera_data = self.event_manager.get_functor("validation_cameras")(self, None) + result = {"rgb": rgb} + result.update({k: v.squeeze(0) for k, v in camera_data.items()}) + return result + + def get_current_qpos_agent(self): + return self.left_arm_current_qpos, self.right_arm_current_qpos + + def set_current_qpos_agent(self, arm_qpos, is_left): + if is_left: + self.left_arm_current_qpos = arm_qpos + else: + self.right_arm_current_qpos = arm_qpos + + def get_current_xpos_agent(self): + return self.left_arm_current_xpos, self.right_arm_current_xpos + + def set_current_xpos_agent(self, arm_xpos, is_left): + if is_left: + self.left_arm_current_xpos = arm_xpos + else: + self.right_arm_current_xpos = arm_xpos + + def get_current_gripper_state_agent(self): + return self.left_arm_current_gripper_state, self.right_arm_current_gripper_state + + def 
set_current_gripper_state_agent(self, arm_gripper_state, is_left): + if is_left: + self.left_arm_current_gripper_state = arm_gripper_state + else: + self.right_arm_current_gripper_state = arm_gripper_state + + # -------------------- IK / FK -------------------- + def get_arm_ik(self, target_xpos, is_left, qpos_seed=None): + control_part = self.get_agent_arm_control_part(is_left) + ret, qpos = self.robot.compute_ik( + name=control_part, pose=target_xpos, joint_seed=qpos_seed + ) + return ret.all().item(), qpos.squeeze(0) + + def get_arm_fk(self, qpos, is_left): + control_part = self.get_agent_arm_control_part(is_left) + xpos = self.robot.compute_fk( + name=control_part, qpos=torch.as_tensor(qpos), to_matrix=True + ) + return xpos.squeeze(0) + + def get_agent_arm_control_part(self, is_left: bool) -> str: + return self._get_agent_control_part(is_left=is_left, key="arm") + + def get_agent_eef_control_part(self, is_left: bool) -> str | None: + return self._get_agent_control_part(is_left=is_left, key="eef", required=False) + + def _get_agent_control_part( + self, is_left: bool, key: str, required: bool = True + ) -> str | None: + if not hasattr(self, "_agent_arm_slots"): + self._agent_arm_slots = self._resolve_agent_arm_slots() + side = "left" if is_left else "right" + slot_cfg = getattr(self, "_agent_arm_slots", {}).get(side) + control_part = slot_cfg.get(key) if slot_cfg else None + if control_part is None and required: + logger.log_error( + f"{side}_{key} is not configured for agent control.", + error_type=ValueError, + ) + return control_part + + # -------------------- get compiled graph for action list -------------------- + def generate_graph_for_actions(self, regenerate=False, recovery=False, **kwargs): + if recovery: + raise NotImplementedError( + "RecoveryAgent has been removed from this pipeline." 
+ ) + + logger.log_info( + "Generate graph for creating action list for " + f"{self.compile_agent.task_name}.", + color="green", + ) + + print(f"\033[92m\nStart task graph generation.\n\033[0m") + task_agent_input = self.task_agent.get_composed_observations( + env=self, + regenerate=regenerate, + observations=self.get_obs_for_agent(), + **kwargs, + ) + task_graph = self.task_agent.generate(**task_agent_input) + + print(f"\033[94m\nStart graph compilation.\n\033[0m") + compile_agent_input = self.compile_agent.get_composed_observations( + env=self, + regenerate=regenerate, + task_graph=task_graph, + **kwargs, + ) + graph_file_path, kwargs, graph_content = self.compile_agent.generate( + **compile_agent_input + ) + + return graph_file_path, kwargs, graph_content + + # -------------------- get action list -------------------- + def create_demo_action_list( + self, regenerate=False, recovery=False, *args, **kwargs + ): + graph_file_path, compile_kwargs, _ = self.generate_graph_for_actions( + regenerate=regenerate, recovery=recovery + ) + atomic_action_kwargs = { + "use_place_action": True, + "allow_grasp_annotation": True, + "force_grasp_reannotate": False, + } + for key in atomic_action_kwargs: + if key in kwargs: + atomic_action_kwargs[key] = kwargs[key] + compile_kwargs.update(atomic_action_kwargs) + action_list = self.compile_agent.act(graph_file_path, **compile_kwargs) + return action_list diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py new file mode 100644 index 00000000..23de84b9 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py @@ -0,0 +1,237 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from typing import Any + +import torch + +__all__ = ["evaluate_configured_success"] + + +def evaluate_configured_success( + env, + spec: Mapping[str, Any] | None = None, +) -> torch.Tensor: + """Evaluate action-agent task success predicates from env config.""" + success_spec = spec or getattr(env, "agent_success", None) + if success_spec is None: + return _constant(env, False) + return _evaluate_spec(env, success_spec) + + +def _evaluate_spec( + env, + spec: Mapping[str, Any] | Sequence[Mapping[str, Any]], +) -> torch.Tensor: + if isinstance(spec, Sequence) and not isinstance(spec, (str, bytes, Mapping)): + return _evaluate_all(env, spec) + if not isinstance(spec, Mapping): + raise TypeError(f"Success spec must be a mapping, got {type(spec)}.") + + op = str(spec.get("op", "")).lower() + if not op and "terms" in spec and "type" not in spec and "func" not in spec: + op = "all" + if op in {"all", "and"}: + return _evaluate_all(env, spec.get("terms", [])) + if op in {"any", "or"}: + return _evaluate_any(env, spec.get("terms", [])) + if op == "not": + term = spec.get("term") + terms = spec.get("terms") + if term is None and isinstance(terms, Sequence) and len(terms) == 1: + term = terms[0] + if term is None: + raise ValueError("Success op 'not' requires exactly 
one term.") + return ~_evaluate_spec(env, term) + + term_type = str(spec.get("type", spec.get("func", ""))).lower() + if term_type in {"object_position_near", "object_near_position"}: + return _object_position_near(env, spec) + if term_type in {"object_xy_near", "object_near_xy"}: + return _object_xy_near(env, spec) + if term_type == "object_in_container": + return _object_in_container(env, spec) + if term_type in {"object_on_object", "object_on", "on_object"}: + return _object_on_object(env, spec) + if term_type in {"object_not_fallen", "not_fallen"}: + return _object_not_fallen(env, spec) + if term_type in {"object_axis_offset_near", "object_relative_axis_near"}: + return _object_axis_offset_near(env, spec) + if term_type in {"object_axis_near", "object_coordinate_near"}: + return _object_axis_near(env, spec) + if term_type in {"object_lifted", "object_height_above_initial"}: + return _object_lifted(env, spec) + raise ValueError(f"Unsupported success term type: {term_type!r}.") + + +def _evaluate_all(env, terms: Sequence[Mapping[str, Any]]) -> torch.Tensor: + success = _constant(env, True) + for term in terms: + success = success & _evaluate_spec(env, term) + return success + + +def _evaluate_any(env, terms: Sequence[Mapping[str, Any]]) -> torch.Tensor: + success = _constant(env, False) + for term in terms: + success = success | _evaluate_spec(env, term) + return success + + +def _constant(env, value: bool) -> torch.Tensor: + return torch.full((env.num_envs,), value, dtype=torch.bool, device=env.device) + + +def _pose(env, uid: str) -> torch.Tensor: + return env.sim.get_rigid_object(uid).get_local_pose(to_matrix=True) + + +def _position(env, uid: str) -> torch.Tensor: + return _pose(env, uid)[:, :3, 3] + + +def _tensor(value: Any, *, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + return torch.as_tensor(value, dtype=dtype, device=device) + + +def _object_name(spec: Mapping[str, Any]) -> str: + return str(spec.get("object", spec.get("object_uid"))) + 
+ +def _object_position_near(env, spec: Mapping[str, Any]) -> torch.Tensor: + position = _position(env, _object_name(spec)) + target = _tensor( + spec.get("target_position", spec.get("position", spec.get("target"))), + dtype=position.dtype, + device=position.device, + ).flatten() + if target.numel() == 2: + return _object_xy_near(env, {**spec, "target_xy": target}) + target = target.reshape(1, 3) + return torch.linalg.norm(position - target, dim=-1) <= float( + spec.get("tolerance", 0.05) + ) + + +def _object_xy_near(env, spec: Mapping[str, Any]) -> torch.Tensor: + position = _position(env, _object_name(spec)) + target_xy = _tensor( + spec.get("target_xy", spec.get("xy", spec.get("target"))), + dtype=position.dtype, + device=position.device, + ).flatten()[:2] + tolerance = float(spec.get("tolerance", spec.get("xy_tolerance", 0.05))) + return ( + torch.linalg.norm(position[:, :2] - target_xy.reshape(1, 2), dim=-1) + <= tolerance + ) + + +def _object_in_container(env, spec: Mapping[str, Any]) -> torch.Tensor: + object_position = _position(env, _object_name(spec)) + container_position = _position( + env, + str(spec.get("container", spec.get("container_uid"))), + ) + xy_distance = torch.linalg.norm( + object_position[:, :2] - container_position[:, :2], + dim=-1, + ) + z_offset = object_position[:, 2] - container_position[:, 2] + return ( + (xy_distance <= float(spec.get("xy_radius", spec.get("radius", 0.1)))) + & (z_offset >= float(spec.get("min_z_offset", -0.03))) + & (z_offset <= float(spec.get("max_z_offset", 0.25))) + ) + + +def _object_on_object(env, spec: Mapping[str, Any]) -> torch.Tensor: + object_position = _position(env, _object_name(spec)) + support_position = _position( + env, + str( + spec.get( + "support", + spec.get("support_uid", spec.get("reference", spec.get("container"))), + ) + ), + ) + xy_distance = torch.linalg.norm( + object_position[:, :2] - support_position[:, :2], + dim=-1, + ) + z_offset = object_position[:, 2] - support_position[:, 2] + 
return ( + (xy_distance <= float(spec.get("xy_radius", spec.get("radius", 0.08)))) + & (z_offset >= float(spec.get("min_z_offset", 0.02))) + & (z_offset <= float(spec.get("max_z_offset", 0.35))) + ) + + +def _object_not_fallen(env, spec: Mapping[str, Any]) -> torch.Tensor: + pose = _pose(env, _object_name(spec)) + pose_z_axis = pose[:, :3, 2] + world_z_axis = torch.tensor([0, 0, 1], dtype=pose.dtype, device=pose.device) + dot_product = torch.sum(pose_z_axis * world_z_axis, dim=-1).clamp(-1.0, 1.0) + return torch.arccos(dot_product) < float(spec.get("max_tilt", torch.pi / 4)) + + +def _object_axis_offset_near(env, spec: Mapping[str, Any]) -> torch.Tensor: + object_position = _position(env, _object_name(spec)) + reference_position = _position( + env, + str(spec.get("reference", spec.get("reference_uid"))), + ) + axis = _axis_index(str(spec.get("axis", "y"))) + target_value = reference_position[:, axis] + float(spec.get("offset", 0.0)) + return torch.abs(object_position[:, axis] - target_value) <= float( + spec.get("tolerance", 0.02) + ) + + +def _object_axis_near(env, spec: Mapping[str, Any]) -> torch.Tensor: + object_position = _position(env, _object_name(spec)) + axis = _axis_index(str(spec.get("axis", "y"))) + target_value = float(spec.get("target", spec.get("value"))) + return torch.abs(object_position[:, axis] - target_value) <= float( + spec.get("tolerance", 0.02) + ) + + +def _object_lifted(env, spec: Mapping[str, Any]) -> torch.Tensor: + object_name = _object_name(spec) + position = _position(env, object_name) + initial_height = spec.get("initial_height") + if initial_height is None: + initial_height = getattr(env, "obj_info", {}).get(object_name, {}).get("height") + if initial_height is None: + initial_height = position[:, 2] + initial_height = _tensor( + initial_height, + dtype=position.dtype, + device=position.device, + ) + return position[:, 2] >= initial_height + float(spec.get("min_height", 0.1)) + + +def _axis_index(axis: str) -> int: + axes = {"x": 0, 
"y": 1, "z": 2} + if axis not in axes: + raise ValueError(f"Unsupported axis {axis!r}; expected one of x, y, z.") + return axes[axis] diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/__init__.py b/embodichain/gen_sim/action_agent_pipeline/generation/__init__.py new file mode 100644 index 00000000..f89f7a8b --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/__init__.py @@ -0,0 +1,21 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +"""Config generation helpers for the action-agent pipeline.""" + +__all__: list[str] = [] diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py b/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py new file mode 100644 index 00000000..f2f4fee0 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py @@ -0,0 +1,171 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Mapping +from pathlib import Path +from typing import Any +import hashlib + +from embodichain.utils.logger import log_info + +__all__ = [ + "coacd_cache_path_for_mesh", + "prewarm_coacd_cache_for_gym_config", +] + +_DEFAULT_CONVEX_DECOMP_DIR = ( + Path.home() / ".cache" / "embodichain_cache" / "convex_decomposition" +) + + +def coacd_cache_path_for_mesh( + mesh_path: str | Path, + max_convex_hull_num: int, + cache_dir: str | Path | None = None, +) -> Path: + """Return the DexSim environment-side CoACD cache path for a mesh.""" + + if cache_dir is None: + cache_dir = _DEFAULT_CONVEX_DECOMP_DIR + + mesh_path = Path(mesh_path).expanduser().resolve() + mesh_md5_key = hashlib.md5(mesh_path.read_bytes()).hexdigest() + return Path(cache_dir).expanduser().resolve() / ( + f"{mesh_md5_key}_{int(max_convex_hull_num)}.obj" + ) + + +def prewarm_coacd_cache_for_gym_config( + gym_config: Mapping[str, Any], + *, + cache_dir: str | Path | None = None, + repo_root: str | Path | None = None, +) -> list[dict[str, Any]]: + """Precompute DexSim environment-side CoACD cache files for mesh objects.""" + + entries = [] + for obj in _iter_mesh_object_configs(gym_config): + max_convex_hull_num = int(obj.get("max_convex_hull_num", 1)) + if max_convex_hull_num <= 1: + continue + entries.append((obj, max_convex_hull_num)) + if not entries: + return [] + + if cache_dir is None: + cache_dir = _DEFAULT_CONVEX_DECOMP_DIR + + cache_dir = 
Path(cache_dir).expanduser().resolve() + cache_dir.mkdir(parents=True, exist_ok=True) + repo_root = Path(repo_root).expanduser().resolve() if repo_root else _repo_root() + + reports: list[dict[str, Any]] = [] + seen_cache_paths: set[Path] = set() + for obj, max_convex_hull_num in entries: + uid = str(obj.get("uid", "")) + raw_fpath = str(obj.get("shape", {}).get("fpath", "")) + mesh_path = _resolve_mesh_path(raw_fpath, repo_root) + cache_path = coacd_cache_path_for_mesh( + mesh_path, + max_convex_hull_num, + cache_dir, + ) + report = { + "uid": uid, + "mesh_path": mesh_path.as_posix(), + "max_convex_hull_num": max_convex_hull_num, + "cache_path": cache_path.as_posix(), + } + if cache_path in seen_cache_paths: + report["status"] = "duplicate" + elif cache_path.is_file(): + report["status"] = "hit" + else: + try: + _generate_coacd_cache(mesh_path, cache_path, max_convex_hull_num) + except Exception as exc: + report["status"] = "skipped" + report["reason"] = str(exc) + else: + report["status"] = "generated" + seen_cache_paths.add(cache_path) + reports.append(report) + return reports + + +def _iter_mesh_object_configs( + gym_config: Mapping[str, Any], +) -> list[Mapping[str, Any]]: + objects = [] + for section in ("background", "rigid_object"): + value = gym_config.get(section, []) + if isinstance(value, Mapping): + value = [value] + if not isinstance(value, list): + continue + for obj in value: + if not isinstance(obj, Mapping): + continue + shape = obj.get("shape", {}) + if isinstance(shape, Mapping) and shape.get("shape_type") == "Mesh": + objects.append(obj) + return objects + + +def _resolve_mesh_path(raw_fpath: str, repo_root: Path) -> Path: + path = Path(raw_fpath).expanduser() + if path.is_absolute(): + candidate = path.resolve() + else: + candidate = (repo_root / path).resolve() + if not candidate.is_file(): + cwd_candidate = (Path.cwd() / path).resolve() + if cwd_candidate.is_file(): + candidate = cwd_candidate + if not candidate.is_file(): + raise 
FileNotFoundError(f"Mesh path for CoACD prewarm not found: {raw_fpath}") + return candidate + + +def _generate_coacd_cache( + mesh_path: Path, + cache_path: Path, + max_convex_hull_num: int, +) -> None: + import open3d as o3d + from dexsim.kit.meshproc import convex_decomposition_coacd + from dexsim.kit.meshproc.utility import mesh_list_to_file + + log_info( + "Prewarming environment CoACD cache: " + f"mesh={mesh_path.as_posix()}, hulls={max_convex_hull_num}" + ) + in_mesh = o3d.t.io.read_triangle_mesh(mesh_path.as_posix()) + _, out_mesh_list = convex_decomposition_coacd( + in_mesh, + max_convex_hull_num=int(max_convex_hull_num), + ) + mesh_list_to_file(cache_path.as_posix(), out_mesh_list) + + +def _repo_root() -> Path: + current = Path(__file__).resolve() + for parent in current.parents: + if (parent / "setup.py").is_file() and (parent / "embodichain").is_dir(): + return parent + return Path.cwd().resolve() diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py new file mode 100644 index 00000000..b4cf86ec --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -0,0 +1,992 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +"""Prompt and agent-config builders for generated action-agent tasks.""" + +from __future__ import annotations + +import json +from collections.abc import Mapping, Sequence +from typing import Any, Protocol + +__all__ = [ + "make_agent_config", + "make_basket_atom_actions_prompt", + "make_basket_basic_background", + "make_basket_task_prompt", + "make_relative_atom_actions_prompt", + "make_relative_basic_background", + "make_relative_task_prompt", +] + + +class _BasketRolesLike(Protocol): + left_target_runtime_uid: str + right_target_runtime_uid: str + container_runtime_uid: str + left_target_source_uid: str + right_target_source_uid: str + container_source_uid: str + left_target_noun: str + right_target_noun: str + + +class _RelativePlacementLike(Protocol): + active_side: str + moved_runtime_uid: str + moved_source_uid: str + reference_runtime_uid: str + reference_source_uid: str + relation: str + high_offset: tuple[float, float, float] + release_offset: tuple[float, float, float] + + +class _RelativeSpecLike(_RelativePlacementLike, Protocol): + placements: Sequence[_RelativePlacementLike] + task_prompt_summary: str + task_description: str + action_sketch: Sequence[str] + basic_background_notes: str + + +def make_agent_config() -> dict[str, Any]: + return { + "TaskAgent": { + "prompt_name": "generate_task_graph", + }, + "CompileAgent": { + "prompt_name": "compile_agent_graph", + }, + "Agent": { + "prompt_kwargs": { + "task_prompt": { + "type": "text", + "name": "task_prompt.txt", + }, + "basic_background": { + "type": "text", + "name": "basic_background.txt", + }, + "atom_actions": { + "type": "text", + "name": "atom_actions.txt", + }, + } + }, + } + + +def make_relative_task_prompt( + task_name: str, + project_name: str, + spec: _RelativeSpecLike, +) -> str: + if len(spec.placements) > 1: + return _make_dual_relative_task_prompt(task_name, project_name, spec) + + active_arm = 
f"{spec.active_side}_arm" + inactive_slot = ( + "right_arm_action" if spec.active_side == "left" else "left_arm_action" + ) + active_slot = f"{spec.active_side}_arm_action" + action_sketch = _format_action_sketch(spec.action_sketch) + pick_spec = _format_pick_up_spec(active_arm, spec.moved_runtime_uid) + high_spec = _format_pose_object_spec( + active_arm, + spec.reference_runtime_uid, + spec.high_offset, + sample_interval=45, + ) + release_spec = _format_pose_object_spec( + active_arm, + spec.reference_runtime_uid, + spec.release_offset, + sample_interval=30, + ) + open_spec = _format_gripper_spec( + active_arm, + "open", + sample_interval=15, + post_hold_steps=25, + ) + retreat_spec = _format_pose_offset_spec( + active_arm, + (0.0, 0.0, 0.14), + sample_interval=20, + ) + initial_spec = _format_initial_qpos_spec(active_arm, sample_interval=30) + return f"""Task: +{task_name}: {spec.task_prompt_summary} + +This config was generated from a simple task description by the config-stage +LLM. The execution-stage LLM must now generate the graph JSON from this prompt. + +Original simple task description: +{spec.task_description} + +Config-stage LLM interpretation: +{action_sketch} + +Object and arm mapping: +- Move `{spec.moved_runtime_uid}`. Source object: `{spec.moved_source_uid}`. +- Use `{spec.reference_runtime_uid}` as the spatial reference. Source object: + `{spec.reference_source_uid}`. +- Goal relation: `{spec.relation}` ({_relative_relation_phrase(spec.relation)}). +- Active arm: `{active_arm}`. +- Keep every `{inactive_slot}` as null. + +Coordinate convention for relative placement: +- `left_of` means negative world y relative to the reference object. +- `right_of` means positive world y relative to the reference object. +- `front_of` means negative world x relative to the reference object. +- `behind` means positive world x relative to the reference object. +- `inside` and `on` use the reference object's xy center. 
+ +Generate one deterministic nominal graph with exactly 6 nominal edges. Use only +the atomic action class JSON specs shown below. Do not add recovery, monitor, search, +alignment, or extra lift edges. The inactive arm must remain null in every edge. + +1. Pick up the moved object: + - {active_slot}: {pick_spec} + - {inactive_slot}: null + +2. Move the held object to the high staging pose relative to the reference: + - {active_slot}: {high_spec} + - {inactive_slot}: null + +3. Lower the held object to the release pose: + - {active_slot}: {release_spec} + - {inactive_slot}: null + +4. Release the moved object: + - {active_slot}: {open_spec} + - {inactive_slot}: null + +5. Move the empty gripper upward to clear the object: + - {active_slot}: {retreat_spec} + - {inactive_slot}: null + +6. Return the active arm to its initial pose: + - {active_slot}: {initial_spec} + - {inactive_slot}: null + +Final state: `{spec.moved_runtime_uid}` must be +{_relative_relation_phrase(spec.relation)} `{spec.reference_runtime_uid}`. Always +plan to the current object poses from the exported {project_name} environment +config. Do not hard-code absolute object coordinates in the generated graph. 
+""" + + +def _make_dual_relative_task_prompt( + task_name: str, + project_name: str, + spec: _RelativeSpecLike, +) -> str: + first, second = spec.placements + first_arm = f"{first.active_side}_arm" + second_arm = f"{second.active_side}_arm" + first_slot = f"{first.active_side}_arm_action" + second_slot = f"{second.active_side}_arm_action" + action_sketch = _format_action_sketch(spec.action_sketch) + first_pick_spec = _format_pick_up_spec(first_arm, first.moved_runtime_uid) + second_pick_spec = _format_pick_up_spec(second_arm, second.moved_runtime_uid) + first_high_spec = _format_pose_object_spec( + first_arm, + first.reference_runtime_uid, + first.high_offset, + sample_interval=45, + ) + first_release_spec = _format_pose_object_spec( + first_arm, + first.reference_runtime_uid, + first.release_offset, + sample_interval=30, + ) + second_high_spec = _format_pose_object_spec( + second_arm, + second.reference_runtime_uid, + second.high_offset, + sample_interval=45, + ) + second_release_spec = _format_pose_object_spec( + second_arm, + second.reference_runtime_uid, + second.release_offset, + sample_interval=30, + ) + first_open_spec = _format_gripper_spec( + first_arm, + "open", + sample_interval=15, + post_hold_steps=25, + ) + second_open_spec = _format_gripper_spec( + second_arm, + "open", + sample_interval=15, + post_hold_steps=25, + ) + first_close_spec = _format_gripper_spec( + first_arm, + "close", + sample_interval=10, + ) + second_close_spec = _format_gripper_spec( + second_arm, + "close", + sample_interval=10, + ) + first_retreat_spec = _format_pose_offset_spec( + first_arm, + (0.0, 0.0, 0.14), + sample_interval=20, + ) + second_retreat_spec = _format_pose_offset_spec( + second_arm, + (0.0, 0.0, 0.14), + sample_interval=20, + ) + first_initial_spec = _format_initial_qpos_spec( + first_arm, + sample_interval=30, + ) + second_initial_spec = _format_initial_qpos_spec( + second_arm, + sample_interval=30, + ) + return f"""Task: +{task_name}: 
{spec.task_prompt_summary} + +This config was generated from a simple task description by the config-stage +LLM. The execution-stage LLM must now generate the graph JSON from this prompt. + +Original simple task description: +{spec.task_description} + +Config-stage LLM interpretation: +{action_sketch} + +Object and arm mapping: +- {first_slot} must manipulate `{first.moved_runtime_uid}`. Source object: + `{first.moved_source_uid}`. +- {second_slot} must manipulate `{second.moved_runtime_uid}`. Source object: + `{second.moved_source_uid}`. +- `{first.reference_runtime_uid}` is the spatial reference for + `{first.moved_runtime_uid}`. Goal relation: `{first.relation}` + ({_relative_relation_phrase(first.relation)}). +- `{second.reference_runtime_uid}` is the spatial reference for + `{second.moved_runtime_uid}`. Goal relation: `{second.relation}` + ({_relative_relation_phrase(second.relation)}). + +Coordinate convention for relative placement: +- `left_of` means negative world y relative to the reference object. +- `right_of` means positive world y relative to the reference object. +- `front_of` means negative world x relative to the reference object. +- `behind` means positive world x relative to the reference object. +- `inside` and `on` use the reference object's xy center. + +Generate one deterministic nominal graph with exactly 10 nominal edges. Use only +the atomic action class JSON specs shown below. Do not add recovery, monitor, search, +alignment, or extra lift edges. + +1. Pick up both moved objects simultaneously: + - {first_slot}: {first_pick_spec} + - {second_slot}: {second_pick_spec} + +2. Move `{first.moved_runtime_uid}` to the high staging pose while the other arm + keeps holding `{second.moved_runtime_uid}`: + - {first_slot}: {first_high_spec} + - {second_slot}: {second_close_spec} + +3. Lower `{first.moved_runtime_uid}` to the release pose: + - {first_slot}: {first_release_spec} + - {second_slot}: {second_close_spec} + +4. 
Release `{first.moved_runtime_uid}`: + - {first_slot}: {first_open_spec} + - {second_slot}: {second_close_spec} + +5. Move the empty `{first_arm}` gripper upward to clear the workspace: + - {first_slot}: {first_retreat_spec} + - {second_slot}: {second_close_spec} + +6. Return `{first_arm}` to its initial pose while moving `{second.moved_runtime_uid}` + to the high staging pose: + - {first_slot}: {first_initial_spec} + - {second_slot}: {second_high_spec} + +7. Lower `{second.moved_runtime_uid}` to the release pose: + - {first_slot}: null + - {second_slot}: {second_release_spec} + +8. Release `{second.moved_runtime_uid}`: + - {first_slot}: null + - {second_slot}: {second_open_spec} + +9. Move the empty `{second_arm}` gripper upward to clear the workspace: + - {first_slot}: null + - {second_slot}: {second_retreat_spec} + +10. Return `{second_arm}` to its initial pose: + - {first_slot}: null + - {second_slot}: {second_initial_spec} + +Final state: `{first.moved_runtime_uid}` must be +{_relative_relation_phrase(first.relation)} `{first.reference_runtime_uid}`, and +`{second.moved_runtime_uid}` must be {_relative_relation_phrase(second.relation)} +`{second.reference_runtime_uid}`. Always plan to the current object poses from the +exported {project_name} environment config. Do not hard-code absolute object +coordinates in the generated graph. +""" + + +def make_relative_basic_background( + project_name: str, + spec: _RelativeSpecLike, +) -> str: + if len(spec.placements) > 1: + return _make_dual_relative_basic_background(project_name, spec) + + active_arm = f"{spec.active_side}_arm" + inactive_arm = "right_arm" if spec.active_side == "left" else "left_arm" + notes = spec.basic_background_notes or ( + "No extra scene notes were provided by the config-stage LLM." + ) + return f"""The scene comes from the exported {project_name} mesh environment. 
def _make_dual_relative_basic_background(
    project_name: str,
    spec: _RelativeSpecLike,
) -> str:
    """Render the basic-background prompt for a dual-arm relative placement.

    Args:
        project_name: Name interpolated into the opening sentence as the
            exported mesh environment.
        spec: Task spec. Reads ``basic_background_notes`` and, for every entry
            of ``placements``, the attributes ``active_side``,
            ``moved_runtime_uid``, ``relation`` and ``reference_runtime_uid``.

    Returns:
        The prompt text: robot description, one bullet per placement, the
        config-stage notes, and the expected ordering of the placements.

    Raises:
        ValueError: Propagated from ``_relative_relation_phrase`` when a
            placement uses a relation it does not support.
    """
    fallback_notes = "No extra scene notes were provided by the config-stage LLM."
    scene_notes = spec.basic_background_notes or fallback_notes

    # One markdown bullet per placement, in the order given by the spec.
    arm_bullets = [
        f"- {step.active_side}_arm moves `{step.moved_runtime_uid}` "
        f"{_relative_relation_phrase(step.relation)} "
        f"`{step.reference_runtime_uid}`."
        for step in spec.placements
    ]
    arm_assignments = "\n".join(arm_bullets)

    return f"""The scene comes from the exported {project_name} mesh environment.

This configuration directory is for a Dual-UR5 dual-arm relative-placement task
generated from a simple natural-language task description.

The robot is a dual-UR5 composite robot with DH_PGI_140_80 parallel grippers:
- left_arm is the UR5 outside the left side of the table's near long edge.
- right_arm is the UR5 outside the right side of the table's near long edge.

Both arms participate in the nominal graph:
{arm_assignments}

Config-stage LLM notes:
{scene_notes}

The execution-stage LLM should generate graph JSON that grasps both moved
objects, places the first moved object, retreats the first arm, then places the
second moved object while the first arm returns to its initial pose. Each arm
must release its moved object before returning to its initial pose.
"""
+ +Use exactly these action patterns: +- Pick up `{spec.moved_runtime_uid}`: + {_format_pick_up_spec(active_arm, spec.moved_runtime_uid)} +- High staging relative to `{spec.reference_runtime_uid}`: + {high_spec} +- Release pose relative to `{spec.reference_runtime_uid}`: + {release_spec} +- Release the held object: + {_format_gripper_spec(active_arm, "open", sample_interval=15, post_hold_steps=25)} +- Retreat upward: + {_format_pose_offset_spec(active_arm, (0.0, 0.0, 0.14), sample_interval=20)} +- Return to initial qpos: + {_format_initial_qpos_spec(active_arm, sample_interval=30)} +""" + + +def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: + first, second = spec.placements + first_arm = f"{first.active_side}_arm" + second_arm = f"{second.active_side}_arm" + first_high_spec = _format_pose_object_spec( + first_arm, + first.reference_runtime_uid, + first.high_offset, + sample_interval=45, + ) + first_release_spec = _format_pose_object_spec( + first_arm, + first.reference_runtime_uid, + first.release_offset, + sample_interval=30, + ) + second_high_spec = _format_pose_object_spec( + second_arm, + second.reference_runtime_uid, + second.high_offset, + sample_interval=45, + ) + second_release_spec = _format_pose_object_spec( + second_arm, + second.reference_runtime_uid, + second.release_offset, + sample_interval=30, + ) + return f"""### Atomic Action Class JSON Specs for Dual-UR5 Dual-Arm Relative Placement + +Use only atomic action class JSON specs backed by `PickUpAction`, `MoveAction`, and +`PlaceAction`. +- `{first_arm}` manipulates `{first.moved_runtime_uid}`. +- `{second_arm}` manipulates `{second.moved_runtime_uid}`. 
+ +Use these action patterns: +- First arm pick-up: + {_format_pick_up_spec(first_arm, first.moved_runtime_uid)} +- Second arm pick-up: + {_format_pick_up_spec(second_arm, second.moved_runtime_uid)} +- First high staging: + {first_high_spec} +- First release pose: + {first_release_spec} +- Second high staging: + {second_high_spec} +- Second release pose: + {second_release_spec} +- Release an object: + {_format_gripper_spec("", "open", sample_interval=15, post_hold_steps=25)} +- Keep a holding arm closed: + {_format_gripper_spec("", "close", sample_interval=10)} +- Retreat upward: + {_format_pose_offset_spec("", (0.0, 0.0, 0.14), sample_interval=20)} +- Return to initial qpos: + {_format_initial_qpos_spec("", sample_interval=30)} +""" + + +def make_basket_task_prompt( + task_name: str, + project_name: str, + roles: _BasketRolesLike, +) -> str: + left_target_text = _left_target_text(roles) + right_target_text = _right_target_text(roles) + target_pair_text = _target_pair_text(roles) + target_plural = _target_plural_text(roles) + left_pick_spec = _format_pick_up_spec( + "left_arm", + roles.left_target_runtime_uid, + ) + right_pick_spec = _format_pick_up_spec( + "right_arm", + roles.right_target_runtime_uid, + ) + left_high_spec = _format_pose_object_spec( + "left_arm", + roles.container_runtime_uid, + (0.0, -0.04, 0.22), + sample_interval=45, + ) + left_release_spec = _format_pose_object_spec( + "left_arm", + roles.container_runtime_uid, + (0.0, -0.04, 0.12), + sample_interval=30, + ) + right_high_spec = _format_pose_object_spec( + "right_arm", + roles.container_runtime_uid, + (0.0, 0.04, 0.22), + sample_interval=45, + ) + right_release_spec = _format_pose_object_spec( + "right_arm", + roles.container_runtime_uid, + (0.0, 0.04, 0.12), + sample_interval=30, + ) + left_open_spec = _format_gripper_spec( + "left_arm", + "open", + sample_interval=15, + post_hold_steps=25, + ) + right_open_spec = _format_gripper_spec( + "right_arm", + "open", + sample_interval=15, + 
post_hold_steps=25, + ) + right_close_spec = _format_gripper_spec( + "right_arm", + "close", + sample_interval=10, + ) + left_retreat_spec = _format_pose_offset_spec( + "left_arm", + (0.0, 0.0, 0.14), + sample_interval=20, + ) + right_retreat_spec = _format_pose_offset_spec( + "right_arm", + (0.0, 0.0, 0.14), + sample_interval=20, + ) + left_initial_spec = _format_initial_qpos_spec( + "left_arm", + sample_interval=30, + ) + right_initial_spec = _format_initial_qpos_spec( + "right_arm", + sample_interval=30, + ) + return f"""Task: +{task_name}: use the current two-UR5 configuration to place +{target_pair_text} into the {roles.container_runtime_uid}. + +The task starts with both arms acting simultaneously: +the left UR5 grasps the left {left_target_text} while the right UR5 grasps the +right {right_target_text} in the same nominal graph edge. After both +{target_plural} are grasped, the left UR5 places its {left_target_text} into the +{roles.container_runtime_uid} and retreats upward. While the left UR5 returns +to its initial pose, the right UR5 must simultaneously begin placing its +already-grasped {right_target_text} by moving it to the high staging pose above +the {roles.container_runtime_uid}. The right UR5 then completes its placement +and returns to its initial pose. + +Object and arm mapping: +- left_arm must only manipulate `{roles.left_target_runtime_uid}`. +- right_arm must only manipulate `{roles.right_target_runtime_uid}`. +- Both target objects must be released into `{roles.container_runtime_uid}`. + +Generate one deterministic nominal graph with the following semantic sequence. +Do not add extra alignment, search, recovery, or monitor steps. Do include the +specified post-release retreat and return-to-initial steps. The left arm must +finish its upward retreat before the right arm enters the shared container +workspace, but the left return-to-initial action and the right high-staging +action must execute simultaneously in one graph edge. 
Generate exactly 10 +nominal edges, one edge for each numbered step below. Do not split the +simultaneous grasp or the simultaneous left-return/right-staging action into +separate edges. Do not merge, reorder, or omit the lower-to-release, +open-gripper, upward-retreat, or final right return-to-initial edges. + +A target object is not considered placed when it is only above the +{roles.container_runtime_uid}. For each arm, the placement order must be: move +to a high staging pose above the container, lower to the release pose inside the +container, use `target_qpos` with source `gripper_state` and state `open`, +move the empty gripper upward, then return the arm to its initial pose. Never +use `target_qpos` source `initial` for an arm that has not already released its +held target object. + +1. Pick up both target objects simultaneously: + - left_arm_action: {left_pick_spec} + - right_arm_action: {right_pick_spec} + +2. Move the held left target object directly above the left half of the + {roles.container_runtime_uid} while the right arm keeps holding its target: + - left_arm_action: {left_high_spec} + - right_arm_action: {right_close_spec} + +3. Lower the held left target object to the left release pose inside the + {roles.container_runtime_uid}: + - left_arm_action: {left_release_spec} + - right_arm_action: {right_close_spec} + +4. Release the left target object into the {roles.container_runtime_uid}: + - left_arm_action: {left_open_spec} + - right_arm_action: {right_close_spec} + +5. Move the empty left gripper upward to clear the container: + - left_arm_action: {left_retreat_spec} + - right_arm_action: {right_close_spec} + +6. After the left gripper has retreated upward, return the left UR5 to its + initial pose while simultaneously moving the held right target object + directly above the right half of the {roles.container_runtime_uid}. 
This + parallel handoff must remain one graph edge: + - left_arm_action: {left_initial_spec} + - right_arm_action: {right_high_spec} + +7. Lower the held right target object to the right release pose inside the + {roles.container_runtime_uid}: + - left_arm_action: null + - right_arm_action: {right_release_spec} + +8. Release the right target object into the {roles.container_runtime_uid}: + - left_arm_action: null + - right_arm_action: {right_open_spec} + +9. Move the empty right gripper upward to clear the container: + - left_arm_action: null + - right_arm_action: {right_retreat_spec} + +10. Return the right UR5 to its initial pose after releasing the target object: + - left_arm_action: null + - right_arm_action: {right_initial_spec} + +The final state is both `{roles.left_target_runtime_uid}` and +`{roles.right_target_runtime_uid}` resting inside `{roles.container_runtime_uid}`, +with both arms moved away from the container workspace. Always plan to the +current `{roles.container_runtime_uid}` object pose from the exported +{project_name} environment config. +""" + + +def make_basket_basic_background( + project_name: str, + roles: _BasketRolesLike, +) -> str: + left_target_text = _left_target_text(roles) + right_target_text = _right_target_text(roles) + target_plural = _target_plural_text(roles) + return f"""The scene comes from the exported {project_name} mesh environment. + +This configuration directory is for the UR5BreadBasket task template. The +current robot is a dual-UR5 composite robot with DH_PGI_140_80 parallel +grippers. + +The robot is a dual-UR5 composite robot with two parallel grippers: +- left_arm is the UR5 outside the left side of the table's near long edge. +- right_arm is the UR5 outside the right side of the table's near long edge. + +Both UR5 bases are on the same long side of the table and face inward toward +the central {roles.container_runtime_uid}. 
The bases are intentionally kept +outside the table edge to avoid initial robot-table contact. + +The interactive objects are: +- {roles.left_target_runtime_uid}: the {left_target_text} mesh initially on the + negative-y side (source object {roles.left_target_source_uid}). +- {roles.right_target_runtime_uid}: the {right_target_text} mesh initially on the + positive-y side (source object {roles.right_target_source_uid}). +- {roles.container_runtime_uid}: the target container near the center of the + table (source object {roles.container_source_uid}). + +The nominal task starts with simultaneous dual-arm grasping. The left UR5 must +grasp {roles.left_target_runtime_uid} while the right UR5 grasps +{roles.right_target_runtime_uid} in the same graph edge. After both +{target_plural} are held, the left UR5 places +{roles.left_target_runtime_uid} into {roles.container_runtime_uid}, releases +it, and retreats upward. The next graph edge is a parallel handoff: the left +UR5 returns to its initial pose while the right UR5 simultaneously moves its +already-grasped {roles.right_target_runtime_uid} to the high staging pose above +{roles.container_runtime_uid}. The right UR5 then lowers and releases +{roles.right_target_runtime_uid}, retreats upward, and returns to its initial +pose. To change the insertion order later, edit the task prompt sequence and +keep the same atomic action API. + +The {roles.container_runtime_uid} area is a shared workspace. After a UR5 +releases a target object, it should retreat upward before the other UR5 moves +to the container, otherwise the two arms may collide near the container. The +right UR5 should keep holding {roles.right_target_runtime_uid} while the left +UR5 performs its placement and upward retreat. Once that retreat is complete, +the right UR5 may move toward the container while the left UR5 simultaneously +returns to its initial pose; it must not wait for the left return-to-initial +motion to finish. 
+ +A target object at a high pose above `{roles.container_runtime_uid}` is only +staged, not placed. Each arm must lower the held object into the container +release pose and open the gripper before any return-to-initial motion. + +Always plan to the current `{roles.container_runtime_uid}` object pose from the +environment config. Do not hard-code container coordinates in generated graph +actions. +""" + + +def make_basket_atom_actions_prompt(roles: _BasketRolesLike) -> str: + left_high_spec = _format_pose_object_spec( + "left_arm", + roles.container_runtime_uid, + (0.0, -0.04, 0.22), + sample_interval=45, + ) + left_release_spec = _format_pose_object_spec( + "left_arm", + roles.container_runtime_uid, + (0.0, -0.04, 0.12), + sample_interval=30, + ) + right_high_spec = _format_pose_object_spec( + "right_arm", + roles.container_runtime_uid, + (0.0, 0.04, 0.22), + sample_interval=45, + ) + right_release_spec = _format_pose_object_spec( + "right_arm", + roles.container_runtime_uid, + (0.0, 0.04, 0.12), + sample_interval=30, + ) + return f"""### Atomic Action Class JSON Specs for UR5BreadBasket Dual-UR5 Placement + +Use only atomic action class JSON specs backed by `PickUpAction`, `MoveAction`, and +`PlaceAction`. Use `robot_name="left_arm"` only for +`{roles.left_target_runtime_uid}` and `robot_name="right_arm"` only for +`{roles.right_target_runtime_uid}`. + +The nominal task starts with simultaneous dual-arm pick-up, followed by a +left-first placement with an overlapped handoff to the right arm: +- The first nominal edge must use `atomic_action_class:"PickUpAction"` for both arms. +- While the left arm places its target, keep the right hand closed with a + `target_qpos` whose source is `gripper_state` and state is `close`. +- After the left arm releases `{roles.left_target_runtime_uid}`, first move it + upward to clear the container. 
def _format_pick_up_spec(
    robot_name: str,
    obj_name: str,
    *,
    sample_interval: int = 45,
) -> str:
    """Return the compact JSON spec for an antipodal ``PickUpAction``.

    Args:
        robot_name: Value written to the ``robot_name`` field.
        obj_name: Object uid written to ``target_object.obj_name``.
        sample_interval: Value written to ``cfg.sample_interval``.

    Returns:
        A single-line JSON string (see ``_compact_json``).
    """
    return _compact_json(
        {
            "atomic_action_class": "PickUpAction",
            "robot_name": robot_name,
            "control": "arm",
            "target_object": {
                "obj_name": obj_name,
                "affordance": "antipodal",
            },
            "cfg": {
                "pre_grasp_distance": 0.08,
                "sample_interval": sample_interval,
            },
        }
    )


def _format_move_action_spec(
    robot_name: str,
    control: str,
    target_key: str,
    target: Mapping[str, Any],
    cfg: Mapping[str, Any],
) -> str:
    """Serialize the shared ``MoveAction`` envelope around one target entry.

    The key order (class, robot, control, target, cfg) is part of the emitted
    text, so every ``MoveAction`` formatter goes through this helper.
    """
    return _compact_json(
        {
            "atomic_action_class": "MoveAction",
            "robot_name": robot_name,
            "control": control,
            target_key: target,
            "cfg": cfg,
        }
    )


def _format_pose_object_spec(
    robot_name: str,
    obj_name: str,
    offset: tuple[float, float, float] | list[float],
    *,
    sample_interval: int,
) -> str:
    """Return the JSON spec for an arm move to an object-referenced pose.

    Args:
        robot_name: Value written to the ``robot_name`` field.
        obj_name: Uid of the reference object.
        offset: Exactly three numbers; each is converted with ``float``.
        sample_interval: Value written to ``cfg.sample_interval``.

    Raises:
        ValueError: If ``offset`` does not unpack into exactly three items.
    """
    x, y, z = offset
    return _format_move_action_spec(
        robot_name,
        "arm",
        "target_pose",
        {
            "reference": "object",
            "obj_name": obj_name,
            "offset": [float(x), float(y), float(z)],
            "orientation": "current",
        },
        {"sample_interval": sample_interval},
    )


def _format_pose_offset_spec(
    robot_name: str,
    offset: tuple[float, float, float] | list[float],
    *,
    sample_interval: int = 20,
) -> str:
    """Return the JSON spec for an arm move by a relative world-frame offset.

    Args:
        robot_name: Value written to the ``robot_name`` field.
        offset: Exactly three numbers; each is converted with ``float``.
        sample_interval: Value written to ``cfg.sample_interval``.

    Raises:
        ValueError: If ``offset`` does not unpack into exactly three items.
    """
    dx, dy, dz = offset
    return _format_move_action_spec(
        robot_name,
        "arm",
        "target_pose",
        {
            "reference": "relative",
            "offset": [float(dx), float(dy), float(dz)],
            "frame": "world",
        },
        {"sample_interval": sample_interval},
    )


def _format_gripper_spec(
    robot_name: str,
    state: str,
    *,
    sample_interval: int,
    post_hold_steps: int = 0,
) -> str:
    """Return the JSON spec for a hand move to a named gripper state.

    Args:
        robot_name: Value written to the ``robot_name`` field.
        state: Value written to ``target_qpos.state`` (callers in this file
            pass ``"open"`` or ``"close"``).
        sample_interval: Value written to ``cfg.sample_interval``.
        post_hold_steps: Written to ``cfg.post_hold_steps`` only when truthy,
            so the default ``0`` leaves the key out of the spec.
    """
    cfg: dict[str, int] = {"sample_interval": sample_interval}
    if post_hold_steps:
        cfg["post_hold_steps"] = post_hold_steps
    return _format_move_action_spec(
        robot_name,
        "hand",
        "target_qpos",
        {"source": "gripper_state", "state": state},
        cfg,
    )


def _format_initial_qpos_spec(
    robot_name: str,
    *,
    sample_interval: int,
) -> str:
    """Return the JSON spec for an arm move back to its initial qpos."""
    return _format_move_action_spec(
        robot_name,
        "arm",
        "target_qpos",
        {"source": "initial"},
        {"sample_interval": sample_interval},
    )


def _compact_json(value: Mapping[str, Any]) -> str:
    """Dump ``value`` as one-line JSON without spaces, keeping non-ASCII text."""
    return json.dumps(value, ensure_ascii=False, separators=(",", ":"))


def _format_action_sketch(action_sketch: list[str]) -> str:
    """Render each sketch item as a ``- `` bullet; an empty list gives ``""``."""
    return "\n".join(f"- {item}" for item in action_sketch)


# Canonical relation name -> English phrase used inside the prompts.
_RELATION_PHRASE_BY_NAME = {
    "inside": "inside",
    "on": "on top of",
    "left_of": "to the left of",
    "right_of": "to the right of",
    "front_of": "in front of",
    "behind": "behind",
}


def _relative_relation_phrase(relation: str) -> str:
    """Translate a canonical relation name into its prompt phrase.

    Raises:
        ValueError: If ``relation`` is not one of the supported names.
    """
    try:
        return _RELATION_PHRASE_BY_NAME[relation]
    except (KeyError, TypeError):
        # TypeError covers unhashable input, which the previous equality
        # chain also reported as an unsupported relation.
        raise ValueError(
            f"Unsupported relative placement relation: {relation!r}."
        ) from None


def _left_target_text(roles: _BasketRolesLike) -> str:
    """Return the display form of ``roles.left_target_noun``."""
    return _display_noun(roles.left_target_noun)


def _right_target_text(roles: _BasketRolesLike) -> str:
    """Return the display form of ``roles.right_target_noun``."""
    return _display_noun(roles.right_target_noun)


def _target_pair_text(roles: _BasketRolesLike) -> str:
    """Describe both targets: ``two X objects`` or ``the left X and right Y``."""
    left_text = _left_target_text(roles)
    right_text = _right_target_text(roles)
    if left_text == right_text:
        return f"two {left_text} objects"
    return f"the left {left_text} and right {right_text}"


def _target_plural_text(roles: _BasketRolesLike) -> str:
    """Return the plural noun for both targets, or ``target objects`` if they differ."""
    left_text = _left_target_text(roles)
    right_text = _right_target_text(roles)
    if left_text == right_text:
        return _plural(left_text)
    return "target objects"


def _display_noun(uid: str) -> str:
    """Turn an underscore-separated uid into space-separated words."""
    return uid.replace("_", " ")


def _plural(noun: str) -> str:
    """Return a best-effort English plural of ``noun`` for prompt text.

    Rules, in order: ``-ss`` gets ``es`` (glass -> glasses); any other ``-s``
    ending is treated as already plural; ``-ch``/``-sh``/``-x`` get ``es``;
    consonant + ``y`` becomes ``ies`` (cherry -> cherries); everything else
    gets ``s``.
    """
    if noun.endswith("ss"):
        return f"{noun}es"
    if noun.endswith("s"):
        return noun
    if noun.endswith(("ch", "sh", "x")):
        return f"{noun}es"
    if (
        len(noun) > 1
        and noun[-1] == "y"
        and noun[-2].isalpha()
        and noun[-2].lower() not in "aeiou"
    ):
        return f"{noun[:-1]}ies"
    return f"{noun}s"
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from dataclasses import dataclass +from pathlib import Path +from typing import Any +import copy +import json +import math +import re +import struct + +from embodichain.gen_sim.action_agent_pipeline.generation.prompt_builders import ( + make_agent_config, + make_basket_atom_actions_prompt, + make_basket_basic_background, + make_basket_task_prompt, + make_relative_atom_actions_prompt, + make_relative_basic_background, + make_relative_task_prompt, +) + +__all__ = [ + "GeneratedUR5BasketConfigPaths", + "TargetReplacementSpec", + "generate_ur5_basket_config_from_project", +] + +_DIGIT_SUFFIX_RE = re.compile(r"_[0-9]+$") +_INVALID_UID_CHARS_RE = re.compile(r"[^0-9a-zA-Z_]+") +_PROJECT_NAME_RE = re.compile(r"^[0-9]+_gym_project$") +_GYM_CONFIG_FILENAMES = frozenset({"gym_config.json", "gym_config_merged.json"}) +_GYM_CONFIG_PREFERENCE = ("gym_config_merged.json", "gym_config.json") +_TARGET_REPLACEMENT_MANIFEST_FILENAME = ".embodichain_replacement_manifest.json" + +_CONTAINER_KEYWORDS = ( + "basket", + "container", + "bowl", + "box", + "bin", + "tray", + "crate", +) + +_RELATIVE_RELATIONS = { + "inside", + "on", + "left_of", + "right_of", + "front_of", + "behind", +} + +_RELATION_ALIASES = { + "in": "inside", + "into": "inside", + "inside": "inside", + "放入": "inside", + "放进": "inside", + "里面": "inside", + "on": "on", + "onto": "on", + "on_top": "on", + "on_top_of": "on", + "above": "on", + "top": "on", + "上": "on", + "上方": "on", + "上面": "on", + "叠放": "on", + "left": "left_of", + "left_of": "left_of", + "to_the_left_of": "left_of", + "左": "left_of", + "左边": "left_of", + "右": "right_of", + "右边": "right_of", + "right": "right_of", + "right_of": "right_of", + "to_the_right_of": "right_of", + "front": "front_of", + "front_of": "front_of", + "in_front_of": "front_of", + "前": "front_of", + "前方": "front_of", + 
"前面": "front_of", + "back": "behind", + "behind": "behind", + "back_of": "behind", + "后": "behind", + "后方": "behind", + "后面": "behind", +} + +_SIDE_RELATION_DISTANCE = 0.16 +_SIDE_RELEASE_Z_OFFSET = 0.12 +_STAGING_Z_DELTA = 0.10 +_ON_RELEASE_Z_OFFSET = 0.2 +_DUAL_UR5_LEGACY_INIT_Z = 0.5 +_DUAL_UR5_HIGH_TABLETOP_THRESHOLD = 1.0 +_DUAL_UR5_HIGH_TABLETOP_INIT_Z = 0.8 +_DUAL_UR5_ARM_COMPONENT_Z = 0.4 +_DUAL_UR5_TABLETOP_CLEARANCE = 0.25 +_DUAL_UR5_SIDE_AXIS_INDEX = 1 +_BACKGROUND_MAX_CONVEX_HULL_NUM = 1 +_TARGET_MAX_CONVEX_HULL_NUM = 16 +_CONTAINER_MAX_CONVEX_HULL_NUM = 8 +_EXTRA_RIGID_MAX_CONVEX_HULL_NUM = 1 +_GLB_JSON_CHUNK_TYPE = 0x4E4F534A +_GLB_BINARY_CHUNK_TYPE = 0x004E4942 +_GLTF_COMPONENT_FORMATS = { + 5120: ("b", 1), + 5121: ("B", 1), + 5122: ("h", 2), + 5123: ("H", 2), + 5125: ("I", 4), + 5126: ("f", 4), +} +_GLTF_TYPE_COMPONENT_COUNTS = { + "SCALAR": 1, + "VEC2": 2, + "VEC3": 3, + "VEC4": 4, + "MAT4": 16, +} + +_BACKGROUND_ATTRS = { + "mass": 10.0, + "static_friction": 0.95, + "dynamic_friction": 0.9, + "restitution": 0.01, +} + +_RIGID_OBJECT_ATTRS = { + "mass": 0.01, + "contact_offset": 0.003, + "rest_offset": 0.001, + "restitution": 0.01, + "max_depenetration_velocity": 10.0, + "min_position_iters": 32, + "min_velocity_iters": 8, +} + + +@dataclass(frozen=True) +class GeneratedUR5BasketConfigPaths: + """Paths written by the UR5 basket config generator.""" + + output_dir: Path + gym_config: Path + agent_config: Path + task_prompt: Path + basic_background: Path + atom_actions: Path + summary: dict[str, Any] + + +@dataclass(frozen=True) +class TargetReplacementSpec: + """Prompt-to-geometry replacement for one source target object.""" + + source_uid: str + prompt: str + output_dir_name: str + + +@dataclass(frozen=True) +class _SceneObject: + source_uid: str + source_role: str + config: dict[str, Any] + + +@dataclass(frozen=True) +class _BasketTaskRoles: + table_source_uid: str + container_source_uid: str + left_target_source_uid: str + 
right_target_source_uid: str + container_runtime_uid: str + left_target_runtime_uid: str + right_target_runtime_uid: str + target_noun: str + left_target_noun: str + right_target_noun: str + container_noun: str + + +@dataclass(frozen=True) +class _ResolvedTargetReplacement: + source_uid: str + prompt: str + output_dir_name: str + mesh_path: Path + runtime_noun: str + reused: bool = False + + +@dataclass(frozen=True) +class _RelativePlacementStepSpec: + moved_source_uid: str + reference_source_uid: str + moved_runtime_uid: str + reference_runtime_uid: str + relation: str + active_side: str + release_offset: list[float] + high_offset: list[float] + + +@dataclass(frozen=True) +class _RelativePlacementSpec: + table_source_uid: str + moved_source_uid: str + reference_source_uid: str + moved_runtime_uid: str + reference_runtime_uid: str + relation: str + active_side: str + task_description: str + task_prompt_summary: str + basic_background_notes: str + action_sketch: list[str] + release_offset: list[float] + high_offset: list[float] + placements: tuple[_RelativePlacementStepSpec, ...] + + +def generate_ur5_basket_config_from_project( + gym_project: str | Path, + output_dir: str | Path, + *, + task_name: str = "UR5BreadBasket", + task_description: str | None = None, + use_llm_roles: bool = False, + llm_model: str | None = None, + target_body_scale: float | list[float] | tuple[float, float, float] = 0.7, + target_replacements: Sequence[TargetReplacementSpec] | None = None, + sync_replacement_names: bool = False, + reuse_target_replacements: bool = True, + prewarm_coacd_cache: bool = True, + overwrite: bool = False, + max_episodes: int = 1, + max_episode_steps: int = 1000, +) -> GeneratedUR5BasketConfigPaths: + """Generate Dual-UR5 basket placement configs from an exported gym project. 
+ + This first-stage generator intentionally keeps the UR5BreadBasket task + structure fixed: the left arm grasps the left target object, the right arm + grasps the right target object, and both objects are placed into one + basket-like container. + + Args: + gym_project: Project root, formatted scene folder, ``gym_config.json``, + or ``gym_config_merged.json``. + output_dir: Destination config directory. + task_name: Name passed to ``run_agent``. + task_description: Optional natural-language relative-placement task. + When provided, the generator asks the shared LLM for a constrained + config-level task spec and generates prompts from that spec. + use_llm_roles: If true, use an LLM only to refine object role mapping. + llm_model: Optional model override for role refinement. + target_body_scale: Uniform or xyz scale applied to generated target + objects. Basket-like containers keep their source ``body_scale``. + target_replacements: Optional prompt-generated GLB replacements for + selected default basket target objects. Each replacement writes to + ``/mesh_assets/`` and only affects the + generated config, not the original source mesh file. + sync_replacement_names: If true, update runtime target UIDs and prompts + from the replacement prompts. If false, only mesh paths are replaced. + reuse_target_replacements: If true, reuse an existing replacement GLB + at the expected output path when it matches the requested prompt. + prewarm_coacd_cache: If true, precompute environment-side CoACD cache + files referenced by the generated gym config before writing it. + overwrite: If false, fail when generated files already exist. + max_episodes: Value written to ``fast_gym_config.json``. + max_episode_steps: Value written to ``fast_gym_config.json``. + + Returns: + Paths of generated config files. 
+ """ + + input_path = Path(gym_project).expanduser().resolve() + gym_config_path = _resolve_gym_config_path(input_path) + scene_dir = gym_config_path.parent + source_config = _read_json(gym_config_path) + project_name = _infer_project_name(input_path, scene_dir) + replacement_specs = _normalize_target_replacements(target_replacements) + + scene_objects = _collect_scene_objects(source_config) + if task_description: + if replacement_specs: + raise ValueError( + "target_replacements are only supported by the default basket " + "template. Do not combine them with task_description." + ) + spec = _build_relative_placement_spec_with_llm( + scene_objects=scene_objects, + project_name=project_name, + task_description=task_description, + model=llm_model, + ) + bundle = _build_relative_placement_bundle( + scene_dir=scene_dir, + source_config=source_config, + spec=spec, + project_name=project_name, + task_name=task_name, + target_body_scale=target_body_scale, + max_episodes=max_episodes, + max_episode_steps=max_episode_steps, + ) + _validate_relative_bundle(bundle, spec) + if prewarm_coacd_cache: + _attach_coacd_cache_summary(bundle) + return _write_config_bundle( + output_dir=Path(output_dir).expanduser().resolve(), + bundle=bundle, + overwrite=overwrite, + ) + + roles = _infer_basket_task_roles(scene_objects) + if use_llm_roles: + roles = _refine_roles_with_llm( + roles=roles, + scene_objects=scene_objects, + project_name=project_name, + model=llm_model, + ) + + _validate_target_replacement_sources(roles, replacement_specs) + resolved_replacements = _run_target_replacements( + scene_dir=scene_dir, + replacement_specs=replacement_specs, + reuse_target_replacements=reuse_target_replacements, + ) + if sync_replacement_names: + roles = _apply_replacement_names( + roles, + resolved_replacements, + ) + + bundle = _build_ur5_basket_bundle( + scene_dir=scene_dir, + source_config=source_config, + roles=roles, + project_name=project_name, + task_name=task_name, + 
target_body_scale=target_body_scale, + target_replacements=resolved_replacements, + max_episodes=max_episodes, + max_episode_steps=max_episode_steps, + ) + _validate_bundle(bundle, roles) + if prewarm_coacd_cache: + _attach_coacd_cache_summary(bundle) + return _write_config_bundle( + output_dir=Path(output_dir).expanduser().resolve(), + bundle=bundle, + overwrite=overwrite, + ) + + +def _resolve_gym_config_path(input_path: Path) -> Path: + if input_path.is_file(): + if input_path.name not in _GYM_CONFIG_FILENAMES: + expected = ", ".join(sorted(_GYM_CONFIG_FILENAMES)) + raise ValueError(f"Expected one of {expected}, got: {input_path}") + return input_path + + direct = _preferred_gym_config_in_dir(input_path) + if direct is not None: + return direct + + formatted_scene_dirs = sorted( + { + path.parent + for filename in _GYM_CONFIG_FILENAMES + for path in input_path.glob(f"formatted_tabletop_scene/*/{filename}") + } + ) + formatted_matches = [ + path + for scene_dir in formatted_scene_dirs + if (path := _preferred_gym_config_in_dir(scene_dir)) is not None + ] + if len(formatted_matches) == 1: + return formatted_matches[0] + if len(formatted_matches) > 1: + matches = ", ".join(path.as_posix() for path in formatted_matches) + raise ValueError(f"Multiple formatted gym config files found: {matches}") + + recursive_scene_dirs = sorted( + { + path.parent + for filename in _GYM_CONFIG_FILENAMES + for path in input_path.rglob(filename) + } + ) + recursive_matches = [ + path + for scene_dir in recursive_scene_dirs + if (path := _preferred_gym_config_in_dir(scene_dir)) is not None + ] + if len(recursive_matches) == 1: + return recursive_matches[0] + if not recursive_matches: + expected = " or ".join(_GYM_CONFIG_PREFERENCE) + raise FileNotFoundError(f"{expected} not found under: {input_path}") + matches = ", ".join(path.as_posix() for path in recursive_matches) + raise ValueError(f"Multiple gym config files found: {matches}") + + +def _preferred_gym_config_in_dir(scene_dir: 
Path) -> Path | None: + for filename in _GYM_CONFIG_PREFERENCE: + path = scene_dir / filename + if path.is_file(): + return path + return None + + +def _infer_project_name(input_path: Path, scene_dir: Path) -> str: + for part in input_path.parts: + if _PROJECT_NAME_RE.match(part): + return part + for part in scene_dir.parts: + if _PROJECT_NAME_RE.match(part): + return part + return scene_dir.name + + +def _collect_scene_objects(scene_config: Mapping[str, Any]) -> list[_SceneObject]: + scene_objects = [] + for source_role in ("background", "rigid_object"): + for obj_config in scene_config.get(source_role, []) or []: + source_uid = str(obj_config.get("uid", "")).strip() + if not source_uid: + raise ValueError(f"Scene object without uid in {source_role}.") + scene_objects.append( + _SceneObject( + source_uid=source_uid, + source_role=source_role, + config=copy.deepcopy(dict(obj_config)), + ) + ) + + if not scene_objects: + raise ValueError("No background or rigid_object entries found in gym config.") + return scene_objects + + +def _infer_basket_task_roles(scene_objects: list[_SceneObject]) -> _BasketTaskRoles: + background_objects = [ + obj for obj in scene_objects if obj.source_role == "background" + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + if not background_objects: + raise ValueError("UR5 basket generation requires a table/background object.") + if len(rigid_objects) < 3: + raise ValueError( + "UR5 basket generation requires at least two target objects and one " + "basket-like container." 
+ ) + + table = _pick_table(background_objects) + container = _pick_container(rigid_objects) + target_candidates = [ + obj for obj in rigid_objects if obj.source_uid != container.source_uid + ] + if len(target_candidates) < 2: + raise ValueError("Expected at least two non-container target objects.") + + left_target, right_target = _pick_left_right_targets(target_candidates) + target_noun = _target_noun(left_target, right_target) + container_noun = _display_noun(_base_name(container)) + return _BasketTaskRoles( + table_source_uid=table.source_uid, + container_source_uid=container.source_uid, + left_target_source_uid=left_target.source_uid, + right_target_source_uid=right_target.source_uid, + container_runtime_uid=_container_runtime_uid(container), + left_target_runtime_uid=f"left_{target_noun}", + right_target_runtime_uid=f"right_{target_noun}", + target_noun=target_noun, + left_target_noun=target_noun, + right_target_noun=target_noun, + container_noun=container_noun, + ) + + +def _pick_table(background_objects: list[_SceneObject]) -> _SceneObject: + for obj in background_objects: + text = _object_text(obj) + if "table" in text: + return obj + return background_objects[0] + + +def _pick_container(rigid_objects: list[_SceneObject]) -> _SceneObject: + candidates = [ + obj + for obj in rigid_objects + if any(keyword in _object_text(obj) for keyword in _CONTAINER_KEYWORDS) + ] + if not candidates: + names = ", ".join(obj.source_uid for obj in rigid_objects) + raise ValueError(f"No basket-like container object found among: {names}") + + def score(obj: _SceneObject) -> tuple[int, float]: + text = _object_text(obj) + keyword_score = 0 if "basket" in text else 1 + pos = _vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])) + center_distance = abs(pos[0]) + abs(pos[1]) + return keyword_score, center_distance + + return sorted(candidates, key=score)[0] + + +def _pick_left_right_targets( + target_candidates: list[_SceneObject], +) -> tuple[_SceneObject, _SceneObject]: + if 
len(target_candidates) == 2: + picked = target_candidates + else: + grouped: dict[str, list[_SceneObject]] = {} + for obj in target_candidates: + grouped.setdefault(_base_name(obj), []).append(obj) + repeated_groups = [group for group in grouped.values() if len(group) >= 2] + if repeated_groups: + picked = sorted( + repeated_groups, + key=_target_group_sort_key, + )[0] + if len(picked) > 2: + picked = sorted( + picked, + key=lambda obj: abs(_side_axis_value(obj)), + reverse=True, + )[:2] + else: + picked = sorted( + target_candidates, + key=lambda obj: abs(_side_axis_value(obj)), + reverse=True, + )[:2] + left, right = sorted(picked, key=_side_axis_value) + return left, right + + +def _target_group_sort_key(group: list[_SceneObject]) -> tuple[float, int]: + side_values = [_side_axis_value(obj) for obj in group] + side_spread = max(side_values) - min(side_values) + return -side_spread, -len(group) + + +def _side_axis_value(obj: _SceneObject) -> float: + return _position_side_axis_value( + _vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])) + ) + + +def _position_side_axis_value(position: list[float]) -> float: + return float(position[_DUAL_UR5_SIDE_AXIS_INDEX]) + + +def _arm_side_for_position(position: list[float]) -> str: + return "left" if _position_side_axis_value(position) < 0.0 else "right" + + +def _target_noun(left_target: _SceneObject, right_target: _SceneObject) -> str: + left_base = _base_name(left_target) + right_base = _base_name(right_target) + if left_base == right_base: + return _target_runtime_suffix(left_base) + return "target_object" + + +def _object_text(obj: _SceneObject) -> str: + shape = obj.config.get("shape", {}) or {} + return f"{obj.source_uid} {shape.get('fpath', '')}".lower() + + +def _base_name(obj: _SceneObject) -> str: + base = _DIGIT_SUFFIX_RE.sub("", obj.source_uid) + if base == obj.source_uid: + fpath = str(obj.config.get("shape", {}).get("fpath", "")) + path = Path(fpath) + if len(path.parts) >= 2: + base = path.parts[-2] + 
return _normalize_runtime_uid(base) + + +def _target_runtime_suffix(base: str) -> str: + if base == "bread": + return "bread_roll" + return base + + +def _container_runtime_uid(container: _SceneObject) -> str: + base = _base_name(container) + if "basket" in base: + return "wicker_basket" + return f"target_{base}" + + +def _display_noun(uid: str) -> str: + return uid.replace("_", " ") + + +def _plural(noun: str) -> str: + if noun.endswith("s"): + return noun + if noun.endswith(("ch", "sh", "x")): + return f"{noun}es" + return f"{noun}s" + + +def _left_target_text(roles: _BasketTaskRoles) -> str: + return _display_noun(roles.left_target_noun) + + +def _right_target_text(roles: _BasketTaskRoles) -> str: + return _display_noun(roles.right_target_noun) + + +def _target_pair_text(roles: _BasketTaskRoles) -> str: + left_text = _left_target_text(roles) + right_text = _right_target_text(roles) + if left_text == right_text: + return f"two {left_text} objects" + return f"the left {left_text} and right {right_text}" + + +def _target_plural_text(roles: _BasketTaskRoles) -> str: + left_text = _left_target_text(roles) + right_text = _right_target_text(roles) + if left_text == right_text: + return _plural(left_text) + return "target objects" + + +def _generic_target_text(roles: _BasketTaskRoles) -> str: + left_text = _left_target_text(roles) + right_text = _right_target_text(roles) + if left_text == right_text: + return left_text + return "target object" + + +def _target_task_description_text(roles: _BasketTaskRoles) -> str: + left_text = _left_target_text(roles) + right_text = _right_target_text(roles) + if left_text == right_text: + return _plural(left_text) + return f"{left_text}-and-{right_text}" + + +def _normalize_runtime_uid(value: str) -> str: + uid = _INVALID_UID_CHARS_RE.sub("_", value.strip()).strip("_").lower() + if not uid: + raise ValueError(f"Invalid runtime uid: {value!r}") + return uid + + +def _normalize_target_replacements( + target_replacements: 
Sequence[TargetReplacementSpec] | None, +) -> tuple[TargetReplacementSpec, ...]: + if not target_replacements: + return () + + normalized = [] + seen_source_uids = set() + seen_output_dirs = set() + for replacement in target_replacements: + if not isinstance(replacement, TargetReplacementSpec): + raise TypeError( + "target_replacements must contain TargetReplacementSpec values." + ) + source_uid = str(replacement.source_uid).strip() + prompt = str(replacement.prompt).strip() + output_dir_name = str(replacement.output_dir_name).strip() + if not source_uid: + raise ValueError("target replacement source_uid must be non-empty.") + if not prompt: + raise ValueError("target replacement prompt must be non-empty.") + if not output_dir_name: + raise ValueError("target replacement output_dir_name must be non-empty.") + output_dir_path = Path(output_dir_name) + if ( + output_dir_path.is_absolute() + or len(output_dir_path.parts) != 1 + or output_dir_name in {".", ".."} + ): + raise ValueError( + "target replacement output_dir_name must be a single relative " + f"directory name, got: {output_dir_name!r}" + ) + if source_uid in seen_source_uids: + raise ValueError(f"Duplicate target replacement source uid: {source_uid}") + if output_dir_name in seen_output_dirs: + raise ValueError( + f"Duplicate target replacement output dir: {output_dir_name}" + ) + seen_source_uids.add(source_uid) + seen_output_dirs.add(output_dir_name) + normalized.append( + TargetReplacementSpec( + source_uid=source_uid, + prompt=prompt, + output_dir_name=output_dir_name, + ) + ) + return tuple(normalized) + + +def _validate_target_replacement_sources( + roles: _BasketTaskRoles, + replacement_specs: Sequence[TargetReplacementSpec], +) -> None: + if not replacement_specs: + return + + target_source_uids = { + roles.left_target_source_uid, + roles.right_target_source_uid, + } + unknown = [ + replacement.source_uid + for replacement in replacement_specs + if replacement.source_uid not in target_source_uids + ] 
+ if unknown: + raise ValueError( + "target_replacements must reference the selected basket target " + f"source uid(s) {sorted(target_source_uids)}, got: {unknown}" + ) + + +def _run_target_replacements( + *, + scene_dir: Path, + replacement_specs: Sequence[TargetReplacementSpec], + reuse_target_replacements: bool, +) -> tuple[_ResolvedTargetReplacement, ...]: + resolved = [] + for replacement in replacement_specs: + runtime_noun = _replacement_runtime_noun(replacement.prompt) + output_root = scene_dir / "mesh_assets" / replacement.output_dir_name + output_name = f"{runtime_noun}.glb" + mesh_path = None + reused = False + if reuse_target_replacements: + mesh_path = _resolve_reusable_target_replacement_mesh_path( + output_root=output_root, + prompt=replacement.prompt, + output_name=output_name, + ) + reused = mesh_path is not None + if mesh_path is None: + result = _run_prompt2geometry_replacement( + prompt=replacement.prompt, + output_root=output_root, + output_name=output_name, + ) + mesh_path = _resolve_prompt2geometry_mesh_path(result, output_root) + _write_target_replacement_manifest( + output_root=output_root, + prompt=replacement.prompt, + output_name=output_name, + mesh_path=mesh_path, + ) + elif reused: + _write_target_replacement_manifest( + output_root=output_root, + prompt=replacement.prompt, + output_name=output_name, + mesh_path=mesh_path, + ) + resolved.append( + _ResolvedTargetReplacement( + source_uid=replacement.source_uid, + prompt=replacement.prompt, + output_dir_name=replacement.output_dir_name, + mesh_path=mesh_path, + runtime_noun=runtime_noun, + reused=reused, + ) + ) + return tuple(resolved) + + +def _resolve_reusable_target_replacement_mesh_path( + *, + output_root: Path, + prompt: str, + output_name: str, +) -> Path | None: + expected_mesh_path = (output_root / output_name).expanduser().resolve() + if not expected_mesh_path.is_file(): + return None + + manifest_path = _target_replacement_manifest_path(output_root) + if not 
manifest_path.is_file(): + return expected_mesh_path + + try: + manifest = _read_json(manifest_path) + except (OSError, json.JSONDecodeError): + return None + + if manifest.get("prompt") != prompt or manifest.get("output_name") != output_name: + return None + + manifest_mesh_path = Path( + str(manifest.get("mesh_path", expected_mesh_path)) + ).expanduser() + if not manifest_mesh_path.is_absolute(): + manifest_mesh_path = (output_root / manifest_mesh_path).resolve() + else: + manifest_mesh_path = manifest_mesh_path.resolve() + if manifest_mesh_path.is_file(): + return manifest_mesh_path + return expected_mesh_path + + +def _write_target_replacement_manifest( + *, + output_root: Path, + prompt: str, + output_name: str, + mesh_path: Path, +) -> None: + _write_json( + _target_replacement_manifest_path(output_root), + { + "prompt": prompt, + "output_name": output_name, + "mesh_path": mesh_path.expanduser().resolve().as_posix(), + }, + ) + + +def _target_replacement_manifest_path(output_root: Path) -> Path: + return output_root / _TARGET_REPLACEMENT_MANIFEST_FILENAME + + +def _run_prompt2geometry_replacement( + *, + prompt: str, + output_root: Path, + output_name: str, +) -> dict[str, Any]: + from embodichain.gen_sim.action_agent_pipeline.gym_project_api.prompt2geometry import ( + Prompt2GeometryRequest, + load_prompt2geometry_config, + run_prompt2geometry, + ) + + cfg = load_prompt2geometry_config() + return run_prompt2geometry( + Prompt2GeometryRequest( + prompt=prompt, + output_root=output_root, + output_name=output_name, + zimage_base_url=cfg.zimage_base_url, + sam3_base_url=cfg.sam3_base_url, + sam3d_base_url=cfg.sam3d_base_url, + llm_api_key=cfg.llm_api_key, + llm_model=cfg.llm_model, + llm_base_url=cfg.llm_base_url, + llm_timeout_s=cfg.llm_timeout_s, + ) + ) + + +def _resolve_prompt2geometry_mesh_path( + result: Mapping[str, Any], + output_root: Path, +) -> Path: + raw_path = result.get("scaled_mesh_path") or result.get("mesh_path") + if not raw_path: + raise 
ValueError("prompt2geometry result did not include a GLB mesh path.") + + mesh_path = Path(str(raw_path)).expanduser() + if not mesh_path.is_absolute(): + mesh_path = (output_root / mesh_path).resolve() + else: + mesh_path = mesh_path.resolve() + + if not mesh_path.is_file(): + raise FileNotFoundError(f"Generated replacement GLB not found: {mesh_path}") + return mesh_path + + +def _replacement_runtime_noun(prompt: str) -> str: + tokens = re.findall(r"[a-z0-9]+", prompt.lower()) + while tokens and tokens[0] in {"a", "an", "the"}: + tokens.pop(0) + stem = "_".join(tokens) + if not stem: + stem = "replacement_object" + return _normalize_runtime_uid(stem) + + +def _apply_replacement_names( + roles: _BasketTaskRoles, + resolved_replacements: Sequence[_ResolvedTargetReplacement], +) -> _BasketTaskRoles: + replacement_by_uid = { + replacement.source_uid: replacement for replacement in resolved_replacements + } + left_replacement = replacement_by_uid.get(roles.left_target_source_uid) + right_replacement = replacement_by_uid.get(roles.right_target_source_uid) + left_target_noun = ( + left_replacement.runtime_noun + if left_replacement is not None + else roles.left_target_noun + ) + right_target_noun = ( + right_replacement.runtime_noun + if right_replacement is not None + else roles.right_target_noun + ) + target_noun = ( + left_target_noun if left_target_noun == right_target_noun else "target_object" + ) + return _BasketTaskRoles( + table_source_uid=roles.table_source_uid, + container_source_uid=roles.container_source_uid, + left_target_source_uid=roles.left_target_source_uid, + right_target_source_uid=roles.right_target_source_uid, + container_runtime_uid=roles.container_runtime_uid, + left_target_runtime_uid=f"left_{left_target_noun}", + right_target_runtime_uid=f"right_{right_target_noun}", + target_noun=target_noun, + left_target_noun=left_target_noun, + right_target_noun=right_target_noun, + container_noun=roles.container_noun, + ) + + +def _refine_roles_with_llm( + 
*, + roles: _BasketTaskRoles, + scene_objects: list[_SceneObject], + project_name: str, + model: str | None, +) -> _BasketTaskRoles: + response = _call_role_llm( + project_name=project_name, + scene_summary=[ + { + "source_uid": obj.source_uid, + "role": obj.source_role, + "mesh": obj.config.get("shape", {}).get("fpath"), + "init_pos": obj.config.get("init_pos"), + } + for obj in scene_objects + ], + default_roles={ + "container_object": roles.container_source_uid, + "left_target_object": roles.left_target_source_uid, + "right_target_object": roles.right_target_source_uid, + "target_noun": roles.target_noun, + "container_runtime_uid": roles.container_runtime_uid, + }, + model=model, + ) + source_uids = {obj.source_uid for obj in scene_objects} + left_target = str(response.get("left_target_object", roles.left_target_source_uid)) + right_target = str( + response.get("right_target_object", roles.right_target_source_uid) + ) + container = str(response.get("container_object", roles.container_source_uid)) + for uid in (left_target, right_target, container): + if uid not in source_uids: + raise ValueError(f"LLM returned unknown source uid: {uid!r}") + if len({left_target, right_target, container}) != 3: + raise ValueError("LLM role mapping must use three distinct source objects.") + + target_noun = _normalize_runtime_uid( + str(response.get("target_noun", roles.target_noun)) + ) + container_runtime_uid = _normalize_runtime_uid( + str(response.get("container_runtime_uid", roles.container_runtime_uid)) + ) + return _BasketTaskRoles( + table_source_uid=roles.table_source_uid, + container_source_uid=container, + left_target_source_uid=left_target, + right_target_source_uid=right_target, + container_runtime_uid=container_runtime_uid, + left_target_runtime_uid=f"left_{target_noun}", + right_target_runtime_uid=f"right_{target_noun}", + target_noun=target_noun, + left_target_noun=target_noun, + right_target_noun=target_noun, + container_noun=_display_noun(container_runtime_uid), 
+ ) + + +def _call_role_llm( + *, + project_name: str, + scene_summary: list[dict[str, Any]], + default_roles: dict[str, Any], + model: str | None, +) -> dict[str, Any]: + from langchain_core.messages import HumanMessage, SystemMessage + + from embodichain.gen_sim.action_agent_pipeline.utils.llm_json import ( + extract_json_object, + ) + from embodichain.gen_sim.action_agent_pipeline.utils.mllm import ( + create_chat_openai, + ) + + prompt = ( + "Identify roles for a fixed Dual-UR5 basket-placement simulation task. " + "Return only one JSON object with keys: container_object, " + "left_target_object, right_target_object, target_noun, " + "container_runtime_uid. Use only source_uid values from the scene. The " + "left target starts on the negative-y side, and the right target starts " + "on the positive-y side.\n\n" + f"Project: {project_name}\n" + f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}\n" + f"Default roles:\n{json.dumps(default_roles, ensure_ascii=False, indent=2)}" + ) + llm = create_chat_openai( + temperature=0.0, + model=model, + usage_stage="config_generation.role_refinement", + ) + response = llm.invoke( + [ + SystemMessage( + content=( + "You produce strict JSON role mappings for simulation config " + "generation. Do not include markdown." 
+ ) + ), + HumanMessage(content=prompt), + ] + ) + content = getattr(response, "content", response) + return extract_json_object(content) + + +def _build_relative_placement_spec_with_llm( + *, + scene_objects: list[_SceneObject], + project_name: str, + task_description: str, + model: str | None, +) -> _RelativePlacementSpec: + background_objects = [ + obj for obj in scene_objects if obj.source_role == "background" + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + if not background_objects: + raise ValueError("Relative placement generation requires a background table.") + if len(rigid_objects) < 2: + raise ValueError( + "Relative placement generation requires at least two rigid objects." + ) + + table = _pick_table(background_objects) + response = _call_relative_task_llm( + project_name=project_name, + task_description=task_description, + scene_summary=[ + { + "source_uid": obj.source_uid, + "role": obj.source_role, + "object_type": _base_name(obj), + "is_container_like": _is_container_like(obj), + "mesh": obj.config.get("shape", {}).get("fpath"), + "init_pos": obj.config.get("init_pos"), + } + for obj in scene_objects + ], + model=model, + ) + return _apply_relative_task_response( + response=response, + table_source_uid=table.source_uid, + rigid_objects=rigid_objects, + task_description=task_description, + ) + + +def _call_relative_task_llm( + *, + project_name: str, + task_description: str, + scene_summary: list[dict[str, Any]], + model: str | None, +) -> dict[str, Any]: + from langchain_core.messages import HumanMessage, SystemMessage + + from embodichain.gen_sim.action_agent_pipeline.utils.llm_json import ( + extract_json_object, + ) + from embodichain.gen_sim.action_agent_pipeline.utils.mllm import ( + create_chat_openai, + ) + + prompt = ( + "Parse a simple Dual-UR5 tabletop relative-placement task and produce " + "a constrained config-level JSON spec. 
This JSON is used to generate " + "task_prompt.txt, basic_background.txt, atom_actions.txt, and " + "agent_success; a second LLM will later read those prompts to generate " + "the executable graph JSON.\n\n" + "Return exactly one JSON object with this schema:\n" + "{\n" + ' "placements": [\n' + " {\n" + ' "moved_object": "",\n' + ' "reference_object": "",\n' + ' "goal_relation": ' + '"inside|on|left_of|right_of|front_of|behind",\n' + ' "arm": "left|right|auto"\n' + " }\n" + " ],\n" + ' "task_prompt_summary": "",\n' + ' "basic_background_notes": "",\n' + ' "action_sketch": [\n' + ' "grasp moved_object",\n' + ' "move above the relation target pose",\n' + ' "lower to the release pose",\n' + ' "open gripper",\n' + ' "retreat upward"\n' + " ]\n" + "}\n\n" + "Rules:\n" + "- Use only source_uid values from rigid_object entries.\n" + "- Return one placement for a single-arm task and exactly two placements " + "for a dual-arm task.\n" + "- Treat the task as dual-arm when it explicitly says 双臂, 两臂, both " + "arms, two arms, or when it describes separate work for the left arm and " + "the right arm even if it does not literally say 双臂.\n" + "- Do not invent a second placement when the task only moves one object.\n" + "- moved_object is the object to grasp and move.\n" + "- reference_object is the object used as the spatial reference, " + "container, or support.\n" + "- Within each placement, moved_object and reference_object must be " + "different.\n" + "- For dual-arm tasks, the placements must use two different moved_object " + "values and one left arm plus one right arm. Use arm='auto' only when " + "the user did not specify which arm handles that placement.\n" + "- arm selects the single UR5 arm that should manipulate moved_object. 
" + "Use arm='left' for explicit left-arm instructions such as 左臂, 左机械臂, " + "left arm, or left UR5; use arm='right' for explicit right-arm " + "instructions such as 右臂, 右机械臂, right arm, or right UR5; use " + "arm='auto' when the task does not specify an arm.\n" + "- For Chinese/English left/right/front/back, use the relation enums. " + "front_of means negative world-x; behind means positive world-x; " + "left_of means negative world-y; right_of means positive world-y.\n" + "- If the task says to release an object above a basket/container so it " + "falls into it, use goal_relation='inside'.\n" + "- If the task says to stack/place one object on another non-container " + "support, use goal_relation='on'.\n" + "- Do not return numeric offsets, object poses, scales, success JSON, " + "robot config, or full prompt files. The generator computes those " + "deterministically.\n\n" + f"Project: {project_name}\n" + f"Task description:\n{task_description}\n" + f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}" + ) + llm = create_chat_openai( + temperature=0.0, + model=model, + usage_stage="config_generation.relative_task", + ) + response = llm.invoke( + [ + SystemMessage( + content=( + "You produce strict JSON specs for simulation config " + "generation. Do not include markdown." 
+ ) + ), + HumanMessage(content=prompt), + ] + ) + content = getattr(response, "content", response) + return extract_json_object(content) + + +def _apply_relative_task_response( + *, + response: Mapping[str, Any], + table_source_uid: str, + rigid_objects: list[_SceneObject], + task_description: str, +) -> _RelativePlacementSpec: + by_uid = {obj.source_uid: obj for obj in rigid_objects} + runtime_uids = _relative_runtime_uid_mapping(rigid_objects) + + placement_entries = _relative_placement_entries(response) + if len(placement_entries) > 2: + raise ValueError("Relative placement supports at most two arm placements.") + + forced_arm_sides = _relative_forced_arm_sides( + placement_entries, + by_uid=by_uid, + rigid_objects=rigid_objects, + ) + placements = tuple( + _build_relative_placement_step( + entry=entry, + by_uid=by_uid, + rigid_objects=rigid_objects, + runtime_uids=runtime_uids, + forced_side=forced_side, + ) + for entry, forced_side in zip(placement_entries, forced_arm_sides) + ) + _validate_relative_placements(placements) + + summary = str(response.get("task_prompt_summary", "")).strip() + if not summary: + summary = _default_relative_plan_summary(placements) + background_notes = str(response.get("basic_background_notes", "")).strip() + action_sketch = _string_list(response.get("action_sketch")) + if not action_sketch: + action_sketch = _default_relative_action_sketch(placements) + + primary = placements[0] + + return _RelativePlacementSpec( + table_source_uid=table_source_uid, + moved_source_uid=primary.moved_source_uid, + reference_source_uid=primary.reference_source_uid, + moved_runtime_uid=primary.moved_runtime_uid, + reference_runtime_uid=primary.reference_runtime_uid, + relation=primary.relation, + active_side=primary.active_side, + task_description=task_description, + task_prompt_summary=summary, + basic_background_notes=background_notes, + action_sketch=action_sketch, + release_offset=primary.release_offset, + high_offset=primary.high_offset, + 
placements=placements, + ) + + +def _relative_placement_entries(response: Mapping[str, Any]) -> list[Mapping[str, Any]]: + placements = response.get("placements") + if placements is None: + return [response] + if not isinstance(placements, list) or not placements: + raise ValueError("LLM response placements must be a non-empty list.") + entries: list[Mapping[str, Any]] = [] + for index, placement in enumerate(placements): + if not isinstance(placement, Mapping): + raise ValueError(f"Placement {index} must be a JSON object.") + entries.append(placement) + return entries + + +def _relative_forced_arm_sides( + placement_entries: list[Mapping[str, Any]], + *, + by_uid: Mapping[str, _SceneObject], + rigid_objects: list[_SceneObject], +) -> list[str | None]: + if len(placement_entries) != 2: + return [None for _ in placement_entries] + + requested_sides = [ + _normalize_relative_arm(entry.get("arm")) for entry in placement_entries + ] + explicit_sides = [side for side in requested_sides if side != "auto"] + if len(explicit_sides) == 2: + return [None, None] + if len(explicit_sides) == 1: + complement = "right" if explicit_sides[0] == "left" else "left" + return [ + requested_side if requested_side != "auto" else complement + for requested_side in requested_sides + ] + + moved_source_uids = [ + _resolve_rigid_source_uid( + entry.get("moved_object"), + rigid_objects, + field_name="moved_object", + ) + for entry in placement_entries + ] + positions = [ + _vector3(by_uid[source_uid].config.get("init_pos", [0.0, 0.0, 0.0])) + for source_uid in moved_source_uids + ] + inferred_sides = [_arm_side_for_position(position) for position in positions] + if set(inferred_sides) == {"left", "right"}: + return inferred_sides + + side_values = [_position_side_axis_value(position) for position in positions] + if side_values[0] <= side_values[1]: + return ["left", "right"] + return ["right", "left"] + + +def _build_relative_placement_step( + *, + entry: Mapping[str, Any], + by_uid: 
Mapping[str, _SceneObject], + rigid_objects: list[_SceneObject], + runtime_uids: Mapping[str, str], + forced_side: str | None, +) -> _RelativePlacementStepSpec: + moved_source_uid = _resolve_rigid_source_uid( + entry.get("moved_object"), + rigid_objects, + field_name="moved_object", + ) + reference_source_uid = _resolve_rigid_source_uid( + entry.get("reference_object"), + rigid_objects, + field_name="reference_object", + ) + if moved_source_uid == reference_source_uid: + raise ValueError( + "Relative placement requires distinct moved/reference objects." + ) + + reference_obj = by_uid[reference_source_uid] + relation = _normalize_relative_relation(entry.get("goal_relation")) + if relation == "on" and _is_container_like(reference_obj): + relation = "inside" + + moved_runtime_uid = runtime_uids[moved_source_uid] + reference_runtime_uid = runtime_uids[reference_source_uid] + if moved_runtime_uid == reference_runtime_uid: + raise ValueError( + f"Relative placement produced duplicate runtime uid {moved_runtime_uid!r}." 
+ ) + + release_offset = _relative_release_offset(relation) + high_offset = list(release_offset) + high_offset[2] += _STAGING_Z_DELTA + moved_position = _vector3( + by_uid[moved_source_uid].config.get("init_pos", [0, 0, 0]) + ) + requested_side = _normalize_relative_arm(entry.get("arm")) + active_side = ( + forced_side + if forced_side is not None + else ( + _arm_side_for_position(moved_position) + if requested_side == "auto" + else requested_side + ) + ) + + return _RelativePlacementStepSpec( + moved_source_uid=moved_source_uid, + reference_source_uid=reference_source_uid, + moved_runtime_uid=moved_runtime_uid, + reference_runtime_uid=reference_runtime_uid, + relation=relation, + active_side=active_side, + release_offset=release_offset, + high_offset=high_offset, + ) + + +def _validate_relative_placements( + placements: tuple[_RelativePlacementStepSpec, ...], +) -> None: + if not placements: + raise ValueError("Relative placement requires at least one placement.") + moved_source_uids = [placement.moved_source_uid for placement in placements] + if len(moved_source_uids) != len(set(moved_source_uids)): + raise ValueError("Relative placements must use distinct moved_object values.") + if len(placements) == 2: + active_sides = {placement.active_side for placement in placements} + if active_sides != {"left", "right"}: + raise ValueError( + "Dual-arm relative placement requires one left arm and one right arm." 
+ ) + + +def _resolve_rigid_source_uid( + value: Any, + rigid_objects: list[_SceneObject], + *, + field_name: str, +) -> str: + if value is None: + raise ValueError(f"LLM response missing required {field_name}.") + text = str(value).strip() + by_uid = {obj.source_uid: obj for obj in rigid_objects} + if text in by_uid: + return text + + normalized = _normalize_runtime_uid(text) + matches = [ + obj.source_uid + for obj in rigid_objects + if _normalize_runtime_uid(obj.source_uid) == normalized + or _base_name(obj) == normalized + or _candidate_relative_runtime_uid(obj) == normalized + ] + if len(matches) == 1: + return matches[0] + if not matches: + raise ValueError(f"LLM returned unknown {field_name}: {text!r}.") + raise ValueError( + f"LLM returned ambiguous {field_name}: {text!r}; candidates: {matches}." + ) + + +def _normalize_relative_relation(value: Any) -> str: + relation = str(value or "").strip().lower().replace("-", "_").replace(" ", "_") + relation = _RELATION_ALIASES.get(relation, relation) + if relation not in _RELATIVE_RELATIONS: + raise ValueError( + f"Unsupported relative placement relation {value!r}; expected one " + f"of {sorted(_RELATIVE_RELATIONS)}." + ) + return relation + + +def _normalize_relative_arm(value: Any) -> str: + if value is None: + return "auto" + text = str(value).strip().lower().replace("-", "_").replace(" ", "_") + if text in { + "", + "auto", + "automatic", + "unspecified", + "none", + "null", + "default", + "自动", + "默认", + "未指定", + "不指定", + }: + return "auto" + if text in { + "left", + "left_arm", + "left_ur5", + "左", + "左臂", + "左机械臂", + "左手", + "左手臂", + }: + return "left" + if text in { + "right", + "right_arm", + "right_ur5", + "右", + "右臂", + "右机械臂", + "右手", + "右手臂", + }: + return "right" + raise ValueError( + f"Unsupported relative placement arm {value!r}; expected 'left', " + "'right', or 'auto'." 
+ ) + + +def _relative_release_offset(relation: str) -> list[float]: + relation = _normalize_relative_relation(relation) + if relation == "inside": + return [0.0, 0.0, _SIDE_RELEASE_Z_OFFSET] + if relation == "on": + return [0.0, 0.0, _ON_RELEASE_Z_OFFSET] + if relation == "left_of": + return [0.0, -_SIDE_RELATION_DISTANCE, _SIDE_RELEASE_Z_OFFSET] + if relation == "right_of": + return [0.0, _SIDE_RELATION_DISTANCE, _SIDE_RELEASE_Z_OFFSET] + if relation == "front_of": + return [-_SIDE_RELATION_DISTANCE, 0.0, _SIDE_RELEASE_Z_OFFSET] + if relation == "behind": + return [_SIDE_RELATION_DISTANCE, 0.0, _SIDE_RELEASE_Z_OFFSET] + raise ValueError(f"Unsupported relative placement relation: {relation!r}.") + + +def _relative_runtime_uid_mapping( + rigid_objects: list[_SceneObject], +) -> dict[str, str]: + candidates: dict[str, str] = {} + for obj in rigid_objects: + if _is_container_like(obj): + candidates[obj.source_uid] = _container_runtime_uid(obj) + continue + + base = _target_runtime_suffix(_base_name(obj)) + base_count = sum( + 1 for other in rigid_objects if _base_name(other) == _base_name(obj) + ) + candidates[obj.source_uid] = ( + base if base_count == 1 else _normalize_runtime_uid(obj.source_uid) + ) + + counts: dict[str, int] = {} + for runtime_uid in candidates.values(): + counts[runtime_uid] = counts.get(runtime_uid, 0) + 1 + return { + source_uid: ( + runtime_uid + if counts[runtime_uid] == 1 + else _normalize_runtime_uid(source_uid) + ) + for source_uid, runtime_uid in candidates.items() + } + + +def _candidate_relative_runtime_uid(obj: _SceneObject) -> str: + if _is_container_like(obj): + return _container_runtime_uid(obj) + return _target_runtime_suffix(_base_name(obj)) + + +def _is_container_like(obj: _SceneObject) -> bool: + return any(keyword in _object_text(obj) for keyword in _CONTAINER_KEYWORDS) + + +def _string_list(value: Any) -> list[str]: + if not isinstance(value, list): + return [] + return [str(item).strip() for item in value if 
str(item).strip()] + + +def _default_relative_task_summary( + moved_uid: str, + reference_uid: str, + relation: str, +) -> str: + return ( + f"Move `{moved_uid}` so its final state is " + f"{_relative_relation_phrase(relation)} `{reference_uid}`." + ) + + +def _default_relative_plan_summary( + placements: Sequence[_RelativePlacementStepSpec], +) -> str: + if len(placements) == 1: + placement = placements[0] + return _default_relative_task_summary( + placement.moved_runtime_uid, + placement.reference_runtime_uid, + placement.relation, + ) + placement_text = "; ".join( + f"use the {placement.active_side} UR5 to move " + f"`{placement.moved_runtime_uid}` " + f"{_relative_relation_phrase(placement.relation)} " + f"`{placement.reference_runtime_uid}`" + for placement in placements + ) + return f"Use both UR5 arms for a dual-arm relative placement: {placement_text}." + + +def _default_relative_action_sketch( + placements: Sequence[_RelativePlacementStepSpec], +) -> list[str]: + if len(placements) == 1: + placement = placements[0] + return [ + f"grasp {placement.moved_runtime_uid}", + ( + f"move above the {placement.relation} release pose relative to " + f"{placement.reference_runtime_uid}" + ), + "lower to the release pose", + "open the gripper", + "retreat upward", + ] + sketch = ["grasp both moved objects with their assigned arms"] + for placement in placements: + sketch.extend( + [ + ( + f"use {placement.active_side}_arm to move " + f"{placement.moved_runtime_uid} above the release pose relative " + f"to {placement.reference_runtime_uid}" + ), + f"lower and release {placement.moved_runtime_uid}", + f"retreat {placement.active_side}_arm upward", + ] + ) + return sketch + + +def _relative_relation_phrase(relation: str) -> str: + relation = _normalize_relative_relation(relation) + if relation == "inside": + return "inside" + if relation == "on": + return "on top of" + if relation == "left_of": + return "to the left of" + if relation == "right_of": + return "to the right 
of" + if relation == "front_of": + return "in front of" + if relation == "behind": + return "behind" + raise ValueError(f"Unsupported relative placement relation: {relation!r}.") + + +def _build_ur5_basket_bundle( + *, + scene_dir: Path, + source_config: Mapping[str, Any], + roles: _BasketTaskRoles, + project_name: str, + task_name: str, + target_body_scale: float | list[float] | tuple[float, float, float], + target_replacements: Sequence[_ResolvedTargetReplacement], + max_episodes: int, + max_episode_steps: int, +) -> dict[str, Any]: + scene_objects = _collect_scene_objects(source_config) + by_uid = {obj.source_uid: obj for obj in scene_objects} + replacement_by_source_uid = { + replacement.source_uid: replacement for replacement in target_replacements + } + object_scale = _target_body_scale_vector(target_body_scale) + container_scale = _source_body_scale(by_uid[roles.container_source_uid]) + task_source_uids = { + roles.container_source_uid, + roles.left_target_source_uid, + roles.right_target_source_uid, + } + extra_rigid_objects = [ + obj + for obj in scene_objects + if obj.source_role == "rigid_object" and obj.source_uid not in task_source_uids + ] + extra_background_objects = [ + obj + for obj in scene_objects + if obj.source_role == "background" and obj.source_uid != roles.table_source_uid + ] + robot_init_z = _estimate_dual_ur5_init_z( + scene_dir, + by_uid[roles.table_source_uid], + ) + + gym_config = { + "id": "AtomicActionsAgent-v3", + "max_episodes": int(max_episodes), + "max_episode_steps": int(max_episode_steps), + "env": { + "extensions": _make_extensions_config(roles), + "events": _make_events_config(roles), + "observations": _make_observations_config(), + "dataset": _make_dataset_config(project_name, roles), + }, + "robot": _make_dual_ur5_robot_config(robot_init_z=robot_init_z), + "sensor": _make_sensor_config(), + "light": _make_light_config(), + "background": [ + _make_background_config(scene_dir, by_uid[roles.table_source_uid]), + *[ + 
_make_extra_background_config(scene_dir, obj) + for obj in extra_background_objects + ], + ], + "rigid_object": [ + _make_target_object_config( + scene_dir, + by_uid[roles.right_target_source_uid], + roles.right_target_runtime_uid, + object_scale, + replacement_by_source_uid.get(roles.right_target_source_uid), + ), + _make_target_object_config( + scene_dir, + by_uid[roles.left_target_source_uid], + roles.left_target_runtime_uid, + object_scale, + replacement_by_source_uid.get(roles.left_target_source_uid), + ), + _make_container_object_config( + scene_dir, + by_uid[roles.container_source_uid], + roles.container_runtime_uid, + container_scale, + ), + *[ + _make_extra_rigid_object_config(scene_dir, obj, _source_body_scale(obj)) + for obj in extra_rigid_objects + ], + ], + } + return { + "gym_config": gym_config, + "agent_config": make_agent_config(), + "task_prompt": make_basket_task_prompt(task_name, project_name, roles), + "basic_background": make_basket_basic_background(project_name, roles), + "atom_actions": make_basket_atom_actions_prompt(roles), + "summary": { + "mode": "basket_template", + "left_target": roles.left_target_runtime_uid, + "right_target": roles.right_target_runtime_uid, + "container": roles.container_runtime_uid, + "target_replacements": [ + { + "source_uid": replacement.source_uid, + "prompt": replacement.prompt, + "output_dir_name": replacement.output_dir_name, + "mesh_path": replacement.mesh_path.as_posix(), + "runtime_noun": replacement.runtime_noun, + "reused": replacement.reused, + } + for replacement in target_replacements + ], + }, + } + + +def _attach_coacd_cache_summary(bundle: dict[str, Any]) -> None: + from embodichain.gen_sim.action_agent_pipeline.generation.coacd_cache import ( + prewarm_coacd_cache_for_gym_config, + ) + + bundle.setdefault("summary", {})["coacd_cache"] = ( + prewarm_coacd_cache_for_gym_config(bundle["gym_config"]) + ) + + +def _build_relative_placement_bundle( + *, + scene_dir: Path, + source_config: Mapping[str, 
Any], + spec: _RelativePlacementSpec, + project_name: str, + task_name: str, + target_body_scale: float | list[float] | tuple[float, float, float], + max_episodes: int, + max_episode_steps: int, +) -> dict[str, Any]: + scene_objects = _collect_scene_objects(source_config) + background_objects = [ + obj for obj in scene_objects if obj.source_role == "background" + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + by_uid = {obj.source_uid: obj for obj in scene_objects} + runtime_uids = _relative_runtime_uid_mapping(rigid_objects) + object_scale = _target_body_scale_vector(target_body_scale) + robot_init_z = _estimate_dual_ur5_init_z( + scene_dir, + by_uid[spec.table_source_uid], + ) + + gym_config = { + "id": "AtomicActionsAgent-v3", + "max_episodes": int(max_episodes), + "max_episode_steps": int(max_episode_steps), + "env": { + "extensions": _make_relative_extensions_config(spec), + "events": _make_relative_events_config(spec, list(runtime_uids.values())), + "observations": _make_observations_config(), + "dataset": _make_relative_dataset_config(project_name, spec), + }, + "robot": _make_dual_ur5_robot_config(robot_init_z=robot_init_z), + "sensor": _make_sensor_config(), + "light": _make_light_config(), + "background": [ + _make_background_config(scene_dir, by_uid[spec.table_source_uid]), + *[ + _make_extra_background_config(scene_dir, obj, object_scale) + for obj in background_objects + if obj.source_uid != spec.table_source_uid + ], + ], + "rigid_object": [ + _make_relative_rigid_object_config( + scene_dir=scene_dir, + obj=obj, + runtime_uid=runtime_uids[obj.source_uid], + body_scale=_relative_object_body_scale( + obj, + target_scale=object_scale, + ), + max_convex_hull_num=_relative_rigid_object_max_convex_hull_num( + runtime_uids[obj.source_uid], + spec, + ), + ) + for obj in rigid_objects + ], + } + return { + "gym_config": gym_config, + "agent_config": make_agent_config(), + "task_prompt": 
make_relative_task_prompt(task_name, project_name, spec), + "basic_background": make_relative_basic_background(project_name, spec), + "atom_actions": make_relative_atom_actions_prompt(spec), + "summary": _make_relative_summary(spec), + } + + +def _target_body_scale_vector( + target_body_scale: float | list[float] | tuple[float, float, float], +) -> list[float]: + if isinstance(target_body_scale, (int, float)): + value = float(target_body_scale) + return [value, value, value] + return _clean_vector3(target_body_scale) + + +def _source_body_scale(obj: _SceneObject) -> list[float]: + return _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0])) + + +def _relative_object_body_scale( + obj: _SceneObject, + *, + target_scale: list[float], +) -> list[float]: + if _is_container_object(obj): + return _source_body_scale(obj) + return target_scale + + +def _is_container_object(obj: _SceneObject) -> bool: + text = _object_text(obj) + return any(keyword in text for keyword in _CONTAINER_KEYWORDS) + + +def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]: + if len(spec.placements) == 1: + return { + "mode": "relative_placement", + "moved_object": spec.moved_runtime_uid, + "reference_object": spec.reference_runtime_uid, + "relation": spec.relation, + "active_arm": f"{spec.active_side}_arm", + "release_offset": spec.release_offset, + } + return { + "mode": "dual_arm_relative_placement", + "placements": [ + { + "moved_object": placement.moved_runtime_uid, + "reference_object": placement.reference_runtime_uid, + "relation": placement.relation, + "active_arm": f"{placement.active_side}_arm", + "release_offset": placement.release_offset, + } + for placement in spec.placements + ], + } + + +def _estimate_dual_ur5_init_z(scene_dir: Path, table_obj: _SceneObject) -> float: + """Estimate robot root height from the table mesh top surface.""" + + table_top_z = _resolve_table_mesh_world_zmax(scene_dir, table_obj) + if table_top_z is None: + return 
_DUAL_UR5_LEGACY_INIT_Z + + init_z = table_top_z + _DUAL_UR5_TABLETOP_CLEARANCE - _DUAL_UR5_ARM_COMPONENT_Z + return round(max(_DUAL_UR5_LEGACY_INIT_Z, init_z), 6) + + +def _resolve_table_mesh_world_zmax( + scene_dir: Path, + table_obj: _SceneObject, +) -> float | None: + shape = table_obj.config.get("shape", {}) + if not isinstance(shape, Mapping): + return None + if shape.get("shape_type") != "Mesh" or not shape.get("fpath"): + return None + + mesh_path = _source_asset_path(scene_dir, str(shape["fpath"])) + try: + vertices = _load_mesh_vertices(mesh_path) + except ( + OSError, + ValueError, + json.JSONDecodeError, + UnicodeDecodeError, + struct.error, + ): + return None + if not vertices: + return None + + world_matrix = _table_mesh_world_matrix(table_obj.config) + return max(_transform_point(world_matrix, vertex)[2] for vertex in vertices) + + +def _source_asset_path(scene_dir: Path, fpath: str) -> Path: + raw_path = Path(fpath) + if raw_path.is_absolute(): + return raw_path.resolve() + + scene_candidate = (scene_dir / raw_path).resolve() + if scene_candidate.exists(): + return scene_candidate + + repo_candidate = (_repo_root() / raw_path).resolve() + if repo_candidate.exists(): + return repo_candidate + return scene_candidate + + +def _load_mesh_vertices(mesh_path: Path) -> list[tuple[float, float, float]] | None: + if mesh_path.suffix.lower() == ".glb": + try: + return list(_iter_glb_world_position_vertices(mesh_path)) + except ( + OSError, + ValueError, + json.JSONDecodeError, + UnicodeDecodeError, + struct.error, + ): + return _load_mesh_vertices_with_trimesh(mesh_path) + return _load_mesh_vertices_with_trimesh(mesh_path) + + +def _load_mesh_vertices_with_trimesh( + mesh_path: Path, +) -> list[tuple[float, float, float]] | None: + try: + import trimesh + except ImportError: + return None + + try: + scene_or_mesh = trimesh.load(str(mesh_path), force="scene") + try: + mesh = scene_or_mesh.dump(concatenate=True) + except AttributeError: + mesh = scene_or_mesh + 
except Exception: + return None + vertices = getattr(mesh, "vertices", None) + if vertices is None or len(vertices) == 0: + return None + return [ + (float(vertex[0]), float(vertex[1]), float(vertex[2])) for vertex in vertices + ] + + +def _iter_glb_world_position_vertices( + mesh_path: Path, +): + doc, binary_chunk = _read_glb(mesh_path) + nodes = doc.get("nodes", []) + if not isinstance(nodes, list): + raise ValueError("GLB nodes must be a list.") + + scenes = doc.get("scenes", []) + if scenes: + scene_index = int(doc.get("scene", 0)) + root_node_ids = scenes[scene_index].get("nodes", []) + else: + root_node_ids = list(range(len(nodes))) + + stack = [(int(node_id), _identity_matrix4()) for node_id in root_node_ids] + while stack: + node_id, parent_matrix = stack.pop() + node = nodes[node_id] + node_matrix = _matrix_multiply(parent_matrix, _gltf_node_matrix(node)) + mesh_index = node.get("mesh") + if mesh_index is not None: + for vertex in _iter_gltf_mesh_position_vertices( + doc, + binary_chunk, + int(mesh_index), + ): + yield _transform_point(node_matrix, vertex) + for child_id in node.get("children", []) or []: + stack.append((int(child_id), node_matrix)) + + +def _read_glb(mesh_path: Path) -> tuple[dict[str, Any], bytes]: + data = mesh_path.read_bytes() + if len(data) < 20: + raise ValueError("GLB file is too small.") + + magic, version, total_length = struct.unpack_from("<4sII", data, 0) + if magic != b"glTF" or version != 2: + raise ValueError("Only GLB version 2 files are supported.") + if total_length > len(data): + raise ValueError("GLB length header exceeds file size.") + + doc: dict[str, Any] | None = None + binary_chunk = b"" + offset = 12 + while offset + 8 <= total_length: + chunk_length, chunk_type = struct.unpack_from(" total_length: + raise ValueError("GLB chunk exceeds file size.") + chunk = data[offset:chunk_end] + offset = chunk_end + if chunk_type == _GLB_JSON_CHUNK_TYPE: + doc = json.loads(chunk.decode("utf-8").rstrip("\x00 ")) + elif 
chunk_type == _GLB_BINARY_CHUNK_TYPE: + binary_chunk = chunk + + if doc is None: + raise ValueError("GLB file does not contain a JSON chunk.") + return doc, binary_chunk + + +def _iter_gltf_mesh_position_vertices( + doc: Mapping[str, Any], + binary_chunk: bytes, + mesh_index: int, +): + meshes = doc.get("meshes", []) + accessors = doc.get("accessors", []) + mesh = meshes[mesh_index] + for primitive in mesh.get("primitives", []) or []: + attributes = primitive.get("attributes", {}) + position_accessor = attributes.get("POSITION") + if position_accessor is None: + continue + if int(position_accessor) >= len(accessors): + raise ValueError("POSITION accessor index is out of range.") + yield from _iter_gltf_accessor_vec3(doc, binary_chunk, int(position_accessor)) + + +def _iter_gltf_accessor_vec3( + doc: Mapping[str, Any], + binary_chunk: bytes, + accessor_index: int, +): + accessor = doc["accessors"][accessor_index] + if accessor.get("sparse"): + raise ValueError("Sparse GLB accessors are not supported.") + if accessor.get("type") != "VEC3": + raise ValueError("POSITION accessor must be VEC3.") + if "bufferView" not in accessor: + raise ValueError("POSITION accessor must reference a bufferView.") + + component_type = int(accessor["componentType"]) + if component_type not in _GLTF_COMPONENT_FORMATS: + raise ValueError(f"Unsupported GLB component type: {component_type}.") + component_format, component_size = _GLTF_COMPONENT_FORMATS[component_type] + component_count = _GLTF_TYPE_COMPONENT_COUNTS[accessor["type"]] + buffer_view = doc["bufferViews"][int(accessor["bufferView"])] + if int(buffer_view.get("buffer", 0)) != 0: + raise ValueError("Only GLB embedded binary buffers are supported.") + + stride = int(buffer_view.get("byteStride", component_size * component_count)) + offset = int(buffer_view.get("byteOffset", 0)) + int(accessor.get("byteOffset", 0)) + element_format = "<" + component_format * component_count + for index in range(int(accessor["count"])): + values = 
struct.unpack_from( + element_format, + binary_chunk, + offset + index * stride, + ) + yield (float(values[0]), float(values[1]), float(values[2])) + + +def _table_mesh_world_matrix(table_config: Mapping[str, Any]) -> list[list[float]]: + scale = _vector3(table_config.get("body_scale", [1.0, 1.0, 1.0])) + init_local_pose = table_config.get("init_local_pose") + if init_local_pose is not None: + root_matrix = _matrix4(init_local_pose) + else: + root_matrix = _euler_xyz_degrees_matrix( + _vector3(table_config.get("init_rot", [0.0, 0.0, 0.0])), + _vector3(table_config.get("init_pos", [0.0, 0.0, 0.0])), + ) + return _matrix_multiply(root_matrix, _scale_matrix4(scale)) + + +def _gltf_node_matrix(node: Mapping[str, Any]) -> list[list[float]]: + if "matrix" in node: + values = [float(value) for value in node["matrix"]] + if len(values) != 16: + raise ValueError("GLB node matrix must contain 16 values.") + return [[values[column * 4 + row] for column in range(4)] for row in range(4)] + + translation = [float(value) for value in node.get("translation", [0.0, 0.0, 0.0])] + scale = [float(value) for value in node.get("scale", [1.0, 1.0, 1.0])] + rotation = [float(value) for value in node.get("rotation", [0.0, 0.0, 0.0, 1.0])] + if len(translation) != 3 or len(scale) != 3 or len(rotation) != 4: + raise ValueError("Invalid GLB node TRS transform.") + + x, y, z, w = rotation + xx, yy, zz = x * x, y * y, z * z + xy, xz, yz = x * y, x * z, y * z + wx, wy, wz = w * x, w * y, w * z + matrix = [ + [ + (1.0 - 2.0 * (yy + zz)) * scale[0], + (2.0 * (xy - wz)) * scale[1], + (2.0 * (xz + wy)) * scale[2], + translation[0], + ], + [ + (2.0 * (xy + wz)) * scale[0], + (1.0 - 2.0 * (xx + zz)) * scale[1], + (2.0 * (yz - wx)) * scale[2], + translation[1], + ], + [ + (2.0 * (xz - wy)) * scale[0], + (2.0 * (yz + wx)) * scale[1], + (1.0 - 2.0 * (xx + yy)) * scale[2], + translation[2], + ], + [0.0, 0.0, 0.0, 1.0], + ] + return matrix + + +def _euler_xyz_degrees_matrix( + rotation_deg: 
Sequence[float], + translation: Sequence[float], +) -> list[list[float]]: + rx, ry, rz = (math.radians(float(value)) for value in rotation_deg) + cx, sx = math.cos(rx), math.sin(rx) + cy, sy = math.cos(ry), math.sin(ry) + cz, sz = math.cos(rz), math.sin(rz) + rot_x = [ + [1.0, 0.0, 0.0, 0.0], + [0.0, cx, -sx, 0.0], + [0.0, sx, cx, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + rot_y = [ + [cy, 0.0, sy, 0.0], + [0.0, 1.0, 0.0, 0.0], + [-sy, 0.0, cy, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + rot_z = [ + [cz, -sz, 0.0, 0.0], + [sz, cz, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + matrix = _matrix_multiply(_matrix_multiply(rot_z, rot_y), rot_x) + matrix[0][3] = float(translation[0]) + matrix[1][3] = float(translation[1]) + matrix[2][3] = float(translation[2]) + return matrix + + +def _identity_matrix4() -> list[list[float]]: + return [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + + +def _scale_matrix4(scale: Sequence[float]) -> list[list[float]]: + return [ + [float(scale[0]), 0.0, 0.0, 0.0], + [0.0, float(scale[1]), 0.0, 0.0], + [0.0, 0.0, float(scale[2]), 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + + +def _matrix4(value: Any) -> list[list[float]]: + if not isinstance(value, (list, tuple)) or len(value) != 4: + raise ValueError(f"Expected a 4x4 matrix, got {value!r}.") + matrix = [] + for row in value: + if not isinstance(row, (list, tuple)) or len(row) != 4: + raise ValueError(f"Expected a 4x4 matrix, got {value!r}.") + matrix.append([float(item) for item in row]) + return matrix + + +def _matrix_multiply( + left: Sequence[Sequence[float]], + right: Sequence[Sequence[float]], +) -> list[list[float]]: + return [ + [ + sum( + float(left[row][inner]) * float(right[inner][column]) + for inner in range(4) + ) + for column in range(4) + ] + for row in range(4) + ] + + +def _transform_point( + matrix: Sequence[Sequence[float]], + point: Sequence[float], +) -> tuple[float, float, float]: + x, y, z = (float(point[0]), 
float(point[1]), float(point[2])) + return ( + float(matrix[0][0]) * x + + float(matrix[0][1]) * y + + float(matrix[0][2]) * z + + float(matrix[0][3]), + float(matrix[1][0]) * x + + float(matrix[1][1]) * y + + float(matrix[1][2]) * z + + float(matrix[1][3]), + float(matrix[2][0]) * x + + float(matrix[2][1]) * y + + float(matrix[2][2]) * z + + float(matrix[2][3]), + ) + + +def _make_extensions_config(roles: _BasketTaskRoles) -> dict[str, Any]: + return { + "agent_arm_slots": { + "left": { + "arm": "left_arm", + "eef": "left_eef", + }, + "right": { + "arm": "right_arm", + "eef": "right_eef", + }, + }, + "arm_aim_yaw_offset": { + "left": 0.0, + "right": 3.141592653589793, + }, + "gripper_open_state": [0.0], + "gripper_close_state": [0.04], + "ignore_terminations_during_agent": True, + "viewer_camera_uid": "cam_high", + "agent_success": { + "op": "all", + "terms": [ + _object_in_container_success( + roles.left_target_runtime_uid, + roles.container_runtime_uid, + ), + _object_in_container_success( + roles.right_target_runtime_uid, + roles.container_runtime_uid, + ), + ], + }, + } + + +def _object_in_container_success(object_uid: str, container_uid: str) -> dict[str, Any]: + return { + "type": "object_in_container", + "object": object_uid, + "container": container_uid, + "radius": 0.2, + "min_z_offset": -0.05, + "max_z_offset": 0.35, + } + + +def _make_relative_extensions_config(spec: _RelativePlacementSpec) -> dict[str, Any]: + return { + "agent_arm_slots": { + "left": { + "arm": "left_arm", + "eef": "left_eef", + }, + "right": { + "arm": "right_arm", + "eef": "right_eef", + }, + }, + "arm_aim_yaw_offset": { + "left": 0.0, + "right": 3.141592653589793, + }, + "gripper_open_state": [0.0], + "gripper_close_state": [0.04], + "ignore_terminations_during_agent": True, + "viewer_camera_uid": "cam_high", + "agent_success": _make_relative_success_spec(spec), + } + + +def _make_relative_success_spec(spec: _RelativePlacementSpec) -> dict[str, Any]: + if len(spec.placements) == 1: 
+ return _make_relative_placement_success_spec(spec.placements[0]) + return { + "op": "all", + "terms": [ + _make_relative_placement_success_spec(placement) + for placement in spec.placements + ], + } + + +def _make_relative_placement_success_spec( + placement: _RelativePlacementStepSpec, +) -> dict[str, Any]: + if placement.relation == "inside": + return _object_in_container_success( + placement.moved_runtime_uid, + placement.reference_runtime_uid, + ) + if placement.relation == "on": + return { + "type": "object_on_object", + "object": placement.moved_runtime_uid, + "support": placement.reference_runtime_uid, + "xy_radius": 0.08, + "min_z_offset": 0.02, + "max_z_offset": 0.35, + } + + primary_axis, primary_offset, secondary_axis = _side_relation_axes( + placement.relation + ) + return { + "op": "all", + "terms": [ + { + "type": "object_axis_offset_near", + "object": placement.moved_runtime_uid, + "reference": placement.reference_runtime_uid, + "axis": primary_axis, + "offset": primary_offset, + "tolerance": 0.05, + }, + { + "type": "object_axis_offset_near", + "object": placement.moved_runtime_uid, + "reference": placement.reference_runtime_uid, + "axis": secondary_axis, + "offset": 0.0, + "tolerance": 0.06, + }, + { + "type": "object_not_fallen", + "object": placement.moved_runtime_uid, + "max_tilt": 0.9, + }, + ], + } + + +def _side_relation_axes(relation: str) -> tuple[str, float, str]: + if relation == "left_of": + return "y", -_SIDE_RELATION_DISTANCE, "x" + if relation == "right_of": + return "y", _SIDE_RELATION_DISTANCE, "x" + if relation == "front_of": + return "x", -_SIDE_RELATION_DISTANCE, "y" + if relation == "behind": + return "x", _SIDE_RELATION_DISTANCE, "y" + raise ValueError(f"Unsupported side relation: {relation!r}.") + + +def _make_relative_events_config( + spec: _RelativePlacementSpec, + rigid_runtime_uids: list[str], +) -> dict[str, Any]: + return { + "record_camera": _record_camera_event_config(), + "validation_cameras": 
_validation_cameras_event_config(), + "prepare_extra_attr": { + "func": "prepare_extra_attr", + "mode": "reset", + "params": { + "attrs": [ + { + "name": "object_lengths", + "mode": "callable", + "entity_uids": "all_objects", + "func_name": "compute_object_length", + "func_kwargs": { + "is_svd_frame": True, + "sample_points": 5000, + }, + }, + ] + }, + }, + "register_info_to_env": { + "func": "register_info_to_env", + "mode": "reset", + "params": { + "registry": [ + _object_registry_entry(uid) for uid in sorted(rigid_runtime_uids) + ], + "registration": "affordance_datas", + "sim_update": True, + }, + }, + } + + +def _make_events_config(roles: _BasketTaskRoles) -> dict[str, Any]: + return { + "record_camera": _record_camera_event_config(), + "validation_cameras": _validation_cameras_event_config(), + "prepare_extra_attr": { + "func": "prepare_extra_attr", + "mode": "reset", + "params": { + "attrs": [ + { + "name": "object_lengths", + "mode": "callable", + "entity_uids": "all_objects", + "func_name": "compute_object_length", + "func_kwargs": { + "is_svd_frame": True, + "sample_points": 5000, + }, + }, + ] + }, + }, + "register_info_to_env": { + "func": "register_info_to_env", + "mode": "reset", + "params": { + "registry": [ + _object_registry_entry(roles.left_target_runtime_uid), + _object_registry_entry(roles.right_target_runtime_uid), + _object_registry_entry(roles.container_runtime_uid), + ], + "registration": "affordance_datas", + "sim_update": True, + }, + }, + } + + +def _record_camera_event_config() -> dict[str, Any]: + camera = _make_sensor_config()[0] + extrinsics = camera["extrinsics"] + return { + "func": "record_camera_data", + "mode": "interval", + "interval_step": 1, + "params": { + "name": "record_cam_high", + "resolution": [camera["width"], camera["height"]], + "intrinsics": camera["intrinsics"], + "eye": extrinsics["eye"], + "target": extrinsics["target"], + "up": extrinsics["up"], + }, + } + + +def _validation_cameras_event_config() -> dict[str, 
Any]: + return { + "func": "validation_cameras", + "mode": "trigger", + "params": {}, + } + + +def _object_registry_entry(uid: str) -> dict[str, Any]: + return { + "entity_cfg": { + "uid": uid, + }, + "pose_register_params": { + "compute_relative": False, + "compute_pose_object_to_arena": True, + "to_matrix": True, + }, + } + + +def _make_observations_config() -> dict[str, Any]: + return { + "norm_robot_eef_joint": { + "func": "normalize_robot_joint_data", + "mode": "modify", + "name": "robot/qpos", + "params": { + "joint_ids": [12, 13, 14, 15], + }, + } + } + + +def _make_dataset_config( + project_name: str, + roles: _BasketTaskRoles, +) -> dict[str, Any]: + left_target_text = _left_target_text(roles) + right_target_text = _right_target_text(roles) + target_description = _target_task_description_text(roles) + return { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "DualUR5", + "control_freq": 25, + }, + "instruction": { + "lang": ( + f"Use the left UR5 to place the left {left_target_text} into " + f"the {roles.container_runtime_uid}, then use the right " + f"UR5 to place the right {right_target_text} into the " + f"{roles.container_runtime_uid}." 
def _relative_dataset_instruction(spec: _RelativePlacementSpec) -> str:
    """Render the language instruction for a relative-placement task.

    One sentence is produced per placement, in order, and the sentences are
    joined with single spaces. A spec with exactly one placement therefore
    yields just that sentence.
    """
    sentences = []
    for step in spec.placements:
        relation_text = _relative_relation_phrase(step.relation)
        sentences.append(
            f"Use the {step.active_side} UR5 to move "
            f"{step.moved_runtime_uid} "
            f"{relation_text} "
            f"{step.reference_runtime_uid}."
        )
    return " ".join(sentences)
"class_type": "PytorchSolver", + "end_link_name": f"{side}_ee_link", + "root_link_name": f"{side}_base_link", + "tcp": [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.16], + [0.0, 0.0, 0.0, 1.0], + ], + } + + +def _make_sensor_config() -> list[dict[str, Any]]: + return [ + { + "sensor_type": "Camera", + "uid": "cam_high", + "width": 960, + "height": 540, + "intrinsics": [420, 420, 480, 270], + "extrinsics": { + "pos": [0.4, 0.0, 2.2], + "eye": [0.6, 0.0, 3.3], + "target": [0.0, 0.0, 0.75], + "up": [1.0, 0.0, 0.0], + }, + }, + { + "sensor_type": "Camera", + "uid": "cam_wrist_left", + "width": 640, + "height": 480, + "intrinsics": [600, 600, 320, 240], + "extrinsics": { + "parent": "left_ee_link", + "pos": [0.0, 0.12, 0.08], + "quat": [ + -0.0012598701, + -0.029051816664441618998, + 0.9094039177564813, + 0.41489627504330695, + ], + }, + }, + { + "sensor_type": "Camera", + "uid": "cam_wrist_right", + "width": 640, + "height": 480, + "intrinsics": [600, 600, 320, 240], + "extrinsics": { + "parent": "right_ee_link", + "pos": [0.0, 0.12, 0.08], + "quat": [ + -0.0012598701, + -0.029051816664441618998, + 0.9094039177564813, + 0.41489627504330695, + ], + }, + }, + ] + + +def _make_light_config() -> dict[str, Any]: + return { + "direct": [ + { + "uid": "main_light", + "light_type": "point", + "color": [1.0, 1.0, 1.0], + "intensity": 40.0, + "init_pos": [0.0, -0.4, 2.2], + "radius": 10.0, + } + ] + } + + +def _make_background_config(scene_dir: Path, obj: _SceneObject) -> dict[str, Any]: + return { + "uid": "table", + "shape": _make_shape_config(scene_dir, obj.config), + "attrs": dict(_BACKGROUND_ATTRS), + "body_scale": _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0])), + "body_type": "kinematic", + "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), + "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), + "max_convex_hull_num": _role_limited_max_convex_hull_num( + obj, + 
def _make_extra_background_config(
    scene_dir: Path,
    obj: _SceneObject,
    body_scale: Any | None = None,
) -> dict[str, Any]:
    """Build the config entry for an additional background object.

    Args:
        scene_dir: Scene directory passed through to the shape builder.
        obj: Scene object whose ``config`` mapping supplies the source values.
        body_scale: Optional override; when ``None`` the object's own
            ``body_scale`` (default ``[1.0, 1.0, 1.0]``) is used.

    Returns:
        A new dictionary describing the background object.
    """
    source = obj.config
    if body_scale is None:
        scale_source = source.get("body_scale", [1.0, 1.0, 1.0])
    else:
        scale_source = body_scale
    return {
        "uid": _normalize_runtime_uid(obj.source_uid),
        "shape": _make_shape_config(scene_dir, source),
        # Deep copy so the generated config never aliases the source attrs.
        "attrs": copy.deepcopy(dict(source.get("attrs", _BACKGROUND_ATTRS))),
        "body_scale": _clean_vector3(scale_source),
        "body_type": str(source.get("body_type", "static")),
        "init_pos": _clean_vector3(source.get("init_pos", [0.0, 0.0, 0.0])),
        "init_rot": _clean_vector3(source.get("init_rot", [0.0, 0.0, 0.0])),
        "max_convex_hull_num": _role_limited_max_convex_hull_num(
            obj,
            _BACKGROUND_MAX_CONVEX_HULL_NUM,
        ),
    }
def _role_limited_max_convex_hull_num(
    obj: _SceneObject,
    role_max_convex_hull_num: int,
) -> int:
    """Resolve the convex-hull budget for ``obj`` under a role-specific cap.

    If the object's config carries ``max_convex_hull_num``, that value is
    converted to ``int``, capped at ``role_max_convex_hull_num`` and floored
    at 1. Otherwise the role cap itself is returned unchanged.
    """
    requested = obj.config.get("max_convex_hull_num")
    if requested is not None:
        capped = min(int(requested), role_max_convex_hull_num)
        return capped if capped > 1 else 1
    return role_max_convex_hull_num
def _asset_path_for_config(scene_dir: Path, fpath: str) -> str:
    """Return ``fpath`` as a resolved POSIX-style path string.

    Absolute paths are resolved as they are; relative paths are interpreted
    relative to ``scene_dir`` before resolving.
    """
    candidate = Path(fpath)
    anchored = candidate if candidate.is_absolute() else scene_dir / candidate
    return anchored.resolve().as_posix()
def _validate_success_uids(success: Mapping[str, Any], rigid_uids: set[str]) -> None:
    """Check that a success spec only references known rigid-object uids.

    Composite specs (``op`` of ``all``/``and``/``any``/``or``) are validated
    recursively through their ``terms``. Leaf terms are identified by the
    lower-cased ``type`` (falling back to ``func``), and each uid key that
    term kind requires must be present in ``rigid_uids``.

    Raises:
        ValueError: If the term kind is not supported, or a required uid is
            missing from ``rigid_uids``.
    """
    composite_ops = {"all", "and", "any", "or"}
    if success.get("op") in composite_ops:
        for child in success.get("terms", []):
            _validate_success_uids(child, rigid_uids)
        return

    # Term kind (including aliases) -> uid keys that must resolve.
    keys_by_type = {
        "object_in_container": ("object", "container"),
        "object_on_object": ("object", "support"),
        "object_on": ("object", "support"),
        "on_object": ("object", "support"),
        "object_axis_offset_near": ("object", "reference"),
        "object_relative_axis_near": ("object", "reference"),
        "object_not_fallen": ("object",),
        "not_fallen": ("object",),
    }
    success_type = str(success.get("type", success.get("func", ""))).lower()
    required_keys = keys_by_type.get(success_type)
    if required_keys is None:
        raise ValueError(f"Unsupported generated success term: {success_type!r}.")

    for key in required_keys:
        uid = success.get(key)
        if uid not in rigid_uids:
            raise ValueError(f"Invalid success uid reference {key}={uid!r}.")
def _vector3(value: Any) -> list[float]:
    """Convert a 3-element list or tuple into a list of three floats.

    Raises:
        ValueError: If ``value`` is not a list/tuple of length 3, or if any
            element cannot be converted to ``float``.
    """
    if not isinstance(value, (list, tuple)) or len(value) != 3:
        raise ValueError(f"Expected a 3-vector, got {value!r}.")
    try:
        return [float(item) for item in value]
    except (TypeError, ValueError) as exc:
        # float(None) raises TypeError; surface one exception type so callers
        # that validate configs only have to handle ValueError.
        raise ValueError(f"Expected a 3-vector, got {value!r}.") from exc


def _clean_vector3(value: Any) -> list[float]:
    """Return ``value`` as a 3-vector with near-1 and near-0 entries snapped.

    Entries within 1e-9 of 1.0 become exactly 1.0 and entries with magnitude
    below 1e-12 become exactly 0.0; every other entry is kept as converted.

    Raises:
        ValueError: If ``value`` is not a valid 3-vector (see ``_vector3``).
    """
    cleaned = []
    for item in _vector3(value):
        if abs(item - 1.0) < 1e-9:
            cleaned.append(1.0)
        elif abs(item) < 1e-12:
            cleaned.append(0.0)
        else:
            cleaned.append(item)
    return cleaned
def _server_url(base_url: str, path: str) -> str:
    """Join ``base_url`` and ``path`` with exactly one slash between them.

    Trailing slashes on ``base_url`` are dropped. A non-empty ``path`` that
    lacks a leading slash gets one, so ``"health"`` and ``"/health"`` produce
    the same URL.
    """
    base = base_url.rstrip("/")
    if path and not path.startswith("/"):
        path = f"/{path}"
    return f"{base}{path}"


def download_zip(server: str, job_id: str, output_dir: Path) -> Path:
    """Download the result archive of ``job_id`` into ``output_dir``.

    Args:
        server: Base URL of the API server.
        job_id: Identifier of the job whose archive is downloaded.
        output_dir: Directory for the archive; created if missing.

    Returns:
        Path of the written ``<job_id>_formatted_tabletop_scene.zip`` file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    zip_path = output_dir / f"{job_id}_formatted_tabletop_scene.zip"
    # With stream=True the connection is held until the body is consumed or
    # the response is closed; the context manager guarantees the close even
    # when raise_for_status() or the file write fails midway.
    with requests.get(
        _server_url(server, f"/api/image2tabletop/download/{job_id}"),
        stream=True,
        timeout=300,
    ) as response:
        response.raise_for_status()
        with zip_path.open("wb") as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    file.write(chunk)
    return zip_path
in _IMAGE_SUFFIXES: + raise ValueError(f"unsupported image suffix: {image_input}") + return [image_input] + if image_input.is_dir(): + image_paths = sorted( + path + for path in image_input.iterdir() + if path.is_file() and path.suffix.lower() in _IMAGE_SUFFIXES + ) + if image_paths: + return image_paths + raise FileNotFoundError(f"no supported image files found under: {image_input}") + raise FileNotFoundError(f"image input not found: {image_input}") + + +def extract_gym_project( + zip_path: Path, output_root: Path, job_id: str, overwrite: bool +) -> Path: + output_root = output_root.expanduser().resolve() + output_root.mkdir(parents=True, exist_ok=True) + + with TemporaryDirectory(prefix=f"{job_id}_image2tabletop_") as temp_dir_name: + extract_dir = Path(temp_dir_name).resolve() + _safe_extract_zip(zip_path, extract_dir) + gym_config_paths = sorted(extract_dir.rglob("gym_config.json")) + if not gym_config_paths: + raise FileNotFoundError( + f"gym_config.json not found in downloaded archive: {zip_path}" + ) + if len(gym_config_paths) > 1: + matches = ", ".join(path.as_posix() for path in gym_config_paths) + raise ValueError( + f"multiple gym_config.json files found in archive: {matches}" + ) + + gym_config_path = gym_config_paths[0] + project_name = _infer_project_name(gym_config_path, extract_dir, job_id) + source_root = _infer_source_project_root( + gym_config_path, extract_dir, project_name + ) + destination = output_root / project_name + if destination.exists(): + if not overwrite: + raise FileExistsError( + f"output project already exists: {destination}. " + "Pass --overwrite to replace it." 
def _infer_project_name(gym_config_path: Path, extract_dir: Path, job_id: str) -> str:
    """Infer the gym project directory name for an extracted archive.

    Resolution order:
      1. The first path component between ``extract_dir`` and the config file
         that matches ``_PROJECT_NAME_RE``.
      2. The numeric id captured by ``_PROJECT_ID_RE`` from the config's
         ``id`` field, formatted as ``<id>_gym_project``.
      3. ``<job_id>_gym_project`` as the fallback.
    """
    for part in gym_config_path.relative_to(extract_dir).parts:
        if _PROJECT_NAME_RE.match(part):
            return part

    try:
        config = json.loads(gym_config_path.read_text(encoding="utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError):
        # Undecodable or malformed content: fall through to the job-id name.
        config = {}
    # Valid JSON need not be an object (e.g. a list); only a dict has ``id``.
    project_id = str(config.get("id", "")) if isinstance(config, dict) else ""
    match = _PROJECT_ID_RE.match(project_id)
    if match:
        return f"{match.group(1)}_gym_project"
    return f"{job_id}_gym_project"
+ ) + parser.add_argument( + "--server", + default=_DEFAULT_SERVER, + help=f"Image2Tabletop demo API server. Defaults to {_DEFAULT_SERVER}", + ) + parser.add_argument( + "--image", + default=str(_DEFAULT_IMAGE_INPUT), + help=( + "Input image file or directory. Defaults to " + f"{_DEFAULT_IMAGE_INPUT.as_posix()}" + ), + ) + parser.add_argument( + "--output-root", + default=None, + help=f"Directory where generated gym projects are written. Defaults to {_DEFAULT_OUTPUT_ROOT.as_posix()}", + ) + parser.add_argument( + "--download-dir", + dest="output_root", + default=None, + help=argparse.SUPPRESS, + ) + parser.add_argument("--poll-interval", type=float, default=10.0) + parser.add_argument( + "--skip-health-check", + action="store_true", + default=False, + help="Skip GET /health before submitting images.", + ) + parser.add_argument( + "--overwrite", + action="store_true", + default=False, + help="Replace an existing generated gym project with the same name.", + ) + args = parser.parse_args() + + image_paths = collect_image_paths(Path(args.image)) + if not args.skip_health_check: + check_health(args.server) + + project_paths = [] + for image_path in image_paths: + project_paths.append( + process_image( + server=args.server, + image_path=image_path, + output_root=Path(args.output_root or _DEFAULT_OUTPUT_ROOT), + poll_interval=args.poll_interval, + overwrite=args.overwrite, + ) + ) + + print("gym_project paths:", flush=True) + for project_path in project_paths: + print(project_path, flush=True) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/.gitignore b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/.gitignore new file mode 100644 index 00000000..ede6bbf2 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/.gitignore @@ -0,0 +1,4 @@ +# Python cache +__pycache__/ +*.py[cod] + diff --git 
a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/__init__.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/__init__.py new file mode 100644 index 00000000..bdac8600 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/__init__.py @@ -0,0 +1,57 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from .pipeline import ( + Prompt2GeometryRequest, + run_prompt2geometry, +) +from .config import ( + Prompt2GeometryConfig, + load_prompt2geometry_config, +) +from .llm_client import ( + OpenAICompatibleClient, + OpenAICompatibleClientError, +) +from .sam3_client import ( + SAM3Client, + SAM3ClientError, +) +from .sam3d_client import ( + SAM3DClient, + SAM3DClientError, +) +from .zimage_client import ( + ZImageClient, + ZImageClientError, +) + +__all__ = [ + "Prompt2GeometryRequest", + "Prompt2GeometryConfig", + "OpenAICompatibleClient", + "OpenAICompatibleClientError", + "SAM3Client", + "SAM3ClientError", + "SAM3DClient", + "SAM3DClientError", + "ZImageClient", + "ZImageClientError", + "run_prompt2geometry", + "load_prompt2geometry_config", +] diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.json b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.json new file mode 100644 index 00000000..740a5710 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.json @@ -0,0 +1,21 @@ +{ + "services": { + "zimage": { + "base_url": "http://192.168.3.23:5013" + }, + "sam3": { + "base_url": "http://192.168.3.23:5015" + }, + "sam3d": { + "base_url": "http://192.168.3.23:5016" + } + }, + "llm": { + "openai_compatible": { + "api_key": "", + "model": "gpt-5.5", + "base_url": "https://airouter.cloud/v1", + "timeout_s": 120 + } + } +} diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.py new file mode 100644 index 00000000..cf7dda1d --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.py @@ -0,0 +1,109 @@ +#
@dataclass(frozen=True)
class Prompt2GeometryConfig:
    """Prompt2Geometry runtime configuration.

    Immutable (frozen) value object holding the service endpoints and the
    LLM connection settings used by the prompt2geometry pipeline.
    """

    # Base URLs of the zimage, sam3 and sam3d services.
    zimage_base_url: str
    sam3_base_url: str
    sam3d_base_url: str
    # Settings for the OpenAI-compatible LLM endpoint.
    llm_api_key: str
    llm_model: str
    llm_base_url: str
    # Request timeout; presumably seconds, going by the ``_s`` suffix.
    llm_timeout_s: float
def _service_base_url(services: dict[str, Any], name: str) -> str:
    """Return the ``base_url`` of service ``name`` without trailing slashes.

    A missing or ``null`` ``base_url`` yields an empty string rather than the
    literal text ``"None"``.

    Raises:
        ValueError: If ``services[name]`` is not a JSON object.
    """
    section = _mapping(services.get(name), f"services.{name}")
    return str(section.get("base_url") or "").rstrip("/")


def _env_or_config(env_name: str, config_value: str) -> str:
    """Return the environment override for ``env_name`` or the config value.

    The environment variable wins only when it holds non-whitespace text, so
    a blank variable cannot silently erase the configured value. The result
    is stripped of surrounding whitespace.
    """
    env_value = (os.getenv(env_name) or "").strip()
    return env_value or str(config_value).strip()


def _mapping(value: Any, name: str) -> dict[str, Any]:
    """Return ``value`` if it is a dict, otherwise raise naming the bad key.

    Raises:
        ValueError: If ``value`` is not a dict.
    """
    if not isinstance(value, dict):
        raise ValueError(f"Prompt2Geometry config key {name} must be an object.")
    return value
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ----------------------------------------------------------------------------

from __future__ import annotations

import math
import time
from typing import Any

try:
    from .llm_client import OpenAICompatibleClient
except ImportError:
    # Relative import failed; fall back to importing the sibling module by
    # its bare name (presumably for running from inside this directory).
    from llm_client import OpenAICompatibleClient

__all__ = ["DIMENSION_ESTIMATION_SYSTEM_PROMPT", "estimate_real_dimensions"]


DIMENSION_ESTIMATION_SYSTEM_PROMPT = """

You are a careful real-world object size estimation assistant.



Estimate the plausible real-world bounding-box dimensions of one physical object
from the user's object description.



- Units are meters.
- length_m is the object's longest horizontal dimension.
- width_m is the object's shorter horizontal dimension.
- height_m is the vertical dimension when the object is in its common upright pose.
- Use common real-world size priors for everyday objects.
- If the object category is ambiguous, choose a conservative typical tabletop size.
- Do not include decorative background, shadows, or image canvas in the dimensions.



{
  "length_m": 0.08,
  "width_m": 0.08,
  "height_m": 0.08,
  "confidence": 0.7,
  "reason": "A typical apple is roughly 8 cm across."
}



- Output JSON only. Do not include markdown or text outside JSON.
- length_m, width_m, height_m, and confidence must be numbers.
- length_m, width_m, and height_m must be positive.
- confidence must be between 0 and 1.
- Keep reason short and specific.

""".strip()

# Keys that hold a bounding-box extent, in the order they are emitted.
_DIMENSION_KEYS = ("length_m", "width_m", "height_m")
# Every key the model is allowed to return.
_ALLOWED_KEYS = frozenset((*_DIMENSION_KEYS, "confidence", "reason"))
# Pause between two consecutive attempts, in seconds.
_RETRY_DELAY_S = 1.0


def estimate_real_dimensions(
    *,
    object_prompt: str,
    client: OpenAICompatibleClient,
    max_attempts: int | None = None,
) -> dict[str, Any]:
    """Estimate real-world object dimensions with schema validation and retry.

    Args:
        object_prompt: Free-text description of the object; it is stripped
            before being sent to the model.
        client: Chat client whose ``chat_json(messages=...)`` result is
            validated by ``_validate_dimension_output``.
        max_attempts: Upper bound on the number of requests. ``None`` (the
            default) retries without limit, so a permanently failing backend
            never returns; pass a positive integer to bound the loop.

    Returns:
        A dict with float ``length_m``, ``width_m``, ``height_m`` and
        ``confidence`` plus a stripped ``reason`` string.

    Raises:
        ValueError: If ``max_attempts`` attempts all failed. The last
            failure is attached as ``__cause__``.
    """
    messages = [
        {"role": "system", "content": DIMENSION_ESTIMATION_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                "Object description:\n"
                f"{object_prompt.strip()}\n\n"
                "Return the dimensions JSON only."
            ),
        },
    ]
    last_error: Exception | None = None
    attempt = 1
    while max_attempts is None or attempt <= max_attempts:
        try:
            raw = client.chat_json(messages=messages)
            return _validate_dimension_output(raw)
        except Exception as exc:
            # Transport errors and schema violations are both retried; keep
            # the most recent one so the final error can explain itself.
            last_error = exc
        attempt += 1
        # Only wait when another attempt will actually follow.
        if max_attempts is None or attempt <= max_attempts:
            time.sleep(_RETRY_DELAY_S)
    raise ValueError(
        "Failed to estimate object dimensions after " f"{max_attempts} attempts."
    ) from last_error


def _finite_number(raw: dict[str, Any], key: str) -> float:
    """Return ``raw[key]`` as a finite float or raise ``ValueError``."""
    value = raw.get(key)
    # bool is a subclass of int, so it must be rejected explicitly.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        raise ValueError(f"{key} must be a number.")
    try:
        number = float(value)
    except OverflowError as exc:
        # Integers too large for a double cannot be a real-world size.
        raise ValueError(f"{key} must be a finite number.") from exc
    # json.loads accepts NaN/Infinity, and NaN would slip past range checks.
    if not math.isfinite(number):
        raise ValueError(f"{key} must be a finite number.")
    return number


def _validate_dimension_output(raw: dict[str, Any]) -> dict[str, Any]:
    """Validate and normalize the model's dimension JSON.

    Args:
        raw: Decoded JSON object returned by the model.

    Returns:
        A new dict with keys ``length_m``, ``width_m``, ``height_m``,
        ``confidence`` (all floats) and ``reason`` (stripped string).

    Raises:
        ValueError: If ``raw`` is not an object, has unexpected keys, or any
            field is missing, non-numeric, non-finite, or out of range.
    """
    if not isinstance(raw, dict):
        raise ValueError("Dimension output must be a JSON object.")
    extra = set(raw) - _ALLOWED_KEYS
    if extra:
        raise ValueError(f"Unexpected dimension keys: {sorted(extra)}")

    result: dict[str, Any] = {}
    for key in _DIMENSION_KEYS:
        value = _finite_number(raw, key)
        if value <= 0:
            raise ValueError(f"{key} must be positive.")
        result[key] = value

    confidence = _finite_number(raw, "confidence")
    if confidence < 0 or confidence > 1:
        raise ValueError("confidence must be between 0 and 1.")

    reason = raw.get("reason")
    if not isinstance(reason, str) or not reason.strip():
        raise ValueError("reason must be a non-empty string.")

    result["confidence"] = confidence
    result["reason"] = reason.strip()
    return result
# ----------------------------------------------------------------------------
# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ----------------------------------------------------------------------------

from __future__ import annotations

import json
from http.client import HTTPException
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import record_llm_usage

__all__ = ["OpenAICompatibleClient", "OpenAICompatibleClientError"]


class OpenAICompatibleClientError(RuntimeError):
    """Raised when an OpenAI-compatible chat request fails."""


class OpenAICompatibleClient:
    """Small OpenAI-compatible chat completions client."""

    def __init__(
        self,
        *,
        api_key: str,
        model: str,
        base_url: str,
        timeout_s: float = 120.0,
        usage_stage: str | None = None,
    ):
        """Store the connection settings.

        Args:
            api_key: Bearer token sent in the ``Authorization`` header.
            model: Model name placed in every request payload.
            base_url: API root; ``/chat/completions`` is appended to it.
            timeout_s: Timeout in seconds passed to ``urlopen``.
            usage_stage: Stage label forwarded to ``record_llm_usage``;
                defaults to ``"prompt2geometry.chat_json"``.

        Raises:
            ValueError: If ``api_key``, ``model`` or ``base_url`` is blank.
        """
        # Normalize first: values read from env vars or files often carry a
        # trailing newline, which is illegal inside an HTTP header value.
        api_key = api_key.strip()
        model = model.strip()
        base_url = base_url.strip()
        if not api_key:
            raise ValueError("LLM api_key must be non-empty.")
        if not model:
            raise ValueError("LLM model must be non-empty.")
        if not base_url:
            raise ValueError("LLM base_url must be non-empty.")
        self.api_key = api_key
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.timeout_s = timeout_s
        self.usage_stage = usage_stage or "prompt2geometry.chat_json"

    def chat_json(self, *, messages: list[dict[str, str]]) -> dict[str, Any]:
        """Call chat completions and return the decoded JSON response content.

        Args:
            messages: Chat messages in OpenAI ``role``/``content`` form.

        Returns:
            The JSON object parsed from the first choice's message content.

        Raises:
            OpenAICompatibleClientError: On HTTP/transport failure, on a
                response envelope that does not match the expected shape, or
                when the message content is not a string.
            ValueError: If the message content is not a JSON object
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        payload = {
            "model": self.model,
            "messages": messages,
            "temperature": 0,
            "response_format": {"type": "json_object"},
        }
        request = Request(
            f"{self.base_url}/chat/completions",
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
            method="POST",
        )
        try:
            with urlopen(request, timeout=self.timeout_s) as response:
                # A non-UTF-8 body is reported below as "unsupported format"
                # instead of escaping as UnicodeDecodeError.
                body = response.read().decode("utf-8", errors="replace")
        except HTTPError as exc:
            # HTTPError must be handled before its base class URLError.
            detail = exc.read().decode("utf-8", errors="replace")
            raise OpenAICompatibleClientError(
                f"LLM request failed with HTTP {exc.code}: {detail}"
            ) from exc
        except URLError as exc:
            raise OpenAICompatibleClientError(
                f"LLM server is unreachable at {request.full_url}: {exc.reason}"
            ) from exc
        except TimeoutError as exc:
            raise OpenAICompatibleClientError(
                f"LLM request timed out after {self.timeout_s}s."
            ) from exc
        except (OSError, HTTPException) as exc:
            # Connection resets and truncated bodies raised while reading.
            raise OpenAICompatibleClientError(
                f"LLM request to {request.full_url} failed: {exc}"
            ) from exc

        try:
            decoded = json.loads(body)
            choice = decoded["choices"][0]
            content = choice["message"]["content"]
        except (KeyError, IndexError, TypeError, json.JSONDecodeError) as exc:
            raise OpenAICompatibleClientError(
                f"LLM response has unsupported format: {body}"
            ) from exc

        # The lookups above only succeed on JSON objects, so ``decoded`` and
        # ``choice`` are dicts from here on.
        usage = decoded.get("usage")
        response_id = decoded.get("id")
        finish_reason = choice.get("finish_reason")
        record_llm_usage(
            stage=self.usage_stage,
            provider="openai_compatible_http",
            model=str(decoded.get("model") or self.model),
            usage=usage,
            request_id=str(response_id) if response_id else None,
            finish_reason=str(finish_reason) if finish_reason else None,
            raw_usage=usage if isinstance(usage, dict) else None,
        )
        if not isinstance(content, str):
            raise OpenAICompatibleClientError("LLM message content must be a string.")
        return _parse_json_text(content)


def _parse_json_text(content: str) -> dict[str, Any]:
    """Parse model output as a JSON object, tolerating a Markdown code fence.

    Args:
        content: Raw message text, optionally wrapped in a code fence.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text is not valid JSON or is not a JSON object.
    """
    stripped = content.strip()
    if stripped.startswith("```"):
        lines = stripped.splitlines()
        # Drop the opening fence (which may carry a language tag) and the
        # closing fence when present.
        if lines and lines[0].startswith("```"):
            lines = lines[1:]
        if lines and lines[-1].startswith("```"):
            lines = lines[:-1]
        stripped = "\n".join(lines).strip()
    parsed = json.loads(stripped)
    if not isinstance(parsed, dict):
        raise ValueError("LLM output must be a JSON object.")
    return parsed
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ----------------------------------------------------------------------------

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

__all__ = ["scale_mesh_to_real_dimensions"]


def scale_mesh_to_real_dimensions(
    *,
    mesh_path: Path,
    output_path: Path,
    dimensions_m: dict[str, Any],
    report_path: Path,
) -> dict[str, Any]:
    """Uniformly rescale a GLB mesh and re-anchor it for Blender import.

    The input is treated as y-up. A single scale factor makes the mesh's
    largest bbox extent equal the largest of the three estimated dimensions.
    The input bbox bottom-center is moved to the origin, and the vertices are
    written so that, after applying the glTF-to-Blender axis matrix built in
    this module, the original +Y axis lies along Blender -Y.

    Args:
        mesh_path: Source GLB file.
        output_path: Destination for the scaled GLB; parents are created.
        dimensions_m: Mapping with ``length_m``, ``width_m`` and ``height_m``.
        report_path: Destination for the JSON report; parents are created.

    Returns:
        The report dict that is also written to ``report_path``.
    """
    trimesh = _require_trimesh()
    np = _require_numpy()
    src_path = mesh_path.expanduser().resolve()
    dst_path = output_path.expanduser().resolve()
    json_path = report_path.expanduser().resolve()

    loaded_scene = trimesh.load(str(src_path), force="scene")
    mesh = _scene_to_world_mesh(loaded_scene)
    source_bounds = _mesh_bounds(mesh)
    source_extents = source_bounds[1] - source_bounds[0]
    axis_map = _axis_mapping(source_extents)
    target_extents = np.asarray(
        [dimensions_m[axis_map[axis]] for axis in ("x", "y", "z")],
        dtype=np.float64,
    )

    # Guard against a zero-size mesh: fall back to a unit extent.
    source_max_extent = float(max(source_extents) or 1.0)
    target_max_extent = float(max(target_extents))
    uniform_scale = target_max_extent / source_max_extent
    scale = np.asarray([uniform_scale] * 3, dtype=np.float64)

    anchor = _bottom_center_y_up(source_bounds)
    gltf_to_blender = _gltf_y_up_to_blender_z_up_matrix(np)
    original_to_blender = _original_y_up_to_blender_negative_y_up_matrix(np)
    # Pre-compensate for the importer's axis change so the requested Blender
    # orientation is what results after import.
    original_to_export = np.linalg.inv(gltf_to_blender) @ original_to_blender
    linear_part = original_to_export @ np.diag(scale)
    transform = np.eye(4, dtype=np.float64)
    transform[:3, :3] = linear_part
    # The translation cancels the transformed anchor, i.e. anchor -> origin.
    transform[:3, 3] = -(linear_part @ anchor)
    mesh.apply_transform(transform)

    exported_bounds = _mesh_bounds(mesh)
    blender_bounds = _bounds(_transform_vertices(mesh.vertices, gltf_to_blender))
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    mesh.export(str(dst_path))

    report = {
        "input_mesh_path": str(src_path),
        "scaled_mesh_path": str(dst_path),
        "axis_convention": (
            "Input GLB is treated as y-up. After Blender's glTF import, the "
            "object's original y-up axis is aligned to Blender -Y. length_m "
            "maps to the larger generated horizontal axis among input x/z; "
            "width_m maps to the other."
        ),
        "scaling_policy": (
            "The mesh is scaled uniformly to preserve generated geometry "
            "proportions. The source mesh is first considered normalized by "
            "its maximum bbox extent; the uniform scale is computed as "
            "estimated_max_real_extent / mesh_max_extent."
        ),
        "origin_policy": (
            "The input y-up bbox bottom-center is subtracted before GLB export. "
            "After Blender import, the -Y-up bbox bottom-center is at "
            "(0, 0, 0), so its XZ-plane location is (0, 0)."
        ),
        "axis_map": axis_map,
        "estimated_dimensions_m": dimensions_m,
        "estimated_target_extents_by_mesh_axes": target_extents.tolist(),
        "source_max_extent": source_max_extent,
        "estimated_max_real_extent": target_max_extent,
        "original_bounds": source_bounds.tolist(),
        "original_extents": source_extents.tolist(),
        "bottom_center_y_up_subtracted": anchor.tolist(),
        "gltf_to_blender_matrix": gltf_to_blender.tolist(),
        "original_to_blender_matrix": original_to_blender.tolist(),
        "original_to_export_matrix": original_to_export.tolist(),
        "uniform_scale": uniform_scale,
        "applied_transform": transform.tolist(),
        "exported_gltf_bounds": exported_bounds.tolist(),
        "exported_gltf_extents": (exported_bounds[1] - exported_bounds[0]).tolist(),
        "blender_import_bounds": blender_bounds.tolist(),
        "blender_import_extents": (blender_bounds[1] - blender_bounds[0]).tolist(),
        "blender_import_bottom_center_negative_y_up": _bottom_center_negative_y_up(
            blender_bounds
        ).tolist(),
    }
    json_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(report, indent=2, ensure_ascii=False) + "\n"
    json_path.write_text(serialized, encoding="utf-8")
    return report


def _axis_mapping(extents: Any) -> dict[str, str]:
    """Assign ``length_m`` to the larger of the x/z extents (ties go to x)."""
    x_is_longer = float(extents[0]) >= float(extents[2])
    along_x, along_z = (
        ("length_m", "width_m") if x_is_longer else ("width_m", "length_m")
    )
    return {"x": along_x, "y": "height_m", "z": along_z}


def _bottom_center_negative_y_up(bounds: Any) -> Any:
    """Return the x/z bbox center at the maximum y (the base when -Y is up)."""
    np = _require_numpy()
    lower, upper = bounds[0], bounds[1]
    center_x = 0.5 * (lower[0] + upper[0])
    center_z = 0.5 * (lower[2] + upper[2])
    return np.asarray([center_x, upper[1], center_z], dtype=np.float64)


def _bottom_center_y_up(bounds: Any) -> Any:
    """Return the x/z bbox center at the minimum y (the base when +Y is up)."""
    np = _require_numpy()
    lower, upper = bounds[0], bounds[1]
    center_x = 0.5 * (lower[0] + upper[0])
    center_z = 0.5 * (lower[2] + upper[2])
    return np.asarray([center_x, lower[1], center_z], dtype=np.float64)


def _bounds(vertices: Any) -> Any:
    """Stack the per-axis minimum and maximum of ``vertices`` as two rows."""
    np = _require_numpy()
    minimum = vertices.min(axis=0)
    maximum = vertices.max(axis=0)
    return np.vstack([minimum, maximum])


def _transform_vertices(vertices: Any, matrix: Any) -> Any:
    """Apply ``matrix`` to each row vector of ``vertices``."""
    np = _require_numpy()
    points = np.asarray(vertices, dtype=np.float64)
    linear = np.asarray(matrix, dtype=np.float64)
    # Row vectors times the transpose equals the matrix times column vectors.
    return points @ linear.T


def _gltf_y_up_to_blender_z_up_matrix(np: Any) -> Any:
    """Return the 3x3 matrix mapping (x, y, z) to (x, -z, y)."""
    rows = [
        [1.0, 0.0, 0.0],
        [0.0, 0.0, -1.0],
        [0.0, 1.0, 0.0],
    ]
    return np.asarray(rows, dtype=np.float64)


def _original_y_up_to_blender_negative_y_up_matrix(np: Any) -> Any:
    """Return the 3x3 matrix mapping (x, y, z) to (x, -y, z).

    NOTE(review): this matrix has determinant -1, so it mirrors the geometry
    rather than rotating it - confirm that the mirroring is intended.
    """
    rows = [
        [1.0, 0.0, 0.0],
        [0.0, -1.0, 0.0],
        [0.0, 0.0, 1.0],
    ]
    return np.asarray(rows, dtype=np.float64)


def _mesh_bounds(mesh: Any) -> Any:
    """Return the 2x3 min/max bounds of ``mesh.vertices``.

    Raises:
        ValueError: If the mesh has no vertices.
    """
    np = _require_numpy()
    points = np.asarray(mesh.vertices, dtype=np.float64)
    if points.size == 0:
        raise ValueError("Mesh contains no vertices.")
    return _bounds(points)


def _scene_to_world_mesh(scene: Any) -> Any:
    """Flatten a loaded GLB scene into a single mesh.

    Calls ``scene.dump(concatenate=True)`` so that later axis conversion and
    origin anchoring operate directly on vertex coordinates.

    Raises:
        ValueError: If concatenation fails or yields no vertices.
    """
    try:
        merged = scene.dump(concatenate=True)
    except Exception as exc:
        raise ValueError("Failed to concatenate GLB scene into a mesh.") from exc
    has_vertices = hasattr(merged, "vertices") and len(merged.vertices) != 0
    if not has_vertices:
        raise ValueError("GLB scene contains no mesh vertices.")
    return merged


def _require_trimesh() -> Any:
    """Import and return ``trimesh``, with a task-specific error if missing."""
    try:
        import trimesh
    except ImportError as exc:
        raise ImportError("trimesh is required to scale GLB meshes.") from exc
    return trimesh


def _require_numpy() -> Any:
    """Import and return ``numpy``, with a task-specific error if missing."""
    try:
        import numpy as np
    except ImportError as exc:
        raise ImportError("numpy is required to scale GLB meshes.") from exc
    return np
# ----------------------------------------------------------------------------

from __future__ import annotations

import json
import os
import re
import shutil
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

try:
    from .dimensions import estimate_real_dimensions
    from .llm_client import OpenAICompatibleClient
    from .mesh_scaling import scale_mesh_to_real_dimensions
    from .sam3_client import SAM3Client
    from .sam3d_client import SAM3DClient
    from .schemas import SelectedBox
    from .segmentation_outputs import save_segmentation_outputs
    from .zimage_client import ZImageClient
except ImportError:
    # Relative imports failed; fall back to importing the sibling modules by
    # bare name (presumably for running from inside this directory).
    from dimensions import estimate_real_dimensions
    from llm_client import OpenAICompatibleClient
    from mesh_scaling import scale_mesh_to_real_dimensions
    from sam3_client import SAM3Client
    from sam3d_client import SAM3DClient
    from schemas import SelectedBox
    from segmentation_outputs import save_segmentation_outputs
    from zimage_client import ZImageClient

__all__ = ["Prompt2GeometryRequest", "run_prompt2geometry"]


@dataclass(frozen=True)
class Prompt2GeometryRequest:
    """Request for prompt-to-single-asset geometry generation."""

    # Object description; also used for dimension estimation and file naming.
    prompt: str
    # Working directory. Everything in it except the final GLB is deleted
    # when the run ends (see ``run_prompt2geometry``).
    output_root: Path
    target_id: str = "asset_0"
    request_id: str = "prompt2geometry_asset_0"
    # Final GLB file name; when unset the LLM derives one from the prompt.
    output_name: str | None = None
    zimage_base_url: str = "http://192.168.3.23:5013"
    zimage_width: int = 1024
    zimage_height: int = 1024
    zimage_seed: int = 42
    zimage_num_inference_steps: int = 8
    zimage_prompt_suffix: str = "a complete single object, with pure-black background"
    sam3_base_url: str = "http://192.168.3.23:5015"
    sam3d_base_url: str = "http://192.168.3.23:5016"
    sam3d_seed: int = 42
    # LLM settings; the PROMPT2GEOMETRY_LLM_* environment variables take
    # precedence over these fields.
    llm_api_key: str | None = None
    llm_model: str | None = None
    llm_base_url: str | None = None
    llm_timeout_s: float = 120.0
    # When true, stage progress is printed to stdout.
    verbose: bool = True


def run_prompt2geometry(request: Prompt2GeometryRequest) -> dict[str, Any]:
    """Run z-image, SAM3 segmentation, and SAM3D generation.

    Stages, in order: image generation, box segmentation, mask orientation
    check, raw mesh generation, LLM dimension estimation, final file naming,
    and mesh scaling.

    WARNING: on exit - success or failure - every entry inside
    ``request.output_root`` is deleted, including files that existed before
    the run. Only the final scaled GLB survives a successful run, so paths to
    intermediate files in the returned manifest no longer exist on disk.

    Args:
        request: Pipeline configuration.

    Returns:
        The run manifest (paths plus per-stage results).
    """
    output_root = request.output_root.expanduser().resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    final_glb_path: Path | None = None
    success = False
    try:
        _log_status(request, "start", f"output_root={output_root}")
        _write_json(
            output_root / "prompt2geometry_request.json",
            _request_manifest(request),
        )

        _log_status(request, "z-image", "generating source image")
        image_path, zimage_manifest = _generate_image(request, output_root)
        _log_status(request, "segmentation", "segmenting generated image")
        raw_mask_path, segmentation_manifest = _segment_image(
            request,
            image_path,
            output_root,
        )
        _log_status(request, "mask", "checking mask orientation with center prior")
        corrected_mask_path = _correct_mask_with_center_prior(
            image_path=image_path,
            raw_mask_path=raw_mask_path,
            output_dir=output_root / "mask_correction",
        )
        _log_status(request, "3D-generation", "generating raw mesh")
        generation_manifest = _generate_geometry(
            request=request,
            image_path=image_path,
            mask_path=corrected_mask_path,
            output_root=output_root,
        )
        _log_status(request, "dimensions", "estimating real-world dimensions")
        dimension_manifest = _estimate_dimensions(request, output_root)
        _log_status(request, "naming", "resolving final GLB file name")
        final_glb_path = _final_scaled_glb_path(request, output_root)
        _log_status(request, "scale", f"writing final mesh to {final_glb_path.name}")
        scaling_manifest = _scale_generated_mesh(
            mesh_path=Path(str(generation_manifest["local_glb_path"])),
            dimensions_m=dimension_manifest,
            output_path=final_glb_path,
            output_root=output_root,
        )
        manifest = {
            "prompt": request.prompt,
            "zimage_prompt": _zimage_prompt(request),
            "output_root": str(output_root),
            "image_path": str(image_path),
            "raw_mask_path": str(raw_mask_path),
            "corrected_mask_path": str(corrected_mask_path),
            "zimage": zimage_manifest,
            "sam3_segmentation": segmentation_manifest,
            "sam3d_generation": generation_manifest,
            "dimension_estimation": dimension_manifest,
            "mesh_scaling": scaling_manifest,
            "mesh_path": generation_manifest.get("local_glb_path"),
            "scaled_mesh_path": scaling_manifest.get("scaled_mesh_path"),
            "transform_metadata_path": generation_manifest.get(
                "local_transform_metadata_path"
            ),
        }
        _write_json(output_root / "prompt2geometry_result.json", manifest)
        success = True
        _log_status(request, "done", f"final_glb={final_glb_path}")
        return manifest
    finally:
        # Keep the GLB only when every stage succeeded; otherwise wipe all.
        _cleanup_output_root(output_root, keep_path=final_glb_path if success else None)


def _generate_image(
    request: Prompt2GeometryRequest,
    output_root: Path,
) -> tuple[Path, dict[str, Any]]:
    """Generate the source PNG and return its path with the client manifest."""
    image_path = output_root / "zimage" / "zimage.png"
    client = ZImageClient(base_url=request.zimage_base_url)
    manifest = client.generate_png(
        prompt=_zimage_prompt(request),
        output_path=image_path,
        width=request.zimage_width,
        height=request.zimage_height,
        seed=request.zimage_seed,
        num_inference_steps=request.zimage_num_inference_steps,
    )
    _write_json(output_root / "zimage" / "zimage_result.json", manifest)
    return image_path, manifest


def _segment_image(
    request: Prompt2GeometryRequest,
    image_path: Path,
    output_root: Path,
) -> tuple[Path, dict[str, Any]]:
    """Segment the image using a full-image box prompt.

    Returns:
        The resolved path of the first local mask and the local outputs
        manifest.

    Raises:
        RuntimeError: If no local mask was produced or its path is missing.
    """
    width, height = _image_size(image_path)
    full_image_box = SelectedBox(
        target_id=request.target_id,
        target_kind="asset",
        phrase=request.target_id,
        bbox_xyxy=[0.0, 0.0, float(width), float(height)],
        source_candidate_ids=["full_image_bbox"],
        selection_reason="Use the full generated image as a bbox prompt.",
    )
    sam3_client = SAM3Client(
        # The environment variable overrides the request field.
        base_url=os.getenv("PROMPT2GEOMETRY_SAM3_BASE_URL") or request.sam3_base_url,
    )
    health = sam3_client.health()
    _write_json(output_root / "sam3_health.json", health)

    sam3_request = {
        "image": str(image_path),
        "request_id": f"{request.request_id}_sam3_box",
        "selected_boxes": [full_image_box.to_manifest()],
        "save_visualizations": False,
    }
    _write_json(output_root / "sam3_box_segmentation_request.json", sam3_request)
    result = sam3_client.segment_boxes_image(
        image_path,
        selected_boxes=[full_image_box],
        request_id=f"{request.request_id}_sam3_box",
        save_visualizations=False,
        progress_path=output_root / "sam3_progress.jsonl",
        verbose=request.verbose,
    )
    _write_json(output_root / "sam3_segmentation_result.json", result)

    local_outputs = save_segmentation_outputs(
        image_path=image_path,
        segmentation_result=result,
        output_dir=output_root / "segment_box",
    )
    _write_json(output_root / "sam3_local_outputs.json", local_outputs)
    segmentations = local_outputs.get("segmentations", [])
    if not isinstance(segmentations, list) or not segmentations:
        raise RuntimeError("SAM3 box segmentation produced no local masks.")
    first = segmentations[0]
    mask_path = first.get("local_mask_path")
    if not isinstance(mask_path, str) or not mask_path:
        raise RuntimeError("SAM3 local segmentation output missing local_mask_path.")
    return Path(mask_path).expanduser().resolve(), local_outputs


def _generate_geometry(
    *,
    request: Prompt2GeometryRequest,
    image_path: Path,
    mask_path: Path,
    output_root: Path,
) -> dict[str, Any]:
    """Generate the raw GLB from image and mask and return the client result."""
    output_name = request.output_name or f"{request.request_id}.glb"
    local_glb_path = output_root / "sam3d" / output_name
    local_metadata_path = (
        output_root / "sam3d" / f"{Path(output_name).stem}_transform.json"
    )

    client = SAM3DClient(
        # The environment variable overrides the request field.
        base_url=os.getenv("PROMPT2GEOMETRY_SAM3D_BASE_URL") or request.sam3d_base_url,
    )
    health = client.health()
    _write_json(output_root / "sam3d_health.json", health)
    generation_request = {
        "image": str(image_path),
        "mask": str(mask_path),
        "request_id": request.request_id,
        "output_name": output_name,
        "prompt": request.prompt,
        "seed": request.sam3d_seed,
        "local_glb_path": str(local_glb_path),
        "local_transform_metadata_path": str(local_metadata_path),
    }
    _write_json(output_root / "sam3d_generation_request.json", generation_request)
    result = client.generate_asset(
        image_path=image_path,
        mask_path=mask_path,
        request_id=request.request_id,
        output_name=output_name,
        prompt=request.prompt,
        seed=request.sam3d_seed,
        output_path=local_glb_path,
        metadata_path=local_metadata_path,
        progress_path=output_root / "sam3d_progress.jsonl",
        verbose=request.verbose,
    )
    _write_json(output_root / "sam3d_generation_result.json", result)
    return result


def _estimate_dimensions(
    request: Prompt2GeometryRequest,
    output_root: Path,
) -> dict[str, Any]:
    """Estimate real-world dimensions with the LLM and persist them as JSON.

    ``estimate_real_dimensions`` is called without ``max_attempts``.
    """
    client = _llm_client_from_request(request, purpose="dimension estimation")
    dimensions = estimate_real_dimensions(
        object_prompt=request.prompt,
        client=client,
    )
    _write_json(output_root / "dimension_estimation.json", dimensions)
    return dimensions


def _scale_generated_mesh(
    *,
    mesh_path: Path,
    dimensions_m: dict[str, Any],
    output_path: Path,
    output_root: Path,
) -> dict[str, Any]:
    """Scale the raw mesh, writing the report next to the other outputs."""
    report_path = output_root / "mesh_scaling_report.json"
    return scale_mesh_to_real_dimensions(
        mesh_path=mesh_path,
        output_path=output_path,
        dimensions_m=dimensions_m,
        report_path=report_path,
    )


def _final_scaled_glb_path(
    request: Prompt2GeometryRequest,
    output_root: Path,
) -> Path:
    """Return the final GLB path, named by the request or by the LLM."""
    if request.output_name:
        stem = _safe_glb_stem(Path(request.output_name).stem)
    else:
        client = _llm_client_from_request(request, purpose="GLB file naming")
        stem = _extract_glb_stem_from_prompt(request.prompt, client)
    return output_root / f"{stem}.glb"


def _extract_glb_stem_from_prompt(
    prompt: str,
    client: OpenAICompatibleClient,
    max_attempts: int | None = None,
) -> str:
    """Ask the LLM for a snake_case file stem describing the prompt's object.

    Args:
        prompt: Object description; stripped before being sent.
        client: Chat client used for the request.
        max_attempts: Upper bound on the number of requests. ``None`` (the
            default) retries without limit, matching
            ``estimate_real_dimensions``.

    Returns:
        The sanitized file stem.

    Raises:
        ValueError: If ``max_attempts`` attempts all failed. The last
            failure is attached as ``__cause__``.
    """
    system_prompt = """

You extract a concise object file name from a prompt.



Return a JSON object with one field, object_name, containing a short ASCII
snake_case name for the single main object described by the user.



- Output JSON only.
- Required schema: {"object_name": "red_ceramic_mug"}
- object_name must be non-empty.
- Do not include a file extension.
- Use only lowercase English letters, numbers, and underscores.
- Prefer the concrete object noun with one or two useful modifiers.

""".strip()
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": (
                "Prompt:\n" f"{prompt.strip()}\n\n" "Return the object_name JSON only."
            ),
        },
    ]
    last_error: Exception | None = None
    attempt = 1
    while max_attempts is None or attempt <= max_attempts:
        try:
            raw = client.chat_json(messages=messages)
            return _validate_glb_stem_output(raw)
        except Exception as exc:
            # Transport errors and invalid names are both retried.
            last_error = exc
        attempt += 1
        # Only wait when another attempt will actually follow.
        if max_attempts is None or attempt <= max_attempts:
            time.sleep(1.0)
    raise ValueError(
        f"Failed to extract a GLB file name after {max_attempts} attempts."
    ) from last_error


def _validate_glb_stem_output(raw: dict[str, Any]) -> str:
    """Return the sanitized ``object_name`` from the model output.

    Raises:
        ValueError: If ``object_name`` is missing, not a string, or blank.
    """
    value = raw.get("object_name")
    if not isinstance(value, str) or not value.strip():
        raise ValueError("object_name must be a non-empty string.")
    return _safe_glb_stem(value)


def _safe_glb_stem(value: str) -> str:
    """Reduce ``value`` to a lowercase ``[a-z0-9_]`` file stem.

    Raises:
        ValueError: If nothing remains after sanitization.
    """
    stem = value.strip().lower()
    if stem.endswith(".glb"):
        stem = stem[:-4]
    # Each run of disallowed characters (underscores included) becomes one
    # underscore, so no separate collapsing pass is needed.
    stem = re.sub(r"[^a-z0-9]+", "_", stem).strip("_")
    if not stem:
        raise ValueError("GLB file name stem is empty after sanitization.")
    return stem


def _llm_client_from_request(
    request: Prompt2GeometryRequest,
    *,
    purpose: str,
) -> OpenAICompatibleClient:
    """Build an LLM client from environment variables or request fields.

    Args:
        request: Source of fallback settings and the timeout.
        purpose: Label used in the error message and the usage stage name.

    Raises:
        ValueError: If the API key, model or base URL is missing.
    """
    api_key = os.getenv("PROMPT2GEOMETRY_LLM_API_KEY") or request.llm_api_key
    model = os.getenv("PROMPT2GEOMETRY_LLM_MODEL") or request.llm_model
    base_url = os.getenv("PROMPT2GEOMETRY_LLM_BASE_URL") or request.llm_base_url
    missing = [
        name
        for name, value in {
            "PROMPT2GEOMETRY_LLM_API_KEY or --llm-api-key": api_key,
            "PROMPT2GEOMETRY_LLM_MODEL or --llm-model": model,
            "PROMPT2GEOMETRY_LLM_BASE_URL or --llm-base-url": base_url,
        }.items()
        if not value
    ]
    if missing:
        raise ValueError(f"Missing required LLM config for {purpose}: {missing}")
    return OpenAICompatibleClient(
        api_key=str(api_key),
        model=str(model),
        base_url=str(base_url),
        timeout_s=request.llm_timeout_s,
        usage_stage=f"prompt2geometry.{purpose}",
    )


def _cleanup_output_root(output_root: Path, *, keep_path: Path | None) -> None:
    """Delete every direct child of ``output_root`` except ``keep_path``.

    WARNING: entries that were not created by the pipeline are removed too.

    Args:
        output_root: Directory to empty.
        keep_path: File to preserve; ignored if it is ``None`` or not an
            existing regular file.
    """
    output_root = output_root.expanduser().resolve()
    keep_path = keep_path.expanduser().resolve() if keep_path is not None else None
    if keep_path is not None and not keep_path.is_file():
        keep_path = None
    for child in output_root.iterdir():
        if keep_path is not None and child.resolve() == keep_path:
            continue
        # Symlinks are unlinked rather than followed into their targets.
        if child.is_dir() and not child.is_symlink():
            shutil.rmtree(child)
        else:
            child.unlink()


def _write_mask_png(cv2: Any, path: Path, mask_bool: Any) -> None:
    """Write a boolean mask as a 0/255 PNG, failing loudly if the write fails."""
    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(str(path), mask_bool.astype("uint8") * 255):
        raise RuntimeError(f"Failed to write mask image: {path}")


def _correct_mask_with_center_prior(
    *,
    image_path: Path,
    raw_mask_path: Path,
    output_dir: Path,
) -> Path:
    """Choose between the raw mask and its inverse using a center prior.

    Each candidate is scored by ``_center_prior_score``. The inverse is used
    only when it scores strictly higher.

    Args:
        image_path: Source image; only its shape is used.
        raw_mask_path: Grayscale mask; any non-zero pixel is foreground.
        output_dir: Directory for the mask PNGs and the JSON report.

    Returns:
        Path of the corrected mask PNG.

    Raises:
        ValueError: If either file cannot be read or the shapes differ.
        RuntimeError: If a mask image cannot be written.
    """
    cv2 = _require_cv2()

    image = cv2.imread(str(image_path), cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Failed to read image for mask correction: {image_path}")
    raw_mask = cv2.imread(str(raw_mask_path), cv2.IMREAD_GRAYSCALE)
    if raw_mask is None:
        raise ValueError(f"Failed to read raw mask for correction: {raw_mask_path}")
    height, width = image.shape[:2]
    if raw_mask.shape[:2] != (height, width):
        raise ValueError(
            "Raw mask shape does not match image shape: "
            f"{raw_mask.shape[:2]} vs {(height, width)}"
        )

    output_dir.mkdir(parents=True, exist_ok=True)
    raw_bool = raw_mask > 0
    inverted_bool = ~raw_bool
    center_bool, edge_bool = _center_prior_regions(height, width)
    normal_score = _center_prior_score(raw_bool, center_bool, edge_bool)
    inverted_score = _center_prior_score(inverted_bool, center_bool, edge_bool)
    used_inverted = inverted_score["score"] > normal_score["score"]
    corrected_bool = inverted_bool if used_inverted else raw_bool

    raw_output = output_dir / "sam3_raw_mask.png"
    center_prior_output = output_dir / "center_prior_reference_mask.png"
    edge_prior_output = output_dir / "edge_prior_reference_mask.png"
    corrected_output = output_dir / "sam3_corrected_mask.png"
    _write_mask_png(cv2, raw_output, raw_bool)
    _write_mask_png(cv2, center_prior_output, center_bool)
    _write_mask_png(cv2, edge_prior_output, edge_bool)
    _write_mask_png(cv2, corrected_output, corrected_bool)
    _write_json(
        output_dir / "mask_correction_report.json",
        {
            "image_path": str(image_path),
            "raw_mask_path": str(raw_mask_path),
            "raw_mask_copy_path": str(raw_output),
            "center_prior_reference_mask_path": str(center_prior_output),
            "edge_prior_reference_mask_path": str(edge_prior_output),
            "corrected_mask_path": str(corrected_output),
            "normal_center_prior_score": normal_score,
            "inverted_center_prior_score": inverted_score,
            "used_inverted_mask": used_inverted,
            "raw_mask_area_ratio": float(raw_bool.mean()),
            "corrected_mask_area_ratio": float(corrected_bool.mean()),
            "foreground_rule": (
                "prefer masks with high center foreground density and low edge "
                "foreground density"
            ),
        },
    )
    return corrected_output


def _request_manifest(request: Prompt2GeometryRequest) -> dict[str, Any]:
    """Serialize the request for logging, omitting the API key value."""
    return {
        "prompt": request.prompt,
        "output_root": str(request.output_root.expanduser().resolve()),
        "target_id": request.target_id,
        "request_id": request.request_id,
        "output_name": request.output_name,
        "zimage_base_url": request.zimage_base_url,
        "zimage_width": request.zimage_width,
        "zimage_height": request.zimage_height,
        "zimage_seed": request.zimage_seed,
        "zimage_num_inference_steps": request.zimage_num_inference_steps,
        "zimage_prompt_suffix": request.zimage_prompt_suffix,
        "sam3_base_url": request.sam3_base_url,
        "sam3d_base_url": request.sam3d_base_url,
        "sam3d_seed": request.sam3d_seed,
        "llm_model": request.llm_model,
        "llm_base_url": request.llm_base_url,
        # Record only whether a key was supplied, never the key itself.
        "has_llm_api_key": bool(request.llm_api_key),
        "llm_timeout_s": request.llm_timeout_s,
        "verbose": request.verbose,
    }


def _zimage_prompt(request: Prompt2GeometryRequest) -> str:
    """Build the image prompt, appending any missing constraints.

    With an empty suffix the stripped prompt is returned unchanged.
    Otherwise "a complete single object" is appended unless the prompt
    already mentions "single object" or "one object", and the normalized
    background suffix is appended unless the prompt mentions "background".
    """
    prompt = request.prompt.strip()
    suffix = request.zimage_prompt_suffix.strip()
    if not suffix:
        return prompt
    lowered = prompt.lower()
    additions = []
    if "single object" not in lowered and "one object" not in lowered:
        additions.append("a complete single object")
    if "background" not in lowered:
        additions.append(_normalize_background_suffix(suffix))
    if not additions:
        return prompt
    return f"{prompt}, {', '.join(additions)}"


def _normalize_background_suffix(suffix: str) -> str:
    """Map a black/white background suffix to its canonical phrase.

    Any other suffix is returned unchanged.
    """
    lowered = suffix.lower()
    if "black background" in lowered or "pure-black background" in lowered:
        return "with pure-black background"
    if "white background" in lowered or "pure-white background" in lowered:
        return "with pure-white background"
    return suffix


def _image_size(image_path: Path) -> tuple[int, int]:
    """Return the image's ``size`` as reported by Pillow."""
    try:
        from PIL import Image
    except ImportError as exc:
        raise ImportError("Pillow is required to read generated image size.") from exc
    with Image.open(image_path) as image:
        return image.size


def _center_prior_regions(height: int, width: int) -> tuple[Any, Any]:
    """Return boolean masks for the central box and the border band.

    The center box spans 20%-80% of each axis. The border band is 8% of each
    dimension (at least one pixel) along all four edges.
    """
    np = _require_numpy()
    center_x1 = int(width * 0.2)
    center_x2 = int(width * 0.8)
    center_y1 = int(height * 0.2)
    center_y2 = int(height * 0.8)
    center_bool = np.zeros((height, width), dtype=bool)
    center_bool[center_y1:center_y2, center_x1:center_x2] = True

    edge_x = max(1, int(width * 0.08))
    edge_y = max(1, int(height * 0.08))
    edge_bool = np.zeros((height, width), dtype=bool)
    edge_bool[:edge_y, :] = True
    edge_bool[-edge_y:, :] = True
    edge_bool[:, :edge_x] = True
    edge_bool[:, -edge_x:] = True
    return center_bool, edge_bool


def _center_prior_score(
    mask_bool: Any,
    center_bool: Any,
    edge_bool: Any,
) -> dict[str, float]:
    """Score a mask as center foreground density minus edge density."""
    center_density = _masked_mean(mask_bool, center_bool)
    edge_density = _masked_mean(mask_bool, edge_bool)
    return {
        "score": center_density - edge_density,
        "center_foreground_density": center_density,
        "edge_foreground_density": edge_density,
    }


def _masked_mean(mask_bool: Any, region_bool: Any) -> float:
    """Return the foreground fraction inside ``region_bool`` (0.0 if empty)."""
    if not region_bool.any():
        return 0.0
    return float(mask_bool[region_bool].mean())


def _write_json(path: Path, payload: dict[str, Any]) -> None:
    """Write ``payload`` as indented UTF-8 JSON, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )


def _log_status(request: Prompt2GeometryRequest, stage: str, message: str) -> None:
    """Print a stage-tagged status line when ``request.verbose`` is set."""
    if request.verbose:
        print(f"[prompt2geometry:{stage}] {message}", flush=True)


def _require_cv2() -> Any:
    """Import and return ``cv2``, with a task-specific error if missing."""
    try:
        import cv2
    except ImportError as exc:
        raise ImportError("opencv-python is required for mask correction.") from exc
    return cv2


def _require_numpy() -> Any:
    """Import and return ``numpy``, with a task-specific error if missing."""
    try:
        import numpy as np
    except ImportError as exc:
        raise ImportError("numpy is required for mask correction.") from exc
    return np
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the prompt-to-geometry runner."""
    cli = argparse.ArgumentParser(
        description=(
            "Generate one object mesh from a prompt via z-image, segmentation, "
            "and 3D-generation."
        )
    )
    add = cli.add_argument
    add(
        "--prompt",
        required=True,
        help=(
            "Object description. Complete single-object and pure-black background "
            "constraints are appended automatically."
        ),
    )
    add(
        "--output-root",
        type=Path,
        default=Path("prompt2geometry_output"),
        help="Local output directory.",
    )
    add("--target-id", default="asset_0")
    add("--request-id", default="prompt2geometry_asset_0")
    add(
        "--output-name",
        default=None,
        help=(
            "Final scaled GLB file name. If omitted, the LLM extracts one "
            "from the prompt."
        ),
    )
    add(
        "--config",
        type=Path,
        default=None,
        help="Prompt2Geometry local config JSON path.",
    )
    # Service endpoints: None means "take the value from the config file".
    for endpoint_flag in ("--zimage-base-url", "--sam3-base-url", "--sam3d-base-url"):
        add(endpoint_flag, default=None)
    add(
        "--llm-api-key",
        default=None,
        help="OpenAI-compatible API key for real-world dimension estimation.",
    )
    add(
        "--llm-model",
        default=None,
        help="OpenAI-compatible model for real-world dimension estimation.",
    )
    add(
        "--llm-base-url",
        default=None,
        help="OpenAI-compatible base URL for real-world dimension estimation.",
    )
    add("--llm-timeout-s", type=float, default=None)
    add("--width", type=int, default=1024)
    add("--height", type=int, default=1024)
    add("--zimage-seed", type=int, default=42)
    add("--num-inference-steps", type=int, default=8)
    add(
        "--zimage-prompt-suffix",
        default="a complete single object, with pure-black background",
        help="Suffix appended to the object description before z-image generation.",
    )
    add("--sam3d-seed", type=int, default=42)
    add(
        "--quiet",
        action="store_true",
        help="Disable live progress logs.",
    )
    return cli


def main() -> None:
    """Run prompt-to-geometry from the command line.

    Parses the CLI flags, loads the local config, and runs the pipeline. For
    service URLs and LLM settings a truthy command-line value takes precedence
    over the config value. The pipeline result is printed as indented JSON.
    """
    args = _build_arg_parser().parse_args()
    cfg = load_prompt2geometry_config(args.config)

    request = Prompt2GeometryRequest(
        prompt=args.prompt,
        output_root=args.output_root,
        target_id=args.target_id,
        request_id=args.request_id,
        output_name=args.output_name,
        zimage_base_url=args.zimage_base_url or cfg.zimage_base_url,
        zimage_width=args.width,
        zimage_height=args.height,
        zimage_seed=args.zimage_seed,
        zimage_num_inference_steps=args.num_inference_steps,
        zimage_prompt_suffix=args.zimage_prompt_suffix,
        sam3_base_url=args.sam3_base_url or cfg.sam3_base_url,
        sam3d_base_url=args.sam3d_base_url or cfg.sam3d_base_url,
        sam3d_seed=args.sam3d_seed,
        llm_api_key=args.llm_api_key or cfg.llm_api_key,
        llm_model=args.llm_model or cfg.llm_model,
        llm_base_url=args.llm_base_url or cfg.llm_base_url,
        llm_timeout_s=args.llm_timeout_s or cfg.llm_timeout_s,
        verbose=not args.quiet,
    )
    outcome = run_prompt2geometry(request)
    print(json.dumps(outcome, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import json +import mimetypes +import time +import uuid +from pathlib import Path +from typing import Any +from urllib.error import HTTPError, URLError +from urllib.request import ProxyHandler, Request, build_opener + +try: + from .schemas import SelectedBox +except ImportError: + from schemas import SelectedBox + +__all__ = ["SAM3Client", "SAM3ClientError"] + + +class SAM3ClientError(RuntimeError): + """Raised when the SAM3 segmentation service fails.""" + + +class SAM3Client: + """Self-contained HTTP client for SAM3 box segmentation.""" + + def __init__( + self, + *, + base_url: str, + boxes_path: str = "/segment_boxes", + health_path: str = "/health", + timeout_s: float = 120.0, + poll_interval_s: float = 2.0, + ): + self.base_url = base_url.rstrip("/") + self.boxes_path = boxes_path + self.health_path = health_path + self.timeout_s = timeout_s + self.poll_interval_s = poll_interval_s + self._opener = build_opener(ProxyHandler({})) + + def health(self) -> dict[str, Any]: + """Check SAM3 service health.""" + request = Request( + self._url(self.health_path), + headers={"Accept": "application/json"}, + method="GET", + ) + return self._open_json_request(request) + + def segment_boxes_image( + self, + image_path: Path, + *, + selected_boxes: list[SelectedBox], + request_id: str | None = None, + save_visualizations: bool = False, + progress_path: Path | None = None, + verbose: bool = False, + ) -> dict[str, Any]: + """Segment an image using box prompts.""" + payload: dict[str, object] = { + "mode": "box", + "async": True, + "selected_boxes": [box.to_manifest() for box in selected_boxes], + "save_visualizations": save_visualizations, + } + if request_id is not None: + payload["request_id"] = request_id + result = self._post_multipart_json( + self.boxes_path, + payload=payload, + image_path=image_path, + ) + result = self._resolve_async_result( + 
result, + progress_path=progress_path, + verbose=verbose, + ) + _validate_segmentation_result(result) + return result + + def _post_multipart_json( + self, + path: str, + *, + payload: dict[str, object], + image_path: Path, + ) -> dict[str, Any]: + body, content_type = _build_multipart_body( + payload=payload, + image_path=image_path, + ) + request = Request( + self._url(path), + data=body, + headers={ + "Accept": "application/json", + "Content-Type": content_type, + }, + method="POST", + ) + return self._open_json_request(request) + + def _open_json_request(self, request: Request) -> dict[str, Any]: + try: + with self._opener.open(request, timeout=self.timeout_s) as response: + response_body = response.read().decode("utf-8") + except HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + raise SAM3ClientError( + f"SAM3 request to {request.full_url} failed with " + f"HTTP {exc.code}: {detail}" + ) from exc + except URLError as exc: + raise SAM3ClientError( + f"SAM3 server is unreachable at {request.full_url}: {exc.reason}" + ) from exc + except TimeoutError as exc: + raise SAM3ClientError( + f"SAM3 request to {request.full_url} timed out after " + f"{self.timeout_s}s." 
+ ) from exc + + try: + decoded = json.loads(response_body) + except json.JSONDecodeError as exc: + raise SAM3ClientError( + f"SAM3 server returned non-JSON: {response_body}" + ) from exc + if not isinstance(decoded, dict): + raise SAM3ClientError("SAM3 response must be a JSON object.") + return decoded + + def _resolve_async_result( + self, + result: dict[str, Any], + *, + progress_path: Path | None, + verbose: bool, + ) -> dict[str, Any]: + status = str(result.get("status") or "").lower() + status_url = result.get("status_url") + if status not in {"queued", "running"} or not isinstance(status_url, str): + _append_progress(progress_path, result) + _print_progress("segmentation", result, verbose=verbose) + return result + + _append_progress(progress_path, result) + _print_progress("segmentation", result, verbose=verbose) + while True: + time.sleep(self.poll_interval_s) + job = self._get_json(status_url) + _append_progress(progress_path, job) + _print_progress("segmentation", job, verbose=verbose) + job_status = str(job.get("status") or "").lower() + if job_status in {"queued", "running"}: + continue + if job_status == "succeeded": + final_result = job.get("result") + if not isinstance(final_result, dict): + raise SAM3ClientError("SAM3 async job succeeded without result.") + return final_result + if job_status == "failed": + raise SAM3ClientError(f"SAM3 async job failed: {job}") + raise SAM3ClientError(f"SAM3 async job returned unknown status: {job}") + + def _get_json(self, path: str) -> dict[str, Any]: + request = Request( + self._url(path), + headers={"Accept": "application/json"}, + method="GET", + ) + return self._open_json_request(request) + + def _url(self, path: str) -> str: + if path.startswith("http://") or path.startswith("https://"): + return path + normalized_path = path if path.startswith("/") else f"/{path}" + return f"{self.base_url}{normalized_path}" + + +def _build_multipart_body( + *, + payload: dict[str, object], + image_path: Path, +) -> 
tuple[bytes, str]: + image_path = image_path.expanduser().resolve() + if not image_path.is_file(): + raise FileNotFoundError(f"Image upload path is not a file: {image_path}") + + boundary = f"----prompt2geometry-sam3-{uuid.uuid4().hex}" + content_type = mimetypes.guess_type(image_path.name)[0] or "image/png" + chunks = [ + f"--{boundary}\r\n".encode("utf-8"), + b'Content-Disposition: form-data; name="payload"\r\n', + b"Content-Type: application/json\r\n\r\n", + json.dumps(payload).encode("utf-8"), + b"\r\n", + f"--{boundary}\r\n".encode("utf-8"), + ( + 'Content-Disposition: form-data; name="image"; ' + f'filename="{image_path.name}"\r\n' + ).encode("utf-8"), + f"Content-Type: {content_type}\r\n\r\n".encode("utf-8"), + image_path.read_bytes(), + b"\r\n", + f"--{boundary}--\r\n".encode("utf-8"), + ] + return b"".join(chunks), f"multipart/form-data; boundary={boundary}" + + +def _append_progress(progress_path: Path | None, payload: dict[str, Any]) -> None: + if progress_path is None: + return + progress_path = progress_path.expanduser().resolve() + progress_path.parent.mkdir(parents=True, exist_ok=True) + with progress_path.open("a", encoding="utf-8") as f: + f.write(json.dumps(payload, ensure_ascii=False) + "\n") + + +def _print_progress(stage: str, payload: dict[str, Any], *, verbose: bool) -> None: + if not verbose: + return + status = payload.get("status") or payload.get("ok") or "unknown" + job_id = payload.get("job_id") or payload.get("request_id") or payload.get("id") + progress = payload.get("progress") + parts = [f"[{stage}] status={status}"] + if job_id is not None: + parts.append(f"job={job_id}") + if progress is not None: + parts.append(f"progress={progress}") + print(" ".join(parts), flush=True) + + +def _validate_segmentation_result(result: dict[str, Any]) -> None: + if result.get("ok") is not True: + raise SAM3ClientError(f"SAM3 segmentation failed: {result}") + segmentations = result.get("segmentations") + if not isinstance(segmentations, list): + 
raise SAM3ClientError("SAM3 response missing segmentations list.") + for index, segmentation in enumerate(segmentations): + if not isinstance(segmentation, dict): + raise SAM3ClientError(f"SAM3 segmentation {index} must be an object.") + target_id = segmentation.get("target_id") + if not isinstance(target_id, str) or not target_id.strip(): + raise SAM3ClientError( + f"SAM3 segmentation {index} must contain target_id." + ) diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3d_client.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3d_client.py new file mode 100644 index 00000000..d8e4d8f8 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3d_client.py @@ -0,0 +1,324 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import json +import mimetypes +import time +import uuid +from pathlib import Path +from typing import Any +from urllib.error import HTTPError, URLError +from urllib.request import ProxyHandler, Request, build_opener + +__all__ = ["SAM3DClient", "SAM3DClientError"] + + +class SAM3DClientError(RuntimeError): + """Raised when the SAM3D service fails.""" + + +class SAM3DClient: + """Self-contained HTTP client for SAM3D image/mask-to-GLB generation.""" + + def __init__( + self, + *, + base_url: str, + generation_path: str = "/generate", + health_path: str = "/health", + timeout_s: float = 1800.0, + poll_interval_s: float = 5.0, + ): + self.base_url = base_url.rstrip("/") + self.generation_path = generation_path + self.health_path = health_path + self.timeout_s = timeout_s + self.poll_interval_s = poll_interval_s + self._opener = build_opener(ProxyHandler({})) + + def health(self) -> dict[str, Any]: + """Check SAM3D service health.""" + request = Request( + self._url(self.health_path), + headers={"Accept": "application/json"}, + method="GET", + ) + return self._open_json_request(request) + + def generate_asset( + self, + *, + image_path: Path, + mask_path: Path, + request_id: str, + output_name: str, + prompt: str, + seed: int, + output_path: Path, + metadata_path: Path, + progress_path: Path | None = None, + verbose: bool = False, + ) -> dict[str, Any]: + """Generate one 3D asset and download the returned GLB and metadata.""" + payload: dict[str, object] = { + "response_format": "json", + "async": True, + "request_id": request_id, + "output_name": output_name, + "prompt": prompt, + "seed": seed, + } + result = self._post_multipart_json( + self.generation_path, + payload=payload, + image_path=image_path, + mask_path=mask_path, + ) + result = self._resolve_async_result( + result, + progress_path=progress_path, + verbose=verbose, + ) + 
_validate_generation_result(result) + self._download_required(result, "glb_url", output_path, "model/gltf-binary") + result["local_glb_path"] = str(output_path.expanduser().resolve()) + self._download_required( + result, + "transform_metadata_url", + metadata_path, + "application/json", + ) + result["local_transform_metadata_path"] = str( + metadata_path.expanduser().resolve() + ) + return result + + def _post_multipart_json( + self, + path: str, + *, + payload: dict[str, object], + image_path: Path, + mask_path: Path, + ) -> dict[str, Any]: + body, content_type = _build_multipart_body( + payload=payload, + image_path=image_path, + mask_path=mask_path, + ) + request = Request( + self._url(path), + data=body, + headers={ + "Accept": "application/json", + "Content-Type": content_type, + }, + method="POST", + ) + return self._open_json_request(request) + + def _download_required( + self, + manifest: dict[str, Any], + key: str, + output_path: Path, + accept: str, + ) -> None: + url_path = manifest.get(key) + if not isinstance(url_path, str) or not url_path.strip(): + raise SAM3DClientError(f"SAM3D manifest missing {key}.") + url = url_path if url_path.startswith("http") else self._url(url_path) + output_path = output_path.expanduser().resolve() + output_path.parent.mkdir(parents=True, exist_ok=True) + request = Request(url, headers={"Accept": accept}, method="GET") + try: + with self._opener.open(request, timeout=self.timeout_s) as response: + output_path.write_bytes(response.read()) + except HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + raise SAM3DClientError( + f"SAM3D download from {url} failed with HTTP {exc.code}: {detail}" + ) from exc + except URLError as exc: + raise SAM3DClientError( + f"SAM3D server is unreachable at {url}: {exc.reason}" + ) from exc + + def _open_json_request(self, request: Request) -> dict[str, Any]: + try: + with self._opener.open(request, timeout=self.timeout_s) as response: + response_body = 
response.read().decode("utf-8") + except HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + raise SAM3DClientError( + f"SAM3D request to {request.full_url} failed with " + f"HTTP {exc.code}: {detail}" + ) from exc + except URLError as exc: + raise SAM3DClientError( + f"SAM3D server is unreachable at {request.full_url}: {exc.reason}" + ) from exc + except TimeoutError as exc: + raise SAM3DClientError( + f"SAM3D request to {request.full_url} timed out after " + f"{self.timeout_s}s." + ) from exc + + try: + decoded = json.loads(response_body) + except json.JSONDecodeError as exc: + raise SAM3DClientError( + f"SAM3D server returned non-JSON: {response_body}" + ) from exc + if not isinstance(decoded, dict): + raise SAM3DClientError("SAM3D response must be a JSON object.") + return decoded + + def _resolve_async_result( + self, + result: dict[str, Any], + *, + progress_path: Path | None, + verbose: bool, + ) -> dict[str, Any]: + status = str(result.get("status") or "").lower() + status_url = result.get("status_url") + if status not in {"queued", "running"} or not isinstance(status_url, str): + _append_progress(progress_path, result) + _print_progress("3D-generation", result, verbose=verbose) + return result + + _append_progress(progress_path, result) + _print_progress("3D-generation", result, verbose=verbose) + while True: + time.sleep(self.poll_interval_s) + job = self._get_json(status_url) + _append_progress(progress_path, job) + _print_progress("3D-generation", job, verbose=verbose) + job_status = str(job.get("status") or "").lower() + if job_status in {"queued", "running"}: + continue + if job_status == "succeeded": + final_result = job.get("result") + if not isinstance(final_result, dict): + raise SAM3DClientError("SAM3D async job succeeded without result.") + return final_result + if job_status == "failed": + raise SAM3DClientError(f"SAM3D async job failed: {job}") + raise SAM3DClientError(f"SAM3D async job returned unknown status: {job}") 
+ + def _get_json(self, path: str) -> dict[str, Any]: + request = Request( + self._url(path), + headers={"Accept": "application/json"}, + method="GET", + ) + return self._open_json_request(request) + + def _url(self, path: str) -> str: + if path.startswith("http://") or path.startswith("https://"): + return path + normalized_path = path if path.startswith("/") else f"/{path}" + return f"{self.base_url}{normalized_path}" + + +def _build_multipart_body( + *, + payload: dict[str, object], + image_path: Path, + mask_path: Path, +) -> tuple[bytes, str]: + image_path = image_path.expanduser().resolve() + mask_path = mask_path.expanduser().resolve() + if not image_path.is_file(): + raise FileNotFoundError(f"Image upload path is not a file: {image_path}") + if not mask_path.is_file(): + raise FileNotFoundError(f"Mask upload path is not a file: {mask_path}") + + boundary = f"----prompt2geometry-sam3d-{uuid.uuid4().hex}" + chunks = [ + _multipart_text(boundary, "payload", json.dumps(payload), "application/json"), + _multipart_file(boundary, "image", image_path), + _multipart_file(boundary, "mask", mask_path), + f"--{boundary}--\r\n".encode("utf-8"), + ] + return b"".join(chunks), f"multipart/form-data; boundary={boundary}" + + +def _append_progress(progress_path: Path | None, payload: dict[str, Any]) -> None: + if progress_path is None: + return + progress_path = progress_path.expanduser().resolve() + progress_path.parent.mkdir(parents=True, exist_ok=True) + with progress_path.open("a", encoding="utf-8") as f: + f.write(json.dumps(payload, ensure_ascii=False) + "\n") + + +def _print_progress(stage: str, payload: dict[str, Any], *, verbose: bool) -> None: + if not verbose: + return + status = payload.get("status") or payload.get("ok") or "unknown" + job_id = payload.get("job_id") or payload.get("request_id") or payload.get("id") + progress = payload.get("progress") + parts = [f"[{stage}] status={status}"] + if job_id is not None: + parts.append(f"job={job_id}") + if progress 
is not None: + parts.append(f"progress={progress}") + print(" ".join(parts), flush=True) + + +def _multipart_text( + boundary: str, + field_name: str, + value: str, + content_type: str, +) -> bytes: + return b"".join( + [ + f"--{boundary}\r\n".encode("utf-8"), + f'Content-Disposition: form-data; name="{field_name}"\r\n'.encode("utf-8"), + f"Content-Type: {content_type}\r\n\r\n".encode("utf-8"), + value.encode("utf-8"), + b"\r\n", + ] + ) + + +def _multipart_file(boundary: str, field_name: str, path: Path) -> bytes: + content_type = mimetypes.guess_type(path.name)[0] or "application/octet-stream" + return b"".join( + [ + f"--{boundary}\r\n".encode("utf-8"), + ( + f'Content-Disposition: form-data; name="{field_name}"; ' + f'filename="{path.name}"\r\n' + ).encode("utf-8"), + f"Content-Type: {content_type}\r\n\r\n".encode("utf-8"), + path.read_bytes(), + b"\r\n", + ] + ) + + +def _validate_generation_result(result: dict[str, Any]) -> None: + if result.get("ok") is not True: + raise SAM3DClientError(f"SAM3D generation failed: {result}") + glb_url = result.get("glb_url") + if not isinstance(glb_url, str) or not glb_url.strip(): + raise SAM3DClientError("SAM3D response missing glb_url.") diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/schemas.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/schemas.py new file mode 100644 index 00000000..b86ee284 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/schemas.py @@ -0,0 +1,46 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
@dataclass(frozen=True)
class SelectedBox:
    """One box prompt passed to the SAM3 segmentation service."""

    # Identifier echoed back by the service for this target.
    target_id: str
    target_kind: str
    # Text phrase associated with the box.
    phrase: str
    # Box corners as [x1, y1, x2, y2].
    bbox_xyxy: list[float]
    source_candidate_ids: list[str] = field(default_factory=list)
    # Optional free-text explanation; omitted from the manifest when None.
    selection_reason: str | None = None

    def to_manifest(self) -> dict[str, object]:
        """Convert the selected box to JSON-safe data.

        The list fields are copied so that mutating the returned manifest
        cannot alter this frozen instance.
        """
        manifest: dict[str, object] = {
            "target_id": self.target_id,
            "target_kind": self.target_kind,
            "phrase": self.phrase,
            "bbox_xyxy": list(self.bbox_xyxy),
            "source_candidate_ids": list(self.source_candidate_ids),
        }
        if self.selection_reason is not None:
            manifest["selection_reason"] = self.selection_reason
        return manifest
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +__all__ = ["save_segmentation_outputs"] + + +def save_segmentation_outputs( + *, + image_path: Path, + segmentation_result: dict[str, Any], + output_dir: Path, +) -> dict[str, Any]: + """Save local mask and transparent crop images from mask RLE output.""" + cv2 = _require_cv2() + image_path = image_path.expanduser().resolve() + output_dir = output_dir.expanduser().resolve() + if not image_path.is_file(): + raise FileNotFoundError(f"Segmentation source image not found: {image_path}") + + image = cv2.imread(str(image_path)) + if image is None: + raise ValueError(f"Failed to read segmentation source image: {image_path}") + + segmentations = segmentation_result.get("segmentations", []) + if not isinstance(segmentations, list): + raise ValueError("Segmentation result key segmentations must be a list.") + + output_dir.mkdir(parents=True, exist_ok=True) + local_segmentations = [] + height, width = image.shape[:2] + used_stems: set[str] = set() + + for index, segmentation in enumerate(segmentations): + if not isinstance(segmentation, dict): + continue + mask_rle = segmentation.get("mask_rle") or segmentation.get("segmentation") + if not isinstance(mask_rle, dict): + continue + mask_bool = _decode_mask_rle(mask_rle).astype(bool) + if mask_bool.shape[:2] != (height, width): + raise ValueError( + "Decoded mask shape does not match source image: " + f"{mask_bool.shape[:2]} vs {(height, 
width)}" + ) + + bbox = _bbox_from_segmentation(segmentation, mask_bool) + target_id = str(segmentation.get("target_id") or f"segment_{index}") + phrase = str(segmentation.get("phrase") or target_id) + file_stem = _unique_file_stem(_safe_name(target_id), used_stems) + mask_path = output_dir / f"{file_stem}_mask.png" + crop_path = output_dir / f"{file_stem}_crop.png" + + _save_mask(mask_bool, mask_path) + _save_transparent_crop(image, mask_bool, bbox, crop_path) + local_segmentations.append( + { + "target_id": target_id, + "target_kind": segmentation.get("target_kind"), + "phrase": phrase, + "bbox_xyxy": [float(value) for value in bbox], + "local_mask_path": str(mask_path), + "local_crop_path": str(crop_path), + } + ) + + manifest = { + "output_dir": str(output_dir), + "source_image_path": str(image_path), + "segmentations": local_segmentations, + "num_segmentations": len(local_segmentations), + } + manifest_path = output_dir / "segmentation_outputs.json" + manifest["manifest_path"] = str(manifest_path) + manifest_path.write_text( + json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + return manifest + + +def _decode_mask_rle(rle: dict[str, Any]) -> Any: + try: + from pycocotools import mask as mask_util + except ImportError: + mask_util = None + + if mask_util is not None: + try: + return mask_util.decode(rle) + except Exception: + pass + + np = _require_numpy() + size = rle.get("size") + if not isinstance(size, list) or len(size) != 2: + raise ValueError("Mask RLE must contain size [height, width].") + height, width = int(size[0]), int(size[1]) + counts = rle.get("counts") + if isinstance(counts, str): + runs = _decode_compressed_coco_rle_counts(counts) + elif isinstance(counts, list): + runs = [int(value) for value in counts] + else: + raise ValueError("Mask RLE counts must be a string or list.") + + flat = np.zeros(height * width, dtype=np.uint8) + offset = 0 + value = 0 + for run_length in runs: + next_offset = min(offset + 
int(run_length), flat.size) + if value == 1: + flat[offset:next_offset] = 1 + offset = next_offset + value = 1 - value + return flat.reshape((height, width), order="F") + + +def _decode_compressed_coco_rle_counts(counts: str) -> list[int]: + runs = [] + index = 0 + while index < len(counts): + value = 0 + shift = 0 + more = True + while more: + char_value = ord(counts[index]) - 48 + index += 1 + value |= (char_value & 0x1F) << shift + more = bool(char_value & 0x20) + shift += 5 + if not more and (char_value & 0x10): + value |= -1 << shift + if len(runs) > 2: + value += runs[-2] + runs.append(value) + return runs + + +def _bbox_from_segmentation( + segmentation: dict[str, Any], + mask_bool: Any, +) -> tuple[int, int, int, int]: + bbox = segmentation.get("bbox_xyxy") + if isinstance(bbox, list) and len(bbox) == 4: + return tuple(int(round(float(value))) for value in bbox) + + np = _require_numpy() + ys, xs = np.where(mask_bool) + if len(xs) == 0 or len(ys) == 0: + return 0, 0, 0, 0 + return int(xs.min()), int(ys.min()), int(xs.max() + 1), int(ys.max() + 1) + + +def _save_mask(mask_bool: Any, output_path: Path) -> None: + cv2 = _require_cv2() + cv2.imwrite(str(output_path), mask_bool.astype("uint8") * 255) + + +def _save_transparent_crop( + image: Any, + mask_bool: Any, + bbox: tuple[int, int, int, int], + output_path: Path, +) -> None: + cv2 = _require_cv2() + np = _require_numpy() + x1, y1, x2, y2 = _clip_bbox(bbox, image=image) + if x2 <= x1 or y2 <= y1: + return + crop_bgr = image[y1:y2, x1:x2] + crop_mask = mask_bool[y1:y2, x1:x2].astype("uint8") * 255 + crop_bgra = np.dstack([crop_bgr, crop_mask]) + cv2.imwrite(str(output_path), crop_bgra) + + +def _clip_bbox( + bbox: tuple[int, int, int, int], + *, + image: Any, +) -> tuple[int, int, int, int]: + height, width = image.shape[:2] + x1, y1, x2, y2 = bbox + return ( + max(0, min(width, x1)), + max(0, min(height, y1)), + max(0, min(width, x2)), + max(0, min(height, y2)), + ) + + +def _safe_name(value: str) -> str: + 
safe = "".join( + char if char.isalnum() or char in {"-", "_"} else "_" + for char in value.strip().lower() + ) + return safe or "object" + + +def _unique_file_stem(stem: str, used_stems: set[str]) -> str: + if stem not in used_stems: + used_stems.add(stem) + return stem + suffix = 1 + while f"{stem}_{suffix}" in used_stems: + suffix += 1 + unique_stem = f"{stem}_{suffix}" + used_stems.add(unique_stem) + return unique_stem + + +def _require_cv2() -> Any: + try: + import cv2 + except ImportError as exc: + raise ImportError( + "opencv-python is required to save segmentation outputs." + ) from exc + return cv2 + + +def _require_numpy() -> Any: + try: + import numpy as np + except ImportError as exc: + raise ImportError("numpy is required to save segmentation outputs.") from exc + return np diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/zimage_client.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/zimage_client.py new file mode 100644 index 00000000..e9d7b287 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/zimage_client.py @@ -0,0 +1,115 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any +from urllib.error import HTTPError, URLError +from urllib.request import ProxyHandler, Request, build_opener + +__all__ = ["ZImageClient", "ZImageClientError"] + + +class ZImageClientError(RuntimeError): + """Raised when the z-image service request fails.""" + + +class ZImageClient: + """HTTP client for the deployed z-image PNG generation service.""" + + def __init__( + self, + *, + base_url: str = "http://192.168.3.23:5013", + generation_path: str = "/generate.png", + timeout_s: float = 300.0, + ): + """Initialize the z-image client.""" + self.base_url = base_url.rstrip("/") + self.generation_path = generation_path + self.timeout_s = timeout_s + self._opener = build_opener(ProxyHandler({})) + + def generate_png( + self, + *, + prompt: str, + output_path: Path, + width: int = 1024, + height: int = 1024, + seed: int = 42, + num_inference_steps: int = 8, + ) -> dict[str, Any]: + """Generate a PNG image and write it to ``output_path``.""" + payload = { + "prompt": prompt, + "width": width, + "height": height, + "seed": seed, + "num_inference_steps": num_inference_steps, + } + body = json.dumps(payload).encode("utf-8") + request = Request( + self._url(self.generation_path), + data=body, + headers={ + "Accept": "image/png", + "Content-Type": "application/json", + }, + method="POST", + ) + output_path = output_path.expanduser().resolve() + output_path.parent.mkdir(parents=True, exist_ok=True) + try: + with self._opener.open(request, timeout=self.timeout_s) as response: + content = response.read() + except HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + raise ZImageClientError( + f"z-image request to {request.full_url} failed with " + f"HTTP {exc.code}: {detail}" + ) from exc + except URLError as exc: + raise ZImageClientError( + f"z-image server is unreachable at 
{request.full_url}: {exc.reason}" + ) from exc + except TimeoutError as exc: + raise ZImageClientError( + f"z-image request to {request.full_url} timed out after " + f"{self.timeout_s}s." + ) from exc + + if not content: + raise ZImageClientError("z-image server returned an empty image response.") + output_path.write_bytes(content) + return { + "provider": "z-image", + "base_url": self.base_url, + "generation_path": self.generation_path, + "prompt": prompt, + "width": width, + "height": height, + "seed": seed, + "num_inference_steps": num_inference_steps, + "output_path": str(output_path), + "num_bytes": len(content), + } + + def _url(self, path: str) -> str: + normalized_path = path if path.startswith("/") else f"/{path}" + return f"{self.base_url}{normalized_path}" diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/__init__.py b/embodichain/gen_sim/action_agent_pipeline/prompts/__init__.py new file mode 100644 index 00000000..88168e41 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/__init__.py @@ -0,0 +1,21 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from .task_prompt import TaskPrompt + +__all__ = ["TaskPrompt"] diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt new file mode 100644 index 00000000..541763c8 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt @@ -0,0 +1,59 @@ +### Atomic Action Class JSON Specs for Robot Arm Control + +Each non-null graph edge action must be a JSON object with these common fields +and exactly one target field: + +{ + "atomic_action_class": "PickUpAction|MoveAction|PlaceAction", + "robot_name": "left_arm|right_arm", + "control": "arm|hand", + "cfg": {} +} + +Use only these atomic action classes: + +1. `PickUpAction` + - Required target_object: + {"obj_name": "", "affordance": "antipodal"} + - Typical cfg: + {"pre_grasp_distance": 0.08, "sample_interval": 45} + +2. `MoveAction` + - Use `control: "arm"` with target_pose or arm target_qpos. + - Use `control: "hand"` with gripper target_qpos. + - Supported target_pose objects: + {"reference": "object", "obj_name": "", "offset": [x, y, z], "orientation": "current"} + {"reference": "absolute", "position": [x, y, z], "orientation": "current"} + {"reference": "relative", "offset": [dx, dy, dz], "frame": "world|eef"} + - Supported target_qpos objects: + {"source": "initial"} + {"source": "gripper_state", "state": "open|close"} + {"source": "joint_delta", "joint_index": 5, "delta_degrees": -90} + - Typical cfg: + {"sample_interval": 30} + - For release settling after an open gripper target, use: + {"sample_interval": 15, "post_hold_steps": 25} + +3. `PlaceAction` + - Use this only when a single place action should lower, open, and retreat. + - Required target_pose. Supported pose targets are the same target_pose objects + accepted by `MoveAction`. 
+ - Typical cfg: + {"sample_interval": 80, "lift_height": 0.08} + +Rules: +- Do not output Python code, function calls, or `fn`/`kwargs` action objects. +- Do not output legacy `action`-based specs. +- Use `null` for an idle arm. +- Keep all values JSON primitives. +- Each non-null action must contain exactly one of `target_object`, `target_pose`, + or `target_qpos`. +- Preserve current orientation by setting `"orientation": "current"` for pose targets. +- To keep a holding arm closed while the other arm moves, use: + { + "atomic_action_class": "MoveAction", + "robot_name": "", + "control": "hand", + "cfg": {"sample_interval": 10}, + "target_qpos": {"source": "gripper_state", "state": "close"} + } diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/basic_background.txt b/embodichain/gen_sim/action_agent_pipeline/prompts/basic_background.txt new file mode 100644 index 00000000..3a84455e --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/basic_background.txt @@ -0,0 +1,20 @@ +The environment uses a right-handed world coordinate system, where 1 unit equals 1 meter. +All robot poses are represented as 4×4 homogeneous transformation matrices. + +The robot base coordinate frame is the ONLY authoritative frame for all spatial reasoning, planning, and action generation. + +ROBOT BASE COORDINATE DEFINITIONS + +All directions below are defined strictly in the robot base frame: + +* Moving forward increases x +* Moving backward decreases x +* Moving left increases y +* Moving right decreases y +* Moving up increases z +* Moving down decreases z + +ROBOT INITIALIZATION AND TERMINATION + +Both robot arms start in predefined initial configurations with their end-effectors open. +At task completion, both arms must be returned to their initial poses. 
\ No newline at end of file diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/task_prompt.py b/embodichain/gen_sim/action_agent_pipeline/prompts/task_prompt.py new file mode 100644 index 00000000..8f512216 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/task_prompt.py @@ -0,0 +1,122 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from typing import Any + +import torch +from langchain_core.messages import SystemMessage +from langchain_core.prompts import ( + ChatPromptTemplate, + HumanMessagePromptTemplate, +) +from embodichain.utils.utility import encode_image + +__all__ = ["TaskPrompt"] + + +class TaskPrompt: + @staticmethod + def generate_task_graph(observations: dict[str, Any], **kwargs: Any) -> Any: + """Build a prompt that asks the task agent for a nominal JSON graph.""" + schema = """{ + "task": "", + "start": "v0_start", + "goal": "vN_done", + "nodes": [ + {"id": "v0_start", "semantic": ""}, + {"id": "v1_", "semantic": ""} + ], + "edges": [ + { + "id": "e01_", + "source": "v0_start", + "target": "v1_", + "left_arm_action": { + "atomic_action_class": "PickUpAction|MoveAction|PlaceAction", + "robot_name": "left_arm|right_arm", + "control": "arm|hand", + "target_object": {"obj_name": "", "affordance": "antipodal"}, + "cfg": {} + }, + "right_arm_action": null + } + ] +}""" + + observation = ( + observations["rgb"].cpu().numpy() + if isinstance(observations["rgb"], torch.Tensor) + else observations["rgb"] + ) + kwargs.update( + { + "graph_schema": schema, + "observation": encode_image(observation), + } + ) + + prompt = ChatPromptTemplate.from_messages( + [ + SystemMessage( + content=( + "You are a precise robotic manipulation graph planner. " + "Given a camera observation and task description, produce only " + "the nominal atomic-action graph. Do not add failure monitors, " + "error injection, recovery branches, Python code, or prose. " + "All actions must strictly use the provided atomic action class JSON specs." 
+ ) + ), + HumanMessagePromptTemplate.from_template( + [ + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,{observation}", + }, + }, + { + "type": "text", + "text": ( + "Use the current camera observation and context below to " + "generate a nominal atomic-action graph for the task.\n\n" + "**Environment background:**\n{basic_background}\n\n" + '**Task goal:**\n"{task_prompt}"\n\n' + "**Available atomic actions:**\n{atom_actions}\n\n" + "**Required JSON schema:**\n" + "{graph_schema}\n\n" + "Rules:\n" + "- Output exactly one JSON object and nothing else.\n" + "- The nominal graph must be one deterministic start-to-goal chain with no branches, cycles, or orphan edges.\n" + "- Each edge is one semantic task step from source node to target node.\n" + "- Every edge must define at least one non-null arm action.\n" + "- Use `null` for an idle arm action.\n" + "- Each non-null arm action must use the atomic action class JSON spec with `atomic_action_class`, `robot_name`, `control`, `cfg`, and exactly one of `target_object`, `target_pose`, or `target_qpos`.\n" + "- Do not output legacy function calls, `action`-based specs, or `fn`/`kwargs` action objects.\n" + "- Put only JSON primitives inside action specs: strings, numbers, booleans, null, arrays, or objects.\n" + "- Do not include `env`, tensors, comments, validation conditions, monitors, errors, or recovery fields.\n" + "- Preserve task order and use both arms on the same edge when they should act simultaneously.\n" + "- Use stable ids such as `v0_start`, `v1_grasped`, `e01_grasp_objects`.\n" + "- Replace `N` with the concrete final step index; do not literally output `vN_done`.\n" + "- The final edge target must equal the `goal` field." 
+ ), + }, + ] + ), + ] + ) + return prompt.invoke(kwargs) diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/__init__.py b/embodichain/gen_sim/action_agent_pipeline/runtime/__init__.py new file mode 100644 index 00000000..a6bb7005 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/__init__.py @@ -0,0 +1,21 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +"""Runtime graph compilation and atomic-action execution.""" + +__all__: list[str] = [] diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py new file mode 100644 index 00000000..6bfabdfe --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py @@ -0,0 +1,170 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import ast +from typing import List + +from embodichain.utils.logger import log_error + + +def _available_arm_sides(env) -> list[str]: + sides = [] + for side in ("left", "right"): + if len(getattr(env, f"{side}_arm_joints", []) or []) > 0: + sides.append(side) + return sides + + +def resolve_arm_side(env, robot_name: str) -> str: + """Resolve robot_name to an available left/right graph slot.""" + name = robot_name or "" + if "right" in name: + side = "right" + elif "left" in name: + side = "left" + else: + sides = _available_arm_sides(env) + side = "right" if sides == ["right"] else "left" + + if side not in _available_arm_sides(env): + log_error( + f"Requested {side}_arm for robot_name='{robot_name}', but available " + f"control parts are {getattr(env.robot, 'control_parts', None)}.", + error_type=ValueError, + ) + return side + + +def get_arm_states(env, robot_name): + """Get the current state of the specified robot arm. + + Args: + env: The simulation environment. + robot_name: Name of the robot arm (should contain "left" or "right"). 
+ + Returns: + Tuple of (is_left, select_arm, current_qpos, current_pose, current_gripper_state): + - is_left: bool, whether this is the left arm + - select_arm: str, arm identifier ("left_arm" or "right_arm") + - current_qpos: Current joint positions + - current_pose: Current end-effector pose (4x4 matrix) + - current_gripper_state: Current gripper state + """ + left_arm_current_qpos, right_arm_current_qpos = env.get_current_qpos_agent() + left_arm_current_pose, right_arm_current_pose = env.get_current_xpos_agent() + left_arm_current_gripper_state, right_arm_current_gripper_state = ( + env.get_current_gripper_state_agent() + ) + + side = resolve_arm_side(env, robot_name) + is_left = True if side == "left" else False + if hasattr(env, "get_agent_arm_control_part"): + select_arm = env.get_agent_arm_control_part(is_left) + else: + select_arm = "left_arm" if is_left else "right_arm" + + arms = { + "left": ( + left_arm_current_qpos, + left_arm_current_pose, + left_arm_current_gripper_state, + ), + "right": ( + right_arm_current_qpos, + right_arm_current_pose, + right_arm_current_gripper_state, + ), + } + ( + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) = arms[side] + + return ( + is_left, + select_arm, + select_arm_current_qpos, + select_arm_current_pose, + select_arm_current_gripper_state, + ) + + +def extract_drive_calls(code_str: str) -> List[str]: + """Extract all drive() function calls from a code string. + + Args: + code_str: Python code string to parse. + + Returns: + List of code blocks containing drive() calls. + """ + tree = ast.parse(code_str) + lines = code_str.splitlines() + + drive_blocks = [] + + for node in tree.body: + # Match: drive(...) 
+ if ( + isinstance(node, ast.Expr) + and isinstance(node.value, ast.Call) + and isinstance(node.value.func, ast.Name) + and node.value.func.id == "drive" + ): + # AST line numbers are 1-based + start = node.lineno - 1 + end = node.end_lineno + block = "\n".join(lines[start:end]) + drive_blocks.append(block) + + return drive_blocks + + +def apply_offset_to_pose(pose, offset: list): + pose[0, 3] += offset[0] + pose[1, 3] += offset[1] + pose[2, 3] += offset[2] + return pose + + +def resolve_action(action, env, kwargs): + if callable(action): + return action(env=env, **kwargs) + return action + + +def sync_agent_state_from_robot(env) -> None: + """Synchronize cached agent arm states from the physical robot state.""" + action = env.robot.get_qpos().squeeze(0) + for side in ("left", "right"): + is_left = side == "left" + arm_joints = getattr(env, f"{side}_arm_joints", []) + eef_joints = getattr(env, f"{side}_eef_joints", []) + if arm_joints: + arm_qpos = action[arm_joints] + env.set_current_qpos_agent(arm_qpos, is_left=is_left) + env.set_current_xpos_agent( + env.get_arm_fk(qpos=arm_qpos, is_left=is_left), + is_left=is_left, + ) + if eef_joints: + env.set_current_gripper_state_agent( + action[eef_joints][0].unsqueeze(0), + is_left=is_left, + ) diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py new file mode 100644 index 00000000..876f28df --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -0,0 +1,1046 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import hashlib +import os +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Any, Mapping + +import numpy as np +import torch +from tqdm import tqdm + +from embodichain.gen_sim.action_agent_pipeline.runtime.atom_action_utils import ( + get_arm_states, + resolve_arm_side, +) +from embodichain.lab.sim.atomic_actions import ( + AntipodalAffordance, + MoveAction, + MoveActionCfg, + ObjectSemantics, + PickUpAction, + PickUpActionCfg, + PlaceAction, + PlaceActionCfg, +) +from embodichain.lab.sim.planners import MotionGenerator, MotionGenCfg, ToppraPlannerCfg +from embodichain.toolkits.graspkit.pg_grasp import ( + AntipodalSamplerCfg, + GraspGeneratorCfg, + GripperCollisionCfg, +) +from embodichain.toolkits.graspkit.pg_grasp.antipodal_generator import ( + GRASP_ANNOTATOR_CACHE_DIR, +) +from embodichain.utils.logger import log_info +from embodichain.utils.math import get_offset_pose + +__all__ = [ + "AtomicActionSpec", + "execute_atomic_action", + "execute_parallel_atomic_actions", + "normalize_atomic_action_spec", +] + + +SUPPORTED_ATOMIC_ACTION_CLASSES = {"PickUpAction", "MoveAction", "PlaceAction"} +SUPPORTED_CONTROLS = {"arm", "hand"} +TARGET_SPEC_FIELDS = ("target_object", "target_pose", "target_qpos") +SUPPORTED_POSE_REFERENCES = {"object", "absolute", "relative"} +SUPPORTED_QPOS_SOURCES = {"initial", "gripper_state", "joint_delta"} +SUPPORTED_CFG_KEYS = { + "sample_interval", + "pre_grasp_distance", + 
"lift_height", + "hand_interp_steps", + "post_hold_steps", +} + + +ATOMIC_ACTION_REGISTRY = { + "PickUpAction": (PickUpAction, PickUpActionCfg), + "MoveAction": (MoveAction, MoveActionCfg), + "PlaceAction": (PlaceAction, PlaceActionCfg), +} + + +@dataclass(frozen=True) +class AtomicActionSpec: + """JSON-serializable atomic action specification.""" + + atomic_action_class: str + robot_name: str + control: str = "arm" + target_object: dict[str, Any] = field(default_factory=dict) + target_pose: dict[str, Any] = field(default_factory=dict) + target_qpos: dict[str, Any] = field(default_factory=dict) + cfg: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_mapping(cls, spec: Mapping[str, Any]) -> "AtomicActionSpec": + normalized = normalize_atomic_action_spec(spec) + return cls( + atomic_action_class=normalized["atomic_action_class"], + robot_name=normalized["robot_name"], + control=normalized["control"], + target_object=normalized.get("target_object", {}), + target_pose=normalized.get("target_pose", {}), + target_qpos=normalized.get("target_qpos", {}), + cfg=normalized["cfg"], + ) + + def to_dict(self) -> dict[str, Any]: + spec = { + "atomic_action_class": self.atomic_action_class, + "robot_name": self.robot_name, + "control": self.control, + "cfg": deepcopy(self.cfg), + } + if self.target_object: + spec["target_object"] = deepcopy(self.target_object) + if self.target_pose: + spec["target_pose"] = deepcopy(self.target_pose) + if self.target_qpos: + spec["target_qpos"] = deepcopy(self.target_qpos) + return spec + + +def normalize_atomic_action_spec(spec: Mapping[str, Any]) -> dict[str, Any]: + """Validate and normalize an atomic action JSON spec.""" + if not isinstance(spec, Mapping): + raise TypeError(f"Action spec must be a mapping, got {type(spec)}.") + if "fn" in spec: + raise ValueError( + "Legacy fn/kwargs action schema is not supported. 
Use atomic action class " + "JSON spec with atomic_action_class, robot_name, control, cfg, and " + "exactly one of target_object, target_pose, or target_qpos." + ) + + if "action" in spec: + raise ValueError( + "Legacy action schema is not supported. Use atomic_action_class with " + "PickUpAction, MoveAction, or PlaceAction." + ) + if "target" in spec: + raise ValueError( + "Legacy target.kind schema is not supported. Use exactly one of " + "target_object, target_pose, or target_qpos." + ) + + atomic_action_class = spec.get("atomic_action_class") + if atomic_action_class not in SUPPORTED_ATOMIC_ACTION_CLASSES: + raise ValueError( + f"Unsupported atomic action class {atomic_action_class!r}; expected " + f"one of {sorted(SUPPORTED_ATOMIC_ACTION_CLASSES)}." + ) + + robot_name = spec.get("robot_name") + if not isinstance(robot_name, str) or not robot_name: + raise ValueError("Atomic action spec requires non-empty robot_name.") + + control = spec.get("control", "arm") + if control not in SUPPORTED_CONTROLS: + raise ValueError( + f"Unsupported atomic action control {control!r}; expected one of " + f"{sorted(SUPPORTED_CONTROLS)}." + ) + + cfg = dict(spec.get("cfg") or {}) + unknown_cfg = set(cfg) - SUPPORTED_CFG_KEYS + if unknown_cfg: + raise ValueError( + f"Unsupported atomic action cfg keys: {', '.join(sorted(unknown_cfg))}." 
+ ) + + target_field, target_spec = _normalize_action_target( + spec, + atomic_action_class=atomic_action_class, + control=control, + ) + + normalized = { + "atomic_action_class": atomic_action_class, + "robot_name": robot_name, + "control": control, + "cfg": cfg, + } + normalized[target_field] = target_spec + return normalized + + +def _normalize_action_target( + spec: Mapping[str, Any], + *, + atomic_action_class: str, + control: str, +) -> tuple[str, dict[str, Any]]: + target_fields = [field for field in TARGET_SPEC_FIELDS if field in spec] + if len(target_fields) != 1: + raise ValueError( + "Atomic action spec requires exactly one of target_object, target_pose, " + f"or target_qpos; got {target_fields}." + ) + + target_field = target_fields[0] + target_spec = spec[target_field] + if not isinstance(target_spec, Mapping) or not target_spec: + raise ValueError(f"{target_field} must be a non-empty object.") + target_spec = dict(target_spec) + + if atomic_action_class == "PickUpAction": + if control != "arm" or target_field != "target_object": + raise ValueError("PickUpAction requires control='arm' and target_object.") + _validate_target_object(target_spec) + return target_field, target_spec + + if atomic_action_class == "PlaceAction": + if control != "arm" or target_field != "target_pose": + raise ValueError("PlaceAction requires control='arm' and target_pose.") + _validate_target_pose(target_spec) + return target_field, target_spec + + if target_field == "target_pose": + if control != "arm": + raise ValueError("MoveAction target_pose requires control='arm'.") + _validate_target_pose(target_spec) + return target_field, target_spec + + if target_field == "target_qpos": + _validate_target_qpos(target_spec, control=control) + return target_field, target_spec + + raise ValueError("MoveAction requires target_pose or target_qpos.") + + +def _validate_target_object(target_object: Mapping[str, Any]) -> None: + obj_name = target_object.get("obj_name") + if not 
isinstance(obj_name, str) or not obj_name: + raise ValueError("target_object requires non-empty obj_name.") + affordance = target_object.get("affordance", "antipodal") + if affordance != "antipodal": + raise ValueError("target_object only supports affordance='antipodal'.") + + +def _validate_target_pose(target_pose: Mapping[str, Any]) -> None: + reference = target_pose.get("reference") + if reference not in SUPPORTED_POSE_REFERENCES: + raise ValueError( + f"target_pose reference must be one of {sorted(SUPPORTED_POSE_REFERENCES)}." + ) + + if reference == "object": + obj_name = target_pose.get("obj_name") + if not isinstance(obj_name, str) or not obj_name: + raise ValueError("object target_pose requires non-empty obj_name.") + _xyz(target_pose.get("offset", [0.0, 0.0, 0.0]), "offset") + return + + if reference == "absolute": + position = target_pose.get("position") + if not isinstance(position, list) or len(position) != 3: + raise ValueError( + "absolute target_pose requires position with three entries." + ) + return + + _xyz(target_pose.get("offset", [0.0, 0.0, 0.0]), "offset") + frame = target_pose.get("frame", "world") + if frame not in {"world", "eef"}: + raise ValueError("relative target_pose frame must be 'world' or 'eef'.") + + +def _validate_target_qpos( + target_qpos: Mapping[str, Any], + *, + control: str, +) -> None: + source = target_qpos.get("source") + if source not in SUPPORTED_QPOS_SOURCES: + raise ValueError( + f"target_qpos source must be one of {sorted(SUPPORTED_QPOS_SOURCES)}." + ) + + if source == "initial": + if control != "arm": + raise ValueError("initial target_qpos requires control='arm'.") + return + + if source == "gripper_state": + if control != "hand": + raise ValueError("gripper_state target_qpos requires control='hand'.") + state = target_qpos.get("state") + if state not in {"open", "close"}: + raise ValueError( + "gripper_state target_qpos state must be 'open' or 'close'." 
+ ) + return + + if control != "arm": + raise ValueError("joint_delta target_qpos requires control='arm'.") + if "joint_index" not in target_qpos: + raise ValueError("joint_delta target_qpos requires joint_index.") + int(target_qpos["joint_index"]) + float(target_qpos.get("delta_degrees", 0.0)) + + +def execute_atomic_action( + action_spec: Mapping[str, Any] | AtomicActionSpec, + *, + env, + **runtime_kwargs, +) -> np.ndarray: + """Execute one atomic action spec and return local arm+eef qpos actions.""" + spec = ( + action_spec + if isinstance(action_spec, AtomicActionSpec) + else AtomicActionSpec.from_mapping(action_spec) + ) + if spec.atomic_action_class == "MoveAction" and spec.target_qpos: + action_np = _execute_move_qpos_action(env, spec) + action_np = _append_hold_steps( + action_np, + int(spec.cfg.get("post_hold_steps", 0)), + "atomic qpos action", + ) + _sync_agent_state_from_atomic_action( + env, + spec.robot_name, + action_np, + spec.control, + ) + log_info( + "Using action-agent qpos action: " + f"control={spec.control}, target={_target_summary(spec)}, " + f"steps={len(action_np)}.", + color="green", + ) + return action_np + + target = _resolve_target(env, spec, runtime_kwargs) + cfg, start_qpos = _build_action_cfg_and_start(env, spec) + action_cls = _get_atomic_action_class(spec.atomic_action_class) + action = action_cls(motion_generator=_make_motion_generator(env), cfg=cfg) + is_success, trajectory, joint_ids = action.execute( + target=target, + start_qpos=start_qpos, + ) + if not is_success: + raise RuntimeError( + f"Atomic action failed: atomic_action_class={spec.atomic_action_class}, " + f"robot_name={spec.robot_name}, target={_target_summary(spec)}." 
def execute_parallel_atomic_actions(
    left_arm_action=None,
    right_arm_action=None,
    env=None,
    return_result: bool = False,
    monitor_sequences=None,
    **runtime_kwargs,
):
    """Execute left/right atomic action specs as one synchronized stream.

    Each arm action is first passed through ``_resolve_action_spec`` (arrays
    and tensors are used as-is, anything else is executed as an atomic action
    spec) and then normalized to a 2-D float32 array. The shorter sequence is
    padded by repeating its last row so both arms cover the same number of
    steps. Every step starts from the robot's current full qpos and only the
    columns listed in ``{side}_arm_joints + {side}_eef_joints`` are overwritten.

    Args:
        left_arm_action: Action spec, array, tensor, or ``None`` for the left arm.
        right_arm_action: Action spec, array, tensor, or ``None`` for the right arm.
        env: Environment providing ``robot``, ``step`` and ``update_obj_info``.
        return_result: When true, return a result dict instead of the bare list.
        monitor_sequences: Must be ``None``; monitor support was removed.
        **runtime_kwargs: Forwarded to ``_resolve_action_spec``.

    Returns:
        The list of per-step action tensors that were stepped through ``env``,
        or a dict holding that list under ``"actions"`` when ``return_result``
        is true (the monitor-related entries are always ``None``).

    Raises:
        NotImplementedError: If ``monitor_sequences`` is provided.
        ValueError: If both arm actions are ``None``, if an arm action is given
            for a side without configured joints, or if its width does not
            match that side's arm plus eef joint count.
    """
    if monitor_sequences is not None:
        raise NotImplementedError("Monitor sequences have been removed.")

    # Specs are executed/planned here; raw arrays and tensors pass through.
    left_arm_action = _resolve_action_spec(left_arm_action, env, runtime_kwargs)
    right_arm_action = _resolve_action_spec(right_arm_action, env, runtime_kwargs)

    left_arm_action = _as_2d_action(left_arm_action, "left_arm_action")
    right_arm_action = _as_2d_action(right_arm_action, "right_arm_action")
    arm_actions = {"left": left_arm_action, "right": right_arm_action}

    if all(action is None for action in arm_actions.values()):
        raise ValueError("At least one atomic arm action must be provided.")

    # Pad the shorter arm by holding its final row so both streams end together.
    action_len = max(
        len(action) for action in arm_actions.values() if action is not None
    )
    for side, action in arm_actions.items():
        if action is not None and len(action) < action_len:
            diff = action_len - len(action)
            padding = np.repeat(action[-1:], diff, axis=0)
            arm_actions[side] = np.concatenate([action, padding], axis=0)

    # Baseline for every step is the current full-robot qpos; joints that are
    # not overwritten below therefore keep their current value.
    # NOTE(review): squeeze(0) presumably assumes a single environment — confirm.
    current_qpos = (
        env.robot.get_qpos().squeeze(0).detach().cpu().numpy().astype(np.float32)
    )
    actions = np.repeat(current_qpos[None, :], action_len, axis=0)

    for side, action in arm_actions.items():
        if action is None:
            continue

        # Column indices into the full qpos: arm joints followed by eef joints.
        arm_index = list(getattr(env, f"{side}_arm_joints", [])) + list(
            getattr(env, f"{side}_eef_joints", [])
        )
        if not arm_index:
            raise ValueError(
                f"{side}_arm_action was provided, but {side}_arm is not configured "
                f"on robot control parts {getattr(env.robot, 'control_parts', None)}."
            )
        if action.shape[-1] != len(arm_index):
            raise ValueError(
                f"{side}_arm_action width {action.shape[-1]} does not match "
                f"{side}_arm joints plus eef joints ({len(arm_index)})."
            )
        actions[:, arm_index] = action

    # (T, D) -> (T, 1, D) -> list of T tensors, each of shape (1, D).
    actions = torch.from_numpy(actions).to(dtype=torch.float32).unsqueeze(1)
    actions = list(actions.unbind(dim=0))

    # Step the environment online; object info is refreshed after every step.
    for action in tqdm(actions):
        env.step(action)
        env.update_obj_info()

    if return_result:
        return {
            "actions": actions,
            "monitor_index": None,
            "monitor_name": None,
            "step_index": None,
        }
    return actions
eef_joints = _select_arm_parts(env, spec.robot_name) + if spec.control == "hand": + _, _, _, _, current_gripper_state = get_arm_states(env, spec.robot_name) + start_qpos = _state_to_hand_qpos( + current_gripper_state, + len(eef_joints), + env.robot.device, + ) + return start_qpos.reshape(1, len(eef_joints)), eef_joints + return _current_arm_qpos(env, is_left, arm_joints), arm_joints + + +def _resolve_batched_qpos( + qpos, + *, + expected_dof: int, + device, + name: str, +) -> torch.Tensor: + qpos = torch.as_tensor(qpos, dtype=torch.float32, device=device) + if qpos.shape == (expected_dof,): + qpos = qpos.reshape(1, expected_dof) + if qpos.ndim != 2 or qpos.shape[1] != expected_dof: + raise ValueError( + f"{name} must have shape ({expected_dof},) or (num_envs, {expected_dof}), " + f"got {tuple(qpos.shape)}." + ) + return qpos + + +def _interpolate_qpos_trajectory( + start_qpos: torch.Tensor, + target_qpos: torch.Tensor, + sample_interval: int, +) -> torch.Tensor: + if sample_interval < 2: + raise ValueError("sample_interval must be at least 2 for qpos interpolation.") + if target_qpos.shape[0] == 1 and start_qpos.shape[0] > 1: + target_qpos = target_qpos.repeat(start_qpos.shape[0], 1) + if start_qpos.shape != target_qpos.shape: + raise ValueError( + f"start_qpos and target_qpos must have matching shapes, got " + f"{tuple(start_qpos.shape)} and {tuple(target_qpos.shape)}." 
def _select_arm_parts(env, robot_name: str):
    """Resolve the control-part names and joint ids for one arm.

    Args:
        env: Environment exposing ``{left,right}_arm_joints`` and
            ``{left,right}_eef_joints``; it may optionally provide
            ``get_agent_arm_control_part`` / ``get_agent_eef_control_part``.
        robot_name: Name passed to ``resolve_arm_side`` to pick the side.

    Returns:
        ``(is_left, arm_part, hand_part, arm_joints, eef_joints)`` where the
        two joint collections are returned as fresh lists.
    """
    is_left = resolve_arm_side(env, robot_name) == "left"
    side = "left" if is_left else "right"

    arm_getter = getattr(env, "get_agent_arm_control_part", None)
    if arm_getter is None:
        arm_part = f"{side}_arm"
        hand_part = f"{side}_eef"
    else:
        arm_part = arm_getter(is_left)
        # The eef getter is probed separately: an env that only customizes the
        # arm part falls back to the default eef name instead of raising
        # AttributeError.
        eef_getter = getattr(env, "get_agent_eef_control_part", None)
        hand_part = eef_getter(is_left) if eef_getter is not None else f"{side}_eef"

    arm_joints = env.left_arm_joints if is_left else env.right_arm_joints
    eef_joints = env.left_eef_joints if is_left else env.right_eef_joints
    return is_left, arm_part, hand_part, list(arm_joints), list(eef_joints)
spec.control != "arm": + raise ValueError("PlaceAction atomic action requires control='arm'.") + cfg = PlaceActionCfg( + control_part=arm_part, + hand_control_part=hand_part, + hand_open_qpos=_state_to_hand_qpos(env.open_state, len(eef_joints), device), + hand_close_qpos=_state_to_hand_qpos( + env.close_state, len(eef_joints), device + ), + **_cfg_supported_kwargs(PlaceActionCfg, cfg_values), + ) + return cfg, _current_arm_qpos(env, is_left, arm_joints) + + control_part = arm_part if spec.control == "arm" else hand_part + cfg = MoveActionCfg( + control_part=control_part, + **_cfg_supported_kwargs(MoveActionCfg, cfg_values), + ) + if spec.control == "hand": + _, _, _, _, current_gripper_state = get_arm_states(env, spec.robot_name) + return ( + cfg, + _state_to_hand_qpos( + current_gripper_state, + len(eef_joints), + device, + ).reshape(1, len(eef_joints)), + ) + return cfg, _current_arm_qpos(env, is_left, arm_joints) + + +def _resolve_target(env, spec: AtomicActionSpec, runtime_kwargs: dict[str, Any]): + if spec.atomic_action_class == "PickUpAction": + return _resolve_pickup_target(env, spec, runtime_kwargs) + if spec.atomic_action_class == "MoveAction": + return _resolve_move_target(env, spec) + if spec.atomic_action_class == "PlaceAction": + return _resolve_place_target(env, spec) + raise ValueError(f"Unsupported atomic action class: {spec.atomic_action_class}.") + + +def _resolve_pickup_target( + env, + spec: AtomicActionSpec, + runtime_kwargs: dict[str, Any], +): + if not spec.target_object: + raise ValueError("PickUpAction requires target_object.") + return _build_object_semantics(env, spec.target_object, runtime_kwargs) + + +def _resolve_move_target(env, spec: AtomicActionSpec): + if spec.target_pose: + return _resolve_pose_target(env, spec) + if spec.target_qpos: + return _resolve_qpos_target(env, spec) + raise ValueError("MoveAction requires target_pose or target_qpos.") + + +def _resolve_place_target(env, spec: AtomicActionSpec): + if not spec.target_pose: 
def _resolve_object_pose_target(env, spec: AtomicActionSpec):
    """Build a target pose located at a rigid object's position plus an offset.

    The current end-effector pose is copied and only its translation column is
    replaced: first with the object's position, then shifted by the optional
    ``offset`` from ``spec.target_pose`` (defaults to zero).

    Raises:
        ValueError: If the simulator has no rigid object for ``obj_name``.
    """
    pose_spec = spec.target_pose
    obj_name = pose_spec.get("obj_name")
    rigid_object = env.sim.get_rigid_object(obj_name)
    if rigid_object is None:
        raise ValueError(f"No rigid object found for {obj_name}.")

    shift = _xyz(pose_spec.get("offset", [0.0, 0.0, 0.0]), "offset")
    _, _, _, eef_pose, _ = get_arm_states(env, spec.robot_name)

    # Work on a copy so the cached agent pose is never mutated.
    goal_pose = deepcopy(eef_pose)
    object_matrix = rigid_object.get_local_pose(to_matrix=True).squeeze(0)
    goal_pose[:3, 3] = object_matrix[:3, 3]
    for axis, delta in enumerate(shift):
        goal_pose[axis, 3] += delta

    return torch.as_tensor(goal_pose, dtype=torch.float32, device=env.robot.device)
deepcopy(current_pose) + for index, value in enumerate(position): + if value is not None: + target_pose[index, 3] = float(value) + return torch.as_tensor(target_pose, dtype=torch.float32, device=env.robot.device) + + +def _resolve_relative_pose_target(env, spec: AtomicActionSpec): + offset = _xyz(spec.target_pose.get("offset", [0.0, 0.0, 0.0]), "offset") + frame = spec.target_pose.get("frame", "world") + if frame not in {"world", "eef"}: + raise ValueError("relative target_pose frame must be 'world' or 'eef'.") + mode = "extrinsic" if frame == "world" else "intrinsic" + _, _, _, current_pose, _ = get_arm_states(env, spec.robot_name) + target_pose = deepcopy(current_pose) + target_pose = get_offset_pose(target_pose, offset[0], "x", mode) + target_pose = get_offset_pose(target_pose, offset[1], "y", mode) + target_pose = get_offset_pose(target_pose, offset[2], "z", mode) + return torch.as_tensor(target_pose, dtype=torch.float32, device=env.robot.device) + + +def _resolve_initial_qpos_target(env, spec: AtomicActionSpec): + if spec.control != "arm": + raise ValueError("initial target_qpos requires control='arm'.") + is_left, _, _, _, _ = _select_arm_parts(env, spec.robot_name) + target_qpos = env.left_arm_init_qpos if is_left else env.right_arm_init_qpos + return torch.as_tensor(target_qpos, dtype=torch.float32, device=env.robot.device) + + +def _resolve_gripper_qpos_target(env, spec: AtomicActionSpec): + if spec.control != "hand": + raise ValueError("gripper_state target_qpos requires control='hand'.") + state = spec.target_qpos.get("state") + if state == "open": + source = env.open_state + elif state == "close": + source = env.close_state + else: + raise ValueError("gripper_state target_qpos state must be 'open' or 'close'.") + _, _, _, _, eef_joints = _select_arm_parts(env, spec.robot_name) + return _state_to_hand_qpos(source, len(eef_joints), env.robot.device) + + +def _resolve_joint_delta_qpos_target(env, spec: AtomicActionSpec): + if spec.control != "arm": + raise 
ValueError("joint_delta target_qpos requires control='arm'.") + joint_index = int(spec.target_qpos["joint_index"]) + delta_degrees = float(spec.target_qpos.get("delta_degrees", 0.0)) + _, _, current_qpos, _, _ = get_arm_states(env, spec.robot_name) + target_qpos = torch.as_tensor( + current_qpos, + dtype=torch.float32, + device=env.robot.device, + ).clone() + if joint_index < 0 or joint_index >= target_qpos.numel(): + raise ValueError(f"joint_index {joint_index} is out of range.") + target_qpos[joint_index] += float(np.deg2rad(delta_degrees)) + return target_qpos + + +def _target_summary(spec: AtomicActionSpec) -> str: + if spec.target_object: + return f"target_object:{spec.target_object.get('obj_name')}" + if spec.target_pose: + return f"target_pose:{spec.target_pose.get('reference')}" + if spec.target_qpos: + return f"target_qpos:{spec.target_qpos.get('source')}" + return "target:none" + + +def _build_object_semantics( + env, + target: Mapping[str, Any], + runtime_kwargs: dict[str, Any], +): + obj_name = target.get("obj_name") + if target.get("affordance", "antipodal") != "antipodal": + raise ValueError("target_object only supports antipodal affordance.") + target_obj = env.sim.get_rigid_object(obj_name) + if target_obj is None: + raise ValueError(f"No rigid object found for {obj_name}.") + + mesh_vertices = target_obj.get_vertices(env_ids=[0], scale=True)[0] + mesh_triangles = target_obj.get_triangles(env_ids=[0])[0] + mesh_vertices = torch.as_tensor(mesh_vertices, dtype=torch.float32) + mesh_triangles = torch.as_tensor(mesh_triangles, dtype=torch.int64) + if ( + mesh_vertices.numel() == 0 + or mesh_triangles.numel() == 0 + or mesh_vertices.shape[-1] != 3 + or mesh_triangles.shape[-1] != 3 + ): + raise ValueError(f"Object {obj_name} has empty or invalid mesh geometry.") + + allow_annotation = bool(runtime_kwargs.get("allow_grasp_annotation", True)) + force_reannotate = bool(runtime_kwargs.get("force_grasp_reannotate", False)) + cache_path = 
def _trajectory_to_agent_action(env, robot_name, trajectory, joint_ids):
    """Expand a planned trajectory into per-step ``[arm joints, eef joints]`` rows.

    Every output row starts from the agent's current arm qpos followed by the
    current gripper state (expanded to the eef joint count). Columns whose
    joint id appears in ``joint_ids`` are then overwritten with the matching
    trajectory column; all other columns keep the current value.

    Args:
        env: Environment used to read the current arm state and joint ids.
        robot_name: Name identifying which arm the trajectory belongs to.
        trajectory: Tensor or array-like of shape ``(T, D)`` or ``(N, T, D)``;
            for the 3-D form only the first entry along dim 0 is used.
        joint_ids: Robot joint id for each of the ``D`` trajectory columns.

    Returns:
        A float32 NumPy array with one row per trajectory step.

    Raises:
        ValueError: If the trajectory is not 2-D after unbatching, is empty, or
            its width differs from ``len(joint_ids)``.
    """
    _, _, current_arm_qpos, _, current_gripper_state = get_arm_states(env, robot_name)
    _, _, _, arm_joints, eef_joints = _select_arm_parts(env, robot_name)

    if isinstance(trajectory, torch.Tensor):
        trajectory = trajectory.detach()
    else:
        trajectory = torch.as_tensor(trajectory)

    # A batched (N, T, D) trajectory is reduced to its first entry.
    if trajectory.dim() == 3:
        trajectory = trajectory[0]
    if trajectory.dim() != 2 or trajectory.shape[0] == 0:
        raise ValueError(
            "Atomic action trajectory must have shape (T, D) or (N, T, D), "
            f"got {trajectory.shape}."
        )

    joint_ids = [int(joint_id) for joint_id in joint_ids]
    if len(joint_ids) != trajectory.shape[-1]:
        raise ValueError(
            f"Atomic action joint_ids length {len(joint_ids)} does not match "
            f"trajectory width {trajectory.shape[-1]}."
        )

    # Baseline row: current arm qpos followed by the current hand qpos.
    device = trajectory.device
    agent_action = torch.cat(
        [
            torch.as_tensor(
                current_arm_qpos, dtype=torch.float32, device=device
            ).flatten(),
            _state_to_hand_qpos(current_gripper_state, len(eef_joints), device),
        ],
        dim=0,
    )
    agent_action = agent_action.unsqueeze(0).repeat(trajectory.shape[0], 1)

    # Overwrite only the columns that the trajectory actually controls; output
    # columns follow the order arm_joints + eef_joints.
    joint_id_to_col = {joint_id: col for col, joint_id in enumerate(joint_ids)}
    for out_col, joint_id in enumerate(arm_joints + eef_joints):
        if joint_id in joint_id_to_col:
            agent_action[:, out_col] = trajectory[:, joint_id_to_col[joint_id]]

    return agent_action.detach().cpu().numpy().astype(np.float32)
env.get_arm_fk(qpos=arm_qpos, is_left=is_left), + is_left=is_left, + ) + + if len(eef_joints) == 0: + return + + _, _, _, _, current_gripper_state = get_arm_states(env, robot_name) + eef_qpos = final_action[arm_dof : arm_dof + len(eef_joints)] + state_dof = max(int(torch.as_tensor(current_gripper_state).numel()), 1) + if len(eef_qpos) >= state_dof: + gripper_qpos = eef_qpos[:state_dof] + else: + gripper_qpos = np.resize(eef_qpos, state_dof) + + current_gripper_state = torch.as_tensor(current_gripper_state) + env.set_current_gripper_state_agent( + torch.as_tensor( + gripper_qpos, + dtype=current_gripper_state.dtype, + device=current_gripper_state.device, + ), + is_left=is_left, + ) + + +def _current_arm_qpos(env, is_left: bool, arm_joints: list[int]) -> torch.Tensor: + source = env.left_arm_current_qpos if is_left else env.right_arm_current_qpos + return torch.as_tensor( + source, + dtype=torch.float32, + device=env.robot.device, + ).reshape(1, len(arm_joints)) + + +def _state_to_hand_qpos(state, hand_dof: int, device): + if hand_dof <= 0: + return torch.empty(0, dtype=torch.float32, device=device) + + state = torch.as_tensor(state, dtype=torch.float32, device=device).flatten() + if state.numel() == 0: + return torch.zeros(hand_dof, dtype=torch.float32, device=device) + if state.numel() == hand_dof: + return state + if state.numel() == 1: + return state.repeat(hand_dof) + if state.numel() > hand_dof: + return state[:hand_dof] + + repeat_num = int(np.ceil(hand_dof / state.numel())) + return state.repeat(repeat_num)[:hand_dof] + + +def _as_2d_action(action, action_name: str): + if action is None: + return None + if isinstance(action, torch.Tensor): + action = action.detach().cpu().numpy() + action = np.asarray(action, dtype=np.float32) + if action.ndim == 1: + action = action[None, :] + if action.ndim != 2 or len(action) == 0: + raise ValueError( + f"{action_name} must have shape (T, D) with T > 0, got {action.shape}." 
+ ) + return action + + +def _append_hold_steps(action_np, hold_steps: int, log_name: str): + hold_steps = int(hold_steps) + if hold_steps <= 0: + return action_np + if action_np is None or len(action_np) == 0: + raise ValueError(f"{log_name} action is empty; cannot append hold steps.") + + hold_actions = np.repeat(action_np[-1:], hold_steps, axis=0) + action_np = np.concatenate([action_np, hold_actions], axis=0) + log_info( + f"Append {hold_steps} hold steps after {log_name}; " + f"total trajectory length is {len(action_np)}.", + color="green", + ) + return action_np + + +def _cfg_supported_kwargs(cfg_cls, values: Mapping[str, Any]): + supported = set() + for cls in reversed(cfg_cls.__mro__): + supported.update(getattr(cls, "__annotations__", {}).keys()) + return {key: value for key, value in values.items() if key in supported} + + +def _affordance_cache_path(mesh_vertices, mesh_triangles): + vert_bytes = mesh_vertices.to("cpu").numpy().tobytes() + face_bytes = mesh_triangles.to("cpu").numpy().tobytes() + md5_hash = hashlib.md5(vert_bytes + face_bytes).hexdigest() + return os.path.join(GRASP_ANNOTATOR_CACHE_DIR, f"antipodal_cache_{md5_hash}.npy") + + +def _rigid_object_mesh_path(obj) -> str | None: + shape = getattr(getattr(obj, "cfg", None), "shape", None) + fpath = getattr(shape, "fpath", None) + return str(fpath) if fpath else None + + +def _rigid_object_body_scale(obj) -> list[float] | None: + body_scale = obj.get_body_scale(env_ids=[0])[0] + return body_scale.detach().to("cpu", dtype=torch.float32).tolist() + + +def _max_decomposition_hulls(target_obj, runtime_kwargs: Mapping[str, Any]) -> int: + if "grasp_max_decomposition_hulls" in runtime_kwargs: + return int(runtime_kwargs["grasp_max_decomposition_hulls"]) + + max_convex_hull_num = getattr( + getattr(target_obj, "cfg", None), + "max_convex_hull_num", + None, + ) + if max_convex_hull_num is not None and int(max_convex_hull_num) > 1: + return int(max_convex_hull_num) + return 8 + + +def _xyz(value, 
field_name: str) -> list[float]: + if not isinstance(value, list) or len(value) != 3: + raise ValueError(f"{field_name} must be a three-element list.") + return [float(item) for item in value] diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py new file mode 100644 index 00000000..f522c5ab --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py @@ -0,0 +1,262 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def compile_agent_graph_spec(
    task_graph: str | Mapping[str, Any],
    recovery_graph: str | Mapping[str, Any] | None = None,
    *,
    env: Any = None,
    graph_cls: type | None = None,
    action_module: Any = None,
    monitor_module: Any = None,
) -> Any:
    """Compile a nominal JSON graph into ``AgentTaskGraph``.

    The task graph is parsed, checked for recovery fields and structural
    validity, and then materialized node by node and edge by edge on a new
    ``graph_cls`` instance. ``env`` and ``monitor_module`` are accepted but
    unused.

    Raises:
        ValueError: If ``recovery_graph`` carries content; validation helpers
            may raise further errors for malformed task graphs.
    """
    # Accepted for call-site compatibility only; deliberately discarded.
    del env, monitor_module

    if _has_recovery_content(recovery_graph):
        raise ValueError("Recovery graph compilation has been removed.")

    parsed = extract_json_object(task_graph)
    _reject_recovery_keys(parsed)
    _validate_task_spec(parsed)
    graph_cls, action_module = _resolve_runtime(
        graph_cls=graph_cls,
        action_module=action_module,
    )

    transition_limit = int(parsed.get("max_transitions", 1000))
    compiled = graph_cls(
        start=parsed["start"],
        goal=parsed["goal"],
        max_transitions=transition_limit,
    )

    for node_spec in parsed.get("nodes", []):
        compiled.add_node(node_spec["id"], node_spec.get("semantic", ""))

    for edge_spec in parsed.get("edges", []):
        # Left is compiled before right, matching the original argument order.
        left_action = _compile_action(edge_spec.get("left_arm_action"), action_module)
        right_action = _compile_action(
            edge_spec.get("right_arm_action"), action_module
        )
        compiled.add_edge(
            edge_spec["id"],
            edge_spec["source"],
            edge_spec["target"],
            left_arm_action=left_action,
            right_arm_action=right_action,
        )

    return compiled
node_id = edge[node_key] + if node_id not in node_ids: + raise ValueError( + f"Edge '{edge_id}' references unknown {node_key} node '{node_id}'." + ) + + _validate_nominal_path(task_spec, edge_specs) + + +def _validate_nominal_path( + task_spec: Mapping[str, Any], + edge_specs: list[Mapping[str, Any]], +) -> None: + outgoing_edges: dict[str, Mapping[str, Any]] = {} + for edge in edge_specs: + source = edge["source"] + if source in outgoing_edges: + raise ValueError( + f"Nominal node '{source}' has multiple outgoing edges. " + "The current graph executor expects one deterministic nominal path." + ) + outgoing_edges[source] = edge + + current = task_spec["start"] + goal = task_spec["goal"] + visited_edges = set() + visited_nodes = {current} + + while current != goal: + edge = outgoing_edges.get(current) + if edge is None: + raise ValueError( + f"Nominal graph has no start-to-goal path from node '{current}'." + ) + edge_id = edge["id"] + if edge_id in visited_edges: + raise ValueError("Nominal graph contains a cycle.") + + visited_edges.add(edge_id) + current = edge["target"] + if current in visited_nodes and current != goal: + raise ValueError("Nominal graph contains a cycle.") + visited_nodes.add(current) + + all_edge_ids = {edge["id"] for edge in edge_specs} + unused_edge_ids = all_edge_ids - visited_edges + if unused_edge_ids: + unused = ", ".join(sorted(unused_edge_ids)) + raise ValueError( + f"Nominal graph contains edges outside the start-to-goal path: {unused}." + ) + + +def _compile_action(spec: Any, action_module: Any) -> Any: + if spec is None: + return None + if isinstance(spec, str) and spec.strip().lower() in {"", "none", "null"}: + return None + if not isinstance(spec, Mapping): + raise TypeError(f"Action spec must be a mapping or null, but got {type(spec)}.") + if "fn" in spec: + raise ValueError( + "Legacy fn/kwargs action schema is not supported. 
Use atomic action " + "class JSON spec with atomic_action_class, robot_name, control, cfg, " + "and exactly one of target_object, target_pose, or target_qpos." + ) + if "action" in spec: + raise ValueError( + "Legacy action schema is not supported. Use atomic_action_class with " + "PickUpAction, MoveAction, or PlaceAction." + ) + if spec.get("atomic_action_class") is None: + raise ValueError( + "Atomic action class schema requires atomic_action_class, robot_name, " + "control, cfg, and exactly one of target_object, target_pose, or " + "target_qpos." + ) + + return action_module.normalize_atomic_action_spec(spec) + + +def _has_recovery_content(value: Any) -> bool: + if value is None: + return False + recovery_spec = extract_json_object(value) + if not isinstance(recovery_spec, Mapping): + return bool(recovery_spec) + return any(bool(recovery_spec.get(key)) for key in _RECOVERY_KEYS) + + +def _reject_recovery_keys(task_spec: Mapping[str, Any]) -> None: + present = _RECOVERY_KEYS & set(task_spec) + if present: + raise ValueError( + "Recovery graph fields are no longer supported: " + f"{', '.join(sorted(present))}." + ) + + +def _resolve_attr(namespace: Any, name: str) -> Any: + if isinstance(namespace, Mapping): + return namespace[name] + return getattr(namespace, name) diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py b/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py new file mode 100644 index 00000000..53ea5a1f --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py @@ -0,0 +1,134 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class AgentTaskGraph:
    """Deterministic atomic-action graph with one nominal start-to-goal path.

    Nodes are semantic keyframes and edges are executable transitions. ``run``
    walks from ``start`` to ``goal`` by following the single outgoing edge of
    each visited node and executing its arm actions online.
    """

    def __init__(self, start: str, goal: str, max_transitions: int = 1000) -> None:
        """Create an empty graph.

        Args:
            start: Id of the node where execution begins.
            goal: Id of the node at which execution stops.
            max_transitions: Upper bound on executed edges per ``run`` call.
        """
        self.start = start
        self.goal = goal
        self.max_transitions = max_transitions
        self.nodes: dict[str, AgentGraphNode] = {}
        self.edges: dict[str, AgentGraphEdge] = {}
        # source node id -> ids of edges leaving that node
        self.outgoing: dict[str, list[str]] = defaultdict(list)

    def add_node(self, node_id: str, semantic: str = "") -> "AgentTaskGraph":
        """Register (or overwrite) a node and return ``self`` for chaining."""
        self.nodes[node_id] = AgentGraphNode(node_id, semantic)
        return self

    def add_edge(
        self,
        edge_id: str,
        source: str,
        target: str,
        *,
        left_arm_action=None,
        right_arm_action=None,
    ) -> "AgentTaskGraph":
        """Register (or overwrite) an edge and return ``self`` for chaining.

        Re-adding an existing ``edge_id`` replaces the previous edge, mirroring
        how ``add_node`` overwrites nodes.
        """
        previous = self.edges.get(edge_id)
        if previous is not None:
            # Drop the old adjacency entry first; otherwise the id would be
            # listed twice (same source) or linger on the old source node, and
            # ``_next_edge`` would reject or misroute an otherwise valid graph.
            stale = self.outgoing.get(previous.source)
            if stale is not None and edge_id in stale:
                stale.remove(edge_id)

        self.edges[edge_id] = AgentGraphEdge(
            id=edge_id,
            source=source,
            target=target,
            left_arm_action=left_arm_action,
            right_arm_action=right_arm_action,
        )
        self.outgoing[source].append(edge_id)
        return self

    def run(self, env=None, **kwargs) -> ExecutedActionList:
        """Execute edges from ``start`` until ``goal`` is reached.

        Args:
            env: Environment forwarded to ``execute_parallel_atomic_actions``.
            **kwargs: Extra runtime options forwarded unchanged.

        Returns:
            An ``ExecutedActionList`` with every action stepped, in order.

        Raises:
            RuntimeError: If more than ``max_transitions`` edges would be
                executed, or a visited node lacks exactly one outgoing edge.
        """
        current = self.start
        executed_actions: list[Any] = []
        transitions = 0

        while current != self.goal:
            transitions += 1
            if transitions > self.max_transitions:
                raise RuntimeError("Agent task graph exceeded max_transitions.")

            edge = self.edges[self._next_edge(current)]
            actions = execute_parallel_atomic_actions(
                left_arm_action=edge.left_arm_action,
                right_arm_action=edge.right_arm_action,
                env=env,
                **kwargs,
            )
            executed_actions.extend(actions)
            current = edge.target

        return ExecutedActionList(executed_actions)

    def _next_edge(self, node_id: str) -> str:
        """Return the id of the only edge leaving ``node_id``.

        Raises:
            RuntimeError: If the node has zero or several outgoing edges.
        """
        # ``.get`` avoids the defaultdict inserting an empty list for nodes
        # that were merely queried.
        outgoing_edges = self.outgoing.get(node_id, ())
        if len(outgoing_edges) != 1:
            raise RuntimeError(
                f"Nominal node '{node_id}' must have exactly one outgoing edge."
            )
        return outgoing_edges[0]
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + diff --git a/embodichain/gen_sim/action_agent_pipeline/utils/llm_config.py b/embodichain/gen_sim/action_agent_pipeline/utils/llm_config.py new file mode 100644 index 00000000..c267a1c8 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/utils/llm_config.py @@ -0,0 +1,159 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import Any + +__all__ = [ + "DEFAULT_LLM_MODEL", + "ACTION_PIPELINE_LLM_ENV_PATH", + "GEN_CONFIG_PATH", + "LLM_ENV_PATH", + "LEGACY_LLM_ENV_PATH", + "SIMREADY_LLM_ENV_PATH", + "get_openai_compatible_llm_config", +] + +DEFAULT_LLM_MODEL = "gpt-4o" +CONFIG_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = next( + ( + parent + for parent in CONFIG_DIR.parents + if (parent / "setup.py").exists() and (parent / "embodichain").exists() + ), + CONFIG_DIR.parents[3], +) +GEN_CONFIG_PATH = ( + PROJECT_ROOT / "embodichain/gen_sim/simready_pipeline/configs/gen_config.json" +) +LLM_ENV_PATH = PROJECT_ROOT / ".env" +SIMREADY_LLM_ENV_PATH = ( + PROJECT_ROOT / "embodichain/gen_sim/simready_pipeline/configs/.env" +) +ACTION_PIPELINE_LLM_ENV_PATH = CONFIG_DIR / ".env" +LEGACY_LLM_ENV_PATH = SIMREADY_LLM_ENV_PATH + + +def _load_env_file(path: Path | None = None) -> dict[str, str]: + """Read local KEY=VALUE credentials without overriding shell variables.""" + path = path or LLM_ENV_PATH + if not path.exists(): + return {} + + env_values: dict[str, str] = {} + for raw_line in path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, value = line.split("=", 1) + key = key.strip() + value = value.strip().strip("\"'") + if key: + env_values[key] = value + return env_values + + +def _load_env_files(paths: tuple[Path, ...] 
| None = None) -> dict[str, str]: + """Read local env files, with later paths taking precedence.""" + env_values: dict[str, str] = {} + for path in paths or ( + SIMREADY_LLM_ENV_PATH, + ACTION_PIPELINE_LLM_ENV_PATH, + LLM_ENV_PATH, + ): + env_values.update(_load_env_file(path)) + return env_values + + +def _get_first_value( + local_env: dict[str, str], + *names: str, + default: str | None = None, +) -> str | None: + for name in names: + value = os.getenv(name) + if value: + return value + value = local_env.get(name) + if value: + return value + return default + + +def _load_gen_config(path: Path | None = None) -> dict[str, Any]: + path = path or GEN_CONFIG_PATH + if not path.exists(): + raise FileNotFoundError(f"gen_config.json not found: {path}") + + with path.open("r", encoding="utf-8") as f: + raw_cfg = json.load(f) + return dict(raw_cfg.get("llm", {}).get("openai_compatible", {})) + + +def get_openai_compatible_llm_config( + *, + required: bool = False, + require_base_url: bool = False, + default_model: str = DEFAULT_LLM_MODEL, +) -> dict[str, Any]: + """Return shared OpenAI-compatible LLM config for agents and gen-sim.""" + local_env = _load_env_files() + json_cfg = _load_gen_config() + + cfg = { + "api_key": _get_first_value(local_env, "OPENAI_API_KEY") + or json_cfg.get("api_key", ""), + "model": _get_first_value(local_env, "OPENAI_MODEL", "LLM_MODEL") + or json_cfg.get("model") + or default_model, + "base_url": _get_first_value( + local_env, + "OPENAI_BASE_URL", + "OPENAI_API_BASE", + "LLM_URL", + ) + or json_cfg.get("base_url", ""), + "default_query": json_cfg.get("default_query", {}) or {}, + "proxy_url": _get_first_value( + local_env, + "EMBODICHAIN_LLM_PROXY", + "LLM_PROXY_URL", + ) + or json_cfg.get("proxy_url", ""), + } + + if cfg["base_url"]: + cfg["base_url"] = cfg["base_url"].rstrip("/") + + if required: + required_keys = ["api_key", "model"] + if require_base_url: + required_keys.append("base_url") + missing = [key for key in required_keys if not 
_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)


def _decode_embedded_object(decoder: json.JSONDecoder, text: str) -> Any:
    """Return the first JSON object that decodes at some ``{`` in ``text``, else ``None``."""
    start = text.find("{")
    while start >= 0:
        try:
            value, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # This brace did not open valid JSON (e.g. "{placeholder}" in
            # prose); keep scanning instead of giving up on the candidate.
            start = text.find("{", start + 1)
            continue
        return value
    return None


def extract_json_object(content: str | Mapping[str, Any]) -> dict[str, Any]:
    """Extract a JSON object from plain or fenced LLM content.

    Markdown-fenced blocks are tried first, then the whole text. A candidate
    that is not valid JSON on its own is scanned for the first embedded
    ``{...}`` span that decodes as a JSON object, so surrounding prose -
    including prose that itself contains braces - does not hide the payload.

    Args:
        content: Raw LLM text, already parsed JSON-like mapping, or markdown fenced
            JSON content.

    Returns:
        Parsed JSON object.

    Raises:
        ValueError: If no JSON object can be parsed.
    """
    if isinstance(content, Mapping):
        return dict(content)

    text = str(content).strip()
    candidates = [match.group(1).strip() for match in _JSON_FENCE_RE.finditer(text)]
    candidates.append(text)

    decoder = json.JSONDecoder()
    for candidate in candidates:
        try:
            value = json.loads(candidate)
        except json.JSONDecodeError:
            value = _decode_embedded_object(decoder, candidate)

        if isinstance(value, dict):
            return value

    raise ValueError("Expected a JSON object in the LLM response.")
class UsageTrackedChatModel:
    """Proxy a LangChain chat model and record usage after each invoke call.

    ``invoke`` is intercepted so the response's token usage can be logged;
    every other attribute is forwarded to the wrapped model unchanged.
    """

    def __init__(
        self,
        inner: Any,
        *,
        stage: str | None,
        provider: str = "langchain_openai",
    ) -> None:
        """Wrap ``inner``.

        Args:
            inner: Chat model whose calls are tracked.
            stage: Usage stage label; normalized, defaults to ``"chat"``.
            provider: Provider label written into each usage record.
        """
        self._inner = inner
        self._usage_stage = normalize_usage_stage(stage or "chat")
        self._usage_provider = provider

    def invoke(self, *args, **kwargs):
        """Call the wrapped model's ``invoke``, record its usage, return the response."""
        response = self._inner.invoke(*args, **kwargs)
        record_langchain_usage(
            response,
            stage=self._usage_stage,
            provider=self._usage_provider,
            model=_model_name_from_chat_model(self._inner),
        )
        return response

    def __getattr__(self, name: str) -> Any:
        """Forward attributes not found on the proxy to the wrapped model."""
        # ``__getattr__`` only runs after normal lookup failed. If ``_inner``
        # itself is missing (instance built by copy/pickle before its state is
        # restored), reading ``self._inner`` below would re-enter this method
        # forever and end in RecursionError instead of AttributeError.
        if name == "_inner":
            raise AttributeError(name)
        return getattr(self._inner, name)
def normalize_usage_stage(stage: str) -> str:
    """Turn a free-form stage label into a compact lowercase identifier.

    Runs of characters outside ``a-z0-9_.-`` become a single underscore,
    leading/trailing ``_``, ``.`` and ``-`` are removed, and an empty result
    (or a falsy ``stage``) yields ``"unknown"``.
    """
    lowered = str(stage if stage else "unknown").strip().lower()
    replaced = re.sub(r"[^a-z0-9_.-]+", "_", lowered)
    trimmed = re.sub(r"_+", "_", replaced).strip("_.-")
    return trimmed if trimmed else "unknown"
def record_llm_usage(
    *,
    stage: str,
    provider: str,
    model: str | None,
    usage: Mapping[str, Any] | None,
    request_id: str | None = None,
    finish_reason: str | None = None,
    raw_usage: Mapping[str, Any] | None = None,
    metadata: Mapping[str, Any] | None = None,
) -> None:
    """Append one LLM usage record to the configured JSONL file.

    Does nothing when the ``LLM_USAGE_PATH_ENV`` environment variable is
    unset or empty, so callers can record unconditionally.

    Args:
        stage: Stage label; stored after ``normalize_usage_stage``.
        provider: Provider label stored verbatim.
        model: Model name stored verbatim (may be ``None``).
        usage: Token usage mapping; passed through ``normalize_usage``.
        request_id: Optional provider request id.
        finish_reason: Optional finish reason.
        raw_usage: Optional original usage payload; stored under
            ``"raw_usage"`` after ``_json_safe`` when not ``None``.
        metadata: Optional extra data; stored under ``"metadata"`` after
            ``_json_safe`` when non-empty.
    """
    usage_path = os.getenv(LLM_USAGE_PATH_ENV)
    if not usage_path:
        return

    usage_values = normalize_usage(usage)
    # True as soon as at least one token counter could be extracted.
    usage_available = any(usage_values[field] is not None for field in _TOKEN_FIELDS)
    record: dict[str, Any] = {
        "created_at": datetime.now(timezone.utc).isoformat(timespec="milliseconds"),
        "run_id": os.getenv(LLM_USAGE_RUN_ID_ENV),
        "process": os.getenv(LLM_USAGE_PROCESS_ENV),
        "pid": os.getpid(),
        "stage": normalize_usage_stage(stage),
        "provider": provider,
        "model": model,
        "usage_available": usage_available,
        "request_id": request_id,
        "finish_reason": finish_reason,
    }
    # Token counters become top-level keys of the record.
    record.update(usage_values)
    if raw_usage is not None:
        record["raw_usage"] = _json_safe(raw_usage)
    if metadata:
        record["metadata"] = _json_safe(metadata)

    path = Path(usage_path).expanduser().resolve()
    path.parent.mkdir(parents=True, exist_ok=True)
    # Append mode: one JSON document per line (JSONL).
    with path.open("a", encoding="utf-8") as file:
        file.write(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n")
_add_grouped_record(summary["by_model"], record.get("model"), record) + _add_grouped_record(summary["by_process"], record.get("process"), record) + + return summary + + +def write_usage_summary( + *, + usage_path: str | Path, + summary_path: str | Path, +) -> dict[str, Any]: + """Write a JSON token usage summary and return it.""" + summary = build_usage_summary(usage_path) + path = Path(summary_path).expanduser().resolve() + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + json.dumps(summary, ensure_ascii=False, indent=4, sort_keys=True) + "\n", + encoding="utf-8", + ) + return summary + + +def _read_usage_records(path: Path) -> list[dict[str, Any]]: + if not path.is_file(): + return [] + records: list[dict[str, Any]] = [] + for line in path.read_text(encoding="utf-8").splitlines(): + stripped = line.strip() + if not stripped: + continue + try: + parsed = json.loads(stripped) + except json.JSONDecodeError: + continue + if isinstance(parsed, dict): + records.append(parsed) + return records + + +def _empty_bucket() -> dict[str, int]: + bucket = { + "calls": 0, + "calls_with_usage": 0, + } + for field in _TOKEN_FIELDS: + bucket[field] = 0 + return bucket + + +def _add_grouped_record( + groups: dict[str, dict[str, int]], + key: Any, + record: Mapping[str, Any], +) -> None: + group_key = str(key or "unknown") + bucket = groups.setdefault(group_key, _empty_bucket()) + _add_record(bucket, record) + + +def _add_record(bucket: dict[str, int], record: Mapping[str, Any]) -> None: + bucket["calls"] += 1 + if record.get("usage_available"): + bucket["calls_with_usage"] += 1 + for field in _TOKEN_FIELDS: + value = record.get(field) + if isinstance(value, int): + bucket[field] += value + + +def _model_name_from_chat_model(model: Any) -> str | None: + for attr in ("model_name", "model"): + value = getattr(model, attr, None) + if value: + return str(value) + return None + + +def _mapping_value_from_object(value: Any, attr_name: str) -> Mapping[str, Any]: + attr = 
getattr(value, attr_name, None) + return attr if isinstance(attr, Mapping) else {} + + +def _mapping_value(mapping: Mapping[str, Any], key: str) -> Mapping[str, Any]: + value = mapping.get(key) if isinstance(mapping, Mapping) else None + return value if isinstance(value, Mapping) else {} + + +def _first_int(mapping: Mapping[str, Any], *keys: str) -> int | None: + if not isinstance(mapping, Mapping): + return None + for key in keys: + value = mapping.get(key) + if isinstance(value, bool): + continue + if isinstance(value, int): + return value + if isinstance(value, float) and value.is_integer(): + return int(value) + return None + + +def _string_value(mapping: Mapping[str, Any], *keys: str) -> str | None: + if not isinstance(mapping, Mapping): + return None + for key in keys: + value = mapping.get(key) + if isinstance(value, str) and value: + return value + return None + + +def _finish_reason(metadata: Mapping[str, Any]) -> str | None: + reason = _string_value(metadata, "finish_reason") + if reason: + return reason + response_metadata = ( + metadata.get("response_metadata") if isinstance(metadata, Mapping) else None + ) + if isinstance(response_metadata, Mapping): + return _string_value(response_metadata, "finish_reason") + return None + + +def _json_safe(value: Any) -> Any: + try: + json.dumps(value, ensure_ascii=False) + return value + except TypeError: + if isinstance(value, Mapping): + return {str(key): _json_safe(item) for key, item in value.items()} + if isinstance(value, (list, tuple)): + return [_json_safe(item) for item in value] + return str(value) diff --git a/embodichain/gen_sim/action_agent_pipeline/utils/mllm.py b/embodichain/gen_sim/action_agent_pipeline/utils/mllm.py new file mode 100644 index 00000000..f39f2d0f --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/utils/mllm.py @@ -0,0 +1,115 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. 
def apply_proxy_env(proxy_url: str | None) -> None:
    """Export ``proxy_url`` as the process-wide HTTP and HTTPS proxy.

    A falsy ``proxy_url`` leaves the environment untouched.
    """
    if proxy_url:
        for variable in ("HTTP_PROXY", "HTTPS_PROXY"):
            os.environ[variable] = proxy_url
def create_chat_openai(
    *,
    temperature: float = 0.0,
    model: str | None = None,
    config: Mapping[str, Any] | None = None,
    required: bool = True,
    usage_stage: str | None = None,
):
    """Create the shared LangChain OpenAI-compatible chat client for agents.

    Args:
        temperature: Sampling temperature passed to ``ChatOpenAI``.
        model: Model name; falls back to the config's ``model`` and then to
            ``DEFAULT_LLM_MODEL``.
        config: Pre-resolved LLM config. When ``None`` the shared config is
            loaded via ``get_openai_compatible_llm_config``.
        required: Forwarded to the config loader when ``config`` is ``None``.
        usage_stage: Stage label used for usage records.

    Returns:
        A ``UsageTrackedChatModel`` wrapping the ``ChatOpenAI`` instance.
    """
    # Imported lazily so the module can be imported without langchain_openai.
    from langchain_openai import ChatOpenAI

    cfg = _resolve_llm_config(
        config=config,
        required=required,
        require_base_url=False,
    )
    apply_proxy_env(cfg.get("proxy_url"))

    kwargs: dict[str, Any] = {
        "temperature": temperature,
        "model": model or cfg.get("model") or DEFAULT_LLM_MODEL,
        "api_key": cfg["api_key"],
    }
    if cfg.get("base_url"):
        kwargs["base_url"] = cfg["base_url"]
    # Keep parity with ``create_openai_client``: endpoints that need extra
    # query parameters must receive them from the agent client too.
    if cfg.get("default_query"):
        kwargs["default_query"] = cfg["default_query"]
    return UsageTrackedChatModel(
        ChatOpenAI(**kwargs),
        stage=usage_stage,
    )
class _FakeRobot:
    """Minimal robot stand-in exposing only what these tests read."""

    uid = "fake_robot"
    device = torch.device("cpu")
    # Control-part name -> joint indices. Six joints in total, matching the
    # width of the tensor returned by ``get_qpos``.
    control_parts = {
        "left_arm": [0, 1],
        "left_eef": [2],
        "right_arm": [3, 4],
        "right_eef": [5],
    }

    def get_qpos(self):
        """Return an all-zero joint position tensor of shape (1, 6)."""
        return torch.zeros(1, 6)
class _FakeSim:
    """Simulation stand-in holding a single rigid object named "apple"."""

    def __init__(self):
        # The apple sits at xyz = (0.4, -0.2, 0.1).
        self.objects = {"apple": _FakeObject([0.4, -0.2, 0.1])}

    def get_rigid_object(self, uid: str):
        """Return the fake object registered under ``uid``, or ``None``."""
        return self.objects.get(uid)
class _FakeBackendAction:
    """Backend atomic-action stand-in that returns canned trajectories.

    Tests assign a list to the class attribute ``capture`` to record how the
    runtime constructs and executes the action.
    """

    # Shared at class level; ``None`` disables recording.
    capture: list | None = None

    def __init__(self, motion_generator, cfg):
        self.motion_generator = motion_generator
        self.cfg = cfg
        if self.capture is not None:
            self.capture.append(
                {
                    "cfg": self.cfg,
                    "motion_generator": self.motion_generator,
                }
            )

    def execute(self, target, start_qpos=None, **kwargs):
        """Return ``(True, trajectory, joint_ids)`` chosen from ``self.cfg``."""
        if self.capture is not None:
            # Extend the entry appended by ``__init__`` with the call arguments.
            self.capture[-1].update({"target": target, "start_qpos": start_qpos})
        if self.cfg.name in {"pick_up", "place"}:
            # Two waypoints over three joints (arm + eef indices).
            trajectory = torch.tensor(
                [[[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]], dtype=torch.float32
            )
            return (
                True,
                trajectory,
                [0, 1, 2] if "left" in self.cfg.control_part else [3, 4, 5],
            )
        if self.cfg.control_part.endswith("eef"):
            # Two waypoints over the single end-effector joint.
            trajectory = torch.tensor([[[0.0], [0.05]]], dtype=torch.float32)
            return True, trajectory, [2 if "left" in self.cfg.control_part else 5]
        # Two waypoints over the two arm joints.
        trajectory = torch.tensor([[[0.1, 0.2], [0.2, 0.3]]], dtype=torch.float32)
        # The conditional expression binds tighter than the tuple commas, so
        # only the third element depends on the control part.
        return True, trajectory, [0, 1] if "left" in self.cfg.control_part else [3, 4]
"reference": "relative", + "offset": [0.0, 0.0, 0.1], + }, + "target_qpos": {"source": "initial"}, + "cfg": {}, + } + ) + + +def test_normalize_atomic_action_spec_rejects_pickup_pose_target() -> None: + with pytest.raises(ValueError, match="PickUpAction requires control='arm'"): + normalize_atomic_action_spec( + { + "atomic_action_class": "PickUpAction", + "robot_name": "left_arm", + "control": "arm", + "target_pose": { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + }, + "cfg": {}, + } + ) + + +def test_atom_actions_module_exposes_atomic_runtime_entrypoints() -> None: + assert atom_actions.execute_atomic_action is execute_atomic_action + assert atom_actions.normalize_atomic_action_spec is normalize_atomic_action_spec + assert callable(atom_actions.execute_parallel_atomic_actions) + + +def test_object_referenced_pose_builds_move_cfg_and_pose_target(monkeypatch) -> None: + env = _FakeEnv() + capture = [] + _FakeBackendAction.capture = capture + + monkeypatch.setattr( + atom_actions, + "_make_motion_generator", + lambda env: SimpleNamespace(robot=env.robot, device=env.robot.device), + ) + monkeypatch.setattr( + atom_actions, + "_get_atomic_action_class", + lambda atomic_action_class: _FakeBackendAction, + ) + + action = execute_atomic_action( + { + "atomic_action_class": "MoveAction", + "robot_name": "left_arm", + "control": "arm", + "target_pose": { + "reference": "object", + "obj_name": "apple", + "offset": [0.1, 0.2, 0.3], + "orientation": "current", + }, + "cfg": {"sample_interval": 12}, + }, + env=env, + ) + + assert action.shape == (2, 3) + assert isinstance(capture[0]["cfg"], MoveActionCfg) + assert capture[0]["cfg"].control_part == "left_arm" + assert capture[0]["cfg"].sample_interval == 12 + assert capture[0]["target"][:3, 3].tolist() == pytest.approx([0.5, 0.0, 0.4]) + + +def test_gripper_state_qpos_target_interpolates_hand_action(monkeypatch) -> None: + env = _FakeEnv() + capture = [] + _FakeBackendAction.capture = capture + + 
monkeypatch.setattr( + atom_actions, + "_make_motion_generator", + lambda env: SimpleNamespace(robot=env.robot, device=env.robot.device), + ) + monkeypatch.setattr( + atom_actions, + "_get_atomic_action_class", + lambda atomic_action_class: _FakeBackendAction, + ) + + action = execute_atomic_action( + { + "atomic_action_class": "MoveAction", + "robot_name": "left_arm", + "control": "hand", + "target_qpos": {"source": "gripper_state", "state": "open"}, + "cfg": {"sample_interval": 5, "post_hold_steps": 2}, + }, + env=env, + ) + + assert action.shape == (7, 3) + assert capture == [] + assert action[0].tolist() == pytest.approx([0.1, 0.2, 0.0]) + assert action[4].tolist() == pytest.approx([0.1, 0.2, 0.05]) + assert action[-1].tolist() == pytest.approx([0.1, 0.2, 0.05]) + assert env.left_arm_current_gripper_state.tolist() == pytest.approx([0.05]) + + +def test_initial_qpos_target_interpolates_arm_action(monkeypatch) -> None: + env = _FakeEnv() + capture = [] + _FakeBackendAction.capture = capture + + monkeypatch.setattr( + atom_actions, + "_make_motion_generator", + lambda env: SimpleNamespace(robot=env.robot, device=env.robot.device), + ) + monkeypatch.setattr( + atom_actions, + "_get_atomic_action_class", + lambda atomic_action_class: _FakeBackendAction, + ) + + action = execute_atomic_action( + { + "atomic_action_class": "MoveAction", + "robot_name": "right_arm", + "control": "arm", + "target_qpos": {"source": "initial"}, + "cfg": {"sample_interval": 4}, + }, + env=env, + ) + + assert action.shape == (4, 3) + assert capture == [] + assert action[0].tolist() == pytest.approx([0.3, 0.4, 0.0]) + assert action[-1].tolist() == pytest.approx([-0.3, -0.4, 0.0]) + assert env.right_arm_current_qpos.tolist() == pytest.approx([-0.3, -0.4]) + + +def test_target_object_builds_pick_up_cfg(monkeypatch) -> None: + env = _FakeEnv() + capture = [] + _FakeBackendAction.capture = capture + + monkeypatch.setattr( + atom_actions, + "_make_motion_generator", + lambda env: 
SimpleNamespace(robot=env.robot, device=env.robot.device), + ) + monkeypatch.setattr( + atom_actions, + "_get_atomic_action_class", + lambda atomic_action_class: _FakeBackendAction, + ) + + execute_atomic_action( + { + "atomic_action_class": "PickUpAction", + "robot_name": "left_arm", + "control": "arm", + "target_object": { + "obj_name": "apple", + "affordance": "antipodal", + }, + "cfg": { + "pre_grasp_distance": 0.07, + "sample_interval": 11, + }, + }, + env=env, + allow_grasp_annotation=True, + ) + + assert isinstance(capture[0]["cfg"], PickUpActionCfg) + assert capture[0]["cfg"].control_part == "left_arm" + assert capture[0]["cfg"].hand_control_part == "left_eef" + assert capture[0]["cfg"].pre_grasp_distance == pytest.approx(0.07) + assert capture[0]["target"].label == "apple" + + +def test_place_action_builds_place_cfg(monkeypatch) -> None: + env = _FakeEnv() + capture = [] + _FakeBackendAction.capture = capture + + monkeypatch.setattr( + atom_actions, + "_make_motion_generator", + lambda env: SimpleNamespace(robot=env.robot, device=env.robot.device), + ) + monkeypatch.setattr( + atom_actions, + "_get_atomic_action_class", + lambda atomic_action_class: _FakeBackendAction, + ) + + action = execute_atomic_action( + { + "atomic_action_class": "PlaceAction", + "robot_name": "left_arm", + "control": "arm", + "target_pose": { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + }, + "cfg": {"sample_interval": 19, "lift_height": 0.06}, + }, + env=env, + ) + + assert action.shape == (2, 3) + assert isinstance(capture[0]["cfg"], PlaceActionCfg) + assert capture[0]["cfg"].control_part == "left_arm" + assert capture[0]["cfg"].lift_height == pytest.approx(0.06) + + +def test_place_action_rejects_qpos_target(monkeypatch) -> None: + env = _FakeEnv() + monkeypatch.setattr( + atom_actions, + "_make_motion_generator", + lambda env: SimpleNamespace(robot=env.robot, device=env.robot.device), + ) + monkeypatch.setattr( + atom_actions, + 
"_get_atomic_action_class", + lambda atomic_action_class: _FakeBackendAction, + ) + + with pytest.raises( + ValueError, + match="PlaceAction requires control='arm' and target_pose", + ): + execute_atomic_action( + { + "atomic_action_class": "PlaceAction", + "robot_name": "left_arm", + "control": "arm", + "target_qpos": {"source": "initial"}, + "cfg": {"sample_interval": 20}, + }, + env=env, + ) diff --git a/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py b/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py new file mode 100644 index 00000000..a5950be1 --- /dev/null +++ b/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py @@ -0,0 +1,252 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import argparse +import json +import os +import re +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + import torch + +pytestmark = pytest.mark.skipif( + os.environ.get("RUN_DEXSIM_GRASP_TESTS") != "1", + reason="Set RUN_DEXSIM_GRASP_TESTS=1 to run DexSim semantic grasp integration tests.", +) + + +_REPO_ROOT = Path(__file__).resolve().parents[3] +_DEMO3_CONFIG_DIR = ( + _REPO_ROOT / "embodichain/gen_sim/action_agent_pipeline/configs/demo3_text" +) +_MIN_LIFT_M = float(os.environ.get("RUN_DEXSIM_GRASP_MIN_LIFT_M", "0.04")) +_MAX_EEF_DISTANCE_M = float( + os.environ.get("RUN_DEXSIM_GRASP_MAX_EEF_DISTANCE_M", "0.25") +) +_POST_GRASP_HOLD_STEPS = int(os.environ.get("RUN_DEXSIM_GRASP_HOLD_STEPS", "10")) +_PICK_UP_SPEC_RE = re.compile( + r'"atomic_action_class"\s*:\s*"PickUpAction".*?' + r'"robot_name"\s*:\s*"(?P<robot_name>[^"]+)".*?' + r'"obj_name"\s*:\s*"(?P<obj_name>[^"]+)"', + re.DOTALL, +) + + +def _load_demo3_gym_config() -> dict: + return json.loads( + (_DEMO3_CONFIG_DIR / "fast_gym_config.json").read_text(encoding="utf-8") + ) + + +def _configured_rigid_object_uids() -> set[str]: + return { + rigid_object["uid"] + for rigid_object in _load_demo3_gym_config().get("rigid_object", []) + } + + +def _configured_grasp_targets() -> list[tuple[str, str]]: + atom_actions_text = (_DEMO3_CONFIG_DIR / "atom_actions.txt").read_text( + encoding="utf-8" + ) + targets = [ + (match.group("robot_name"), match.group("obj_name")) + for match in _PICK_UP_SPEC_RE.finditer(atom_actions_text) + ] + rigid_object_uids = _configured_rigid_object_uids() + stale_targets = [ + (robot_name, obj_name) + for robot_name, obj_name in targets + if obj_name not in rigid_object_uids + ] + assert not stale_targets, ( + "atom_actions.txt references pick_up objects that are not present in " + f"fast_gym_config.json: stale_targets={stale_targets}, " +
f"rigid_object_uids={sorted(rigid_object_uids)}." + ) + return targets + + +def _configured_grasp_target_for(*keywords: str) -> tuple[str, str]: + lower_keywords = tuple(keyword.lower() for keyword in keywords) + matches = [ + (robot_name, obj_name) + for robot_name, obj_name in _configured_grasp_targets() + if all(keyword in obj_name.lower() for keyword in lower_keywords) + ] + assert matches, ( + f"No configured grasp target matching keywords={keywords}. " + f"grasp_targets={_configured_grasp_targets()}." + ) + assert ( + len(matches) == 1 + ), f"Ambiguous grasp target matching keywords={keywords}: {matches}." + return matches[0] + + +def _write_runtime_gym_config(tmp_path: Path) -> Path: + gym_config = _load_demo3_gym_config() + gym_config["env"]["events"] = {} + gym_config["env"]["dataset"] = {} + gym_config["sensor"] = [] + + runtime_config_path = tmp_path / "demo3_semantic_grasp_gym_config.json" + runtime_config_path.write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + return runtime_config_path + + +def _make_env(tmp_path: Path): + import gymnasium + + from embodichain.lab.gym.utils.gym_utils import build_env_cfg_from_args + from embodichain.utils.utility import load_config + + # Import registers AtomicActionsAgent-v3. 
+ from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware import ( # noqa: F401 + atomic_actions, + ) + + args = argparse.Namespace( + num_envs=1, + device=os.environ.get("RUN_DEXSIM_GRASP_DEVICE", "cpu"), + headless=True, + renderer=os.environ.get("RUN_DEXSIM_GRASP_RENDERER", "hybrid"), + arena_space=float(os.environ.get("RUN_DEXSIM_GRASP_ARENA_SPACE", "5.0")), + gpu_id=int(os.environ.get("RUN_DEXSIM_GRASP_GPU_ID", "0")), + gym_config=str(_write_runtime_gym_config(tmp_path)), + action_config=None, + preview=False, + filter_visual_rand=True, + filter_dataset_saving=True, + ) + env_cfg, gym_config, _ = build_env_cfg_from_args(args) + agent_config_path = _DEMO3_CONFIG_DIR / "agent_config.json" + return gymnasium.make( + id=gym_config["id"], + cfg=env_cfg, + agent_config=load_config(agent_config_path), + agent_config_path=str(agent_config_path), + task_name="Demo3_Text", + ) + + +def _object_xyz(env, obj_name: str) -> torch.Tensor: + pose = env.sim.get_rigid_object(obj_name).get_local_pose(to_matrix=True).squeeze(0) + return pose[:3, 3].detach().cpu() + + +def _arm_eef_xyz(env, robot_name: str) -> torch.Tensor: + left_pose, right_pose = env.get_current_xpos_agent() + pose = left_pose if "left" in robot_name else right_pose + return pose[:3, 3].detach().cpu() + + +def _hold_last_action(env, actions: list, steps: int) -> None: + if steps <= 0 or not actions: + return + last_action = actions[-1] + for _ in range(steps): + env.step(last_action) + + +def _assert_semantic_grasp_lifts_object( + tmp_path: Path, + *, + robot_name: str, + obj_name: str, +) -> None: + import torch + + from embodichain.gen_sim.action_agent_pipeline.runtime.atom_actions import ( + execute_parallel_atomic_actions, + ) + + gym_env = _make_env(tmp_path) + env = gym_env.unwrapped + try: + gym_env.reset() + z_before = float(_object_xyz(env, obj_name)[2]) + action_spec = { + "atomic_action_class": "PickUpAction", + "robot_name": robot_name, + "control": "arm", + "target_object": { + 
"obj_name": obj_name, + "affordance": "antipodal", + }, + "cfg": { + "pre_grasp_distance": 0.08, + "lift_height": 0.14, + "sample_interval": 80, + }, + } + result = execute_parallel_atomic_actions( + left_arm_action=action_spec if "left" in robot_name else None, + right_arm_action=action_spec if "right" in robot_name else None, + env=env, + return_result=True, + allow_grasp_annotation=True, + force_grasp_reannotate=bool( + int(os.environ.get("RUN_DEXSIM_GRASP_FORCE_REANNOTATE", "0")) + ), + ) + _hold_last_action(env, result["actions"], _POST_GRASP_HOLD_STEPS) + + obj_xyz = _object_xyz(env, obj_name) + eef_xyz = _arm_eef_xyz(env, robot_name) + lift = float(obj_xyz[2] - z_before) + eef_distance = float(torch.linalg.norm(obj_xyz - eef_xyz)) + + assert lift >= _MIN_LIFT_M, ( + f"{obj_name} semantic grasp did not lift enough: lift={lift:.4f}m, " + f"required={_MIN_LIFT_M:.4f}m, obj_xyz={obj_xyz.tolist()}, " + f"eef_xyz={eef_xyz.tolist()}." + ) + assert eef_distance <= _MAX_EEF_DISTANCE_M, ( + f"{obj_name} is too far from {robot_name} after grasp: " + f"distance={eef_distance:.4f}m, " + f"required<={_MAX_EEF_DISTANCE_M:.4f}m, " + f"obj_xyz={obj_xyz.tolist()}, eef_xyz={eef_xyz.tolist()}." 
+ ) + finally: + gym_env.close() + + +def test_demo3_semantic_grasp_lifts_orange(tmp_path: Path) -> None: + robot_name, obj_name = _configured_grasp_target_for("orange", "1") + _assert_semantic_grasp_lifts_object( + tmp_path, + robot_name=robot_name, + obj_name=obj_name, + ) + + +def test_demo3_semantic_grasp_lifts_can(tmp_path: Path) -> None: + robot_name, obj_name = _configured_grasp_target_for("can") + _assert_semantic_grasp_lifts_object( + tmp_path, + robot_name=robot_name, + obj_name=obj_name, + ) diff --git a/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py new file mode 100644 index 00000000..5ae7d1e9 --- /dev/null +++ b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py @@ -0,0 +1,112 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import pytest + +from embodichain.gen_sim.action_agent_pipeline.runtime.graph_compiler import ( + compile_agent_graph_spec, +) + + +class _FakeGraph: + def __init__(self, start: str, goal: str, max_transitions: int = 1000) -> None: + self.start = start + self.goal = goal + self.max_transitions = max_transitions + self.nodes = {} + self.edges = {} + + def add_node(self, node_id: str, semantic: str = ""): + self.nodes[node_id] = semantic + return self + + def add_edge( + self, + edge_id: str, + source: str, + target: str, + *, + left_arm_action=None, + right_arm_action=None, + ): + self.edges[edge_id] = { + "source": source, + "target": target, + "left_arm_action": left_arm_action, + "right_arm_action": right_arm_action, + } + return self + + +def _pick_up_spec(robot_name: str, obj_name: str) -> dict: + return { + "atomic_action_class": "PickUpAction", + "robot_name": robot_name, + "control": "arm", + "target_object": { + "obj_name": obj_name, + "affordance": "antipodal", + }, + "cfg": { + "pre_grasp_distance": 0.08, + "sample_interval": 45, + }, + } + + +def _task_graph(action: dict) -> dict: + return { + "task": "unit", + "start": "v0_start", + "goal": "v1_done", + "nodes": [ + {"id": "v0_start"}, + {"id": "v1_done"}, + ], + "edges": [ + { + "id": "e01", + "source": "v0_start", + "target": "v1_done", + "left_arm_action": action, + "right_arm_action": None, + } + ], + } + + +def test_compile_agent_graph_accepts_atomic_action_class_spec() -> None: + action = _pick_up_spec("left_arm", "apple") + graph = compile_agent_graph_spec( + _task_graph(action), + graph_cls=_FakeGraph, + monitor_module={}, + ) + + assert graph.edges["e01"]["left_arm_action"] == action + + +def test_compile_agent_graph_rejects_legacy_action_schema() -> None: + task_graph = _task_graph({"action": "pick_up", "robot_name": "left_arm"}) + + with pytest.raises(ValueError, 
match="Legacy action schema"): + compile_agent_graph_spec( + task_graph, + graph_cls=_FakeGraph, + monitor_module={}, + ) diff --git a/tests/gen_sim/action_agent_pipeline/test_llm_usage.py b/tests/gen_sim/action_agent_pipeline/test_llm_usage.py new file mode 100644 index 00000000..8c85123d --- /dev/null +++ b/tests/gen_sim/action_agent_pipeline/test_llm_usage.py @@ -0,0 +1,161 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import json + +import pytest + +from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( + LLM_USAGE_PATH_ENV, + UsageTrackedChatModel, + build_usage_summary, + configure_usage_tracking, + disable_usage_tracking, + normalize_usage, + record_langchain_usage, + scrub_usage_tracking_env, +) + + +class _FakeLangChainResponse: + usage_metadata = { + "input_tokens": 10, + "output_tokens": 4, + "total_tokens": 14, + "input_token_details": {"cache_read": 3}, + "output_token_details": {"reasoning": 2}, + } + response_metadata = { + "model_name": "gpt-test", + "id": "chatcmpl-test", + "finish_reason": "stop", + } + content = "{}" + + +class _FakeChatModel: + model_name = "gpt-test" + + def __init__(self) -> None: + self.inputs = [] + + def invoke(self, value): + self.inputs.append(value) + return _FakeLangChainResponse() + + +@pytest.fixture(autouse=True) +def _clear_usage_env(): + disable_usage_tracking() + yield + disable_usage_tracking() + + +def test_normalize_usage_handles_openai_and_langchain_shapes(): + openai_usage = { + "prompt_tokens": 11, + "completion_tokens": 5, + "total_tokens": 16, + "prompt_tokens_details": {"cached_tokens": 7}, + "completion_tokens_details": {"reasoning_tokens": 2}, + } + assert normalize_usage(openai_usage) == { + "input_tokens": 11, + "output_tokens": 5, + "total_tokens": 16, + "cached_tokens": 7, + "reasoning_tokens": 2, + } + + langchain_usage = { + "input_tokens": 10, + "output_tokens": 4, + "input_token_details": {"cache_read": 3}, + "output_token_details": {"reasoning": 2}, + } + assert normalize_usage(langchain_usage) == { + "input_tokens": 10, + "output_tokens": 4, + "total_tokens": 14, + "cached_tokens": 3, + "reasoning_tokens": 2, + } + + +def test_record_langchain_usage_writes_jsonl_and_summary(tmp_path): + usage_path = tmp_path / "llm_usage.jsonl" + configure_usage_tracking( + 
usage_path=usage_path, + run_id="test-run", + process_name="pytest", + reset=True, + ) + + record_langchain_usage( + _FakeLangChainResponse(), + stage="Action Agent Task Graph", + model="fallback-model", + ) + + records = [ + json.loads(line) for line in usage_path.read_text(encoding="utf-8").splitlines() + ] + assert len(records) == 1 + assert records[0]["stage"] == "action_agent_task_graph" + assert records[0]["model"] == "gpt-test" + assert records[0]["input_tokens"] == 10 + assert records[0]["output_tokens"] == 4 + + summary = build_usage_summary(usage_path) + assert summary["total"]["calls"] == 1 + assert summary["total"]["total_tokens"] == 14 + assert summary["by_stage"]["action_agent_task_graph"]["cached_tokens"] == 3 + + +def test_usage_tracked_chat_model_records_invoke(tmp_path): + usage_path = tmp_path / "llm_usage.jsonl" + configure_usage_tracking( + usage_path=usage_path, + run_id="test-run", + process_name="pytest", + reset=True, + ) + inner = _FakeChatModel() + wrapped = UsageTrackedChatModel(inner, stage="action_agent.task_graph") + + response = wrapped.invoke("hello") + + assert response.content == "{}" + assert inner.inputs == ["hello"] + record = json.loads(usage_path.read_text(encoding="utf-8").splitlines()[0]) + assert record["stage"] == "action_agent.task_graph" + assert record["total_tokens"] == 14 + + +def test_scrub_usage_tracking_env_removes_usage_keys(tmp_path): + usage_path = tmp_path / "llm_usage.jsonl" + configure_usage_tracking( + usage_path=usage_path, + run_id="test-run", + process_name="pytest", + reset=True, + ) + + cleaned = scrub_usage_tracking_env() + + assert LLM_USAGE_PATH_ENV not in cleaned diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py new file mode 100644 index 00000000..794ba8dd --- /dev/null +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -0,0 +1,1059 @@ +# 
---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from pathlib import Path +import json +import struct + +import pytest +import torch + +from embodichain.gen_sim.action_agent_pipeline.generation import ( + ur5_basket_config as ur5_basket_config_generation, +) +from embodichain.gen_sim.action_agent_pipeline.generation.ur5_basket_config import ( + TargetReplacementSpec, + generate_ur5_basket_config_from_project, +) +from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.success import ( + evaluate_configured_success, +) + + +def test_ur5_basket_generator_uses_parallel_handoff( + tmp_path: Path, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_agent", + target_body_scale=0.6, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + + assert set(rigid_objects) == {"left_apple", "right_apple", "wicker_basket"} + assert rigid_objects["left_apple"]["body_scale"] == [0.6, 0.6, 0.6] + assert 
rigid_objects["right_apple"]["body_scale"] == [0.6, 0.6, 0.6] + assert rigid_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] + assert background_objects["table"]["body_scale"] == [1.0, 1.0, 1.0] + assert rigid_objects["left_apple"]["shape"]["fpath"].endswith( + "mesh_assets/apple/apple_2/apple_2.glb" + ) + assert rigid_objects["right_apple"]["shape"]["fpath"].endswith( + "mesh_assets/apple/apple_1/apple_1.glb" + ) + assert gym_config["robot"]["init_pos"] == [-2.0, 0.0, 0.5] + assert gym_config["robot"]["init_rot"] == [0.0, 0.0, 90.0] + + success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] + assert {term["object"] for term in success_terms} == {"left_apple", "right_apple"} + assert {term["container"] for term in success_terms} == {"wicker_basket"} + + registry = gym_config["env"]["events"]["register_info_to_env"]["params"]["registry"] + registered_uids = {entry["entity_cfg"]["uid"] for entry in registry} + assert registered_uids == {"left_apple", "right_apple", "wicker_basket"} + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + basic_background = paths.basic_background.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + normalized_task_prompt = " ".join(task_prompt.split()) + + assert "Generate exactly 10 nominal edges" in normalized_task_prompt + assert "Generate exactly 11 nominal edges" not in normalized_task_prompt + assert "negative-y side" in basic_background + assert "positive-y side" in basic_background + assert "negative-x side" not in basic_background + assert "positive-x side" not in basic_background + assert '"offset":[0.0,-0.04,0.22]' in task_prompt + assert '"offset":[0.0,0.04,0.22]' in task_prompt + assert '"offset":[-0.04,0.0,0.22]' not in task_prompt + assert '"offset":[0.04,0.0,0.22]' not in task_prompt + assert '"offset":[0.0,-0.04,0.22]' in atom_actions + assert '"offset":[0.0,0.04,0.22]' in atom_actions + assert "parallel handoff" in task_prompt + assert 
"parallel handoff" in basic_background + assert "parallel handoff" in atom_actions + + handoff_edge = task_prompt.split("6. After the left gripper", maxsplit=1)[1].split( + "\n7. Lower the held right target object", + maxsplit=1, + )[0] + assert ( + '"robot_name":"left_arm","control":"arm","target_qpos":{"source":"initial"}' + in handoff_edge + ) + assert ( + '"robot_name":"right_arm","control":"arm","target_pose":{"reference":"object"' + in handoff_edge + ) + assert '"state":"close"' not in handoff_edge + assert "left_arm_action: null" not in handoff_edge + assert paths.summary["mode"] == "basket_template" + + +def test_target_replacements_generate_meshes_and_replace_paths( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + calls = _patch_prompt2geometry(monkeypatch) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_agent", + target_replacements=[ + TargetReplacementSpec("apple_1", "A orange", "new1"), + TargetReplacementSpec("apple_2", "A apple", "new2"), + ], + ) + + assert calls == [ + ("A orange", project_dir / "mesh_assets" / "new1", "orange.glb"), + ("A apple", project_dir / "mesh_assets" / "new2", "apple.glb"), + ] + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + + assert set(rigid_objects) == {"left_apple", "right_apple", "wicker_basket"} + assert rigid_objects["right_apple"]["shape"]["fpath"].endswith( + "mesh_assets/new1/orange.glb" + ) + assert rigid_objects["left_apple"]["shape"]["fpath"].endswith( + "mesh_assets/new2/apple.glb" + ) + assert paths.summary["target_replacements"][0]["source_uid"] == "apple_1" + assert paths.summary["target_replacements"][1]["source_uid"] == "apple_2" + + +def test_target_replacements_can_sync_runtime_names( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir 
= tmp_path / "1790000000_gym_project" + _write_project(project_dir) + _patch_prompt2geometry(monkeypatch) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_agent", + target_replacements=[ + TargetReplacementSpec("apple_2", "A orange", "new1"), + TargetReplacementSpec("apple_1", "A apple", "new2"), + ], + sync_replacement_names=True, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + + assert set(rigid_objects) == {"left_orange", "right_apple", "wicker_basket"} + assert rigid_objects["left_orange"]["shape"]["fpath"].endswith( + "mesh_assets/new1/orange.glb" + ) + assert rigid_objects["right_apple"]["shape"]["fpath"].endswith( + "mesh_assets/new2/apple.glb" + ) + + success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] + assert {term["object"] for term in success_terms} == { + "left_orange", + "right_apple", + } + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + basic_background = paths.basic_background.read_text(encoding="utf-8") + assert "the left orange and right apple into the wicker_basket" in task_prompt + assert "left_arm must only manipulate `left_orange`" in task_prompt + assert "- left_orange: the orange mesh initially" in basic_background + assert "- right_apple: the apple mesh initially" in basic_background + + +def test_directory_input_prefers_merged_config_and_preserves_extra_scene_scale( + tmp_path: Path, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + background_mesh = project_dir / "mesh_assets/backgrounds/vase_0.glb" + background_mesh.parent.mkdir(parents=True, exist_ok=True) + background_mesh.write_bytes(b"") + + merged_config_path = project_dir / "gym_config_merged.json" + source_config = json.loads( + (project_dir / "gym_config.json").read_text(encoding="utf-8") + ) + extra_scene_object = _mesh_object( + "vase_0", + 
"mesh_assets/backgrounds/vase_0.glb", + [0.16, -0.44, 0.77], + [0.0, 0.0, -90.0], + ) + extra_scene_object["body_scale"] = [1.2, 1.1, 0.9] + source_config["rigid_object"].append(extra_scene_object) + merged_config_path.write_text( + json.dumps(source_config, indent=2), + encoding="utf-8", + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_agent", + target_body_scale=0.8, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + + assert set(rigid_objects) == { + "left_apple", + "right_apple", + "wicker_basket", + "vase_0", + } + assert rigid_objects["left_apple"]["body_scale"] == [0.8, 0.8, 0.8] + assert rigid_objects["right_apple"]["body_scale"] == [0.8, 0.8, 0.8] + assert rigid_objects["vase_0"]["body_scale"] == [1.2, 1.1, 0.9] + assert rigid_objects["vase_0"]["shape"]["fpath"].endswith( + "mesh_assets/backgrounds/vase_0.glb" + ) + + +def test_task_description_generates_relative_left_of_config( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + assert kwargs["task_description"] == "把 apple_2 放到 basket_3 左边" + return { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "left_of", + "task_prompt_summary": "Move apple_2 to the left of basket_3.", + "basic_background_notes": "The basket is the spatial reference.", + "action_sketch": [ + "grasp apple_2", + "move to the left side of basket_3", + "release on the table", + ], + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + monkeypatch.setattr( + ur5_basket_config_generation, + "_resolve_table_mesh_world_zmax", + lambda scene_dir, table_obj: None, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / 
"generated_relative_agent", + task_name="AppleLeftOfBasket", + task_description="把 apple_2 放到 basket_3 左边", + target_body_scale=0.5, + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + assert set(rigid_objects) == {"apple_1", "apple_2", "wicker_basket"} + assert rigid_objects["apple_2"]["body_scale"] == [0.5, 0.5, 0.5] + assert rigid_objects["apple_1"]["body_scale"] == [0.5, 0.5, 0.5] + assert rigid_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] + assert background_objects["table"]["body_scale"] == [1.0, 1.0, 1.0] + + success = gym_config["env"]["extensions"]["agent_success"] + assert success["op"] == "all" + axis_terms = { + (term.get("axis"), term.get("offset")) + for term in success["terms"] + if term["type"] == "object_axis_offset_near" + } + assert ("y", -0.16) in axis_terms + assert ("x", 0.0) in axis_terms + + assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + assert "Move apple_2 to the left of basket_3." 
in task_prompt + assert ( + "Generate one deterministic nominal graph with exactly 6 nominal edges" + in task_prompt + ) + assert '"atomic_action_class":"PickUpAction","robot_name":"left_arm"' in task_prompt + assert '"obj_name":"apple_2"' in task_prompt + assert "right_arm_action: null" in task_prompt + assert "Generate exactly 10 nominal edges" not in task_prompt + + assert paths.summary == { + "mode": "relative_placement", + "moved_object": "apple_2", + "reference_object": "wicker_basket", + "relation": "left_of", + "active_arm": "left_arm", + "release_offset": [0.0, -0.16, 0.12], + } + + +def test_task_description_generates_relative_front_of_config( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + assert kwargs["task_description"] == "用右臂把 apple_1 放到 apple_2 前边" + return { + "moved_object": "apple_1", + "reference_object": "apple_2", + "goal_relation": "front_of", + "arm": "right", + "task_prompt_summary": "Move apple_1 in front of apple_2.", + "basic_background_notes": "The apple_2 object is the spatial reference.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + monkeypatch.setattr( + ur5_basket_config_generation, + "_resolve_table_mesh_world_zmax", + lambda scene_dir, table_obj: None, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_front_relative_agent", + task_name="AppleFrontOfApple", + task_description="用右臂把 apple_1 放到 apple_2 前边", + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + success = gym_config["env"]["extensions"]["agent_success"] + assert success["op"] == "all" + axis_terms = { + (term.get("axis"), term.get("offset")) + for term in success["terms"] + if term["type"] == "object_axis_offset_near" + } + assert ("x", -0.16) 
in axis_terms + assert ("y", 0.0) in axis_terms + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert '"offset":[-0.16,0.0,0.22]' in task_prompt + assert '"offset":[-0.16,0.0,0.22]' in atom_actions + + assert paths.summary == { + "mode": "relative_placement", + "moved_object": "apple_1", + "reference_object": "apple_2", + "relation": "front_of", + "active_arm": "right_arm", + "release_offset": [-0.16, 0.0, 0.12], + } + + +def test_task_description_on_container_is_compiled_as_inside( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "apple_1", + "reference_object": "basket_3", + "goal_relation": "on", + "task_prompt_summary": "Release apple_1 above basket_3.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + monkeypatch.setattr( + ur5_basket_config_generation, + "_resolve_table_mesh_world_zmax", + lambda scene_dir, table_obj: None, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_above_container_agent", + task_description="把 apple_1 放到 basket_3 上方然后松手", + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + success = gym_config["env"]["extensions"]["agent_success"] + assert success["type"] == "object_in_container" + assert success["object"] == "apple_1" + assert success["container"] == "wicker_basket" + assert paths.summary["relation"] == "inside" + assert paths.summary["active_arm"] == "right_arm" + + assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] + + +def test_task_description_respects_explicit_left_arm( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + 
_write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "apple_1", + "reference_object": "basket_3", + "goal_relation": "left_of", + "arm": "left", + "task_prompt_summary": "Use the left arm to move apple_1.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + monkeypatch.setattr( + ur5_basket_config_generation, + "_resolve_table_mesh_world_zmax", + lambda scene_dir, table_obj: None, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_left_arm_agent", + task_description="左臂把 apple_1 放到 basket_3 左边", + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] + assert paths.summary["active_arm"] == "left_arm" + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + assert '"atomic_action_class":"PickUpAction","robot_name":"left_arm"' in task_prompt + assert '"obj_name":"apple_1"' in task_prompt + assert "right_arm_action: null" in task_prompt + + +def test_task_description_respects_explicit_right_arm( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "right_of", + "arm": "right", + "task_prompt_summary": "Use the right arm to move apple_2.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_right_arm_agent", + task_description="右臂把 apple_2 放到 basket_3 右边", + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + assert "agent_grasp_pose_overrides" not in 
gym_config["env"]["extensions"] + assert paths.summary["active_arm"] == "right_arm" + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + assert ( + '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in task_prompt + ) + assert '"obj_name":"apple_2"' in task_prompt + assert "left_arm_action: null" in task_prompt + + +def test_task_description_generates_dual_arm_relative_config( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + assert kwargs["task_description"] == ( + "左臂把 apple_2 放到 basket_3 左边,右臂把 apple_1 放到 basket_3 右边" + ) + return { + "placements": [ + { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "left_of", + "arm": "left", + }, + { + "moved_object": "apple_1", + "reference_object": "basket_3", + "goal_relation": "right_of", + "arm": "right", + }, + ], + "task_prompt_summary": "Use both arms for two side placements.", + "basic_background_notes": "Both arms have explicit work.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + monkeypatch.setattr( + ur5_basket_config_generation, + "_resolve_table_mesh_world_zmax", + lambda scene_dir, table_obj: None, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_dual_relative_agent", + task_description=( + "左臂把 apple_2 放到 basket_3 左边,右臂把 apple_1 放到 basket_3 右边" + ), + target_body_scale=0.7, + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] + + success = gym_config["env"]["extensions"]["agent_success"] + assert success["op"] == "all" + assert len(success["terms"]) == 2 + axis_terms = { + (term["object"], term["axis"], term["offset"]) + for placement_success in 
success["terms"] + for term in placement_success["terms"] + if term["type"] == "object_axis_offset_near" + } + assert ("apple_2", "y", -0.16) in axis_terms + assert ("apple_1", "y", 0.16) in axis_terms + + attr_names = { + attr["name"] + for attr in gym_config["env"]["events"]["prepare_extra_attr"]["params"]["attrs"] + } + assert "grasp_pose_object" not in attr_names + + assert paths.summary == { + "mode": "dual_arm_relative_placement", + "placements": [ + { + "moved_object": "apple_2", + "reference_object": "wicker_basket", + "relation": "left_of", + "active_arm": "left_arm", + "release_offset": [0.0, -0.16, 0.12], + }, + { + "moved_object": "apple_1", + "reference_object": "wicker_basket", + "relation": "right_of", + "active_arm": "right_arm", + "release_offset": [0.0, 0.16, 0.12], + }, + ], + } + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + basic_background = paths.basic_background.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert "Generate one deterministic nominal graph with exactly 10 nominal edges" in ( + task_prompt + ) + assert ( + 'left_arm_action: {"atomic_action_class":"PickUpAction","robot_name":"left_arm"' + in task_prompt + ) + assert ( + 'right_arm_action: {"atomic_action_class":"PickUpAction","robot_name":"right_arm"' + in task_prompt + ) + assert ( + '"robot_name":"right_arm","control":"hand","target_qpos":{"source":"gripper_state","state":"close"}' + in task_prompt + ) + assert "The inactive arm must remain null" not in task_prompt + assert "Both arms participate" in basic_background + assert "left_arm moves `apple_2`" in basic_background + assert "right_arm moves `apple_1`" in basic_background + assert ( + '"atomic_action_class":"PickUpAction","robot_name":"left_arm"' in atom_actions + ) + assert '"obj_name":"apple_2"' in atom_actions + assert ( + '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in atom_actions + ) + assert '"obj_name":"apple_1"' in atom_actions 
+ + +def test_task_description_rejects_dual_relative_same_arm( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "placements": [ + { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "left_of", + "arm": "left", + }, + { + "moved_object": "apple_1", + "reference_object": "basket_3", + "goal_relation": "right_of", + "arm": "left", + }, + ], + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + with pytest.raises(ValueError, match="one left arm and one right arm"): + generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "bad_dual_relative_agent", + task_description="双臂分别移动两个苹果", + ) + + +def test_task_description_dual_auto_assigns_complementary_arms( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + gym_config_path = project_dir / "gym_config.json" + source_config = json.loads(gym_config_path.read_text(encoding="utf-8")) + for obj_config in source_config["rigid_object"]: + if obj_config["uid"] == "apple_1": + obj_config["init_pos"][1] = -0.03 + gym_config_path.write_text( + json.dumps(source_config, indent=2), + encoding="utf-8", + ) + + def fake_call_relative_task_llm(**kwargs): + return { + "placements": [ + { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "left_of", + "arm": "auto", + }, + { + "moved_object": "apple_1", + "reference_object": "basket_3", + "goal_relation": "right_of", + "arm": "auto", + }, + ], + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_dual_auto_relative_agent", + 
task_description="双臂分别移动两个苹果", + prewarm_coacd_cache=False, + ) + + active_arms = [placement["active_arm"] for placement in paths.summary["placements"]] + assert active_arms == ["left_arm", "right_arm"] + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] + + +def test_task_description_on_object_uses_object_on_object_success( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "apple_2", + "reference_object": "apple_1", + "goal_relation": "on", + "task_prompt_summary": "Stack apple_2 on apple_1.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_stack_agent", + task_description="把 apple_2 放到 apple_1 上方并松手", + target_body_scale=0.6, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + assert rigid_objects["apple_2"]["body_scale"] == [0.6, 0.6, 0.6] + assert rigid_objects["apple_1"]["body_scale"] == [0.6, 0.6, 0.6] + assert rigid_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] + + success = gym_config["env"]["extensions"]["agent_success"] + assert success["type"] == "object_on_object" + assert success["object"] == "apple_2" + assert success["support"] == "apple_1" + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + assert "on top of `apple_1`" in task_prompt + + +def test_task_description_rejects_unknown_llm_uid( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + 
"moved_object": "missing_bread", + "reference_object": "basket_3", + "goal_relation": "left_of", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + with pytest.raises(ValueError, match="unknown moved_object"): + generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "bad_agent", + task_description="把 missing_bread 放到 basket_3 左边", + ) + + +def test_high_tabletop_scene_adjusts_robot_height_and_light( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + gym_config_path = project_dir / "gym_config.json" + source_config = json.loads(gym_config_path.read_text(encoding="utf-8")) + for obj_config in source_config["rigid_object"]: + obj_config["init_pos"][2] = 0.12 + gym_config_path.write_text( + json.dumps(source_config, indent=2), + encoding="utf-8", + ) + + def fake_resolve_table_mesh_world_zmax( + scene_dir: Path, + table_obj, + ) -> float: + assert scene_dir == project_dir + assert table_obj.source_uid == "table" + return 1.18 + + monkeypatch.setattr( + ur5_basket_config_generation, + "_resolve_table_mesh_world_zmax", + fake_resolve_table_mesh_world_zmax, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_high_table_agent", + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + expected_init_z = ( + 1.18 + + ur5_basket_config_generation._DUAL_UR5_TABLETOP_CLEARANCE + - ur5_basket_config_generation._DUAL_UR5_ARM_COMPONENT_Z + ) + assert gym_config["robot"]["init_pos"][2] == pytest.approx(expected_init_z) + assert gym_config["light"]["direct"][0]["intensity"] == 40.0 + + +def test_table_mesh_world_zmax_reads_glb_vertices(tmp_path: Path) -> None: + scene_dir = tmp_path / "1790000000_gym_project" + mesh_path = scene_dir / "mesh_assets/table/table_0.glb" + _write_minimal_glb( + mesh_path, + [(-0.5, -0.5, 0.0), (0.5, 
-0.5, 1.2), (0.0, 0.5, 0.4)], + ) + table_obj = ur5_basket_config_generation._SceneObject( + source_uid="table", + source_role="background", + config=_mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.1], + [0.0, 0.0, 0.0], + ), + ) + table_obj.config["body_scale"] = [1.0, 1.0, 2.0] + + assert ur5_basket_config_generation._resolve_table_mesh_world_zmax( + scene_dir, + table_obj, + ) == pytest.approx(2.5) + + +def test_object_on_object_success_predicate() -> None: + env = _FakeEnv( + { + "apple_2": [0.0, 0.0, 0.15], + "apple_1": [0.02, 0.01, 0.0], + } + ) + + success = evaluate_configured_success( + env, + { + "type": "object_on_object", + "object": "apple_2", + "support": "apple_1", + "xy_radius": 0.08, + "min_z_offset": 0.02, + "max_z_offset": 0.35, + }, + ) + + assert bool(success.item()) is True + + +def _write_project(project_dir: Path) -> None: + for rel_path in ( + "mesh_assets/table/table_0.glb", + "mesh_assets/basket/basket_3/basket_3.glb", + "mesh_assets/apple/apple_1/apple_1.glb", + "mesh_assets/apple/apple_2/apple_2.glb", + ): + mesh_path = project_dir / rel_path + mesh_path.parent.mkdir(parents=True, exist_ok=True) + mesh_path.write_bytes(b"") + + gym_config = { + "id": "Image2Tabletop-1790000000-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 180.0], + ) + ], + "rigid_object": [ + _mesh_object( + "basket_3", + "mesh_assets/basket/basket_3/basket_3.glb", + [0.0, 0.08, 0.75], + [0.0, 0.0, 180.0], + ), + _mesh_object( + "apple_1", + "mesh_assets/apple/apple_1/apple_1.glb", + [0.38, 0.11, 0.76], + [0.0, 0.0, 140.0], + ), + _mesh_object( + "apple_2", + "mesh_assets/apple/apple_2/apple_2.glb", + [-0.39, -0.12, 0.76], + [0.0, 0.0, 160.0], + ), + ], + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + +def _mesh_object( + uid: str, + fpath: str, + init_pos: list[float], + init_rot: list[float], +) -> dict: 
+ return { + "uid": uid, + "shape": { + "shape_type": "Mesh", + "fpath": fpath, + "compute_uv": False, + }, + "init_pos": init_pos, + "init_rot": init_rot, + "body_scale": [1.0, 1.0, 1.0], + } + + +def _write_minimal_glb( + path: Path, + vertices: list[tuple[float, float, float]], +) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + binary = b"".join(struct.pack(" list: + calls = [] + + def fake_run_prompt2geometry_replacement( + *, + prompt: str, + output_root: Path, + output_name: str, + ) -> dict: + output_root.mkdir(parents=True, exist_ok=True) + mesh_path = output_root / output_name + mesh_path.write_bytes(b"glb") + calls.append((prompt, output_root, output_name)) + return {"scaled_mesh_path": str(mesh_path)} + + monkeypatch.setattr( + ur5_basket_config_generation, + "_run_prompt2geometry_replacement", + fake_run_prompt2geometry_replacement, + ) + return calls + + +class _FakeEnv: + num_envs = 1 + device = torch.device("cpu") + + def __init__(self, positions: dict[str, list[float]]) -> None: + self.sim = _FakeSim(positions) + + +class _FakeSim: + def __init__(self, positions: dict[str, list[float]]) -> None: + self._objects = { + uid: _FakeRigidObject(position) for uid, position in positions.items() + } + + def get_rigid_object(self, uid: str): + return self._objects[uid] + + +class _FakeRigidObject: + def __init__(self, position: list[float]) -> None: + self._position = torch.tensor(position, dtype=torch.float32) + + def get_local_pose(self, to_matrix: bool = True) -> torch.Tensor: + pose = torch.eye(4, dtype=torch.float32).unsqueeze(0) + pose[:, :3, 3] = self._position.reshape(1, 3) + return pose From 2f68e9de4e729e46ead1c39bca82000dd6335ed5 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 14 Jun 2026 11:13:36 +0800 Subject: [PATCH 02/33] change config and image root --- .../cli/run_agent_pipeline.py | 15 +++++---------- .../gym_project_api/image2tabletop_client.py | 5 +++-- 
.../test_demo3_semantic_grasp_integration.py | 12 +++++++++++- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py index abb46a9d..ac08b311 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py @@ -59,24 +59,19 @@ def _repo_root() -> Path: ) _DEFAULT_SERVER = "http://192.168.3.23:4523" -_DEFAULT_IMAGE = ( - _REPO_ROOT - / "embodichain/gen_sim/action_agent_pipeline/gym_project_api/image/demo5.jpg" -) -_DEFAULT_IMAGE_DIR = _DEFAULT_IMAGE.parent _DEFAULT_GYM_PROJECT_ROOT = _REPO_ROOT / "gym_project" +_DEFAULT_ACTION_AGENT_WORKSPACE = _DEFAULT_GYM_PROJECT_ROOT / "action_agent_pipeline" +_DEFAULT_IMAGE = _DEFAULT_ACTION_AGENT_WORKSPACE / "images/demo1.jpg" +_DEFAULT_IMAGE_DIR = _DEFAULT_IMAGE.parent _DEFAULT_EXISTING_GYM_PROJECT = _DEFAULT_GYM_PROJECT_ROOT / "1780562837_gym_project" _DEFAULT_IMAGE2SCENE_ROOT = _REPO_ROOT / "gym_project/environment/image2tabletop" _DEFAULT_IMAGE2SCENE_IMAGE = "scene_image/robotwin_example.png" _DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR = "./downloads" _DEFAULT_IMAGE2SCENE_OUTPUT_ROOT = "./generated" _DEFAULT_IMAGE2SCENE_CONFIG = "./gen_config.json" -_DEFAULT_CONFIG_OUTPUT_DIR = ( - _REPO_ROOT / "embodichain/gen_sim/action_agent_pipeline/configs/demo3_text" -) +_DEFAULT_CONFIG_OUTPUT_DIR = _DEFAULT_ACTION_AGENT_WORKSPACE / "configs/demo3_text" _DEFAULT_PIPELINE_HISTORY = ( - _REPO_ROOT - / "embodichain/gen_sim/action_agent_pipeline/configs/pipeline_history.json" + _DEFAULT_ACTION_AGENT_WORKSPACE / "configs/pipeline_history.json" ) _DEFAULT_TASK_NAME = "Demo3_Text" _DEFAULT_TASK_TEMPLATE_NAMES = frozenset({"Demo1_Text"}) diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/image2tabletop_client.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/image2tabletop_client.py 
index c7901c9f..0da04292 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/image2tabletop_client.py +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/image2tabletop_client.py @@ -46,8 +46,9 @@ def _repo_root() -> Path: return Path.cwd().resolve() -_DEFAULT_IMAGE_INPUT = Path(__file__).resolve().parent / "image" -_DEFAULT_OUTPUT_ROOT = _repo_root() / "gym_project" +_REPO_ROOT = _repo_root() +_DEFAULT_OUTPUT_ROOT = _REPO_ROOT / "gym_project" +_DEFAULT_IMAGE_INPUT = _DEFAULT_OUTPUT_ROOT / "action_agent_pipeline/images" def _server_url(base_url: str, path: str) -> str: diff --git a/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py b/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py index a5950be1..d14308d6 100644 --- a/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py +++ b/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py @@ -35,8 +35,18 @@ _REPO_ROOT = Path(__file__).resolve().parents[3] +_DEFAULT_DEMO3_CONFIG_DIR = ( + _REPO_ROOT / "gym_project/action_agent_pipeline/configs/demo3_text" +) _DEMO3_CONFIG_DIR = ( - _REPO_ROOT / "embodichain/gen_sim/action_agent_pipeline/configs/demo3_text" + Path( + os.environ.get( + "RUN_DEXSIM_GRASP_CONFIG_DIR", + str(_DEFAULT_DEMO3_CONFIG_DIR), + ) + ) + .expanduser() + .resolve() ) _MIN_LIFT_M = float(os.environ.get("RUN_DEXSIM_GRASP_MIN_LIFT_M", "0.04")) _MAX_EEF_DISTANCE_M = float( From 64b04e2e34ec5ee584145431976786129396181c Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 14 Jun 2026 12:51:33 +0800 Subject: [PATCH 03/33] update dexsim0.4.1 --- .../generation/ur5_basket_config.py | 51 ++++++++++++++++--- embodichain/lab/sim/objects/articulation.py | 9 +--- .../test_ur5_basket_config_generation.py | 44 ++++++++++++++++ 3 files changed, 90 insertions(+), 14 deletions(-) diff --git 
a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py index a1bd6b88..0ac025ae 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py @@ -25,6 +25,7 @@ import math import re import struct +import warnings from embodichain.gen_sim.action_agent_pipeline.generation.prompt_builders import ( make_agent_config, @@ -48,6 +49,7 @@ _GYM_CONFIG_FILENAMES = frozenset({"gym_config.json", "gym_config_merged.json"}) _GYM_CONFIG_PREFERENCE = ("gym_config_merged.json", "gym_config.json") _TARGET_REPLACEMENT_MANIFEST_FILENAME = ".embodichain_replacement_manifest.json" +_DEXSIM_041_GLB_LOCAL_X_CORRECTION_DEGREES = -90.0 _CONTAINER_KEYWORDS = ( "basket", @@ -2765,14 +2767,15 @@ def _make_light_config() -> dict[str, Any]: def _make_background_config(scene_dir: Path, obj: _SceneObject) -> dict[str, Any]: + shape = _make_shape_config(scene_dir, obj.config) return { "uid": "table", - "shape": _make_shape_config(scene_dir, obj.config), + "shape": shape, "attrs": dict(_BACKGROUND_ATTRS), "body_scale": _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0])), "body_type": "kinematic", "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), - "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), + "init_rot": _corrected_init_rot_for_shape(obj.config, shape), "max_convex_hull_num": _role_limited_max_convex_hull_num( obj, _BACKGROUND_MAX_CONVEX_HULL_NUM, @@ -2785,9 +2788,10 @@ def _make_extra_background_config( obj: _SceneObject, body_scale: Any | None = None, ) -> dict[str, Any]: + shape = _make_shape_config(scene_dir, obj.config) config = { "uid": _normalize_runtime_uid(obj.source_uid), - "shape": _make_shape_config(scene_dir, obj.config), + "shape": shape, "attrs": copy.deepcopy(dict(obj.config.get("attrs", _BACKGROUND_ATTRS))), "body_scale": 
_clean_vector3( obj.config.get("body_scale", [1.0, 1.0, 1.0]) @@ -2796,7 +2800,7 @@ def _make_extra_background_config( ), "body_type": str(obj.config.get("body_type", "static")), "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), - "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), + "init_rot": _corrected_init_rot_for_shape(obj.config, shape), "max_convex_hull_num": _role_limited_max_convex_hull_num( obj, _BACKGROUND_MAX_CONVEX_HULL_NUM, @@ -2889,12 +2893,13 @@ def _make_rigid_object_config( max_convex_hull_num: int, mesh_fpath: str | Path | None = None, ) -> dict[str, Any]: + shape = _make_shape_config(scene_dir, obj.config, mesh_fpath=mesh_fpath) config = { "uid": runtime_uid, - "shape": _make_shape_config(scene_dir, obj.config, mesh_fpath=mesh_fpath), + "shape": shape, "attrs": dict(_RIGID_OBJECT_ATTRS), "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), - "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), + "init_rot": _corrected_init_rot_for_shape(obj.config, shape), "body_scale": _clean_vector3(body_scale), "max_convex_hull_num": int(max_convex_hull_num), } @@ -2949,6 +2954,40 @@ def _make_shape_config( return shape +def _corrected_init_rot_for_shape( + source_config: Mapping[str, Any], + shape_config: Mapping[str, Any], +) -> list[float]: + init_rot = _clean_vector3(source_config.get("init_rot", [0.0, 0.0, 0.0])) + if not _is_glb_mesh_shape(shape_config): + return init_rot + + from scipy.spatial.transform import Rotation + + source_rotation = Rotation.from_euler("XYZ", init_rot, degrees=True) + correction = Rotation.from_euler( + "X", + _DEXSIM_041_GLB_LOCAL_X_CORRECTION_DEGREES, + degrees=True, + ) + corrected = source_rotation * correction + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Gimbal lock detected.*", + category=UserWarning, + ) + corrected_euler = corrected.as_euler("XYZ", degrees=True) + return [float(value) for value in 
corrected_euler] + + +def _is_glb_mesh_shape(shape_config: Mapping[str, Any]) -> bool: + if shape_config.get("shape_type") != "Mesh": + return False + fpath = shape_config.get("fpath") + return isinstance(fpath, str) and Path(fpath).suffix.lower() == ".glb" + + def _asset_path_for_config(scene_dir: Path, fpath: str) -> str: raw_path = Path(fpath) if raw_path.is_absolute(): diff --git a/embodichain/lab/sim/objects/articulation.py b/embodichain/lab/sim/objects/articulation.py index 477098e9..ed5a9e32 100644 --- a/embodichain/lab/sim/objects/articulation.py +++ b/embodichain/lab/sim/objects/articulation.py @@ -1446,14 +1446,7 @@ def set_joint_drive( drive_args["joint_friction"] = friction[i].cpu().numpy() if armature is not None: drive_args["armature"] = armature[i].cpu().numpy() - try: - self._entities[env_idx].set_drive(**drive_args) - except TypeError as exc: - if "armature" not in drive_args or "armature" not in str(exc): - raise - legacy_drive_args = dict(drive_args) - legacy_drive_args.pop("armature", None) - self._entities[env_idx].set_drive(**legacy_drive_args) + self._entities[env_idx].set_drive(**drive_args) def get_joint_drive( self, diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 794ba8dd..8adfaa78 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -111,6 +111,35 @@ def test_ur5_basket_generator_uses_parallel_handoff( assert paths.summary["mode"] == "basket_template" +def test_generator_applies_dexsim_041_glb_rotation_correction( + tmp_path: Path, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_agent", + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + 
rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + + assert background_objects["table"]["init_rot"] == pytest.approx( + _expected_glb_corrected_rot([0.0, 0.0, 180.0]) + ) + assert rigid_objects["wicker_basket"]["init_rot"] == pytest.approx( + _expected_glb_corrected_rot([0.0, 0.0, 180.0]) + ) + assert rigid_objects["right_apple"]["init_rot"] == pytest.approx( + _expected_glb_corrected_rot([0.0, 0.0, 140.0]) + ) + assert rigid_objects["left_apple"]["init_rot"] == pytest.approx( + _expected_glb_corrected_rot([0.0, 0.0, 160.0]) + ) + + def test_target_replacements_generate_meshes_and_replace_paths( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -961,6 +990,21 @@ def _mesh_object( } +def _expected_glb_corrected_rot(init_rot: list[float]) -> list[float]: + from scipy.spatial.transform import Rotation + + source_rotation = Rotation.from_euler("XYZ", init_rot, degrees=True) + correction = Rotation.from_euler( + "X", + ur5_basket_config_generation._DEXSIM_041_GLB_LOCAL_X_CORRECTION_DEGREES, + degrees=True, + ) + return [ + float(value) + for value in (source_rotation * correction).as_euler("XYZ", degrees=True) + ] + + def _write_minimal_glb( path: Path, vertices: list[tuple[float, float, float]], From adb29c84519c94298a098884e807a130c6cb1d3b Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 14 Jun 2026 18:58:00 +0800 Subject: [PATCH 04/33] fix demo1 basket --- .../generation/ur5_basket_config.py | 121 ++++++++++++++---- .../test_ur5_basket_config_generation.py | 38 ++++-- 2 files changed, 125 insertions(+), 34 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py index 0ac025ae..ed08384b 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ 
b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py @@ -1686,6 +1686,12 @@ def _build_ur5_basket_bundle( "light": _make_light_config(), "background": [ _make_background_config(scene_dir, by_uid[roles.table_source_uid]), + _make_container_background_config( + scene_dir, + by_uid[roles.container_source_uid], + roles.container_runtime_uid, + container_scale, + ), *[ _make_extra_background_config(scene_dir, obj) for obj in extra_background_objects @@ -1706,12 +1712,6 @@ def _build_ur5_basket_bundle( object_scale, replacement_by_source_uid.get(roles.left_target_source_uid), ), - _make_container_object_config( - scene_dir, - by_uid[roles.container_source_uid], - roles.container_runtime_uid, - container_scale, - ), *[ _make_extra_rigid_object_config(scene_dir, obj, _source_body_scale(obj)) for obj in extra_rigid_objects @@ -1772,6 +1772,16 @@ def _build_relative_placement_bundle( rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] by_uid = {obj.source_uid: obj for obj in scene_objects} runtime_uids = _relative_runtime_uid_mapping(rigid_objects) + static_reference_source_uids = _static_relative_reference_source_uids( + spec, + by_uid, + ) + dynamic_rigid_objects = [ + obj for obj in rigid_objects if obj.source_uid not in static_reference_source_uids + ] + static_reference_objects = [ + obj for obj in rigid_objects if obj.source_uid in static_reference_source_uids + ] object_scale = _target_body_scale_vector(target_body_scale) robot_init_z = _estimate_dual_ur5_init_z( scene_dir, @@ -1793,6 +1803,15 @@ def _build_relative_placement_bundle( "light": _make_light_config(), "background": [ _make_background_config(scene_dir, by_uid[spec.table_source_uid]), + *[ + _make_container_background_config( + scene_dir, + obj, + runtime_uids[obj.source_uid], + _relative_object_body_scale(obj, target_scale=object_scale), + ) + for obj in static_reference_objects + ], *[ _make_extra_background_config(scene_dir, obj, object_scale) for 
obj in background_objects @@ -1813,7 +1832,7 @@ def _build_relative_placement_bundle( spec, ), ) - for obj in rigid_objects + for obj in dynamic_rigid_objects ], } return { @@ -1839,6 +1858,19 @@ def _source_body_scale(obj: _SceneObject) -> list[float]: return _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0])) +def _static_relative_reference_source_uids( + spec: _RelativePlacementSpec, + by_uid: Mapping[str, _SceneObject], +) -> set[str]: + moved_source_uids = {placement.moved_source_uid for placement in spec.placements} + return { + placement.reference_source_uid + for placement in spec.placements + if placement.reference_source_uid not in moved_source_uids + and _is_container_like(by_uid[placement.reference_source_uid]) + } + + def _relative_object_body_scale( obj: _SceneObject, *, @@ -2844,6 +2876,23 @@ def _make_container_object_config( ) +def _make_container_background_config( + scene_dir: Path, + obj: _SceneObject, + runtime_uid: str, + body_scale: Any, +) -> dict[str, Any]: + config = _make_container_object_config( + scene_dir, + obj, + runtime_uid, + body_scale, + ) + config["body_type"] = "kinematic" + config["init_rot"] = _corrected_init_rot_for_shape(obj.config, config["shape"]) + return config + + def _make_extra_rigid_object_config( scene_dir: Path, obj: _SceneObject, @@ -3011,21 +3060,26 @@ def _validate_bundle(bundle: Mapping[str, Any], roles: _BasketTaskRoles) -> None raise ValueError("Generated UR5 basket config must use DualUR5.") rigid_uids = {obj["uid"] for obj in gym_config.get("rigid_object", [])} - required = { + background_uids = {obj["uid"] for obj in gym_config.get("background", [])} + scene_uids = rigid_uids | background_uids + required_rigid = { roles.left_target_runtime_uid, roles.right_target_runtime_uid, - roles.container_runtime_uid, } - if not required.issubset(rigid_uids): + if not required_rigid.issubset(rigid_uids): + raise ValueError( + f"Generated rigid objects missing: {sorted(required_rigid - rigid_uids)}" + ) + if 
roles.container_runtime_uid not in scene_uids: raise ValueError( - f"Generated rigid objects missing: {sorted(required - rigid_uids)}" + f"Generated scene objects missing container: {roles.container_runtime_uid}" ) success = gym_config["env"]["extensions"]["agent_success"] for term in success.get("terms", []): if ( term.get("object") not in rigid_uids - or term.get("container") not in rigid_uids + or term.get("container") not in scene_uids ): raise ValueError(f"Invalid success term uid reference: {term}") @@ -3040,36 +3094,50 @@ def _validate_relative_bundle( if gym_config.get("robot", {}).get("uid") != "DualUR5": raise ValueError("Generated relative placement config must use DualUR5.") - rigid_uids = [obj["uid"] for obj in gym_config.get("rigid_object", [])] - if len(rigid_uids) != len(set(rigid_uids)): - raise ValueError(f"Duplicate rigid object runtime uid(s): {rigid_uids}") - required = { - uid - for placement in spec.placements - for uid in (placement.moved_runtime_uid, placement.reference_runtime_uid) + rigid_uid_list = [obj["uid"] for obj in gym_config.get("rigid_object", [])] + if len(rigid_uid_list) != len(set(rigid_uid_list)): + raise ValueError(f"Duplicate rigid object runtime uid(s): {rigid_uid_list}") + rigid_uids = set(rigid_uid_list) + background_uids = {obj["uid"] for obj in gym_config.get("background", [])} + scene_uids = rigid_uids | background_uids + moved_required = {placement.moved_runtime_uid for placement in spec.placements} + missing_moved = moved_required - rigid_uids + if missing_moved: + raise ValueError( + f"Generated relative config missing moved rigid object(s): {missing_moved}" + ) + reference_required = { + placement.reference_runtime_uid for placement in spec.placements } - missing = required - set(rigid_uids) - if missing: + missing_reference = reference_required - scene_uids + if missing_reference: raise ValueError( - f"Generated relative config missing rigid object(s): {missing}" + f"Generated relative config missing reference 
object(s): {missing_reference}" ) _validate_success_uids( gym_config["env"]["extensions"]["agent_success"], - set(rigid_uids), + rigid_uids=rigid_uids, + scene_uids=scene_uids, ) registry = gym_config["env"]["events"]["register_info_to_env"]["params"]["registry"] registered = {entry["entity_cfg"]["uid"] for entry in registry} + required = moved_required | reference_required if not required.issubset(registered): raise ValueError( f"Relative config registry missing: {sorted(required - registered)}" ) -def _validate_success_uids(success: Mapping[str, Any], rigid_uids: set[str]) -> None: +def _validate_success_uids( + success: Mapping[str, Any], + *, + rigid_uids: set[str], + scene_uids: set[str], +) -> None: if success.get("op") in {"all", "and", "any", "or"}: for term in success.get("terms", []): - _validate_success_uids(term, rigid_uids) + _validate_success_uids(term, rigid_uids=rigid_uids, scene_uids=scene_uids) return success_type = str(success.get("type", success.get("func", ""))).lower() @@ -3089,7 +3157,8 @@ def _validate_success_uids(success: Mapping[str, Any], rigid_uids: set[str]) -> for key in required_keys: uid = success.get(key) - if uid not in rigid_uids: + valid_uids = rigid_uids if key == "object" else scene_uids + if uid not in valid_uids: raise ValueError(f"Invalid success uid reference {key}={uid!r}.") diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 8adfaa78..df6a3488 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -51,11 +51,12 @@ def test_ur5_basket_generator_uses_parallel_handoff( rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} background_objects = {obj["uid"]: obj for obj in gym_config["background"]} - assert set(rigid_objects) == {"left_apple", "right_apple", "wicker_basket"} + assert 
set(rigid_objects) == {"left_apple", "right_apple"} assert rigid_objects["left_apple"]["body_scale"] == [0.6, 0.6, 0.6] assert rigid_objects["right_apple"]["body_scale"] == [0.6, 0.6, 0.6] - assert rigid_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] assert background_objects["table"]["body_scale"] == [1.0, 1.0, 1.0] + assert background_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] + assert background_objects["wicker_basket"]["body_type"] == "kinematic" assert rigid_objects["left_apple"]["shape"]["fpath"].endswith( "mesh_assets/apple/apple_2/apple_2.glb" ) @@ -129,7 +130,7 @@ def test_generator_applies_dexsim_041_glb_rotation_correction( assert background_objects["table"]["init_rot"] == pytest.approx( _expected_glb_corrected_rot([0.0, 0.0, 180.0]) ) - assert rigid_objects["wicker_basket"]["init_rot"] == pytest.approx( + assert background_objects["wicker_basket"]["init_rot"] == pytest.approx( _expected_glb_corrected_rot([0.0, 0.0, 180.0]) ) assert rigid_objects["right_apple"]["init_rot"] == pytest.approx( @@ -165,7 +166,11 @@ def test_target_replacements_generate_meshes_and_replace_paths( gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} - assert set(rigid_objects) == {"left_apple", "right_apple", "wicker_basket"} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + + assert set(rigid_objects) == {"left_apple", "right_apple"} + assert "wicker_basket" in background_objects + assert background_objects["wicker_basket"]["body_type"] == "kinematic" assert rigid_objects["right_apple"]["shape"]["fpath"].endswith( "mesh_assets/new1/orange.glb" ) @@ -197,7 +202,11 @@ def test_target_replacements_can_sync_runtime_names( gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} - assert set(rigid_objects) == {"left_orange", "right_apple", "wicker_basket"} 
+ background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + + assert set(rigid_objects) == {"left_orange", "right_apple"} + assert "wicker_basket" in background_objects + assert background_objects["wicker_basket"]["body_type"] == "kinematic" assert rigid_objects["left_orange"]["shape"]["fpath"].endswith( "mesh_assets/new1/orange.glb" ) @@ -257,9 +266,11 @@ def test_directory_input_prefers_merged_config_and_preserves_extra_scene_scale( assert set(rigid_objects) == { "left_apple", "right_apple", - "wicker_basket", "vase_0", } + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + assert "wicker_basket" in background_objects + assert background_objects["wicker_basket"]["body_type"] == "kinematic" assert rigid_objects["left_apple"]["body_scale"] == [0.8, 0.8, 0.8] assert rigid_objects["right_apple"]["body_scale"] == [0.8, 0.8, 0.8] assert rigid_objects["vase_0"]["body_scale"] == [1.2, 1.1, 0.9] @@ -313,11 +324,15 @@ def fake_call_relative_task_llm(**kwargs): gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} background_objects = {obj["uid"]: obj for obj in gym_config["background"]} - assert set(rigid_objects) == {"apple_1", "apple_2", "wicker_basket"} + assert set(rigid_objects) == {"apple_1", "apple_2"} assert rigid_objects["apple_2"]["body_scale"] == [0.5, 0.5, 0.5] assert rigid_objects["apple_1"]["body_scale"] == [0.5, 0.5, 0.5] - assert rigid_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] assert background_objects["table"]["body_scale"] == [1.0, 1.0, 1.0] + assert background_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] + assert background_objects["wicker_basket"]["body_type"] == "kinematic" + assert background_objects["wicker_basket"]["init_rot"] == pytest.approx( + _expected_glb_corrected_rot([0.0, 0.0, 180.0]) + ) success = gym_config["env"]["extensions"]["agent_success"] assert success["op"] == "all" @@ 
-492,6 +507,13 @@ def fake_call_relative_task_llm(**kwargs): ) gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + assert set(rigid_objects) == {"apple_1", "apple_2"} + assert background_objects["wicker_basket"]["body_type"] == "kinematic" + assert background_objects["wicker_basket"]["init_rot"] == pytest.approx( + _expected_glb_corrected_rot([0.0, 0.0, 180.0]) + ) assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] assert paths.summary["active_arm"] == "left_arm" From f13f4813758322e092848d9744ff03f3bb2425cc Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 14 Jun 2026 19:04:04 +0800 Subject: [PATCH 05/33] fix: normalize mesh frame generation --- .../generation/mesh_frame_normalization.py | 198 ++++++++++++++++++ .../generation/ur5_basket_config.py | 41 +++- 2 files changed, 231 insertions(+), 8 deletions(-) create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py new file mode 100644 index 00000000..64a011dc --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py @@ -0,0 +1,198 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from dataclasses import dataclass +from importlib import metadata +from pathlib import Path +from typing import Any +import hashlib +import json +import math +import re + +__all__ = [ + "GLB_LOCAL_X_CORRECTION_DEGREES", + "MESH_FRAME_NORMALIZATION_POLICY_VERSION", + "MeshFrameNormalizer", + "NormalizedMeshResult", +] + + +MESH_FRAME_NORMALIZATION_POLICY_VERSION = "action_agent_glb_rx_minus_90_obj_v1" +GLB_LOCAL_X_CORRECTION_DEGREES = -90.0 + +_SAFE_STEM_RE = re.compile(r"[^0-9a-zA-Z_.-]+") + + +@dataclass(frozen=True) +class NormalizedMeshResult: + """A normalized mesh path and metadata for generation summaries.""" + + source_path: Path + normalized_path: Path + source_sha256: str + status: str + transform: list[list[float]] + dexsim_engine_version: str + + def to_summary(self) -> dict[str, Any]: + return { + "source_path": self.source_path.as_posix(), + "normalized_path": self.normalized_path.as_posix(), + "source_sha256": self.source_sha256, + "status": self.status, + "policy_version": MESH_FRAME_NORMALIZATION_POLICY_VERSION, + "dexsim_engine_version": self.dexsim_engine_version, + "transform": self.transform, + } + + +class MeshFrameNormalizer: + """Normalize GLB meshes to OBJ so visual and collision share one frame.""" + + def __init__( + self, + *, + output_dir: str | Path, + local_x_correction_degrees: float = GLB_LOCAL_X_CORRECTION_DEGREES, + ) -> None: + self.output_dir = Path(output_dir).expanduser().resolve() + 
self.local_x_correction_degrees = float(local_x_correction_degrees) + self.transform = _rotation_x_matrix4(self.local_x_correction_degrees) + self.dexsim_engine_version = _dexsim_engine_version() + self._results_by_source: dict[Path, NormalizedMeshResult] = {} + self._reports: list[dict[str, Any]] = [] + + @property + def reports(self) -> list[dict[str, Any]]: + return list(self._reports) + + def normalize_path(self, mesh_path: str | Path) -> Path: + """Return a runtime mesh path, normalizing GLB/GLTF inputs to OBJ.""" + + path = Path(mesh_path).expanduser().resolve() + if path.suffix.lower() not in {".glb", ".gltf"}: + return path + + cached = self._results_by_source.get(path) + if cached is not None: + return cached.normalized_path + + source_sha256 = _file_sha256(path) + normalized_path = self._normalized_path_for(path, source_sha256) + status = "reused" if normalized_path.is_file() else "generated" + if status == "generated": + self._write_normalized_obj(path, normalized_path, source_sha256) + + result = NormalizedMeshResult( + source_path=path, + normalized_path=normalized_path, + source_sha256=source_sha256, + status=status, + transform=self.transform, + dexsim_engine_version=self.dexsim_engine_version, + ) + self._results_by_source[path] = result + self._reports.append(result.to_summary()) + return normalized_path + + def _normalized_path_for(self, mesh_path: Path, source_sha256: str) -> Path: + stem = _SAFE_STEM_RE.sub("_", mesh_path.stem).strip("._") or "mesh" + filename = ( + f"{stem}_{source_sha256[:12]}_" + f"{MESH_FRAME_NORMALIZATION_POLICY_VERSION}.obj" + ) + return self.output_dir / filename + + def _write_normalized_obj( + self, + source_path: Path, + normalized_path: Path, + source_sha256: str, + ) -> None: + trimesh = _require_trimesh() + scene = trimesh.load(str(source_path), force="scene") + mesh = _scene_to_world_mesh(scene) + mesh.apply_transform(self.transform) + + normalized_path.parent.mkdir(parents=True, exist_ok=True) + obj_payload = 
mesh.export(file_type="obj") + if isinstance(obj_payload, bytes): + obj_text = obj_payload.decode("utf-8") + else: + obj_text = str(obj_payload) + + header = "\n".join( + [ + "# EmbodiChain action-agent normalized mesh", + f"# policy_version: {MESH_FRAME_NORMALIZATION_POLICY_VERSION}", + f"# dexsim_engine_version: {self.dexsim_engine_version}", + f"# source_path: {source_path.as_posix()}", + f"# source_sha256: {source_sha256}", + f"# transform: {json.dumps(self.transform, separators=(',', ':'))}", + "", + ] + ) + normalized_path.write_text(header + obj_text, encoding="utf-8") + + +def _scene_to_world_mesh(scene: Any) -> Any: + try: + mesh = scene.dump(concatenate=True) + except AttributeError: + mesh = scene + if not hasattr(mesh, "vertices") or len(mesh.vertices) == 0: + raise ValueError("Mesh contains no vertices.") + return mesh + + +def _rotation_x_matrix4(degrees: float) -> list[list[float]]: + radians = math.radians(degrees) + cos_value = math.cos(radians) + sin_value = math.sin(radians) + return [ + [1.0, 0.0, 0.0, 0.0], + [0.0, cos_value, -sin_value, 0.0], + [0.0, sin_value, cos_value, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + + +def _file_sha256(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as file: + for chunk in iter(lambda: file.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def _dexsim_engine_version() -> str: + for package_name in ("dexsim-engine", "dexsim_engine"): + try: + return metadata.version(package_name) + except metadata.PackageNotFoundError: + continue + return "unknown" + + +def _require_trimesh() -> Any: + try: + import trimesh + except ImportError as exc: + raise ImportError("trimesh is required to normalize GLB meshes.") from exc + return trimesh diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py index ed08384b..e0d92e38 100644 --- 
a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py @@ -25,8 +25,10 @@ import math import re import struct -import warnings +from embodichain.gen_sim.action_agent_pipeline.generation.mesh_frame_normalization import ( + MeshFrameNormalizer, +) from embodichain.gen_sim.action_agent_pipeline.generation.prompt_builders import ( make_agent_config, make_basket_atom_actions_prompt, @@ -49,7 +51,6 @@ _GYM_CONFIG_FILENAMES = frozenset({"gym_config.json", "gym_config_merged.json"}) _GYM_CONFIG_PREFERENCE = ("gym_config_merged.json", "gym_config.json") _TARGET_REPLACEMENT_MANIFEST_FILENAME = ".embodichain_replacement_manifest.json" -_DEXSIM_041_GLB_LOCAL_X_CORRECTION_DEGREES = -90.0 _CONTAINER_KEYWORDS = ( "basket", @@ -125,6 +126,7 @@ _TARGET_MAX_CONVEX_HULL_NUM = 16 _CONTAINER_MAX_CONVEX_HULL_NUM = 8 _EXTRA_RIGID_MAX_CONVEX_HULL_NUM = 1 +_TABLETOP_OBJECT_CLEARANCE = 0.003 _GLB_JSON_CHUNK_TYPE = 0x4E4F534A _GLB_BINARY_CHUNK_TYPE = 0x004E4942 _GLTF_COMPONENT_FORMATS = { @@ -299,12 +301,18 @@ def generate_ur5_basket_config_from_project( Paths of generated config files. 
""" + output_dir_path = Path(output_dir).expanduser().resolve() + _raise_if_generated_files_exist(output_dir_path, overwrite) + input_path = Path(gym_project).expanduser().resolve() gym_config_path = _resolve_gym_config_path(input_path) scene_dir = gym_config_path.parent source_config = _read_json(gym_config_path) project_name = _infer_project_name(input_path, scene_dir) replacement_specs = _normalize_target_replacements(target_replacements) + mesh_normalizer = MeshFrameNormalizer( + output_dir=output_dir_path / "mesh_assets" / "normalized" + ) scene_objects = _collect_scene_objects(source_config) if task_description: @@ -328,12 +336,14 @@ def generate_ur5_basket_config_from_project( target_body_scale=target_body_scale, max_episodes=max_episodes, max_episode_steps=max_episode_steps, + mesh_normalizer=mesh_normalizer, ) _validate_relative_bundle(bundle, spec) + _attach_mesh_normalization_summary(bundle, mesh_normalizer) if prewarm_coacd_cache: _attach_coacd_cache_summary(bundle) return _write_config_bundle( - output_dir=Path(output_dir).expanduser().resolve(), + output_dir=output_dir_path, bundle=bundle, overwrite=overwrite, ) @@ -369,12 +379,14 @@ def generate_ur5_basket_config_from_project( target_replacements=resolved_replacements, max_episodes=max_episodes, max_episode_steps=max_episode_steps, + mesh_normalizer=mesh_normalizer, ) _validate_bundle(bundle, roles) + _attach_mesh_normalization_summary(bundle, mesh_normalizer) if prewarm_coacd_cache: _attach_coacd_cache_summary(bundle) return _write_config_bundle( - output_dir=Path(output_dir).expanduser().resolve(), + output_dir=output_dir_path, bundle=bundle, overwrite=overwrite, ) @@ -1643,6 +1655,7 @@ def _build_ur5_basket_bundle( target_replacements: Sequence[_ResolvedTargetReplacement], max_episodes: int, max_episode_steps: int, + mesh_normalizer: MeshFrameNormalizer, ) -> dict[str, Any]: scene_objects = _collect_scene_objects(source_config) by_uid = {obj.source_uid: obj for obj in scene_objects} @@ -1666,10 
+1679,13 @@ def _build_ur5_basket_bundle( for obj in scene_objects if obj.source_role == "background" and obj.source_uid != roles.table_source_uid ] - robot_init_z = _estimate_dual_ur5_init_z( + table_config = _make_background_config( scene_dir, by_uid[roles.table_source_uid], + mesh_normalizer, ) + table_top_z = _mesh_config_world_zmax(table_config) + robot_init_z = _dual_ur5_init_z_from_table_top(table_top_z) gym_config = { "id": "AtomicActionsAgent-v3", @@ -1685,15 +1701,16 @@ def _build_ur5_basket_bundle( "sensor": _make_sensor_config(), "light": _make_light_config(), "background": [ - _make_background_config(scene_dir, by_uid[roles.table_source_uid]), + table_config, _make_container_background_config( scene_dir, by_uid[roles.container_source_uid], roles.container_runtime_uid, container_scale, + mesh_normalizer, ), *[ - _make_extra_background_config(scene_dir, obj) + _make_extra_background_config(scene_dir, obj, mesh_normalizer) for obj in extra_background_objects ], ], @@ -1704,6 +1721,7 @@ def _build_ur5_basket_bundle( roles.right_target_runtime_uid, object_scale, replacement_by_source_uid.get(roles.right_target_source_uid), + mesh_normalizer, ), _make_target_object_config( scene_dir, @@ -1711,13 +1729,20 @@ def _build_ur5_basket_bundle( roles.left_target_runtime_uid, object_scale, replacement_by_source_uid.get(roles.left_target_source_uid), + mesh_normalizer, ), *[ - _make_extra_rigid_object_config(scene_dir, obj, _source_body_scale(obj)) + _make_extra_rigid_object_config( + scene_dir, + obj, + _source_body_scale(obj), + mesh_normalizer, + ) for obj in extra_rigid_objects ], ], } + _apply_tabletop_z_placement(gym_config, table_top_z) return { "gym_config": gym_config, "agent_config": make_agent_config(), From 0957ecb507960aa864f4935a28a03cb7918d6c25 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 14 Jun 2026 20:39:33 +0800 Subject: [PATCH 06/33] fix temp glb 90 but without material --- 
.../generation/mesh_frame_normalization.py | 133 +++++- .../generation/ur5_basket_config.py | 339 ++++++++----- .../test_ur5_basket_config_generation.py | 447 ++++++++++++++---- 3 files changed, 708 insertions(+), 211 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py index 64a011dc..7437afec 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py @@ -26,6 +26,7 @@ import re __all__ = [ + "GLB_TO_OBJ_BAKED_X_ROTATION_DEGREES", "GLB_LOCAL_X_CORRECTION_DEGREES", "MESH_FRAME_NORMALIZATION_POLICY_VERSION", "MeshFrameNormalizer", @@ -33,8 +34,9 @@ ] -MESH_FRAME_NORMALIZATION_POLICY_VERSION = "action_agent_glb_rx_minus_90_obj_v1" -GLB_LOCAL_X_CORRECTION_DEGREES = -90.0 +MESH_FRAME_NORMALIZATION_POLICY_VERSION = "action_agent_glb_scene_obj_v2" +GLB_TO_OBJ_BAKED_X_ROTATION_DEGREES = 0.0 +GLB_LOCAL_X_CORRECTION_DEGREES = GLB_TO_OBJ_BAKED_X_ROTATION_DEGREES _SAFE_STEM_RE = re.compile(r"[^0-9a-zA-Z_.-]+") @@ -69,7 +71,7 @@ def __init__( self, *, output_dir: str | Path, - local_x_correction_degrees: float = GLB_LOCAL_X_CORRECTION_DEGREES, + local_x_correction_degrees: float = GLB_TO_OBJ_BAKED_X_ROTATION_DEGREES, ) -> None: self.output_dir = Path(output_dir).expanduser().resolve() self.local_x_correction_degrees = float(local_x_correction_degrees) @@ -91,6 +93,11 @@ def normalize_path(self, mesh_path: str | Path) -> Path: cached = self._results_by_source.get(path) if cached is not None: + if cached.normalized_path.is_file(): + obj_text = _repair_obj_material_reference(cached.normalized_path) + self._ensure_material_library( + _obj_material_names(obj_text) or {"material_0"} + ) return cached.normalized_path source_sha256 = _file_sha256(path) @@ -98,6 +105,11 @@ def normalize_path(self, mesh_path: str | Path) -> Path: 
status = "reused" if normalized_path.is_file() else "generated" if status == "generated": self._write_normalized_obj(path, normalized_path, source_sha256) + else: + obj_text = _repair_obj_material_reference(normalized_path) + self._ensure_material_library( + _obj_material_names(obj_text) or {"material_0"} + ) result = NormalizedMeshResult( source_path=path, @@ -113,11 +125,54 @@ def normalize_path(self, mesh_path: str | Path) -> Path: def _normalized_path_for(self, mesh_path: Path, source_sha256: str) -> Path: stem = _SAFE_STEM_RE.sub("_", mesh_path.stem).strip("._") or "mesh" - filename = ( - f"{stem}_{source_sha256[:12]}_" - f"{MESH_FRAME_NORMALIZATION_POLICY_VERSION}.obj" + stem = stem[:32].strip("._") or "mesh" + runtime_hash = hashlib.sha256( + json.dumps( + { + "source_sha256": source_sha256, + "policy_version": MESH_FRAME_NORMALIZATION_POLICY_VERSION, + "dexsim_engine_version": self.dexsim_engine_version, + "transform": self.transform, + }, + sort_keys=True, + separators=(",", ":"), + ).encode("utf-8") + ).hexdigest() + return self.output_dir / f"{stem}_{runtime_hash[:16]}.obj" + + def _material_path(self) -> Path: + return self.output_dir / "material.mtl" + + def _ensure_material_library(self, material_names: set[str]) -> None: + if not material_names: + return + + material_path = self._material_path() + existing_names = _read_material_names(material_path) + all_names = sorted(existing_names | material_names) + material_path.write_text( + "\n".join( + [ + "# EmbodiChain action-agent normalized mesh materials", + *[ + "\n".join( + [ + f"newmtl {name}", + "Ka 0.8 0.8 0.8", + "Kd 0.8 0.8 0.8", + "Ks 0.0 0.0 0.0", + "Ns 1.0", + "d 1.0", + "illum 2", + ] + ) + for name in all_names + ], + "", + ] + ), + encoding="utf-8", ) - return self.output_dir / filename def _write_normalized_obj( self, @@ -128,7 +183,8 @@ def _write_normalized_obj( trimesh = _require_trimesh() scene = trimesh.load(str(source_path), force="scene") mesh = _scene_to_world_mesh(scene) - 
mesh.apply_transform(self.transform) + if self.local_x_correction_degrees: + mesh.apply_transform(self.transform) normalized_path.parent.mkdir(parents=True, exist_ok=True) obj_payload = mesh.export(file_type="obj") @@ -136,6 +192,7 @@ def _write_normalized_obj( obj_text = obj_payload.decode("utf-8") else: obj_text = str(obj_payload) + obj_text = _ensure_obj_material_reference(obj_text) header = "\n".join( [ @@ -149,19 +206,75 @@ def _write_normalized_obj( ] ) normalized_path.write_text(header + obj_text, encoding="utf-8") + self._ensure_material_library(_obj_material_names(obj_text) or {"material_0"}) def _scene_to_world_mesh(scene: Any) -> Any: - try: + if hasattr(scene, "to_geometry"): + mesh = scene.to_geometry() + elif hasattr(scene, "dump"): mesh = scene.dump(concatenate=True) - except AttributeError: + else: mesh = scene if not hasattr(mesh, "vertices") or len(mesh.vertices) == 0: raise ValueError("Mesh contains no vertices.") return mesh +def _obj_material_names(obj_text: str) -> set[str]: + names: set[str] = set() + for line in obj_text.splitlines(): + if not line.startswith("usemtl "): + continue + name = line.split(maxsplit=1)[1].strip() + if name: + names.add(name) + return names + + +def _repair_obj_material_reference(obj_path: Path) -> str: + obj_text = obj_path.read_text(encoding="utf-8") + repaired = _ensure_obj_material_reference(obj_text) + if repaired != obj_text: + obj_path.write_text(repaired, encoding="utf-8") + return repaired + + +def _ensure_obj_material_reference(obj_text: str) -> str: + has_mtllib = any(line.startswith("mtllib ") for line in obj_text.splitlines()) + has_usemtl = any(line.startswith("usemtl ") for line in obj_text.splitlines()) + prefix = [] + if not has_mtllib: + prefix.append("mtllib material.mtl") + if not has_usemtl: + prefix.append("usemtl material_0") + if not prefix: + return obj_text + return "\n".join(prefix) + "\n" + obj_text + + +def _read_material_names(material_path: Path) -> set[str]: + if not 
material_path.is_file(): + return set() + + names: set[str] = set() + for line in material_path.read_text(encoding="utf-8").splitlines(): + if not line.startswith("newmtl "): + continue + name = line.split(maxsplit=1)[1].strip() + if name: + names.add(name) + return names + + def _rotation_x_matrix4(degrees: float) -> list[list[float]]: + if degrees == 0.0: + return [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] radians = math.radians(degrees) cos_value = math.cos(radians) sin_value = math.sin(radians) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py index e0d92e38..484b855b 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py @@ -1720,16 +1720,16 @@ def _build_ur5_basket_bundle( by_uid[roles.right_target_source_uid], roles.right_target_runtime_uid, object_scale, - replacement_by_source_uid.get(roles.right_target_source_uid), mesh_normalizer, + replacement_by_source_uid.get(roles.right_target_source_uid), ), _make_target_object_config( scene_dir, by_uid[roles.left_target_source_uid], roles.left_target_runtime_uid, object_scale, - replacement_by_source_uid.get(roles.left_target_source_uid), mesh_normalizer, + replacement_by_source_uid.get(roles.left_target_source_uid), ), *[ _make_extra_rigid_object_config( @@ -1779,6 +1779,15 @@ def _attach_coacd_cache_summary(bundle: dict[str, Any]) -> None: ) +def _attach_mesh_normalization_summary( + bundle: dict[str, Any], + mesh_normalizer: MeshFrameNormalizer, +) -> None: + reports = mesh_normalizer.reports + if reports: + bundle.setdefault("summary", {})["normalized_meshes"] = reports + + def _build_relative_placement_bundle( *, scene_dir: Path, @@ -1789,6 +1798,7 @@ def _build_relative_placement_bundle( target_body_scale: float | list[float] | 
tuple[float, float, float], max_episodes: int, max_episode_steps: int, + mesh_normalizer: MeshFrameNormalizer, ) -> dict[str, Any]: scene_objects = _collect_scene_objects(source_config) background_objects = [ @@ -1797,21 +1807,21 @@ def _build_relative_placement_bundle( rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] by_uid = {obj.source_uid: obj for obj in scene_objects} runtime_uids = _relative_runtime_uid_mapping(rigid_objects) - static_reference_source_uids = _static_relative_reference_source_uids( - spec, - by_uid, - ) + moved_source_uids = {placement.moved_source_uid for placement in spec.placements} dynamic_rigid_objects = [ - obj for obj in rigid_objects if obj.source_uid not in static_reference_source_uids + obj for obj in rigid_objects if obj.source_uid in moved_source_uids ] - static_reference_objects = [ - obj for obj in rigid_objects if obj.source_uid in static_reference_source_uids + static_scene_objects = [ + obj for obj in rigid_objects if obj.source_uid not in moved_source_uids ] object_scale = _target_body_scale_vector(target_body_scale) - robot_init_z = _estimate_dual_ur5_init_z( + table_config = _make_background_config( scene_dir, by_uid[spec.table_source_uid], + mesh_normalizer, ) + table_top_z = _mesh_config_world_zmax(table_config) + robot_init_z = _dual_ur5_init_z_from_table_top(table_top_z) gym_config = { "id": "AtomicActionsAgent-v3", @@ -1827,18 +1837,22 @@ def _build_relative_placement_bundle( "sensor": _make_sensor_config(), "light": _make_light_config(), "background": [ - _make_background_config(scene_dir, by_uid[spec.table_source_uid]), + table_config, *[ - _make_container_background_config( + _make_relative_background_object_config( scene_dir, obj, runtime_uids[obj.source_uid], - _relative_object_body_scale(obj, target_scale=object_scale), + max_convex_hull_num=_relative_static_background_max_convex_hull_num( + runtime_uids[obj.source_uid], + spec, + ), + mesh_normalizer=mesh_normalizer, ) - for 
obj in static_reference_objects + for obj in static_scene_objects ], *[ - _make_extra_background_config(scene_dir, obj, object_scale) + _make_extra_background_config(scene_dir, obj, mesh_normalizer) for obj in background_objects if obj.source_uid != spec.table_source_uid ], @@ -1848,18 +1862,17 @@ def _build_relative_placement_bundle( scene_dir=scene_dir, obj=obj, runtime_uid=runtime_uids[obj.source_uid], - body_scale=_relative_object_body_scale( - obj, - target_scale=object_scale, - ), + body_scale=object_scale, max_convex_hull_num=_relative_rigid_object_max_convex_hull_num( runtime_uids[obj.source_uid], spec, ), + mesh_normalizer=mesh_normalizer, ) for obj in dynamic_rigid_objects ], } + _apply_tabletop_z_placement(gym_config, table_top_z) return { "gym_config": gym_config, "agent_config": make_agent_config(), @@ -1883,34 +1896,6 @@ def _source_body_scale(obj: _SceneObject) -> list[float]: return _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0])) -def _static_relative_reference_source_uids( - spec: _RelativePlacementSpec, - by_uid: Mapping[str, _SceneObject], -) -> set[str]: - moved_source_uids = {placement.moved_source_uid for placement in spec.placements} - return { - placement.reference_source_uid - for placement in spec.placements - if placement.reference_source_uid not in moved_source_uids - and _is_container_like(by_uid[placement.reference_source_uid]) - } - - -def _relative_object_body_scale( - obj: _SceneObject, - *, - target_scale: list[float], -) -> list[float]: - if _is_container_object(obj): - return _source_body_scale(obj) - return target_scale - - -def _is_container_object(obj: _SceneObject) -> bool: - text = _object_text(obj) - return any(keyword in text for keyword in _CONTAINER_KEYWORDS) - - def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]: if len(spec.placements) == 1: return { @@ -1936,10 +1921,7 @@ def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]: } -def 
_estimate_dual_ur5_init_z(scene_dir: Path, table_obj: _SceneObject) -> float: - """Estimate robot root height from the table mesh top surface.""" - - table_top_z = _resolve_table_mesh_world_zmax(scene_dir, table_obj) +def _dual_ur5_init_z_from_table_top(table_top_z: float | None) -> float: if table_top_z is None: return _DUAL_UR5_LEGACY_INIT_Z @@ -1947,6 +1929,104 @@ def _estimate_dual_ur5_init_z(scene_dir: Path, table_obj: _SceneObject) -> float return round(max(_DUAL_UR5_LEGACY_INIT_Z, init_z), 6) +def _apply_tabletop_z_placement( + gym_config: dict[str, Any], + table_top_z: float | None, +) -> None: + if table_top_z is None: + return + target_bottom_z = float(table_top_z) + _TABLETOP_OBJECT_CLEARANCE + for obj in _iter_generated_scene_object_configs(gym_config): + if obj.get("uid") == "table": + continue + mesh_min_z = _mesh_config_local_zmin_after_rotation(obj) + if mesh_min_z is None: + continue + init_pos = _clean_vector3(obj.get("init_pos", [0.0, 0.0, 0.0])) + init_pos[2] = round(target_bottom_z - mesh_min_z, 6) + obj["init_pos"] = init_pos + + +def _iter_generated_scene_object_configs( + gym_config: Mapping[str, Any], +) -> list[dict[str, Any]]: + objects: list[dict[str, Any]] = [] + for section in ("background", "rigid_object"): + value = gym_config.get(section, []) + if isinstance(value, Mapping): + value = [value] + if not isinstance(value, list): + continue + objects.extend(obj for obj in value if isinstance(obj, dict)) + return objects + + +def _mesh_config_world_zmax(obj_config: Mapping[str, Any]) -> float | None: + bounds = _mesh_config_world_z_bounds(obj_config) + if bounds is None: + return None + return bounds[1] + + +def _mesh_config_local_zmin_after_rotation( + obj_config: Mapping[str, Any], +) -> float | None: + shape = obj_config.get("shape", {}) + if not isinstance(shape, Mapping): + return None + mesh_path = shape.get("fpath") + if not isinstance(mesh_path, str): + return None + vertices = 
_load_mesh_vertices(Path(mesh_path).expanduser().resolve()) + if not vertices: + return None + + matrix = _mesh_config_transform_matrix( + obj_config, + translation=[0.0, 0.0, 0.0], + ) + return min(_transform_point(matrix, vertex)[2] for vertex in vertices) + + +def _mesh_config_world_z_bounds( + obj_config: Mapping[str, Any], +) -> tuple[float, float] | None: + shape = obj_config.get("shape", {}) + if not isinstance(shape, Mapping): + return None + mesh_path = shape.get("fpath") + if not isinstance(mesh_path, str): + return None + vertices = _load_mesh_vertices(Path(mesh_path).expanduser().resolve()) + if not vertices: + return None + + matrix = _mesh_config_transform_matrix(obj_config) + z_values = [_transform_point(matrix, vertex)[2] for vertex in vertices] + return (min(z_values), max(z_values)) + + +def _mesh_config_transform_matrix( + obj_config: Mapping[str, Any], + *, + translation: list[float] | None = None, +) -> list[list[float]]: + scale = _vector3(obj_config.get("body_scale", [1.0, 1.0, 1.0])) + init_local_pose = obj_config.get("init_local_pose") + if init_local_pose is not None and translation is None: + root_matrix = _matrix4(init_local_pose) + else: + root_matrix = _euler_xyz_degrees_matrix( + _vector3(obj_config.get("init_rot", [0.0, 0.0, 0.0])), + ( + _vector3(obj_config.get("init_pos", [0.0, 0.0, 0.0])) + if translation is None + else translation + ), + ) + return _matrix_multiply(root_matrix, _scale_matrix4(scale)) + + def _resolve_table_mesh_world_zmax( scene_dir: Path, table_obj: _SceneObject, @@ -2015,9 +2095,11 @@ def _load_mesh_vertices_with_trimesh( try: scene_or_mesh = trimesh.load(str(mesh_path), force="scene") - try: + if hasattr(scene_or_mesh, "to_geometry"): + mesh = scene_or_mesh.to_geometry() + elif hasattr(scene_or_mesh, "dump"): mesh = scene_or_mesh.dump(concatenate=True) - except AttributeError: + else: mesh = scene_or_mesh except Exception: return None @@ -2823,8 +2905,12 @@ def _make_light_config() -> dict[str, Any]: } -def 
_make_background_config(scene_dir: Path, obj: _SceneObject) -> dict[str, Any]: - shape = _make_shape_config(scene_dir, obj.config) +def _make_background_config( + scene_dir: Path, + obj: _SceneObject, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + shape = _make_shape_config(scene_dir, obj.config, mesh_normalizer=mesh_normalizer) return { "uid": "table", "shape": shape, @@ -2832,7 +2918,7 @@ def _make_background_config(scene_dir: Path, obj: _SceneObject) -> dict[str, Any "body_scale": _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0])), "body_type": "kinematic", "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), - "init_rot": _corrected_init_rot_for_shape(obj.config, shape), + "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), "max_convex_hull_num": _role_limited_max_convex_hull_num( obj, _BACKGROUND_MAX_CONVEX_HULL_NUM, @@ -2843,9 +2929,10 @@ def _make_background_config(scene_dir: Path, obj: _SceneObject) -> dict[str, Any def _make_extra_background_config( scene_dir: Path, obj: _SceneObject, + mesh_normalizer: MeshFrameNormalizer, body_scale: Any | None = None, ) -> dict[str, Any]: - shape = _make_shape_config(scene_dir, obj.config) + shape = _make_shape_config(scene_dir, obj.config, mesh_normalizer=mesh_normalizer) config = { "uid": _normalize_runtime_uid(obj.source_uid), "shape": shape, @@ -2857,7 +2944,7 @@ def _make_extra_background_config( ), "body_type": str(obj.config.get("body_type", "static")), "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), - "init_rot": _corrected_init_rot_for_shape(obj.config, shape), + "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), "max_convex_hull_num": _role_limited_max_convex_hull_num( obj, _BACKGROUND_MAX_CONVEX_HULL_NUM, @@ -2871,16 +2958,20 @@ def _make_target_object_config( obj: _SceneObject, runtime_uid: str, target_scale: list[float], + mesh_normalizer: MeshFrameNormalizer, replacement: 
_ResolvedTargetReplacement | None = None, ) -> dict[str, Any]: - return _make_rigid_object_config( + config = _make_rigid_object_config( scene_dir, obj, runtime_uid, target_scale, max_convex_hull_num=_TARGET_MAX_CONVEX_HULL_NUM, mesh_fpath=replacement.mesh_path if replacement else None, + mesh_normalizer=mesh_normalizer, ) + config["body_type"] = "dynamic" + return config def _make_container_object_config( @@ -2888,6 +2979,7 @@ def _make_container_object_config( obj: _SceneObject, runtime_uid: str, body_scale: Any, + mesh_normalizer: MeshFrameNormalizer, ) -> dict[str, Any]: return _make_rigid_object_config( scene_dir, @@ -2898,6 +2990,7 @@ def _make_container_object_config( obj, _CONTAINER_MAX_CONVEX_HULL_NUM, ), + mesh_normalizer=mesh_normalizer, ) @@ -2906,15 +2999,36 @@ def _make_container_background_config( obj: _SceneObject, runtime_uid: str, body_scale: Any, + mesh_normalizer: MeshFrameNormalizer, ) -> dict[str, Any]: config = _make_container_object_config( scene_dir, obj, runtime_uid, body_scale, + mesh_normalizer, + ) + config["body_type"] = "kinematic" + return config + + +def _make_relative_background_object_config( + scene_dir: Path, + obj: _SceneObject, + runtime_uid: str, + *, + max_convex_hull_num: int, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + config = _make_rigid_object_config( + scene_dir, + obj, + runtime_uid, + _source_body_scale(obj), + max_convex_hull_num=max_convex_hull_num, + mesh_normalizer=mesh_normalizer, ) config["body_type"] = "kinematic" - config["init_rot"] = _corrected_init_rot_for_shape(obj.config, config["shape"]) return config @@ -2922,6 +3036,7 @@ def _make_extra_rigid_object_config( scene_dir: Path, obj: _SceneObject, body_scale: Any, + mesh_normalizer: MeshFrameNormalizer, ) -> dict[str, Any]: return _make_rigid_object_config( scene_dir, @@ -2932,6 +3047,7 @@ def _make_extra_rigid_object_config( obj, _EXTRA_RIGID_MAX_CONVEX_HULL_NUM, ), + mesh_normalizer=mesh_normalizer, ) @@ -2942,6 +3058,7 @@ def 
_make_relative_rigid_object_config( runtime_uid: str, body_scale: Any, max_convex_hull_num: int, + mesh_normalizer: MeshFrameNormalizer, ) -> dict[str, Any]: if max_convex_hull_num == _TARGET_MAX_CONVEX_HULL_NUM: resolved_max_convex_hull_num = max_convex_hull_num @@ -2950,13 +3067,16 @@ def _make_relative_rigid_object_config( obj, max_convex_hull_num, ) - return _make_rigid_object_config( + config = _make_rigid_object_config( scene_dir, obj, runtime_uid, body_scale, max_convex_hull_num=resolved_max_convex_hull_num, + mesh_normalizer=mesh_normalizer, ) + config["body_type"] = "dynamic" + return config def _make_rigid_object_config( @@ -2966,14 +3086,20 @@ def _make_rigid_object_config( body_scale: Any, max_convex_hull_num: int, mesh_fpath: str | Path | None = None, + mesh_normalizer: MeshFrameNormalizer | None = None, ) -> dict[str, Any]: - shape = _make_shape_config(scene_dir, obj.config, mesh_fpath=mesh_fpath) + shape = _make_shape_config( + scene_dir, + obj.config, + mesh_fpath=mesh_fpath, + mesh_normalizer=mesh_normalizer, + ) config = { "uid": runtime_uid, "shape": shape, "attrs": dict(_RIGID_OBJECT_ATTRS), "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), - "init_rot": _corrected_init_rot_for_shape(obj.config, shape), + "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), "body_scale": _clean_vector3(body_scale), "max_convex_hull_num": int(max_convex_hull_num), } @@ -3012,56 +3138,39 @@ def _relative_rigid_object_max_convex_hull_num( return _EXTRA_RIGID_MAX_CONVEX_HULL_NUM +def _relative_static_background_max_convex_hull_num( + runtime_uid: str, + spec: _RelativePlacementSpec, +) -> int: + for placement in spec.placements: + if ( + placement.relation == "inside" + and runtime_uid == placement.reference_runtime_uid + ): + return _CONTAINER_MAX_CONVEX_HULL_NUM + return _BACKGROUND_MAX_CONVEX_HULL_NUM + + def _make_shape_config( scene_dir: Path, source_config: Mapping[str, Any], *, mesh_fpath: str | Path | None = None, + 
mesh_normalizer: MeshFrameNormalizer | None = None, ) -> dict[str, Any]: shape = copy.deepcopy(dict(source_config.get("shape", {}))) if mesh_fpath is not None: shape["shape_type"] = "Mesh" shape["fpath"] = str(mesh_fpath) if shape.get("shape_type") == "Mesh" and "fpath" in shape: - shape["fpath"] = _asset_path_for_config(scene_dir, str(shape["fpath"])) + mesh_path = Path(_asset_path_for_config(scene_dir, str(shape["fpath"]))) + if mesh_normalizer is not None: + mesh_path = mesh_normalizer.normalize_path(mesh_path) + shape["fpath"] = mesh_path.as_posix() shape.setdefault("compute_uv", False) return shape -def _corrected_init_rot_for_shape( - source_config: Mapping[str, Any], - shape_config: Mapping[str, Any], -) -> list[float]: - init_rot = _clean_vector3(source_config.get("init_rot", [0.0, 0.0, 0.0])) - if not _is_glb_mesh_shape(shape_config): - return init_rot - - from scipy.spatial.transform import Rotation - - source_rotation = Rotation.from_euler("XYZ", init_rot, degrees=True) - correction = Rotation.from_euler( - "X", - _DEXSIM_041_GLB_LOCAL_X_CORRECTION_DEGREES, - degrees=True, - ) - corrected = source_rotation * correction - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message="Gimbal lock detected.*", - category=UserWarning, - ) - corrected_euler = corrected.as_euler("XYZ", degrees=True) - return [float(value) for value in corrected_euler] - - -def _is_glb_mesh_shape(shape_config: Mapping[str, Any]) -> bool: - if shape_config.get("shape_type") != "Mesh": - return False - fpath = shape_config.get("fpath") - return isinstance(fpath, str) and Path(fpath).suffix.lower() == ".glb" - - def _asset_path_for_config(scene_dir: Path, fpath: str) -> str: raw_path = Path(fpath) if raw_path.is_absolute(): @@ -3202,20 +3311,7 @@ def _write_config_bundle( atom_actions=output_dir / "atom_actions.txt", summary=dict(bundle.get("summary", {})), ) - output_files = [ - paths.gym_config, - paths.agent_config, - paths.task_prompt, - 
paths.basic_background, - paths.atom_actions, - ] - existing = [path for path in output_files if path.exists()] - if existing and not overwrite: - existing_text = ", ".join(path.as_posix() for path in existing) - raise FileExistsError( - f"Generated file(s) already exist: {existing_text}. " - "Pass overwrite=True or --overwrite to replace them." - ) + _raise_if_generated_files_exist(output_dir, overwrite) output_dir.mkdir(parents=True, exist_ok=True) _write_json(paths.gym_config, bundle["gym_config"]) @@ -3226,6 +3322,25 @@ def _write_config_bundle( return paths +def _raise_if_generated_files_exist(output_dir: Path, overwrite: bool) -> None: + if overwrite: + return + output_files = [ + output_dir / "fast_gym_config.json", + output_dir / "agent_config.json", + output_dir / "task_prompt.txt", + output_dir / "basic_background.txt", + output_dir / "atom_actions.txt", + ] + existing = [path for path in output_files if path.exists()] + if existing: + existing_text = ", ".join(path.as_posix() for path in existing) + raise FileExistsError( + f"Generated file(s) already exist: {existing_text}. " + "Pass overwrite=True or --overwrite to replace them." 
+ ) + + def _write_json(path: Path, data: Mapping[str, Any]) -> None: path.write_text( json.dumps(data, ensure_ascii=False, indent=4) + "\n", diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index df6a3488..8ee185d7 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -17,6 +17,7 @@ from __future__ import annotations from pathlib import Path +import hashlib import json import struct @@ -26,6 +27,10 @@ from embodichain.gen_sim.action_agent_pipeline.generation import ( ur5_basket_config as ur5_basket_config_generation, ) +from embodichain.gen_sim.action_agent_pipeline.generation.mesh_frame_normalization import ( + MESH_FRAME_NORMALIZATION_POLICY_VERSION, + MeshFrameNormalizer, +) from embodichain.gen_sim.action_agent_pipeline.generation.ur5_basket_config import ( TargetReplacementSpec, generate_ur5_basket_config_from_project, @@ -54,15 +59,15 @@ def test_ur5_basket_generator_uses_parallel_handoff( assert set(rigid_objects) == {"left_apple", "right_apple"} assert rigid_objects["left_apple"]["body_scale"] == [0.6, 0.6, 0.6] assert rigid_objects["right_apple"]["body_scale"] == [0.6, 0.6, 0.6] + assert rigid_objects["left_apple"]["body_type"] == "dynamic" + assert rigid_objects["right_apple"]["body_type"] == "dynamic" assert background_objects["table"]["body_scale"] == [1.0, 1.0, 1.0] assert background_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] assert background_objects["wicker_basket"]["body_type"] == "kinematic" - assert rigid_objects["left_apple"]["shape"]["fpath"].endswith( - "mesh_assets/apple/apple_2/apple_2.glb" - ) - assert rigid_objects["right_apple"]["shape"]["fpath"].endswith( - "mesh_assets/apple/apple_1/apple_1.glb" - ) + _assert_normalized_obj_path(rigid_objects["left_apple"]["shape"]["fpath"]) + 
_assert_normalized_obj_path(rigid_objects["right_apple"]["shape"]["fpath"]) + _assert_normalized_obj_path(background_objects["table"]["shape"]["fpath"]) + _assert_normalized_obj_path(background_objects["wicker_basket"]["shape"]["fpath"]) assert gym_config["robot"]["init_pos"] == [-2.0, 0.0, 0.5] assert gym_config["robot"]["init_rot"] == [0.0, 0.0, 90.0] @@ -94,6 +99,7 @@ def test_ur5_basket_generator_uses_parallel_handoff( assert "parallel handoff" in task_prompt assert "parallel handoff" in basic_background assert "parallel handoff" in atom_actions + assert len(paths.summary["normalized_meshes"]) == 4 handoff_edge = task_prompt.split("6. After the left gripper", maxsplit=1)[1].split( "\n7. Lower the held right target object", @@ -112,7 +118,7 @@ def test_ur5_basket_generator_uses_parallel_handoff( assert paths.summary["mode"] == "basket_template" -def test_generator_applies_dexsim_041_glb_rotation_correction( +def test_generator_normalizes_glb_meshes_and_preserves_source_rot( tmp_path: Path, ) -> None: project_dir = tmp_path / "1790000000_gym_project" @@ -127,18 +133,85 @@ def test_generator_applies_dexsim_041_glb_rotation_correction( rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} background_objects = {obj["uid"]: obj for obj in gym_config["background"]} - assert background_objects["table"]["init_rot"] == pytest.approx( - _expected_glb_corrected_rot([0.0, 0.0, 180.0]) - ) - assert background_objects["wicker_basket"]["init_rot"] == pytest.approx( - _expected_glb_corrected_rot([0.0, 0.0, 180.0]) - ) - assert rigid_objects["right_apple"]["init_rot"] == pytest.approx( - _expected_glb_corrected_rot([0.0, 0.0, 140.0]) - ) - assert rigid_objects["left_apple"]["init_rot"] == pytest.approx( - _expected_glb_corrected_rot([0.0, 0.0, 160.0]) + assert background_objects["table"]["init_rot"] == [0.0, 0.0, 180.0] + assert background_objects["wicker_basket"]["init_rot"] == [0.0, 0.0, 180.0] + assert rigid_objects["right_apple"]["init_rot"] == [0.0, 0.0, 
140.0] + assert rigid_objects["left_apple"]["init_rot"] == [0.0, 0.0, 160.0] + for obj_config in [ + background_objects["table"], + background_objects["wicker_basket"], + rigid_objects["right_apple"], + rigid_objects["left_apple"], + ]: + _assert_normalized_obj_path(obj_config["shape"]["fpath"]) + + source_paths = { + Path(entry["source_path"]).name for entry in paths.summary["normalized_meshes"] + } + assert source_paths == { + "table_0.glb", + "basket_3.glb", + "apple_1.glb", + "apple_2.glb", + } + + +def test_mesh_frame_normalizer_bakes_glb_scene_transform_to_obj( + tmp_path: Path, +) -> None: + mesh_path = tmp_path / "source" / "triangle.glb" + _write_minimal_glb( + mesh_path, + [(0.0, 0.0, 0.0), (0.0, 1.0, 0.0), (0.0, 0.0, 1.0)], + node_translation=(1.0, 0.0, 0.0), + ) + source_sha256 = hashlib.sha256(mesh_path.read_bytes()).hexdigest() + normalizer = MeshFrameNormalizer(output_dir=tmp_path / "normalized") + + normalized_path = normalizer.normalize_path(mesh_path) + repeated_path = normalizer.normalize_path(mesh_path) + + assert repeated_path == normalized_path + assert normalized_path.suffix == ".obj" + assert MESH_FRAME_NORMALIZATION_POLICY_VERSION not in normalized_path.name + assert len(normalized_path.name) <= 64 + obj_text = normalized_path.read_text(encoding="utf-8") + assert f"policy_version: {MESH_FRAME_NORMALIZATION_POLICY_VERSION}" in obj_text + assert f"source_sha256: {source_sha256}" in obj_text + assert "dexsim_engine_version:" in obj_text + assert ( + "transform: [[1.0,0.0,0.0,0.0],[0.0,1.0,0.0,0.0]," + "[0.0,0.0,1.0,0.0],[0.0,0.0,0.0,1.0]]" + ) in obj_text + assert "mtllib material.mtl" in obj_text + material_text = (normalized_path.parent / "material.mtl").read_text( + encoding="utf-8" + ) + assert "newmtl material_0" in material_text + assert _rounded_vertex_set(_obj_vertices(normalized_path)) == { + (1.0, 0.0, 0.0), + (1.0, 1.0, 0.0), + (1.0, 0.0, 1.0), + } + + +def test_mesh_frame_normalizer_recreates_material_library_for_reused_obj( + 
tmp_path: Path, +) -> None: + mesh_path = tmp_path / "source" / "triangle.glb" + _write_minimal_glb(mesh_path, _default_mesh_vertices()) + output_dir = tmp_path / "normalized" + normalized_path = MeshFrameNormalizer(output_dir=output_dir).normalize_path( + mesh_path ) + material_path = normalized_path.parent / "material.mtl" + material_path.unlink() + + reused_path = MeshFrameNormalizer(output_dir=output_dir).normalize_path(mesh_path) + + assert reused_path == normalized_path + assert material_path.is_file() + assert "newmtl material_0" in material_path.read_text(encoding="utf-8") def test_target_replacements_generate_meshes_and_replace_paths( @@ -171,12 +244,18 @@ def test_target_replacements_generate_meshes_and_replace_paths( assert set(rigid_objects) == {"left_apple", "right_apple"} assert "wicker_basket" in background_objects assert background_objects["wicker_basket"]["body_type"] == "kinematic" - assert rigid_objects["right_apple"]["shape"]["fpath"].endswith( - "mesh_assets/new1/orange.glb" - ) - assert rigid_objects["left_apple"]["shape"]["fpath"].endswith( - "mesh_assets/new2/apple.glb" - ) + _assert_normalized_obj_path(rigid_objects["right_apple"]["shape"]["fpath"]) + _assert_normalized_obj_path(rigid_objects["left_apple"]["shape"]["fpath"]) + normalized_sources = { + Path(entry["source_path"]).as_posix() + for entry in paths.summary["normalized_meshes"] + } + assert ( + project_dir / "mesh_assets" / "new1" / "orange.glb" + ).as_posix() in normalized_sources + assert ( + project_dir / "mesh_assets" / "new2" / "apple.glb" + ).as_posix() in normalized_sources assert paths.summary["target_replacements"][0]["source_uid"] == "apple_1" assert paths.summary["target_replacements"][1]["source_uid"] == "apple_2" @@ -207,12 +286,8 @@ def test_target_replacements_can_sync_runtime_names( assert set(rigid_objects) == {"left_orange", "right_apple"} assert "wicker_basket" in background_objects assert background_objects["wicker_basket"]["body_type"] == "kinematic" - assert 
rigid_objects["left_orange"]["shape"]["fpath"].endswith( - "mesh_assets/new1/orange.glb" - ) - assert rigid_objects["right_apple"]["shape"]["fpath"].endswith( - "mesh_assets/new2/apple.glb" - ) + _assert_normalized_obj_path(rigid_objects["left_orange"]["shape"]["fpath"]) + _assert_normalized_obj_path(rigid_objects["right_apple"]["shape"]["fpath"]) success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] assert {term["object"] for term in success_terms} == { @@ -234,8 +309,7 @@ def test_directory_input_prefers_merged_config_and_preserves_extra_scene_scale( project_dir = tmp_path / "1790000000_gym_project" _write_project(project_dir) background_mesh = project_dir / "mesh_assets/backgrounds/vase_0.glb" - background_mesh.parent.mkdir(parents=True, exist_ok=True) - background_mesh.write_bytes(b"") + _write_minimal_glb(background_mesh, _default_mesh_vertices()) merged_config_path = project_dir / "gym_config_merged.json" source_config = json.loads( @@ -274,9 +348,7 @@ def test_directory_input_prefers_merged_config_and_preserves_extra_scene_scale( assert rigid_objects["left_apple"]["body_scale"] == [0.8, 0.8, 0.8] assert rigid_objects["right_apple"]["body_scale"] == [0.8, 0.8, 0.8] assert rigid_objects["vase_0"]["body_scale"] == [1.2, 1.1, 0.9] - assert rigid_objects["vase_0"]["shape"]["fpath"].endswith( - "mesh_assets/backgrounds/vase_0.glb" - ) + _assert_normalized_obj_path(rigid_objects["vase_0"]["shape"]["fpath"]) def test_task_description_generates_relative_left_of_config( @@ -324,15 +396,15 @@ def fake_call_relative_task_llm(**kwargs): gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} background_objects = {obj["uid"]: obj for obj in gym_config["background"]} - assert set(rigid_objects) == {"apple_1", "apple_2"} + assert set(rigid_objects) == {"apple_2"} assert rigid_objects["apple_2"]["body_scale"] == [0.5, 0.5, 0.5] - assert 
rigid_objects["apple_1"]["body_scale"] == [0.5, 0.5, 0.5] + assert rigid_objects["apple_2"]["body_type"] == "dynamic" + assert background_objects["apple_1"]["body_scale"] == [1.0, 1.0, 1.0] + assert background_objects["apple_1"]["body_type"] == "kinematic" assert background_objects["table"]["body_scale"] == [1.0, 1.0, 1.0] assert background_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] assert background_objects["wicker_basket"]["body_type"] == "kinematic" - assert background_objects["wicker_basket"]["init_rot"] == pytest.approx( - _expected_glb_corrected_rot([0.0, 0.0, 180.0]) - ) + assert background_objects["wicker_basket"]["init_rot"] == [0.0, 0.0, 180.0] success = gym_config["env"]["extensions"]["agent_success"] assert success["op"] == "all" @@ -357,7 +429,7 @@ def fake_call_relative_task_llm(**kwargs): assert "right_arm_action: null" in task_prompt assert "Generate exactly 10 nominal edges" not in task_prompt - assert paths.summary == { + assert _stable_summary(paths.summary) == { "mode": "relative_placement", "moved_object": "apple_2", "reference_object": "wicker_basket", @@ -405,6 +477,13 @@ def fake_call_relative_task_llm(**kwargs): ) gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + assert set(rigid_objects) == {"apple_1"} + assert rigid_objects["apple_1"]["body_type"] == "dynamic" + assert background_objects["apple_2"]["body_type"] == "kinematic" + assert background_objects["wicker_basket"]["body_type"] == "kinematic" + success = gym_config["env"]["extensions"]["agent_success"] assert success["op"] == "all" axis_terms = { @@ -420,7 +499,7 @@ def fake_call_relative_task_llm(**kwargs): assert '"offset":[-0.16,0.0,0.22]' in task_prompt assert '"offset":[-0.16,0.0,0.22]' in atom_actions - assert paths.summary == { + assert _stable_summary(paths.summary) == { "mode": 
"relative_placement", "moved_object": "apple_1", "reference_object": "apple_2", @@ -509,11 +588,11 @@ def fake_call_relative_task_llm(**kwargs): gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} background_objects = {obj["uid"]: obj for obj in gym_config["background"]} - assert set(rigid_objects) == {"apple_1", "apple_2"} + assert set(rigid_objects) == {"apple_1"} + assert rigid_objects["apple_1"]["body_type"] == "dynamic" + assert background_objects["apple_2"]["body_type"] == "kinematic" assert background_objects["wicker_basket"]["body_type"] == "kinematic" - assert background_objects["wicker_basket"]["init_rot"] == pytest.approx( - _expected_glb_corrected_rot([0.0, 0.0, 180.0]) - ) + assert background_objects["wicker_basket"]["init_rot"] == [0.0, 0.0, 180.0] assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] assert paths.summary["active_arm"] == "left_arm" @@ -552,6 +631,12 @@ def fake_call_relative_task_llm(**kwargs): ) gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + assert set(rigid_objects) == {"apple_2"} + assert rigid_objects["apple_2"]["body_type"] == "dynamic" + assert background_objects["apple_1"]["body_type"] == "kinematic" + assert background_objects["wicker_basket"]["body_type"] == "kinematic" assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] assert paths.summary["active_arm"] == "right_arm" @@ -563,6 +648,60 @@ def fake_call_relative_task_llm(**kwargs): assert "left_arm_action: null" in task_prompt +def test_demo3_relative_placement_uses_role_aware_scene_partition( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_demo3_role_project(project_dir) + + def 
fake_call_relative_task_llm(**kwargs): + assert kwargs["task_description"] == "用右臂把咖啡杯子放到垫子上" + return { + "moved_object": "cup_1", + "reference_object": "pad_1", + "goal_relation": "on", + "arm": "right", + "task_prompt_summary": "Place the cup on the pad.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_demo3_relative_agent", + task_description="用右臂把咖啡杯子放到垫子上", + target_body_scale=0.8, + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + assert set(rigid_objects) == {"cup"} + assert rigid_objects["cup"]["body_type"] == "dynamic" + assert rigid_objects["cup"]["body_scale"] == [0.8, 0.8, 0.8] + assert background_objects["pad"]["body_type"] == "kinematic" + assert background_objects["pad"]["body_scale"] == [1.2, 1.0, 0.4] + assert background_objects["fork"]["body_type"] == "kinematic" + assert background_objects["fork"]["body_scale"] == [0.7, 0.7, 0.7] + + success = gym_config["env"]["extensions"]["agent_success"] + assert success["type"] == "object_on_object" + assert success["object"] == "cup" + assert success["support"] == "pad" + + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert atom_actions.count('"atomic_action_class":"PickUpAction"') == 1 + assert '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in atom_actions + assert '"obj_name":"cup"' in atom_actions + assert _stable_summary(paths.summary)["relation"] == "on" + + def test_task_description_generates_dual_arm_relative_config( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -615,6 +754,12 @@ def fake_call_relative_task_llm(**kwargs): ) gym_config = 
json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + assert set(rigid_objects) == {"apple_1", "apple_2"} + assert rigid_objects["apple_1"]["body_type"] == "dynamic" + assert rigid_objects["apple_2"]["body_type"] == "dynamic" + assert background_objects["wicker_basket"]["body_type"] == "kinematic" assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] success = gym_config["env"]["extensions"]["agent_success"] @@ -635,7 +780,7 @@ def fake_call_relative_task_llm(**kwargs): } assert "grasp_pose_object" not in attr_names - assert paths.summary == { + assert _stable_summary(paths.summary) == { "mode": "dual_arm_relative_placement", "placements": [ { @@ -811,9 +956,14 @@ def fake_call_relative_task_llm(**kwargs): gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + assert set(rigid_objects) == {"apple_2"} assert rigid_objects["apple_2"]["body_scale"] == [0.6, 0.6, 0.6] - assert rigid_objects["apple_1"]["body_scale"] == [0.6, 0.6, 0.6] - assert rigid_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] + assert rigid_objects["apple_2"]["body_type"] == "dynamic" + assert background_objects["apple_1"]["body_scale"] == [1.0, 1.0, 1.0] + assert background_objects["apple_1"]["body_type"] == "kinematic" + assert background_objects["wicker_basket"]["body_scale"] == [1.0, 1.0, 1.0] + assert background_objects["wicker_basket"]["body_type"] == "kinematic" success = gym_config["env"]["extensions"]["agent_success"] assert success["type"] == "object_on_object" @@ -854,10 +1004,13 @@ def fake_call_relative_task_llm(**kwargs): def test_high_tabletop_scene_adjusts_robot_height_and_light( tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, ) -> 
None: project_dir = tmp_path / "1790000000_gym_project" _write_project(project_dir) + _write_minimal_glb( + project_dir / "mesh_assets/table/table_0.glb", + [(-0.5, 0.0, 0.82), (0.5, 0.0, 0.82), (0.0, -0.82, 0.82)], + ) gym_config_path = project_dir / "gym_config.json" source_config = json.loads(gym_config_path.read_text(encoding="utf-8")) @@ -868,20 +1021,6 @@ def test_high_tabletop_scene_adjusts_robot_height_and_light( encoding="utf-8", ) - def fake_resolve_table_mesh_world_zmax( - scene_dir: Path, - table_obj, - ) -> float: - assert scene_dir == project_dir - assert table_obj.source_uid == "table" - return 1.18 - - monkeypatch.setattr( - ur5_basket_config_generation, - "_resolve_table_mesh_world_zmax", - fake_resolve_table_mesh_world_zmax, - ) - paths = generate_ur5_basket_config_from_project( project_dir, tmp_path / "generated_high_table_agent", @@ -897,6 +1036,33 @@ def fake_resolve_table_mesh_world_zmax( assert gym_config["light"]["direct"][0]["intensity"] == 40.0 +def test_tabletop_z_placement_uses_normalized_mesh_bounds( + tmp_path: Path, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_z_agent", + target_body_scale=0.8, + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + table_config = next(obj for obj in gym_config["background"] if obj["uid"] == "table") + table_top_z = ur5_basket_config_generation._mesh_config_world_zmax(table_config) + expected_min_z = ( + table_top_z + ur5_basket_config_generation._TABLETOP_OBJECT_CLEARANCE + ) + for obj_config in [ + *[obj for obj in gym_config["background"] if obj["uid"] != "table"], + *gym_config["rigid_object"], + ]: + min_z, _ = ur5_basket_config_generation._mesh_config_world_z_bounds(obj_config) + assert min_z == pytest.approx(expected_min_z) + + def test_table_mesh_world_zmax_reads_glb_vertices(tmp_path: Path) -> None: 
scene_dir = tmp_path / "1790000000_gym_project" mesh_path = scene_dir / "mesh_assets/table/table_0.glb" @@ -953,8 +1119,7 @@ def _write_project(project_dir: Path) -> None: "mesh_assets/apple/apple_2/apple_2.glb", ): mesh_path = project_dir / rel_path - mesh_path.parent.mkdir(parents=True, exist_ok=True) - mesh_path.write_bytes(b"") + _write_minimal_glb(mesh_path, _default_mesh_vertices()) gym_config = { "id": "Image2Tabletop-1790000000-v0", @@ -993,6 +1158,54 @@ def _write_project(project_dir: Path) -> None: ) +def _write_demo3_role_project(project_dir: Path) -> None: + for rel_path in ( + "mesh_assets/table/table_0.glb", + "mesh_assets/cup/cup_1/cup_1.glb", + "mesh_assets/pad/pad_1/pad_1.glb", + "mesh_assets/fork/fork_1/fork_1.glb", + ): + _write_minimal_glb(project_dir / rel_path, _default_mesh_vertices()) + + cup = _mesh_object( + "cup_1", + "mesh_assets/cup/cup_1/cup_1.glb", + [0.18, 0.22, 0.76], + [0.0, 0.0, 25.0], + ) + pad = _mesh_object( + "pad_1", + "mesh_assets/pad/pad_1/pad_1.glb", + [-0.1, -0.15, 0.74], + [0.0, 0.0, -10.0], + ) + pad["body_scale"] = [1.2, 1.0, 0.4] + fork = _mesh_object( + "fork_1", + "mesh_assets/fork/fork_1/fork_1.glb", + [0.32, -0.18, 0.75], + [0.0, 0.0, 90.0], + ) + fork["body_scale"] = [0.7, 0.7, 0.7] + + gym_config = { + "id": "Image2Tabletop-1790000000-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 180.0], + ) + ], + "rigid_object": [cup, pad, fork], + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + def _mesh_object( uid: str, fpath: str, @@ -1012,29 +1225,72 @@ def _mesh_object( } -def _expected_glb_corrected_rot(init_rot: list[float]) -> list[float]: - from scipy.spatial.transform import Rotation +def _assert_normalized_obj_path(fpath: str) -> None: + path = Path(fpath) + assert path.suffix == ".obj" + assert "mesh_assets/normalized" in path.as_posix() + assert 
MESH_FRAME_NORMALIZATION_POLICY_VERSION not in path.name + assert len(path.name) <= 64 + assert path.is_file() + assert (path.parent / "material.mtl").is_file() - source_rotation = Rotation.from_euler("XYZ", init_rot, degrees=True) - correction = Rotation.from_euler( - "X", - ur5_basket_config_generation._DEXSIM_041_GLB_LOCAL_X_CORRECTION_DEGREES, - degrees=True, - ) - return [ - float(value) - for value in (source_rotation * correction).as_euler("XYZ", degrees=True) - ] + +def _stable_summary(summary: dict) -> dict: + return { + key: value + for key, value in summary.items() + if key not in {"normalized_meshes", "coacd_cache"} + } + + +def _obj_vertices(path: Path) -> list[tuple[float, float, float]]: + vertices = [] + for line in path.read_text(encoding="utf-8").splitlines(): + if not line.startswith("v "): + continue + _, x, y, z = line.split(maxsplit=3) + vertices.append((float(x), float(y), float(z))) + return vertices + + +def _rounded_vertex_set( + vertices: list[tuple[float, float, float]], +) -> set[tuple[float, float, float]]: + return { + (round(vertex[0], 6), round(vertex[1], 6), round(vertex[2], 6)) + for vertex in vertices + } + + +def _default_mesh_vertices() -> list[tuple[float, float, float]]: + return [(-0.05, 0.0, 0.0), (0.05, 0.0, 0.0), (0.0, -0.04, 0.0)] def _write_minimal_glb( path: Path, vertices: list[tuple[float, float, float]], + *, + node_translation: tuple[float, float, float] | None = None, ) -> None: path.parent.mkdir(parents=True, exist_ok=True) - binary = b"".join(struct.pack(" dict: output_root.mkdir(parents=True, exist_ok=True) mesh_path = output_root / output_name - mesh_path.write_bytes(b"glb") + _write_minimal_glb(mesh_path, _default_mesh_vertices()) calls.append((prompt, output_root, output_name)) return {"scaled_mesh_path": str(mesh_path)} From c2f442797fce6a53245f5e30cab45741e52f7a31 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 14 Jun 2026 21:20:43 +0800 Subject: 
[PATCH 07/33] fix normalizer glb --- .../generation/mesh_frame_normalization.py | 329 +++++++++++++++--- .../test_ur5_basket_config_generation.py | 216 +++++++++--- 2 files changed, 450 insertions(+), 95 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py index 7437afec..e9576cd0 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py @@ -24,6 +24,7 @@ import json import math import re +import struct __all__ = [ "GLB_TO_OBJ_BAKED_X_ROTATION_DEGREES", @@ -34,11 +35,30 @@ ] -MESH_FRAME_NORMALIZATION_POLICY_VERSION = "action_agent_glb_scene_obj_v2" +MESH_FRAME_NORMALIZATION_POLICY_VERSION = "action_agent_glb_scene_texture_obj_v3" GLB_TO_OBJ_BAKED_X_ROTATION_DEGREES = 0.0 GLB_LOCAL_X_CORRECTION_DEGREES = GLB_TO_OBJ_BAKED_X_ROTATION_DEGREES _SAFE_STEM_RE = re.compile(r"[^0-9a-zA-Z_.-]+") +_GLB_JSON_CHUNK_TYPE = 0x4E4F534A +_GLB_BINARY_CHUNK_TYPE = 0x004E4942 +_TEXTURE_EXTENSION_BY_MIME_TYPE = { + "image/jpeg": ".jpg", + "image/png": ".png", + "image/webp": ".webp", +} + + +@dataclass(frozen=True) +class _MaterialSpec: + name: str + texture_path: str | None = None + + +@dataclass(frozen=True) +class _TextureAsset: + data: bytes + extension: str @dataclass(frozen=True) @@ -94,22 +114,32 @@ def normalize_path(self, mesh_path: str | Path) -> Path: cached = self._results_by_source.get(path) if cached is not None: if cached.normalized_path.is_file(): - obj_text = _repair_obj_material_reference(cached.normalized_path) - self._ensure_material_library( - _obj_material_names(obj_text) or {"material_0"} + material_spec = self._material_spec_for( + path, + cached.normalized_path, + cached.source_sha256, + ) + _repair_obj_material_reference( + cached.normalized_path, + material_spec.name, ) + 
self._ensure_material_library({material_spec.name: material_spec}) return cached.normalized_path source_sha256 = _file_sha256(path) normalized_path = self._normalized_path_for(path, source_sha256) + material_spec = self._material_spec_for(path, normalized_path, source_sha256) status = "reused" if normalized_path.is_file() else "generated" if status == "generated": - self._write_normalized_obj(path, normalized_path, source_sha256) - else: - obj_text = _repair_obj_material_reference(normalized_path) - self._ensure_material_library( - _obj_material_names(obj_text) or {"material_0"} + self._write_normalized_obj( + path, + normalized_path, + source_sha256, + material_spec, ) + else: + _repair_obj_material_reference(normalized_path, material_spec.name) + self._ensure_material_library({material_spec.name: material_spec}) result = NormalizedMeshResult( source_path=path, @@ -143,30 +173,66 @@ def _normalized_path_for(self, mesh_path: Path, source_sha256: str) -> Path: def _material_path(self) -> Path: return self.output_dir / "material.mtl" - def _ensure_material_library(self, material_names: set[str]) -> None: - if not material_names: + def _texture_dir(self) -> Path: + return self.output_dir / "textures" + + def _material_spec_for( + self, + source_path: Path, + normalized_path: Path, + source_sha256: str, + ) -> _MaterialSpec: + material_hash = _material_hash_for(normalized_path) + material_name = f"material_{material_hash}" + texture_path = self._write_base_color_texture( + source_path, + material_hash, + source_sha256, + ) + return _MaterialSpec(name=material_name, texture_path=texture_path) + + def _write_base_color_texture( + self, + source_path: Path, + material_hash: str, + source_sha256: str, + ) -> str | None: + try: + texture = _extract_glb_base_color_texture(source_path) + except (IndexError, KeyError, TypeError, ValueError, json.JSONDecodeError): + return None + if texture is None: + return None + + texture_dir = self._texture_dir() + 
texture_dir.mkdir(parents=True, exist_ok=True) + texture_name = ( + f"{material_hash}_{source_sha256[:12]}_basecolor{texture.extension}" + ) + texture_path = texture_dir / texture_name + texture_path.write_bytes(texture.data) + return f"textures/{texture_name}" + + def _ensure_material_library( + self, material_specs: dict[str, _MaterialSpec] + ) -> None: + if not material_specs: return material_path = self._material_path() - existing_names = _read_material_names(material_path) - all_names = sorted(existing_names | material_names) + all_specs = { + **_read_material_specs(material_path), + **material_specs, + } material_path.write_text( "\n".join( [ "# EmbodiChain action-agent normalized mesh materials", *[ - "\n".join( - [ - f"newmtl {name}", - "Ka 0.8 0.8 0.8", - "Kd 0.8 0.8 0.8", - "Ks 0.0 0.0 0.0", - "Ns 1.0", - "d 1.0", - "illum 2", - ] + _format_material_spec(spec) + for spec in sorted( + all_specs.values(), key=lambda item: item.name ) - for name in all_names ], "", ] @@ -179,6 +245,7 @@ def _write_normalized_obj( source_path: Path, normalized_path: Path, source_sha256: str, + material_spec: _MaterialSpec, ) -> None: trimesh = _require_trimesh() scene = trimesh.load(str(source_path), force="scene") @@ -192,7 +259,7 @@ def _write_normalized_obj( obj_text = obj_payload.decode("utf-8") else: obj_text = str(obj_payload) - obj_text = _ensure_obj_material_reference(obj_text) + obj_text = _ensure_obj_material_reference(obj_text, material_spec.name) header = "\n".join( [ @@ -206,7 +273,7 @@ def _write_normalized_obj( ] ) normalized_path.write_text(header + obj_text, encoding="utf-8") - self._ensure_material_library(_obj_material_names(obj_text) or {"material_0"}) + self._ensure_material_library({material_spec.name: material_spec}) def _scene_to_world_mesh(scene: Any) -> Any: @@ -221,50 +288,202 @@ def _scene_to_world_mesh(scene: Any) -> Any: return mesh -def _obj_material_names(obj_text: str) -> set[str]: - names: set[str] = set() - for line in obj_text.splitlines(): 
- if not line.startswith("usemtl "): - continue - name = line.split(maxsplit=1)[1].strip() - if name: - names.add(name) - return names +def _material_hash_for(normalized_path: Path) -> str: + hash_part = normalized_path.stem.rsplit("_", maxsplit=1)[-1] + if re.fullmatch(r"[0-9a-fA-F]{8,}", hash_part): + return hash_part.lower() + return hashlib.sha256(normalized_path.stem.encode("utf-8")).hexdigest()[:16] -def _repair_obj_material_reference(obj_path: Path) -> str: +def _repair_obj_material_reference(obj_path: Path, material_name: str) -> str: obj_text = obj_path.read_text(encoding="utf-8") - repaired = _ensure_obj_material_reference(obj_text) + repaired = _ensure_obj_material_reference(obj_text, material_name) if repaired != obj_text: obj_path.write_text(repaired, encoding="utf-8") return repaired -def _ensure_obj_material_reference(obj_text: str) -> str: - has_mtllib = any(line.startswith("mtllib ") for line in obj_text.splitlines()) - has_usemtl = any(line.startswith("usemtl ") for line in obj_text.splitlines()) - prefix = [] - if not has_mtllib: - prefix.append("mtllib material.mtl") +def _ensure_obj_material_reference(obj_text: str, material_name: str) -> str: + lines = obj_text.splitlines() + header_lines: list[str] = [] + body_start = 0 + for line in lines: + if not line.startswith("#"): + break + header_lines.append(line) + body_start += 1 + + body_lines: list[str] = [] + has_usemtl = False + for line in lines[body_start:]: + if line.startswith("mtllib "): + continue + if line.startswith("usemtl "): + body_lines.append(f"usemtl {material_name}") + has_usemtl = True + continue + body_lines.append(line) + + prefix = ["mtllib material.mtl"] if not has_usemtl: - prefix.append("usemtl material_0") - if not prefix: - return obj_text - return "\n".join(prefix) + "\n" + obj_text + prefix.append(f"usemtl {material_name}") + return "\n".join(header_lines + prefix + body_lines) + "\n" -def _read_material_names(material_path: Path) -> set[str]: +def 
_read_material_specs(material_path: Path) -> dict[str, _MaterialSpec]: if not material_path.is_file(): - return set() + return {} - names: set[str] = set() + specs: dict[str, _MaterialSpec] = {} + current_name: str | None = None + current_texture_path: str | None = None for line in material_path.read_text(encoding="utf-8").splitlines(): - if not line.startswith("newmtl "): + if line.startswith("newmtl "): + if current_name is not None: + specs[current_name] = _MaterialSpec( + name=current_name, + texture_path=current_texture_path, + ) + current_name = line.split(maxsplit=1)[1].strip() + current_texture_path = None + continue + if current_name is not None and line.startswith("map_Kd "): + current_texture_path = line.split(maxsplit=1)[1].strip() + if current_name is not None: + specs[current_name] = _MaterialSpec( + name=current_name, + texture_path=current_texture_path, + ) + return specs + + +def _format_material_spec(spec: _MaterialSpec) -> str: + ambient = "1.0 1.0 1.0" if spec.texture_path else "0.8 0.8 0.8" + diffuse = "1.0 1.0 1.0" if spec.texture_path else "0.8 0.8 0.8" + lines = [ + f"newmtl {spec.name}", + f"Ka {ambient}", + f"Kd {diffuse}", + "Ks 0.0 0.0 0.0", + "Ns 1.0", + "d 1.0", + "illum 2", + ] + if spec.texture_path: + lines.append(f"map_Kd {spec.texture_path}") + return "\n".join(lines) + + +def _extract_glb_base_color_texture(source_path: Path) -> _TextureAsset | None: + if source_path.suffix.lower() != ".glb": + return None + + doc, binary_chunk = _read_glb(source_path) + material = _first_textured_material(doc) + if material is None: + return None + + texture_index = int(material["pbrMetallicRoughness"]["baseColorTexture"]["index"]) + textures = doc.get("textures", []) + if not isinstance(textures, list) or texture_index >= len(textures): + return None + + texture = textures[texture_index] + if not isinstance(texture, dict): + return None + image_index = texture.get("source") + if image_index is None: + return None + + images = doc.get("images", 
[]) + if not isinstance(images, list) or int(image_index) >= len(images): + return None + + image = images[int(image_index)] + if not isinstance(image, dict): + return None + + mime_type = str(image.get("mimeType", "")) + extension = _TEXTURE_EXTENSION_BY_MIME_TYPE.get(mime_type) + if extension is None: + return None + + buffer_view_index = image.get("bufferView") + if buffer_view_index is None: + return None + + image_data = _buffer_view_bytes(doc, binary_chunk, int(buffer_view_index)) + if not image_data: + return None + return _TextureAsset(data=image_data, extension=extension) + + +def _first_textured_material(doc: dict[str, Any]) -> dict[str, Any] | None: + materials = doc.get("materials", []) + if not isinstance(materials, list): + return None + for material in materials: + if not isinstance(material, dict): + continue + pbr = material.get("pbrMetallicRoughness", {}) + if not isinstance(pbr, dict): + continue + base_color_texture = pbr.get("baseColorTexture", {}) + if not isinstance(base_color_texture, dict): continue - name = line.split(maxsplit=1)[1].strip() - if name: - names.add(name) - return names + if "index" in base_color_texture: + return material + return None + + +def _read_glb(source_path: Path) -> tuple[dict[str, Any], bytes]: + data = source_path.read_bytes() + if len(data) < 12: + raise ValueError(f"GLB file is too small: {source_path}") + magic, version, declared_length = struct.unpack_from("<4sII", data, 0) + if magic != b"glTF" or version != 2: + raise ValueError(f"Only GLB version 2 files are supported: {source_path}") + if declared_length > len(data): + raise ValueError(f"GLB length header exceeds file size: {source_path}") + + offset = 12 + doc: dict[str, Any] | None = None + binary_chunk = b"" + while offset + 8 <= declared_length: + chunk_length, chunk_type = struct.unpack_from(" declared_length: + raise ValueError(f"GLB chunk exceeds file size: {source_path}") + chunk = data[offset:chunk_end] + offset = chunk_end + if chunk_type == 
_GLB_JSON_CHUNK_TYPE: + doc = json.loads(chunk.decode("utf-8")) + elif chunk_type == _GLB_BINARY_CHUNK_TYPE: + binary_chunk = chunk + if doc is None: + raise ValueError(f"GLB file does not contain a JSON chunk: {source_path}") + return doc, binary_chunk + + +def _buffer_view_bytes( + doc: dict[str, Any], + binary_chunk: bytes, + buffer_view_index: int, +) -> bytes: + buffer_views = doc.get("bufferViews", []) + if not isinstance(buffer_views, list) or buffer_view_index >= len(buffer_views): + return b"" + buffer_view = buffer_views[buffer_view_index] + if not isinstance(buffer_view, dict): + return b"" + if int(buffer_view.get("buffer", 0)) != 0: + return b"" + byte_offset = int(buffer_view.get("byteOffset", 0)) + byte_length = int(buffer_view.get("byteLength", 0)) + if byte_length <= 0: + return b"" + return binary_chunk[byte_offset : byte_offset + byte_length] def _rotation_x_matrix4(degrees: float) -> list[list[float]]: diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 8ee185d7..800d4b00 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -17,6 +17,7 @@ from __future__ import annotations from pathlib import Path +import base64 import hashlib import json import struct @@ -187,7 +188,10 @@ def test_mesh_frame_normalizer_bakes_glb_scene_transform_to_obj( material_text = (normalized_path.parent / "material.mtl").read_text( encoding="utf-8" ) - assert "newmtl material_0" in material_text + material_name = _single_obj_material_name(obj_text) + assert material_name != "material_0" + assert f"newmtl {material_name}" in material_text + assert "map_Kd " not in material_text assert _rounded_vertex_set(_obj_vertices(normalized_path)) == { (1.0, 0.0, 0.0), (1.0, 1.0, 0.0), @@ -195,6 +199,44 @@ def 
test_mesh_frame_normalizer_bakes_glb_scene_transform_to_obj( } +def test_mesh_frame_normalizer_extracts_embedded_base_color_texture( + tmp_path: Path, +) -> None: + mesh_path = tmp_path / "source" / "textured_triangle.glb" + texture_png = _tiny_png() + _write_minimal_glb( + mesh_path, + _default_mesh_vertices(), + embedded_base_color_png=texture_png, + ) + output_dir = tmp_path / "normalized" + + normalized_path = MeshFrameNormalizer(output_dir=output_dir).normalize_path( + mesh_path + ) + + obj_text = normalized_path.read_text(encoding="utf-8") + material_name = _single_obj_material_name(obj_text) + material_text = (output_dir / "material.mtl").read_text(encoding="utf-8") + assert f"newmtl {material_name}" in material_text + assert "Kd 1.0 1.0 1.0" in material_text + map_kd = _single_map_kd_path(material_text, material_name) + assert map_kd.startswith("textures/") + assert map_kd.endswith("_basecolor.png") + assert (output_dir / map_kd).read_bytes() == texture_png + + material_path = output_dir / "material.mtl" + texture_path = output_dir / map_kd + material_path.unlink() + texture_path.unlink() + + reused_path = MeshFrameNormalizer(output_dir=output_dir).normalize_path(mesh_path) + + assert reused_path == normalized_path + assert material_path.is_file() + assert texture_path.read_bytes() == texture_png + + def test_mesh_frame_normalizer_recreates_material_library_for_reused_obj( tmp_path: Path, ) -> None: @@ -211,7 +253,11 @@ def test_mesh_frame_normalizer_recreates_material_library_for_reused_obj( assert reused_path == normalized_path assert material_path.is_file() - assert "newmtl material_0" in material_path.read_text(encoding="utf-8") + material_text = material_path.read_text(encoding="utf-8") + reused_material_name = _single_obj_material_name( + reused_path.read_text(encoding="utf-8") + ) + assert f"newmtl {reused_material_name}" in material_text def test_target_replacements_generate_meshes_and_replace_paths( @@ -697,7 +743,9 @@ def 
fake_call_relative_task_llm(**kwargs): atom_actions = paths.atom_actions.read_text(encoding="utf-8") assert atom_actions.count('"atomic_action_class":"PickUpAction"') == 1 - assert '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in atom_actions + assert ( + '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in atom_actions + ) assert '"obj_name":"cup"' in atom_actions assert _stable_summary(paths.summary)["relation"] == "on" @@ -1050,7 +1098,9 @@ def test_tabletop_z_placement_uses_normalized_mesh_bounds( ) gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) - table_config = next(obj for obj in gym_config["background"] if obj["uid"] == "table") + table_config = next( + obj for obj in gym_config["background"] if obj["uid"] == "table" + ) table_top_z = ur5_basket_config_generation._mesh_config_world_zmax(table_config) expected_min_z = ( table_top_z + ur5_basket_config_generation._TABLETOP_OBJECT_CLEARANCE @@ -1253,6 +1303,29 @@ def _obj_vertices(path: Path) -> list[tuple[float, float, float]]: return vertices +def _single_obj_material_name(obj_text: str) -> str: + names = { + line.split(maxsplit=1)[1].strip() + for line in obj_text.splitlines() + if line.startswith("usemtl ") + } + assert len(names) == 1 + return next(iter(names)) + + +def _single_map_kd_path(material_text: str, material_name: str) -> str: + current_material = None + texture_paths = [] + for line in material_text.splitlines(): + if line.startswith("newmtl "): + current_material = line.split(maxsplit=1)[1].strip() + continue + if current_material == material_name and line.startswith("map_Kd "): + texture_paths.append(line.split(maxsplit=1)[1].strip()) + assert len(texture_paths) == 1 + return texture_paths[0] + + def _rounded_vertex_set( vertices: list[tuple[float, float, float]], ) -> set[tuple[float, float, float]]: @@ -1266,11 +1339,19 @@ def _default_mesh_vertices() -> list[tuple[float, float, float]]: return [(-0.05, 0.0, 0.0), (0.05, 0.0, 0.0), (0.0, 
-0.04, 0.0)] +def _tiny_png() -> bytes: + return base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAADUlEQVR4nGP4z8DwHwAF" + "gAJ/l7p7YwAAAABJRU5ErkJggg==" + ) + + def _write_minimal_glb( path: Path, vertices: list[tuple[float, float, float]], *, node_translation: tuple[float, float, float] | None = None, + embedded_base_color_png: bytes | None = None, ) -> None: path.parent.mkdir(parents=True, exist_ok=True) if len(vertices) < 3: @@ -1281,56 +1362,111 @@ def _write_minimal_glb( ) indices = (0, 1, 2) index_binary = b"".join(struct.pack(" Date: Sun, 14 Jun 2026 22:56:18 +0800 Subject: [PATCH 08/33] conda activate base turn 180 --- .../cli/run_agent_pipeline.py | 2 +- .../generation/prompt_builders.py | 43 +++++----- .../generation/ur5_basket_config.py | 46 +++++----- .../test_ur5_basket_config_generation.py | 83 +++++++++++++------ 4 files changed, 102 insertions(+), 72 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py index ac08b311..6a29cd6c 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py @@ -983,7 +983,7 @@ def _duplicated_numbered_rigid_object_groups( sorted( entries, key=lambda entry: ( - float(entry["y"]), + -float(entry["y"]), str(entry["object"]["uid"]), ), ), diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index b4cf86ec..80b6000a 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -32,6 +32,15 @@ "make_relative_task_prompt", ] +_BASKET_LEFT_RELEASE_OFFSET_Y = 0.04 +_BASKET_RIGHT_RELEASE_OFFSET_Y = -0.04 +_RELATIVE_COORDINATE_CONVENTION = """Coordinate convention for relative placement: +- `left_of` 
means positive world y relative to the reference object. +- `right_of` means negative world y relative to the reference object. +- `front_of` means positive world x relative to the reference object. +- `behind` means negative world x relative to the reference object. +- `inside` and `on` use the reference object's xy center.""" + class _BasketRolesLike(Protocol): left_target_runtime_uid: str @@ -149,12 +158,7 @@ def make_relative_task_prompt( - Active arm: `{active_arm}`. - Keep every `{inactive_slot}` as null. -Coordinate convention for relative placement: -- `left_of` means negative world y relative to the reference object. -- `right_of` means positive world y relative to the reference object. -- `front_of` means negative world x relative to the reference object. -- `behind` means positive world x relative to the reference object. -- `inside` and `on` use the reference object's xy center. +{_RELATIVE_COORDINATE_CONVENTION} Generate one deterministic nominal graph with exactly 6 nominal edges. Use only the atomic action class JSON specs shown below. Do not add recovery, monitor, search, @@ -292,12 +296,7 @@ def _make_dual_relative_task_prompt( `{second.moved_runtime_uid}`. Goal relation: `{second.relation}` ({_relative_relation_phrase(second.relation)}). -Coordinate convention for relative placement: -- `left_of` means negative world y relative to the reference object. -- `right_of` means positive world y relative to the reference object. -- `front_of` means negative world x relative to the reference object. -- `behind` means positive world x relative to the reference object. -- `inside` and `on` use the reference object's xy center. +{_RELATIVE_COORDINATE_CONVENTION} Generate one deterministic nominal graph with exactly 10 nominal edges. Use only the atomic action class JSON specs shown below. 
Do not add recovery, monitor, search, @@ -547,25 +546,25 @@ def make_basket_task_prompt( left_high_spec = _format_pose_object_spec( "left_arm", roles.container_runtime_uid, - (0.0, -0.04, 0.22), + (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.22), sample_interval=45, ) left_release_spec = _format_pose_object_spec( "left_arm", roles.container_runtime_uid, - (0.0, -0.04, 0.12), + (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.12), sample_interval=30, ) right_high_spec = _format_pose_object_spec( "right_arm", roles.container_runtime_uid, - (0.0, 0.04, 0.22), + (0.0, _BASKET_RIGHT_RELEASE_OFFSET_Y, 0.22), sample_interval=45, ) right_release_spec = _format_pose_object_spec( "right_arm", roles.container_runtime_uid, - (0.0, 0.04, 0.12), + (0.0, _BASKET_RIGHT_RELEASE_OFFSET_Y, 0.12), sample_interval=30, ) left_open_spec = _format_gripper_spec( @@ -718,9 +717,9 @@ def make_basket_basic_background( The interactive objects are: - {roles.left_target_runtime_uid}: the {left_target_text} mesh initially on the - negative-y side (source object {roles.left_target_source_uid}). + positive-y side (source object {roles.left_target_source_uid}). - {roles.right_target_runtime_uid}: the {right_target_text} mesh initially on the - positive-y side (source object {roles.right_target_source_uid}). + negative-y side (source object {roles.right_target_source_uid}). - {roles.container_runtime_uid}: the target container near the center of the table (source object {roles.container_source_uid}). 
@@ -760,25 +759,25 @@ def make_basket_atom_actions_prompt(roles: _BasketRolesLike) -> str: left_high_spec = _format_pose_object_spec( "left_arm", roles.container_runtime_uid, - (0.0, -0.04, 0.22), + (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.22), sample_interval=45, ) left_release_spec = _format_pose_object_spec( "left_arm", roles.container_runtime_uid, - (0.0, -0.04, 0.12), + (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.12), sample_interval=30, ) right_high_spec = _format_pose_object_spec( "right_arm", roles.container_runtime_uid, - (0.0, 0.04, 0.22), + (0.0, _BASKET_RIGHT_RELEASE_OFFSET_Y, 0.22), sample_interval=45, ) right_release_spec = _format_pose_object_spec( "right_arm", roles.container_runtime_uid, - (0.0, 0.04, 0.12), + (0.0, _BASKET_RIGHT_RELEASE_OFFSET_Y, 0.12), sample_interval=30, ) return f"""### Atomic Action Class JSON Specs for UR5BreadBasket Dual-UR5 Placement diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py index 484b855b..01b2ef89 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py @@ -122,6 +122,10 @@ _DUAL_UR5_ARM_COMPONENT_Z = 0.4 _DUAL_UR5_TABLETOP_CLEARANCE = 0.25 _DUAL_UR5_SIDE_AXIS_INDEX = 1 +_DUAL_UR5_ROTATED_INIT_X = 2.0 +_DUAL_UR5_ROTATED_INIT_YAW_DEGREES = -90.0 +_ROBOT_VIEW_LEFT_WORLD_Y_SIGN = 1.0 +_ROBOT_VIEW_FRONT_WORLD_X_SIGN = 1.0 _BACKGROUND_MAX_CONVEX_HULL_NUM = 1 _TARGET_MAX_CONVEX_HULL_NUM = 16 _CONTAINER_MAX_CONVEX_HULL_NUM = 8 @@ -574,7 +578,7 @@ def _pick_left_right_targets( key=lambda obj: abs(_side_axis_value(obj)), reverse=True, )[:2] - left, right = sorted(picked, key=_side_axis_value) + left, right = sorted(picked, key=_side_axis_value, reverse=True) return left, right @@ -595,7 +599,7 @@ def _position_side_axis_value(position: list[float]) -> float: def _arm_side_for_position(position: list[float]) -> 
str: - return "left" if _position_side_axis_value(position) < 0.0 else "right" + return "left" if _position_side_axis_value(position) >= 0.0 else "right" def _target_noun(left_target: _SceneObject, right_target: _SceneObject) -> str: @@ -1051,8 +1055,8 @@ def _call_role_llm( "Return only one JSON object with keys: container_object, " "left_target_object, right_target_object, target_noun, " "container_runtime_uid. Use only source_uid values from the scene. The " - "left target starts on the negative-y side, and the right target starts " - "on the positive-y side.\n\n" + "rotated robot-view left target starts on the positive-y side, and the " + "rotated robot-view right target starts on the negative-y side.\n\n" f"Project: {project_name}\n" f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}\n" f"Default roles:\n{json.dumps(default_roles, ensure_ascii=False, indent=2)}" @@ -1184,9 +1188,10 @@ def _call_relative_task_llm( "left arm, or left UR5; use arm='right' for explicit right-arm " "instructions such as 右臂, 右机械臂, right arm, or right UR5; use " "arm='auto' when the task does not specify an arm.\n" - "- For Chinese/English left/right/front/back, use the relation enums. " - "front_of means negative world-x; behind means positive world-x; " - "left_of means negative world-y; right_of means positive world-y.\n" + "- For Chinese/English left/right/front/back, use the relation enums " + "from the rotated robot-view perspective. 
front_of means positive " + "world-x; behind means negative world-x; left_of means positive " + "world-y; right_of means negative world-y.\n" "- If the task says to release an object above a basket/container so it " "falls into it, use goal_relation='inside'.\n" "- If the task says to stack/place one object on another non-container " @@ -1330,7 +1335,7 @@ def _relative_forced_arm_sides( return inferred_sides side_values = [_position_side_axis_value(position) for position in positions] - if side_values[0] <= side_values[1]: + if side_values[0] >= side_values[1]: return ["left", "right"] return ["right", "left"] @@ -1508,14 +1513,11 @@ def _relative_release_offset(relation: str) -> list[float]: return [0.0, 0.0, _SIDE_RELEASE_Z_OFFSET] if relation == "on": return [0.0, 0.0, _ON_RELEASE_Z_OFFSET] - if relation == "left_of": - return [0.0, -_SIDE_RELATION_DISTANCE, _SIDE_RELEASE_Z_OFFSET] - if relation == "right_of": - return [0.0, _SIDE_RELATION_DISTANCE, _SIDE_RELEASE_Z_OFFSET] - if relation == "front_of": - return [-_SIDE_RELATION_DISTANCE, 0.0, _SIDE_RELEASE_Z_OFFSET] - if relation == "behind": - return [_SIDE_RELATION_DISTANCE, 0.0, _SIDE_RELEASE_Z_OFFSET] + if relation in {"left_of", "right_of", "front_of", "behind"}: + axis, offset, _ = _side_relation_axes(relation) + release_offset = [0.0, 0.0, _SIDE_RELEASE_Z_OFFSET] + release_offset[0 if axis == "x" else 1] = offset + return release_offset raise ValueError(f"Unsupported relative placement relation: {relation!r}.") @@ -2514,13 +2516,13 @@ def _make_relative_placement_success_spec( def _side_relation_axes(relation: str) -> tuple[str, float, str]: if relation == "left_of": - return "y", -_SIDE_RELATION_DISTANCE, "x" + return "y", _ROBOT_VIEW_LEFT_WORLD_Y_SIGN * _SIDE_RELATION_DISTANCE, "x" if relation == "right_of": - return "y", _SIDE_RELATION_DISTANCE, "x" + return "y", -_ROBOT_VIEW_LEFT_WORLD_Y_SIGN * _SIDE_RELATION_DISTANCE, "x" if relation == "front_of": - return "x", -_SIDE_RELATION_DISTANCE, "y" + return 
"x", _ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE, "y" if relation == "behind": - return "x", _SIDE_RELATION_DISTANCE, "y" + return "x", -_ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE, "y" raise ValueError(f"Unsupported side relation: {relation!r}.") @@ -2771,8 +2773,8 @@ def _make_dual_ur5_robot_config(*, robot_init_z: float) -> dict[str, Any]: }, ], }, - "init_pos": [-2.0, 0.0, float(robot_init_z)], - "init_rot": [0.0, 0.0, 90.0], + "init_pos": [_DUAL_UR5_ROTATED_INIT_X, 0.0, float(robot_init_z)], + "init_rot": [0.0, 0.0, _DUAL_UR5_ROTATED_INIT_YAW_DEGREES], "init_qpos": [ 0, 0, diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 800d4b00..af37acad 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -28,6 +28,9 @@ from embodichain.gen_sim.action_agent_pipeline.generation import ( ur5_basket_config as ur5_basket_config_generation, ) +from embodichain.gen_sim.action_agent_pipeline.cli import ( + run_agent_pipeline as run_agent_pipeline_cli, +) from embodichain.gen_sim.action_agent_pipeline.generation.mesh_frame_normalization import ( MESH_FRAME_NORMALIZATION_POLICY_VERSION, MeshFrameNormalizer, @@ -69,8 +72,8 @@ def test_ur5_basket_generator_uses_parallel_handoff( _assert_normalized_obj_path(rigid_objects["right_apple"]["shape"]["fpath"]) _assert_normalized_obj_path(background_objects["table"]["shape"]["fpath"]) _assert_normalized_obj_path(background_objects["wicker_basket"]["shape"]["fpath"]) - assert gym_config["robot"]["init_pos"] == [-2.0, 0.0, 0.5] - assert gym_config["robot"]["init_rot"] == [0.0, 0.0, 90.0] + assert gym_config["robot"]["init_pos"] == [2.0, 0.0, 0.5] + assert gym_config["robot"]["init_rot"] == [0.0, 0.0, -90.0] success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] assert 
{term["object"] for term in success_terms} == {"left_apple", "right_apple"} @@ -87,16 +90,16 @@ def test_ur5_basket_generator_uses_parallel_handoff( assert "Generate exactly 10 nominal edges" in normalized_task_prompt assert "Generate exactly 11 nominal edges" not in normalized_task_prompt - assert "negative-y side" in basic_background assert "positive-y side" in basic_background + assert "negative-y side" in basic_background assert "negative-x side" not in basic_background assert "positive-x side" not in basic_background - assert '"offset":[0.0,-0.04,0.22]' in task_prompt assert '"offset":[0.0,0.04,0.22]' in task_prompt + assert '"offset":[0.0,-0.04,0.22]' in task_prompt assert '"offset":[-0.04,0.0,0.22]' not in task_prompt assert '"offset":[0.04,0.0,0.22]' not in task_prompt - assert '"offset":[0.0,-0.04,0.22]' in atom_actions assert '"offset":[0.0,0.04,0.22]' in atom_actions + assert '"offset":[0.0,-0.04,0.22]' in atom_actions assert "parallel handoff" in task_prompt assert "parallel handoff" in basic_background assert "parallel handoff" in atom_actions @@ -136,8 +139,8 @@ def test_generator_normalizes_glb_meshes_and_preserves_source_rot( assert background_objects["table"]["init_rot"] == [0.0, 0.0, 180.0] assert background_objects["wicker_basket"]["init_rot"] == [0.0, 0.0, 180.0] - assert rigid_objects["right_apple"]["init_rot"] == [0.0, 0.0, 140.0] - assert rigid_objects["left_apple"]["init_rot"] == [0.0, 0.0, 160.0] + assert rigid_objects["left_apple"]["init_rot"] == [0.0, 0.0, 140.0] + assert rigid_objects["right_apple"]["init_rot"] == [0.0, 0.0, 160.0] for obj_config in [ background_objects["table"], background_objects["wicker_basket"], @@ -329,24 +332,50 @@ def test_target_replacements_can_sync_runtime_names( background_objects = {obj["uid"]: obj for obj in gym_config["background"]} - assert set(rigid_objects) == {"left_orange", "right_apple"} + assert set(rigid_objects) == {"left_apple", "right_orange"} assert "wicker_basket" in background_objects assert 
background_objects["wicker_basket"]["body_type"] == "kinematic" - _assert_normalized_obj_path(rigid_objects["left_orange"]["shape"]["fpath"]) - _assert_normalized_obj_path(rigid_objects["right_apple"]["shape"]["fpath"]) + _assert_normalized_obj_path(rigid_objects["left_apple"]["shape"]["fpath"]) + _assert_normalized_obj_path(rigid_objects["right_orange"]["shape"]["fpath"]) success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] assert {term["object"] for term in success_terms} == { - "left_orange", - "right_apple", + "left_apple", + "right_orange", } task_prompt = paths.task_prompt.read_text(encoding="utf-8") basic_background = paths.basic_background.read_text(encoding="utf-8") - assert "the left orange and right apple into the wicker_basket" in task_prompt - assert "left_arm must only manipulate `left_orange`" in task_prompt - assert "- left_orange: the orange mesh initially" in basic_background - assert "- right_apple: the apple mesh initially" in basic_background + assert "the left apple and right orange into the wicker_basket" in task_prompt + assert "left_arm must only manipulate `left_apple`" in task_prompt + assert "- left_apple: the apple mesh initially" in basic_background + assert "- right_orange: the orange mesh initially" in basic_background + + +def test_pipeline_auto_replacement_uses_rotated_robot_view_order() -> None: + gym_config = { + "rigid_object": [ + {"uid": "bread_1", "init_pos": [0.0, 0.2, 0.76]}, + {"uid": "bread_2", "init_pos": [0.0, -0.1, 0.76]}, + ], + } + + assert ( + run_agent_pipeline_cli._auto_replacement_source_uid( + gym_config, + replacement_number=1, + option_name="--target_replacement1", + ) + == "bread_1" + ) + assert ( + run_agent_pipeline_cli._auto_replacement_source_uid( + gym_config, + replacement_number=2, + option_name="--target_replacement2", + ) + == "bread_2" + ) def test_directory_input_prefers_merged_config_and_preserves_extra_scene_scale( @@ -459,7 +488,7 @@ def 
fake_call_relative_task_llm(**kwargs): for term in success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("y", -0.16) in axis_terms + assert ("y", 0.16) in axis_terms assert ("x", 0.0) in axis_terms assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] @@ -481,7 +510,7 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "wicker_basket", "relation": "left_of", "active_arm": "left_arm", - "release_offset": [0.0, -0.16, 0.12], + "release_offset": [0.0, 0.16, 0.12], } @@ -537,13 +566,13 @@ def fake_call_relative_task_llm(**kwargs): for term in success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("x", -0.16) in axis_terms + assert ("x", 0.16) in axis_terms assert ("y", 0.0) in axis_terms task_prompt = paths.task_prompt.read_text(encoding="utf-8") atom_actions = paths.atom_actions.read_text(encoding="utf-8") - assert '"offset":[-0.16,0.0,0.22]' in task_prompt - assert '"offset":[-0.16,0.0,0.22]' in atom_actions + assert '"offset":[0.16,0.0,0.22]' in task_prompt + assert '"offset":[0.16,0.0,0.22]' in atom_actions assert _stable_summary(paths.summary) == { "mode": "relative_placement", @@ -551,7 +580,7 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "apple_2", "relation": "front_of", "active_arm": "right_arm", - "release_offset": [-0.16, 0.0, 0.12], + "release_offset": [0.16, 0.0, 0.12], } @@ -819,8 +848,8 @@ def fake_call_relative_task_llm(**kwargs): for term in placement_success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("apple_2", "y", -0.16) in axis_terms - assert ("apple_1", "y", 0.16) in axis_terms + assert ("apple_2", "y", 0.16) in axis_terms + assert ("apple_1", "y", -0.16) in axis_terms attr_names = { attr["name"] @@ -836,14 +865,14 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "wicker_basket", "relation": "left_of", "active_arm": "left_arm", - "release_offset": [0.0, -0.16, 0.12], + "release_offset": [0.0, 0.16, 0.12], }, { 
"moved_object": "apple_1", "reference_object": "wicker_basket", "relation": "right_of", "active_arm": "right_arm", - "release_offset": [0.0, 0.16, 0.12], + "release_offset": [0.0, -0.16, 0.12], }, ], } @@ -968,7 +997,7 @@ def fake_call_relative_task_llm(**kwargs): ) active_arms = [placement["active_arm"] for placement in paths.summary["placements"]] - assert active_arms == ["left_arm", "right_arm"] + assert active_arms == ["right_arm", "left_arm"] gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] From 3a1109d569cdac2040a2a80a0afc5e1e2e0aa3ab Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Mon, 15 Jun 2026 09:50:04 +0800 Subject: [PATCH 09/33] direction right --- .../cli/run_agent_pipeline.py | 2 +- .../generation/prompt_builders.py | 30 ++++---- .../generation/ur5_basket_config.py | 40 +++++------ .../test_ur5_basket_config_generation.py | 69 ++++++++++++------- 4 files changed, 84 insertions(+), 57 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py index 6a29cd6c..ac08b311 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py @@ -983,7 +983,7 @@ def _duplicated_numbered_rigid_object_groups( sorted( entries, key=lambda entry: ( - -float(entry["y"]), + float(entry["y"]), str(entry["object"]["uid"]), ), ), diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 80b6000a..65455915 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -32,11 +32,11 @@ "make_relative_task_prompt", ] 
-_BASKET_LEFT_RELEASE_OFFSET_Y = 0.04 -_BASKET_RIGHT_RELEASE_OFFSET_Y = -0.04 +_BASKET_LEFT_RELEASE_OFFSET_Y = -0.04 +_BASKET_RIGHT_RELEASE_OFFSET_Y = 0.04 _RELATIVE_COORDINATE_CONVENTION = """Coordinate convention for relative placement: -- `left_of` means positive world y relative to the reference object. -- `right_of` means negative world y relative to the reference object. +- `left_of` means negative world y relative to the reference object. +- `right_of` means positive world y relative to the reference object. - `front_of` means positive world x relative to the reference object. - `behind` means negative world x relative to the reference object. - `inside` and `on` use the reference object's xy center.""" @@ -371,8 +371,10 @@ def make_relative_basic_background( from a simple natural-language task description. The robot is a dual-UR5 composite robot with DH_PGI_140_80 parallel grippers: -- left_arm is the UR5 outside the left side of the table's near long edge. -- right_arm is the UR5 outside the right side of the table's near long edge. +- left_arm is the semantic robot-view left slot, mapped to the physical + right_arm control part. +- right_arm is the semantic robot-view right slot, mapped to the physical + left_arm control part. The active arm for this task is `{active_arm}`. The inactive arm `{inactive_arm}` must stay null in the nominal graph. @@ -411,8 +413,10 @@ def _make_dual_relative_basic_background( generated from a simple natural-language task description. The robot is a dual-UR5 composite robot with DH_PGI_140_80 parallel grippers: -- left_arm is the UR5 outside the left side of the table's near long edge. -- right_arm is the UR5 outside the right side of the table's near long edge. +- left_arm is the semantic robot-view left slot, mapped to the physical + right_arm control part. +- right_arm is the semantic robot-view right slot, mapped to the physical + left_arm control part. 
Both arms participate in the nominal graph: {placement_lines} @@ -708,8 +712,10 @@ def make_basket_basic_background( grippers. The robot is a dual-UR5 composite robot with two parallel grippers: -- left_arm is the UR5 outside the left side of the table's near long edge. -- right_arm is the UR5 outside the right side of the table's near long edge. +- left_arm is the semantic robot-view left slot, mapped to the physical + right_arm control part. +- right_arm is the semantic robot-view right slot, mapped to the physical + left_arm control part. Both UR5 bases are on the same long side of the table and face inward toward the central {roles.container_runtime_uid}. The bases are intentionally kept @@ -717,9 +723,9 @@ def make_basket_basic_background( The interactive objects are: - {roles.left_target_runtime_uid}: the {left_target_text} mesh initially on the - positive-y side (source object {roles.left_target_source_uid}). + negative-y side (source object {roles.left_target_source_uid}). - {roles.right_target_runtime_uid}: the {right_target_text} mesh initially on the - negative-y side (source object {roles.right_target_source_uid}). + positive-y side (source object {roles.right_target_source_uid}). - {roles.container_runtime_uid}: the target container near the center of the table (source object {roles.container_source_uid}). 
diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py index 01b2ef89..f6046031 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py @@ -124,7 +124,7 @@ _DUAL_UR5_SIDE_AXIS_INDEX = 1 _DUAL_UR5_ROTATED_INIT_X = 2.0 _DUAL_UR5_ROTATED_INIT_YAW_DEGREES = -90.0 -_ROBOT_VIEW_LEFT_WORLD_Y_SIGN = 1.0 +_ROBOT_VIEW_LEFT_WORLD_Y_SIGN = -1.0 _ROBOT_VIEW_FRONT_WORLD_X_SIGN = 1.0 _BACKGROUND_MAX_CONVEX_HULL_NUM = 1 _TARGET_MAX_CONVEX_HULL_NUM = 16 @@ -578,7 +578,7 @@ def _pick_left_right_targets( key=lambda obj: abs(_side_axis_value(obj)), reverse=True, )[:2] - left, right = sorted(picked, key=_side_axis_value, reverse=True) + left, right = sorted(picked, key=_side_axis_value) return left, right @@ -599,7 +599,7 @@ def _position_side_axis_value(position: list[float]) -> float: def _arm_side_for_position(position: list[float]) -> str: - return "left" if _position_side_axis_value(position) >= 0.0 else "right" + return "left" if _position_side_axis_value(position) < 0.0 else "right" def _target_noun(left_target: _SceneObject, right_target: _SceneObject) -> str: @@ -1055,8 +1055,8 @@ def _call_role_llm( "Return only one JSON object with keys: container_object, " "left_target_object, right_target_object, target_noun, " "container_runtime_uid. Use only source_uid values from the scene. 
The " - "rotated robot-view left target starts on the positive-y side, and the " - "rotated robot-view right target starts on the negative-y side.\n\n" + "rotated robot-view left target starts on the negative-y side, and the " + "rotated robot-view right target starts on the positive-y side.\n\n" f"Project: {project_name}\n" f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}\n" f"Default roles:\n{json.dumps(default_roles, ensure_ascii=False, indent=2)}" @@ -1190,8 +1190,8 @@ def _call_relative_task_llm( "arm='auto' when the task does not specify an arm.\n" "- For Chinese/English left/right/front/back, use the relation enums " "from the rotated robot-view perspective. front_of means positive " - "world-x; behind means negative world-x; left_of means positive " - "world-y; right_of means negative world-y.\n" + "world-x; behind means negative world-x; left_of means negative " + "world-y; right_of means positive world-y.\n" "- If the task says to release an object above a basket/container so it " "falls into it, use goal_relation='inside'.\n" "- If the task says to stack/place one object on another non-container " @@ -1335,7 +1335,7 @@ def _relative_forced_arm_sides( return inferred_sides side_values = [_position_side_axis_value(position) for position in positions] - if side_values[0] >= side_values[1]: + if side_values[0] <= side_values[1]: return ["left", "right"] return ["right", "left"] @@ -2386,17 +2386,17 @@ def _make_extensions_config(roles: _BasketTaskRoles) -> dict[str, Any]: return { "agent_arm_slots": { "left": { - "arm": "left_arm", - "eef": "left_eef", - }, - "right": { "arm": "right_arm", "eef": "right_eef", }, + "right": { + "arm": "left_arm", + "eef": "left_eef", + }, }, "arm_aim_yaw_offset": { - "left": 0.0, - "right": 3.141592653589793, + "left": 3.141592653589793, + "right": 0.0, }, "gripper_open_state": [0.0], "gripper_close_state": [0.04], @@ -2433,17 +2433,17 @@ def _make_relative_extensions_config(spec: 
_RelativePlacementSpec) -> dict[str, return { "agent_arm_slots": { "left": { - "arm": "left_arm", - "eef": "left_eef", - }, - "right": { "arm": "right_arm", "eef": "right_eef", }, + "right": { + "arm": "left_arm", + "eef": "left_eef", + }, }, "arm_aim_yaw_offset": { - "left": 0.0, - "right": 3.141592653589793, + "left": 3.141592653589793, + "right": 0.0, }, "gripper_open_state": [0.0], "gripper_close_state": [0.04], diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index af37acad..90f6e6cf 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -74,6 +74,19 @@ def test_ur5_basket_generator_uses_parallel_handoff( _assert_normalized_obj_path(background_objects["wicker_basket"]["shape"]["fpath"]) assert gym_config["robot"]["init_pos"] == [2.0, 0.0, 0.5] assert gym_config["robot"]["init_rot"] == [0.0, 0.0, -90.0] + extensions = gym_config["env"]["extensions"] + assert extensions["agent_arm_slots"]["left"] == { + "arm": "right_arm", + "eef": "right_eef", + } + assert extensions["agent_arm_slots"]["right"] == { + "arm": "left_arm", + "eef": "left_eef", + } + assert extensions["arm_aim_yaw_offset"]["left"] == pytest.approx( + 3.141592653589793 + ) + assert extensions["arm_aim_yaw_offset"]["right"] == pytest.approx(0.0) success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] assert {term["object"] for term in success_terms} == {"left_apple", "right_apple"} @@ -94,12 +107,20 @@ def test_ur5_basket_generator_uses_parallel_handoff( assert "negative-y side" in basic_background assert "negative-x side" not in basic_background assert "positive-x side" not in basic_background - assert '"offset":[0.0,0.04,0.22]' in task_prompt - assert '"offset":[0.0,-0.04,0.22]' in task_prompt + left_high_offset_spec = ( + 
'"robot_name":"left_arm","control":"arm","target_pose":{"reference":"object",' + '"obj_name":"wicker_basket","offset":[0.0,-0.04,0.22]' + ) + right_high_offset_spec = ( + '"robot_name":"right_arm","control":"arm","target_pose":{"reference":"object",' + '"obj_name":"wicker_basket","offset":[0.0,0.04,0.22]' + ) + assert left_high_offset_spec in task_prompt + assert right_high_offset_spec in task_prompt assert '"offset":[-0.04,0.0,0.22]' not in task_prompt assert '"offset":[0.04,0.0,0.22]' not in task_prompt - assert '"offset":[0.0,0.04,0.22]' in atom_actions - assert '"offset":[0.0,-0.04,0.22]' in atom_actions + assert left_high_offset_spec in atom_actions + assert right_high_offset_spec in atom_actions assert "parallel handoff" in task_prompt assert "parallel handoff" in basic_background assert "parallel handoff" in atom_actions @@ -139,8 +160,8 @@ def test_generator_normalizes_glb_meshes_and_preserves_source_rot( assert background_objects["table"]["init_rot"] == [0.0, 0.0, 180.0] assert background_objects["wicker_basket"]["init_rot"] == [0.0, 0.0, 180.0] - assert rigid_objects["left_apple"]["init_rot"] == [0.0, 0.0, 140.0] - assert rigid_objects["right_apple"]["init_rot"] == [0.0, 0.0, 160.0] + assert rigid_objects["right_apple"]["init_rot"] == [0.0, 0.0, 140.0] + assert rigid_objects["left_apple"]["init_rot"] == [0.0, 0.0, 160.0] for obj_config in [ background_objects["table"], background_objects["wicker_basket"], @@ -332,24 +353,24 @@ def test_target_replacements_can_sync_runtime_names( background_objects = {obj["uid"]: obj for obj in gym_config["background"]} - assert set(rigid_objects) == {"left_apple", "right_orange"} + assert set(rigid_objects) == {"left_orange", "right_apple"} assert "wicker_basket" in background_objects assert background_objects["wicker_basket"]["body_type"] == "kinematic" - _assert_normalized_obj_path(rigid_objects["left_apple"]["shape"]["fpath"]) - _assert_normalized_obj_path(rigid_objects["right_orange"]["shape"]["fpath"]) + 
_assert_normalized_obj_path(rigid_objects["left_orange"]["shape"]["fpath"]) + _assert_normalized_obj_path(rigid_objects["right_apple"]["shape"]["fpath"]) success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] assert {term["object"] for term in success_terms} == { - "left_apple", - "right_orange", + "left_orange", + "right_apple", } task_prompt = paths.task_prompt.read_text(encoding="utf-8") basic_background = paths.basic_background.read_text(encoding="utf-8") - assert "the left apple and right orange into the wicker_basket" in task_prompt - assert "left_arm must only manipulate `left_apple`" in task_prompt - assert "- left_apple: the apple mesh initially" in basic_background - assert "- right_orange: the orange mesh initially" in basic_background + assert "the left orange and right apple into the wicker_basket" in task_prompt + assert "left_arm must only manipulate `left_orange`" in task_prompt + assert "- left_orange: the orange mesh initially" in basic_background + assert "- right_apple: the apple mesh initially" in basic_background def test_pipeline_auto_replacement_uses_rotated_robot_view_order() -> None: @@ -366,7 +387,7 @@ def test_pipeline_auto_replacement_uses_rotated_robot_view_order() -> None: replacement_number=1, option_name="--target_replacement1", ) - == "bread_1" + == "bread_2" ) assert ( run_agent_pipeline_cli._auto_replacement_source_uid( @@ -374,7 +395,7 @@ def test_pipeline_auto_replacement_uses_rotated_robot_view_order() -> None: replacement_number=2, option_name="--target_replacement2", ) - == "bread_2" + == "bread_1" ) @@ -488,7 +509,7 @@ def fake_call_relative_task_llm(**kwargs): for term in success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("y", 0.16) in axis_terms + assert ("y", -0.16) in axis_terms assert ("x", 0.0) in axis_terms assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] @@ -510,7 +531,7 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": 
"wicker_basket", "relation": "left_of", "active_arm": "left_arm", - "release_offset": [0.0, 0.16, 0.12], + "release_offset": [0.0, -0.16, 0.12], } @@ -848,8 +869,8 @@ def fake_call_relative_task_llm(**kwargs): for term in placement_success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("apple_2", "y", 0.16) in axis_terms - assert ("apple_1", "y", -0.16) in axis_terms + assert ("apple_2", "y", -0.16) in axis_terms + assert ("apple_1", "y", 0.16) in axis_terms attr_names = { attr["name"] @@ -865,14 +886,14 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "wicker_basket", "relation": "left_of", "active_arm": "left_arm", - "release_offset": [0.0, 0.16, 0.12], + "release_offset": [0.0, -0.16, 0.12], }, { "moved_object": "apple_1", "reference_object": "wicker_basket", "relation": "right_of", "active_arm": "right_arm", - "release_offset": [0.0, -0.16, 0.12], + "release_offset": [0.0, 0.16, 0.12], }, ], } @@ -997,7 +1018,7 @@ def fake_call_relative_task_llm(**kwargs): ) active_arms = [placement["active_arm"] for placement in paths.summary["placements"]] - assert active_arms == ["right_arm", "left_arm"] + assert active_arms == ["left_arm", "right_arm"] gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] From 52d9a758753fd27546b172846df83511c04355f6 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Tue, 16 Jun 2026 14:12:15 +0800 Subject: [PATCH 10/33] fix one object error and robot high --- .../generation/prompt_builders.py | 244 +++++++--- .../generation/ur5_basket_config.py | 416 +++++++++++++++--- .../test_ur5_basket_config_generation.py | 248 +++++++++++ 3 files changed, 788 insertions(+), 120 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 65455915..4d3bced7 100644 
--- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -39,6 +39,10 @@ - `right_of` means positive world y relative to the reference object. - `front_of` means positive world x relative to the reference object. - `behind` means negative world x relative to the reference object. +- `front_left_of` combines positive world x and negative world y. +- `back_left_of` combines negative world x and negative world y. +- `front_right_of` combines positive world x and positive world y. +- `back_right_of` combines negative world x and positive world y. - `inside` and `on` use the reference object's xy center.""" @@ -62,6 +66,9 @@ class _RelativePlacementLike(Protocol): relation: str high_offset: tuple[float, float, float] release_offset: tuple[float, float, float] + reference_is_initial_pose: bool + high_position: Sequence[float] | None + release_position: Sequence[float] | None class _RelativeSpecLike(_RelativePlacementLike, Protocol): @@ -114,16 +121,16 @@ def make_relative_task_prompt( active_slot = f"{spec.active_side}_arm_action" action_sketch = _format_action_sketch(spec.action_sketch) pick_spec = _format_pick_up_spec(active_arm, spec.moved_runtime_uid) - high_spec = _format_pose_object_spec( + high_spec = _format_relative_pose_spec( active_arm, - spec.reference_runtime_uid, - spec.high_offset, + spec, + pose_kind="high", sample_interval=45, ) - release_spec = _format_pose_object_spec( + release_spec = _format_relative_pose_spec( active_arm, - spec.reference_runtime_uid, - spec.release_offset, + spec, + pose_kind="release", sample_interval=30, ) open_spec = _format_gripper_spec( @@ -138,6 +145,10 @@ def make_relative_task_prompt( sample_interval=20, ) initial_spec = _format_initial_qpos_spec(active_arm, sample_interval=30) + reference_line = _relative_reference_line(spec) + final_planning_rule = _relative_final_planning_rule(project_name, spec) + high_step_label = 
_relative_pose_step_label(spec, "high staging") + release_step_label = _relative_pose_step_label(spec, "release") return f"""Task: {task_name}: {spec.task_prompt_summary} @@ -152,8 +163,7 @@ def make_relative_task_prompt( Object and arm mapping: - Move `{spec.moved_runtime_uid}`. Source object: `{spec.moved_source_uid}`. -- Use `{spec.reference_runtime_uid}` as the spatial reference. Source object: - `{spec.reference_source_uid}`. +- {reference_line} - Goal relation: `{spec.relation}` ({_relative_relation_phrase(spec.relation)}). - Active arm: `{active_arm}`. - Keep every `{inactive_slot}` as null. @@ -168,11 +178,11 @@ def make_relative_task_prompt( - {active_slot}: {pick_spec} - {inactive_slot}: null -2. Move the held object to the high staging pose relative to the reference: +2. Move the held object to the {high_step_label} pose: - {active_slot}: {high_spec} - {inactive_slot}: null -3. Lower the held object to the release pose: +3. Lower the held object to the {release_step_label} pose: - {active_slot}: {release_spec} - {inactive_slot}: null @@ -189,9 +199,8 @@ def make_relative_task_prompt( - {inactive_slot}: null Final state: `{spec.moved_runtime_uid}` must be -{_relative_relation_phrase(spec.relation)} `{spec.reference_runtime_uid}`. Always -plan to the current object poses from the exported {project_name} environment -config. Do not hard-code absolute object coordinates in the generated graph. +{_relative_relation_phrase(spec.relation)} `{spec.reference_runtime_uid}`. 
+{final_planning_rule} """ @@ -208,28 +217,28 @@ def _make_dual_relative_task_prompt( action_sketch = _format_action_sketch(spec.action_sketch) first_pick_spec = _format_pick_up_spec(first_arm, first.moved_runtime_uid) second_pick_spec = _format_pick_up_spec(second_arm, second.moved_runtime_uid) - first_high_spec = _format_pose_object_spec( + first_high_spec = _format_relative_pose_spec( first_arm, - first.reference_runtime_uid, - first.high_offset, + first, + pose_kind="high", sample_interval=45, ) - first_release_spec = _format_pose_object_spec( + first_release_spec = _format_relative_pose_spec( first_arm, - first.reference_runtime_uid, - first.release_offset, + first, + pose_kind="release", sample_interval=30, ) - second_high_spec = _format_pose_object_spec( + second_high_spec = _format_relative_pose_spec( second_arm, - second.reference_runtime_uid, - second.high_offset, + second, + pose_kind="high", sample_interval=45, ) - second_release_spec = _format_pose_object_spec( + second_release_spec = _format_relative_pose_spec( second_arm, - second.reference_runtime_uid, - second.release_offset, + second, + pose_kind="release", sample_interval=30, ) first_open_spec = _format_gripper_spec( @@ -272,6 +281,9 @@ def _make_dual_relative_task_prompt( second_arm, sample_interval=30, ) + first_reference_line = _relative_reference_line(first) + second_reference_line = _relative_reference_line(second) + final_planning_rule = _dual_relative_final_planning_rule(project_name, spec) return f"""Task: {task_name}: {spec.task_prompt_summary} @@ -289,12 +301,10 @@ def _make_dual_relative_task_prompt( `{first.moved_source_uid}`. - {second_slot} must manipulate `{second.moved_runtime_uid}`. Source object: `{second.moved_source_uid}`. -- `{first.reference_runtime_uid}` is the spatial reference for - `{first.moved_runtime_uid}`. Goal relation: `{first.relation}` - ({_relative_relation_phrase(first.relation)}). 
-- `{second.reference_runtime_uid}` is the spatial reference for - `{second.moved_runtime_uid}`. Goal relation: `{second.relation}` - ({_relative_relation_phrase(second.relation)}). +- {first_reference_line} Goal relation for `{first.moved_runtime_uid}`: + `{first.relation}` ({_relative_relation_phrase(first.relation)}). +- {second_reference_line} Goal relation for `{second.moved_runtime_uid}`: + `{second.relation}` ({_relative_relation_phrase(second.relation)}). {_RELATIVE_COORDINATE_CONVENTION} @@ -347,9 +357,8 @@ def _make_dual_relative_task_prompt( Final state: `{first.moved_runtime_uid}` must be {_relative_relation_phrase(first.relation)} `{first.reference_runtime_uid}`, and `{second.moved_runtime_uid}` must be {_relative_relation_phrase(second.relation)} -`{second.reference_runtime_uid}`. Always plan to the current object poses from the -exported {project_name} environment config. Do not hard-code absolute object -coordinates in the generated graph. +`{second.reference_runtime_uid}`. +{final_planning_rule} """ @@ -381,16 +390,14 @@ def make_relative_basic_background( Interactive task objects: - {spec.moved_runtime_uid}: moved object from source `{spec.moved_source_uid}`. -- {spec.reference_runtime_uid}: reference object from source - `{spec.reference_source_uid}`. +- {_relative_reference_line(spec)} Config-stage LLM notes: {notes} The execution-stage LLM should generate graph JSON that grasps the moved object, -moves it to a high staging pose relative to the current reference object pose, -lowers to the release pose, opens the gripper, retreats upward, and returns the -active arm to its initial pose. +moves it to the configured high staging pose, lowers to the release pose, opens +the gripper, retreats upward, and returns the active arm to its initial pose. 
""" @@ -437,16 +444,16 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: active_arm = f"{spec.active_side}_arm" inactive_arm = "right_arm" if spec.active_side == "left" else "left_arm" - high_spec = _format_pose_object_spec( + high_spec = _format_relative_pose_spec( active_arm, - spec.reference_runtime_uid, - spec.high_offset, + spec, + pose_kind="high", sample_interval=45, ) - release_spec = _format_pose_object_spec( + release_spec = _format_relative_pose_spec( active_arm, - spec.reference_runtime_uid, - spec.release_offset, + spec, + pose_kind="release", sample_interval=30, ) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Relative Placement @@ -458,9 +465,9 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: Use exactly these action patterns: - Pick up `{spec.moved_runtime_uid}`: {_format_pick_up_spec(active_arm, spec.moved_runtime_uid)} -- High staging relative to `{spec.reference_runtime_uid}`: +- {_relative_pose_step_label(spec, "High staging")}: {high_spec} -- Release pose relative to `{spec.reference_runtime_uid}`: +- {_relative_pose_step_label(spec, "Release pose")}: {release_spec} - Release the held object: {_format_gripper_spec(active_arm, "open", sample_interval=15, post_hold_steps=25)} @@ -475,28 +482,28 @@ def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: first, second = spec.placements first_arm = f"{first.active_side}_arm" second_arm = f"{second.active_side}_arm" - first_high_spec = _format_pose_object_spec( + first_high_spec = _format_relative_pose_spec( first_arm, - first.reference_runtime_uid, - first.high_offset, + first, + pose_kind="high", sample_interval=45, ) - first_release_spec = _format_pose_object_spec( + first_release_spec = _format_relative_pose_spec( first_arm, - first.reference_runtime_uid, - first.release_offset, + first, + pose_kind="release", sample_interval=30, ) - second_high_spec = _format_pose_object_spec( + second_high_spec = 
_format_relative_pose_spec( second_arm, - second.reference_runtime_uid, - second.high_offset, + second, + pose_kind="high", sample_interval=45, ) - second_release_spec = _format_pose_object_spec( + second_release_spec = _format_relative_pose_spec( second_arm, - second.reference_runtime_uid, - second.release_offset, + second, + pose_kind="release", sample_interval=30, ) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Dual-Arm Relative Placement @@ -878,6 +885,59 @@ def _format_pose_object_spec( ) +def _format_relative_pose_spec( + robot_name: str, + placement: _RelativePlacementLike, + *, + pose_kind: str, + sample_interval: int, +) -> str: + if getattr(placement, "reference_is_initial_pose", False): + position = ( + placement.high_position + if pose_kind == "high" + else placement.release_position + ) + if position is None: + raise ValueError( + "Self-relative placement requires absolute high/release positions." + ) + return _format_pose_absolute_spec( + robot_name, + position, + sample_interval=sample_interval, + ) + + offset = placement.high_offset if pose_kind == "high" else placement.release_offset + return _format_pose_object_spec( + robot_name, + placement.reference_runtime_uid, + offset, + sample_interval=sample_interval, + ) + + +def _format_pose_absolute_spec( + robot_name: str, + position: Sequence[float], + *, + sample_interval: int, +) -> str: + return _compact_json( + { + "atomic_action_class": "MoveAction", + "robot_name": robot_name, + "control": "arm", + "target_pose": { + "reference": "absolute", + "position": [float(value) for value in position], + "orientation": "current", + }, + "cfg": {"sample_interval": sample_interval}, + } + ) + + def _format_pose_offset_spec( robot_name: str, offset: tuple[float, float, float], @@ -945,6 +1005,64 @@ def _format_action_sketch(action_sketch: list[str]) -> str: return "\n".join(f"- {item}" for item in action_sketch) +def _relative_reference_line(spec: _RelativePlacementLike) -> str: + if 
getattr(spec, "reference_is_initial_pose", False): + return ( + f"Use the initial position of `{spec.moved_runtime_uid}` as the fixed " + f"spatial anchor. Source object: `{spec.moved_source_uid}`." + ) + return ( + f"Use `{spec.reference_runtime_uid}` as the spatial reference. Source " + f"object: `{spec.reference_source_uid}`." + ) + + +def _relative_pose_step_label( + spec: _RelativePlacementLike, + label: str, +) -> str: + if getattr(spec, "reference_is_initial_pose", False): + return f"{label} at the absolute initial-position offset" + return f"{label} relative to `{spec.reference_runtime_uid}`" + + +def _relative_final_planning_rule( + project_name: str, + spec: _RelativePlacementLike, +) -> str: + if getattr(spec, "reference_is_initial_pose", False): + return ( + "Use the exact absolute target_pose JSON specs shown above. Do not " + "rewrite this self-relative task as an object-referenced pose, because " + "the moved object would become a moving reference after pickup." + ) + return ( + f"Always plan to the current object poses from the exported {project_name} " + "environment config. Do not hard-code absolute object coordinates in the " + "generated graph." + ) + + +def _dual_relative_final_planning_rule( + project_name: str, + spec: _RelativeSpecLike, +) -> str: + if any( + getattr(placement, "reference_is_initial_pose", False) + for placement in spec.placements + ): + return ( + "Use the exact absolute target_pose JSON specs shown above for any " + "initial-position placement. Do not rewrite those self-relative " + "steps as object-referenced poses." + ) + return ( + f"Always plan to the current object poses from the exported {project_name} " + "environment config. Do not hard-code absolute object coordinates in the " + "generated graph." 
+ ) + + def _relative_relation_phrase(relation: str) -> str: if relation == "inside": return "inside" @@ -958,6 +1076,14 @@ def _relative_relation_phrase(relation: str) -> str: return "in front of" if relation == "behind": return "behind" + if relation == "front_left_of": + return "to the front-left of" + if relation == "back_left_of": + return "to the back-left of" + if relation == "front_right_of": + return "to the front-right of" + if relation == "back_right_of": + return "to the back-right of" raise ValueError(f"Unsupported relative placement relation: {relation!r}.") diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py index f6046031..2d93c0f8 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py @@ -69,6 +69,25 @@ "right_of", "front_of", "behind", + "front_left_of", + "back_left_of", + "front_right_of", + "back_right_of", +} + +_SIDE_RELATIONS = _RELATIVE_RELATIONS - {"inside", "on"} + +_SELF_REFERENCE_VALUES = { + "self", + "initial_self", + "initial_position", + "initial_pose", + "origin", + "itself", + "自身", + "自己", + "原位", + "初始位置", } _RELATION_ALIASES = { @@ -93,11 +112,47 @@ "to_the_left_of": "left_of", "左": "left_of", "左边": "left_of", + "front_left": "front_left_of", + "front_left_of": "front_left_of", + "left_front": "front_left_of", + "left_front_of": "front_left_of", + "to_the_front_left_of": "front_left_of", + "左前": "front_left_of", + "左前方": "front_left_of", + "左前面": "front_left_of", + "back_left": "back_left_of", + "back_left_of": "back_left_of", + "behind_left": "back_left_of", + "left_back": "back_left_of", + "left_behind": "back_left_of", + "left_back_of": "back_left_of", + "to_the_back_left_of": "back_left_of", + "左后": "back_left_of", + "左后方": "back_left_of", + "左后面": "back_left_of", "右": "right_of", "右边": "right_of", 
"right": "right_of", "right_of": "right_of", "to_the_right_of": "right_of", + "front_right": "front_right_of", + "front_right_of": "front_right_of", + "right_front": "front_right_of", + "right_front_of": "front_right_of", + "to_the_front_right_of": "front_right_of", + "右前": "front_right_of", + "右前方": "front_right_of", + "右前面": "front_right_of", + "back_right": "back_right_of", + "back_right_of": "back_right_of", + "behind_right": "back_right_of", + "right_back": "back_right_of", + "right_behind": "back_right_of", + "right_back_of": "back_right_of", + "to_the_back_right_of": "back_right_of", + "右后": "back_right_of", + "右后方": "back_right_of", + "右后面": "back_right_of", "front": "front_of", "front_of": "front_of", "in_front_of": "front_of", @@ -231,6 +286,9 @@ class _RelativePlacementStepSpec: active_side: str release_offset: list[float] high_offset: list[float] + reference_is_initial_pose: bool = False + release_position: list[float] | None = None + high_position: list[float] | None = None @dataclass(frozen=True) @@ -249,6 +307,9 @@ class _RelativePlacementSpec: release_offset: list[float] high_offset: list[float] placements: tuple[_RelativePlacementStepSpec, ...] + reference_is_initial_pose: bool = False + release_position: list[float] | None = None + high_position: list[float] | None = None def generate_ur5_basket_config_from_project( @@ -1094,9 +1155,9 @@ def _build_relative_placement_spec_with_llm( rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] if not background_objects: raise ValueError("Relative placement generation requires a background table.") - if len(rigid_objects) < 2: + if not rigid_objects: raise ValueError( - "Relative placement generation requires at least two rigid objects." + "Relative placement generation requires a movable rigid object." 
) table = _pick_table(background_objects) @@ -1119,6 +1180,7 @@ def _build_relative_placement_spec_with_llm( return _apply_relative_task_response( response=response, table_source_uid=table.source_uid, + scene_objects=scene_objects, rigid_objects=rigid_objects, task_description=task_description, ) @@ -1151,9 +1213,9 @@ def _call_relative_task_llm( ' "placements": [\n' " {\n" ' "moved_object": "",\n' - ' "reference_object": "",\n' + ' "reference_object": "",\n' ' "goal_relation": ' - '"inside|on|left_of|right_of|front_of|behind",\n' + '"inside|on|left_of|right_of|front_of|behind|front_left_of|back_left_of|front_right_of|back_right_of",\n' ' "arm": "left|right|auto"\n' " }\n" " ],\n" @@ -1168,7 +1230,7 @@ def _call_relative_task_llm( " ]\n" "}\n\n" "Rules:\n" - "- Use only source_uid values from rigid_object entries.\n" + "- Use only source_uid values from the scene objects listed below.\n" "- Return one placement for a single-arm task and exactly two placements " "for a dual-arm task.\n" "- Treat the task as dual-arm when it explicitly says 双臂, 两臂, both " @@ -1178,8 +1240,15 @@ def _call_relative_task_llm( "- moved_object is the object to grasp and move.\n" "- reference_object is the object used as the spatial reference, " "container, or support.\n" + "- reference_object may be a rigid_object or a background object such as " + "a pad, tray, basket, or container.\n" + "- For single-object directional tasks such as moving the only object " + "forward, left, front-left, or back-right from its initial position, set " + "reference_object to the same source_uid as moved_object (or 'self'). 
" + "This means the generator will use the object's initial position as a " + "fixed anchor, not the object's moving runtime pose.\n" "- Within each placement, moved_object and reference_object must be " - "different.\n" + "different unless the task is an initial-position directional move.\n" "- For dual-arm tasks, the placements must use two different moved_object " "values and one left arm plus one right arm. Use arm='auto' only when " "the user did not specify which arm handles that placement.\n" @@ -1191,7 +1260,8 @@ def _call_relative_task_llm( "- For Chinese/English left/right/front/back, use the relation enums " "from the rotated robot-view perspective. front_of means positive " "world-x; behind means negative world-x; left_of means negative " - "world-y; right_of means positive world-y.\n" + "world-y; right_of means positive world-y. Diagonal relations combine " + "both axes: front_left_of, back_left_of, front_right_of, back_right_of.\n" "- If the task says to release an object above a basket/container so it " "falls into it, use goal_relation='inside'.\n" "- If the task says to stack/place one object on another non-container " @@ -1227,11 +1297,15 @@ def _apply_relative_task_response( *, response: Mapping[str, Any], table_source_uid: str, + scene_objects: list[_SceneObject], rigid_objects: list[_SceneObject], task_description: str, ) -> _RelativePlacementSpec: - by_uid = {obj.source_uid: obj for obj in rigid_objects} - runtime_uids = _relative_runtime_uid_mapping(rigid_objects) + by_uid = {obj.source_uid: obj for obj in scene_objects} + runtime_uids = _relative_scene_runtime_uid_mapping( + scene_objects, + table_source_uid=table_source_uid, + ) placement_entries = _relative_placement_entries(response) if len(placement_entries) > 2: @@ -1246,6 +1320,7 @@ def _apply_relative_task_response( _build_relative_placement_step( entry=entry, by_uid=by_uid, + scene_objects=scene_objects, rigid_objects=rigid_objects, runtime_uids=runtime_uids, forced_side=forced_side, 
@@ -1279,6 +1354,9 @@ def _apply_relative_task_response( release_offset=primary.release_offset, high_offset=primary.high_offset, placements=placements, + reference_is_initial_pose=primary.reference_is_initial_pose, + release_position=primary.release_position, + high_position=primary.high_position, ) @@ -1344,6 +1422,7 @@ def _build_relative_placement_step( *, entry: Mapping[str, Any], by_uid: Mapping[str, _SceneObject], + scene_objects: list[_SceneObject], rigid_objects: list[_SceneObject], runtime_uids: Mapping[str, str], forced_side: str | None, @@ -1353,24 +1432,26 @@ def _build_relative_placement_step( rigid_objects, field_name="moved_object", ) - reference_source_uid = _resolve_rigid_source_uid( + relation = _normalize_relative_relation(entry.get("goal_relation")) + reference_source_uid = _resolve_relative_reference_source_uid( entry.get("reference_object"), - rigid_objects, - field_name="reference_object", + moved_source_uid=moved_source_uid, + scene_objects=scene_objects, ) - if moved_source_uid == reference_source_uid: + reference_is_initial_pose = moved_source_uid == reference_source_uid + if reference_is_initial_pose and relation not in _SIDE_RELATIONS: raise ValueError( - "Relative placement requires distinct moved/reference objects." + "Initial-position self-relative placement only supports directional " + "relations, not inside/on." ) reference_obj = by_uid[reference_source_uid] - relation = _normalize_relative_relation(entry.get("goal_relation")) if relation == "on" and _is_container_like(reference_obj): relation = "inside" moved_runtime_uid = runtime_uids[moved_source_uid] reference_runtime_uid = runtime_uids[reference_source_uid] - if moved_runtime_uid == reference_runtime_uid: + if moved_runtime_uid == reference_runtime_uid and not reference_is_initial_pose: raise ValueError( f"Relative placement produced duplicate runtime uid {moved_runtime_uid!r}." 
) @@ -1401,6 +1482,7 @@ def _build_relative_placement_step( active_side=active_side, release_offset=release_offset, high_offset=high_offset, + reference_is_initial_pose=reference_is_initial_pose, ) @@ -1425,18 +1507,49 @@ def _resolve_rigid_source_uid( rigid_objects: list[_SceneObject], *, field_name: str, +) -> str: + return _resolve_scene_source_uid( + value, + rigid_objects, + field_name=field_name, + ) + + +def _resolve_relative_reference_source_uid( + value: Any, + *, + moved_source_uid: str, + scene_objects: list[_SceneObject], +) -> str: + if value is not None: + text = str(value).strip() + normalized = text.lower().replace("-", "_").replace(" ", "_") + if normalized in _SELF_REFERENCE_VALUES: + return moved_source_uid + return _resolve_scene_source_uid( + value, + scene_objects, + field_name="reference_object", + ) + + +def _resolve_scene_source_uid( + value: Any, + scene_objects: list[_SceneObject], + *, + field_name: str, ) -> str: if value is None: raise ValueError(f"LLM response missing required {field_name}.") text = str(value).strip() - by_uid = {obj.source_uid: obj for obj in rigid_objects} + by_uid = {obj.source_uid: obj for obj in scene_objects} if text in by_uid: return text normalized = _normalize_runtime_uid(text) matches = [ obj.source_uid - for obj in rigid_objects + for obj in scene_objects if _normalize_runtime_uid(obj.source_uid) == normalized or _base_name(obj) == normalized or _candidate_relative_runtime_uid(obj) == normalized @@ -1513,14 +1626,37 @@ def _relative_release_offset(relation: str) -> list[float]: return [0.0, 0.0, _SIDE_RELEASE_Z_OFFSET] if relation == "on": return [0.0, 0.0, _ON_RELEASE_Z_OFFSET] - if relation in {"left_of", "right_of", "front_of", "behind"}: - axis, offset, _ = _side_relation_axes(relation) - release_offset = [0.0, 0.0, _SIDE_RELEASE_Z_OFFSET] - release_offset[0 if axis == "x" else 1] = offset - return release_offset + if relation in _SIDE_RELATIONS: + x_offset, y_offset = 
_side_relation_xy_offsets(relation) + return [x_offset, y_offset, _SIDE_RELEASE_Z_OFFSET] raise ValueError(f"Unsupported relative placement relation: {relation!r}.") +def _side_relation_xy_offsets(relation: str) -> tuple[float, float]: + relation = _normalize_relative_relation(relation) + left_y = _ROBOT_VIEW_LEFT_WORLD_Y_SIGN * _SIDE_RELATION_DISTANCE + right_y = -_ROBOT_VIEW_LEFT_WORLD_Y_SIGN * _SIDE_RELATION_DISTANCE + front_x = _ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE + behind_x = -_ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE + if relation == "left_of": + return 0.0, left_y + if relation == "right_of": + return 0.0, right_y + if relation == "front_of": + return front_x, 0.0 + if relation == "behind": + return behind_x, 0.0 + if relation == "front_left_of": + return front_x, left_y + if relation == "back_left_of": + return behind_x, left_y + if relation == "front_right_of": + return front_x, right_y + if relation == "back_right_of": + return behind_x, right_y + raise ValueError(f"Unsupported side relation: {relation!r}.") + + def _relative_runtime_uid_mapping( rigid_objects: list[_SceneObject], ) -> dict[str, str]: @@ -1551,6 +1687,36 @@ def _relative_runtime_uid_mapping( } +def _relative_scene_runtime_uid_mapping( + scene_objects: list[_SceneObject], + *, + table_source_uid: str, +) -> dict[str, str]: + candidates: dict[str, str] = {} + rigid_runtime_uids = _relative_runtime_uid_mapping( + [obj for obj in scene_objects if obj.source_role == "rigid_object"] + ) + for obj in scene_objects: + if obj.source_uid == table_source_uid: + candidates[obj.source_uid] = "table" + elif obj.source_role == "rigid_object": + candidates[obj.source_uid] = rigid_runtime_uids[obj.source_uid] + else: + candidates[obj.source_uid] = _candidate_relative_runtime_uid(obj) + + counts: dict[str, int] = {} + for runtime_uid in candidates.values(): + counts[runtime_uid] = counts.get(runtime_uid, 0) + 1 + return { + source_uid: ( + runtime_uid + if source_uid == 
table_source_uid or counts[runtime_uid] == 1 + else _normalize_runtime_uid(source_uid) + ) + for source_uid, runtime_uid in candidates.items() + } + + def _candidate_relative_runtime_uid(obj: _SceneObject) -> str: if _is_container_like(obj): return _container_runtime_uid(obj) @@ -1643,6 +1809,14 @@ def _relative_relation_phrase(relation: str) -> str: return "in front of" if relation == "behind": return "behind" + if relation == "front_left_of": + return "to the front-left of" + if relation == "back_left_of": + return "to the back-left of" + if relation == "front_right_of": + return "to the front-right of" + if relation == "back_right_of": + return "to the back-right of" raise ValueError(f"Unsupported relative placement relation: {relation!r}.") @@ -1808,8 +1982,17 @@ def _build_relative_placement_bundle( ] rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] by_uid = {obj.source_uid: obj for obj in scene_objects} - runtime_uids = _relative_runtime_uid_mapping(rigid_objects) + runtime_uids = _relative_scene_runtime_uid_mapping( + scene_objects, + table_source_uid=spec.table_source_uid, + ) moved_source_uids = {placement.moved_source_uid for placement in spec.placements} + reference_runtime_uids = { + placement.reference_runtime_uid for placement in spec.placements + } + registered_runtime_uids = sorted( + {runtime_uids[obj.source_uid] for obj in rigid_objects} | reference_runtime_uids + ) dynamic_rigid_objects = [ obj for obj in rigid_objects if obj.source_uid in moved_source_uids ] @@ -1830,10 +2013,10 @@ def _build_relative_placement_bundle( "max_episodes": int(max_episodes), "max_episode_steps": int(max_episode_steps), "env": { - "extensions": _make_relative_extensions_config(spec), - "events": _make_relative_events_config(spec, list(runtime_uids.values())), + "extensions": {}, + "events": _make_relative_events_config(spec, registered_runtime_uids), "observations": _make_observations_config(), - "dataset": 
_make_relative_dataset_config(project_name, spec), + "dataset": {}, }, "robot": _make_dual_ur5_robot_config(robot_init_z=robot_init_z), "sensor": _make_sensor_config(), @@ -1854,7 +2037,12 @@ def _build_relative_placement_bundle( for obj in static_scene_objects ], *[ - _make_extra_background_config(scene_dir, obj, mesh_normalizer) + _make_extra_background_config( + scene_dir, + obj, + mesh_normalizer, + runtime_uid=runtime_uids[obj.source_uid], + ) for obj in background_objects if obj.source_uid != spec.table_source_uid ], @@ -1875,6 +2063,9 @@ def _build_relative_placement_bundle( ], } _apply_tabletop_z_placement(gym_config, table_top_z) + spec = _with_self_relative_absolute_targets(spec, gym_config) + gym_config["env"]["extensions"] = _make_relative_extensions_config(spec) + gym_config["env"]["dataset"] = _make_relative_dataset_config(project_name, spec) return { "gym_config": gym_config, "agent_config": make_agent_config(), @@ -1885,6 +2076,82 @@ def _build_relative_placement_bundle( } +def _with_self_relative_absolute_targets( + spec: _RelativePlacementSpec, + gym_config: Mapping[str, Any], +) -> _RelativePlacementSpec: + if not any(placement.reference_is_initial_pose for placement in spec.placements): + return spec + + generated_positions = { + str(obj.get("uid")): _clean_vector3(obj.get("init_pos", [0.0, 0.0, 0.0])) + for obj in gym_config.get("rigid_object", []) + } + placements = tuple( + _with_self_relative_absolute_target(placement, generated_positions) + for placement in spec.placements + ) + primary = placements[0] + return _RelativePlacementSpec( + table_source_uid=spec.table_source_uid, + moved_source_uid=primary.moved_source_uid, + reference_source_uid=primary.reference_source_uid, + moved_runtime_uid=primary.moved_runtime_uid, + reference_runtime_uid=primary.reference_runtime_uid, + relation=primary.relation, + active_side=primary.active_side, + task_description=spec.task_description, + task_prompt_summary=spec.task_prompt_summary, + 
basic_background_notes=spec.basic_background_notes, + action_sketch=spec.action_sketch, + release_offset=primary.release_offset, + high_offset=primary.high_offset, + placements=placements, + reference_is_initial_pose=primary.reference_is_initial_pose, + release_position=primary.release_position, + high_position=primary.high_position, + ) + + +def _with_self_relative_absolute_target( + placement: _RelativePlacementStepSpec, + generated_positions: Mapping[str, list[float]], +) -> _RelativePlacementStepSpec: + if not placement.reference_is_initial_pose: + return placement + initial_position = generated_positions.get(placement.moved_runtime_uid) + if initial_position is None: + raise ValueError( + "Generated relative config missing self-relative moved object " + f"{placement.moved_runtime_uid!r}." + ) + release_position = _offset_position(initial_position, placement.release_offset) + high_position = _offset_position(initial_position, placement.high_offset) + return _RelativePlacementStepSpec( + moved_source_uid=placement.moved_source_uid, + reference_source_uid=placement.reference_source_uid, + moved_runtime_uid=placement.moved_runtime_uid, + reference_runtime_uid=placement.reference_runtime_uid, + relation=placement.relation, + active_side=placement.active_side, + release_offset=placement.release_offset, + high_offset=placement.high_offset, + reference_is_initial_pose=True, + release_position=release_position, + high_position=high_position, + ) + + +def _offset_position( + position: Sequence[float], + offset: Sequence[float], +) -> list[float]: + return [ + round(float(position[index]) + float(offset[index]), 6) + for index in range(3) + ] + + def _target_body_scale_vector( target_body_scale: float | list[float] | tuple[float, float, float], ) -> list[float]: @@ -1928,7 +2195,7 @@ def _dual_ur5_init_z_from_table_top(table_top_z: float | None) -> float: return _DUAL_UR5_LEGACY_INIT_Z init_z = table_top_z + _DUAL_UR5_TABLETOP_CLEARANCE - _DUAL_UR5_ARM_COMPONENT_Z - 
return round(max(_DUAL_UR5_LEGACY_INIT_Z, init_z), 6) + return round(init_z, 6) def _apply_tabletop_z_placement( @@ -2483,28 +2750,30 @@ def _make_relative_placement_success_spec( "max_z_offset": 0.35, } - primary_axis, primary_offset, secondary_axis = _side_relation_axes( - placement.relation - ) + if placement.reference_is_initial_pose: + if placement.release_position is None: + raise ValueError( + "Self-relative success requires an absolute release position." + ) + return { + "op": "all", + "terms": [ + *_absolute_xy_success_terms( + placement.moved_runtime_uid, + placement.release_position, + ), + { + "type": "object_not_fallen", + "object": placement.moved_runtime_uid, + "max_tilt": 0.9, + }, + ], + } + return { "op": "all", "terms": [ - { - "type": "object_axis_offset_near", - "object": placement.moved_runtime_uid, - "reference": placement.reference_runtime_uid, - "axis": primary_axis, - "offset": primary_offset, - "tolerance": 0.05, - }, - { - "type": "object_axis_offset_near", - "object": placement.moved_runtime_uid, - "reference": placement.reference_runtime_uid, - "axis": secondary_axis, - "offset": 0.0, - "tolerance": 0.06, - }, + *_relative_xy_success_terms(placement), { "type": "object_not_fallen", "object": placement.moved_runtime_uid, @@ -2514,21 +2783,42 @@ def _make_relative_placement_success_spec( } -def _side_relation_axes(relation: str) -> tuple[str, float, str]: - if relation == "left_of": - return "y", _ROBOT_VIEW_LEFT_WORLD_Y_SIGN * _SIDE_RELATION_DISTANCE, "x" - if relation == "right_of": - return "y", -_ROBOT_VIEW_LEFT_WORLD_Y_SIGN * _SIDE_RELATION_DISTANCE, "x" - if relation == "front_of": - return "x", _ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE, "y" - if relation == "behind": - return "x", -_ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE, "y" - raise ValueError(f"Unsupported side relation: {relation!r}.") +def _absolute_xy_success_terms( + object_uid: str, + position: Sequence[float], +) -> list[dict[str, Any]]: + 
return [ + { + "type": "object_axis_near", + "object": object_uid, + "axis": axis, + "target": float(position[index]), + "tolerance": 0.05, + } + for index, axis in enumerate(("x", "y")) + ] + + +def _relative_xy_success_terms( + placement: _RelativePlacementStepSpec, +) -> list[dict[str, Any]]: + x_offset, y_offset = _side_relation_xy_offsets(placement.relation) + return [ + { + "type": "object_axis_offset_near", + "object": placement.moved_runtime_uid, + "reference": placement.reference_runtime_uid, + "axis": axis, + "offset": offset, + "tolerance": 0.05 if offset else 0.06, + } + for axis, offset in (("x", x_offset), ("y", y_offset)) + ] def _make_relative_events_config( spec: _RelativePlacementSpec, - rigid_runtime_uids: list[str], + registered_runtime_uids: list[str], ) -> dict[str, Any]: return { "record_camera": _record_camera_event_config(), @@ -2556,7 +2846,8 @@ def _make_relative_events_config( "mode": "reset", "params": { "registry": [ - _object_registry_entry(uid) for uid in sorted(rigid_runtime_uids) + _object_registry_entry(uid) + for uid in sorted(registered_runtime_uids) ], "registration": "affordance_datas", "sim_update": True, @@ -2933,10 +3224,11 @@ def _make_extra_background_config( obj: _SceneObject, mesh_normalizer: MeshFrameNormalizer, body_scale: Any | None = None, + runtime_uid: str | None = None, ) -> dict[str, Any]: shape = _make_shape_config(scene_dir, obj.config, mesh_normalizer=mesh_normalizer) config = { - "uid": _normalize_runtime_uid(obj.source_uid), + "uid": runtime_uid or _normalize_runtime_uid(obj.source_uid), "shape": shape, "attrs": copy.deepcopy(dict(obj.config.get("attrs", _BACKGROUND_ATTRS))), "body_scale": _clean_vector3( @@ -3286,6 +3578,8 @@ def _validate_success_uids( "object_relative_axis_near", }: required_keys = ("object", "reference") + elif success_type in {"object_axis_near", "object_coordinate_near"}: + required_keys = ("object",) elif success_type in {"object_not_fallen", "not_fallen"}: required_keys = 
("object",) else: diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 90f6e6cf..fc5651fa 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -605,6 +605,164 @@ def fake_call_relative_task_llm(**kwargs): } +def test_task_description_generates_self_relative_front_left_config( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + for rel_path in ( + "mesh_assets/table/table_0.glb", + "mesh_assets/chip_bag/chip_bag_1.glb", + ): + _write_minimal_glb(project_dir / rel_path, _default_mesh_vertices()) + + gym_config = { + "id": "Image2Tabletop-1790000000-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 180.0], + ), + ], + "rigid_object": [ + _mesh_object( + "chip_bag_1", + "mesh_assets/chip_bag/chip_bag_1.glb", + [0.18, 0.22, 0.76], + [0.0, 0.0, 25.0], + ) + ], + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "chip_bag_1", + "reference_object": "chip_bag_1", + "goal_relation": "front_left_of", + "arm": "left", + "task_prompt_summary": "Move the chip bag front-left from its start.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_self_relative_agent", + task_description="用左臂把薯片袋子往左前移动", + target_body_scale=0.5, + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + assert 
set(rigid_objects) == {"chip_bag"} + initial_position = rigid_objects["chip_bag"]["init_pos"] + expected_x = round(initial_position[0] + 0.16, 6) + expected_y = round(initial_position[1] - 0.16, 6) + + success = gym_config["env"]["extensions"]["agent_success"] + assert success["op"] == "all" + axis_terms = { + (term.get("axis"), term.get("target")) + for term in success["terms"] + if term["type"] == "object_axis_near" + } + assert ("x", expected_x) in axis_terms + assert ("y", expected_y) in axis_terms + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert '"reference":"absolute"' in task_prompt + assert '"reference":"absolute"' in atom_actions + assert f'"position":[{expected_x},{expected_y},' in task_prompt + + assert _stable_summary(paths.summary) == { + "mode": "relative_placement", + "moved_object": "chip_bag", + "reference_object": "chip_bag", + "relation": "front_left_of", + "active_arm": "left_arm", + "release_offset": [0.16, -0.16, 0.12], + } + + +def test_task_description_generates_relative_front_right_config( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "apple_1", + "reference_object": "basket_3", + "goal_relation": "front_right_of", + "arm": "right", + "task_prompt_summary": "Move apple_1 to the front-right of basket_3.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + monkeypatch.setattr( + ur5_basket_config_generation, + "_resolve_table_mesh_world_zmax", + lambda scene_dir, table_obj: None, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_front_right_relative_agent", + task_description="用右臂把 apple_1 放到 basket_3 右前", + prewarm_coacd_cache=False, + ) + + gym_config = 
json.loads(paths.gym_config.read_text(encoding="utf-8")) + success = gym_config["env"]["extensions"]["agent_success"] + axis_terms = { + (term.get("axis"), term.get("offset")) + for term in success["terms"] + if term["type"] == "object_axis_offset_near" + } + assert ("x", 0.16) in axis_terms + assert ("y", 0.16) in axis_terms + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + assert '"offset":[0.16,0.16,0.12]' in task_prompt + assert _stable_summary(paths.summary)["release_offset"] == [0.16, 0.16, 0.12] + + +@pytest.mark.parametrize( + ("raw_relation", "normalized"), + [ + ("左前", "front_left_of"), + ("左后", "back_left_of"), + ("右前", "front_right_of"), + ("右后", "back_right_of"), + ], +) +def test_relative_relation_aliases_include_diagonal_chinese_directions( + raw_relation: str, + normalized: str, +) -> None: + assert ur5_basket_config_generation._normalize_relative_relation(raw_relation) == ( + normalized + ) + + def test_task_description_on_container_is_compiled_as_inside( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -800,6 +958,96 @@ def fake_call_relative_task_llm(**kwargs): assert _stable_summary(paths.summary)["relation"] == "on" +def test_task_description_allows_single_rigid_with_background_reference( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + for rel_path in ( + "mesh_assets/table/table_0.glb", + "mesh_assets/pad/pad_1.glb", + "mesh_assets/chip_bag/chip_bag_1.glb", + ): + _write_minimal_glb(project_dir / rel_path, _default_mesh_vertices()) + + gym_config = { + "id": "Image2Tabletop-1790000000-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 180.0], + ), + _mesh_object( + "pad_1", + "mesh_assets/pad/pad_1.glb", + [-0.1, -0.15, 0.74], + [0.0, 0.0, 0.0], + ), + ], + "rigid_object": [ + _mesh_object( + "chip_bag_1", + "mesh_assets/chip_bag/chip_bag_1.glb", + [0.18, 0.22, 0.76], + [0.0, 0.0, 25.0], + 
) + ], + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + def fake_call_relative_task_llm(**kwargs): + scene_roles = { + item["source_uid"]: item["role"] for item in kwargs["scene_summary"] + } + assert scene_roles["chip_bag_1"] == "rigid_object" + assert scene_roles["pad_1"] == "background" + return { + "moved_object": "chip_bag_1", + "reference_object": "pad_1", + "goal_relation": "on", + "arm": "left", + "task_prompt_summary": "Place the chip bag on the pad.", + } + + monkeypatch.setattr( + ur5_basket_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_ur5_basket_config_from_project( + project_dir, + tmp_path / "generated_single_rigid_agent", + task_description="用左臂抓薯片袋子放到垫子上", + target_body_scale=0.5, + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + background_objects = {obj["uid"]: obj for obj in gym_config["background"]} + assert set(rigid_objects) == {"chip_bag"} + assert rigid_objects["chip_bag"]["body_type"] == "dynamic" + assert rigid_objects["chip_bag"]["body_scale"] == [0.5, 0.5, 0.5] + assert background_objects["pad"]["body_type"] == "static" + + success = gym_config["env"]["extensions"]["agent_success"] + assert success["type"] == "object_on_object" + assert success["object"] == "chip_bag" + assert success["support"] == "pad" + + registry = gym_config["env"]["events"]["register_info_to_env"]["params"][ + "registry" + ] + registered_uids = {entry["entity_cfg"]["uid"] for entry in registry} + assert {"chip_bag", "pad"}.issubset(registered_uids) + + def test_task_description_generates_dual_arm_relative_config( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, From 7dc0d84d8c19e319eb23cc01cc02564c7f057de5 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Tue, 
16 Jun 2026 15:55:29 +0800 Subject: [PATCH 11/33] fix affoardance --- .../runtime/atom_actions.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py index 876f28df..6cea7566 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -771,6 +771,8 @@ def _build_object_semantics( if target_obj is None: raise ValueError(f"No rigid object found for {obj_name}.") + _stabilize_affordance_object(env, target_obj, runtime_kwargs) + mesh_vertices = target_obj.get_vertices(env_ids=[0], scale=True)[0] mesh_triangles = target_obj.get_triangles(env_ids=[0])[0] mesh_vertices = torch.as_tensor(mesh_vertices, dtype=torch.float32) @@ -856,6 +858,20 @@ def _build_object_semantics( ) +def _stabilize_affordance_object( + env, + target_obj, + runtime_kwargs: Mapping[str, Any], +) -> None: + if not bool(runtime_kwargs.get("stabilize_affordance_object", True)): + return + + update_steps = int(runtime_kwargs.get("affordance_stabilization_steps", 5)) + if update_steps > 0: + env.sim.update(step=update_steps) + target_obj.clear_dynamics() + + def _trajectory_to_agent_action(env, robot_name, trajectory, joint_ids): _, _, current_arm_qpos, _, current_gripper_state = get_arm_states(env, robot_name) _, _, _, arm_joints, eef_joints = _select_arm_parts(env, robot_name) From fb473a29815f879ed6fc50885ed75aaef5ba51e4 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Tue, 16 Jun 2026 18:29:47 +0800 Subject: [PATCH 12/33] fix: tighten action-agent atomic runtime schema --- .../action_agent_pipeline/cli/run_agent.py | 2 +- .../{atomic_actions.py => agent_env.py} | 0 .../env_adapters/tableware/base_agent_env.py | 1 - .../generation/prompt_builders.py | 2 - .../prompts/atom_actions.txt | 5 +- 
.../runtime/atom_action_utils.py | 68 --------- .../runtime/atom_actions.py | 130 ++++++++++++++---- .../runtime/graph_compiler.py | 7 + .../test_backend_atomic_runtime.py | 14 ++ .../test_graph_spec_backend_atomic.py | 12 ++ 10 files changed, 142 insertions(+), 99 deletions(-) rename embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/{atomic_actions.py => agent_env.py} (100%) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py index 10723c70..de459813 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py @@ -23,7 +23,7 @@ import torch import tqdm -from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.atomic_actions import ( # noqa: F401 +from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.agent_env import ( # noqa: F401 AtomicActionsAgentEnv, ) from embodichain.lab.gym.utils.gym_utils import ( diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/atomic_actions.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/agent_env.py similarity index 100% rename from embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/atomic_actions.py rename to embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/agent_env.py diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py index bbbe303e..28be3bff 100644 --- a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py +++ b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py @@ -333,7 +333,6 @@ def create_demo_action_list( regenerate=regenerate, recovery=recovery ) atomic_action_kwargs = { - "use_place_action": True, "allow_grasp_annotation": True, "force_grasp_reannotate": False, 
} diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 4d3bced7..91104691 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -878,7 +878,6 @@ def _format_pose_object_spec( "reference": "object", "obj_name": obj_name, "offset": [float(x), float(y), float(z)], - "orientation": "current", }, "cfg": {"sample_interval": sample_interval}, } @@ -931,7 +930,6 @@ def _format_pose_absolute_spec( "target_pose": { "reference": "absolute", "position": [float(value) for value in position], - "orientation": "current", }, "cfg": {"sample_interval": sample_interval}, } diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt index 541763c8..1d704c59 100644 --- a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt @@ -22,8 +22,8 @@ Use only these atomic action classes: - Use `control: "arm"` with target_pose or arm target_qpos. - Use `control: "hand"` with gripper target_qpos. - Supported target_pose objects: - {"reference": "object", "obj_name": "", "offset": [x, y, z], "orientation": "current"} - {"reference": "absolute", "position": [x, y, z], "orientation": "current"} + {"reference": "object", "obj_name": "", "offset": [x, y, z]} + {"reference": "absolute", "position": [x, y, z]} {"reference": "relative", "offset": [dx, dy, dz], "frame": "world|eef"} - Supported target_qpos objects: {"source": "initial"} @@ -48,7 +48,6 @@ Rules: - Keep all values JSON primitives. - Each non-null action must contain exactly one of `target_object`, `target_pose`, or `target_qpos`. -- Preserve current orientation by setting `"orientation": "current"` for pose targets. 
- To keep a holding arm closed while the other arm moves, use: { "atomic_action_class": "MoveAction", diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py index 6bfabdfe..806ad2f0 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py @@ -16,9 +16,6 @@ from __future__ import annotations -import ast -from typing import List - from embodichain.utils.logger import log_error @@ -103,68 +100,3 @@ def get_arm_states(env, robot_name): select_arm_current_pose, select_arm_current_gripper_state, ) - - -def extract_drive_calls(code_str: str) -> List[str]: - """Extract all drive() function calls from a code string. - - Args: - code_str: Python code string to parse. - - Returns: - List of code blocks containing drive() calls. - """ - tree = ast.parse(code_str) - lines = code_str.splitlines() - - drive_blocks = [] - - for node in tree.body: - # Match: drive(...) 
- if ( - isinstance(node, ast.Expr) - and isinstance(node.value, ast.Call) - and isinstance(node.value.func, ast.Name) - and node.value.func.id == "drive" - ): - # AST line numbers are 1-based - start = node.lineno - 1 - end = node.end_lineno - block = "\n".join(lines[start:end]) - drive_blocks.append(block) - - return drive_blocks - - -def apply_offset_to_pose(pose, offset: list): - pose[0, 3] += offset[0] - pose[1, 3] += offset[1] - pose[2, 3] += offset[2] - return pose - - -def resolve_action(action, env, kwargs): - if callable(action): - return action(env=env, **kwargs) - return action - - -def sync_agent_state_from_robot(env) -> None: - """Synchronize cached agent arm states from the physical robot state.""" - action = env.robot.get_qpos().squeeze(0) - for side in ("left", "right"): - is_left = side == "left" - arm_joints = getattr(env, f"{side}_arm_joints", []) - eef_joints = getattr(env, f"{side}_eef_joints", []) - if arm_joints: - arm_qpos = action[arm_joints] - env.set_current_qpos_agent(arm_qpos, is_left=is_left) - env.set_current_xpos_agent( - env.get_arm_fk(qpos=arm_qpos, is_left=is_left), - is_left=is_left, - ) - if eef_joints: - env.set_current_gripper_state_agent( - action[eef_joints][0].unsqueeze(0), - is_left=is_left, - ) diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py index 6cea7566..68e0f601 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -63,6 +63,13 @@ SUPPORTED_ATOMIC_ACTION_CLASSES = {"PickUpAction", "MoveAction", "PlaceAction"} SUPPORTED_CONTROLS = {"arm", "hand"} TARGET_SPEC_FIELDS = ("target_object", "target_pose", "target_qpos") +ACTION_SPEC_FIELDS = { + "atomic_action_class", + "robot_name", + "control", + "cfg", + *TARGET_SPEC_FIELDS, +} SUPPORTED_POSE_REFERENCES = {"object", "absolute", "relative"} SUPPORTED_QPOS_SOURCES = {"initial", 
"gripper_state", "joint_delta"} SUPPORTED_CFG_KEYS = { @@ -143,6 +150,12 @@ def normalize_atomic_action_spec(spec: Mapping[str, Any]) -> dict[str, Any]: "Legacy target.kind schema is not supported. Use exactly one of " "target_object, target_pose, or target_qpos." ) + unknown_fields = set(spec) - ACTION_SPEC_FIELDS + if unknown_fields: + raise ValueError( + f"Unsupported atomic action spec fields: " + f"{', '.join(sorted(unknown_fields))}." + ) atomic_action_class = spec.get("atomic_action_class") if atomic_action_class not in SUPPORTED_ATOMIC_ACTION_CLASSES: @@ -230,6 +243,11 @@ def _normalize_action_target( def _validate_target_object(target_object: Mapping[str, Any]) -> None: + unknown_fields = set(target_object) - {"obj_name", "affordance"} + if unknown_fields: + raise ValueError( + f"Unsupported target_object fields: {', '.join(sorted(unknown_fields))}." + ) obj_name = target_object.get("obj_name") if not isinstance(obj_name, str) or not obj_name: raise ValueError("target_object requires non-empty obj_name.") @@ -246,6 +264,12 @@ def _validate_target_pose(target_pose: Mapping[str, Any]) -> None: ) if reference == "object": + _validate_target_fields( + target_pose, + {"reference", "obj_name", "offset", "orientation"}, + "target_pose", + ) + _validate_current_orientation(target_pose) obj_name = target_pose.get("obj_name") if not isinstance(obj_name, str) or not obj_name: raise ValueError("object target_pose requires non-empty obj_name.") @@ -253,6 +277,12 @@ def _validate_target_pose(target_pose: Mapping[str, Any]) -> None: return if reference == "absolute": + _validate_target_fields( + target_pose, + {"reference", "position", "orientation"}, + "target_pose", + ) + _validate_current_orientation(target_pose) position = target_pose.get("position") if not isinstance(position, list) or len(position) != 3: raise ValueError( @@ -260,6 +290,11 @@ def _validate_target_pose(target_pose: Mapping[str, Any]) -> None: ) return + _validate_target_fields( + target_pose, + 
{"reference", "offset", "frame"}, + "target_pose", + ) _xyz(target_pose.get("offset", [0.0, 0.0, 0.0]), "offset") frame = target_pose.get("frame", "world") if frame not in {"world", "eef"}: @@ -278,11 +313,13 @@ def _validate_target_qpos( ) if source == "initial": + _validate_target_fields(target_qpos, {"source"}, "target_qpos") if control != "arm": raise ValueError("initial target_qpos requires control='arm'.") return if source == "gripper_state": + _validate_target_fields(target_qpos, {"source", "state"}, "target_qpos") if control != "hand": raise ValueError("gripper_state target_qpos requires control='hand'.") state = target_qpos.get("state") @@ -292,6 +329,11 @@ def _validate_target_qpos( ) return + _validate_target_fields( + target_qpos, + {"source", "joint_index", "delta_degrees"}, + "target_qpos", + ) if control != "arm": raise ValueError("joint_delta target_qpos requires control='arm'.") if "joint_index" not in target_qpos: @@ -300,6 +342,24 @@ def _validate_target_qpos( float(target_qpos.get("delta_degrees", 0.0)) +def _validate_target_fields( + target_spec: Mapping[str, Any], + allowed_fields: set[str], + target_name: str, +) -> None: + unknown_fields = set(target_spec) - allowed_fields + if unknown_fields: + raise ValueError( + f"Unsupported {target_name} fields: {', '.join(sorted(unknown_fields))}." 
+ ) + + +def _validate_current_orientation(target_pose: Mapping[str, Any]) -> None: + orientation = target_pose.get("orientation") + if orientation is not None and orientation != "current": + raise ValueError("target_pose orientation only supports 'current'.") + + def execute_atomic_action( action_spec: Mapping[str, Any] | AtomicActionSpec, *, @@ -530,9 +590,10 @@ def _interpolate_qpos_trajectory( dtype=start_qpos.dtype, device=start_qpos.device, ).reshape(1, sample_interval, 1) - return start_qpos.unsqueeze(1) + ( - target_qpos.unsqueeze(1) - start_qpos.unsqueeze(1) - ) * weights + return ( + start_qpos.unsqueeze(1) + + (target_qpos.unsqueeze(1) - start_qpos.unsqueeze(1)) * weights + ) def _select_arm_parts(env, robot_name: str): @@ -563,6 +624,24 @@ def _build_action_cfg_and_start(env, spec: AtomicActionSpec): is_left, arm_part, hand_part, arm_joints, eef_joints = _select_arm_parts( env, spec.robot_name ) + cfg = _build_action_cfg(env, spec, arm_part, hand_part, len(eef_joints)) + start_qpos = _resolve_action_start_qpos( + env, + spec, + is_left=is_left, + arm_joints=arm_joints, + eef_joints=eef_joints, + ) + return cfg, start_qpos + + +def _build_action_cfg( + env, + spec: AtomicActionSpec, + arm_part: str, + hand_part: str, + hand_dof: int, +): cfg_values = dict(spec.cfg) cfg_values.pop("post_hold_steps", None) device = env.robot.device @@ -570,46 +649,48 @@ def _build_action_cfg_and_start(env, spec: AtomicActionSpec): if spec.atomic_action_class == "PickUpAction": if spec.control != "arm": raise ValueError("PickUpAction atomic action requires control='arm'.") - hand_dof = len(eef_joints) - cfg = PickUpActionCfg( + return PickUpActionCfg( control_part=arm_part, hand_control_part=hand_part, hand_open_qpos=_state_to_hand_qpos(env.open_state, hand_dof, device), hand_close_qpos=_state_to_hand_qpos(env.close_state, hand_dof, device), **_cfg_supported_kwargs(PickUpActionCfg, cfg_values), ) - return cfg, _current_arm_qpos(env, is_left, arm_joints) if 
spec.atomic_action_class == "PlaceAction": if spec.control != "arm": raise ValueError("PlaceAction atomic action requires control='arm'.") - cfg = PlaceActionCfg( + return PlaceActionCfg( control_part=arm_part, hand_control_part=hand_part, - hand_open_qpos=_state_to_hand_qpos(env.open_state, len(eef_joints), device), - hand_close_qpos=_state_to_hand_qpos( - env.close_state, len(eef_joints), device - ), + hand_open_qpos=_state_to_hand_qpos(env.open_state, hand_dof, device), + hand_close_qpos=_state_to_hand_qpos(env.close_state, hand_dof, device), **_cfg_supported_kwargs(PlaceActionCfg, cfg_values), ) - return cfg, _current_arm_qpos(env, is_left, arm_joints) control_part = arm_part if spec.control == "arm" else hand_part - cfg = MoveActionCfg( + return MoveActionCfg( control_part=control_part, **_cfg_supported_kwargs(MoveActionCfg, cfg_values), ) + + +def _resolve_action_start_qpos( + env, + spec: AtomicActionSpec, + *, + is_left: bool, + arm_joints: list[int], + eef_joints: list[int], +): if spec.control == "hand": _, _, _, _, current_gripper_state = get_arm_states(env, spec.robot_name) - return ( - cfg, - _state_to_hand_qpos( - current_gripper_state, - len(eef_joints), - device, - ).reshape(1, len(eef_joints)), - ) - return cfg, _current_arm_qpos(env, is_left, arm_joints) + return _state_to_hand_qpos( + current_gripper_state, + len(eef_joints), + env.robot.device, + ).reshape(1, len(eef_joints)) + return _current_arm_qpos(env, is_left, arm_joints) def _resolve_target(env, spec: AtomicActionSpec, runtime_kwargs: dict[str, Any]): @@ -867,9 +948,10 @@ def _stabilize_affordance_object( return update_steps = int(runtime_kwargs.get("affordance_stabilization_steps", 5)) - if update_steps > 0: + if update_steps > 0 and hasattr(env.sim, "update"): env.sim.update(step=update_steps) - target_obj.clear_dynamics() + if hasattr(target_obj, "clear_dynamics"): + target_obj.clear_dynamics() def _trajectory_to_agent_action(env, robot_name, trajectory, joint_ids): diff --git 
a/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py index f522c5ab..47c27e83 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py @@ -38,6 +38,7 @@ "recovery_branches", "recoveries", } +_EDGE_KEYS = {"id", "source", "target", "left_arm_action", "right_arm_action"} def load_agent_graph_bundle(path: str | Path) -> dict[str, Any]: @@ -149,6 +150,12 @@ def _validate_task_spec(task_spec: Mapping[str, Any]) -> None: edge_specs = list(task_spec.get("edges", [])) edge_ids = set() for edge in edge_specs: + unknown_edge_keys = set(edge) - _EDGE_KEYS + if unknown_edge_keys: + raise ValueError( + f"Nominal edge '{edge.get('id', '')}' contains unsupported " + f"fields: {', '.join(sorted(unknown_edge_keys))}." + ) edge_id = edge["id"] if edge_id in edge_ids: raise ValueError(f"Duplicate graph edge id '{edge_id}'.") diff --git a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py index 5ae0f5b3..9f1661a2 100644 --- a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py +++ b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py @@ -178,6 +178,20 @@ def test_normalize_atomic_action_spec_rejects_legacy_target_kind_schema() -> Non ) +def test_normalize_atomic_action_spec_rejects_unknown_fields() -> None: + with pytest.raises(ValueError, match="Unsupported atomic action spec fields"): + normalize_atomic_action_spec( + { + "atomic_action_class": "MoveAction", + "robot_name": "left_arm", + "control": "arm", + "target_qpos": {"source": "initial"}, + "cfg": {}, + "description": "return home", + } + ) + + def test_normalize_atomic_action_spec_rejects_multiple_target_fields() -> None: with pytest.raises(ValueError, match="exactly one of target_object"): normalize_atomic_action_spec( diff --git 
a/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py index 5ae7d1e9..2dcc10d1 100644 --- a/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py +++ b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py @@ -110,3 +110,15 @@ def test_compile_agent_graph_rejects_legacy_action_schema() -> None: graph_cls=_FakeGraph, monitor_module={}, ) + + +def test_compile_agent_graph_rejects_extra_edge_fields() -> None: + task_graph = _task_graph(_pick_up_spec("left_arm", "apple")) + task_graph["edges"][0]["monitor"] = {"condition": "object visible"} + + with pytest.raises(ValueError, match="unsupported fields: monitor"): + compile_agent_graph_spec( + task_graph, + graph_cls=_FakeGraph, + monitor_module={}, + ) From 93ea3eab71b32d151c91baa85f7d6ef53bab7557 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Tue, 16 Jun 2026 19:41:38 +0800 Subject: [PATCH 13/33] fix front and back --- .../generation/prompt_builders.py | 12 ++--- .../generation/ur5_basket_config.py | 6 +-- .../prompts/basic_background.txt | 10 ++-- .../test_ur5_basket_config_generation.py | 49 +++++++++++++++---- 4 files changed, 53 insertions(+), 24 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 91104691..5aa519a3 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -37,12 +37,12 @@ _RELATIVE_COORDINATE_CONVENTION = """Coordinate convention for relative placement: - `left_of` means negative world y relative to the reference object. - `right_of` means positive world y relative to the reference object. -- `front_of` means positive world x relative to the reference object. 
-- `behind` means negative world x relative to the reference object. -- `front_left_of` combines positive world x and negative world y. -- `back_left_of` combines negative world x and negative world y. -- `front_right_of` combines positive world x and positive world y. -- `back_right_of` combines negative world x and positive world y. +- `front_of` means negative world x relative to the reference object. +- `behind` means positive world x relative to the reference object. +- `front_left_of` combines negative world x and negative world y. +- `back_left_of` combines positive world x and negative world y. +- `front_right_of` combines negative world x and positive world y. +- `back_right_of` combines positive world x and positive world y. - `inside` and `on` use the reference object's xy center.""" diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py index 2d93c0f8..bc7b3395 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py @@ -180,7 +180,7 @@ _DUAL_UR5_ROTATED_INIT_X = 2.0 _DUAL_UR5_ROTATED_INIT_YAW_DEGREES = -90.0 _ROBOT_VIEW_LEFT_WORLD_Y_SIGN = -1.0 -_ROBOT_VIEW_FRONT_WORLD_X_SIGN = 1.0 +_ROBOT_VIEW_FRONT_WORLD_X_SIGN = -1.0 _BACKGROUND_MAX_CONVEX_HULL_NUM = 1 _TARGET_MAX_CONVEX_HULL_NUM = 16 _CONTAINER_MAX_CONVEX_HULL_NUM = 8 @@ -1258,8 +1258,8 @@ def _call_relative_task_llm( "instructions such as 右臂, 右机械臂, right arm, or right UR5; use " "arm='auto' when the task does not specify an arm.\n" "- For Chinese/English left/right/front/back, use the relation enums " - "from the rotated robot-view perspective. front_of means positive " - "world-x; behind means negative world-x; left_of means negative " + "from the rotated robot-view perspective. 
front_of means negative " + "world-x; behind means positive world-x; left_of means negative " "world-y; right_of means positive world-y. Diagonal relations combine " "both axes: front_left_of, back_left_of, front_right_of, back_right_of.\n" "- If the task says to release an object above a basket/container so it " diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/basic_background.txt b/embodichain/gen_sim/action_agent_pipeline/prompts/basic_background.txt index 3a84455e..65088dd0 100644 --- a/embodichain/gen_sim/action_agent_pipeline/prompts/basic_background.txt +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/basic_background.txt @@ -7,14 +7,14 @@ ROBOT BASE COORDINATE DEFINITIONS All directions below are defined strictly in the robot base frame: -* Moving forward increases x -* Moving backward decreases x -* Moving left increases y -* Moving right decreases y +* Moving forward decreases x +* Moving backward increases x +* Moving left decreases y +* Moving right increases y * Moving up increases z * Moving down decreases z ROBOT INITIALIZATION AND TERMINATION Both robot arms start in predefined initial configurations with their end-effectors open. -At task completion, both arms must be returned to their initial poses. \ No newline at end of file +At task completion, both arms must be returned to their initial poses. 
diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index fc5651fa..0ac82cf0 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -72,7 +72,17 @@ def test_ur5_basket_generator_uses_parallel_handoff( _assert_normalized_obj_path(rigid_objects["right_apple"]["shape"]["fpath"]) _assert_normalized_obj_path(background_objects["table"]["shape"]["fpath"]) _assert_normalized_obj_path(background_objects["wicker_basket"]["shape"]["fpath"]) - assert gym_config["robot"]["init_pos"] == [2.0, 0.0, 0.5] + table_top_z = ur5_basket_config_generation._mesh_config_world_zmax( + background_objects["table"] + ) + expected_robot_init_z = ( + table_top_z + + ur5_basket_config_generation._DUAL_UR5_TABLETOP_CLEARANCE + - ur5_basket_config_generation._DUAL_UR5_ARM_COMPONENT_Z + ) + assert gym_config["robot"]["init_pos"] == pytest.approx( + [2.0, 0.0, expected_robot_init_z] + ) assert gym_config["robot"]["init_rot"] == [0.0, 0.0, -90.0] extensions = gym_config["env"]["extensions"] assert extensions["agent_arm_slots"]["left"] == { @@ -587,13 +597,13 @@ def fake_call_relative_task_llm(**kwargs): for term in success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("x", 0.16) in axis_terms + assert ("x", -0.16) in axis_terms assert ("y", 0.0) in axis_terms task_prompt = paths.task_prompt.read_text(encoding="utf-8") atom_actions = paths.atom_actions.read_text(encoding="utf-8") - assert '"offset":[0.16,0.0,0.22]' in task_prompt - assert '"offset":[0.16,0.0,0.22]' in atom_actions + assert '"offset":[-0.16,0.0,0.22]' in task_prompt + assert '"offset":[-0.16,0.0,0.22]' in atom_actions assert _stable_summary(paths.summary) == { "mode": "relative_placement", @@ -601,7 +611,7 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "apple_2", "relation": 
"front_of", "active_arm": "right_arm", - "release_offset": [0.16, 0.0, 0.12], + "release_offset": [-0.16, 0.0, 0.12], } @@ -667,7 +677,7 @@ def fake_call_relative_task_llm(**kwargs): rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} assert set(rigid_objects) == {"chip_bag"} initial_position = rigid_objects["chip_bag"]["init_pos"] - expected_x = round(initial_position[0] + 0.16, 6) + expected_x = round(initial_position[0] - 0.16, 6) expected_y = round(initial_position[1] - 0.16, 6) success = gym_config["env"]["extensions"]["agent_success"] @@ -692,7 +702,7 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "chip_bag", "relation": "front_left_of", "active_arm": "left_arm", - "release_offset": [0.16, -0.16, 0.12], + "release_offset": [-0.16, -0.16, 0.12], } @@ -737,12 +747,31 @@ def fake_call_relative_task_llm(**kwargs): for term in success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("x", 0.16) in axis_terms + assert ("x", -0.16) in axis_terms assert ("y", 0.16) in axis_terms task_prompt = paths.task_prompt.read_text(encoding="utf-8") - assert '"offset":[0.16,0.16,0.12]' in task_prompt - assert _stable_summary(paths.summary)["release_offset"] == [0.16, 0.16, 0.12] + assert '"offset":[-0.16,0.16,0.12]' in task_prompt + assert _stable_summary(paths.summary)["release_offset"] == [-0.16, 0.16, 0.12] + + +def test_side_relation_offsets_use_robot_view_front_back_convention() -> None: + assert ur5_basket_config_generation._side_relation_xy_offsets("front_of") == ( + -0.16, + 0.0, + ) + assert ur5_basket_config_generation._side_relation_xy_offsets("behind") == ( + 0.16, + 0.0, + ) + assert ur5_basket_config_generation._side_relation_xy_offsets("front_left_of") == ( + -0.16, + -0.16, + ) + assert ur5_basket_config_generation._side_relation_xy_offsets("back_right_of") == ( + 0.16, + 0.16, + ) @pytest.mark.parametrize( From 7807d6705ef01b72d40e6d97c1de625e675dc6c5 Mon Sep 17 00:00:00 2001 From: skywhite1024 
<129768272+skywhite1024@users.noreply.github.com> Date: Tue, 16 Jun 2026 21:19:10 +0800 Subject: [PATCH 14/33] fix lower -> open hand -> retreat --- .../generation/prompt_builders.py | 409 +++++++++--------- .../generation/ur5_basket_config.py | 14 +- .../prompts/atom_actions.txt | 7 +- .../test_ur5_basket_config_generation.py | 33 +- 4 files changed, 226 insertions(+), 237 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 5aa519a3..2050b9a9 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -34,6 +34,7 @@ _BASKET_LEFT_RELEASE_OFFSET_Y = -0.04 _BASKET_RIGHT_RELEASE_OFFSET_Y = 0.04 +_PLACE_LIFT_HEIGHT = 0.10 _RELATIVE_COORDINATE_CONVENTION = """Coordinate convention for relative placement: - `left_of` means negative world y relative to the reference object. - `right_of` means positive world y relative to the reference object. @@ -127,22 +128,11 @@ def make_relative_task_prompt( pose_kind="high", sample_interval=45, ) - release_spec = _format_relative_pose_spec( + place_spec = _format_relative_place_spec( active_arm, spec, - pose_kind="release", - sample_interval=30, - ) - open_spec = _format_gripper_spec( - active_arm, - "open", - sample_interval=15, - post_hold_steps=25, - ) - retreat_spec = _format_pose_offset_spec( - active_arm, - (0.0, 0.0, 0.14), - sample_interval=20, + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, ) initial_spec = _format_initial_qpos_spec(active_arm, sample_interval=30) reference_line = _relative_reference_line(spec) @@ -170,9 +160,11 @@ def make_relative_task_prompt( {_RELATIVE_COORDINATE_CONVENTION} -Generate one deterministic nominal graph with exactly 6 nominal edges. Use only +Generate one deterministic nominal graph with exactly 4 nominal edges. 
Use only the atomic action class JSON specs shown below. Do not add recovery, monitor, search, -alignment, or extra lift edges. The inactive arm must remain null in every edge. +alignment, or extra lift edges. Use `PlaceAction` for the release-place step so +lowering, gripper opening, and upward retreat remain one atomic action. The +inactive arm must remain null in every edge. 1. Pick up the moved object: - {active_slot}: {pick_spec} @@ -182,19 +174,11 @@ def make_relative_task_prompt( - {active_slot}: {high_spec} - {inactive_slot}: null -3. Lower the held object to the {release_step_label} pose: - - {active_slot}: {release_spec} - - {inactive_slot}: null - -4. Release the moved object: - - {active_slot}: {open_spec} +3. Place the held object at the {release_step_label} pose: + - {active_slot}: {place_spec} - {inactive_slot}: null -5. Move the empty gripper upward to clear the object: - - {active_slot}: {retreat_spec} - - {inactive_slot}: null - -6. Return the active arm to its initial pose: +4. 
Return the active arm to its initial pose: - {active_slot}: {initial_spec} - {inactive_slot}: null @@ -223,35 +207,23 @@ def _make_dual_relative_task_prompt( pose_kind="high", sample_interval=45, ) - first_release_spec = _format_relative_pose_spec( - first_arm, - first, - pose_kind="release", - sample_interval=30, - ) second_high_spec = _format_relative_pose_spec( second_arm, second, pose_kind="high", sample_interval=45, ) - second_release_spec = _format_relative_pose_spec( - second_arm, - second, - pose_kind="release", - sample_interval=30, - ) - first_open_spec = _format_gripper_spec( + first_place_spec = _format_relative_place_spec( first_arm, - "open", - sample_interval=15, - post_hold_steps=25, + first, + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, ) - second_open_spec = _format_gripper_spec( + second_place_spec = _format_relative_place_spec( second_arm, - "open", - sample_interval=15, - post_hold_steps=25, + second, + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, ) first_close_spec = _format_gripper_spec( first_arm, @@ -263,16 +235,6 @@ def _make_dual_relative_task_prompt( "close", sample_interval=10, ) - first_retreat_spec = _format_pose_offset_spec( - first_arm, - (0.0, 0.0, 0.14), - sample_interval=20, - ) - second_retreat_spec = _format_pose_offset_spec( - second_arm, - (0.0, 0.0, 0.14), - sample_interval=20, - ) first_initial_spec = _format_initial_qpos_spec( first_arm, sample_interval=30, @@ -308,9 +270,10 @@ def _make_dual_relative_task_prompt( {_RELATIVE_COORDINATE_CONVENTION} -Generate one deterministic nominal graph with exactly 10 nominal edges. Use only +Generate one deterministic nominal graph with exactly 6 nominal edges. Use only the atomic action class JSON specs shown below. Do not add recovery, monitor, search, -alignment, or extra lift edges. +alignment, or extra lift edges. Use `PlaceAction` for each release-place step so +lowering, gripper opening, and upward retreat remain one atomic action. 1. 
Pick up both moved objects simultaneously: - {first_slot}: {first_pick_spec} @@ -321,36 +284,20 @@ def _make_dual_relative_task_prompt( - {first_slot}: {first_high_spec} - {second_slot}: {second_close_spec} -3. Lower `{first.moved_runtime_uid}` to the release pose: - - {first_slot}: {first_release_spec} +3. Place `{first.moved_runtime_uid}` at the release pose: + - {first_slot}: {first_place_spec} - {second_slot}: {second_close_spec} -4. Release `{first.moved_runtime_uid}`: - - {first_slot}: {first_open_spec} - - {second_slot}: {second_close_spec} - -5. Move the empty `{first_arm}` gripper upward to clear the workspace: - - {first_slot}: {first_retreat_spec} - - {second_slot}: {second_close_spec} - -6. Return `{first_arm}` to its initial pose while moving `{second.moved_runtime_uid}` +4. Return `{first_arm}` to its initial pose while moving `{second.moved_runtime_uid}` to the high staging pose: - {first_slot}: {first_initial_spec} - {second_slot}: {second_high_spec} -7. Lower `{second.moved_runtime_uid}` to the release pose: - - {first_slot}: null - - {second_slot}: {second_release_spec} - -8. Release `{second.moved_runtime_uid}`: - - {first_slot}: null - - {second_slot}: {second_open_spec} - -9. Move the empty `{second_arm}` gripper upward to clear the workspace: +5. Place `{second.moved_runtime_uid}` at the release pose: - {first_slot}: null - - {second_slot}: {second_retreat_spec} + - {second_slot}: {second_place_spec} -10. Return `{second_arm}` to its initial pose: +6. Return `{second_arm}` to its initial pose: - {first_slot}: null - {second_slot}: {second_initial_spec} @@ -396,8 +343,8 @@ def make_relative_basic_background( {notes} The execution-stage LLM should generate graph JSON that grasps the moved object, -moves it to the configured high staging pose, lowers to the release pose, opens -the gripper, retreats upward, and returns the active arm to its initial pose. 
+moves it to the configured high staging pose, places it at the release pose with +one `PlaceAction`, and returns the active arm to its initial pose. """ @@ -432,9 +379,10 @@ def _make_dual_relative_basic_background( {notes} The execution-stage LLM should generate graph JSON that grasps both moved -objects, places the first moved object, retreats the first arm, then places the -second moved object while the first arm returns to its initial pose. Each arm -must release its moved object before returning to its initial pose. +objects, stages and places the first moved object with one `PlaceAction`, then +stages and places the second moved object while the first arm returns to its +initial pose. Each arm must release its moved object before returning to its +initial pose. """ @@ -450,11 +398,11 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: pose_kind="high", sample_interval=45, ) - release_spec = _format_relative_pose_spec( + place_spec = _format_relative_place_spec( active_arm, spec, - pose_kind="release", - sample_interval=30, + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, ) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Relative Placement @@ -467,12 +415,8 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: {_format_pick_up_spec(active_arm, spec.moved_runtime_uid)} - {_relative_pose_step_label(spec, "High staging")}: {high_spec} -- {_relative_pose_step_label(spec, "Release pose")}: - {release_spec} -- Release the held object: - {_format_gripper_spec(active_arm, "open", sample_interval=15, post_hold_steps=25)} -- Retreat upward: - {_format_pose_offset_spec(active_arm, (0.0, 0.0, 0.14), sample_interval=20)} +- Place at the release pose: + {place_spec} - Return to initial qpos: {_format_initial_qpos_spec(active_arm, sample_interval=30)} """ @@ -488,23 +432,23 @@ def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: pose_kind="high", sample_interval=45, ) - first_release_spec = 
_format_relative_pose_spec( - first_arm, - first, - pose_kind="release", - sample_interval=30, - ) second_high_spec = _format_relative_pose_spec( second_arm, second, pose_kind="high", sample_interval=45, ) - second_release_spec = _format_relative_pose_spec( + first_place_spec = _format_relative_place_spec( + first_arm, + first, + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, + ) + second_place_spec = _format_relative_place_spec( second_arm, second, - pose_kind="release", - sample_interval=30, + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, ) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Dual-Arm Relative Placement @@ -520,18 +464,14 @@ def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: {_format_pick_up_spec(second_arm, second.moved_runtime_uid)} - First high staging: {first_high_spec} -- First release pose: - {first_release_spec} +- First place action: + {first_place_spec} - Second high staging: {second_high_spec} -- Second release pose: - {second_release_spec} -- Release an object: - {_format_gripper_spec("", "open", sample_interval=15, post_hold_steps=25)} +- Second place action: + {second_place_spec} - Keep a holding arm closed: {_format_gripper_spec("", "close", sample_interval=10)} -- Retreat upward: - {_format_pose_offset_spec("", (0.0, 0.0, 0.14), sample_interval=20)} - Return to initial qpos: {_format_initial_qpos_spec("", sample_interval=30)} """ @@ -560,51 +500,31 @@ def make_basket_task_prompt( (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.22), sample_interval=45, ) - left_release_spec = _format_pose_object_spec( - "left_arm", - roles.container_runtime_uid, - (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.12), - sample_interval=30, - ) right_high_spec = _format_pose_object_spec( "right_arm", roles.container_runtime_uid, (0.0, _BASKET_RIGHT_RELEASE_OFFSET_Y, 0.22), sample_interval=45, ) - right_release_spec = _format_pose_object_spec( - "right_arm", - roles.container_runtime_uid, - (0.0, 
_BASKET_RIGHT_RELEASE_OFFSET_Y, 0.12), - sample_interval=30, - ) - left_open_spec = _format_gripper_spec( + left_place_spec = _format_place_object_spec( "left_arm", - "open", - sample_interval=15, - post_hold_steps=25, + roles.container_runtime_uid, + (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.12), + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, ) - right_open_spec = _format_gripper_spec( + right_place_spec = _format_place_object_spec( "right_arm", - "open", - sample_interval=15, - post_hold_steps=25, + roles.container_runtime_uid, + (0.0, _BASKET_RIGHT_RELEASE_OFFSET_Y, 0.12), + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, ) right_close_spec = _format_gripper_spec( "right_arm", "close", sample_interval=10, ) - left_retreat_spec = _format_pose_offset_spec( - "left_arm", - (0.0, 0.0, 0.14), - sample_interval=20, - ) - right_retreat_spec = _format_pose_offset_spec( - "right_arm", - (0.0, 0.0, 0.14), - sample_interval=20, - ) left_initial_spec = _format_initial_qpos_spec( "left_arm", sample_interval=30, @@ -633,23 +553,23 @@ def make_basket_task_prompt( - Both target objects must be released into `{roles.container_runtime_uid}`. Generate one deterministic nominal graph with the following semantic sequence. -Do not add extra alignment, search, recovery, or monitor steps. Do include the -specified post-release retreat and return-to-initial steps. The left arm must -finish its upward retreat before the right arm enters the shared container -workspace, but the left return-to-initial action and the right high-staging -action must execute simultaneously in one graph edge. Generate exactly 10 +Do not add extra alignment, search, recovery, or monitor steps. Use `PlaceAction` +for each release-place step so lowering, gripper opening, and upward retreat +remain one atomic action. 
The left arm must finish its `PlaceAction` retreat +before the right arm enters the shared container workspace, but the left +return-to-initial action and the right high-staging action must execute +simultaneously in one graph edge. Generate exactly 6 nominal edges, one edge for each numbered step below. Do not split the simultaneous grasp or the simultaneous left-return/right-staging action into -separate edges. Do not merge, reorder, or omit the lower-to-release, -open-gripper, upward-retreat, or final right return-to-initial edges. +separate edges. Do not split a `PlaceAction` into separate lower-to-release, +open-gripper, or upward-retreat edges. A target object is not considered placed when it is only above the {roles.container_runtime_uid}. For each arm, the placement order must be: move -to a high staging pose above the container, lower to the release pose inside the -container, use `target_qpos` with source `gripper_state` and state `open`, -move the empty gripper upward, then return the arm to its initial pose. Never -use `target_qpos` source `initial` for an arm that has not already released its -held target object. +to a high staging pose above the container, then execute one `PlaceAction` at +the release pose inside the container, then return the arm to its initial pose. +Never use `target_qpos` source `initial` for an arm that has not already +released its held target object. 1. Pick up both target objects simultaneously: - left_arm_action: {left_pick_spec} @@ -660,40 +580,24 @@ def make_basket_task_prompt( - left_arm_action: {left_high_spec} - right_arm_action: {right_close_spec} -3. Lower the held left target object to the left release pose inside the +3. Place the held left target object at the left release pose inside the {roles.container_runtime_uid}: - - left_arm_action: {left_release_spec} + - left_arm_action: {left_place_spec} - right_arm_action: {right_close_spec} -4. 
Release the left target object into the {roles.container_runtime_uid}: - - left_arm_action: {left_open_spec} - - right_arm_action: {right_close_spec} - -5. Move the empty left gripper upward to clear the container: - - left_arm_action: {left_retreat_spec} - - right_arm_action: {right_close_spec} - -6. After the left gripper has retreated upward, return the left UR5 to its +4. After the left gripper has retreated upward, return the left UR5 to its initial pose while simultaneously moving the held right target object directly above the right half of the {roles.container_runtime_uid}. This parallel handoff must remain one graph edge: - left_arm_action: {left_initial_spec} - right_arm_action: {right_high_spec} -7. Lower the held right target object to the right release pose inside the +5. Place the held right target object at the right release pose inside the {roles.container_runtime_uid}: - left_arm_action: null - - right_arm_action: {right_release_spec} - -8. Release the right target object into the {roles.container_runtime_uid}: - - left_arm_action: null - - right_arm_action: {right_open_spec} - -9. Move the empty right gripper upward to clear the container: - - left_arm_action: null - - right_arm_action: {right_retreat_spec} + - right_arm_action: {right_place_spec} -10. Return the right UR5 to its initial pose after releasing the target object: +6. Return the right UR5 to its initial pose after releasing the target object: - left_arm_action: null - right_arm_action: {right_initial_spec} @@ -740,27 +644,26 @@ def make_basket_basic_background( grasp {roles.left_target_runtime_uid} while the right UR5 grasps {roles.right_target_runtime_uid} in the same graph edge. After both {target_plural} are held, the left UR5 places -{roles.left_target_runtime_uid} into {roles.container_runtime_uid}, releases -it, and retreats upward. 
The next graph edge is a parallel handoff: the left -UR5 returns to its initial pose while the right UR5 simultaneously moves its +{roles.left_target_runtime_uid} into {roles.container_runtime_uid} with one +`PlaceAction`. The next graph edge is a parallel handoff: the left UR5 returns +to its initial pose while the right UR5 simultaneously moves its already-grasped {roles.right_target_runtime_uid} to the high staging pose above -{roles.container_runtime_uid}. The right UR5 then lowers and releases -{roles.right_target_runtime_uid}, retreats upward, and returns to its initial -pose. To change the insertion order later, edit the task prompt sequence and -keep the same atomic action API. - -The {roles.container_runtime_uid} area is a shared workspace. After a UR5 -releases a target object, it should retreat upward before the other UR5 moves -to the container, otherwise the two arms may collide near the container. The -right UR5 should keep holding {roles.right_target_runtime_uid} while the left -UR5 performs its placement and upward retreat. Once that retreat is complete, -the right UR5 may move toward the container while the left UR5 simultaneously -returns to its initial pose; it must not wait for the left return-to-initial -motion to finish. +{roles.container_runtime_uid}. The right UR5 then places +{roles.right_target_runtime_uid} with one `PlaceAction` and returns to its +initial pose. To change the insertion order later, edit the task prompt sequence +and keep the same atomic action API. + +The {roles.container_runtime_uid} area is a shared workspace. A UR5 should +complete its `PlaceAction` retreat before the other UR5 moves to the container, +otherwise the two arms may collide near the container. The right UR5 should keep +holding {roles.right_target_runtime_uid} while the left UR5 performs its +placement. 
Once that `PlaceAction` is complete, the right UR5 may move toward +the container while the left UR5 simultaneously returns to its initial pose; it +must not wait for the left return-to-initial motion to finish. A target object at a high pose above `{roles.container_runtime_uid}` is only -staged, not placed. Each arm must lower the held object into the container -release pose and open the gripper before any return-to-initial motion. +staged, not placed. Each arm must execute a `PlaceAction` at the container +release pose before any return-to-initial motion. Always plan to the current `{roles.container_runtime_uid}` object pose from the environment config. Do not hard-code container coordinates in generated graph @@ -775,23 +678,25 @@ def make_basket_atom_actions_prompt(roles: _BasketRolesLike) -> str: (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.22), sample_interval=45, ) - left_release_spec = _format_pose_object_spec( - "left_arm", - roles.container_runtime_uid, - (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.12), - sample_interval=30, - ) right_high_spec = _format_pose_object_spec( "right_arm", roles.container_runtime_uid, (0.0, _BASKET_RIGHT_RELEASE_OFFSET_Y, 0.22), sample_interval=45, ) - right_release_spec = _format_pose_object_spec( + left_place_spec = _format_place_object_spec( + "left_arm", + roles.container_runtime_uid, + (0.0, _BASKET_LEFT_RELEASE_OFFSET_Y, 0.12), + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, + ) + right_place_spec = _format_place_object_spec( "right_arm", roles.container_runtime_uid, (0.0, _BASKET_RIGHT_RELEASE_OFFSET_Y, 0.12), - sample_interval=30, + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, ) return f"""### Atomic Action Class JSON Specs for UR5BreadBasket Dual-UR5 Placement @@ -806,7 +711,7 @@ def make_basket_atom_actions_prompt(roles: _BasketRolesLike) -> str: - While the left arm places its target, keep the right hand closed with a `target_qpos` whose source is `gripper_state` and state is `close`. 
- After the left arm releases `{roles.left_target_runtime_uid}`, first move it - upward to clear the container. + upward to clear the container as part of the same `PlaceAction`. - The next nominal edge must pair the left arm's initial `target_qpos` move with the right arm's object-referenced `target_pose` high-staging move. Do not split this parallel handoff into separate edges. @@ -821,18 +726,14 @@ def make_basket_atom_actions_prompt(roles: _BasketRolesLike) -> str: {_format_pick_up_spec("right_arm", roles.right_target_runtime_uid)} - Left high staging: {left_high_spec} -- Left release pose: - {left_release_spec} +- Left place action: + {left_place_spec} - Right high staging: {right_high_spec} -- Right release pose: - {right_release_spec} -- Release an object: - {_format_gripper_spec("", "open", sample_interval=15, post_hold_steps=25)} +- Right place action: + {right_place_spec} - Keep a holding arm closed: {_format_gripper_spec("", "close", sample_interval=10)} -- Retreat upward: - {_format_pose_offset_spec("", (0.0, 0.0, 0.14), sample_interval=20)} - Return to initial qpos: {_format_initial_qpos_spec("", sample_interval=30)} """ @@ -884,6 +785,27 @@ def _format_pose_object_spec( ) +def _format_place_object_spec( + robot_name: str, + obj_name: str, + offset: tuple[float, float, float] | list[float], + *, + sample_interval: int, + lift_height: float, +) -> str: + x, y, z = offset + return _format_place_spec( + robot_name, + { + "reference": "object", + "obj_name": obj_name, + "offset": [float(x), float(y), float(z)], + }, + sample_interval=sample_interval, + lift_height=lift_height, + ) + + def _format_relative_pose_spec( robot_name: str, placement: _RelativePlacementLike, @@ -916,6 +838,32 @@ def _format_relative_pose_spec( ) +def _format_relative_place_spec( + robot_name: str, + placement: _RelativePlacementLike, + *, + sample_interval: int, + lift_height: float, +) -> str: + if getattr(placement, "reference_is_initial_pose", False): + if 
placement.release_position is None: + raise ValueError("Self-relative placement requires release position.") + return _format_place_absolute_spec( + robot_name, + placement.release_position, + sample_interval=sample_interval, + lift_height=lift_height, + ) + + return _format_place_object_spec( + robot_name, + placement.reference_runtime_uid, + placement.release_offset, + sample_interval=sample_interval, + lift_height=lift_height, + ) + + def _format_pose_absolute_spec( robot_name: str, position: Sequence[float], @@ -936,6 +884,45 @@ def _format_pose_absolute_spec( ) +def _format_place_absolute_spec( + robot_name: str, + position: Sequence[float], + *, + sample_interval: int, + lift_height: float, +) -> str: + return _format_place_spec( + robot_name, + { + "reference": "absolute", + "position": [float(value) for value in position], + }, + sample_interval=sample_interval, + lift_height=lift_height, + ) + + +def _format_place_spec( + robot_name: str, + target_pose: Mapping[str, Any], + *, + sample_interval: int, + lift_height: float, +) -> str: + return _compact_json( + { + "atomic_action_class": "PlaceAction", + "robot_name": robot_name, + "control": "arm", + "target_pose": dict(target_pose), + "cfg": { + "sample_interval": sample_interval, + "lift_height": float(lift_height), + }, + } + ) + + def _format_pose_offset_spec( robot_name: str, offset: tuple[float, float, float], diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py index bc7b3395..e0290c34 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py @@ -1224,9 +1224,7 @@ def _call_relative_task_llm( ' "action_sketch": [\n' ' "grasp moved_object",\n' ' "move above the relation target pose",\n' - ' "lower to the release pose",\n' - ' "open gripper",\n' - ' "retreat upward"\n' + ' "place at the 
release pose with PlaceAction"\n' " ]\n" "}\n\n" "Rules:\n" @@ -1775,9 +1773,7 @@ def _default_relative_action_sketch( f"move above the {placement.relation} release pose relative to " f"{placement.reference_runtime_uid}" ), - "lower to the release pose", - "open the gripper", - "retreat upward", + "place at the release pose with PlaceAction", ] sketch = ["grasp both moved objects with their assigned arms"] for placement in placements: @@ -1788,8 +1784,7 @@ def _default_relative_action_sketch( f"{placement.moved_runtime_uid} above the release pose relative " f"to {placement.reference_runtime_uid}" ), - f"lower and release {placement.moved_runtime_uid}", - f"retreat {placement.active_side}_arm upward", + f"place {placement.moved_runtime_uid} with PlaceAction", ] ) return sketch @@ -2147,8 +2142,7 @@ def _offset_position( offset: Sequence[float], ) -> list[float]: return [ - round(float(position[index]) + float(offset[index]), 6) - for index in range(3) + round(float(position[index]) + float(offset[index]), 6) for index in range(3) ] diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt index 1d704c59..596fc455 100644 --- a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt @@ -31,15 +31,14 @@ Use only these atomic action classes: {"source": "joint_delta", "joint_index": 5, "delta_degrees": -90} - Typical cfg: {"sample_interval": 30} - - For release settling after an open gripper target, use: - {"sample_interval": 15, "post_hold_steps": 25} 3. `PlaceAction` - - Use this only when a single place action should lower, open, and retreat. + - Prefer this for placement because one action lowers, opens the gripper, + and retreats upward. - Required target_pose. Supported pose targets are the same target_pose objects accepted by `MoveAction`. 
- Typical cfg: - {"sample_interval": 80, "lift_height": 0.08} + {"sample_interval": 80, "lift_height": 0.1} Rules: - Do not output Python code, function calls, or `fn`/`kwargs` action objects. diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 0ac82cf0..fc475160 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -93,9 +93,7 @@ def test_ur5_basket_generator_uses_parallel_handoff( "arm": "left_arm", "eef": "left_eef", } - assert extensions["arm_aim_yaw_offset"]["left"] == pytest.approx( - 3.141592653589793 - ) + assert extensions["arm_aim_yaw_offset"]["left"] == pytest.approx(3.141592653589793) assert extensions["arm_aim_yaw_offset"]["right"] == pytest.approx(0.0) success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] @@ -111,8 +109,8 @@ def test_ur5_basket_generator_uses_parallel_handoff( atom_actions = paths.atom_actions.read_text(encoding="utf-8") normalized_task_prompt = " ".join(task_prompt.split()) - assert "Generate exactly 10 nominal edges" in normalized_task_prompt - assert "Generate exactly 11 nominal edges" not in normalized_task_prompt + assert "Generate exactly 6 nominal edges" in normalized_task_prompt + assert "Generate exactly 10 nominal edges" not in normalized_task_prompt assert "positive-y side" in basic_background assert "negative-y side" in basic_background assert "negative-x side" not in basic_background @@ -127,6 +125,16 @@ def test_ur5_basket_generator_uses_parallel_handoff( ) assert left_high_offset_spec in task_prompt assert right_high_offset_spec in task_prompt + assert ( + '"atomic_action_class":"PlaceAction","robot_name":"left_arm","control":"arm",' + '"target_pose":{"reference":"object","obj_name":"wicker_basket",' + '"offset":[0.0,-0.04,0.12]}' in task_prompt + ) + assert ( + 
'"atomic_action_class":"PlaceAction","robot_name":"right_arm","control":"arm",' + '"target_pose":{"reference":"object","obj_name":"wicker_basket",' + '"offset":[0.0,0.04,0.12]}' in task_prompt + ) assert '"offset":[-0.04,0.0,0.22]' not in task_prompt assert '"offset":[0.04,0.0,0.22]' not in task_prompt assert left_high_offset_spec in atom_actions @@ -136,8 +144,8 @@ def test_ur5_basket_generator_uses_parallel_handoff( assert "parallel handoff" in atom_actions assert len(paths.summary["normalized_meshes"]) == 4 - handoff_edge = task_prompt.split("6. After the left gripper", maxsplit=1)[1].split( - "\n7. Lower the held right target object", + handoff_edge = task_prompt.split("4. After the left gripper", maxsplit=1)[1].split( + "\n5. Place the held right target object", maxsplit=1, )[0] assert ( @@ -527,10 +535,11 @@ def fake_call_relative_task_llm(**kwargs): task_prompt = paths.task_prompt.read_text(encoding="utf-8") assert "Move apple_2 to the left of basket_3." in task_prompt assert ( - "Generate one deterministic nominal graph with exactly 6 nominal edges" + "Generate one deterministic nominal graph with exactly 4 nominal edges" in task_prompt ) assert '"atomic_action_class":"PickUpAction","robot_name":"left_arm"' in task_prompt + assert '"atomic_action_class":"PlaceAction","robot_name":"left_arm"' in task_prompt assert '"obj_name":"apple_2"' in task_prompt assert "right_arm_action: null" in task_prompt assert "Generate exactly 10 nominal edges" not in task_prompt @@ -1070,9 +1079,7 @@ def fake_call_relative_task_llm(**kwargs): assert success["object"] == "chip_bag" assert success["support"] == "pad" - registry = gym_config["env"]["events"]["register_info_to_env"]["params"][ - "registry" - ] + registry = gym_config["env"]["events"]["register_info_to_env"]["params"]["registry"] registered_uids = {entry["entity_cfg"]["uid"] for entry in registry} assert {"chip_bag", "pad"}.issubset(registered_uids) @@ -1178,7 +1185,7 @@ def fake_call_relative_task_llm(**kwargs): 
task_prompt = paths.task_prompt.read_text(encoding="utf-8") basic_background = paths.basic_background.read_text(encoding="utf-8") atom_actions = paths.atom_actions.read_text(encoding="utf-8") - assert "Generate one deterministic nominal graph with exactly 10 nominal edges" in ( + assert "Generate one deterministic nominal graph with exactly 6 nominal edges" in ( task_prompt ) assert ( @@ -1193,6 +1200,8 @@ def fake_call_relative_task_llm(**kwargs): '"robot_name":"right_arm","control":"hand","target_qpos":{"source":"gripper_state","state":"close"}' in task_prompt ) + assert '"atomic_action_class":"PlaceAction","robot_name":"left_arm"' in task_prompt + assert '"atomic_action_class":"PlaceAction","robot_name":"right_arm"' in task_prompt assert "The inactive arm must remain null" not in task_prompt assert "Both arms participate" in basic_background assert "left_arm moves `apple_2`" in basic_background From c607e305fe00e41e7d7feb69cf30af1a3fa4d886 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Wed, 17 Jun 2026 09:54:56 +0800 Subject: [PATCH 15/33] fix: address action-agent runtime review cleanup --- .../agents/compile_agent.py | 9 +--- .../action_agent_pipeline/agents/llm.py | 6 +-- .../action_agent_pipeline/cli/run_agent.py | 1 - .../env_adapters/tableware/base_agent_env.py | 15 ++---- .../generation/prompt_builders.py | 26 +---------- .../runtime/atom_actions.py | 46 ++++++------------- .../runtime/graph_compiler.py | 37 +++++---------- .../test_backend_atomic_runtime.py | 19 +++++++- .../test_demo3_semantic_grasp_integration.py | 2 +- .../test_graph_spec_backend_atomic.py | 3 -- 10 files changed, 49 insertions(+), 115 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/compile_agent.py b/embodichain/gen_sim/action_agent_pipeline/agents/compile_agent.py index f1ac95a3..bd16c6e6 100644 --- a/embodichain/gen_sim/action_agent_pipeline/agents/compile_agent.py +++ 
b/embodichain/gen_sim/action_agent_pipeline/agents/compile_agent.py @@ -40,16 +40,12 @@ class CompileAgent(AgentBase): query_suffix = "." prompt_kwargs: dict[str, dict[str, Any]] - def __init__(self, llm, **kwargs) -> None: + def __init__(self, **kwargs) -> None: for key, value in kwargs.items(): setattr(self, key, value) self.prompt_kwargs = kwargs.get("prompt_kwargs", {}) - self.llm = llm def generate(self, **kwargs): - if kwargs.get("recovery_enabled") or kwargs.get("recovery_spec"): - raise NotImplementedError("Recovery graph generation has been removed.") - log_dir = kwargs.get( "log_dir", Path(database_agent_prompt_dir) / self.task_name ) @@ -116,9 +112,6 @@ def _runtime_kwargs( prompt_only_keys.update( { "task_graph", - "recovery_spec", - "recovery_graph", - "recovery_enabled", "observations", "regenerate", } diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/llm.py b/embodichain/gen_sim/action_agent_pipeline/agents/llm.py index 5ae52b88..b6c62485 100644 --- a/embodichain/gen_sim/action_agent_pipeline/agents/llm.py +++ b/embodichain/gen_sim/action_agent_pipeline/agents/llm.py @@ -18,7 +18,7 @@ from embodichain.gen_sim.action_agent_pipeline.utils.mllm import create_chat_openai -__all__ = ["create_llm", "task_llm", "compile_llm"] +__all__ = ["create_llm", "task_llm"] # ------------------------------------------------------------------------------ @@ -55,10 +55,6 @@ def _create_llm_safe(*, temperature=0.0, model=None, usage_stage=None): temperature=0.0, usage_stage="action_agent.task_graph", ) -compile_llm = _create_llm_safe( - temperature=0.0, - usage_stage="action_agent.compile_canonicalize", -) if __name__ == "__main__": diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py index de459813..999996e4 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py @@ -111,7 +111,6 @@ def 
_generate_action_agent_trajectory( save_video=getattr(args, "save_video", False), debug_mode=getattr(args, "debug_mode", False), regenerate=getattr(args, "regenerate", False), - recovery=getattr(args, "recovery", False), ) if action_list is None or len(action_list) == 0: log_warning("Action is invalid. Skip to next generation.") diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py index 28be3bff..b239834d 100644 --- a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py +++ b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py @@ -35,7 +35,6 @@ def _init_agents(self, agent_config, task_name, agent_config_path=None): ) from embodichain.gen_sim.action_agent_pipeline.agents.llm import ( task_llm, - compile_llm, ) task_agent_config = self._agent_config_with_prompt_keys( @@ -54,7 +53,6 @@ def _init_agents(self, agent_config, task_name, agent_config_path=None): config_dir=agent_config_path, ) self.compile_agent = CompileAgent( - compile_llm, **compile_agent_config, **agent_config["CompileAgent"], task_name=task_name, @@ -291,12 +289,7 @@ def _get_agent_control_part( return control_part # -------------------- get compiled graph for action list -------------------- - def generate_graph_for_actions(self, regenerate=False, recovery=False, **kwargs): - if recovery: - raise NotImplementedError( - "RecoveryAgent has been removed from this pipeline." 
- ) - + def generate_graph_for_actions(self, regenerate=False, **kwargs): logger.log_info( "Generate graph for creating action list for " f"{self.compile_agent.task_name}.", @@ -326,11 +319,9 @@ def generate_graph_for_actions(self, regenerate=False, recovery=False, **kwargs) return graph_file_path, kwargs, graph_content # -------------------- get action list -------------------- - def create_demo_action_list( - self, regenerate=False, recovery=False, *args, **kwargs - ): + def create_demo_action_list(self, regenerate=False, *args, **kwargs): graph_file_path, compile_kwargs, _ = self.generate_graph_for_actions( - regenerate=regenerate, recovery=recovery + regenerate=regenerate ) atomic_action_kwargs = { "allow_grasp_annotation": True, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 2050b9a9..fa45fd03 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -85,9 +85,7 @@ def make_agent_config() -> dict[str, Any]: "TaskAgent": { "prompt_name": "generate_task_graph", }, - "CompileAgent": { - "prompt_name": "compile_agent_graph", - }, + "CompileAgent": {}, "Agent": { "prompt_kwargs": { "task_prompt": { @@ -923,28 +921,6 @@ def _format_place_spec( ) -def _format_pose_offset_spec( - robot_name: str, - offset: tuple[float, float, float], - *, - sample_interval: int = 20, -) -> str: - dx, dy, dz = offset - return _compact_json( - { - "atomic_action_class": "MoveAction", - "robot_name": robot_name, - "control": "arm", - "target_pose": { - "reference": "relative", - "offset": [float(dx), float(dy), float(dz)], - "frame": "world", - }, - "cfg": {"sample_interval": sample_interval}, - } - ) - - def _format_gripper_spec( robot_name: str, state: str, diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py 
b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py index 68e0f601..0bad84d7 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -266,10 +266,9 @@ def _validate_target_pose(target_pose: Mapping[str, Any]) -> None: if reference == "object": _validate_target_fields( target_pose, - {"reference", "obj_name", "offset", "orientation"}, + {"reference", "obj_name", "offset"}, "target_pose", ) - _validate_current_orientation(target_pose) obj_name = target_pose.get("obj_name") if not isinstance(obj_name, str) or not obj_name: raise ValueError("object target_pose requires non-empty obj_name.") @@ -279,10 +278,9 @@ def _validate_target_pose(target_pose: Mapping[str, Any]) -> None: if reference == "absolute": _validate_target_fields( target_pose, - {"reference", "position", "orientation"}, + {"reference", "position"}, "target_pose", ) - _validate_current_orientation(target_pose) position = target_pose.get("position") if not isinstance(position, list) or len(position) != 3: raise ValueError( @@ -354,12 +352,6 @@ def _validate_target_fields( ) -def _validate_current_orientation(target_pose: Mapping[str, Any]) -> None: - orientation = target_pose.get("orientation") - if orientation is not None and orientation != "current": - raise ValueError("target_pose orientation only supports 'current'.") - - def execute_atomic_action( action_spec: Mapping[str, Any] | AtomicActionSpec, *, @@ -394,7 +386,17 @@ def execute_atomic_action( return action_np target = _resolve_target(env, spec, runtime_kwargs) - cfg, start_qpos = _build_action_cfg_and_start(env, spec) + is_left, arm_part, hand_part, arm_joints, eef_joints = _select_arm_parts( + env, spec.robot_name + ) + cfg = _build_action_cfg(env, spec, arm_part, hand_part, len(eef_joints)) + start_qpos = _resolve_action_start_qpos( + env, + spec, + is_left=is_left, + arm_joints=arm_joints, + eef_joints=eef_joints, + ) action_cls = 
_get_atomic_action_class(spec.atomic_action_class) action = action_cls(motion_generator=_make_motion_generator(env), cfg=cfg) is_success, trajectory, joint_ids = action.execute( @@ -434,13 +436,9 @@ def execute_parallel_atomic_actions( right_arm_action=None, env=None, return_result: bool = False, - monitor_sequences=None, **runtime_kwargs, ): """Execute left/right atomic action specs as one synchronized stream.""" - if monitor_sequences is not None: - raise NotImplementedError("Monitor sequences have been removed.") - left_arm_action = _resolve_action_spec(left_arm_action, env, runtime_kwargs) right_arm_action = _resolve_action_spec(right_arm_action, env, runtime_kwargs) @@ -494,9 +492,6 @@ def execute_parallel_atomic_actions( if return_result: return { "actions": actions, - "monitor_index": None, - "monitor_name": None, - "step_index": None, } return actions @@ -620,21 +615,6 @@ def _get_atomic_action_class(atomic_action_class: str): return action_class -def _build_action_cfg_and_start(env, spec: AtomicActionSpec): - is_left, arm_part, hand_part, arm_joints, eef_joints = _select_arm_parts( - env, spec.robot_name - ) - cfg = _build_action_cfg(env, spec, arm_part, hand_part, len(eef_joints)) - start_qpos = _resolve_action_start_qpos( - env, - spec, - is_left=is_left, - arm_joints=arm_joints, - eef_joints=eef_joints, - ) - return cfg, start_qpos - - def _build_action_cfg( env, spec: AtomicActionSpec, diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py index 47c27e83..4393c37c 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py @@ -38,6 +38,7 @@ "recovery_branches", "recoveries", } +_COMPILED_BUNDLE_KEYS = {"task_graph", "metadata"} _EDGE_KEYS = {"id", "source", "target", "left_arm_action", "right_arm_action"} @@ -49,20 +50,21 @@ def load_agent_graph_bundle(path: str | 
Path) -> dict[str, Any]: def compile_agent_graph_from_file( path: str | Path, *, - env: Any = None, graph_cls: type | None = None, action_module: Any = None, - monitor_module: Any = None, ) -> Any: """Compile a graph JSON bundle from disk into an executable graph.""" - del env, monitor_module - bundle = load_agent_graph_bundle(path) - recovery_graph = bundle.get("recovery_graph") - if _has_recovery_content(recovery_graph): - raise ValueError("Recovery graph artifacts are no longer supported.") - - task_graph = bundle.get("task_graph", bundle) + if "task_graph" in bundle: + unknown_bundle_keys = set(bundle) - _COMPILED_BUNDLE_KEYS + if unknown_bundle_keys: + raise ValueError( + "Compiled graph artifact contains unsupported top-level fields: " + f"{', '.join(sorted(unknown_bundle_keys))}." + ) + task_graph = bundle["task_graph"] + else: + task_graph = bundle return compile_agent_graph_spec( task_graph, graph_cls=graph_cls, @@ -72,19 +74,11 @@ def compile_agent_graph_from_file( def compile_agent_graph_spec( task_graph: str | Mapping[str, Any], - recovery_graph: str | Mapping[str, Any] | None = None, *, - env: Any = None, graph_cls: type | None = None, action_module: Any = None, - monitor_module: Any = None, ) -> Any: """Compile a nominal JSON graph into ``AgentTaskGraph``.""" - del env, monitor_module - - if _has_recovery_content(recovery_graph): - raise ValueError("Recovery graph compilation has been removed.") - task_spec = extract_json_object(task_graph) _reject_recovery_keys(task_spec) _validate_task_spec(task_spec) @@ -245,15 +239,6 @@ def _compile_action(spec: Any, action_module: Any) -> Any: return action_module.normalize_atomic_action_spec(spec) -def _has_recovery_content(value: Any) -> bool: - if value is None: - return False - recovery_spec = extract_json_object(value) - if not isinstance(recovery_spec, Mapping): - return bool(recovery_spec) - return any(bool(recovery_spec.get(key)) for key in _RECOVERY_KEYS) - - def _reject_recovery_keys(task_spec: 
Mapping[str, Any]) -> None: present = _RECOVERY_KEYS & set(task_spec) if present: diff --git a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py index 9f1661a2..2cdbe9aa 100644 --- a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py +++ b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py @@ -209,6 +209,24 @@ def test_normalize_atomic_action_spec_rejects_multiple_target_fields() -> None: ) +def test_normalize_atomic_action_spec_rejects_orientation_field() -> None: + with pytest.raises(ValueError, match="Unsupported target_pose fields"): + normalize_atomic_action_spec( + { + "atomic_action_class": "MoveAction", + "robot_name": "left_arm", + "control": "arm", + "target_pose": { + "reference": "object", + "obj_name": "apple", + "offset": [0.0, 0.0, 0.1], + "orientation": "current", + }, + "cfg": {}, + } + ) + + def test_normalize_atomic_action_spec_rejects_pickup_pose_target() -> None: with pytest.raises(ValueError, match="PickUpAction requires control='arm'"): normalize_atomic_action_spec( @@ -256,7 +274,6 @@ def test_object_referenced_pose_builds_move_cfg_and_pose_target(monkeypatch) -> "reference": "object", "obj_name": "apple", "offset": [0.1, 0.2, 0.3], - "orientation": "current", }, "cfg": {"sample_interval": 12}, }, diff --git a/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py b/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py index d14308d6..1c55ef74 100644 --- a/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py +++ b/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py @@ -135,7 +135,7 @@ def _make_env(tmp_path: Path): # Import registers AtomicActionsAgent-v3. 
from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware import ( # noqa: F401 - atomic_actions, + agent_env, ) args = argparse.Namespace( diff --git a/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py index 2dcc10d1..122b188b 100644 --- a/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py +++ b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py @@ -95,7 +95,6 @@ def test_compile_agent_graph_accepts_atomic_action_class_spec() -> None: graph = compile_agent_graph_spec( _task_graph(action), graph_cls=_FakeGraph, - monitor_module={}, ) assert graph.edges["e01"]["left_arm_action"] == action @@ -108,7 +107,6 @@ def test_compile_agent_graph_rejects_legacy_action_schema() -> None: compile_agent_graph_spec( task_graph, graph_cls=_FakeGraph, - monitor_module={}, ) @@ -120,5 +118,4 @@ def test_compile_agent_graph_rejects_extra_edge_fields() -> None: compile_agent_graph_spec( task_graph, graph_cls=_FakeGraph, - monitor_module={}, ) From 7b15271a14cf3f5be75a8e0a13f99296dfb3bbc1 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Wed, 17 Jun 2026 12:37:59 +0800 Subject: [PATCH 16/33] Fix action agent CoACD cache reuse --- .../generation/coacd_cache.py | 19 +- .../runtime/atom_actions.py | 58 ++++- .../runtime/coacd_cache_bridge.py | 211 ++++++++++++++++++ .../action_agent_pipeline/test_coacd_cache.py | 130 +++++++++++ 4 files changed, 410 insertions(+), 8 deletions(-) create mode 100644 embodichain/gen_sim/action_agent_pipeline/runtime/coacd_cache_bridge.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_coacd_cache.py diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py b/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py index f2f4fee0..0787ca72 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py +++ 
b/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py @@ -25,6 +25,7 @@ __all__ = [ "coacd_cache_path_for_mesh", + "dexsim_coacd_cache_key_for_mesh", "prewarm_coacd_cache_for_gym_config", ] @@ -37,19 +38,32 @@ def coacd_cache_path_for_mesh( mesh_path: str | Path, max_convex_hull_num: int, cache_dir: str | Path | None = None, + *, + mesh_count: int = 1, ) -> Path: """Return the DexSim environment-side CoACD cache path for a mesh.""" if cache_dir is None: cache_dir = _DEFAULT_CONVEX_DECOMP_DIR - mesh_path = Path(mesh_path).expanduser().resolve() - mesh_md5_key = hashlib.md5(mesh_path.read_bytes()).hexdigest() + mesh_md5_key = dexsim_coacd_cache_key_for_mesh(mesh_path, mesh_count=mesh_count) return Path(cache_dir).expanduser().resolve() / ( f"{mesh_md5_key}_{int(max_convex_hull_num)}.obj" ) +def dexsim_coacd_cache_key_for_mesh( + mesh_path: str | Path, + *, + mesh_count: int = 1, +) -> str: + """Return the cache key used by DexSim ``load_actor_with_coacd``.""" + + resolved_mesh_path = Path(mesh_path).expanduser().resolve(strict=False) + mesh_key_data = f"{resolved_mesh_path}|mesh_count={int(mesh_count)}" + return hashlib.md5(mesh_key_data.encode("utf-8")).hexdigest() + + def prewarm_coacd_cache_for_gym_config( gym_config: Mapping[str, Any], *, @@ -88,6 +102,7 @@ def prewarm_coacd_cache_for_gym_config( report = { "uid": uid, "mesh_path": mesh_path.as_posix(), + "mesh_count": 1, "max_convex_hull_num": max_convex_hull_num, "cache_path": cache_path.as_posix(), } diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py index 0bad84d7..ec6733a5 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -881,6 +881,19 @@ def _build_object_semantics( }, ) ) + max_decomposition_hulls = _max_decomposition_hulls(target_obj, runtime_kwargs) + source_mesh_path = 
_rigid_object_mesh_path(target_obj) + body_scale = _rigid_object_body_scale(target_obj) + _prepare_grasp_collision_cache_from_env_coacd( + obj_name=obj_name, + mesh_vertices=mesh_vertices, + mesh_triangles=mesh_triangles, + source_mesh_path=source_mesh_path, + max_decomposition_hulls=max_decomposition_hulls, + body_scale=body_scale, + runtime_kwargs=runtime_kwargs, + ) + gripper_collision_cfg = GripperCollisionCfg( **_cfg_supported_kwargs( GripperCollisionCfg, @@ -891,12 +904,9 @@ def _build_object_semantics( "grasp_point_sample_dense", 0.012, ), - "max_decomposition_hulls": _max_decomposition_hulls( - target_obj, - runtime_kwargs, - ), - "env_coacd_source_mesh_path": _rigid_object_mesh_path(target_obj), - "env_coacd_body_scale": _rigid_object_body_scale(target_obj), + "max_decomposition_hulls": max_decomposition_hulls, + "env_coacd_source_mesh_path": source_mesh_path, + "env_coacd_body_scale": body_scale, }, ) ) @@ -919,6 +929,42 @@ def _build_object_semantics( ) +def _prepare_grasp_collision_cache_from_env_coacd( + *, + obj_name: str, + mesh_vertices: torch.Tensor, + mesh_triangles: torch.Tensor, + source_mesh_path: str | None, + max_decomposition_hulls: int, + body_scale: list[float] | None, + runtime_kwargs: Mapping[str, Any], +) -> None: + if not bool(runtime_kwargs.get("reuse_env_coacd_for_grasp", True)): + return + + try: + from embodichain.gen_sim.action_agent_pipeline.runtime.coacd_cache_bridge import ( + ensure_grasp_collision_cache_from_env_coacd, + ) + + result = ensure_grasp_collision_cache_from_env_coacd( + mesh_vertices=mesh_vertices, + mesh_triangles=mesh_triangles, + source_mesh_path=source_mesh_path, + max_decomposition_hulls=max_decomposition_hulls, + body_scale=body_scale, + ) + except Exception: + return + + if result.get("status") == "generated": + log_info( + "Prepared grasp collision cache from environment CoACD cache: " + f"target={obj_name}, cache={result.get('grasp_cache_path')}.", + color="green", + ) + + def 
_stabilize_affordance_object( env, target_obj, diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/coacd_cache_bridge.py b/embodichain/gen_sim/action_agent_pipeline/runtime/coacd_cache_bridge.py new file mode 100644 index 00000000..b0212fec --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/coacd_cache_bridge.py @@ -0,0 +1,211 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import hashlib +import pickle +from pathlib import Path +from typing import Any + +import numpy as np +import torch + +from embodichain.gen_sim.action_agent_pipeline.generation.coacd_cache import ( + coacd_cache_path_for_mesh, +) + +__all__ = [ + "ensure_grasp_collision_cache_from_env_coacd", + "grasp_collision_cache_path", +] + + +_DEFAULT_CONVEX_DECOMP_DIR = ( + Path.home() / ".cache" / "embodichain_cache" / "convex_decomposition" +) + + +def grasp_collision_cache_path( + mesh_vertices: torch.Tensor | np.ndarray, + mesh_triangles: torch.Tensor | np.ndarray, + max_decomposition_hulls: int, + *, + cache_dir: str | Path | None = None, +) -> Path: + """Return the grasp collision checker cache path for a scaled mesh.""" + + vertices = _as_numpy(mesh_vertices) + triangles = _as_numpy(mesh_triangles) + mesh_hash = hashlib.md5(vertices.tobytes() + triangles.tobytes()).hexdigest() + return _resolve_cache_dir(cache_dir) / ( + f"{mesh_hash}_{int(max_decomposition_hulls)}.pkl" + ) + + +def ensure_grasp_collision_cache_from_env_coacd( + *, + mesh_vertices: torch.Tensor | np.ndarray, + mesh_triangles: torch.Tensor | np.ndarray, + source_mesh_path: str | Path | None, + max_decomposition_hulls: int, + body_scale: Any = None, + cache_dir: str | Path | None = None, +) -> dict[str, Any]: + """Prepare grasp collision cache from the environment CoACD OBJ cache. + + The environment and grasp collision paths use different cache formats. This + bridge avoids running CoACD again during grasp annotation when the + environment-side convex OBJ cache is already available. 
+ """ + + grasp_cache_path = grasp_collision_cache_path( + mesh_vertices, + mesh_triangles, + max_decomposition_hulls, + cache_dir=cache_dir, + ) + if grasp_cache_path.is_file(): + return { + "status": "hit", + "grasp_cache_path": grasp_cache_path.as_posix(), + } + + if source_mesh_path is None: + return { + "status": "missing_source_mesh", + "grasp_cache_path": grasp_cache_path.as_posix(), + } + + env_cache_path = coacd_cache_path_for_mesh( + source_mesh_path, + max_decomposition_hulls, + _resolve_cache_dir(cache_dir), + ) + if not env_cache_path.is_file(): + return { + "status": "missing_env_cache", + "env_cache_path": env_cache_path.as_posix(), + "grasp_cache_path": grasp_cache_path.as_posix(), + } + + try: + plane_equations = _plane_equations_from_env_cache(env_cache_path, body_scale) + _write_grasp_collision_cache(grasp_cache_path, plane_equations) + except Exception as exc: + return { + "status": "skipped", + "reason": str(exc), + "env_cache_path": env_cache_path.as_posix(), + "grasp_cache_path": grasp_cache_path.as_posix(), + } + + return { + "status": "generated", + "env_cache_path": env_cache_path.as_posix(), + "grasp_cache_path": grasp_cache_path.as_posix(), + } + + +def _plane_equations_from_env_cache( + env_cache_path: Path, + body_scale: Any, +) -> list[tuple[np.ndarray, np.ndarray]]: + from dexsim.kit.meshproc.convex_cache import load_obj_as_convex_parts + + from embodichain.toolkits.graspkit.pg_grasp.collision_checker import ( + extract_plane_equations, + ) + + convex_parts = load_obj_as_convex_parts(env_cache_path.as_posix()) + if not convex_parts: + raise ValueError(f"No convex parts found in {env_cache_path}.") + + scale = _body_scale(body_scale) + if not np.allclose(scale, np.ones(3, dtype=np.float32)): + convex_parts = [ + (vertices.astype(np.float32, copy=False) * scale, faces) + for vertices, faces in convex_parts + ] + + plane_equations = extract_plane_equations(convex_parts) + if not plane_equations: + raise ValueError(f"No plane equations 
extracted from {env_cache_path}.") + return plane_equations + + +def _write_grasp_collision_cache( + cache_path: Path, + plane_equations_np: list[tuple[np.ndarray, np.ndarray]], +) -> None: + cache_path.parent.mkdir(parents=True, exist_ok=True) + n_convex = len(plane_equations_np) + n_max_equation = max(normals.shape[0] for normals, _ in plane_equations_np) + plane_equations = torch.zeros( + size=(n_convex, n_max_equation, 4), + dtype=torch.float32, + device="cpu", + ) + plane_equation_counts = torch.zeros(n_convex, dtype=torch.int32, device="cpu") + for index, (normals, offsets) in enumerate(plane_equations_np): + n_equation = normals.shape[0] + plane_equations[index, :n_equation, :3] = torch.as_tensor( + normals, + dtype=torch.float32, + ) + plane_equations[index, :n_equation, 3] = torch.as_tensor( + offsets, + dtype=torch.float32, + ) + plane_equation_counts[index] = n_equation + + with cache_path.open("wb") as cache_file: + pickle.dump( + { + "plane_equations": plane_equations, + "plane_equation_counts": plane_equation_counts, + }, + cache_file, + ) + + +def _resolve_cache_dir(cache_dir: str | Path | None) -> Path: + if cache_dir is not None: + return Path(cache_dir).expanduser().resolve() + try: + from embodichain.lab.sim import CONVEX_DECOMP_DIR + except Exception: + return _DEFAULT_CONVEX_DECOMP_DIR + return Path(CONVEX_DECOMP_DIR).expanduser().resolve() + + +def _as_numpy(value: torch.Tensor | np.ndarray) -> np.ndarray: + if isinstance(value, torch.Tensor): + value = value.detach().cpu().numpy() + return np.ascontiguousarray(value) + + +def _body_scale(body_scale: Any) -> np.ndarray: + if body_scale is None: + return np.ones(3, dtype=np.float32) + if isinstance(body_scale, torch.Tensor): + body_scale = body_scale.detach().cpu().numpy() + scale = np.asarray(body_scale, dtype=np.float32).reshape(-1) + if scale.size == 1: + scale = np.repeat(scale, 3) + if scale.size != 3 or not np.all(np.isfinite(scale)): + raise ValueError(f"Invalid body scale: 
{body_scale!r}.") + return scale.reshape(1, 3) diff --git a/tests/gen_sim/action_agent_pipeline/test_coacd_cache.py b/tests/gen_sim/action_agent_pipeline/test_coacd_cache.py new file mode 100644 index 00000000..8a974071 --- /dev/null +++ b/tests/gen_sim/action_agent_pipeline/test_coacd_cache.py @@ -0,0 +1,130 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import hashlib +import pickle + +import pytest +import torch + +from embodichain.gen_sim.action_agent_pipeline.generation.coacd_cache import ( + coacd_cache_path_for_mesh, + dexsim_coacd_cache_key_for_mesh, +) +from embodichain.gen_sim.action_agent_pipeline.runtime.coacd_cache_bridge import ( + ensure_grasp_collision_cache_from_env_coacd, +) + + +def test_coacd_cache_path_matches_dexsim_load_actor_key(tmp_path) -> None: + mesh_path = tmp_path / "object.obj" + mesh_path.write_text("# placeholder mesh\n", encoding="utf-8") + cache_dir = tmp_path / "cache" + + cache_path = coacd_cache_path_for_mesh( + mesh_path, + 16, + cache_dir, + ) + + expected_key = hashlib.md5( + f"{mesh_path.resolve()}|mesh_count=1".encode("utf-8") + ).hexdigest() + assert dexsim_coacd_cache_key_for_mesh(mesh_path) == expected_key + assert cache_path == cache_dir.resolve() / f"{expected_key}_16.obj" + + +def test_grasp_cache_bridge_uses_existing_env_coacd_obj(tmp_path) -> None: + pytest.importorskip("dexsim.kit.meshproc.convex_cache") + source_mesh_path = tmp_path / "source.obj" + _write_tetra_obj(source_mesh_path) + + cache_dir = tmp_path / "cache" + env_cache_path = coacd_cache_path_for_mesh( + source_mesh_path, + 4, + cache_dir, + ) + env_cache_path.parent.mkdir(parents=True, exist_ok=True) + _write_tetra_obj(env_cache_path) + + mesh_vertices = torch.tensor( + [ + [0.0, 0.0, 0.0], + [2.0, 0.0, 0.0], + [0.0, 2.0, 0.0], + [0.0, 0.0, 2.0], + ], + dtype=torch.float32, + ) + mesh_triangles = torch.tensor( + [ + [0, 2, 1], + [0, 1, 3], + [1, 2, 3], + [2, 0, 3], + ], + dtype=torch.int64, + ) + + result = ensure_grasp_collision_cache_from_env_coacd( + mesh_vertices=mesh_vertices, + mesh_triangles=mesh_triangles, + source_mesh_path=source_mesh_path, + max_decomposition_hulls=4, + body_scale=[2.0, 2.0, 2.0], + cache_dir=cache_dir, + ) + + assert result["status"] == "generated" + 
assert result["env_cache_path"] == env_cache_path.as_posix() + with open(result["grasp_cache_path"], "rb") as cache_file: + cache = pickle.load(cache_file) + assert set(cache) == {"plane_equations", "plane_equation_counts"} + assert cache["plane_equations"].shape[-1] == 4 + assert cache["plane_equation_counts"].numel() == 1 + + second_result = ensure_grasp_collision_cache_from_env_coacd( + mesh_vertices=mesh_vertices, + mesh_triangles=mesh_triangles, + source_mesh_path=source_mesh_path, + max_decomposition_hulls=4, + body_scale=[2.0, 2.0, 2.0], + cache_dir=cache_dir, + ) + assert second_result["status"] == "hit" + + +def _write_tetra_obj(path) -> None: + path.write_text( + "\n".join( + [ + "o convex_0", + "v 0.0 0.0 0.0", + "v 1.0 0.0 0.0", + "v 0.0 1.0 0.0", + "v 0.0 0.0 1.0", + "f 1 3 2", + "f 1 2 4", + "f 2 3 4", + "f 3 1 4", + "", + ] + ), + encoding="utf-8", + ) From 063254c85e2a991d5a7fac2f01f4de9f18052f22 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Wed, 24 Jun 2026 10:41:26 +0800 Subject: [PATCH 17/33] fix: address action-agent review cleanup --- MANIFEST.in | 1 + docs/source/features/generative_sim/agents.md | 197 +- docs/source/features/generative_sim/index.rst | 1 + .../agents/agent_base.py | 2 + .../action_agent_pipeline/agents/llm.py | 4 +- .../agents/task_agent.py | 79 +- .../cli/agent_run_stage.py | 55 + ...fig.py => generate_action_agent_config.py} | 9 +- .../cli/image2scene_stage.py | 258 ++ .../cli/pipeline_args.py | 381 ++ .../cli/pipeline_defaults.py | 74 + .../cli/pipeline_records.py | 5 +- .../cli/pipeline_runner.py | 107 + .../cli/pipeline_usage.py | 89 + .../cli/project_resolution.py | 230 ++ .../action_agent_pipeline/cli/run_agent.py | 2 +- .../cli/run_agent_pipeline.py | 1302 +----- .../cli/target_replacements.py | 391 ++ .../env_adapters/tableware/agent_env.py | 365 +- .../env_adapters/tableware/base_agent_env.py | 335 -- .../env_adapters/tableware/success.py | 5 +- 
.../generation/action_agent_config.py | 549 +++ .../generation/action_agent_templates.py | 55 + .../generation/coacd_cache.py | 10 +- .../generation/config_blocks.py | 598 +++ .../generation/config_io.py | 95 + .../generation/config_types.py | 121 + .../generation/glb_io.py | 60 + .../generation/mesh_bounds.py | 558 +++ .../generation/mesh_frame_normalization.py | 37 +- .../generation/naming.py | 168 + .../generation/prompt_builders.py | 4 +- .../generation/relative_geometry.py | 396 ++ .../generation/relative_spec.py | 793 ++++ .../generation/replacement_generation.py | 326 ++ .../generation/role_refinement.py | 141 + .../generation/scene_objects.py | 262 ++ .../generation/success_specs.py | 338 ++ .../generation/templates/default_lights.json | 12 + .../generation/templates/default_sensors.json | 49 + .../generation/templates/dual_ur5_robot.json | 106 + .../generation/ur5_basket_config.py | 3665 ----------------- .../gym_project_api/image2tabletop_client.py | 59 +- .../prompt2geometry/.gitignore | 1 - .../prompt2geometry/config.json | 9 +- .../gym_project_api/prompt2geometry/config.py | 13 +- .../prompt2geometry/dimensions.py | 3 +- .../prompt2geometry/pipeline.py | 54 +- .../prompt2geometry/sam3_client.py | 16 +- .../prompt2geometry/sam3d_client.py | 12 +- .../prompt2geometry/zimage_client.py | 6 +- .../runtime/atom_action_utils.py | 18 +- .../runtime/atom_actions.py | 84 +- .../runtime/coacd_cache_bridge.py | 39 +- .../runtime/graph_compiler.py | 20 +- .../runtime/task_graph.py | 4 +- .../action_agent_pipeline/utils/__init__.py | 1 + .../action_agent_pipeline/utils/llm_config.py | 9 +- .../action_agent_pipeline/utils/mllm.py | 35 +- .../test_action_agent_cli_and_clients.py | 314 ++ .../test_backend_atomic_runtime.py | 155 + .../test_base_agent_env_config.py | 44 + .../action_agent_pipeline/test_coacd_cache.py | 1 + .../test_graph_spec_backend_atomic.py | 19 +- .../test_tableware_success.py | 47 + .../test_task_agent_cache.py | 90 + 
.../test_ur5_basket_config_generation.py | 257 +- 67 files changed, 7867 insertions(+), 5678 deletions(-) create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/agent_run_stage.py rename embodichain/gen_sim/action_agent_pipeline/cli/{generate_ur5_basket_config.py => generate_action_agent_config.py} (96%) create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/image2scene_stage.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/pipeline_args.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/pipeline_defaults.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/pipeline_runner.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/pipeline_usage.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/project_resolution.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/cli/target_replacements.py delete mode 100644 embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/action_agent_templates.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/config_io.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/config_types.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/glb_io.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/mesh_bounds.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/naming.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/replacement_generation.py 
create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/role_refinement.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/scene_objects.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/templates/default_lights.json create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/templates/default_sensors.json create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json delete mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_action_agent_cli_and_clients.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_base_agent_env_config.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_tableware_success.py create mode 100644 tests/gen_sim/action_agent_pipeline/test_task_agent_cache.py diff --git a/MANIFEST.in b/MANIFEST.in index 21352c3e..16bdd505 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include VERSION recursive-include configs/ * +recursive-include embodichain/gen_sim/action_agent_pipeline/generation/templates *.json diff --git a/docs/source/features/generative_sim/agents.md b/docs/source/features/generative_sim/agents.md index 213050a7..f556b2be 100644 --- a/docs/source/features/generative_sim/agents.md +++ b/docs/source/features/generative_sim/agents.md @@ -1,175 +1,68 @@ -# EmbodiAgent(aborted) +# Action Agent Pipeline -EmbodiAgent is a hierarchical multi-agent system that enables robots to perform complex manipulation tasks through closed-loop planning, code generation, and validation. The system combines vision-language models (VLMs) and large language models (LLMs) to translate high-level goals into executable robot actions. +The action-agent pipeline is the supported agent workflow for generated tabletop +manipulation tasks. 
It converts an image or an existing generated gym project +into a task-specific simulation config, asks the task model for a JSON task +graph, compiles that graph into atomic-action specs, and executes it through the +`AtomicActionsAgent-v3` environment. -## Quick Start +The legacy Python-code generation agent stack has been removed. New demos and +task generation should use the modules under +`embodichain.gen_sim.action_agent_pipeline`. -### Prerequisites -Ensure you have access to Azure OpenAI or a compatible LLM endpoint. +## End-to-end Pipeline -```bash -# Set environment variables -export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/" -export AZURE_OPENAI_API_KEY="your-api-key" -``` - -### Using Different LLM/VLM APIs +Run image-to-scene, config generation, and agent execution in one command: -The system uses LangChain's `AzureChatOpenAI` by default. To use different LLM/VLM providers, you can modify the `create_llm` function in `embodichain/agents/hierarchy/llm.py`. - -#### Azure OpenAI ```bash -export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/" -export AZURE_OPENAI_API_KEY="your-api-key" -export OPENAI_API_VERSION="2024-10-21" # Optional, defaults to "2024-10-21" +python -m embodichain.gen_sim.action_agent_pipeline.cli.run_agent_pipeline \ + --use-image2scene \ + --server "http://127.0.0.1:4523" \ + --image-name "demo1" \ + --task_description "Pick up the target object and place it in the basket." 
\ + --config-output-dir "gym_project/action_agent_pipeline/configs/demo1_text" \ + --task_name "Demo1_Text" \ + --target_body_scale 0.8 \ + --regenerate ``` -#### OpenAI -To use OpenAI directly instead of Azure, modify `llm.py`: -```python -from langchain_openai import ChatOpenAI +## Generate Config Only -def create_llm(*, temperature=0.0, model="gpt-4o"): - return ChatOpenAI( - temperature=temperature, - model=model, - api_key=os.getenv("OPENAI_API_KEY"), - ) -``` +Use an existing gym project to generate the task config and agent config: -Then set: ```bash -export OPENAI_API_KEY="your-api-key" -``` - -#### Other Providers -You can use other LangChain-compatible providers by modifying the `create_llm` function, for example: - -**Anthropic Claude:** -```python -from langchain_anthropic import ChatAnthropic - -def create_llm(*, temperature=0.0, model="claude-3-opus-20240229"): - return ChatAnthropic( - temperature=temperature, - model=model, - anthropic_api_key=os.getenv("ANTHROPIC_API_KEY"), - ) +python -m embodichain.gen_sim.action_agent_pipeline.cli.generate_action_agent_config \ + --gym_project "gym_project/environment/image2tabletop/downloads/example_gym_project" \ + --output_dir "gym_project/action_agent_pipeline/configs/demo_text" \ + --task_name "Demo_Text" \ + --task_description "Pick up the target object and place it in the basket." 
\ + --target_body_scale 0.8 \ + --overwrite ``` -**Google Gemini:** -```python -from langchain_google_genai import ChatGoogleGenerativeAI +## Run Generated Config -def create_llm(*, temperature=0.0, model="gemini-pro"): - return ChatGoogleGenerativeAI( - temperature=temperature, - model=model, - google_api_key=os.getenv("GOOGLE_API_KEY"), - ) -``` - -### Run the System - -Run the agent system with the following command: +Run a previously generated config with the action-agent environment: ```bash -python embodichain/lab/scripts/run_agent.py \ - --task_name YourTask \ - --gym_config configs/gym/your_task/gym_config.yaml \ - --agent_config configs/gym/agent/your_agent/agent_config.json \ - --regenerate False +python -m embodichain.gen_sim.action_agent_pipeline.cli.run_agent \ + --task_name "Demo_Text" \ + --gym_config "gym_project/action_agent_pipeline/configs/demo_text/fast_gym_config.json" \ + --agent_config "gym_project/action_agent_pipeline/configs/demo_text/agent_config.json" \ + --regenerate ``` -**Parameters:** -- `--task_name`: Name identifier for the task -- `--gym_config`: Path to the gym environment configuration file (``.json``, ``.yaml``, or ``.yml``) -- `--agent_config`: Path to the agent configuration file (defines prompts and agent behavior) -- `--regenerate`: If `True`, forces regeneration of plans/code even if cached - -## System Architecture - -The system operates on a closed-loop control cycle: - -- **Observe**: The `TaskAgent` perceives the environment via multi-view camera inputs. -- **Plan**: It decomposes the goal into natural language steps. -- **Code**: The `CodeAgent` translates steps into executable Python code using atomic actions. -- **Execute**: The code runs in the environment; runtime errors are caught immediately. -- **Validate**: The `ValidationAgent` analyzes the result images, selects the best camera angle, and judges success. -- **Refine**: If validation fails, feedback is sent back to the agents to regenerate the plan or code. 
- ---- - -## Core Components - -### TaskAgent -*Located in:* `embodichain/agents/hierarchy/task_agent.py` - -Responsible for high-level reasoning. It parses visual observations and outputs a structured plan. - -* For every step, it generates a specific condition (e.g., "The cup must be held by the gripper") which is used later by the ValidationAgent. -* Prompt Strategies: - * `one_stage_prompt`: Direct VLM-to-Plan generation. - * `two_stage_prompt`: Separates visual analysis from planning logic. - -### CodeAgent -*Located in:* `embodichain/agents/hierarchy/code_agent.py` - -Translates natural language plans into executable Python code using atomic actions from the action bank. - -* Generates Python code that follows strict coding guidelines (no loops, only provided APIs) -* Executes code in a sandboxed environment with immediate error detection -* Uses Abstract Syntax Tree (AST) parsing to ensure code safety and correctness -* Supports few-shot learning through code examples in the configuration - - -### ValidationAgent -*Located in:* `embodichain/agents/hierarchy/validation_agent.py` - -Closes the loop by verifying if the robot actually achieved what it planned. - -* Uses a specialized LLM call (`select_best_view_dir`) to analyze images from all cameras and pick the single best angle that proves the action's outcome, ignoring irrelevant views. -* If an error occurs (runtime or logic), it generates a detailed explanation which is fed back to the `TaskAgent` or `CodeAgent` for the next attempt. - ---- - -## Configuration Guide - -The `Agent` configuration block controls the context provided to the LLMs. Prompt files are resolved in the following order: - -1. **Config directory**: Task-specific prompt files in the same directory as the agent configuration file (e.g., `configs/gym/agent/pour_water_agent/`) -2. 
**Default prompts directory**: Reusable prompt templates in `embodichain/agents/prompts/` - -| Parameter | Description | Typical Use | -| :--- | :--- | :--- | -| `task_prompt` | Task-specific goal description | "Pour water from the red cup to the blue cup." | -| `basic_background` | Physical rules & constraints | World coordinate system definitions, safety rules. | -| `atom_actions` | API Documentation | List of available functions (e.g., `drive(action='pick', ...)`). | -| `code_prompt` | Coding guidelines | "Use provided APIs only. Do not use loops." | -| `code_example` | Few-shot examples | Previous successful code snippets to guide style. | - ---- - -## File Structure - -```text -embodichain/agents/ -├── hierarchy/ -│ ├── agent_base.py # Abstract base handling prompts & images -│ ├── task_agent.py # Plan generation logic -│ ├── code_agent.py # Code generation & AST execution engine -│ ├── validation_agent.py # Visual analysis & view selection -│ └── llm.py # LLM configuration and instances -├── mllm/ -│ └── prompt/ # Prompt templates (LangChain) -└── prompts/ # Agent prompt templates -``` +## Runtime Shape ---- +- `TaskAgent` produces a deterministic JSON graph. +- `CompileAgent` caches and validates the graph artifact. +- `AgenticGenSimEnv` registers `AtomicActionsAgent-v3` and exposes + `create_demo_action_list()`. +- Runtime graph execution calls atomic actions from + `embodichain.gen_sim.action_agent_pipeline.runtime`. 
## See Also -- [Online Data Streaming](../online_data.md) — Streaming live simulation data for training -- [RL Architecture](../../overview/rl/index.rst) — RL training pipeline and algorithms -- [Atomic Actions Tutorial](../../tutorial/atomic_actions.rst) — Action primitives used by the CodeAgent +- [SimReady Asset Pipeline](simready_pipeline.md) — Generating simulation-ready assets +- [Atomic Actions Tutorial](../../tutorial/atomic_actions.rst) — Atomic action primitives - [Supported Tasks](../../resources/task/index.rst) — Available task environments diff --git a/docs/source/features/generative_sim/index.rst b/docs/source/features/generative_sim/index.rst index 1f7c759f..1409341c 100644 --- a/docs/source/features/generative_sim/index.rst +++ b/docs/source/features/generative_sim/index.rst @@ -6,4 +6,5 @@ Generative Simulation collects EmbodiChain features for generating simulation-re .. toctree:: :maxdepth: 2 + Action Agent Pipeline SimReady Asset Pipeline diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/agent_base.py b/embodichain/gen_sim/action_agent_pipeline/agents/agent_base.py index fc967f65..b46226d4 100644 --- a/embodichain/gen_sim/action_agent_pipeline/agents/agent_base.py +++ b/embodichain/gen_sim/action_agent_pipeline/agents/agent_base.py @@ -21,6 +21,8 @@ from embodichain.utils.utility import load_txt +__all__ = ["AgentBase"] + def _resolve_prompt_path(file_name: str, config_dir: str | None = None) -> str: # If absolute path, use directly diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/llm.py b/embodichain/gen_sim/action_agent_pipeline/agents/llm.py index b6c62485..384e4661 100644 --- a/embodichain/gen_sim/action_agent_pipeline/agents/llm.py +++ b/embodichain/gen_sim/action_agent_pipeline/agents/llm.py @@ -17,6 +17,7 @@ from __future__ import annotations from embodichain.gen_sim.action_agent_pipeline.utils.mllm import create_chat_openai +from embodichain.utils.logger import log_warning __all__ = ["create_llm", "task_llm"] @@ 
-47,7 +48,8 @@ def _create_llm_safe(*, temperature=0.0, model=None, usage_stage=None): model=model, usage_stage=usage_stage, ) - except Exception: + except Exception as exc: + log_warning(f"Failed to initialize action-agent LLM: {exc}") return None diff --git a/embodichain/gen_sim/action_agent_pipeline/agents/task_agent.py b/embodichain/gen_sim/action_agent_pipeline/agents/task_agent.py index 6efbdc32..a4fefac1 100644 --- a/embodichain/gen_sim/action_agent_pipeline/agents/task_agent.py +++ b/embodichain/gen_sim/action_agent_pipeline/agents/task_agent.py @@ -16,6 +16,8 @@ from __future__ import annotations +import hashlib +import json from pathlib import Path from typing import Any @@ -29,6 +31,8 @@ __all__ = ["TaskAgent"] +TASK_GRAPH_CACHE_SCHEMA_VERSION = "task_graph_prompt_v1" + class TaskAgent(AgentBase): """Generate the nominal atomic-action task graph.""" @@ -52,21 +56,92 @@ def generate(self, **kwargs) -> str: "log_dir", Path(database_agent_prompt_dir) / self.task_name ) file_path = Path(log_dir) / "agent_task_graph.json" + metadata_path = file_path.with_suffix(".metadata.json") + prompt = getattr(TaskPrompt, self.prompt_name)(**kwargs) + prompt_hash = _stable_text_hash(prompt) - if not kwargs.get("regenerate", False) and file_path.exists(): + if ( + not kwargs.get("regenerate", False) + and file_path.exists() + and _metadata_matches( + metadata_path, + prompt_hash=prompt_hash, + prompt_name=self.prompt_name, + task_name=self.task_name, + ) + ): print(f"Task graph already exists at {file_path}.") return load_txt(file_path) - prompt = getattr(TaskPrompt, self.prompt_name)(**kwargs) response = self.llm.invoke(prompt) print(f"\033[92m\nTask agent output:\n{response.content}\n\033[0m") content = normalize_json_content(response.content) file_path.parent.mkdir(parents=True, exist_ok=True) file_path.write_text(content, encoding="utf-8") + _write_metadata( + metadata_path, + prompt_hash=prompt_hash, + prompt_name=self.prompt_name, + task_name=self.task_name, + ) 
print(f"Generated task graph saved to {file_path}") return content def act(self, *args, **kwargs): return super().act(*args, **kwargs) + + +def _stable_text_hash(content: Any) -> str: + text = _prompt_to_hash_text(content) + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def _prompt_to_hash_text(prompt: Any) -> str: + to_string = getattr(prompt, "to_string", None) + if callable(to_string): + return str(to_string()) + return str(prompt) + + +def _metadata_matches( + metadata_path: Path, + *, + prompt_hash: str, + prompt_name: str, + task_name: str, +) -> bool: + if not metadata_path.is_file(): + return False + try: + metadata = json.loads(metadata_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return False + if not isinstance(metadata, dict): + return False + return ( + metadata.get("schema_version") == TASK_GRAPH_CACHE_SCHEMA_VERSION + and metadata.get("prompt_hash") == prompt_hash + and metadata.get("prompt_name") == prompt_name + and metadata.get("task_name") == task_name + ) + + +def _write_metadata( + metadata_path: Path, + *, + prompt_hash: str, + prompt_name: str, + task_name: str, +) -> None: + metadata = { + "schema_version": TASK_GRAPH_CACHE_SCHEMA_VERSION, + "prompt_hash": prompt_hash, + "prompt_name": prompt_name, + "task_name": task_name, + } + metadata_path.write_text( + json.dumps(metadata, ensure_ascii=False, indent=2) + "\n", + encoding="utf-8", + ) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/agent_run_stage.py b/embodichain/gen_sim/action_agent_pipeline/cli/agent_run_stage.py new file mode 100644 index 00000000..d258c64d --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/agent_run_stage.py @@ -0,0 +1,55 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import os +from pathlib import Path +import shlex +import subprocess +import sys + +__all__ = ["run_agent_command"] + + +def run_agent_command( + *, + task_name: str, + gym_config: Path, + agent_config: Path, + regenerate: bool, +) -> int: + command = [ + sys.executable, + "-m", + "embodichain.gen_sim.action_agent_pipeline.cli.run_agent", + "--task_name", + task_name, + "--gym_config", + str(gym_config), + "--agent_config", + str(agent_config), + ] + if regenerate: + command.append("--regenerate") + + env = os.environ.copy() + if env.get("EMBODICHAIN_LLM_USAGE_PATH"): + env["EMBODICHAIN_LLM_USAGE_PROCESS"] = "run_agent" + + print("Running task:") + print(shlex.join(command), flush=True) + return subprocess.run(command, check=False, env=env).returncode diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/generate_ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/cli/generate_action_agent_config.py similarity index 96% rename from embodichain/gen_sim/action_agent_pipeline/cli/generate_ur5_basket_config.py rename to embodichain/gen_sim/action_agent_pipeline/cli/generate_action_agent_config.py index 3a754e11..16b6a865 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/generate_ur5_basket_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/generate_action_agent_config.py @@ -19,9 +19,9 @@ import argparse from pathlib import Path -from embodichain.gen_sim.action_agent_pipeline.generation.ur5_basket_config 
import ( +from embodichain.gen_sim.action_agent_pipeline.generation.action_agent_config import ( TargetReplacementSpec, - generate_ur5_basket_config_from_project, + generate_action_agent_config_from_project, ) __all__ = ["cli"] @@ -30,8 +30,7 @@ def cli() -> None: parser = argparse.ArgumentParser( description=( - "Generate a Dual-UR5 basket-placement action-agent config from an " - "exported tabletop gym project." + "Generate an action-agent config from an " "exported tabletop gym project." ) ) parser.add_argument( @@ -171,7 +170,7 @@ def cli() -> None: task_description = _resolve_task_description(args) target_replacements = _resolve_target_replacements(args) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( gym_project=args.gym_project, output_dir=args.output_dir, task_name=args.task_name, diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/image2scene_stage.py b/embodichain/gen_sim/action_agent_pipeline/cli/image2scene_stage.py new file mode 100644 index 00000000..fbf03df8 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/image2scene_stage.py @@ -0,0 +1,258 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path +import shlex +import subprocess +import sys + +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_defaults import ( + DEFAULT_IMAGE2SCENE_IMAGE, + DEFAULT_IMAGE_DIR, + IMAGE_SUFFIXES, +) +from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( + scrub_usage_tracking_env, +) + +__all__ = [ + "collect_merged_gym_configs", + "latest_path", + "resolve_image2tabletop_server", + "resolve_under_root", + "run_image2scene_pipeline", +] + + +def resolve_under_root(root: Path, path_input: str | None) -> Path | None: + if path_input is None: + return None + path = Path(path_input).expanduser() + if path.is_absolute(): + return path.resolve() + return (root / path).resolve() + + +def collect_merged_gym_configs(download_dir: Path) -> list[Path]: + if not download_dir.exists(): + return [] + return sorted(download_dir.rglob("gym_config_merged.json")) + + +def latest_path(paths: list[Path]) -> Path: + return max(paths, key=lambda path: path.stat().st_mtime) + + +def run_image2scene_pipeline(args: argparse.Namespace) -> Path: + if not args.background: + raise ValueError("--background is required with --use-image2scene.") + + image2scene_root = Path(args.image2scene_root).expanduser().resolve() + if not image2scene_root.is_dir(): + raise FileNotFoundError(f"image2scene root not found: {image2scene_root}") + + script_path = image2scene_root / "demo_api/client/image2scene_pipeline.py" + if not script_path.is_file(): + raise FileNotFoundError(f"image2scene pipeline not found: {script_path}") + + image_path = _resolve_image2scene_image(args, image2scene_root) + download_dir = resolve_under_root(image2scene_root, args.image2scene_download_dir) + output_root = resolve_under_root(image2scene_root, args.image2scene_output_root) + gen_config = resolve_under_root(image2scene_root, 
args.image2scene_gen_config) + llm_config = resolve_under_root(image2scene_root, args.image2scene_llm_config) + extract_dir = resolve_under_root(image2scene_root, args.image2scene_extract_dir) + merged_output = resolve_under_root(image2scene_root, args.image2scene_merged_output) + + if ( + download_dir is None + or output_root is None + or gen_config is None + or llm_config is None + ): + raise ValueError("image2scene paths must not be empty.") + + before_configs = set(collect_merged_gym_configs(download_dir)) + server = resolve_image2tabletop_server(args) + client_url = resolve_image2scene_client_url(args, server) + runtime_gen_config = _stage_b_gen_config_with_client_url( + gen_config, + client_url, + output_root, + ) + command = [ + sys.executable, + str(script_path), + "--server", + server, + "--image", + str(image_path), + "--download-dir", + str(download_dir), + "--background", + args.background, + "--output-root", + str(output_root), + "--gen-config", + str(runtime_gen_config), + "--llm-config", + str(llm_config), + "--poll-interval", + str(args.poll_interval), + ] + if extract_dir is not None: + command.extend(["--extract-dir", str(extract_dir)]) + if merged_output is not None: + command.extend(["--merged-output", str(merged_output)]) + + print("Running image2scene pipeline:") + print(shlex.join(command), flush=True) + completed = subprocess.run( + command, + cwd=image2scene_root, + check=False, + env=_image2scene_subprocess_env(), + ) + if completed.returncode != 0: + raise RuntimeError( + f"image2scene pipeline failed with exit code {completed.returncode}" + ) + + if merged_output is not None: + if not merged_output.is_file(): + raise FileNotFoundError( + f"image2scene merged output not found: {merged_output}" + ) + print(f"Using image2scene merged gym config: {merged_output}", flush=True) + return merged_output + + after_configs = collect_merged_gym_configs(download_dir) + new_configs = [path for path in after_configs if path not in before_configs] + if 
new_configs: + merged_config = latest_path(new_configs) + elif after_configs: + merged_config = latest_path(after_configs) + else: + raise FileNotFoundError( + f"gym_config_merged.json not found under: {download_dir}" + ) + + print(f"Using image2scene merged gym config: {merged_config}", flush=True) + return merged_config + + +def resolve_image2tabletop_server(args: argparse.Namespace) -> str: + server = str(args.server or os.getenv("IMAGE2TABLETOP_SERVER") or "").strip() + if not server: + raise ValueError( + "Image2Tabletop API server is required for this mode. Pass --server " + "or set IMAGE2TABLETOP_SERVER." + ) + return server.rstrip("/") + + +def resolve_image2scene_client_url(args: argparse.Namespace, server: str) -> str: + client_url = str(getattr(args, "image2scene_client_url", "") or "").strip() + if client_url: + return client_url.rstrip("/") + return server.rstrip("/") + + +def _stage_b_gen_config_with_client_url( + gen_config: Path, + client_url: str, + output_root: Path, +) -> Path: + normalized_client_url = str(client_url or "").strip().rstrip("/") + if not normalized_client_url: + return gen_config + + config = json.loads(gen_config.read_text(encoding="utf-8")) + if config.get("DEFAULT_CLIENT_URL") == normalized_client_url: + return gen_config + + runtime_dir = output_root / ".image2scene_runtime" + runtime_dir.mkdir(parents=True, exist_ok=True) + runtime_config = runtime_dir / "gen_config.json" + config["DEFAULT_CLIENT_URL"] = normalized_client_url + runtime_config.write_text( + json.dumps(config, ensure_ascii=False, indent=2) + "\n", + encoding="utf-8", + ) + return runtime_config + + +def _image2scene_subprocess_env() -> dict[str, str]: + env = scrub_usage_tracking_env(os.environ) + try: + from embodichain.gen_sim.action_agent_pipeline.utils.llm_config import ( + get_openai_compatible_llm_config, + ) + except Exception: + return env + + cfg = get_openai_compatible_llm_config( + required=False, + require_base_url=False, + ) + env_updates = { + 
"IMAGE2TABLETOP_LLM_API_KEY": cfg.get("api_key"), + "IMAGE2TABLETOP_LLM_MODEL": cfg.get("model"), + "IMAGE2TABLETOP_LLM_BASE_URL": cfg.get("base_url"), + } + for key, value in env_updates.items(): + if value and key not in env: + env[key] = str(value) + if cfg.get("api_key"): + print( + "Using shared LLM config for image2scene subprocess: " + f"model={cfg.get('model') or ''}", + flush=True, + ) + return env + + +def _resolve_image2scene_image( + args: argparse.Namespace, + image2scene_root: Path, +) -> Path: + if args.image_name: + image_path = Path(args.image_name).expanduser() + if image_path.suffix: + if not image_path.is_absolute() and image_path.parent == Path("."): + return (DEFAULT_IMAGE_DIR / image_path).resolve() + return image_path.resolve() + candidates = [ + DEFAULT_IMAGE_DIR / f"{args.image_name}{suffix}" + for suffix in IMAGE_SUFFIXES + ] + existing = [path for path in candidates if path.is_file()] + if existing: + return existing[0].resolve() + searched = ", ".join(path.as_posix() for path in candidates) + raise FileNotFoundError( + f"Image name '{args.image_name}' was not found. Tried: {searched}" + ) + + image_input = args.image or DEFAULT_IMAGE2SCENE_IMAGE + image_path = Path(image_input).expanduser() + if image_path.is_absolute(): + return image_path.resolve() + return (image2scene_root / image_path).resolve() diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_args.py b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_args.py new file mode 100644 index 00000000..1495ac5d --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_args.py @@ -0,0 +1,381 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import argparse + +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_defaults import ( + DEFAULT_CONFIG_OUTPUT_DIR, + DEFAULT_EXISTING_GYM_PROJECT, + DEFAULT_GYM_PROJECT_ROOT, + DEFAULT_IMAGE, + DEFAULT_IMAGE2SCENE_CONFIG, + DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR, + DEFAULT_IMAGE2SCENE_IMAGE, + DEFAULT_IMAGE2SCENE_OUTPUT_ROOT, + DEFAULT_IMAGE2SCENE_ROOT, + DEFAULT_JOB_TIMEOUT_S, + DEFAULT_PIPELINE_HISTORY, + DEFAULT_TASK_NAME, +) + +__all__ = ["build_parser"] + + +def build_parser() -> argparse.ArgumentParser: + """Build the one-shot action-agent pipeline argument parser.""" + parser = argparse.ArgumentParser( + description=( + "Generate a tabletop gym project from one image, generate action-agent " + "configs from that project, then run the generated task." + ) + ) + image_group = parser.add_mutually_exclusive_group() + image_group.add_argument( + "--image", + default=None, + help=( + f"Input image path. If omitted, defaults to {DEFAULT_IMAGE.as_posix()} " + f"or {DEFAULT_IMAGE2SCENE_IMAGE} with --use-image2scene." + ), + ) + image_group.add_argument( + "--image-name", + "--image_name", + dest="image_name", + default=None, + help=( + "Image file name under the default image directory. The suffix is " + 'optional, e.g. "demo6" resolves to demo6.jpg.' + ), + ) + parser.add_argument( + "--server", + default=None, + help="Image2Tabletop API server. 
Defaults to IMAGE2TABLETOP_SERVER.", + ) + parser.add_argument( + "--use-image2scene", + action="store_true", + default=False, + help=( + "Use gym_project/environment/image2tabletop/demo_api/client/" + "image2scene_pipeline.py as the first stage and continue from its " + "gym_config_merged.json output." + ), + ) + parser.add_argument( + "--background", + default=None, + help=( + "Background description passed to image2scene_pipeline.py. Required " + "with --use-image2scene." + ), + ) + parser.add_argument( + "--image2scene-root", + default=str(DEFAULT_IMAGE2SCENE_ROOT), + help=( + "Working directory for image2scene_pipeline.py. Defaults to " + f"{DEFAULT_IMAGE2SCENE_ROOT.as_posix()}" + ), + ) + parser.add_argument( + "--image2scene-download-dir", + default=DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR, + help=( + "Download directory passed to image2scene_pipeline.py. Relative " + "paths are interpreted under --image2scene-root. Defaults to " + f"{DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR}." + ), + ) + parser.add_argument( + "--image2scene-output-root", + default=DEFAULT_IMAGE2SCENE_OUTPUT_ROOT, + help=( + "Generated EC project directory passed to image2scene_pipeline.py. " + "Relative paths are interpreted under --image2scene-root. Defaults " + f"to {DEFAULT_IMAGE2SCENE_OUTPUT_ROOT}." + ), + ) + parser.add_argument( + "--image2scene-gen-config", + default=DEFAULT_IMAGE2SCENE_CONFIG, + help=( + "Generation config passed to image2scene_pipeline.py. Relative " + "paths are interpreted under --image2scene-root. Defaults to " + f"{DEFAULT_IMAGE2SCENE_CONFIG}." + ), + ) + parser.add_argument( + "--image2scene-client-url", + default=None, + help=( + "MesaTask/TextToScene service URL passed to image2scene Stage B. " + "If omitted, defaults to --server." + ), + ) + parser.add_argument( + "--image2scene-llm-config", + default=DEFAULT_IMAGE2SCENE_CONFIG, + help=( + "LLM config passed to image2scene_pipeline.py. Relative paths are " + "interpreted under --image2scene-root. 
Defaults to " + f"{DEFAULT_IMAGE2SCENE_CONFIG}." + ), + ) + parser.add_argument( + "--image2scene-extract-dir", + default=None, + help=( + "Optional extract directory passed to image2scene_pipeline.py. " + "Relative paths are interpreted under --image2scene-root." + ), + ) + parser.add_argument( + "--image2scene-merged-output", + default=None, + help=( + "Optional merged output path passed to image2scene_pipeline.py. " + "Relative paths are interpreted under --image2scene-root." + ), + ) + parser.add_argument( + "--gym-project-root", + default=str(DEFAULT_GYM_PROJECT_ROOT), + help=( + "Directory where Image2Tabletop generated gym projects are written. " + f"Defaults to {DEFAULT_GYM_PROJECT_ROOT.as_posix()}" + ), + ) + parser.add_argument( + "--use-existing-gym-project", + action="store_true", + default=False, + help=( + "Skip Image2Tabletop API and start from --gym-project. Defaults to " + "false." + ), + ) + parser.add_argument( + "--base-task-name", + "--base_task_name", + dest="base_task_name", + default=None, + help=( + "Start from the latest pipeline history entry with this task name. " + "Use this to chain demos, e.g. demo2 based on Demo1_Text." + ), + ) + parser.add_argument( + "--base-history-index", + "--base_history_index", + dest="base_history_index", + type=int, + default=None, + help=( + "Start from a specific pipeline history index. When used with " + "--base-task-name, the history entry must match that task name." + ), + ) + parser.add_argument( + "--gym-project", + "--gym_project", + dest="gym_project", + default=str(DEFAULT_EXISTING_GYM_PROJECT), + help=( + "Existing gym project used with --use-existing-gym-project. " + f"Defaults to {DEFAULT_EXISTING_GYM_PROJECT.as_posix()}" + ), + ) + parser.add_argument( + "--config-output-dir", + "--output_dir", + dest="config_output_dir", + default=str(DEFAULT_CONFIG_OUTPUT_DIR), + help=( + "Destination directory for generated config files. 
Defaults to " + f"{DEFAULT_CONFIG_OUTPUT_DIR.as_posix()}" + ), + ) + parser.add_argument( + "--pipeline-history-path", + "--pipeline_history_path", + dest="pipeline_history_path", + default=str(DEFAULT_PIPELINE_HISTORY), + help=( + "Global pipeline history JSON path. Defaults to " + f"{DEFAULT_PIPELINE_HISTORY.as_posix()}" + ), + ) + parser.add_argument( + "--task_name", + "--task-name", + dest="task_name", + default=DEFAULT_TASK_NAME, + help=f"Task name passed to run_agent. Defaults to {DEFAULT_TASK_NAME}", + ) + parser.add_argument( + "--task_description", + "--task-description", + dest="task_description", + default="", + help=( + 'Task description passed to config generation. Defaults to "". ' + "Ignored for default-template tasks such as Demo1_Text." + ), + ) + parser.add_argument( + "--target_body_scale", + "--target-body-scale", + dest="target_body_scale", + type=float, + default=0.8, + help=( + "Uniform body_scale for generated target objects. Basket-like " + "containers keep their source body_scale. Defaults to 0.8." + ), + ) + parser.add_argument( + "--target_replacement1", + "--target-replacement1", + nargs="+", + metavar="SOURCE_OR_PROMPT", + default=None, + help=( + "Generate /mesh_assets/new1 from PROMPT. Accepts either " + "PROMPT, which auto-selects the lower-y duplicated rigid " + "object, or SOURCE_UID PROMPT for explicit selection." + ), + ) + parser.add_argument( + "--target_replacement2", + "--target-replacement2", + nargs="+", + metavar="SOURCE_OR_PROMPT", + default=None, + help=( + "Generate /mesh_assets/new2 from PROMPT. Accepts either " + "PROMPT, which auto-selects the higher-y duplicated rigid " + "object, or SOURCE_UID PROMPT for explicit selection." + ), + ) + parser.add_argument( + "--sync_replacement_names", + "--sync-replacement-names", + action="store_true", + default=False, + help=( + "Also update replacement target runtime UIDs and generated prompts " + "from the replacement prompts." 
+ ), + ) + parser.add_argument( + "--reuse-target-replacements", + "--reuse_target_replacements", + dest="reuse_target_replacements", + action=argparse.BooleanOptionalAction, + default=True, + help=( + "Reuse existing prompt-generated replacement GLBs when the prompt " + "and expected output name match. Defaults to true." + ), + ) + parser.add_argument( + "--prewarm-coacd-cache", + "--prewarm_coacd_cache", + dest="prewarm_coacd_cache", + action=argparse.BooleanOptionalAction, + default=True, + help=( + "Precompute environment CoACD cache files during config generation. " + "Defaults to true." + ), + ) + parser.add_argument( + "--poll-interval", + type=float, + default=10.0, + help="Image2Tabletop job polling interval in seconds. Defaults to 10.0.", + ) + parser.add_argument( + "--job-timeout-s", + "--job_timeout_s", + dest="job_timeout_s", + type=float, + default=DEFAULT_JOB_TIMEOUT_S, + help="Maximum seconds to wait for Image2Tabletop API jobs.", + ) + parser.add_argument( + "--skip-health-check", + action="store_true", + default=False, + help="Skip GET /health before submitting the image.", + ) + parser.add_argument( + "--overwrite-gym-project", + action="store_true", + default=False, + help="Replace an existing generated gym project with the same name.", + ) + parser.add_argument( + "--overwrite-config", + action=argparse.BooleanOptionalAction, + default=True, + help="Overwrite generated config files. Defaults to true.", + ) + parser.add_argument( + "--regenerate", + action=argparse.BooleanOptionalAction, + default=True, + help="Pass --regenerate to run_agent. Defaults to true.", + ) + parser.add_argument( + "--skip-run-agent", + action="store_true", + default=False, + help="Stop after generating config files instead of launching run_agent.", + ) + parser.add_argument( + "--llm-usage-output", + default=None, + help=( + "JSONL path for local LLM token usage records. Defaults to " + "/llm_usage.jsonl." 
+ ), + ) + parser.add_argument( + "--llm-usage-summary-output", + default=None, + help=( + "JSON path for the aggregated local LLM token usage summary. " + "Defaults to /llm_usage_summary.json." + ), + ) + parser.add_argument( + "--llm-usage-run-id", + default=None, + help="Optional run id written into local LLM token usage records.", + ) + parser.add_argument( + "--no-llm-usage", + dest="llm_usage", + action="store_false", + default=True, + help="Disable local LLM token usage recording for this pipeline run.", + ) + return parser diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_defaults.py b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_defaults.py new file mode 100644 index 00000000..333135ff --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_defaults.py @@ -0,0 +1,74 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from pathlib import Path + +__all__ = [ + "DEFAULT_ACTION_AGENT_WORKSPACE", + "DEFAULT_CONFIG_OUTPUT_DIR", + "DEFAULT_EXISTING_GYM_PROJECT", + "DEFAULT_GYM_PROJECT_ROOT", + "DEFAULT_IMAGE", + "DEFAULT_IMAGE_DIR", + "DEFAULT_IMAGE2SCENE_CONFIG", + "DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR", + "DEFAULT_IMAGE2SCENE_IMAGE", + "DEFAULT_IMAGE2SCENE_OUTPUT_ROOT", + "DEFAULT_IMAGE2SCENE_ROOT", + "DEFAULT_JOB_TIMEOUT_S", + "DEFAULT_PIPELINE_HISTORY", + "DEFAULT_TASK_NAME", + "DEFAULT_TASK_TEMPLATE_NAMES", + "GYM_CONFIG_PREFERENCE", + "IMAGE_SUFFIXES", + "PIPELINE_HISTORY_SCHEMA_VERSION", + "PIPELINE_MANIFEST_FILENAME", + "REPO_ROOT", +] + + +def _repo_root() -> Path: + current = Path(__file__).resolve() + for parent in current.parents: + if (parent / "setup.py").is_file() and (parent / "embodichain").is_dir(): + return parent + return Path.cwd().resolve() + + +REPO_ROOT = _repo_root() +DEFAULT_JOB_TIMEOUT_S = 1800.0 +DEFAULT_GYM_PROJECT_ROOT = REPO_ROOT / "gym_project" +DEFAULT_ACTION_AGENT_WORKSPACE = DEFAULT_GYM_PROJECT_ROOT / "action_agent_pipeline" +DEFAULT_IMAGE = DEFAULT_ACTION_AGENT_WORKSPACE / "images/demo1.jpg" +DEFAULT_IMAGE_DIR = DEFAULT_IMAGE.parent +DEFAULT_EXISTING_GYM_PROJECT = DEFAULT_GYM_PROJECT_ROOT / "1780562837_gym_project" +DEFAULT_IMAGE2SCENE_ROOT = REPO_ROOT / "gym_project/environment/image2tabletop" +DEFAULT_IMAGE2SCENE_IMAGE = "scene_image/robotwin_example.png" +DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR = "./downloads" +DEFAULT_IMAGE2SCENE_OUTPUT_ROOT = "./generated" +DEFAULT_IMAGE2SCENE_CONFIG = "./gen_config.json" +DEFAULT_CONFIG_OUTPUT_DIR = DEFAULT_ACTION_AGENT_WORKSPACE / "configs/demo3_text" +DEFAULT_PIPELINE_HISTORY = ( + DEFAULT_ACTION_AGENT_WORKSPACE / "configs/pipeline_history.json" +) +DEFAULT_TASK_NAME = "Demo3_Text" +DEFAULT_TASK_TEMPLATE_NAMES = frozenset({"Demo1_Text"}) +IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".webp", ".bmp") 
+GYM_CONFIG_PREFERENCE = ("gym_config_merged.json", "gym_config.json") +PIPELINE_HISTORY_SCHEMA_VERSION = 1 +PIPELINE_MANIFEST_FILENAME = "pipeline_manifest.json" diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_records.py b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_records.py index 79d5a189..0c80d1e6 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_records.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_records.py @@ -20,7 +20,7 @@ import argparse from collections.abc import Sequence -from datetime import datetime +from datetime import datetime, timezone import hashlib import json from pathlib import Path @@ -158,7 +158,7 @@ def build_pipeline_record( source_sha256 = _file_sha256(source_gym_config) record: dict[str, Any] = { "schema_version": schema_version, - "created_at": datetime.now().astimezone().isoformat(timespec="seconds"), + "created_at": datetime.now(timezone.utc).isoformat(timespec="milliseconds"), "task_name": args.task_name, "source_mode": resolution.mode, "source_id": f"gym_config_sha256:{source_sha256}", @@ -291,6 +291,7 @@ def _source_request_record( "image2scene_download_dir": str(args.image2scene_download_dir), "image2scene_output_root": str(args.image2scene_output_root), "image2scene_gen_config": str(args.image2scene_gen_config), + "image2scene_client_url": args.image2scene_client_url or args.server, "image2scene_llm_config": str(args.image2scene_llm_config), } ) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_runner.py b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_runner.py new file mode 100644 index 00000000..117ec2b7 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_runner.py @@ -0,0 +1,107 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import argparse +import sys + +from embodichain.gen_sim.action_agent_pipeline.cli.agent_run_stage import ( + run_agent_command, +) +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_defaults import ( + PIPELINE_HISTORY_SCHEMA_VERSION, + PIPELINE_MANIFEST_FILENAME, + REPO_ROOT, +) +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_records import ( + write_pipeline_manifests, +) +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_usage import ( + configure_llm_usage_tracking, + write_llm_usage_summary, +) +from embodichain.gen_sim.action_agent_pipeline.cli.project_resolution import ( + resolve_gym_project, + resolve_task_description_for_generation, +) +from embodichain.gen_sim.action_agent_pipeline.cli.target_replacements import ( + resolve_target_replacements, +) + +__all__ = ["run_pipeline"] + + +def run_pipeline(args: argparse.Namespace) -> int: + """Run image/project resolution, config generation, and optional task execution.""" + _ensure_repo_on_pythonpath() + from embodichain.gen_sim.action_agent_pipeline.generation.action_agent_config import ( + TargetReplacementSpec, + generate_action_agent_config_from_project, + ) + + resolution = resolve_gym_project(args) + usage_paths = configure_llm_usage_tracking(args) + target_replacements = resolve_target_replacements( + args, + 
TargetReplacementSpec, + resolution.path, + ) + task_description = resolve_task_description_for_generation(args) + args.task_description = task_description or "" + + paths = generate_action_agent_config_from_project( + gym_project=resolution.path, + output_dir=args.config_output_dir, + task_name=args.task_name, + task_description=task_description, + target_body_scale=args.target_body_scale, + target_replacements=target_replacements, + sync_replacement_names=args.sync_replacement_names, + reuse_target_replacements=args.reuse_target_replacements, + prewarm_coacd_cache=args.prewarm_coacd_cache, + overwrite=args.overwrite_config, + ) + write_pipeline_manifests( + args=args, + resolution=resolution, + generated_paths=paths, + target_replacements=target_replacements, + repo_root=REPO_ROOT, + schema_version=PIPELINE_HISTORY_SCHEMA_VERSION, + manifest_filename=PIPELINE_MANIFEST_FILENAME, + ) + + print(f"Using gym project/config: {resolution.path}", flush=True) + print(f"Generated gym config: {paths.gym_config}", flush=True) + print(f"Generated agent config: {paths.agent_config}", flush=True) + if args.skip_run_agent: + write_llm_usage_summary(usage_paths) + return 0 + + return_code = run_agent_command( + task_name=args.task_name, + gym_config=paths.gym_config, + agent_config=paths.agent_config, + regenerate=args.regenerate, + ) + write_llm_usage_summary(usage_paths) + return return_code + + +def _ensure_repo_on_pythonpath() -> None: + if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_usage.py b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_usage.py new file mode 100644 index 00000000..7831e2db --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/pipeline_usage.py @@ -0,0 +1,89 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import argparse +from datetime import datetime, timezone +from pathlib import Path + +__all__ = ["configure_llm_usage_tracking", "write_llm_usage_summary"] + + +def configure_llm_usage_tracking( + args: argparse.Namespace, +) -> tuple[Path, Path] | None: + if not args.llm_usage: + from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( + disable_usage_tracking, + ) + + disable_usage_tracking() + return None + + from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( + configure_usage_tracking, + ) + + output_dir = Path(args.config_output_dir).expanduser().resolve() + usage_path = ( + Path(args.llm_usage_output).expanduser().resolve() + if args.llm_usage_output + else output_dir / "llm_usage.jsonl" + ) + summary_path = ( + Path(args.llm_usage_summary_output).expanduser().resolve() + if args.llm_usage_summary_output + else output_dir / "llm_usage_summary.json" + ) + run_id = args.llm_usage_run_id or (f"{args.task_name}_{_utc_run_timestamp()}") + configure_usage_tracking( + usage_path=usage_path, + run_id=run_id, + process_name="run_agent_pipeline", + reset=True, + ) + print(f"Recording local LLM token usage: {usage_path}", flush=True) + print(f"Local LLM token usage summary: {summary_path}", flush=True) + return usage_path, summary_path + + +def 
write_llm_usage_summary(usage_paths: tuple[Path, Path] | None) -> None: + if usage_paths is None: + return + + from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( + write_usage_summary, + ) + + usage_path, summary_path = usage_paths + summary = write_usage_summary( + usage_path=usage_path, + summary_path=summary_path, + ) + total = summary["total"] + print( + "Local LLM token usage total: " + f"calls={total['calls']}, " + f"input={total['input_tokens']}, " + f"output={total['output_tokens']}, " + f"total={total['total_tokens']}", + flush=True, + ) + + +def _utc_run_timestamp() -> str: + return datetime.now(timezone.utc).isoformat(timespec="milliseconds") diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/project_resolution.py b/embodichain/gen_sim/action_agent_pipeline/cli/project_resolution.py new file mode 100644 index 00000000..458da39d --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/project_resolution.py @@ -0,0 +1,230 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import argparse +from collections.abc import Callable +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from embodichain.gen_sim.action_agent_pipeline.cli.image2scene_stage import ( + resolve_image2tabletop_server, + run_image2scene_pipeline, +) +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_defaults import ( + DEFAULT_IMAGE, + DEFAULT_TASK_TEMPLATE_NAMES, + IMAGE_SUFFIXES, + PIPELINE_HISTORY_SCHEMA_VERSION, + REPO_ROOT, +) +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_records import ( + find_history_entry_by_index, + history_entry_has_source, + history_entry_index, + path_from_history_entry, + pipeline_history_path, + read_pipeline_history, +) + +__all__ = [ + "ProjectResolution", + "resolve_gym_project", + "resolve_task_description_for_generation", +] + +_DEFAULT_IMAGE_DIR = DEFAULT_IMAGE.parent + + +@dataclass(frozen=True) +class ProjectResolution: + path: Path + mode: str + base_history: dict[str, Any] | None = None + + +def resolve_task_description_for_generation(args: argparse.Namespace) -> str | None: + task_description = str(args.task_description or "").strip() + if args.task_name in DEFAULT_TASK_TEMPLATE_NAMES: + if task_description: + print( + f"Ignoring --task_description for {args.task_name}; " + "using the default basket task template.", + flush=True, + ) + return None + return task_description or None + + +def resolve_gym_project(args: argparse.Namespace) -> ProjectResolution: + use_history = args.base_task_name is not None or args.base_history_index is not None + selected_modes = [ + args.use_image2scene, + args.use_existing_gym_project, + use_history, + ] + if sum(bool(mode) for mode in selected_modes) > 1: + raise ValueError( + "Use only one of --use-image2scene, --use-existing-gym-project, " + "or --base-task-name/--base-history-index." 
+ ) + + if args.use_existing_gym_project: + project_path = Path(args.gym_project).expanduser().resolve() + if not project_path.exists(): + raise FileNotFoundError(f"gym project not found: {project_path}") + print(f"Using existing gym project: {project_path}", flush=True) + return ProjectResolution(path=project_path, mode="existing_gym_project") + + if args.use_image2scene: + return ProjectResolution( + path=run_image2scene_pipeline(args), + mode="image2scene", + ) + + if use_history: + history_entry = _resolve_base_history_entry(args) + project_path = path_from_history_entry(history_entry, repo_root=REPO_ROOT) + print( + "Using base history " + f"#{history_entry.get('index')} ({history_entry.get('task_name')}): " + f"{project_path}", + flush=True, + ) + return ProjectResolution( + path=project_path, + mode="history", + base_history=history_entry, + ) + + from embodichain.gen_sim.action_agent_pipeline.gym_project_api.image2tabletop_client import ( + check_health, + collect_image_paths, + process_image, + ) + + image_input = _resolve_image_input(args) + image_path = _resolve_single_image(str(image_input), collect_image_paths) + server = resolve_image2tabletop_server(args) + if not args.skip_health_check: + check_health(server) + + return ProjectResolution( + path=process_image( + server=server, + image_path=image_path, + output_root=Path(args.gym_project_root), + poll_interval=args.poll_interval, + overwrite=args.overwrite_gym_project, + job_timeout_s=args.job_timeout_s, + ), + mode="image2tabletop", + ) + + +def _resolve_base_history_entry(args: argparse.Namespace) -> dict[str, Any]: + if args.base_history_index is not None and args.base_history_index <= 0: + raise ValueError("--base-history-index must be a positive integer.") + + history_path = pipeline_history_path(args) + history = read_pipeline_history( + history_path, + schema_version=PIPELINE_HISTORY_SCHEMA_VERSION, + ) + runs = history["runs"] + + if args.base_history_index is not None: + entry = 
find_history_entry_by_index(runs, args.base_history_index) + if entry is None: + raise ValueError( + f"Pipeline history index not found: {args.base_history_index}" + ) + if args.base_task_name and entry.get("task_name") != args.base_task_name: + raise ValueError( + "Pipeline history entry " + f"#{args.base_history_index} has task_name={entry.get('task_name')!r}, " + f"expected {args.base_task_name!r}." + ) + return dict(entry) + + if not args.base_task_name: + raise ValueError("--base-task-name is required without --base-history-index.") + + candidates = [ + entry + for entry in runs + if entry.get("task_name") == args.base_task_name + and history_entry_has_source(entry) + ] + if not candidates: + raise ValueError( + "No pipeline history entry found for task_name=" + f"{args.base_task_name!r} in {history_path}" + ) + return dict(max(candidates, key=history_entry_index)) + + +def _resolve_single_image( + image_input: str, + collect_image_paths: Callable[[Path], list[Path]], +) -> Path: + image_paths = collect_image_paths(Path(image_input)) + if len(image_paths) != 1: + paths = ", ".join(path.as_posix() for path in image_paths) + raise ValueError( + "This pipeline expects exactly one image, but got " + f"{len(image_paths)}: {paths}" + ) + return image_paths[0] + + +def _resolve_image_input(args: argparse.Namespace) -> Path: + if args.image_name: + return _resolve_image_name(args.image_name) + if args.image: + return Path(args.image) + return DEFAULT_IMAGE + + +def _resolve_image_name(image_name: str) -> Path: + image_path = Path(image_name) + if image_path.parent != Path("."): + raise ValueError( + "--image-name only accepts a file name under " + f"{_DEFAULT_IMAGE_DIR.as_posix()}. Use --image for a full path." 
+ ) + if image_path.suffix: + return _DEFAULT_IMAGE_DIR / image_path + + matches = [ + _DEFAULT_IMAGE_DIR / f"{image_name}{suffix}" for suffix in IMAGE_SUFFIXES + ] + existing = [path for path in matches if path.exists()] + if len(existing) == 1: + return existing[0] + if not existing: + candidates = ", ".join(path.name for path in matches) + raise FileNotFoundError( + f"Image name {image_name!r} was not found. Tried: {candidates}" + ) + + matched = ", ".join(path.name for path in existing) + raise ValueError( + f"Image name {image_name!r} is ambiguous. Use --image-name with a suffix: " + f"{matched}" + ) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py index 999996e4..363bfe88 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent.py @@ -24,7 +24,7 @@ import tqdm from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.agent_env import ( # noqa: F401 - AtomicActionsAgentEnv, + AgenticGenSimEnv, ) from embodichain.lab.gym.utils.gym_utils import ( add_env_launcher_args_to_parser, diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py index ac08b311..1db79064 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/run_agent_pipeline.py @@ -19,1310 +19,14 @@ from __future__ import annotations -import argparse -from collections.abc import Callable -from dataclasses import dataclass -from datetime import datetime -import json -import os -from pathlib import Path -import re -import shlex -import subprocess -import sys -from typing import Any - - -def _repo_root() -> Path: - current = Path(__file__).resolve() - for parent in current.parents: - if (parent / "setup.py").is_file() and (parent / "embodichain").is_dir(): - return 
parent - return Path.cwd().resolve() - +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_args import build_parser +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_runner import run_pipeline __all__ = ["main"] -_REPO_ROOT = _repo_root() -if str(_REPO_ROOT) not in sys.path: - sys.path.insert(0, str(_REPO_ROOT)) - -from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_records import ( - find_history_entry_by_index as _records_find_history_entry_by_index, - history_entry_has_source as _records_history_entry_has_source, - history_entry_index as _records_history_entry_index, - path_from_history_entry as _records_path_from_history_entry, - pipeline_history_path as _records_pipeline_history_path, - read_pipeline_history as _records_read_pipeline_history, - resolve_source_gym_config as _records_resolve_source_gym_config, - write_pipeline_manifests as _records_write_pipeline_manifests, -) - -_DEFAULT_SERVER = "http://192.168.3.23:4523" -_DEFAULT_GYM_PROJECT_ROOT = _REPO_ROOT / "gym_project" -_DEFAULT_ACTION_AGENT_WORKSPACE = _DEFAULT_GYM_PROJECT_ROOT / "action_agent_pipeline" -_DEFAULT_IMAGE = _DEFAULT_ACTION_AGENT_WORKSPACE / "images/demo1.jpg" -_DEFAULT_IMAGE_DIR = _DEFAULT_IMAGE.parent -_DEFAULT_EXISTING_GYM_PROJECT = _DEFAULT_GYM_PROJECT_ROOT / "1780562837_gym_project" -_DEFAULT_IMAGE2SCENE_ROOT = _REPO_ROOT / "gym_project/environment/image2tabletop" -_DEFAULT_IMAGE2SCENE_IMAGE = "scene_image/robotwin_example.png" -_DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR = "./downloads" -_DEFAULT_IMAGE2SCENE_OUTPUT_ROOT = "./generated" -_DEFAULT_IMAGE2SCENE_CONFIG = "./gen_config.json" -_DEFAULT_CONFIG_OUTPUT_DIR = _DEFAULT_ACTION_AGENT_WORKSPACE / "configs/demo3_text" -_DEFAULT_PIPELINE_HISTORY = ( - _DEFAULT_ACTION_AGENT_WORKSPACE / "configs/pipeline_history.json" -) -_DEFAULT_TASK_NAME = "Demo3_Text" -_DEFAULT_TASK_TEMPLATE_NAMES = frozenset({"Demo1_Text"}) -_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".webp", ".bmp") -_GYM_CONFIG_PREFERENCE = 
("gym_config_merged.json", "gym_config.json") -_PIPELINE_HISTORY_SCHEMA_VERSION = 1 -_PIPELINE_MANIFEST_FILENAME = "pipeline_manifest.json" -_INDEXED_REPLACEMENT_ALIAS_RE = re.compile( - r"^(?P[A-Za-z][A-Za-z0-9 _-]*?)[ _-]?(?P[0-9]+)$" -) - - -@dataclass(frozen=True) -class ProjectResolution: - path: Path - mode: str - base_history: dict[str, Any] | None = None - - -def _build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - description=( - "Generate a tabletop gym project from one image, generate action-agent " - "configs from that project, then run the generated task." - ) - ) - image_group = parser.add_mutually_exclusive_group() - image_group.add_argument( - "--image", - default=None, - help=( - f"Input image path. If omitted, defaults to {_DEFAULT_IMAGE.as_posix()} " - f"or {_DEFAULT_IMAGE2SCENE_IMAGE} with --use-image2scene." - ), - ) - image_group.add_argument( - "--image-name", - "--image_name", - dest="image_name", - default=None, - help=( - "Image file name under the default image directory. The suffix is " - 'optional, e.g. "demo6" resolves to demo6.jpg.' - ), - ) - parser.add_argument( - "--server", - default=_DEFAULT_SERVER, - help=f"Image2Tabletop API server. Defaults to {_DEFAULT_SERVER}", - ) - parser.add_argument( - "--use-image2scene", - action="store_true", - default=False, - help=( - "Use gym_project/environment/image2tabletop/demo_api/client/" - "image2scene_pipeline.py as the first stage and continue from its " - "gym_config_merged.json output." - ), - ) - parser.add_argument( - "--background", - default=None, - help=( - "Background description passed to image2scene_pipeline.py. Required " - "with --use-image2scene." - ), - ) - parser.add_argument( - "--image2scene-root", - default=str(_DEFAULT_IMAGE2SCENE_ROOT), - help=( - "Working directory for image2scene_pipeline.py. 
Defaults to " - f"{_DEFAULT_IMAGE2SCENE_ROOT.as_posix()}" - ), - ) - parser.add_argument( - "--image2scene-download-dir", - default=_DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR, - help=( - "Download directory passed to image2scene_pipeline.py. Relative " - "paths are interpreted under --image2scene-root. Defaults to " - f"{_DEFAULT_IMAGE2SCENE_DOWNLOAD_DIR}." - ), - ) - parser.add_argument( - "--image2scene-output-root", - default=_DEFAULT_IMAGE2SCENE_OUTPUT_ROOT, - help=( - "Generated EC project directory passed to image2scene_pipeline.py. " - "Relative paths are interpreted under --image2scene-root. Defaults " - f"to {_DEFAULT_IMAGE2SCENE_OUTPUT_ROOT}." - ), - ) - parser.add_argument( - "--image2scene-gen-config", - default=_DEFAULT_IMAGE2SCENE_CONFIG, - help=( - "Generation config passed to image2scene_pipeline.py. Relative " - "paths are interpreted under --image2scene-root. Defaults to " - f"{_DEFAULT_IMAGE2SCENE_CONFIG}." - ), - ) - parser.add_argument( - "--image2scene-llm-config", - default=_DEFAULT_IMAGE2SCENE_CONFIG, - help=( - "LLM config passed to image2scene_pipeline.py. Relative paths are " - "interpreted under --image2scene-root. Defaults to " - f"{_DEFAULT_IMAGE2SCENE_CONFIG}." - ), - ) - parser.add_argument( - "--image2scene-extract-dir", - default=None, - help=( - "Optional extract directory passed to image2scene_pipeline.py. " - "Relative paths are interpreted under --image2scene-root." - ), - ) - parser.add_argument( - "--image2scene-merged-output", - default=None, - help=( - "Optional merged output path passed to image2scene_pipeline.py. " - "Relative paths are interpreted under --image2scene-root." - ), - ) - parser.add_argument( - "--gym-project-root", - default=str(_DEFAULT_GYM_PROJECT_ROOT), - help=( - "Directory where Image2Tabletop generated gym projects are written. 
" - f"Defaults to {_DEFAULT_GYM_PROJECT_ROOT.as_posix()}" - ), - ) - parser.add_argument( - "--use-existing-gym-project", - action="store_true", - default=False, - help=( - "Skip Image2Tabletop API and start from --gym-project. Defaults to " - "false." - ), - ) - parser.add_argument( - "--base-task-name", - "--base_task_name", - dest="base_task_name", - default=None, - help=( - "Start from the latest pipeline history entry with this task name. " - "Use this to chain demos, e.g. demo2 based on Demo1_Text." - ), - ) - parser.add_argument( - "--base-history-index", - "--base_history_index", - dest="base_history_index", - type=int, - default=None, - help=( - "Start from a specific pipeline history index. When used with " - "--base-task-name, the history entry must match that task name." - ), - ) - parser.add_argument( - "--gym-project", - "--gym_project", - dest="gym_project", - default=str(_DEFAULT_EXISTING_GYM_PROJECT), - help=( - "Existing gym project used with --use-existing-gym-project. " - f"Defaults to {_DEFAULT_EXISTING_GYM_PROJECT.as_posix()}" - ), - ) - parser.add_argument( - "--config-output-dir", - "--output_dir", - dest="config_output_dir", - default=str(_DEFAULT_CONFIG_OUTPUT_DIR), - help=( - "Destination directory for generated config files. Defaults to " - f"{_DEFAULT_CONFIG_OUTPUT_DIR.as_posix()}" - ), - ) - parser.add_argument( - "--pipeline-history-path", - "--pipeline_history_path", - dest="pipeline_history_path", - default=str(_DEFAULT_PIPELINE_HISTORY), - help=( - "Global pipeline history JSON path. Defaults to " - f"{_DEFAULT_PIPELINE_HISTORY.as_posix()}" - ), - ) - parser.add_argument( - "--task_name", - "--task-name", - dest="task_name", - default=_DEFAULT_TASK_NAME, - help=f"Task name passed to run_agent. Defaults to {_DEFAULT_TASK_NAME}", - ) - parser.add_argument( - "--task_description", - "--task-description", - dest="task_description", - default="", - help=( - 'Task description passed to config generation. Defaults to "". 
' - "Ignored for default-template tasks such as Demo1_Text." - ), - ) - parser.add_argument( - "--target_body_scale", - "--target-body-scale", - dest="target_body_scale", - type=float, - default=0.8, - help=( - "Uniform body_scale for generated target objects. Basket-like " - "containers keep their source body_scale. Defaults to 0.8." - ), - ) - parser.add_argument( - "--target_replacement1", - "--target-replacement1", - nargs="+", - metavar="SOURCE_OR_PROMPT", - default=None, - help=( - "Generate /mesh_assets/new1 from PROMPT. Accepts either " - "PROMPT, which auto-selects the lower-y duplicated rigid " - "object, or SOURCE_UID PROMPT for explicit selection." - ), - ) - parser.add_argument( - "--target_replacement2", - "--target-replacement2", - nargs="+", - metavar="SOURCE_OR_PROMPT", - default=None, - help=( - "Generate /mesh_assets/new2 from PROMPT. Accepts either " - "PROMPT, which auto-selects the higher-y duplicated rigid " - "object, or SOURCE_UID PROMPT for explicit selection." - ), - ) - parser.add_argument( - "--sync_replacement_names", - "--sync-replacement-names", - action="store_true", - default=False, - help=( - "Also update replacement target runtime UIDs and generated prompts " - "from the replacement prompts." - ), - ) - parser.add_argument( - "--reuse-target-replacements", - "--reuse_target_replacements", - dest="reuse_target_replacements", - action=argparse.BooleanOptionalAction, - default=True, - help=( - "Reuse existing prompt-generated replacement GLBs when the prompt " - "and expected output name match. Defaults to true." - ), - ) - parser.add_argument( - "--prewarm-coacd-cache", - "--prewarm_coacd_cache", - dest="prewarm_coacd_cache", - action=argparse.BooleanOptionalAction, - default=True, - help=( - "Precompute environment CoACD cache files during config generation. " - "Defaults to true." - ), - ) - parser.add_argument( - "--poll-interval", - type=float, - default=10.0, - help="Image2Tabletop job polling interval in seconds. 
Defaults to 10.0.", - ) - parser.add_argument( - "--skip-health-check", - action="store_true", - default=False, - help="Skip GET /health before submitting the image.", - ) - parser.add_argument( - "--overwrite-gym-project", - action="store_true", - default=False, - help="Replace an existing generated gym project with the same name.", - ) - parser.add_argument( - "--overwrite-config", - action=argparse.BooleanOptionalAction, - default=True, - help="Overwrite generated config files. Defaults to true.", - ) - parser.add_argument( - "--regenerate", - action=argparse.BooleanOptionalAction, - default=True, - help="Pass --regenerate to run_agent. Defaults to true.", - ) - parser.add_argument( - "--skip-run-agent", - action="store_true", - default=False, - help="Stop after generating config files instead of launching run_agent.", - ) - parser.add_argument( - "--llm-usage-output", - default=None, - help=( - "JSONL path for local LLM token usage records. Defaults to " - "/llm_usage.jsonl." - ), - ) - parser.add_argument( - "--llm-usage-summary-output", - default=None, - help=( - "JSON path for the aggregated local LLM token usage summary. " - "Defaults to /llm_usage_summary.json." 
- ), - ) - parser.add_argument( - "--llm-usage-run-id", - default=None, - help="Optional run id written into local LLM token usage records.", - ) - parser.add_argument( - "--no-llm-usage", - dest="llm_usage", - action="store_false", - default=True, - help="Disable local LLM token usage recording for this pipeline run.", - ) - return parser - - -def _ensure_repo_on_pythonpath() -> None: - if str(_REPO_ROOT) not in sys.path: - sys.path.insert(0, str(_REPO_ROOT)) - - -def _resolve_single_image( - image_input: str, - collect_image_paths: Callable[[Path], list[Path]], -) -> Path: - image_paths = collect_image_paths(Path(image_input)) - if len(image_paths) != 1: - paths = ", ".join(path.as_posix() for path in image_paths) - raise ValueError( - "This pipeline expects exactly one image, but got " - f"{len(image_paths)}: {paths}" - ) - return image_paths[0] - - -def _resolve_image_input(args: argparse.Namespace) -> Path: - if args.image_name: - return _resolve_image_name(args.image_name) - if args.image: - return Path(args.image) - return _DEFAULT_IMAGE - - -def _resolve_image_name(image_name: str) -> Path: - image_path = Path(image_name) - if image_path.parent != Path("."): - raise ValueError( - "--image-name only accepts a file name under " - f"{_DEFAULT_IMAGE_DIR.as_posix()}. Use --image for a full path." - ) - if image_path.suffix: - return _DEFAULT_IMAGE_DIR / image_path - - matches = [ - _DEFAULT_IMAGE_DIR / f"{image_name}{suffix}" for suffix in _IMAGE_SUFFIXES - ] - existing = [path for path in matches if path.exists()] - if len(existing) == 1: - return existing[0] - if not existing: - candidates = ", ".join(path.name for path in matches) - raise FileNotFoundError( - f"Image name {image_name!r} was not found. Tried: {candidates}" - ) - - matched = ", ".join(path.name for path in existing) - raise ValueError( - f"Image name {image_name!r} is ambiguous. 
Use --image-name with a suffix: " - f"{matched}" - ) - - -def _resolve_under_root(root: Path, path_input: str | None) -> Path | None: - if path_input is None: - return None - path = Path(path_input).expanduser() - if path.is_absolute(): - return path.resolve() - return (root / path).resolve() - - -def _image2scene_subprocess_env() -> dict[str, str]: - from embodichain.gen_sim.action_agent_pipeline.utils.llm_config import ( - get_openai_compatible_llm_config, - ) - from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( - scrub_usage_tracking_env, - ) - - env = scrub_usage_tracking_env() - cfg = get_openai_compatible_llm_config( - required=False, - require_base_url=False, - ) - env_overrides = { - "OPENAI_API_KEY": cfg.get("api_key"), - "OPENAI_MODEL": cfg.get("model"), - "OPENAI_BASE_URL": cfg.get("base_url"), - "EMBODICHAIN_LLM_PROXY": cfg.get("proxy_url"), - } - for name, value in env_overrides.items(): - if value: - env[name] = str(value) - - if cfg.get("model") or cfg.get("base_url"): - print( - "Using shared LLM config for image2scene subprocess: " - f"model={cfg.get('model')!r}, base_url={cfg.get('base_url')!r}", - flush=True, - ) - return env - - -def _resolve_task_description_for_generation(args: argparse.Namespace) -> str | None: - task_description = str(args.task_description or "").strip() - if args.task_name in _DEFAULT_TASK_TEMPLATE_NAMES: - if task_description: - print( - f"Ignoring --task_description for {args.task_name}; " - "using the default basket task template.", - flush=True, - ) - return None - return task_description or None - - -def _collect_merged_gym_configs(download_dir: Path) -> list[Path]: - if not download_dir.exists(): - return [] - return sorted( - path.resolve() for path in download_dir.rglob("gym_config_merged.json") - ) - - -def _latest_path(paths: list[Path]) -> Path: - return max(paths, key=lambda path: path.stat().st_mtime) - - -def _resolve_image2scene_image( - args: argparse.Namespace, image2scene_root: Path -) 
-> Path: - if args.image_name: - image_name = Path(args.image_name) - if image_name.parent != Path("."): - raise ValueError( - "--image-name only accepts a file name under " - f"{_DEFAULT_IMAGE_DIR.as_posix()} with " - "--use-image2scene. Use --image for a full path." - ) - if image_name.suffix: - return (_DEFAULT_IMAGE_DIR / image_name).resolve() - - matches = [ - _DEFAULT_IMAGE_DIR / f"{args.image_name}{suffix}" - for suffix in _IMAGE_SUFFIXES - ] - existing = [path.resolve() for path in matches if path.exists()] - if len(existing) == 1: - return existing[0] - if not existing: - candidates = ", ".join(path.name for path in matches) - raise FileNotFoundError( - f"Image name {args.image_name!r} was not found. Tried: {candidates}" - ) - - matched = ", ".join(path.name for path in existing) - raise ValueError( - f"Image name {args.image_name!r} is ambiguous. Use --image-name " - f"with a suffix: {matched}" - ) - - image_input = args.image or _DEFAULT_IMAGE2SCENE_IMAGE - image_path = Path(image_input).expanduser() - if image_path.is_absolute(): - return image_path.resolve() - return (image2scene_root / image_path).resolve() - - -def _run_image2scene_pipeline(args: argparse.Namespace) -> Path: - if not args.background: - raise ValueError("--background is required with --use-image2scene.") - - image2scene_root = Path(args.image2scene_root).expanduser().resolve() - if not image2scene_root.is_dir(): - raise FileNotFoundError(f"image2scene root not found: {image2scene_root}") - - script_path = image2scene_root / "demo_api/client/image2scene_pipeline.py" - if not script_path.is_file(): - raise FileNotFoundError(f"image2scene pipeline not found: {script_path}") - - image_path = _resolve_image2scene_image(args, image2scene_root) - download_dir = _resolve_under_root(image2scene_root, args.image2scene_download_dir) - output_root = _resolve_under_root(image2scene_root, args.image2scene_output_root) - gen_config = _resolve_under_root(image2scene_root, args.image2scene_gen_config) 
- llm_config = _resolve_under_root(image2scene_root, args.image2scene_llm_config) - extract_dir = _resolve_under_root(image2scene_root, args.image2scene_extract_dir) - merged_output = _resolve_under_root( - image2scene_root, args.image2scene_merged_output - ) - - if ( - download_dir is None - or output_root is None - or gen_config is None - or llm_config is None - ): - raise ValueError("image2scene paths must not be empty.") - - before_configs = set(_collect_merged_gym_configs(download_dir)) - command = [ - sys.executable, - str(script_path), - "--server", - args.server, - "--image", - str(image_path), - "--download-dir", - str(download_dir), - "--background", - args.background, - "--output-root", - str(output_root), - "--gen-config", - str(gen_config), - "--llm-config", - str(llm_config), - "--poll-interval", - str(args.poll_interval), - ] - if extract_dir is not None: - command.extend(["--extract-dir", str(extract_dir)]) - if merged_output is not None: - command.extend(["--merged-output", str(merged_output)]) - - print("Running image2scene pipeline:") - print(shlex.join(command), flush=True) - completed = subprocess.run( - command, - cwd=image2scene_root, - check=False, - env=_image2scene_subprocess_env(), - ) - if completed.returncode != 0: - raise RuntimeError( - f"image2scene pipeline failed with exit code {completed.returncode}" - ) - - if merged_output is not None: - if not merged_output.is_file(): - raise FileNotFoundError( - f"image2scene merged output not found: {merged_output}" - ) - print(f"Using image2scene merged gym config: {merged_output}", flush=True) - return merged_output - - after_configs = _collect_merged_gym_configs(download_dir) - new_configs = [path for path in after_configs if path not in before_configs] - if new_configs: - merged_config = _latest_path(new_configs) - elif after_configs: - merged_config = _latest_path(after_configs) - else: - raise FileNotFoundError( - f"gym_config_merged.json not found under: {download_dir}" - ) - - 
print(f"Using image2scene merged gym config: {merged_config}", flush=True) - return merged_config - - -def _resolve_gym_project(args: argparse.Namespace) -> ProjectResolution: - use_history = args.base_task_name is not None or args.base_history_index is not None - selected_modes = [ - args.use_image2scene, - args.use_existing_gym_project, - use_history, - ] - if sum(bool(mode) for mode in selected_modes) > 1: - raise ValueError( - "Use only one of --use-image2scene, --use-existing-gym-project, " - "or --base-task-name/--base-history-index." - ) - - if args.use_existing_gym_project: - project_path = Path(args.gym_project).expanduser().resolve() - if not project_path.exists(): - raise FileNotFoundError(f"gym project not found: {project_path}") - print(f"Using existing gym project: {project_path}", flush=True) - return ProjectResolution(path=project_path, mode="existing_gym_project") - - if args.use_image2scene: - return ProjectResolution( - path=_run_image2scene_pipeline(args), mode="image2scene" - ) - - if use_history: - history_entry = _resolve_base_history_entry(args) - project_path = _path_from_history_entry(history_entry) - print( - "Using base history " - f"#{history_entry.get('index')} ({history_entry.get('task_name')}): " - f"{project_path}", - flush=True, - ) - return ProjectResolution( - path=project_path, - mode="history", - base_history=history_entry, - ) - - from embodichain.gen_sim.action_agent_pipeline.gym_project_api.image2tabletop_client import ( - check_health, - collect_image_paths, - process_image, - ) - - image_input = _resolve_image_input(args) - image_path = _resolve_single_image(str(image_input), collect_image_paths) - if not args.skip_health_check: - check_health(args.server) - - return ProjectResolution( - path=process_image( - server=args.server, - image_path=image_path, - output_root=Path(args.gym_project_root), - poll_interval=args.poll_interval, - overwrite=args.overwrite_gym_project, - ), - mode="image2tabletop", - ) - - -def 
_resolve_base_history_entry(args: argparse.Namespace) -> dict[str, Any]: - if args.base_history_index is not None and args.base_history_index <= 0: - raise ValueError("--base-history-index must be a positive integer.") - - history_path = _pipeline_history_path(args) - history = _read_pipeline_history(history_path) - runs = history["runs"] - - if args.base_history_index is not None: - entry = _find_history_entry_by_index(runs, args.base_history_index) - if entry is None: - raise ValueError( - f"Pipeline history index not found: {args.base_history_index}" - ) - if args.base_task_name and entry.get("task_name") != args.base_task_name: - raise ValueError( - "Pipeline history entry " - f"#{args.base_history_index} has task_name={entry.get('task_name')!r}, " - f"expected {args.base_task_name!r}." - ) - return dict(entry) - - if not args.base_task_name: - raise ValueError("--base-task-name is required without --base-history-index.") - - candidates = [ - entry - for entry in runs - if entry.get("task_name") == args.base_task_name - and _history_entry_has_source(entry) - ] - if not candidates: - raise ValueError( - "No pipeline history entry found for task_name=" - f"{args.base_task_name!r} in {history_path}" - ) - return dict(max(candidates, key=_history_entry_index)) - - -def _pipeline_history_path(args: argparse.Namespace) -> Path: - return _records_pipeline_history_path(args) - - -def _read_pipeline_history(history_path: Path) -> dict[str, Any]: - return _records_read_pipeline_history( - history_path, - schema_version=_PIPELINE_HISTORY_SCHEMA_VERSION, - ) - - -def _find_history_entry_by_index( - runs: list[Any], history_index: int -) -> dict[str, Any] | None: - return _records_find_history_entry_by_index(runs, history_index) - - -def _history_entry_index(entry: dict[str, Any]) -> int: - return _records_history_entry_index(entry) - - -def _history_entry_has_source(entry: dict[str, Any]) -> bool: - return _records_history_entry_has_source(entry) - - -def 
_path_from_history_entry(entry: dict[str, Any]) -> Path: - return _records_path_from_history_entry(entry, repo_root=_REPO_ROOT) - - -def _resolve_target_replacements( - args: argparse.Namespace, - target_replacement_spec_cls: Callable[..., object], - gym_project: Path, -) -> list[object]: - replacements = [] - alias_config = None - if args.target_replacement1: - alias_config = alias_config or _load_replacement_alias_config(gym_project) - source_uid, prompt = _resolve_target_replacement_arg( - args.target_replacement1, - alias_config, - option_name="--target_replacement1", - replacement_number=1, - ) - replacements.append( - target_replacement_spec_cls( - source_uid=source_uid, - prompt=prompt, - output_dir_name="new1", - ) - ) - if args.target_replacement2: - alias_config = alias_config or _load_replacement_alias_config(gym_project) - source_uid, prompt = _resolve_target_replacement_arg( - args.target_replacement2, - alias_config, - option_name="--target_replacement2", - replacement_number=2, - ) - replacements.append( - target_replacement_spec_cls( - source_uid=source_uid, - prompt=prompt, - output_dir_name="new2", - ) - ) - return replacements - - -def _resolve_target_replacement_arg( - values: list[str], - gym_config: dict[str, Any], - *, - option_name: str, - replacement_number: int, -) -> tuple[str, str]: - if len(values) == 1: - prompt = str(values[0]).strip() - if not prompt: - raise ValueError(f"{option_name} prompt must be non-empty.") - source_uid = _auto_replacement_source_uid( - gym_config, - replacement_number=replacement_number, - option_name=option_name, - ) - return source_uid, prompt - - if len(values) == 2: - source_uid, prompt = values - prompt = str(prompt).strip() - if not prompt: - raise ValueError(f"{option_name} prompt must be non-empty.") - source_uid = _resolve_replacement_source_uid( - source_uid, - gym_config, - option_name=option_name, - ) - return source_uid, prompt - - raise ValueError( - f"{option_name} expects either PROMPT or 
SOURCE_UID PROMPT, got " - f"{len(values)} values: {values!r}. Quote multi-word prompts." - ) - - -def _load_replacement_alias_config(gym_project: Path) -> dict[str, Any]: - config_path = _resolve_replacement_alias_gym_config(gym_project) - data = json.loads(config_path.read_text(encoding="utf-8")) - if not isinstance(data, dict): - raise ValueError(f"Gym config must be a JSON object: {config_path}") - return data - - -def _resolve_replacement_alias_gym_config(input_path: Path) -> Path: - input_path = input_path.expanduser().resolve() - if input_path.is_file(): - sibling_gym_config = input_path.parent / "gym_config.json" - if sibling_gym_config.is_file(): - return sibling_gym_config.resolve() - return _resolve_source_gym_config(input_path) - - direct_gym_config = input_path / "gym_config.json" - if direct_gym_config.is_file(): - return direct_gym_config.resolve() - - source_config = _resolve_source_gym_config(input_path) - sibling_gym_config = source_config.parent / "gym_config.json" - if sibling_gym_config.is_file(): - return sibling_gym_config.resolve() - return source_config - - -def _auto_replacement_source_uid( - gym_config: dict[str, Any], - *, - replacement_number: int, - option_name: str, -) -> str: - if replacement_number not in {1, 2}: - raise ValueError(f"Unsupported replacement number: {replacement_number}") - - duplicate_groups = _duplicated_numbered_rigid_object_groups(gym_config) - if len(duplicate_groups) != 1: - candidates = _format_duplicate_group_candidates(duplicate_groups) - raise ValueError( - f"{option_name} was given without an explicit source uid, so the " - "pipeline expected exactly one duplicated numbered rigid_object " - f"group in gym_config.json. Found {len(duplicate_groups)} group(s): " - f"{candidates}. Use SOURCE_UID PROMPT to disambiguate." 
- ) - - base_name, positioned_objects = duplicate_groups[0] - if len(positioned_objects) != 2: - candidates = _format_duplicate_group_candidates(duplicate_groups) - raise ValueError( - f"{option_name} auto-selection requires exactly two objects in the " - f"duplicated group {base_name!r}, found {len(positioned_objects)}: " - f"{candidates}. Use SOURCE_UID PROMPT to disambiguate." - ) - - if ( - abs(float(positioned_objects[0]["y"]) - float(positioned_objects[1]["y"])) - < 1e-9 - ): - candidates = _format_duplicate_group_candidates(duplicate_groups) - raise ValueError( - f"{option_name} auto-selection requires distinct y coordinates in " - f"duplicated group {base_name!r}: {candidates}. Use SOURCE_UID PROMPT " - "to disambiguate." - ) - - selected = positioned_objects[replacement_number - 1] - source_uid = selected["object"]["uid"] - print( - f"Resolved {option_name} auto source -> {source_uid!r} " - f"from duplicated rigid_object group {base_name!r} by y={selected['y']}", - flush=True, - ) - return source_uid - - -def _duplicated_numbered_rigid_object_groups( - gym_config: dict[str, Any], -) -> list[tuple[str, list[dict[str, Any]]]]: - grouped: dict[str, list[dict[str, Any]]] = {} - for obj in _rigid_objects(gym_config): - parsed = _parse_numbered_rigid_object_uid(obj["uid"]) - if parsed is None: - continue - base_name, number = parsed - grouped.setdefault(base_name, []).append( - { - "number": number, - "y": _rigid_object_y_coordinate(obj), - "object": obj, - } - ) - - duplicate_groups = [] - for base_name, entries in grouped.items(): - if len(entries) < 2: - continue - duplicate_groups.append( - ( - base_name, - sorted( - entries, - key=lambda entry: ( - float(entry["y"]), - str(entry["object"]["uid"]), - ), - ), - ) - ) - return sorted(duplicate_groups, key=lambda item: item[0]) - - -def _parse_numbered_rigid_object_uid(uid: str) -> tuple[str, int] | None: - match = re.match(r"^(?P.+?)[_-]?(?P[0-9]+)$", uid) - if match is None: - return None - base_name = 
match.group("base").strip("_-") - if not base_name: - return None - return base_name, int(match.group("number")) - - -def _rigid_object_y_coordinate(obj: dict[str, Any]) -> float: - init_pos = obj.get("init_pos") - if not isinstance(init_pos, (list, tuple)) or len(init_pos) < 2: - raise ValueError( - "Auto replacement source selection requires each duplicated " - f"rigid_object to define init_pos with a y value, got {obj.get('uid')!r}." - ) - try: - return float(init_pos[1]) - except (TypeError, ValueError) as exc: - raise ValueError( - "Auto replacement source selection requires numeric init_pos[1], " - f"got {obj.get('uid')!r}: {init_pos[1]!r}" - ) from exc - - -def _format_duplicate_group_candidates( - groups: list[tuple[str, list[dict[str, Any]]]], -) -> str: - if not groups: - return "" - parts = [] - for base_name, entries in groups: - values = ", ".join( - f"{entry['object']['uid']}#number={entry['number']},y={entry['y']}" - for entry in entries - ) - parts.append(f"{base_name}: {values}") - return "; ".join(parts) - - -def _resolve_replacement_source_uid( - source_input: str, - gym_config: dict[str, Any], - *, - option_name: str, -) -> str: - source_input = str(source_input).strip() - rigid_objects = _rigid_objects(gym_config) - by_uid = {obj["uid"]: obj for obj in rigid_objects} - if source_input in by_uid: - return source_input - - alias = _parse_indexed_replacement_alias(source_input) - if alias is None: - candidates = _format_rigid_object_candidates(rigid_objects) - raise ValueError( - f"{option_name} source {source_input!r} is neither a rigid object uid " - f"nor an indexed alias such as bread1. 
Rigid object candidates: " - f"{candidates}" - ) - - keyword, alias_index = alias - matches = [ - obj for obj in rigid_objects if _rigid_object_matches_keyword(obj, keyword) - ] - if alias_index > len(matches): - candidates = _format_rigid_object_candidates(matches or rigid_objects) - raise ValueError( - f"{option_name} alias {source_input!r} requested match #{alias_index} " - f"for keyword {keyword!r}, but only found {len(matches)} match(es). " - f"Candidates: {candidates}" - ) - - resolved_uid = matches[alias_index - 1]["uid"] - print( - f"Resolved {option_name} source alias {source_input!r} -> {resolved_uid!r}", - flush=True, - ) - return resolved_uid - - -def _rigid_objects(gym_config: dict[str, Any]) -> list[dict[str, Any]]: - value = gym_config.get("rigid_object", []) - if isinstance(value, dict): - value = [value] - if not isinstance(value, list): - raise ValueError("gym config rigid_object must be a list or object.") - - rigid_objects = [] - for obj in value: - if not isinstance(obj, dict): - continue - uid = str(obj.get("uid", "")).strip() - if not uid: - continue - copied = dict(obj) - copied["uid"] = uid - rigid_objects.append(copied) - if not rigid_objects: - raise ValueError("No rigid_object entries found in gym config.") - return rigid_objects - - -def _parse_indexed_replacement_alias(alias: str) -> tuple[str, int] | None: - match = _INDEXED_REPLACEMENT_ALIAS_RE.match(alias.strip()) - if match is None: - return None - keyword = match.group("keyword").strip(" _-") - index = int(match.group("index")) - if not keyword or index < 1: - return None - return keyword, index - - -def _rigid_object_matches_keyword(obj: dict[str, Any], keyword: str) -> bool: - keyword_tokens = _search_tokens(keyword) - if not keyword_tokens: - return False - object_tokens = set(_search_tokens(_rigid_object_search_text(obj))) - return all(token in object_tokens for token in keyword_tokens) - - -def _rigid_object_search_text(obj: dict[str, Any]) -> str: - values = [ - 
obj.get("uid", ""), - obj.get("source_uid", ""), - obj.get("category", ""), - obj.get("semantic_label", ""), - obj.get("name", ""), - obj.get("description", ""), - ] - shape = obj.get("shape", {}) - if isinstance(shape, dict): - values.extend( - [ - shape.get("fpath", ""), - shape.get("file_path", ""), - shape.get("category", ""), - ] - ) - return " ".join(str(value) for value in values if value) - - -def _search_tokens(value: str) -> list[str]: - return re.findall(r"[a-z0-9]+", str(value).lower()) - - -def _format_rigid_object_candidates(rigid_objects: list[dict[str, Any]]) -> str: - if not rigid_objects: - return "" - parts = [] - for obj in rigid_objects: - shape = obj.get("shape", {}) - fpath = shape.get("fpath", "") if isinstance(shape, dict) else "" - parts.append(f"{obj.get('uid')} ({fpath})") - return ", ".join(parts) - - -def _write_pipeline_manifests( - *, - args: argparse.Namespace, - resolution: ProjectResolution, - generated_paths: object, - target_replacements: list[object], -) -> dict[str, Any]: - return _records_write_pipeline_manifests( - args=args, - resolution=resolution, - generated_paths=generated_paths, - target_replacements=target_replacements, - repo_root=_REPO_ROOT, - schema_version=_PIPELINE_HISTORY_SCHEMA_VERSION, - manifest_filename=_PIPELINE_MANIFEST_FILENAME, - ) - - -def _resolve_source_gym_config(input_path: Path) -> Path: - return _records_resolve_source_gym_config( - input_path, - gym_config_preference=_GYM_CONFIG_PREFERENCE, - ) - - -def _configure_llm_usage_tracking( - args: argparse.Namespace, -) -> tuple[Path, Path] | None: - if not args.llm_usage: - from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( - disable_usage_tracking, - ) - - disable_usage_tracking() - return None - - from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( - configure_usage_tracking, - ) - - output_dir = Path(args.config_output_dir).expanduser().resolve() - usage_path = ( - 
Path(args.llm_usage_output).expanduser().resolve() - if args.llm_usage_output - else output_dir / "llm_usage.jsonl" - ) - summary_path = ( - Path(args.llm_usage_summary_output).expanduser().resolve() - if args.llm_usage_summary_output - else output_dir / "llm_usage_summary.json" - ) - run_id = args.llm_usage_run_id or ( - f"{args.task_name}_{datetime.now().astimezone().strftime('%Y%m%d_%H%M%S')}" - ) - configure_usage_tracking( - usage_path=usage_path, - run_id=run_id, - process_name="run_agent_pipeline", - reset=True, - ) - print(f"Recording local LLM token usage: {usage_path}", flush=True) - print(f"Local LLM token usage summary: {summary_path}", flush=True) - return usage_path, summary_path - - -def _write_llm_usage_summary(usage_paths: tuple[Path, Path] | None) -> None: - if usage_paths is None: - return - - from embodichain.gen_sim.action_agent_pipeline.utils.llm_usage import ( - write_usage_summary, - ) - - usage_path, summary_path = usage_paths - summary = write_usage_summary( - usage_path=usage_path, - summary_path=summary_path, - ) - total = summary["total"] - print( - "Local LLM token usage total: " - f"calls={total['calls']}, " - f"input={total['input_tokens']}, " - f"output={total['output_tokens']}, " - f"total={total['total_tokens']}", - flush=True, - ) - - -def _run_agent_command( - *, - task_name: str, - gym_config: Path, - agent_config: Path, - regenerate: bool, -) -> int: - command = [ - sys.executable, - "-m", - "embodichain.gen_sim.action_agent_pipeline.cli.run_agent", - "--task_name", - task_name, - "--gym_config", - str(gym_config), - "--agent_config", - str(agent_config), - ] - if regenerate: - command.append("--regenerate") - - env = os.environ.copy() - if env.get("EMBODICHAIN_LLM_USAGE_PATH"): - env["EMBODICHAIN_LLM_USAGE_PROCESS"] = "run_agent" - - print("Running task:") - print(shlex.join(command), flush=True) - return subprocess.run(command, check=False, env=env).returncode - def main() -> int: - args = _build_parser().parse_args() - - 
_ensure_repo_on_pythonpath() - from embodichain.gen_sim.action_agent_pipeline.generation.ur5_basket_config import ( - TargetReplacementSpec, - generate_ur5_basket_config_from_project, - ) - - resolution = _resolve_gym_project(args) - usage_paths = _configure_llm_usage_tracking(args) - target_replacements = _resolve_target_replacements( - args, - TargetReplacementSpec, - resolution.path, - ) - task_description = _resolve_task_description_for_generation(args) - args.task_description = task_description or "" - - paths = generate_ur5_basket_config_from_project( - gym_project=resolution.path, - output_dir=args.config_output_dir, - task_name=args.task_name, - task_description=task_description, - target_body_scale=args.target_body_scale, - target_replacements=target_replacements, - sync_replacement_names=args.sync_replacement_names, - reuse_target_replacements=args.reuse_target_replacements, - prewarm_coacd_cache=args.prewarm_coacd_cache, - overwrite=args.overwrite_config, - ) - _write_pipeline_manifests( - args=args, - resolution=resolution, - generated_paths=paths, - target_replacements=target_replacements, - ) - - print(f"Using gym project/config: {resolution.path}", flush=True) - print(f"Generated gym config: {paths.gym_config}", flush=True) - print(f"Generated agent config: {paths.agent_config}", flush=True) - if args.skip_run_agent: - _write_llm_usage_summary(usage_paths) - return 0 - - return_code = _run_agent_command( - task_name=args.task_name, - gym_config=paths.gym_config, - agent_config=paths.agent_config, - regenerate=args.regenerate, - ) - _write_llm_usage_summary(usage_paths) - return return_code + return run_pipeline(build_parser().parse_args()) if __name__ == "__main__": diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/target_replacements.py b/embodichain/gen_sim/action_agent_pipeline/cli/target_replacements.py new file mode 100644 index 00000000..c12ca8ec --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/cli/target_replacements.py 
@@ -0,0 +1,391 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import argparse +from collections.abc import Callable +import json +from pathlib import Path +import re +from typing import Any + +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_defaults import ( + GYM_CONFIG_PREFERENCE, +) +from embodichain.gen_sim.action_agent_pipeline.cli.pipeline_records import ( + resolve_source_gym_config, +) + +__all__ = ["resolve_target_replacements"] + +_INDEXED_REPLACEMENT_ALIAS_RE = re.compile( + r"^(?P[A-Za-z][A-Za-z0-9 _-]*?)[ _-]?(?P[0-9]+)$" +) + + +def resolve_target_replacements( + args: argparse.Namespace, + target_replacement_spec_cls: Callable[..., object], + gym_project: Path, +) -> list[object]: + replacements = [] + alias_config = None + if args.target_replacement1: + alias_config = alias_config or _load_replacement_alias_config(gym_project) + source_uid, prompt = _resolve_target_replacement_arg( + args.target_replacement1, + alias_config, + option_name="--target_replacement1", + replacement_number=1, + ) + replacements.append( + target_replacement_spec_cls( + source_uid=source_uid, + prompt=prompt, + output_dir_name="new1", + ) + ) + if args.target_replacement2: + alias_config = alias_config or 
_load_replacement_alias_config(gym_project) + source_uid, prompt = _resolve_target_replacement_arg( + args.target_replacement2, + alias_config, + option_name="--target_replacement2", + replacement_number=2, + ) + replacements.append( + target_replacement_spec_cls( + source_uid=source_uid, + prompt=prompt, + output_dir_name="new2", + ) + ) + return replacements + + +def _resolve_target_replacement_arg( + values: list[str], + gym_config: dict[str, Any], + *, + option_name: str, + replacement_number: int, +) -> tuple[str, str]: + if len(values) == 1: + prompt = str(values[0]).strip() + if not prompt: + raise ValueError(f"{option_name} prompt must be non-empty.") + source_uid = _auto_replacement_source_uid( + gym_config, + replacement_number=replacement_number, + option_name=option_name, + ) + return source_uid, prompt + + if len(values) == 2: + source_uid, prompt = values + prompt = str(prompt).strip() + if not prompt: + raise ValueError(f"{option_name} prompt must be non-empty.") + source_uid = _resolve_replacement_source_uid( + source_uid, + gym_config, + option_name=option_name, + ) + return source_uid, prompt + + raise ValueError( + f"{option_name} expects either PROMPT or SOURCE_UID PROMPT, got " + f"{len(values)} values: {values!r}. Quote multi-word prompts." 
+ ) + + +def _load_replacement_alias_config(gym_project: Path) -> dict[str, Any]: + config_path = _resolve_replacement_alias_gym_config(gym_project) + data = json.loads(config_path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError(f"Gym config must be a JSON object: {config_path}") + return data + + +def _resolve_replacement_alias_gym_config(input_path: Path) -> Path: + input_path = input_path.expanduser().resolve() + if input_path.is_file(): + sibling_gym_config = input_path.parent / "gym_config.json" + if sibling_gym_config.is_file(): + return sibling_gym_config.resolve() + return _resolve_source_gym_config(input_path) + + direct_gym_config = input_path / "gym_config.json" + if direct_gym_config.is_file(): + return direct_gym_config.resolve() + + source_config = _resolve_source_gym_config(input_path) + sibling_gym_config = source_config.parent / "gym_config.json" + if sibling_gym_config.is_file(): + return sibling_gym_config.resolve() + return source_config + + +def _auto_replacement_source_uid( + gym_config: dict[str, Any], + *, + replacement_number: int, + option_name: str, +) -> str: + if replacement_number not in {1, 2}: + raise ValueError(f"Unsupported replacement number: {replacement_number}") + + duplicate_groups = _duplicated_numbered_rigid_object_groups(gym_config) + if len(duplicate_groups) != 1: + candidates = _format_duplicate_group_candidates(duplicate_groups) + raise ValueError( + f"{option_name} was given without an explicit source uid, so the " + "pipeline expected exactly one duplicated numbered rigid_object " + f"group in gym_config.json. Found {len(duplicate_groups)} group(s): " + f"{candidates}. Use SOURCE_UID PROMPT to disambiguate." 
+ ) + + base_name, positioned_objects = duplicate_groups[0] + if len(positioned_objects) != 2: + candidates = _format_duplicate_group_candidates(duplicate_groups) + raise ValueError( + f"{option_name} auto-selection requires exactly two objects in the " + f"duplicated group {base_name!r}, found {len(positioned_objects)}: " + f"{candidates}. Use SOURCE_UID PROMPT to disambiguate." + ) + + if ( + abs(float(positioned_objects[0]["y"]) - float(positioned_objects[1]["y"])) + < 1e-9 + ): + candidates = _format_duplicate_group_candidates(duplicate_groups) + raise ValueError( + f"{option_name} auto-selection requires distinct y coordinates in " + f"duplicated group {base_name!r}: {candidates}. Use SOURCE_UID PROMPT " + "to disambiguate." + ) + + selected = positioned_objects[replacement_number - 1] + source_uid = selected["object"]["uid"] + print( + f"Resolved {option_name} auto source -> {source_uid!r} " + f"from duplicated rigid_object group {base_name!r} by y={selected['y']}", + flush=True, + ) + return source_uid + + +def _duplicated_numbered_rigid_object_groups( + gym_config: dict[str, Any], +) -> list[tuple[str, list[dict[str, Any]]]]: + grouped: dict[str, list[dict[str, Any]]] = {} + for obj in _rigid_objects(gym_config): + parsed = _parse_numbered_rigid_object_uid(obj["uid"]) + if parsed is None: + continue + base_name, number = parsed + grouped.setdefault(base_name, []).append( + { + "number": number, + "y": _rigid_object_y_coordinate(obj), + "object": obj, + } + ) + + duplicate_groups = [] + for base_name, entries in grouped.items(): + if len(entries) < 2: + continue + duplicate_groups.append( + ( + base_name, + sorted( + entries, + key=lambda entry: ( + float(entry["y"]), + str(entry["object"]["uid"]), + ), + ), + ) + ) + return sorted(duplicate_groups, key=lambda item: item[0]) + + +def _parse_numbered_rigid_object_uid(uid: str) -> tuple[str, int] | None: + match = re.match(r"^(?P.+?)[_-]?(?P[0-9]+)$", uid) + if match is None: + return None + base_name = 
match.group("base").strip("_-") + if not base_name: + return None + return base_name, int(match.group("number")) + + +def _rigid_object_y_coordinate(obj: dict[str, Any]) -> float: + init_pos = obj.get("init_pos") + if not isinstance(init_pos, (list, tuple)) or len(init_pos) < 2: + raise ValueError( + "Auto replacement source selection requires each duplicated " + f"rigid_object to define init_pos with a y value, got {obj.get('uid')!r}." + ) + try: + return float(init_pos[1]) + except (TypeError, ValueError) as exc: + raise ValueError( + "Auto replacement source selection requires numeric init_pos[1], " + f"got {obj.get('uid')!r}: {init_pos[1]!r}" + ) from exc + + +def _format_duplicate_group_candidates( + groups: list[tuple[str, list[dict[str, Any]]]], +) -> str: + if not groups: + return "" + parts = [] + for base_name, entries in groups: + values = ", ".join( + f"{entry['object']['uid']}#number={entry['number']},y={entry['y']}" + for entry in entries + ) + parts.append(f"{base_name}: {values}") + return "; ".join(parts) + + +def _resolve_replacement_source_uid( + source_input: str, + gym_config: dict[str, Any], + *, + option_name: str, +) -> str: + source_input = str(source_input).strip() + rigid_objects = _rigid_objects(gym_config) + by_uid = {obj["uid"]: obj for obj in rigid_objects} + if source_input in by_uid: + return source_input + + alias = _parse_indexed_replacement_alias(source_input) + if alias is None: + candidates = _format_rigid_object_candidates(rigid_objects) + raise ValueError( + f"{option_name} source {source_input!r} is neither a rigid object uid " + f"nor an indexed alias such as bread1. 
Rigid object candidates: " + f"{candidates}" + ) + + keyword, alias_index = alias + matches = [ + obj for obj in rigid_objects if _rigid_object_matches_keyword(obj, keyword) + ] + if alias_index > len(matches): + candidates = _format_rigid_object_candidates(matches or rigid_objects) + raise ValueError( + f"{option_name} alias {source_input!r} requested match #{alias_index} " + f"for keyword {keyword!r}, but only found {len(matches)} match(es). " + f"Candidates: {candidates}" + ) + + resolved_uid = matches[alias_index - 1]["uid"] + print( + f"Resolved {option_name} source alias {source_input!r} -> {resolved_uid!r}", + flush=True, + ) + return resolved_uid + + +def _rigid_objects(gym_config: dict[str, Any]) -> list[dict[str, Any]]: + value = gym_config.get("rigid_object", []) + if isinstance(value, dict): + value = [value] + if not isinstance(value, list): + raise ValueError("gym config rigid_object must be a list or object.") + + rigid_objects = [] + for obj in value: + if not isinstance(obj, dict): + continue + uid = str(obj.get("uid", "")).strip() + if not uid: + continue + copied = dict(obj) + copied["uid"] = uid + rigid_objects.append(copied) + if not rigid_objects: + raise ValueError("No rigid_object entries found in gym config.") + return rigid_objects + + +def _parse_indexed_replacement_alias(alias: str) -> tuple[str, int] | None: + match = _INDEXED_REPLACEMENT_ALIAS_RE.match(alias.strip()) + if match is None: + return None + keyword = match.group("keyword").strip(" _-") + index = int(match.group("index")) + if not keyword or index < 1: + return None + return keyword, index + + +def _rigid_object_matches_keyword(obj: dict[str, Any], keyword: str) -> bool: + keyword_tokens = _search_tokens(keyword) + if not keyword_tokens: + return False + object_tokens = set(_search_tokens(_rigid_object_search_text(obj))) + return all(token in object_tokens for token in keyword_tokens) + + +def _rigid_object_search_text(obj: dict[str, Any]) -> str: + values = [ + 
obj.get("uid", ""), + obj.get("source_uid", ""), + obj.get("category", ""), + obj.get("semantic_label", ""), + obj.get("name", ""), + obj.get("description", ""), + ] + shape = obj.get("shape", {}) + if isinstance(shape, dict): + values.extend( + [ + shape.get("fpath", ""), + shape.get("file_path", ""), + shape.get("category", ""), + ] + ) + return " ".join(str(value) for value in values if value) + + +def _search_tokens(value: str) -> list[str]: + return re.findall(r"[a-z0-9]+", str(value).lower()) + + +def _format_rigid_object_candidates(rigid_objects: list[dict[str, Any]]) -> str: + if not rigid_objects: + return "" + parts = [] + for obj in rigid_objects: + shape = obj.get("shape", {}) + fpath = shape.get("fpath", "") if isinstance(shape, dict) else "" + parts.append(f"{obj.get('uid')} ({fpath})") + return ", ".join(parts) + + +def _resolve_source_gym_config(input_path: Path) -> Path: + return resolve_source_gym_config( + input_path, + gym_config_preference=GYM_CONFIG_PREFERENCE, + ) diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/agent_env.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/agent_env.py index 907af3c6..7af7a1a9 100644 --- a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/agent_env.py +++ b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/agent_env.py @@ -16,33 +16,40 @@ from __future__ import annotations +from copy import deepcopy + import torch -from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg -from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.base_agent_env import ( - BaseAgentEnv, -) from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.success import ( evaluate_configured_success, ) +from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg from embodichain.lab.gym.utils.registration import register_env +from embodichain.utils import logger -__all__ = ["AtomicActionsAgentEnv"] +__all__ = 
["AgenticGenSimEnv", "AtomicActionsAgentEnv"] + +_TASK_PROMPT_KEYS = frozenset({"task_prompt", "basic_background", "atom_actions"}) +_AGENT_RESERVED_KEYS = frozenset({"task_name", "config_dir"}) +_REQUIRED_AGENT_KWARGS = frozenset({"agent_config", "task_name"}) +_OPTIONAL_AGENT_KWARGS = frozenset({"agent_config_path"}) +_AGENT_KWARGS = _REQUIRED_AGENT_KWARGS | _OPTIONAL_AGENT_KWARGS @register_env("AtomicActionsAgent-v3", max_episode_steps=600) -class AtomicActionsAgentEnv(BaseAgentEnv, EmbodiedEnv): +class AgenticGenSimEnv(EmbodiedEnv): """Config-driven agent environment for atomic-action tasks.""" def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): - super().__init__(cfg, **kwargs) + env_kwargs, agent_kwargs = _split_env_and_agent_kwargs(kwargs) + super().__init__(cfg, **env_kwargs) if bool(getattr(self, "ignore_terminations_during_agent", False)): self.cfg.ignore_terminations = True - super()._init_agents(**kwargs) + self._init_agents(**agent_kwargs) def reset(self, seed: int | None = None, options: dict | None = None): obs, info = super().reset(seed=seed, options=options) - super().get_states() + self.get_states() return obs, info def is_task_success(self, **kwargs) -> torch.Tensor: @@ -52,3 +59,343 @@ def compute_task_state(self, **kwargs) -> tuple[torch.Tensor, torch.Tensor, dict success = self.is_task_success() fail = torch.zeros_like(success) return success, fail, {} + + def _init_agents(self, agent_config, task_name, agent_config_path=None): + self._validate_agent_config_keys("Agent", agent_config["Agent"]) + self._validate_agent_config_keys("TaskAgent", agent_config["TaskAgent"]) + self._validate_agent_config_keys("CompileAgent", agent_config["CompileAgent"]) + + from embodichain.gen_sim.action_agent_pipeline.agents.compile_agent import ( + CompileAgent, + ) + from embodichain.gen_sim.action_agent_pipeline.agents.llm import ( + task_llm, + ) + from embodichain.gen_sim.action_agent_pipeline.agents.task_agent import ( + TaskAgent, + ) + + 
task_agent_config = self._agent_config_with_prompt_keys( + agent_config["Agent"], + _TASK_PROMPT_KEYS, + ) + compile_agent_config = self._agent_config_with_prompt_keys( + agent_config["Agent"], + frozenset(), + ) + self.task_agent = TaskAgent( + task_llm, + **task_agent_config, + **agent_config["TaskAgent"], + task_name=task_name, + config_dir=agent_config_path, + ) + self.compile_agent = CompileAgent( + **compile_agent_config, + **agent_config["CompileAgent"], + task_name=task_name, + config_dir=agent_config_path, + ) + + def _validate_agent_config_keys(self, section_name, section_config): + reserved_keys = _AGENT_RESERVED_KEYS & set(section_config) + if reserved_keys: + raise ValueError( + f"{section_name} config contains reserved keys: " + f"{', '.join(sorted(reserved_keys))}." + ) + + def _agent_config_with_prompt_keys(self, agent_config, allowed_keys): + filtered = deepcopy(agent_config) + prompt_kwargs = filtered.get("prompt_kwargs", {}) or {} + filtered["prompt_kwargs"] = { + key: value for key, value in prompt_kwargs.items() if key in allowed_keys + } + return filtered + + def get_states(self): + # TODO: only support num_env = 1 for now + # store robot states in each env.reset + self.init_qpos = self.robot.get_qpos().squeeze(0) + + self._agent_arm_slots = self._resolve_agent_arm_slots() + for side in ("left", "right"): + self._initialize_agent_arm_slot(side, self._agent_arm_slots.get(side)) + + self.open_state = torch.as_tensor( + getattr( + self, + "agent_open_state", + getattr(self, "gripper_open_state", [0.05]), + ), + dtype=self.init_qpos.dtype, + device=self.init_qpos.device, + ).flatten() + self.close_state = torch.as_tensor( + getattr( + self, + "agent_close_state", + getattr(self, "gripper_close_state", [0.0]), + ), + dtype=self.init_qpos.dtype, + device=self.init_qpos.device, + ).flatten() + self.left_arm_current_gripper_state = self._initial_gripper_state("left") + self.right_arm_current_gripper_state = self._initial_gripper_state("right") + + 
self.update_obj_info() + + def _resolve_agent_arm_slots(self) -> dict[str, dict[str, str | None] | None]: + configured_slots = getattr(self, "agent_arm_slots", None) + if configured_slots is not None: + return self._normalize_agent_arm_slots(configured_slots) + + if hasattr(self, "single_arm_name") or hasattr(self, "single_eef_name"): + slot = getattr(self, "agent_single_arm_slot", "right") + return self._normalize_agent_arm_slots( + { + slot: { + "arm": getattr(self, "single_arm_name", "right_arm"), + "eef": getattr(self, "single_eef_name", "right_eef"), + } + } + ) + + control_parts = getattr(self.robot, "control_parts", {}) or {} + if "arm" in control_parts and "hand" in control_parts: + slot = getattr(self, "agent_single_arm_slot", "left") + return self._normalize_agent_arm_slots( + {slot: {"arm": "arm", "eef": "hand"}} + ) + + return self._normalize_agent_arm_slots( + { + "left": {"arm": "left_arm", "eef": "left_eef"}, + "right": {"arm": "right_arm", "eef": "right_eef"}, + } + ) + + def _normalize_agent_arm_slots( + self, slots + ) -> dict[str, dict[str, str | None] | None]: + normalized = {"left": None, "right": None} + for side in normalized: + slot_cfg = slots.get(side) if isinstance(slots, dict) else None + if slot_cfg is None: + continue + if isinstance(slot_cfg, str): + normalized[side] = {"arm": slot_cfg, "eef": None} + continue + normalized[side] = { + "arm": slot_cfg.get("arm", slot_cfg.get("arm_control_part")), + "eef": slot_cfg.get( + "eef", + slot_cfg.get("hand", slot_cfg.get("eef_control_part")), + ), + } + return normalized + + def _initialize_agent_arm_slot( + self, side: str, slot_cfg: dict[str, str | None] | None + ) -> None: + arm_name = slot_cfg.get("arm") if slot_cfg else None + eef_name = slot_cfg.get("eef") if slot_cfg else None + arm_joints = self._get_control_part_joint_ids(arm_name) + eef_joints = self._get_control_part_joint_ids(eef_name) + + setattr(self, f"{side}_arm_joints", arm_joints) + setattr(self, f"{side}_eef_joints", 
eef_joints) + + if arm_name is None or not arm_joints: + setattr(self, f"{side}_arm_init_qpos", self.init_qpos.new_empty(0)) + setattr(self, f"{side}_arm_init_xpos", None) + setattr(self, f"{side}_arm_base_pose", None) + setattr(self, f"{side}_arm_current_qpos", self.init_qpos.new_empty(0)) + setattr(self, f"{side}_arm_current_xpos", None) + return + + init_qpos = self.init_qpos[arm_joints] + init_xpos = self.robot.compute_fk( + init_qpos, name=arm_name, to_matrix=True + ).squeeze(0) + base_pose = self.robot.get_control_part_base_pose( + arm_name, to_matrix=True + ).squeeze(0) + + setattr(self, f"{side}_arm_init_qpos", init_qpos) + setattr(self, f"{side}_arm_init_xpos", init_xpos) + setattr(self, f"{side}_arm_base_pose", base_pose) + setattr(self, f"{side}_arm_current_qpos", init_qpos) + setattr(self, f"{side}_arm_current_xpos", init_xpos) + + def _get_control_part_joint_ids(self, control_part: str | None) -> list[int]: + if control_part is None: + return [] + if control_part not in (getattr(self.robot, "control_parts", {}) or {}): + return [] + return list(self.robot.get_joint_ids(name=control_part)) + + def _initial_gripper_state(self, side: str) -> torch.Tensor: + if len(getattr(self, f"{side}_eef_joints", []) or []) == 0: + return self.open_state.new_empty(0) + return self.open_state + + def update_obj_info(self): + # store some useful obj information + obj_info = getattr(self, "obj_info", {}) + obj_uids = self.sim.get_rigid_object_uid_list() + for obj_name in obj_uids: + obj = self.sim.get_rigid_object(obj_name) + obj_pose = obj.get_local_pose(to_matrix=True).squeeze(0) + + if obj_name not in obj_info: + obj_height = obj_pose[2, 3] # Extract the height (z-coordinate) + obj_info[obj_name] = { + "pose": obj_pose, # Store the full pose (4x4 matrix) + "height": obj_height, # Store the initial height (z-coordinate) + } + else: + obj_info[obj_name]["pose"] = obj_pose + + self.obj_info = obj_info + + # -------------------- Common getters / setters 
-------------------- + + def get_obs_for_agent(self): + obs = self.get_obs() + rgb = obs["sensor"]["cam_high"]["color"].squeeze(0) + + # Get validation camera data + camera_data = self.event_manager.get_functor("validation_cameras")(self, None) + result = {"rgb": rgb} + result.update({k: v.squeeze(0) for k, v in camera_data.items()}) + return result + + def get_current_qpos_agent(self): + return self.left_arm_current_qpos, self.right_arm_current_qpos + + def set_current_qpos_agent(self, arm_qpos, is_left): + if is_left: + self.left_arm_current_qpos = arm_qpos + else: + self.right_arm_current_qpos = arm_qpos + + def get_current_xpos_agent(self): + return self.left_arm_current_xpos, self.right_arm_current_xpos + + def set_current_xpos_agent(self, arm_xpos, is_left): + if is_left: + self.left_arm_current_xpos = arm_xpos + else: + self.right_arm_current_xpos = arm_xpos + + def get_current_gripper_state_agent(self): + return self.left_arm_current_gripper_state, self.right_arm_current_gripper_state + + def set_current_gripper_state_agent(self, arm_gripper_state, is_left): + if is_left: + self.left_arm_current_gripper_state = arm_gripper_state + else: + self.right_arm_current_gripper_state = arm_gripper_state + + # -------------------- IK / FK -------------------- + def get_arm_ik(self, target_xpos, is_left, qpos_seed=None): + control_part = self.get_agent_arm_control_part(is_left) + ret, qpos = self.robot.compute_ik( + name=control_part, pose=target_xpos, joint_seed=qpos_seed + ) + return ret.all().item(), qpos.squeeze(0) + + def get_arm_fk(self, qpos, is_left): + control_part = self.get_agent_arm_control_part(is_left) + xpos = self.robot.compute_fk( + name=control_part, qpos=torch.as_tensor(qpos), to_matrix=True + ) + return xpos.squeeze(0) + + def get_agent_arm_control_part(self, is_left: bool) -> str: + return self._get_agent_control_part(is_left=is_left, key="arm") + + def get_agent_eef_control_part(self, is_left: bool) -> str | None: + return 
self._get_agent_control_part(is_left=is_left, key="eef", required=False) + + def _get_agent_control_part( + self, is_left: bool, key: str, required: bool = True + ) -> str | None: + if not hasattr(self, "_agent_arm_slots"): + self._agent_arm_slots = self._resolve_agent_arm_slots() + side = "left" if is_left else "right" + slot_cfg = getattr(self, "_agent_arm_slots", {}).get(side) + control_part = slot_cfg.get(key) if slot_cfg else None + if control_part is None and required: + logger.log_error( + f"{side}_{key} is not configured for agent control.", + error_type=ValueError, + ) + return control_part + + # -------------------- get compiled graph for action list -------------------- + def generate_graph_for_actions(self, regenerate=False, **kwargs): + logger.log_info( + "Generate graph for creating action list for " + f"{self.compile_agent.task_name}.", + color="green", + ) + + print(f"\033[92m\nStart task graph generation.\n\033[0m") + task_agent_input = self.task_agent.get_composed_observations( + env=self, + regenerate=regenerate, + observations=self.get_obs_for_agent(), + **kwargs, + ) + task_graph = self.task_agent.generate(**task_agent_input) + + print(f"\033[94m\nStart graph compilation.\n\033[0m") + compile_agent_input = self.compile_agent.get_composed_observations( + env=self, + regenerate=regenerate, + task_graph=task_graph, + **kwargs, + ) + graph_file_path, kwargs, graph_content = self.compile_agent.generate( + **compile_agent_input + ) + + return graph_file_path, kwargs, graph_content + + # -------------------- get action list -------------------- + def create_demo_action_list(self, regenerate=False, *args, **kwargs): + graph_file_path, compile_kwargs, _ = self.generate_graph_for_actions( + regenerate=regenerate + ) + atomic_action_kwargs = { + "allow_grasp_annotation": True, + "force_grasp_reannotate": False, + } + for key in atomic_action_kwargs: + if key in kwargs: + atomic_action_kwargs[key] = kwargs[key] + compile_kwargs.update(atomic_action_kwargs) 
+ action_list = self.compile_agent.act(graph_file_path, **compile_kwargs) + return action_list + + +def _split_env_and_agent_kwargs( + kwargs: dict, +) -> tuple[dict, dict]: + missing = _REQUIRED_AGENT_KWARGS - set(kwargs) + if missing: + raise ValueError( + "AgenticGenSimEnv requires agent kwargs: " f"{', '.join(sorted(missing))}." + ) + + env_kwargs = { + key: value for key, value in kwargs.items() if key not in _AGENT_KWARGS + } + agent_kwargs = {key: kwargs[key] for key in _REQUIRED_AGENT_KWARGS} + agent_kwargs["agent_config_path"] = kwargs.get("agent_config_path") + return env_kwargs, agent_kwargs + + +AtomicActionsAgentEnv = AgenticGenSimEnv diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py deleted file mode 100644 index b239834d..00000000 --- a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/base_agent_env.py +++ /dev/null @@ -1,335 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ---------------------------------------------------------------------------- - -from __future__ import annotations - -from copy import deepcopy - -import torch -from embodichain.utils import logger - -_TASK_PROMPT_KEYS = frozenset({"task_prompt", "basic_background", "atom_actions"}) - - -class BaseAgentEnv: - - def _init_agents(self, agent_config, task_name, agent_config_path=None): - from embodichain.gen_sim.action_agent_pipeline.agents.task_agent import ( - TaskAgent, - ) - from embodichain.gen_sim.action_agent_pipeline.agents.compile_agent import ( - CompileAgent, - ) - from embodichain.gen_sim.action_agent_pipeline.agents.llm import ( - task_llm, - ) - - task_agent_config = self._agent_config_with_prompt_keys( - agent_config["Agent"], - _TASK_PROMPT_KEYS, - ) - compile_agent_config = self._agent_config_with_prompt_keys( - agent_config["Agent"], - frozenset(), - ) - self.task_agent = TaskAgent( - task_llm, - **task_agent_config, - **agent_config["TaskAgent"], - task_name=task_name, - config_dir=agent_config_path, - ) - self.compile_agent = CompileAgent( - **compile_agent_config, - **agent_config["CompileAgent"], - task_name=task_name, - config_dir=agent_config_path, - ) - - def _agent_config_with_prompt_keys(self, agent_config, allowed_keys): - filtered = deepcopy(agent_config) - prompt_kwargs = filtered.get("prompt_kwargs", {}) or {} - filtered["prompt_kwargs"] = { - key: value for key, value in prompt_kwargs.items() if key in allowed_keys - } - return filtered - - def get_states(self): - # TODO: only support num_env = 1 for now - # store robot states in each env.reset - self.init_qpos = self.robot.get_qpos().squeeze(0) - - self._agent_arm_slots = self._resolve_agent_arm_slots() - for side in ("left", "right"): - self._initialize_agent_arm_slot(side, self._agent_arm_slots.get(side)) - - self.open_state = torch.as_tensor( - getattr( - self, - "agent_open_state", - getattr(self, "gripper_open_state", [0.05]), - ), - dtype=self.init_qpos.dtype, - 
device=self.init_qpos.device, - ).flatten() - self.close_state = torch.as_tensor( - getattr( - self, - "agent_close_state", - getattr(self, "gripper_close_state", [0.0]), - ), - dtype=self.init_qpos.dtype, - device=self.init_qpos.device, - ).flatten() - self.left_arm_current_gripper_state = self._initial_gripper_state("left") - self.right_arm_current_gripper_state = self._initial_gripper_state("right") - - self.update_obj_info() - - def _resolve_agent_arm_slots(self) -> dict[str, dict[str, str | None] | None]: - configured_slots = getattr(self, "agent_arm_slots", None) - if configured_slots is not None: - return self._normalize_agent_arm_slots(configured_slots) - - if hasattr(self, "single_arm_name") or hasattr(self, "single_eef_name"): - slot = getattr(self, "agent_single_arm_slot", "right") - return self._normalize_agent_arm_slots( - { - slot: { - "arm": getattr(self, "single_arm_name", "right_arm"), - "eef": getattr(self, "single_eef_name", "right_eef"), - } - } - ) - - control_parts = getattr(self.robot, "control_parts", {}) or {} - if "arm" in control_parts and "hand" in control_parts: - slot = getattr(self, "agent_single_arm_slot", "left") - return self._normalize_agent_arm_slots( - {slot: {"arm": "arm", "eef": "hand"}} - ) - - return self._normalize_agent_arm_slots( - { - "left": {"arm": "left_arm", "eef": "left_eef"}, - "right": {"arm": "right_arm", "eef": "right_eef"}, - } - ) - - def _normalize_agent_arm_slots( - self, slots - ) -> dict[str, dict[str, str | None] | None]: - normalized = {"left": None, "right": None} - for side in normalized: - slot_cfg = slots.get(side) if isinstance(slots, dict) else None - if slot_cfg is None: - continue - if isinstance(slot_cfg, str): - normalized[side] = {"arm": slot_cfg, "eef": None} - continue - normalized[side] = { - "arm": slot_cfg.get("arm", slot_cfg.get("arm_control_part")), - "eef": slot_cfg.get( - "eef", - slot_cfg.get("hand", slot_cfg.get("eef_control_part")), - ), - } - return normalized - - def 
_initialize_agent_arm_slot( - self, side: str, slot_cfg: dict[str, str | None] | None - ) -> None: - arm_name = slot_cfg.get("arm") if slot_cfg else None - eef_name = slot_cfg.get("eef") if slot_cfg else None - arm_joints = self._get_control_part_joint_ids(arm_name) - eef_joints = self._get_control_part_joint_ids(eef_name) - - setattr(self, f"{side}_arm_joints", arm_joints) - setattr(self, f"{side}_eef_joints", eef_joints) - - if arm_name is None or not arm_joints: - setattr(self, f"{side}_arm_init_qpos", self.init_qpos.new_empty(0)) - setattr(self, f"{side}_arm_init_xpos", None) - setattr(self, f"{side}_arm_base_pose", None) - setattr(self, f"{side}_arm_current_qpos", self.init_qpos.new_empty(0)) - setattr(self, f"{side}_arm_current_xpos", None) - return - - init_qpos = self.init_qpos[arm_joints] - init_xpos = self.robot.compute_fk( - init_qpos, name=arm_name, to_matrix=True - ).squeeze(0) - base_pose = self.robot.get_control_part_base_pose( - arm_name, to_matrix=True - ).squeeze(0) - - setattr(self, f"{side}_arm_init_qpos", init_qpos) - setattr(self, f"{side}_arm_init_xpos", init_xpos) - setattr(self, f"{side}_arm_base_pose", base_pose) - setattr(self, f"{side}_arm_current_qpos", init_qpos) - setattr(self, f"{side}_arm_current_xpos", init_xpos) - - def _get_control_part_joint_ids(self, control_part: str | None) -> list[int]: - if control_part is None: - return [] - if control_part not in (getattr(self.robot, "control_parts", {}) or {}): - return [] - return list(self.robot.get_joint_ids(name=control_part)) - - def _initial_gripper_state(self, side: str) -> torch.Tensor: - if len(getattr(self, f"{side}_eef_joints", []) or []) == 0: - return self.open_state.new_empty(0) - return self.open_state - - def update_obj_info(self): - # store some useful obj information - obj_info = getattr(self, "obj_info", {}) - obj_uids = self.sim.get_rigid_object_uid_list() - for obj_name in obj_uids: - obj = self.sim.get_rigid_object(obj_name) - obj_pose = 
obj.get_local_pose(to_matrix=True).squeeze(0) - - if obj_name not in obj_info: - obj_height = obj_pose[2, 3] # Extract the height (z-coordinate) - obj_info[obj_name] = { - "pose": obj_pose, # Store the full pose (4x4 matrix) - "height": obj_height, # Store the initial height (z-coordinate) - } - else: - obj_info[obj_name]["pose"] = obj_pose - - self.obj_info = obj_info - - # -------------------- Common getters / setters -------------------- - - def get_obs_for_agent(self): - obs = self.get_obs() - rgb = obs["sensor"]["cam_high"]["color"].squeeze(0) - - # Get validation camera data - camera_data = self.event_manager.get_functor("validation_cameras")(self, None) - result = {"rgb": rgb} - result.update({k: v.squeeze(0) for k, v in camera_data.items()}) - return result - - def get_current_qpos_agent(self): - return self.left_arm_current_qpos, self.right_arm_current_qpos - - def set_current_qpos_agent(self, arm_qpos, is_left): - if is_left: - self.left_arm_current_qpos = arm_qpos - else: - self.right_arm_current_qpos = arm_qpos - - def get_current_xpos_agent(self): - return self.left_arm_current_xpos, self.right_arm_current_xpos - - def set_current_xpos_agent(self, arm_xpos, is_left): - if is_left: - self.left_arm_current_xpos = arm_xpos - else: - self.right_arm_current_xpos = arm_xpos - - def get_current_gripper_state_agent(self): - return self.left_arm_current_gripper_state, self.right_arm_current_gripper_state - - def set_current_gripper_state_agent(self, arm_gripper_state, is_left): - if is_left: - self.left_arm_current_gripper_state = arm_gripper_state - else: - self.right_arm_current_gripper_state = arm_gripper_state - - # -------------------- IK / FK -------------------- - def get_arm_ik(self, target_xpos, is_left, qpos_seed=None): - control_part = self.get_agent_arm_control_part(is_left) - ret, qpos = self.robot.compute_ik( - name=control_part, pose=target_xpos, joint_seed=qpos_seed - ) - return ret.all().item(), qpos.squeeze(0) - - def get_arm_fk(self, qpos, 
is_left): - control_part = self.get_agent_arm_control_part(is_left) - xpos = self.robot.compute_fk( - name=control_part, qpos=torch.as_tensor(qpos), to_matrix=True - ) - return xpos.squeeze(0) - - def get_agent_arm_control_part(self, is_left: bool) -> str: - return self._get_agent_control_part(is_left=is_left, key="arm") - - def get_agent_eef_control_part(self, is_left: bool) -> str | None: - return self._get_agent_control_part(is_left=is_left, key="eef", required=False) - - def _get_agent_control_part( - self, is_left: bool, key: str, required: bool = True - ) -> str | None: - if not hasattr(self, "_agent_arm_slots"): - self._agent_arm_slots = self._resolve_agent_arm_slots() - side = "left" if is_left else "right" - slot_cfg = getattr(self, "_agent_arm_slots", {}).get(side) - control_part = slot_cfg.get(key) if slot_cfg else None - if control_part is None and required: - logger.log_error( - f"{side}_{key} is not configured for agent control.", - error_type=ValueError, - ) - return control_part - - # -------------------- get compiled graph for action list -------------------- - def generate_graph_for_actions(self, regenerate=False, **kwargs): - logger.log_info( - "Generate graph for creating action list for " - f"{self.compile_agent.task_name}.", - color="green", - ) - - print(f"\033[92m\nStart task graph generation.\n\033[0m") - task_agent_input = self.task_agent.get_composed_observations( - env=self, - regenerate=regenerate, - observations=self.get_obs_for_agent(), - **kwargs, - ) - task_graph = self.task_agent.generate(**task_agent_input) - - print(f"\033[94m\nStart graph compilation.\n\033[0m") - compile_agent_input = self.compile_agent.get_composed_observations( - env=self, - regenerate=regenerate, - task_graph=task_graph, - **kwargs, - ) - graph_file_path, kwargs, graph_content = self.compile_agent.generate( - **compile_agent_input - ) - - return graph_file_path, kwargs, graph_content - - # -------------------- get action list -------------------- - def 
create_demo_action_list(self, regenerate=False, *args, **kwargs): - graph_file_path, compile_kwargs, _ = self.generate_graph_for_actions( - regenerate=regenerate - ) - atomic_action_kwargs = { - "allow_grasp_annotation": True, - "force_grasp_reannotate": False, - } - for key in atomic_action_kwargs: - if key in kwargs: - atomic_action_kwargs[key] = kwargs[key] - compile_kwargs.update(atomic_action_kwargs) - action_list = self.compile_agent.act(graph_file_path, **compile_kwargs) - return action_list diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py index 23de84b9..2826ebc4 100644 --- a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py +++ b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py @@ -99,7 +99,10 @@ def _constant(env, value: bool) -> torch.Tensor: def _pose(env, uid: str) -> torch.Tensor: - return env.sim.get_rigid_object(uid).get_local_pose(to_matrix=True) + obj = env.sim.get_rigid_object(uid) + if obj is None: + raise ValueError(f"Unknown rigid object uid: {uid!r}.") + return obj.get_local_pose(to_matrix=True) def _position(env, uid: str) -> torch.Tensor: diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py new file mode 100644 index 00000000..3966814f --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py @@ -0,0 +1,549 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from pathlib import Path +from typing import Any + +from embodichain.gen_sim.action_agent_pipeline.generation.config_io import ( + read_json as _read_json, + raise_if_generated_files_exist as _raise_if_generated_files_exist, + write_config_bundle as _write_config_bundle, +) +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + GeneratedActionAgentConfigPaths, + TargetReplacementSpec, + _BasketTaskRoles, + _RelativePlacementSpec, + _ResolvedTargetReplacement, + _SceneObject, +) +from embodichain.gen_sim.action_agent_pipeline.generation.scene_objects import ( + _collect_scene_objects, + _infer_basket_task_roles, + _infer_project_name, + _resolve_gym_config_path, +) +from embodichain.gen_sim.action_agent_pipeline.generation.mesh_frame_normalization import ( + MeshFrameNormalizer, +) +from embodichain.gen_sim.action_agent_pipeline.generation.glb_io import read_glb +from embodichain.gen_sim.action_agent_pipeline.generation.prompt_builders import ( + make_agent_config, + make_basket_atom_actions_prompt, + make_basket_basic_background, + make_basket_task_prompt, + make_relative_atom_actions_prompt, + make_relative_basic_background, + make_relative_task_prompt, +) +from embodichain.gen_sim.action_agent_pipeline.generation.action_agent_templates import ( + make_dual_ur5_robot_config as _make_dual_ur5_robot_config, + make_light_config as _make_light_config, + 
make_sensor_config as _make_sensor_config, +) +from embodichain.gen_sim.action_agent_pipeline.generation.config_blocks import ( + _make_background_config, + _make_container_background_config, + _make_dataset_config, + _make_events_config, + _make_extra_background_config, + _make_extra_rigid_object_config, + _make_observations_config, + _make_relative_background_object_config, + _make_relative_dataset_config, + _make_relative_events_config, + _make_relative_rigid_object_config, + _make_target_object_config, + _relative_rigid_object_max_convex_hull_num, + _relative_static_background_max_convex_hull_num, + _source_body_scale, + _target_body_scale_vector, +) +from embodichain.gen_sim.action_agent_pipeline.generation.mesh_bounds import ( + _DUAL_UR5_ARM_COMPONENT_Z, + _DUAL_UR5_TABLETOP_CLEARANCE, + _TABLETOP_OBJECT_CLEARANCE, + _apply_tabletop_z_placement, + _dual_ur5_init_z_from_table_top, + _mesh_config_world_z_bounds, + _mesh_config_world_zmax, + _resolve_table_mesh_world_zmax, +) +from embodichain.gen_sim.action_agent_pipeline.generation.relative_geometry import ( + _STAGING_Z_DELTA, + _inside_container_axis_offsets, + _inside_container_slot_axis_and_distance, + _make_relative_summary, + _offset_position, + _relative_release_offset, + _side_relation_xy_offsets, + _with_inside_container_slot_offsets, + _with_self_relative_absolute_targets, +) +from embodichain.gen_sim.action_agent_pipeline.generation.relative_spec import ( + _build_relative_placement_spec_with_llm, + _call_relative_task_llm, + _normalize_relative_relation, + _relative_relation_phrase, + _relative_scene_runtime_uid_mapping, +) +from embodichain.gen_sim.action_agent_pipeline.generation.replacement_generation import ( + _apply_replacement_names, + _normalize_target_replacements, + _run_prompt2geometry_replacement, + _run_target_replacements, + _validate_target_replacement_sources, +) +from embodichain.gen_sim.action_agent_pipeline.generation.role_refinement import ( + _refine_roles_with_llm, +) +from 
embodichain.gen_sim.action_agent_pipeline.generation.success_specs import ( + _make_extensions_config, + _make_relative_extensions_config, + _object_in_container_success, + _validate_bundle, + _validate_relative_bundle, + _validate_success_uids, +) + +__all__ = [ + "GeneratedActionAgentConfigPaths", + "TargetReplacementSpec", + "generate_action_agent_config_from_project", +] + + +def generate_action_agent_config_from_project( + gym_project: str | Path, + output_dir: str | Path, + *, + task_name: str = "UR5BreadBasket", + task_description: str | None = None, + use_llm_roles: bool = False, + llm_model: str | None = None, + target_body_scale: float | list[float] | tuple[float, float, float] = 0.7, + target_replacements: Sequence[TargetReplacementSpec] | None = None, + sync_replacement_names: bool = False, + reuse_target_replacements: bool = True, + prewarm_coacd_cache: bool = True, + overwrite: bool = False, + max_episodes: int = 1, + max_episode_steps: int = 1000, +) -> GeneratedActionAgentConfigPaths: + """Generate action-agent configs from an exported gym project. + + This first-stage generator intentionally keeps the UR5BreadBasket task + structure fixed: the left arm grasps the left target object, the right arm + grasps the right target object, and both objects are placed into one + basket-like container. + + Args: + gym_project: Project root, formatted scene folder, ``gym_config.json``, + or ``gym_config_merged.json``. + output_dir: Destination config directory. + task_name: Name passed to ``run_agent``. + task_description: Optional natural-language relative-placement task. + When provided, the generator asks the shared LLM for a constrained + config-level task spec and generates prompts from that spec. + use_llm_roles: If true, use an LLM only to refine object role mapping. + llm_model: Optional model override for role refinement. + target_body_scale: Uniform or xyz scale applied to generated target + objects. 
Basket-like containers keep their source ``body_scale``. + target_replacements: Optional prompt-generated GLB replacements for + selected default basket target objects. Each replacement writes to + ``/mesh_assets/`` and only affects the + generated config, not the original source mesh file. + sync_replacement_names: If true, update runtime target UIDs and prompts + from the replacement prompts. If false, only mesh paths are replaced. + reuse_target_replacements: If true, reuse an existing replacement GLB + at the expected output path when it matches the requested prompt. + prewarm_coacd_cache: If true, precompute environment-side CoACD cache + files referenced by the generated gym config before writing it. + overwrite: If false, fail when generated files already exist. + max_episodes: Value written to ``fast_gym_config.json``. + max_episode_steps: Value written to ``fast_gym_config.json``. + + Returns: + Paths of generated config files. + """ + + output_dir_path = Path(output_dir).expanduser().resolve() + _raise_if_generated_files_exist(output_dir_path, overwrite) + + input_path = Path(gym_project).expanduser().resolve() + gym_config_path = _resolve_gym_config_path(input_path) + scene_dir = gym_config_path.parent + source_config = _read_json(gym_config_path) + project_name = _infer_project_name(input_path, scene_dir) + replacement_specs = _normalize_target_replacements(target_replacements) + mesh_normalizer = MeshFrameNormalizer( + output_dir=output_dir_path / "mesh_assets" / "normalized" + ) + + scene_objects = _collect_scene_objects(source_config) + if task_description: + if replacement_specs: + raise ValueError( + "target_replacements are only supported by the default basket " + "template. Do not combine them with task_description." 
+ ) + spec = _build_relative_placement_spec_with_llm( + scene_objects=scene_objects, + project_name=project_name, + task_description=task_description, + model=llm_model, + release_offset_fn=_relative_release_offset, + staging_z_delta=_STAGING_Z_DELTA, + task_llm_caller=_call_relative_task_llm, + ) + bundle = _build_relative_placement_bundle( + scene_dir=scene_dir, + source_config=source_config, + spec=spec, + project_name=project_name, + task_name=task_name, + target_body_scale=target_body_scale, + max_episodes=max_episodes, + max_episode_steps=max_episode_steps, + mesh_normalizer=mesh_normalizer, + ) + _validate_relative_bundle(bundle, spec) + _attach_mesh_normalization_summary(bundle, mesh_normalizer) + if prewarm_coacd_cache: + _attach_coacd_cache_summary(bundle) + return _write_config_bundle( + output_dir=output_dir_path, + bundle=bundle, + overwrite=overwrite, + ) + + roles = _infer_basket_task_roles(scene_objects) + if use_llm_roles: + roles = _refine_roles_with_llm( + roles=roles, + scene_objects=scene_objects, + project_name=project_name, + model=llm_model, + ) + + _validate_target_replacement_sources(roles, replacement_specs) + resolved_replacements = _run_target_replacements( + scene_dir=scene_dir, + replacement_specs=replacement_specs, + reuse_target_replacements=reuse_target_replacements, + prompt2geometry_runner=_run_prompt2geometry_replacement, + ) + if sync_replacement_names: + roles = _apply_replacement_names( + roles, + resolved_replacements, + ) + + bundle = _build_ur5_basket_bundle( + scene_dir=scene_dir, + source_config=source_config, + roles=roles, + project_name=project_name, + task_name=task_name, + target_body_scale=target_body_scale, + target_replacements=resolved_replacements, + max_episodes=max_episodes, + max_episode_steps=max_episode_steps, + mesh_normalizer=mesh_normalizer, + ) + _validate_bundle(bundle, roles) + _attach_mesh_normalization_summary(bundle, mesh_normalizer) + if prewarm_coacd_cache: + _attach_coacd_cache_summary(bundle) 
+ return _write_config_bundle( + output_dir=output_dir_path, + bundle=bundle, + overwrite=overwrite, + ) + + +def _build_ur5_basket_bundle( + *, + scene_dir: Path, + source_config: Mapping[str, Any], + roles: _BasketTaskRoles, + project_name: str, + task_name: str, + target_body_scale: float | list[float] | tuple[float, float, float], + target_replacements: Sequence[_ResolvedTargetReplacement], + max_episodes: int, + max_episode_steps: int, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + scene_objects = _collect_scene_objects(source_config) + by_uid = {obj.source_uid: obj for obj in scene_objects} + replacement_by_source_uid = { + replacement.source_uid: replacement for replacement in target_replacements + } + object_scale = _target_body_scale_vector(target_body_scale) + container_scale = _source_body_scale(by_uid[roles.container_source_uid]) + task_source_uids = { + roles.container_source_uid, + roles.left_target_source_uid, + roles.right_target_source_uid, + } + extra_rigid_objects = [ + obj + for obj in scene_objects + if obj.source_role == "rigid_object" and obj.source_uid not in task_source_uids + ] + extra_background_objects = [ + obj + for obj in scene_objects + if obj.source_role == "background" and obj.source_uid != roles.table_source_uid + ] + table_config = _make_background_config( + scene_dir, + by_uid[roles.table_source_uid], + mesh_normalizer, + ) + table_top_z = _mesh_config_world_zmax(table_config) + robot_init_z = _dual_ur5_init_z_from_table_top(table_top_z) + + gym_config = { + "id": "AtomicActionsAgent-v3", + "max_episodes": int(max_episodes), + "max_episode_steps": int(max_episode_steps), + "env": { + "extensions": _make_extensions_config(roles), + "events": _make_events_config( + roles, + sensor_config_factory=_make_sensor_config, + ), + "observations": _make_observations_config(), + "dataset": _make_dataset_config(project_name, roles), + }, + "robot": _make_dual_ur5_robot_config(robot_init_z=robot_init_z), + "sensor": 
_make_sensor_config(), + "light": _make_light_config(), + "background": [ + table_config, + _make_container_background_config( + scene_dir, + by_uid[roles.container_source_uid], + roles.container_runtime_uid, + container_scale, + mesh_normalizer, + ), + *[ + _make_extra_background_config(scene_dir, obj, mesh_normalizer) + for obj in extra_background_objects + ], + ], + "rigid_object": [ + _make_target_object_config( + scene_dir, + by_uid[roles.right_target_source_uid], + roles.right_target_runtime_uid, + object_scale, + mesh_normalizer, + replacement_by_source_uid.get(roles.right_target_source_uid), + ), + _make_target_object_config( + scene_dir, + by_uid[roles.left_target_source_uid], + roles.left_target_runtime_uid, + object_scale, + mesh_normalizer, + replacement_by_source_uid.get(roles.left_target_source_uid), + ), + *[ + _make_extra_rigid_object_config( + scene_dir, + obj, + _source_body_scale(obj), + mesh_normalizer, + ) + for obj in extra_rigid_objects + ], + ], + } + _apply_tabletop_z_placement(gym_config, table_top_z) + return { + "gym_config": gym_config, + "agent_config": make_agent_config(), + "task_prompt": make_basket_task_prompt(task_name, project_name, roles), + "basic_background": make_basket_basic_background(project_name, roles), + "atom_actions": make_basket_atom_actions_prompt(roles), + "summary": { + "mode": "basket_template", + "left_target": roles.left_target_runtime_uid, + "right_target": roles.right_target_runtime_uid, + "container": roles.container_runtime_uid, + "target_replacements": [ + { + "source_uid": replacement.source_uid, + "prompt": replacement.prompt, + "output_dir_name": replacement.output_dir_name, + "mesh_path": replacement.mesh_path.as_posix(), + "runtime_noun": replacement.runtime_noun, + "reused": replacement.reused, + } + for replacement in target_replacements + ], + }, + } + + +def _attach_coacd_cache_summary(bundle: dict[str, Any]) -> None: + from embodichain.gen_sim.action_agent_pipeline.generation.coacd_cache import ( 
+ prewarm_coacd_cache_for_gym_config, + ) + + bundle.setdefault("summary", {})["coacd_cache"] = ( + prewarm_coacd_cache_for_gym_config(bundle["gym_config"]) + ) + + +def _attach_mesh_normalization_summary( + bundle: dict[str, Any], + mesh_normalizer: MeshFrameNormalizer, +) -> None: + reports = mesh_normalizer.reports + if reports: + bundle.setdefault("summary", {})["normalized_meshes"] = reports + + +def _build_relative_placement_bundle( + *, + scene_dir: Path, + source_config: Mapping[str, Any], + spec: _RelativePlacementSpec, + project_name: str, + task_name: str, + target_body_scale: float | list[float] | tuple[float, float, float], + max_episodes: int, + max_episode_steps: int, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + scene_objects = _collect_scene_objects(source_config) + background_objects = [ + obj for obj in scene_objects if obj.source_role == "background" + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + by_uid = {obj.source_uid: obj for obj in scene_objects} + runtime_uids = _relative_scene_runtime_uid_mapping( + scene_objects, + table_source_uid=spec.table_source_uid, + ) + moved_source_uids = {placement.moved_source_uid for placement in spec.placements} + reference_runtime_uids = { + placement.reference_runtime_uid for placement in spec.placements + } + registered_runtime_uids = sorted( + {runtime_uids[obj.source_uid] for obj in rigid_objects} | reference_runtime_uids + ) + dynamic_rigid_objects = [ + obj for obj in rigid_objects if obj.source_uid in moved_source_uids + ] + static_scene_objects = [ + obj for obj in rigid_objects if obj.source_uid not in moved_source_uids + ] + object_scale = _target_body_scale_vector(target_body_scale) + table_config = _make_background_config( + scene_dir, + by_uid[spec.table_source_uid], + mesh_normalizer, + ) + table_top_z = _mesh_config_world_zmax(table_config) + robot_init_z = _dual_ur5_init_z_from_table_top(table_top_z) + + gym_config = { + "id": 
"AtomicActionsAgent-v3", + "max_episodes": int(max_episodes), + "max_episode_steps": int(max_episode_steps), + "env": { + "extensions": {}, + "events": _make_relative_events_config( + spec, + registered_runtime_uids, + sensor_config_factory=_make_sensor_config, + ), + "observations": _make_observations_config(), + "dataset": {}, + }, + "robot": _make_dual_ur5_robot_config(robot_init_z=robot_init_z), + "sensor": _make_sensor_config(), + "light": _make_light_config(), + "background": [ + table_config, + *[ + _make_relative_background_object_config( + scene_dir, + obj, + runtime_uids[obj.source_uid], + max_convex_hull_num=_relative_static_background_max_convex_hull_num( + runtime_uids[obj.source_uid], + spec, + ), + mesh_normalizer=mesh_normalizer, + ) + for obj in static_scene_objects + ], + *[ + _make_extra_background_config( + scene_dir, + obj, + mesh_normalizer, + runtime_uid=runtime_uids[obj.source_uid], + ) + for obj in background_objects + if obj.source_uid != spec.table_source_uid + ], + ], + "rigid_object": [ + _make_relative_rigid_object_config( + scene_dir=scene_dir, + obj=obj, + runtime_uid=runtime_uids[obj.source_uid], + body_scale=object_scale, + max_convex_hull_num=_relative_rigid_object_max_convex_hull_num( + runtime_uids[obj.source_uid], + spec, + ), + mesh_normalizer=mesh_normalizer, + ) + for obj in dynamic_rigid_objects + ], + } + _apply_tabletop_z_placement(gym_config, table_top_z) + spec = _with_self_relative_absolute_targets(spec, gym_config) + spec = _with_inside_container_slot_offsets(spec, gym_config) + gym_config["env"]["extensions"] = _make_relative_extensions_config( + spec, + side_relation_xy_offsets=_side_relation_xy_offsets, + ) + gym_config["env"]["dataset"] = _make_relative_dataset_config( + project_name, + spec, + relation_phrase=_relative_relation_phrase, + ) + return { + "gym_config": gym_config, + "agent_config": make_agent_config(), + "task_prompt": make_relative_task_prompt(task_name, project_name, spec), + "basic_background": 
make_relative_basic_background(project_name, spec), + "atom_actions": make_relative_atom_actions_prompt(spec), + "summary": _make_relative_summary(spec), + } diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_templates.py b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_templates.py new file mode 100644 index 00000000..119bf18b --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_templates.py @@ -0,0 +1,55 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from copy import deepcopy +from functools import lru_cache +import json +from pathlib import Path +from typing import Any + +__all__ = [ + "make_dual_ur5_robot_config", + "make_light_config", + "make_sensor_config", +] + +_TEMPLATE_DIR = Path(__file__).resolve().parent / "templates" + + +def make_dual_ur5_robot_config(*, robot_init_z: float) -> dict[str, Any]: + config = _load_template("dual_ur5_robot.json") + config["init_pos"][2] = float(robot_init_z) + return config + + +def make_sensor_config() -> list[dict[str, Any]]: + return _load_template("default_sensors.json") + + +def make_light_config() -> dict[str, Any]: + return _load_template("default_lights.json") + + +def _load_template(name: str) -> Any: + return deepcopy(_read_template(name)) + + +@lru_cache(maxsize=None) +def _read_template(name: str) -> Any: + path = _TEMPLATE_DIR / name + return json.loads(path.read_text(encoding="utf-8")) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py b/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py index 0787ca72..ae35e2fb 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/coacd_cache.py @@ -20,6 +20,7 @@ from pathlib import Path from typing import Any import hashlib +import os from embodichain.utils.logger import log_info @@ -175,7 +176,14 @@ def _generate_coacd_cache( in_mesh, max_convex_hull_num=int(max_convex_hull_num), ) - mesh_list_to_file(cache_path.as_posix(), out_mesh_list) + cache_path.parent.mkdir(parents=True, exist_ok=True) + temp_path = cache_path.with_name(f"{cache_path.name}.tmp.{os.getpid()}") + try: + mesh_list_to_file(temp_path.as_posix(), out_mesh_list) + os.replace(temp_path, cache_path) + finally: + if temp_path.exists(): + temp_path.unlink() def _repo_root() -> Path: diff --git 
a/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py new file mode 100644 index 00000000..605d1045 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py @@ -0,0 +1,598 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Callable, Mapping +from pathlib import Path +from typing import Any +import copy + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _BasketTaskRoles, + _RelativePlacementSpec, + _ResolvedTargetReplacement, + _SceneObject, +) +from embodichain.gen_sim.action_agent_pipeline.generation.mesh_bounds import ( + _clean_vector3, +) +from embodichain.gen_sim.action_agent_pipeline.generation.mesh_frame_normalization import ( + MeshFrameNormalizer, +) +from embodichain.gen_sim.action_agent_pipeline.generation.naming import ( + _left_target_text, + _normalize_runtime_uid, + _right_target_text, + _target_task_description_text, +) + +__all__ = [ + "_make_background_config", + "_make_dataset_config", + "_make_events_config", + "_make_extra_background_config", + "_make_extra_rigid_object_config", + "_make_observations_config", + 
"_make_container_background_config", + "_make_relative_background_object_config", + "_make_relative_dataset_config", + "_make_relative_events_config", + "_make_relative_rigid_object_config", + "_make_target_object_config", + "_relative_rigid_object_max_convex_hull_num", + "_relative_static_background_max_convex_hull_num", + "_source_body_scale", + "_target_body_scale_vector", +] + +_BACKGROUND_MAX_CONVEX_HULL_NUM = 1 +_TARGET_MAX_CONVEX_HULL_NUM = 16 +_CONTAINER_MAX_CONVEX_HULL_NUM = 8 +_EXTRA_RIGID_MAX_CONVEX_HULL_NUM = 1 + +_BACKGROUND_ATTRS = { + "mass": 10.0, + "static_friction": 0.95, + "dynamic_friction": 0.9, + "restitution": 0.01, +} + +_RIGID_OBJECT_ATTRS = { + "mass": 0.01, + "contact_offset": 0.003, + "rest_offset": 0.001, + "restitution": 0.01, + "max_depenetration_velocity": 10.0, + "min_position_iters": 32, + "min_velocity_iters": 8, +} + + +def _target_body_scale_vector( + target_body_scale: float | list[float] | tuple[float, float, float], +) -> list[float]: + if isinstance(target_body_scale, (int, float)): + value = float(target_body_scale) + return [value, value, value] + return _clean_vector3(target_body_scale) + + +def _source_body_scale(obj: _SceneObject) -> list[float]: + return _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0])) + + +def _make_relative_events_config( + spec: _RelativePlacementSpec, + registered_runtime_uids: list[str], + *, + sensor_config_factory: Callable[[], list[dict[str, Any]]], +) -> dict[str, Any]: + return { + "record_camera": _record_camera_event_config(sensor_config_factory), + "validation_cameras": _validation_cameras_event_config(), + "prepare_extra_attr": { + "func": "prepare_extra_attr", + "mode": "reset", + "params": { + "attrs": [ + { + "name": "object_lengths", + "mode": "callable", + "entity_uids": "all_objects", + "func_name": "compute_object_length", + "func_kwargs": { + "is_svd_frame": True, + "sample_points": 5000, + }, + }, + ] + }, + }, + "register_info_to_env": { + "func": 
"register_info_to_env", + "mode": "reset", + "params": { + "registry": [ + _object_registry_entry(uid) + for uid in sorted(registered_runtime_uids) + ], + "registration": "affordance_datas", + "sim_update": True, + }, + }, + } + + +def _make_events_config( + roles: _BasketTaskRoles, + *, + sensor_config_factory: Callable[[], list[dict[str, Any]]], +) -> dict[str, Any]: + return { + "record_camera": _record_camera_event_config(sensor_config_factory), + "validation_cameras": _validation_cameras_event_config(), + "prepare_extra_attr": { + "func": "prepare_extra_attr", + "mode": "reset", + "params": { + "attrs": [ + { + "name": "object_lengths", + "mode": "callable", + "entity_uids": "all_objects", + "func_name": "compute_object_length", + "func_kwargs": { + "is_svd_frame": True, + "sample_points": 5000, + }, + }, + ] + }, + }, + "register_info_to_env": { + "func": "register_info_to_env", + "mode": "reset", + "params": { + "registry": [ + _object_registry_entry(roles.left_target_runtime_uid), + _object_registry_entry(roles.right_target_runtime_uid), + _object_registry_entry(roles.container_runtime_uid), + ], + "registration": "affordance_datas", + "sim_update": True, + }, + }, + } + + +def _record_camera_event_config( + sensor_config_factory: Callable[[], list[dict[str, Any]]], +) -> dict[str, Any]: + camera = sensor_config_factory()[0] + extrinsics = camera["extrinsics"] + return { + "func": "record_camera_data", + "mode": "interval", + "interval_step": 1, + "params": { + "name": "record_cam_high", + "resolution": [camera["width"], camera["height"]], + "intrinsics": camera["intrinsics"], + "eye": extrinsics["eye"], + "target": extrinsics["target"], + "up": extrinsics["up"], + }, + } + + +def _validation_cameras_event_config() -> dict[str, Any]: + return { + "func": "validation_cameras", + "mode": "trigger", + "params": {}, + } + + +def _object_registry_entry(uid: str) -> dict[str, Any]: + return { + "entity_cfg": { + "uid": uid, + }, + "pose_register_params": { + 
"compute_relative": False, + "compute_pose_object_to_arena": True, + "to_matrix": True, + }, + } + + +def _make_observations_config() -> dict[str, Any]: + return { + "norm_robot_eef_joint": { + "func": "normalize_robot_joint_data", + "mode": "modify", + "name": "robot/qpos", + "params": { + "joint_ids": [12, 13, 14, 15], + }, + } + } + + +def _make_dataset_config( + project_name: str, + roles: _BasketTaskRoles, +) -> dict[str, Any]: + left_target_text = _left_target_text(roles) + right_target_text = _right_target_text(roles) + target_description = _target_task_description_text(roles) + return { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "DualUR5", + "control_freq": 25, + }, + "instruction": { + "lang": ( + f"Use the left UR5 to place the left {left_target_text} into " + f"the {roles.container_runtime_uid}, then use the right " + f"UR5 to place the right {right_target_text} into the " + f"{roles.container_runtime_uid}." + ), + }, + "extra": { + "scene_type": project_name, + "task_description": ( + f"Dual UR5 {target_description}-to-container placement" + ), + "data_type": "sim", + }, + "use_videos": True, + }, + } + } + + +def _make_relative_dataset_config( + project_name: str, + spec: _RelativePlacementSpec, + *, + relation_phrase: Callable[[str], str], +) -> dict[str, Any]: + return { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "DualUR5", + "control_freq": 25, + }, + "instruction": { + "lang": _relative_dataset_instruction( + spec, + relation_phrase=relation_phrase, + ), + }, + "extra": { + "scene_type": project_name, + "task_description": spec.task_description, + "data_type": "sim", + }, + "use_videos": True, + }, + } + } + + +def _relative_dataset_instruction( + spec: _RelativePlacementSpec, + *, + relation_phrase: Callable[[str], str], +) -> str: + if len(spec.placements) == 1: + placement = spec.placements[0] + return ( + f"Use 
the {placement.active_side} UR5 to move " + f"{placement.moved_runtime_uid} " + f"{relation_phrase(placement.relation)} " + f"{placement.reference_runtime_uid}." + ) + return " ".join( + f"Use the {placement.active_side} UR5 to move " + f"{placement.moved_runtime_uid} " + f"{relation_phrase(placement.relation)} " + f"{placement.reference_runtime_uid}." + for placement in spec.placements + ) + + +def _make_background_config( + scene_dir: Path, + obj: _SceneObject, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + shape = _make_shape_config(scene_dir, obj.config, mesh_normalizer=mesh_normalizer) + return { + "uid": "table", + "shape": shape, + "attrs": dict(_BACKGROUND_ATTRS), + "body_scale": _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0])), + "body_type": "kinematic", + "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), + "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), + "max_convex_hull_num": _role_limited_max_convex_hull_num( + obj, + _BACKGROUND_MAX_CONVEX_HULL_NUM, + ), + } + + +def _make_extra_background_config( + scene_dir: Path, + obj: _SceneObject, + mesh_normalizer: MeshFrameNormalizer, + body_scale: Any | None = None, + runtime_uid: str | None = None, +) -> dict[str, Any]: + shape = _make_shape_config(scene_dir, obj.config, mesh_normalizer=mesh_normalizer) + config = { + "uid": runtime_uid or _normalize_runtime_uid(obj.source_uid), + "shape": shape, + "attrs": copy.deepcopy(dict(obj.config.get("attrs", _BACKGROUND_ATTRS))), + "body_scale": _clean_vector3( + obj.config.get("body_scale", [1.0, 1.0, 1.0]) + if body_scale is None + else body_scale + ), + "body_type": str(obj.config.get("body_type", "static")), + "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), + "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), + "max_convex_hull_num": _role_limited_max_convex_hull_num( + obj, + _BACKGROUND_MAX_CONVEX_HULL_NUM, + ), + } + return config + + 
+def _make_target_object_config( + scene_dir: Path, + obj: _SceneObject, + runtime_uid: str, + target_scale: list[float], + mesh_normalizer: MeshFrameNormalizer, + replacement: _ResolvedTargetReplacement | None = None, +) -> dict[str, Any]: + config = _make_rigid_object_config( + scene_dir, + obj, + runtime_uid, + target_scale, + max_convex_hull_num=_TARGET_MAX_CONVEX_HULL_NUM, + mesh_fpath=replacement.mesh_path if replacement else None, + mesh_normalizer=mesh_normalizer, + ) + config["body_type"] = "dynamic" + return config + + +def _make_container_object_config( + scene_dir: Path, + obj: _SceneObject, + runtime_uid: str, + body_scale: Any, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + return _make_rigid_object_config( + scene_dir, + obj, + runtime_uid, + body_scale, + max_convex_hull_num=_role_limited_max_convex_hull_num( + obj, + _CONTAINER_MAX_CONVEX_HULL_NUM, + ), + mesh_normalizer=mesh_normalizer, + ) + + +def _make_container_background_config( + scene_dir: Path, + obj: _SceneObject, + runtime_uid: str, + body_scale: Any, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + config = _make_container_object_config( + scene_dir, + obj, + runtime_uid, + body_scale, + mesh_normalizer, + ) + config["body_type"] = "kinematic" + return config + + +def _make_relative_background_object_config( + scene_dir: Path, + obj: _SceneObject, + runtime_uid: str, + *, + max_convex_hull_num: int, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + config = _make_rigid_object_config( + scene_dir, + obj, + runtime_uid, + _source_body_scale(obj), + max_convex_hull_num=max_convex_hull_num, + mesh_normalizer=mesh_normalizer, + ) + config["body_type"] = "kinematic" + return config + + +def _make_extra_rigid_object_config( + scene_dir: Path, + obj: _SceneObject, + body_scale: Any, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + return _make_rigid_object_config( + scene_dir, + obj, + _normalize_runtime_uid(obj.source_uid), + 
body_scale, + max_convex_hull_num=_role_limited_max_convex_hull_num( + obj, + _EXTRA_RIGID_MAX_CONVEX_HULL_NUM, + ), + mesh_normalizer=mesh_normalizer, + ) + + +def _make_relative_rigid_object_config( + *, + scene_dir: Path, + obj: _SceneObject, + runtime_uid: str, + body_scale: Any, + max_convex_hull_num: int, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + if max_convex_hull_num == _TARGET_MAX_CONVEX_HULL_NUM: + resolved_max_convex_hull_num = max_convex_hull_num + else: + resolved_max_convex_hull_num = _role_limited_max_convex_hull_num( + obj, + max_convex_hull_num, + ) + config = _make_rigid_object_config( + scene_dir, + obj, + runtime_uid, + body_scale, + max_convex_hull_num=resolved_max_convex_hull_num, + mesh_normalizer=mesh_normalizer, + ) + config["body_type"] = "dynamic" + return config + + +def _make_rigid_object_config( + scene_dir: Path, + obj: _SceneObject, + runtime_uid: str, + body_scale: Any, + max_convex_hull_num: int, + mesh_fpath: str | Path | None = None, + mesh_normalizer: MeshFrameNormalizer | None = None, +) -> dict[str, Any]: + shape = _make_shape_config( + scene_dir, + obj.config, + mesh_fpath=mesh_fpath, + mesh_normalizer=mesh_normalizer, + ) + config = { + "uid": runtime_uid, + "shape": shape, + "attrs": dict(_RIGID_OBJECT_ATTRS), + "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), + "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), + "body_scale": _clean_vector3(body_scale), + "max_convex_hull_num": int(max_convex_hull_num), + } + if "body_type" in obj.config: + config["body_type"] = str(obj.config["body_type"]) + return config + + +def _role_limited_max_convex_hull_num( + obj: _SceneObject, + role_max_convex_hull_num: int, +) -> int: + source_max_convex_hull_num = obj.config.get("max_convex_hull_num") + if source_max_convex_hull_num is None: + return role_max_convex_hull_num + return max(1, min(int(source_max_convex_hull_num), role_max_convex_hull_num)) + + +def 
_relative_rigid_object_max_convex_hull_num( + runtime_uid: str, + spec: _RelativePlacementSpec, +) -> int: + for placement in spec.placements: + if ( + placement.relation == "inside" + and runtime_uid == placement.reference_runtime_uid + ): + return _CONTAINER_MAX_CONVEX_HULL_NUM + task_uids = { + uid + for placement in spec.placements + for uid in (placement.moved_runtime_uid, placement.reference_runtime_uid) + } + if runtime_uid in task_uids: + return _TARGET_MAX_CONVEX_HULL_NUM + return _EXTRA_RIGID_MAX_CONVEX_HULL_NUM + + +def _relative_static_background_max_convex_hull_num( + runtime_uid: str, + spec: _RelativePlacementSpec, +) -> int: + for placement in spec.placements: + if ( + placement.relation == "inside" + and runtime_uid == placement.reference_runtime_uid + ): + return _CONTAINER_MAX_CONVEX_HULL_NUM + return _BACKGROUND_MAX_CONVEX_HULL_NUM + + +def _make_shape_config( + scene_dir: Path, + source_config: Mapping[str, Any], + *, + mesh_fpath: str | Path | None = None, + mesh_normalizer: MeshFrameNormalizer | None = None, +) -> dict[str, Any]: + shape = copy.deepcopy(dict(source_config.get("shape", {}))) + if mesh_fpath is not None: + shape["shape_type"] = "Mesh" + shape["fpath"] = str(mesh_fpath) + if shape.get("shape_type") == "Mesh" and "fpath" in shape: + mesh_path = Path(_asset_path_for_config(scene_dir, str(shape["fpath"]))) + if mesh_normalizer is not None: + mesh_path = mesh_normalizer.normalize_path(mesh_path) + shape["fpath"] = mesh_path.as_posix() + shape.setdefault("compute_uv", False) + return shape + + +def _asset_path_for_config(scene_dir: Path, fpath: str) -> str: + raw_path = Path(fpath) + if raw_path.is_absolute(): + return raw_path.resolve().as_posix() + return (scene_dir / raw_path).resolve().as_posix() diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/config_io.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_io.py new file mode 100644 index 00000000..1119f001 --- /dev/null +++ 
b/embodichain/gen_sim/action_agent_pipeline/generation/config_io.py @@ -0,0 +1,95 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Mapping +import json +from pathlib import Path +from typing import Any + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + GeneratedActionAgentConfigPaths, +) + +__all__ = [ + "read_json", + "raise_if_generated_files_exist", + "write_config_bundle", + "write_json", + "write_text", +] + + +def write_config_bundle( + *, + output_dir: Path, + bundle: Mapping[str, Any], + overwrite: bool, +) -> GeneratedActionAgentConfigPaths: + paths = GeneratedActionAgentConfigPaths( + output_dir=output_dir, + gym_config=output_dir / "fast_gym_config.json", + agent_config=output_dir / "agent_config.json", + task_prompt=output_dir / "task_prompt.txt", + basic_background=output_dir / "basic_background.txt", + atom_actions=output_dir / "atom_actions.txt", + summary=dict(bundle.get("summary", {})), + ) + raise_if_generated_files_exist(output_dir, overwrite) + + output_dir.mkdir(parents=True, exist_ok=True) + write_json(paths.gym_config, bundle["gym_config"]) + write_json(paths.agent_config, bundle["agent_config"]) + write_text(paths.task_prompt, 
bundle["task_prompt"]) + write_text(paths.basic_background, bundle["basic_background"]) + write_text(paths.atom_actions, bundle["atom_actions"]) + return paths + + +def raise_if_generated_files_exist(output_dir: Path, overwrite: bool) -> None: + if overwrite: + return + output_files = [ + output_dir / "fast_gym_config.json", + output_dir / "agent_config.json", + output_dir / "task_prompt.txt", + output_dir / "basic_background.txt", + output_dir / "atom_actions.txt", + ] + existing = [path for path in output_files if path.exists()] + if existing: + existing_text = ", ".join(path.as_posix() for path in existing) + raise FileExistsError( + f"Generated file(s) already exist: {existing_text}. " + "Pass overwrite=True or --overwrite to replace them." + ) + + +def write_json(path: Path, data: Mapping[str, Any]) -> None: + path.write_text( + json.dumps(data, ensure_ascii=False, indent=4) + "\n", + encoding="utf-8", + ) + + +def write_text(path: Path, content: str) -> None: + path.write_text(content.rstrip() + "\n", encoding="utf-8") + + +def read_json(path: Path) -> dict[str, Any]: + with path.open("r", encoding="utf-8") as file: + return json.load(file) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py new file mode 100644 index 00000000..a0a6dcfd --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py @@ -0,0 +1,121 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +__all__ = [ + "GeneratedActionAgentConfigPaths", + "TargetReplacementSpec", + "_BasketTaskRoles", + "_RelativePlacementSpec", + "_RelativePlacementStepSpec", + "_ResolvedTargetReplacement", + "_SceneObject", +] + + +@dataclass(frozen=True) +class GeneratedActionAgentConfigPaths: + """Paths written by the action-agent config generator.""" + + output_dir: Path + gym_config: Path + agent_config: Path + task_prompt: Path + basic_background: Path + atom_actions: Path + summary: dict[str, Any] + + +@dataclass(frozen=True) +class TargetReplacementSpec: + """Prompt-to-geometry replacement for one source target object.""" + + source_uid: str + prompt: str + output_dir_name: str + + +@dataclass(frozen=True) +class _SceneObject: + source_uid: str + source_role: str + config: dict[str, Any] + + +@dataclass(frozen=True) +class _BasketTaskRoles: + table_source_uid: str + container_source_uid: str + left_target_source_uid: str + right_target_source_uid: str + container_runtime_uid: str + left_target_runtime_uid: str + right_target_runtime_uid: str + target_noun: str + left_target_noun: str + right_target_noun: str + container_noun: str + + +@dataclass(frozen=True) +class _ResolvedTargetReplacement: + source_uid: str + prompt: str + output_dir_name: str + mesh_path: Path + runtime_noun: str + reused: bool = False + + +@dataclass(frozen=True) +class 
_RelativePlacementStepSpec: + moved_source_uid: str + reference_source_uid: str + moved_runtime_uid: str + reference_runtime_uid: str + relation: str + active_side: str + release_offset: list[float] + high_offset: list[float] + reference_is_initial_pose: bool = False + release_position: list[float] | None = None + high_position: list[float] | None = None + + +@dataclass(frozen=True) +class _RelativePlacementSpec: + table_source_uid: str + moved_source_uid: str + reference_source_uid: str + moved_runtime_uid: str + reference_runtime_uid: str + relation: str + active_side: str + task_description: str + task_prompt_summary: str + basic_background_notes: str + action_sketch: list[str] + release_offset: list[float] + high_offset: list[float] + placements: tuple[_RelativePlacementStepSpec, ...] + reference_is_initial_pose: bool = False + release_position: list[float] | None = None + high_position: list[float] | None = None diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/glb_io.py b/embodichain/gen_sim/action_agent_pipeline/generation/glb_io.py new file mode 100644 index 00000000..432b6707 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/glb_io.py @@ -0,0 +1,60 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from pathlib import Path +from typing import Any +import json +import struct + +__all__ = ["read_glb"] + +_GLB_JSON_CHUNK_TYPE = 0x4E4F534A +_GLB_BINARY_CHUNK_TYPE = 0x004E4942 + + +def read_glb(path: Path) -> tuple[dict[str, Any], bytes]: + """Read a GLB v2 file and return its JSON document and binary chunk.""" + data = path.read_bytes() + if len(data) < 12: + raise ValueError(f"GLB file is too small: {path}") + + magic, version, declared_length = struct.unpack_from("<4sII", data, 0) + if magic != b"glTF" or version != 2: + raise ValueError(f"Only GLB version 2 files are supported: {path}") + if declared_length > len(data): + raise ValueError(f"GLB length header exceeds file size: {path}") + + offset = 12 + doc: dict[str, Any] | None = None + binary_chunk = b"" + while offset + 8 <= declared_length: + chunk_length, chunk_type = struct.unpack_from(" declared_length: + raise ValueError(f"GLB chunk exceeds file size: {path}") + chunk = data[offset:chunk_end] + offset = chunk_end + if chunk_type == _GLB_JSON_CHUNK_TYPE: + doc = json.loads(chunk.decode("utf-8").rstrip("\x00 ")) + elif chunk_type == _GLB_BINARY_CHUNK_TYPE: + binary_chunk = chunk + + if doc is None: + raise ValueError(f"GLB file does not contain a JSON chunk: {path}") + return doc, binary_chunk diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/mesh_bounds.py b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_bounds.py new file mode 100644 index 00000000..9eefb4d5 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_bounds.py @@ -0,0 +1,558 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from pathlib import Path +from typing import Any +import json +import math +import struct + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _SceneObject, +) +from embodichain.gen_sim.action_agent_pipeline.generation.glb_io import read_glb + +__all__ = [ + "_apply_tabletop_z_placement", + "_clean_vector3", + "_dual_ur5_init_z_from_table_top", + "_iter_generated_scene_object_configs", + "_mesh_config_world_xy_extents", + "_mesh_config_world_z_bounds", + "_mesh_config_world_zmax", + "_resolve_table_mesh_world_zmax", + "_vector3", +] + +_DUAL_UR5_LEGACY_INIT_Z = 0.5 +_DUAL_UR5_ARM_COMPONENT_Z = 0.4 +_DUAL_UR5_TABLETOP_CLEARANCE = 0.25 +_TABLETOP_OBJECT_CLEARANCE = 0.003 +_GLTF_COMPONENT_FORMATS = { + 5120: ("b", 1), + 5121: ("B", 1), + 5122: ("h", 2), + 5123: ("H", 2), + 5125: ("I", 4), + 5126: ("f", 4), +} +_GLTF_TYPE_COMPONENT_COUNTS = { + "SCALAR": 1, + "VEC2": 2, + "VEC3": 3, + "VEC4": 4, + "MAT4": 16, +} + + +def _dual_ur5_init_z_from_table_top(table_top_z: float | None) -> float: + if table_top_z is None: + return _DUAL_UR5_LEGACY_INIT_Z + + init_z = table_top_z + _DUAL_UR5_TABLETOP_CLEARANCE - _DUAL_UR5_ARM_COMPONENT_Z + return round(init_z, 6) + + +def _apply_tabletop_z_placement( + gym_config: dict[str, Any], + table_top_z: float | None, +) -> None: + if table_top_z is None: + return + target_bottom_z = float(table_top_z) + _TABLETOP_OBJECT_CLEARANCE + 
for obj in _iter_generated_scene_object_configs(gym_config): + if obj.get("uid") == "table": + continue + mesh_min_z = _mesh_config_local_zmin_after_rotation(obj) + if mesh_min_z is None: + continue + init_pos = _clean_vector3(obj.get("init_pos", [0.0, 0.0, 0.0])) + init_pos[2] = round(target_bottom_z - mesh_min_z, 6) + obj["init_pos"] = init_pos + + +def _iter_generated_scene_object_configs( + gym_config: Mapping[str, Any], +) -> list[dict[str, Any]]: + objects: list[dict[str, Any]] = [] + for section in ("background", "rigid_object"): + value = gym_config.get(section, []) + if isinstance(value, Mapping): + value = [value] + if not isinstance(value, list): + continue + objects.extend(obj for obj in value if isinstance(obj, dict)) + return objects + + +def _mesh_config_world_zmax(obj_config: Mapping[str, Any]) -> float | None: + bounds = _mesh_config_world_z_bounds(obj_config) + if bounds is None: + return None + return bounds[1] + + +def _mesh_config_world_xy_extents( + obj_config: Mapping[str, Any], +) -> tuple[float, float] | None: + shape = obj_config.get("shape", {}) + if not isinstance(shape, Mapping): + return None + mesh_path = shape.get("fpath") + if not isinstance(mesh_path, str): + return None + vertices = _load_mesh_vertices(Path(mesh_path).expanduser().resolve()) + if not vertices: + return None + + matrix = _mesh_config_transform_matrix(obj_config) + transformed_vertices = [_transform_point(matrix, vertex) for vertex in vertices] + x_values = [vertex[0] for vertex in transformed_vertices] + y_values = [vertex[1] for vertex in transformed_vertices] + return ( + max(x_values) - min(x_values), + max(y_values) - min(y_values), + ) + + +def _mesh_config_local_zmin_after_rotation( + obj_config: Mapping[str, Any], +) -> float | None: + shape = obj_config.get("shape", {}) + if not isinstance(shape, Mapping): + return None + mesh_path = shape.get("fpath") + if not isinstance(mesh_path, str): + return None + vertices = 
_load_mesh_vertices(Path(mesh_path).expanduser().resolve()) + if not vertices: + return None + + matrix = _mesh_config_transform_matrix( + obj_config, + translation=[0.0, 0.0, 0.0], + ) + return min(_transform_point(matrix, vertex)[2] for vertex in vertices) + + +def _mesh_config_world_z_bounds( + obj_config: Mapping[str, Any], +) -> tuple[float, float] | None: + shape = obj_config.get("shape", {}) + if not isinstance(shape, Mapping): + return None + mesh_path = shape.get("fpath") + if not isinstance(mesh_path, str): + return None + vertices = _load_mesh_vertices(Path(mesh_path).expanduser().resolve()) + if not vertices: + return None + + matrix = _mesh_config_transform_matrix(obj_config) + z_values = [_transform_point(matrix, vertex)[2] for vertex in vertices] + return (min(z_values), max(z_values)) + + +def _mesh_config_transform_matrix( + obj_config: Mapping[str, Any], + *, + translation: list[float] | None = None, +) -> list[list[float]]: + scale = _vector3(obj_config.get("body_scale", [1.0, 1.0, 1.0])) + init_local_pose = obj_config.get("init_local_pose") + if init_local_pose is not None and translation is None: + root_matrix = _matrix4(init_local_pose) + else: + root_matrix = _euler_xyz_degrees_matrix( + _vector3(obj_config.get("init_rot", [0.0, 0.0, 0.0])), + ( + _vector3(obj_config.get("init_pos", [0.0, 0.0, 0.0])) + if translation is None + else translation + ), + ) + return _matrix_multiply(root_matrix, _scale_matrix4(scale)) + + +def _resolve_table_mesh_world_zmax( + scene_dir: Path, + table_obj: _SceneObject, +) -> float | None: + shape = table_obj.config.get("shape", {}) + if not isinstance(shape, Mapping): + return None + if shape.get("shape_type") != "Mesh" or not shape.get("fpath"): + return None + + mesh_path = _source_asset_path(scene_dir, str(shape["fpath"])) + try: + vertices = _load_mesh_vertices(mesh_path) + except ( + OSError, + ValueError, + json.JSONDecodeError, + UnicodeDecodeError, + struct.error, + ): + return None + if not vertices: + 
return None + + world_matrix = _table_mesh_world_matrix(table_obj.config) + return max(_transform_point(world_matrix, vertex)[2] for vertex in vertices) + + +def _source_asset_path(scene_dir: Path, fpath: str) -> Path: + raw_path = Path(fpath) + if raw_path.is_absolute(): + return raw_path.resolve() + + scene_candidate = (scene_dir / raw_path).resolve() + if scene_candidate.exists(): + return scene_candidate + + repo_candidate = (_repo_root() / raw_path).resolve() + if repo_candidate.exists(): + return repo_candidate + return scene_candidate + + +def _load_mesh_vertices(mesh_path: Path) -> list[tuple[float, float, float]] | None: + if mesh_path.suffix.lower() == ".glb": + try: + return list(_iter_glb_world_position_vertices(mesh_path)) + except ( + OSError, + ValueError, + json.JSONDecodeError, + UnicodeDecodeError, + struct.error, + ): + return _load_mesh_vertices_with_trimesh(mesh_path) + if mesh_path.suffix.lower() == ".obj": + vertices = _load_obj_position_vertices(mesh_path) + if vertices is not None: + return vertices + return _load_mesh_vertices_with_trimesh(mesh_path) + + +def _load_obj_position_vertices( + mesh_path: Path, +) -> list[tuple[float, float, float]] | None: + try: + vertices = [] + for line in mesh_path.read_text(encoding="utf-8").splitlines(): + if not line.startswith("v "): + continue + values = line.split() + if len(values) < 4: + continue + vertices.append((float(values[1]), float(values[2]), float(values[3]))) + except (OSError, UnicodeDecodeError, ValueError): + return None + return vertices or None + + +def _load_mesh_vertices_with_trimesh( + mesh_path: Path, +) -> list[tuple[float, float, float]] | None: + try: + import trimesh + except ImportError: + return None + + try: + scene_or_mesh = trimesh.load(str(mesh_path), force="scene") + if hasattr(scene_or_mesh, "to_geometry"): + mesh = scene_or_mesh.to_geometry() + elif hasattr(scene_or_mesh, "dump"): + mesh = scene_or_mesh.dump(concatenate=True) + else: + mesh = scene_or_mesh + except 
Exception: + return None + vertices = getattr(mesh, "vertices", None) + if vertices is None or len(vertices) == 0: + return None + return [ + (float(vertex[0]), float(vertex[1]), float(vertex[2])) for vertex in vertices + ] + + +def _iter_glb_world_position_vertices( + mesh_path: Path, +): + doc, binary_chunk = read_glb(mesh_path) + nodes = doc.get("nodes", []) + if not isinstance(nodes, list): + raise ValueError("GLB nodes must be a list.") + + scenes = doc.get("scenes", []) + if scenes: + scene_index = int(doc.get("scene", 0)) + root_node_ids = scenes[scene_index].get("nodes", []) + else: + root_node_ids = list(range(len(nodes))) + + stack = [(int(node_id), _identity_matrix4()) for node_id in root_node_ids] + while stack: + node_id, parent_matrix = stack.pop() + node = nodes[node_id] + node_matrix = _matrix_multiply(parent_matrix, _gltf_node_matrix(node)) + mesh_index = node.get("mesh") + if mesh_index is not None: + for vertex in _iter_gltf_mesh_position_vertices( + doc, + binary_chunk, + int(mesh_index), + ): + yield _transform_point(node_matrix, vertex) + for child_id in node.get("children", []) or []: + stack.append((int(child_id), node_matrix)) + + +def _iter_gltf_mesh_position_vertices( + doc: Mapping[str, Any], + binary_chunk: bytes, + mesh_index: int, +): + meshes = doc.get("meshes", []) + accessors = doc.get("accessors", []) + mesh = meshes[mesh_index] + for primitive in mesh.get("primitives", []) or []: + attributes = primitive.get("attributes", {}) + position_accessor = attributes.get("POSITION") + if position_accessor is None: + continue + if int(position_accessor) >= len(accessors): + raise ValueError("POSITION accessor index is out of range.") + yield from _iter_gltf_accessor_vec3(doc, binary_chunk, int(position_accessor)) + + +def _iter_gltf_accessor_vec3( + doc: Mapping[str, Any], + binary_chunk: bytes, + accessor_index: int, +): + accessor = doc["accessors"][accessor_index] + if accessor.get("sparse"): + raise ValueError("Sparse GLB accessors are 
not supported.") + if accessor.get("type") != "VEC3": + raise ValueError("POSITION accessor must be VEC3.") + if "bufferView" not in accessor: + raise ValueError("POSITION accessor must reference a bufferView.") + + component_type = int(accessor["componentType"]) + if component_type not in _GLTF_COMPONENT_FORMATS: + raise ValueError(f"Unsupported GLB component type: {component_type}.") + component_format, component_size = _GLTF_COMPONENT_FORMATS[component_type] + component_count = _GLTF_TYPE_COMPONENT_COUNTS[accessor["type"]] + buffer_view = doc["bufferViews"][int(accessor["bufferView"])] + if int(buffer_view.get("buffer", 0)) != 0: + raise ValueError("Only GLB embedded binary buffers are supported.") + + stride = int(buffer_view.get("byteStride", component_size * component_count)) + offset = int(buffer_view.get("byteOffset", 0)) + int(accessor.get("byteOffset", 0)) + element_format = "<" + component_format * component_count + for index in range(int(accessor["count"])): + values = struct.unpack_from( + element_format, + binary_chunk, + offset + index * stride, + ) + yield (float(values[0]), float(values[1]), float(values[2])) + + +def _table_mesh_world_matrix(table_config: Mapping[str, Any]) -> list[list[float]]: + scale = _vector3(table_config.get("body_scale", [1.0, 1.0, 1.0])) + init_local_pose = table_config.get("init_local_pose") + if init_local_pose is not None: + root_matrix = _matrix4(init_local_pose) + else: + root_matrix = _euler_xyz_degrees_matrix( + _vector3(table_config.get("init_rot", [0.0, 0.0, 0.0])), + _vector3(table_config.get("init_pos", [0.0, 0.0, 0.0])), + ) + return _matrix_multiply(root_matrix, _scale_matrix4(scale)) + + +def _gltf_node_matrix(node: Mapping[str, Any]) -> list[list[float]]: + if "matrix" in node: + values = [float(value) for value in node["matrix"]] + if len(values) != 16: + raise ValueError("GLB node matrix must contain 16 values.") + return [[values[column * 4 + row] for column in range(4)] for row in range(4)] + + 
translation = [float(value) for value in node.get("translation", [0.0, 0.0, 0.0])] + scale = [float(value) for value in node.get("scale", [1.0, 1.0, 1.0])] + rotation = [float(value) for value in node.get("rotation", [0.0, 0.0, 0.0, 1.0])] + if len(translation) != 3 or len(scale) != 3 or len(rotation) != 4: + raise ValueError("Invalid GLB node TRS transform.") + + x, y, z, w = rotation + xx, yy, zz = x * x, y * y, z * z + xy, xz, yz = x * y, x * z, y * z + wx, wy, wz = w * x, w * y, w * z + matrix = [ + [ + (1.0 - 2.0 * (yy + zz)) * scale[0], + (2.0 * (xy - wz)) * scale[1], + (2.0 * (xz + wy)) * scale[2], + translation[0], + ], + [ + (2.0 * (xy + wz)) * scale[0], + (1.0 - 2.0 * (xx + zz)) * scale[1], + (2.0 * (yz - wx)) * scale[2], + translation[1], + ], + [ + (2.0 * (xz - wy)) * scale[0], + (2.0 * (yz + wx)) * scale[1], + (1.0 - 2.0 * (xx + yy)) * scale[2], + translation[2], + ], + [0.0, 0.0, 0.0, 1.0], + ] + return matrix + + +def _euler_xyz_degrees_matrix( + rotation_deg: Sequence[float], + translation: Sequence[float], +) -> list[list[float]]: + rx, ry, rz = (math.radians(float(value)) for value in rotation_deg) + cx, sx = math.cos(rx), math.sin(rx) + cy, sy = math.cos(ry), math.sin(ry) + cz, sz = math.cos(rz), math.sin(rz) + rot_x = [ + [1.0, 0.0, 0.0, 0.0], + [0.0, cx, -sx, 0.0], + [0.0, sx, cx, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + rot_y = [ + [cy, 0.0, sy, 0.0], + [0.0, 1.0, 0.0, 0.0], + [-sy, 0.0, cy, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + rot_z = [ + [cz, -sz, 0.0, 0.0], + [sz, cz, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + matrix = _matrix_multiply(_matrix_multiply(rot_z, rot_y), rot_x) + matrix[0][3] = float(translation[0]) + matrix[1][3] = float(translation[1]) + matrix[2][3] = float(translation[2]) + return matrix + + +def _identity_matrix4() -> list[list[float]]: + return [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + + +def _scale_matrix4(scale: Sequence[float]) -> 
list[list[float]]: + return [ + [float(scale[0]), 0.0, 0.0, 0.0], + [0.0, float(scale[1]), 0.0, 0.0], + [0.0, 0.0, float(scale[2]), 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + + +def _matrix4(value: Any) -> list[list[float]]: + if not isinstance(value, (list, tuple)) or len(value) != 4: + raise ValueError(f"Expected a 4x4 matrix, got {value!r}.") + matrix = [] + for row in value: + if not isinstance(row, (list, tuple)) or len(row) != 4: + raise ValueError(f"Expected a 4x4 matrix, got {value!r}.") + matrix.append([float(item) for item in row]) + return matrix + + +def _matrix_multiply( + left: Sequence[Sequence[float]], + right: Sequence[Sequence[float]], +) -> list[list[float]]: + return [ + [ + sum( + float(left[row][inner]) * float(right[inner][column]) + for inner in range(4) + ) + for column in range(4) + ] + for row in range(4) + ] + + +def _transform_point( + matrix: Sequence[Sequence[float]], + point: Sequence[float], +) -> tuple[float, float, float]: + x, y, z = (float(point[0]), float(point[1]), float(point[2])) + return ( + float(matrix[0][0]) * x + + float(matrix[0][1]) * y + + float(matrix[0][2]) * z + + float(matrix[0][3]), + float(matrix[1][0]) * x + + float(matrix[1][1]) * y + + float(matrix[1][2]) * z + + float(matrix[1][3]), + float(matrix[2][0]) * x + + float(matrix[2][1]) * y + + float(matrix[2][2]) * z + + float(matrix[2][3]), + ) + + +def _repo_root() -> Path: + current = Path(__file__).resolve() + for parent in current.parents: + if (parent / "setup.py").exists() and (parent / "embodichain").exists(): + return parent + return Path.cwd().resolve() + + +def _vector3(value: Any) -> list[float]: + if not isinstance(value, (list, tuple)) or len(value) != 3: + raise ValueError(f"Expected a 3-vector, got {value!r}.") + return [float(item) for item in value] + + +def _clean_vector3(value: Any) -> list[float]: + cleaned = [] + for item in _vector3(value): + if abs(item - 1.0) < 1e-9: + cleaned.append(1.0) + elif abs(item) < 1e-12: + cleaned.append(0.0) + else: 
+ cleaned.append(item) + return cleaned diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py index e9576cd0..c689a39c 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/mesh_frame_normalization.py @@ -24,7 +24,8 @@ import json import math import re -import struct + +from embodichain.gen_sim.action_agent_pipeline.generation.glb_io import read_glb __all__ = [ "GLB_TO_OBJ_BAKED_X_ROTATION_DEGREES", @@ -40,8 +41,6 @@ GLB_LOCAL_X_CORRECTION_DEGREES = GLB_TO_OBJ_BAKED_X_ROTATION_DEGREES _SAFE_STEM_RE = re.compile(r"[^0-9a-zA-Z_.-]+") -_GLB_JSON_CHUNK_TYPE = 0x4E4F534A -_GLB_BINARY_CHUNK_TYPE = 0x004E4942 _TEXTURE_EXTENSION_BY_MIME_TYPE = { "image/jpeg": ".jpg", "image/png": ".png", @@ -378,7 +377,7 @@ def _extract_glb_base_color_texture(source_path: Path) -> _TextureAsset | None: if source_path.suffix.lower() != ".glb": return None - doc, binary_chunk = _read_glb(source_path) + doc, binary_chunk = read_glb(source_path) material = _first_textured_material(doc) if material is None: return None @@ -436,36 +435,6 @@ def _first_textured_material(doc: dict[str, Any]) -> dict[str, Any] | None: return None -def _read_glb(source_path: Path) -> tuple[dict[str, Any], bytes]: - data = source_path.read_bytes() - if len(data) < 12: - raise ValueError(f"GLB file is too small: {source_path}") - magic, version, declared_length = struct.unpack_from("<4sII", data, 0) - if magic != b"glTF" or version != 2: - raise ValueError(f"Only GLB version 2 files are supported: {source_path}") - if declared_length > len(data): - raise ValueError(f"GLB length header exceeds file size: {source_path}") - - offset = 12 - doc: dict[str, Any] | None = None - binary_chunk = b"" - while offset + 8 <= declared_length: - chunk_length, chunk_type = struct.unpack_from(" declared_length: - 
raise ValueError(f"GLB chunk exceeds file size: {source_path}") - chunk = data[offset:chunk_end] - offset = chunk_end - if chunk_type == _GLB_JSON_CHUNK_TYPE: - doc = json.loads(chunk.decode("utf-8")) - elif chunk_type == _GLB_BINARY_CHUNK_TYPE: - binary_chunk = chunk - if doc is None: - raise ValueError(f"GLB file does not contain a JSON chunk: {source_path}") - return doc, binary_chunk - - def _buffer_view_bytes( doc: dict[str, Any], binary_chunk: bytes, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/naming.py b/embodichain/gen_sim/action_agent_pipeline/generation/naming.py new file mode 100644 index 00000000..9d824d02 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/naming.py @@ -0,0 +1,168 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from pathlib import Path +from typing import Any +import re + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _BasketTaskRoles, + _SceneObject, +) + +__all__ = [ + "_base_name", + "_candidate_relative_runtime_uid", + "_container_runtime_uid", + "_display_noun", + "_generic_target_text", + "_is_container_like", + "_left_target_text", + "_normalize_runtime_uid", + "_object_text", + "_plural", + "_right_target_text", + "_string_list", + "_target_pair_text", + "_target_plural_text", + "_target_runtime_suffix", + "_target_task_description_text", +] + +_DIGIT_SUFFIX_RE = re.compile(r"_[0-9]+$") +_INVALID_UID_CHARS_RE = re.compile(r"[^0-9a-zA-Z_]+") +_CONTAINER_KEYWORDS = ( + "basket", + "container", + "bowl", + "box", + "bin", + "tray", + "crate", +) + + +def _target_noun(left_target: _SceneObject, right_target: _SceneObject) -> str: + left_base = _base_name(left_target) + right_base = _base_name(right_target) + if left_base == right_base: + return _target_runtime_suffix(left_base) + return "target_object" + + +def _object_text(obj: _SceneObject) -> str: + shape = obj.config.get("shape", {}) or {} + return f"{obj.source_uid} {shape.get('fpath', '')}".lower() + + +def _base_name(obj: _SceneObject) -> str: + base = _DIGIT_SUFFIX_RE.sub("", obj.source_uid) + if base == obj.source_uid: + fpath = str(obj.config.get("shape", {}).get("fpath", "")) + path = Path(fpath) + if len(path.parts) >= 2: + base = path.parts[-2] + return _normalize_runtime_uid(base) + + +def _target_runtime_suffix(base: str) -> str: + if base == "bread": + return "bread_roll" + return base + + +def _container_runtime_uid(container: _SceneObject) -> str: + base = _base_name(container) + if "basket" in base: + return "wicker_basket" + return f"target_{base}" + + +def _display_noun(uid: str) -> str: + return uid.replace("_", " ") + + +def _plural(noun: 
str) -> str: + if noun.endswith("s"): + return noun + if noun.endswith(("ch", "sh", "x")): + return f"{noun}es" + return f"{noun}s" + + +def _left_target_text(roles: _BasketTaskRoles) -> str: + return _display_noun(roles.left_target_noun) + + +def _right_target_text(roles: _BasketTaskRoles) -> str: + return _display_noun(roles.right_target_noun) + + +def _target_pair_text(roles: _BasketTaskRoles) -> str: + left_text = _left_target_text(roles) + right_text = _right_target_text(roles) + if left_text == right_text: + return f"two {left_text} objects" + return f"the left {left_text} and right {right_text}" + + +def _target_plural_text(roles: _BasketTaskRoles) -> str: + left_text = _left_target_text(roles) + right_text = _right_target_text(roles) + if left_text == right_text: + return _plural(left_text) + return "target objects" + + +def _generic_target_text(roles: _BasketTaskRoles) -> str: + left_text = _left_target_text(roles) + right_text = _right_target_text(roles) + if left_text == right_text: + return left_text + return "target object" + + +def _target_task_description_text(roles: _BasketTaskRoles) -> str: + left_text = _left_target_text(roles) + right_text = _right_target_text(roles) + if left_text == right_text: + return _plural(left_text) + return f"{left_text}-and-{right_text}" + + +def _normalize_runtime_uid(value: str) -> str: + uid = _INVALID_UID_CHARS_RE.sub("_", value.strip()).strip("_").lower() + if not uid: + raise ValueError(f"Invalid runtime uid: {value!r}") + return uid + + +def _candidate_relative_runtime_uid(obj: _SceneObject) -> str: + if _is_container_like(obj): + return _container_runtime_uid(obj) + return _target_runtime_suffix(_base_name(obj)) + + +def _is_container_like(obj: _SceneObject) -> bool: + return any(keyword in _object_text(obj) for keyword in _CONTAINER_KEYWORDS) + + +def _string_list(value: Any) -> list[str]: + if not isinstance(value, list): + return [] + return [str(item).strip() for item in value if str(item).strip()] diff 
--git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index fa45fd03..5cfa8322 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -44,7 +44,9 @@ - `back_left_of` combines positive world x and negative world y. - `front_right_of` combines negative world x and positive world y. - `back_right_of` combines positive world x and positive world y. -- `inside` and `on` use the reference object's xy center.""" +- `inside` uses generated container slot offsets; multiple objects sharing a + container are distributed along the container XY long axis. +- `on` uses the reference object's xy center.""" class _BasketRolesLike(Protocol): diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py new file mode 100644 index 00000000..720f189e --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py @@ -0,0 +1,396 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from dataclasses import replace +from typing import Any + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _RelativePlacementSpec, + _RelativePlacementStepSpec, +) +from embodichain.gen_sim.action_agent_pipeline.generation.mesh_bounds import ( + _clean_vector3, + _iter_generated_scene_object_configs, + _mesh_config_world_xy_extents, +) +from embodichain.gen_sim.action_agent_pipeline.generation.relative_spec import ( + _SIDE_RELATIONS, + _normalize_relative_relation, +) + +__all__ = [ + "_inside_container_axis_offsets", + "_inside_container_slot_axis_and_distance", + "_make_relative_summary", + "_offset_position", + "_relative_release_offset", + "_side_relation_xy_offsets", + "_with_inside_container_slot_offsets", + "_with_self_relative_absolute_targets", +] + +_SIDE_RELATION_DISTANCE = 0.16 +_SIDE_RELEASE_Z_OFFSET = 0.12 +_CONTAINER_SLOT_MIN_OFFSET = 0.04 +_CONTAINER_SLOT_MAX_OFFSET = 0.12 +_CONTAINER_SLOT_FRACTION = 0.25 +_CONTAINER_SLOT_MAX_FRACTION = 0.40 +_CONTAINER_SLOT_AXIS_TIE_RATIO = 0.10 +_STAGING_Z_DELTA = 0.10 +_ON_RELEASE_Z_OFFSET = 0.2 +_ROBOT_VIEW_LEFT_WORLD_Y_SIGN = -1.0 +_ROBOT_VIEW_FRONT_WORLD_X_SIGN = -1.0 + + +def _relative_release_offset(relation: str) -> list[float]: + relation = _normalize_relative_relation(relation) + if relation == "inside": + return [0.0, 0.0, _SIDE_RELEASE_Z_OFFSET] + if relation == "on": + return [0.0, 0.0, _ON_RELEASE_Z_OFFSET] + if relation in _SIDE_RELATIONS: + x_offset, y_offset = _side_relation_xy_offsets(relation) + return [x_offset, y_offset, _SIDE_RELEASE_Z_OFFSET] + raise ValueError(f"Unsupported relative placement relation: {relation!r}.") + + +def _side_relation_xy_offsets(relation: str) -> tuple[float, float]: + relation = _normalize_relative_relation(relation) + left_y = _ROBOT_VIEW_LEFT_WORLD_Y_SIGN * 
_SIDE_RELATION_DISTANCE + right_y = -_ROBOT_VIEW_LEFT_WORLD_Y_SIGN * _SIDE_RELATION_DISTANCE + front_x = _ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE + behind_x = -_ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE + if relation == "left_of": + return 0.0, left_y + if relation == "right_of": + return 0.0, right_y + if relation == "front_of": + return front_x, 0.0 + if relation == "behind": + return behind_x, 0.0 + if relation == "front_left_of": + return front_x, left_y + if relation == "back_left_of": + return behind_x, left_y + if relation == "front_right_of": + return front_x, right_y + if relation == "back_right_of": + return behind_x, right_y + raise ValueError(f"Unsupported side relation: {relation!r}.") + + +def _with_self_relative_absolute_targets( + spec: _RelativePlacementSpec, + gym_config: Mapping[str, Any], +) -> _RelativePlacementSpec: + if not any(placement.reference_is_initial_pose for placement in spec.placements): + return spec + + generated_positions = { + str(obj.get("uid")): _clean_vector3(obj.get("init_pos", [0.0, 0.0, 0.0])) + for obj in gym_config.get("rigid_object", []) + } + placements = tuple( + _with_self_relative_absolute_target(placement, generated_positions) + for placement in spec.placements + ) + primary = placements[0] + return _RelativePlacementSpec( + table_source_uid=spec.table_source_uid, + moved_source_uid=primary.moved_source_uid, + reference_source_uid=primary.reference_source_uid, + moved_runtime_uid=primary.moved_runtime_uid, + reference_runtime_uid=primary.reference_runtime_uid, + relation=primary.relation, + active_side=primary.active_side, + task_description=spec.task_description, + task_prompt_summary=spec.task_prompt_summary, + basic_background_notes=spec.basic_background_notes, + action_sketch=spec.action_sketch, + release_offset=primary.release_offset, + high_offset=primary.high_offset, + placements=placements, + reference_is_initial_pose=primary.reference_is_initial_pose, + 
release_position=primary.release_position, + high_position=primary.high_position, + ) + + +def _with_self_relative_absolute_target( + placement: _RelativePlacementStepSpec, + generated_positions: Mapping[str, list[float]], +) -> _RelativePlacementStepSpec: + if not placement.reference_is_initial_pose: + return placement + initial_position = generated_positions.get(placement.moved_runtime_uid) + if initial_position is None: + raise ValueError( + "Generated relative config missing self-relative moved object " + f"{placement.moved_runtime_uid!r}." + ) + release_position = _offset_position(initial_position, placement.release_offset) + high_position = _offset_position(initial_position, placement.high_offset) + return _RelativePlacementStepSpec( + moved_source_uid=placement.moved_source_uid, + reference_source_uid=placement.reference_source_uid, + moved_runtime_uid=placement.moved_runtime_uid, + reference_runtime_uid=placement.reference_runtime_uid, + relation=placement.relation, + active_side=placement.active_side, + release_offset=placement.release_offset, + high_offset=placement.high_offset, + reference_is_initial_pose=True, + release_position=release_position, + high_position=high_position, + ) + + +def _with_inside_container_slot_offsets( + spec: _RelativePlacementSpec, + gym_config: Mapping[str, Any], +) -> _RelativePlacementSpec: + inside_groups: dict[str, list[int]] = {} + for index, placement in enumerate(spec.placements): + if placement.relation != "inside" or placement.reference_is_initial_pose: + continue + inside_groups.setdefault(placement.reference_runtime_uid, []).append(index) + + inside_groups = { + reference_uid: indices + for reference_uid, indices in inside_groups.items() + if len(indices) > 1 + } + if not inside_groups: + return spec + + object_configs = { + str(obj.get("uid")): obj + for obj in _iter_generated_scene_object_configs(gym_config) + if obj.get("uid") is not None + } + slot_offsets_by_index: dict[int, list[float]] = {} + for 
reference_uid, indices in inside_groups.items(): + container_config = object_configs.get(reference_uid) + axis, slot_distance = _inside_container_slot_axis_and_distance(container_config) + ordered_indices = _order_inside_container_slot_indices( + indices, + placements=spec.placements, + axis=axis, + object_configs=object_configs, + container_config=container_config, + ) + for index, axis_offset in zip( + ordered_indices, + _inside_container_axis_offsets(len(ordered_indices), slot_distance), + ): + release_offset = [0.0, 0.0, _SIDE_RELEASE_Z_OFFSET] + release_offset[0 if axis == "x" else 1] = axis_offset + slot_offsets_by_index[index] = [ + round(float(value), 6) for value in release_offset + ] + + if not slot_offsets_by_index: + return spec + + placements = tuple( + ( + _with_relative_release_offset(placement, slot_offsets_by_index[index]) + if index in slot_offsets_by_index + else placement + ) + for index, placement in enumerate(spec.placements) + ) + return _replace_relative_spec_placements(spec, placements) + + +def _with_relative_release_offset( + placement: _RelativePlacementStepSpec, + release_offset: Sequence[float], +) -> _RelativePlacementStepSpec: + clean_release_offset = [round(float(value), 6) for value in release_offset] + high_offset = list(clean_release_offset) + high_offset[2] = round(high_offset[2] + _STAGING_Z_DELTA, 6) + return replace( + placement, + release_offset=clean_release_offset, + high_offset=high_offset, + ) + + +def _replace_relative_spec_placements( + spec: _RelativePlacementSpec, + placements: tuple[_RelativePlacementStepSpec, ...], +) -> _RelativePlacementSpec: + primary = placements[0] + return replace( + spec, + moved_source_uid=primary.moved_source_uid, + reference_source_uid=primary.reference_source_uid, + moved_runtime_uid=primary.moved_runtime_uid, + reference_runtime_uid=primary.reference_runtime_uid, + relation=primary.relation, + active_side=primary.active_side, + release_offset=primary.release_offset, + 
high_offset=primary.high_offset, + placements=placements, + reference_is_initial_pose=primary.reference_is_initial_pose, + release_position=primary.release_position, + high_position=primary.high_position, + ) + + +def _inside_container_slot_axis_and_distance( + container_config: Mapping[str, Any] | None, +) -> tuple[str, float]: + extents = ( + _mesh_config_world_xy_extents(container_config) + if container_config is not None + else None + ) + if extents is None: + return "y", _CONTAINER_SLOT_MIN_OFFSET + + x_extent, y_extent = extents + axis = _inside_container_slot_axis(x_extent, y_extent) + axis_extent = x_extent if axis == "x" else y_extent + if axis_extent <= 0.0: + return "y", _CONTAINER_SLOT_MIN_OFFSET + + slot_distance = min( + max(axis_extent * _CONTAINER_SLOT_FRACTION, _CONTAINER_SLOT_MIN_OFFSET), + axis_extent * _CONTAINER_SLOT_MAX_FRACTION, + _CONTAINER_SLOT_MAX_OFFSET, + ) + return axis, round(float(slot_distance), 6) + + +def _inside_container_slot_axis(x_extent: float, y_extent: float) -> str: + max_extent = max(float(x_extent), float(y_extent)) + if max_extent <= 0.0: + return "y" + if abs(float(x_extent) - float(y_extent)) <= ( + max_extent * _CONTAINER_SLOT_AXIS_TIE_RATIO + ): + return "y" + return "x" if float(x_extent) > float(y_extent) else "y" + + +def _order_inside_container_slot_indices( + indices: list[int], + *, + placements: Sequence[_RelativePlacementStepSpec], + axis: str, + object_configs: Mapping[str, Mapping[str, Any]], + container_config: Mapping[str, Any] | None, +) -> list[int]: + if axis == "y": + side_order = {"left": 0, "right": 1} + return sorted( + indices, + key=lambda index: ( + side_order.get(placements[index].active_side, 1), + _relative_initial_axis_value( + placements[index], + axis_index=1, + object_configs=object_configs, + container_config=container_config, + ), + index, + ), + ) + + return sorted( + indices, + key=lambda index: ( + _relative_initial_axis_value( + placements[index], + axis_index=0, + 
object_configs=object_configs, + container_config=container_config, + ), + index, + ), + ) + + +def _relative_initial_axis_value( + placement: _RelativePlacementStepSpec, + *, + axis_index: int, + object_configs: Mapping[str, Mapping[str, Any]], + container_config: Mapping[str, Any] | None, +) -> float: + moved_config = object_configs.get(placement.moved_runtime_uid) + moved_position = _scene_config_init_position(moved_config) + container_position = _scene_config_init_position(container_config) + return float(moved_position[axis_index] - container_position[axis_index]) + + +def _scene_config_init_position( + obj_config: Mapping[str, Any] | None, +) -> list[float]: + if obj_config is None: + return [0.0, 0.0, 0.0] + return _clean_vector3(obj_config.get("init_pos", [0.0, 0.0, 0.0])) + + +def _inside_container_axis_offsets(count: int, slot_distance: float) -> list[float]: + if count <= 1: + return [0.0] + if count == 2: + return [ + round(-float(slot_distance), 6), + round(float(slot_distance), 6), + ] + step = (2.0 * float(slot_distance)) / float(count - 1) + return [round(-float(slot_distance) + step * index, 6) for index in range(count)] + + +def _offset_position( + position: Sequence[float], + offset: Sequence[float], +) -> list[float]: + return [ + round(float(position[index]) + float(offset[index]), 6) for index in range(3) + ] + + +def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]: + if len(spec.placements) == 1: + return { + "mode": "relative_placement", + "moved_object": spec.moved_runtime_uid, + "reference_object": spec.reference_runtime_uid, + "relation": spec.relation, + "active_arm": f"{spec.active_side}_arm", + "release_offset": spec.release_offset, + } + return { + "mode": "dual_arm_relative_placement", + "placements": [ + { + "moved_object": placement.moved_runtime_uid, + "reference_object": placement.reference_runtime_uid, + "relation": placement.relation, + "active_arm": f"{placement.active_side}_arm", + "release_offset": 
placement.release_offset, + } + for placement in spec.placements + ], + } diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py new file mode 100644 index 00000000..ad182c97 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py @@ -0,0 +1,793 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Callable, Mapping, Sequence +from typing import Any +import json + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _RelativePlacementSpec, + _RelativePlacementStepSpec, + _SceneObject, +) +from embodichain.gen_sim.action_agent_pipeline.generation.naming import ( + _base_name, + _candidate_relative_runtime_uid, + _container_runtime_uid, + _is_container_like, + _normalize_runtime_uid, + _string_list, + _target_runtime_suffix, +) +from embodichain.gen_sim.action_agent_pipeline.generation.scene_objects import ( + _arm_side_for_position, + _pick_table, + _position_side_axis_value, +) + +__all__ = [ + "_SIDE_RELATIONS", + "_build_relative_placement_spec_with_llm", + "_normalize_relative_relation", + "_relative_relation_phrase", + "_relative_scene_runtime_uid_mapping", +] + +_RELATIVE_RELATIONS = { + "inside", + "on", + "left_of", + "right_of", + "front_of", + "behind", + "front_left_of", + "back_left_of", + "front_right_of", + "back_right_of", +} + +_SIDE_RELATIONS = _RELATIVE_RELATIONS - {"inside", "on"} + +_SELF_REFERENCE_VALUES = { + "self", + "initial_self", + "initial_position", + "initial_pose", + "origin", + "itself", + "自身", + "自己", + "原位", + "初始位置", +} + +_RELATION_ALIASES = { + "in": "inside", + "into": "inside", + "inside": "inside", + "放入": "inside", + "放进": "inside", + "里面": "inside", + "on": "on", + "onto": "on", + "on_top": "on", + "on_top_of": "on", + "above": "on", + "top": "on", + "上": "on", + "上方": "on", + "上面": "on", + "叠放": "on", + "left": "left_of", + "left_of": "left_of", + "to_the_left_of": "left_of", + "左": "left_of", + "左边": "left_of", + "front_left": "front_left_of", + "front_left_of": "front_left_of", + "left_front": "front_left_of", + "left_front_of": "front_left_of", + "to_the_front_left_of": "front_left_of", + "左前": "front_left_of", + "左前方": "front_left_of", + 
def _build_relative_placement_spec_with_llm(
    *,
    scene_objects: list[_SceneObject],
    project_name: str,
    task_description: str,
    model: str | None,
    release_offset_fn: Callable[[str], Sequence[float]],
    staging_z_delta: float,
    task_llm_caller: Callable[..., Mapping[str, Any]] | None = None,
) -> _RelativePlacementSpec:
    """Parse the task text with an LLM and expand the reply into a spec.

    The scene must contain at least one ``background`` object (the table is
    picked from those) and at least one ``rigid_object``.  A compact summary
    of every scene object is passed to ``task_llm_caller`` -- the built-in
    ``_call_relative_task_llm`` when the argument is omitted -- and the reply
    is validated and expanded by ``_apply_relative_task_response``.

    Raises:
        ValueError: If the scene has no background object or no rigid object.
    """
    backgrounds = []
    movables = []
    for scene_object in scene_objects:
        role = scene_object.source_role
        if role == "background":
            backgrounds.append(scene_object)
        elif role == "rigid_object":
            movables.append(scene_object)

    if not backgrounds:
        raise ValueError("Relative placement generation requires a background table.")
    if not movables:
        raise ValueError(
            "Relative placement generation requires a movable rigid object."
        )

    table = _pick_table(backgrounds)
    caller = _call_relative_task_llm if task_llm_caller is None else task_llm_caller

    # Key order matters: the summary is serialised verbatim into the prompt.
    scene_summary = []
    for scene_object in scene_objects:
        config = scene_object.config
        scene_summary.append(
            {
                "source_uid": scene_object.source_uid,
                "role": scene_object.source_role,
                "object_type": _base_name(scene_object),
                "is_container_like": _is_container_like(scene_object),
                "mesh": config.get("shape", {}).get("fpath"),
                "init_pos": config.get("init_pos"),
            }
        )

    llm_reply = caller(
        project_name=project_name,
        task_description=task_description,
        scene_summary=scene_summary,
        model=model,
    )
    return _apply_relative_task_response(
        response=llm_reply,
        table_source_uid=table.source_uid,
        scene_objects=scene_objects,
        rigid_objects=movables,
        task_description=task_description,
        release_offset_fn=release_offset_fn,
        staging_z_delta=staging_z_delta,
    )
This JSON is used to generate " + "task_prompt.txt, basic_background.txt, atom_actions.txt, and " + "agent_success; a second LLM will later read those prompts to generate " + "the executable graph JSON.\n\n" + "Return exactly one JSON object with this schema:\n" + "{\n" + ' "placements": [\n' + " {\n" + ' "moved_object": "",\n' + ' "reference_object": "",\n' + ' "goal_relation": ' + '"inside|on|left_of|right_of|front_of|behind|front_left_of|back_left_of|front_right_of|back_right_of",\n' + ' "arm": "left|right|auto"\n' + " }\n" + " ],\n" + ' "task_prompt_summary": "",\n' + ' "basic_background_notes": "",\n' + ' "action_sketch": [\n' + ' "grasp moved_object",\n' + ' "move above the relation target pose",\n' + ' "place at the release pose with PlaceAction"\n' + " ]\n" + "}\n\n" + "Rules:\n" + "- Use only source_uid values from the scene objects listed below.\n" + "- Return one placement for a single-arm task and exactly two placements " + "for a dual-arm task.\n" + "- Treat the task as dual-arm when it explicitly says 双臂, 两臂, both " + "arms, two arms, or when it describes separate work for the left arm and " + "the right arm even if it does not literally say 双臂.\n" + "- Do not invent a second placement when the task only moves one object.\n" + "- moved_object is the object to grasp and move.\n" + "- reference_object is the object used as the spatial reference, " + "container, or support.\n" + "- reference_object may be a rigid_object or a background object such as " + "a pad, tray, basket, or container.\n" + "- For single-object directional tasks such as moving the only object " + "forward, left, front-left, or back-right from its initial position, set " + "reference_object to the same source_uid as moved_object (or 'self'). 
" + "This means the generator will use the object's initial position as a " + "fixed anchor, not the object's moving runtime pose.\n" + "- Within each placement, moved_object and reference_object must be " + "different unless the task is an initial-position directional move.\n" + "- For dual-arm tasks, the placements must use two different moved_object " + "values and one left arm plus one right arm. Use arm='auto' only when " + "the user did not specify which arm handles that placement.\n" + "- arm selects the single UR5 arm that should manipulate moved_object. " + "Use arm='left' for explicit left-arm instructions such as 左臂, 左机械臂, " + "left arm, or left UR5; use arm='right' for explicit right-arm " + "instructions such as 右臂, 右机械臂, right arm, or right UR5; use " + "arm='auto' when the task does not specify an arm.\n" + "- For Chinese/English left/right/front/back, use the relation enums " + "from the rotated robot-view perspective. front_of means negative " + "world-x; behind means positive world-x; left_of means negative " + "world-y; right_of means positive world-y. Diagonal relations combine " + "both axes: front_left_of, back_left_of, front_right_of, back_right_of.\n" + "- If the task says to release an object above a basket/container so it " + "falls into it, use goal_relation='inside'.\n" + "- If the task says to stack/place one object on another non-container " + "support, use goal_relation='on'.\n" + "- Do not return numeric offsets, object poses, scales, success JSON, " + "robot config, or full prompt files. 
The generator computes those " + "deterministically.\n\n" + f"Project: {project_name}\n" + f"Task description:\n{task_description}\n" + f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}" + ) + llm = create_chat_openai( + temperature=0.0, + model=model, + usage_stage="config_generation.relative_task", + ) + response = llm.invoke( + [ + SystemMessage( + content=( + "You produce strict JSON specs for simulation config " + "generation. Do not include markdown." + ) + ), + HumanMessage(content=prompt), + ] + ) + content = getattr(response, "content", response) + return extract_json_object(content) + + +def _apply_relative_task_response( + *, + response: Mapping[str, Any], + table_source_uid: str, + scene_objects: list[_SceneObject], + rigid_objects: list[_SceneObject], + task_description: str, + release_offset_fn: Callable[[str], Sequence[float]], + staging_z_delta: float, +) -> _RelativePlacementSpec: + by_uid = {obj.source_uid: obj for obj in scene_objects} + runtime_uids = _relative_scene_runtime_uid_mapping( + scene_objects, + table_source_uid=table_source_uid, + ) + + placement_entries = _relative_placement_entries(response) + if len(placement_entries) > 2: + raise ValueError("Relative placement supports at most two arm placements.") + + forced_arm_sides = _relative_forced_arm_sides( + placement_entries, + by_uid=by_uid, + rigid_objects=rigid_objects, + ) + placements = tuple( + _build_relative_placement_step( + entry=entry, + by_uid=by_uid, + scene_objects=scene_objects, + rigid_objects=rigid_objects, + runtime_uids=runtime_uids, + forced_side=forced_side, + release_offset_fn=release_offset_fn, + staging_z_delta=staging_z_delta, + ) + for entry, forced_side in zip(placement_entries, forced_arm_sides) + ) + _validate_relative_placements(placements) + + summary = str(response.get("task_prompt_summary", "")).strip() + if not summary: + summary = _default_relative_plan_summary(placements) + background_notes = 
def _relative_placement_entries(response: Mapping[str, Any]) -> list[Mapping[str, Any]]:
    """Return the per-arm placement objects contained in an LLM response.

    A response without a ``placements`` key is treated as one flat placement
    and is returned wrapped in a single-element list.

    Raises:
        ValueError: If ``placements`` is not a non-empty list, or if one of
            its items is not a mapping.
    """
    raw_placements = response.get("placements")
    if raw_placements is None:
        return [response]
    if not (isinstance(raw_placements, list) and raw_placements):
        raise ValueError("LLM response placements must be a non-empty list.")

    first_bad_index = next(
        (
            position
            for position, item in enumerate(raw_placements)
            if not isinstance(item, Mapping)
        ),
        None,
    )
    if first_bad_index is not None:
        raise ValueError(f"Placement {first_bad_index} must be a JSON object.")
    return list(raw_placements)


def _relative_forced_arm_sides(
    placement_entries: list[Mapping[str, Any]],
    *,
    by_uid: Mapping[str, _SceneObject],
    rigid_objects: list[_SceneObject],
) -> list[str | None]:
    """Decide which arm, if any, must be forced onto each placement entry.

    Only a two-entry (dual-arm) plan can force a side; any other length
    yields ``None`` per entry.  For two entries:

    * both arms explicit -> nothing is forced;
    * one arm explicit -> the ``auto`` entry receives the opposite arm;
    * no arm explicit -> sides are inferred from the moved objects'
      ``init_pos``; when that does not give one left and one right, the entry
      whose side-axis value is smaller (or equal) takes ``"left"``.
    """
    if len(placement_entries) != 2:
        return [None] * len(placement_entries)

    requested = [
        _normalize_relative_arm(entry.get("arm")) for entry in placement_entries
    ]
    explicit = [side for side in requested if side != "auto"]
    if len(explicit) == 2:
        return [None, None]
    if explicit:
        other_side = "left" if explicit[0] != "left" else "right"
        return [other_side if side == "auto" else side for side in requested]

    moved_uids = [
        _resolve_rigid_source_uid(
            entry.get("moved_object"),
            rigid_objects,
            field_name="moved_object",
        )
        for entry in placement_entries
    ]
    start_positions = [
        _vector3(by_uid[uid].config.get("init_pos", [0.0, 0.0, 0.0]))
        for uid in moved_uids
    ]
    sides_by_position = [
        _arm_side_for_position(position) for position in start_positions
    ]
    if set(sides_by_position) == {"left", "right"}:
        return sides_by_position

    first_value, second_value = (
        _position_side_axis_value(position) for position in start_positions
    )
    return ["left", "right"] if first_value <= second_value else ["right", "left"]
def _validate_relative_placements(
    placements: tuple[_RelativePlacementStepSpec, ...],
) -> None:
    """Check that a placement plan is non-empty and internally consistent.

    Raises:
        ValueError: If there is no placement, if two placements move the same
            object, or if a two-placement plan does not use exactly one left
            arm and one right arm.
    """
    if not placements:
        raise ValueError("Relative placement requires at least one placement.")

    seen_moved_uids = set()
    for step in placements:
        if step.moved_source_uid in seen_moved_uids:
            raise ValueError(
                "Relative placements must use distinct moved_object values."
            )
        seen_moved_uids.add(step.moved_source_uid)

    if len(placements) != 2:
        return
    first, second = placements
    if {first.active_side, second.active_side} != {"left", "right"}:
        raise ValueError(
            "Dual-arm relative placement requires one left arm and one right arm."
        )


def _resolve_rigid_source_uid(
    value: Any,
    rigid_objects: list[_SceneObject],
    *,
    field_name: str,
) -> str:
    """Resolve an LLM-provided name against the movable rigid objects only."""
    return _resolve_scene_source_uid(value, rigid_objects, field_name=field_name)


def _resolve_relative_reference_source_uid(
    value: Any,
    *,
    moved_source_uid: str,
    scene_objects: list[_SceneObject],
) -> str:
    """Resolve ``reference_object``, mapping self-reference aliases to the mover.

    A value that normalises to one of ``_SELF_REFERENCE_VALUES`` means "the
    moved object's own initial position" and yields ``moved_source_uid``.
    Anything else, including ``None``, is resolved against every scene object.
    """
    is_self_reference = False
    if value is not None:
        token = str(value).strip().lower().replace("-", "_").replace(" ", "_")
        is_self_reference = token in _SELF_REFERENCE_VALUES
    if is_self_reference:
        return moved_source_uid
    return _resolve_scene_source_uid(
        value,
        scene_objects,
        field_name="reference_object",
    )


def _resolve_scene_source_uid(
    value: Any,
    scene_objects: list[_SceneObject],
    *,
    field_name: str,
) -> str:
    """Map an LLM-provided object name onto exactly one ``source_uid``.

    An exact (whitespace-stripped) ``source_uid`` match wins outright.
    Otherwise the name is normalised and compared with each object's
    normalised uid, base name and candidate runtime uid.

    Raises:
        ValueError: If ``value`` is ``None``, matches nothing, or matches
            more than one object.
    """
    if value is None:
        raise ValueError(f"LLM response missing required {field_name}.")

    text = str(value).strip()
    if any(candidate.source_uid == text for candidate in scene_objects):
        return text

    wanted = _normalize_runtime_uid(text)
    matches = []
    for candidate in scene_objects:
        if (
            _normalize_runtime_uid(candidate.source_uid) == wanted
            or _base_name(candidate) == wanted
            or _candidate_relative_runtime_uid(candidate) == wanted
        ):
            matches.append(candidate.source_uid)

    if not matches:
        raise ValueError(f"LLM returned unknown {field_name}: {text!r}.")
    if len(matches) > 1:
        raise ValueError(
            f"LLM returned ambiguous {field_name}: {text!r}; candidates: {matches}."
        )
    return matches[0]
def _normalize_relative_arm(value: Any) -> str:
    """Map a free-form arm label to ``"left"``, ``"right"`` or ``"auto"``.

    ``None`` means ``"auto"``.  Any other value is stringified, stripped,
    lower-cased and has ``-`` and spaces turned into ``_`` before it is looked
    up in the English and Chinese aliases below.

    Raises:
        ValueError: If the label matches none of the known aliases.
    """
    if value is None:
        return "auto"

    token = str(value).strip().lower().replace("-", "_").replace(" ", "_")
    aliases_by_side = {
        "auto": {
            "",
            "auto",
            "automatic",
            "unspecified",
            "none",
            "null",
            "default",
            "自动",
            "默认",
            "未指定",
            "不指定",
        },
        "left": {
            "left",
            "left_arm",
            "left_ur5",
            "左",
            "左臂",
            "左机械臂",
            "左手",
            "左手臂",
        },
        "right": {
            "right",
            "right_arm",
            "right_ur5",
            "右",
            "右臂",
            "右机械臂",
            "右手",
            "右手臂",
        },
    }
    for side, aliases in aliases_by_side.items():
        if token in aliases:
            return side
    raise ValueError(
        f"Unsupported relative placement arm {value!r}; expected 'left', "
        "'right', or 'auto'."
    )


def _relative_runtime_uid_mapping(
    rigid_objects: list[_SceneObject],
) -> dict[str, str]:
    """Assign a runtime uid to every rigid object, keyed by ``source_uid``.

    Container-like objects use ``_container_runtime_uid``.  Other objects use
    the runtime suffix of their base name when that base name is unique among
    the rigid objects, and their normalised ``source_uid`` otherwise.  Any
    runtime uid that still collides after that also falls back to the
    normalised ``source_uid``.
    """
    from collections import Counter

    # Count base names once instead of rescanning the list for every object.
    base_names = [_base_name(obj) for obj in rigid_objects]
    base_name_counts = Counter(base_names)

    candidates: dict[str, str] = {}
    for obj, base_name in zip(rigid_objects, base_names):
        if _is_container_like(obj):
            candidates[obj.source_uid] = _container_runtime_uid(obj)
        elif base_name_counts[base_name] == 1:
            candidates[obj.source_uid] = _target_runtime_suffix(base_name)
        else:
            candidates[obj.source_uid] = _normalize_runtime_uid(obj.source_uid)

    uid_counts = Counter(candidates.values())
    return {
        source_uid: (
            runtime_uid
            if uid_counts[runtime_uid] == 1
            else _normalize_runtime_uid(source_uid)
        )
        for source_uid, runtime_uid in candidates.items()
    }


def _relative_scene_runtime_uid_mapping(
    scene_objects: list[_SceneObject],
    *,
    table_source_uid: str,
) -> dict[str, str]:
    """Assign a runtime uid to every scene object, keyed by ``source_uid``.

    The table is always called ``"table"``.  Rigid objects take the uid chosen
    by ``_relative_runtime_uid_mapping``; every other object takes its
    candidate relative runtime uid.  Colliding uids fall back to the
    normalised ``source_uid``, except for the table, which keeps its name.
    """
    from collections import Counter

    rigid_runtime_uids = _relative_runtime_uid_mapping(
        [obj for obj in scene_objects if obj.source_role == "rigid_object"]
    )

    candidates: dict[str, str] = {}
    for obj in scene_objects:
        if obj.source_uid == table_source_uid:
            candidates[obj.source_uid] = "table"
        elif obj.source_role == "rigid_object":
            candidates[obj.source_uid] = rigid_runtime_uids[obj.source_uid]
        else:
            candidates[obj.source_uid] = _candidate_relative_runtime_uid(obj)

    uid_counts = Counter(candidates.values())
    return {
        source_uid: (
            runtime_uid
            if source_uid == table_source_uid or uid_counts[runtime_uid] == 1
            else _normalize_runtime_uid(source_uid)
        )
        for source_uid, runtime_uid in candidates.items()
    }


def _default_relative_task_summary(
    moved_uid: str,
    reference_uid: str,
    relation: str,
) -> str:
    """Return the fallback one-sentence summary for a single placement."""
    return (
        f"Move `{moved_uid}` so its final state is "
        f"{_relative_relation_phrase(relation)} `{reference_uid}`."
    )


def _default_relative_plan_summary(
    placements: Sequence[_RelativePlacementStepSpec],
) -> str:
    """Return the fallback summary used when the LLM supplied none.

    One placement produces the single-placement sentence; otherwise every
    placement is listed with its assigned arm in one dual-arm sentence.
    """
    if len(placements) == 1:
        placement = placements[0]
        return _default_relative_task_summary(
            placement.moved_runtime_uid,
            placement.reference_runtime_uid,
            placement.relation,
        )
    placement_text = "; ".join(
        f"use the {placement.active_side} UR5 to move "
        f"`{placement.moved_runtime_uid}` "
        f"{_relative_relation_phrase(placement.relation)} "
        f"`{placement.reference_runtime_uid}`"
        for placement in placements
    )
    return f"Use both UR5 arms for a dual-arm relative placement: {placement_text}."
def _default_relative_action_sketch(
    placements: Sequence[_RelativePlacementStepSpec],
) -> list[str]:
    """Return the fallback action sketch used when the LLM supplied none.

    One placement yields a three-step grasp / move / place sketch.  Otherwise
    the sketch opens with a joint grasp step followed by a move step and a
    place step for each placement.
    """
    if len(placements) == 1:
        placement = placements[0]
        return [
            f"grasp {placement.moved_runtime_uid}",
            (
                f"move above the {placement.relation} release pose relative to "
                f"{placement.reference_runtime_uid}"
            ),
            "place at the release pose with PlaceAction",
        ]

    sketch = ["grasp both moved objects with their assigned arms"]
    for placement in placements:
        sketch.append(
            f"use {placement.active_side}_arm to move "
            f"{placement.moved_runtime_uid} above the release pose relative "
            f"to {placement.reference_runtime_uid}"
        )
        sketch.append(f"place {placement.moved_runtime_uid} with PlaceAction")
    return sketch


def _relative_relation_phrase(relation: str) -> str:
    """Return the English phrase for a relation name or alias.

    The relation is canonicalised by ``_normalize_relative_relation`` first,
    so aliases are accepted and unsupported values raise ``ValueError`` there.

    Raises:
        ValueError: If the canonical relation has no phrase.
    """
    phrases = {
        "inside": "inside",
        "on": "on top of",
        "left_of": "to the left of",
        "right_of": "to the right of",
        "front_of": "in front of",
        "behind": "behind",
        "front_left_of": "to the front-left of",
        "back_left_of": "to the back-left of",
        "front_right_of": "to the front-right of",
        "back_right_of": "to the back-right of",
    }
    canonical = _normalize_relative_relation(relation)
    try:
        return phrases[canonical]
    except KeyError:
        # Only reachable if a canonical relation is added without a phrase.
        raise ValueError(
            f"Unsupported relative placement relation: {canonical!r}."
        ) from None


def _vector3(value: Any) -> list[float]:
    """Return ``value`` as a list of three floats.

    Raises:
        ValueError: If ``value`` is not a list or tuple of length three, or
            if a component cannot be converted to ``float``.
    """
    if not isinstance(value, (list, tuple)) or len(value) != 3:
        raise ValueError(f"Expected a 3-vector, got {value!r}.")
    try:
        return [float(item) for item in value]
    except (TypeError, ValueError) as exc:
        # A null or textual component is the same kind of malformed input as
        # a wrong length, so report it with the same exception and message.
        raise ValueError(f"Expected a 3-vector, got {value!r}.") from exc
def _single_directory_name(output_dir_name: str) -> str:
    """Return ``output_dir_name`` reduced to its one canonical path component.

    Validation is done on the parsed component, not the raw text, so spellings
    such as ``"../"`` or ``"./.."`` are rejected and ``"meshes/"`` becomes
    ``"meshes"``.

    Raises:
        ValueError: If the name is absolute, has more or fewer than one
            component, or is ``.`` / ``..``.
    """
    candidate = Path(output_dir_name)
    parts = candidate.parts
    if candidate.is_absolute() or len(parts) != 1 or parts[0] in {".", ".."}:
        raise ValueError(
            "target replacement output_dir_name must be a single relative "
            f"directory name, got: {output_dir_name!r}"
        )
    return parts[0]


def _normalize_target_replacements(
    target_replacements: Sequence[TargetReplacementSpec] | None,
) -> tuple[TargetReplacementSpec, ...]:
    """Validate replacement specs and return whitespace-stripped copies.

    ``None`` or an empty sequence gives an empty tuple.  Each spec must have a
    non-empty ``source_uid``, ``prompt`` and ``output_dir_name``; the
    directory name must be one relative path component; source uids and
    output directories must be unique across the specs.

    Raises:
        TypeError: If an item is not a ``TargetReplacementSpec``.
        ValueError: If a field is empty, the directory name is not a single
            relative component, or a source uid / output directory repeats.
    """
    if not target_replacements:
        return ()

    normalized = []
    seen_source_uids = set()
    seen_output_dirs = set()
    for replacement in target_replacements:
        if not isinstance(replacement, TargetReplacementSpec):
            raise TypeError(
                "target_replacements must contain TargetReplacementSpec values."
            )
        source_uid = str(replacement.source_uid).strip()
        prompt = str(replacement.prompt).strip()
        output_dir_name = str(replacement.output_dir_name).strip()
        if not source_uid:
            raise ValueError("target replacement source_uid must be non-empty.")
        if not prompt:
            raise ValueError("target replacement prompt must be non-empty.")
        if not output_dir_name:
            raise ValueError("target replacement output_dir_name must be non-empty.")

        # Canonicalise before the duplicate check so "a" and "a/" are seen as
        # the same directory.
        output_dir_name = _single_directory_name(output_dir_name)

        if source_uid in seen_source_uids:
            raise ValueError(f"Duplicate target replacement source uid: {source_uid}")
        if output_dir_name in seen_output_dirs:
            raise ValueError(
                f"Duplicate target replacement output dir: {output_dir_name}"
            )
        seen_source_uids.add(source_uid)
        seen_output_dirs.add(output_dir_name)
        normalized.append(
            TargetReplacementSpec(
                source_uid=source_uid,
                prompt=prompt,
                output_dir_name=output_dir_name,
            )
        )
    return tuple(normalized)


def _validate_target_replacement_sources(
    roles: _BasketTaskRoles,
    replacement_specs: Sequence[TargetReplacementSpec],
) -> None:
    """Ensure every replacement refers to the left or right basket target.

    Raises:
        ValueError: If a spec names a source uid that is neither
            ``roles.left_target_source_uid`` nor
            ``roles.right_target_source_uid``.
    """
    if not replacement_specs:
        return

    target_source_uids = {
        roles.left_target_source_uid,
        roles.right_target_source_uid,
    }
    unknown = [
        replacement.source_uid
        for replacement in replacement_specs
        if replacement.source_uid not in target_source_uids
    ]
    if unknown:
        raise ValueError(
            "target_replacements must reference the selected basket target "
            f"source uid(s) {sorted(target_source_uids)}, got: {unknown}"
        )
def _resolve_reusable_target_replacement_mesh_path(
    *,
    output_root: Path,
    prompt: str,
    output_name: str,
) -> Path | None:
    """Return a previously generated mesh that may be reused, else ``None``.

    Reuse requires ``output_root / output_name`` to exist.  With no manifest
    next to it, that file is reused as-is.  With a manifest, its ``prompt``
    and ``output_name`` must equal the requested ones; the manifest's
    ``mesh_path`` is then preferred when it points at an existing file.  An
    unreadable or malformed manifest disables reuse so the mesh is generated
    again.
    """
    expected_mesh_path = (output_root / output_name).expanduser().resolve()
    if not expected_mesh_path.is_file():
        return None

    manifest_path = _target_replacement_manifest_path(output_root)
    if not manifest_path.is_file():
        return expected_mesh_path

    try:
        manifest = read_json(manifest_path)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return None
    if not isinstance(manifest, Mapping):
        # Valid JSON that is not an object cannot describe a replacement.
        return None

    if manifest.get("prompt") != prompt or manifest.get("output_name") != output_name:
        return None

    manifest_mesh_path = Path(
        str(manifest.get("mesh_path", expected_mesh_path))
    ).expanduser()
    if not manifest_mesh_path.is_absolute():
        manifest_mesh_path = (output_root / manifest_mesh_path).resolve()
    else:
        manifest_mesh_path = manifest_mesh_path.resolve()
    if manifest_mesh_path.is_file():
        return manifest_mesh_path
    return expected_mesh_path


def _write_target_replacement_manifest(
    *,
    output_root: Path,
    prompt: str,
    output_name: str,
    mesh_path: Path,
) -> None:
    """Record which prompt and output name produced ``mesh_path``.

    The manifest is written into ``output_root`` and stores the mesh path as
    a resolved POSIX-style string.
    """
    write_json(
        _target_replacement_manifest_path(output_root),
        {
            "prompt": prompt,
            "output_name": output_name,
            "mesh_path": mesh_path.expanduser().resolve().as_posix(),
        },
    )


def _target_replacement_manifest_path(output_root: Path) -> Path:
    """Return the manifest file location inside ``output_root``."""
    return output_root / _TARGET_REPLACEMENT_MANIFEST_FILENAME


def _run_prompt2geometry_replacement(
    *,
    prompt: str,
    output_root: Path,
    output_name: str,
) -> dict[str, Any]:
    """Run the prompt2geometry pipeline with the loaded configuration.

    The pipeline package is imported inside the function, so it is only
    loaded when a replacement is actually generated.
    """
    from embodichain.gen_sim.action_agent_pipeline.gym_project_api.prompt2geometry import (
        Prompt2GeometryRequest,
        load_prompt2geometry_config,
        run_prompt2geometry,
    )

    cfg = load_prompt2geometry_config()
    return run_prompt2geometry(
        Prompt2GeometryRequest(
            prompt=prompt,
            output_root=output_root,
            output_name=output_name,
            zimage_base_url=cfg.zimage_base_url,
            sam3_base_url=cfg.sam3_base_url,
            sam3d_base_url=cfg.sam3d_base_url,
            llm_api_key=cfg.llm_api_key,
            llm_model=cfg.llm_model,
            llm_base_url=cfg.llm_base_url,
            llm_timeout_s=cfg.llm_timeout_s,
        )
    )
def _replacement_runtime_noun(prompt: str) -> str:
    """Derive a runtime noun from the ASCII words of a generation prompt.

    Lower-case ``[a-z0-9]`` runs are extracted, leading articles (``a``,
    ``an``, ``the``) are dropped and the rest is joined with underscores.
    When nothing remains, ``"replacement_object"`` is used.  The result is
    passed through ``_normalize_runtime_uid``.
    """
    words = re.findall(r"[a-z0-9]+", prompt.lower())
    first_kept = 0
    while first_kept < len(words) and words[first_kept] in {"a", "an", "the"}:
        first_kept += 1
    stem = "_".join(words[first_kept:]) or "replacement_object"
    return _normalize_runtime_uid(stem)


def _apply_replacement_names(
    roles: _BasketTaskRoles,
    resolved_replacements: Sequence[_ResolvedTargetReplacement],
) -> _BasketTaskRoles:
    """Return ``roles`` with target nouns taken from resolved replacements.

    A replaced target takes its replacement's ``runtime_noun``; an unreplaced
    target keeps its current noun.  The runtime uids become ``left_<noun>`` /
    ``right_<noun>``, and the shared ``target_noun`` is the common noun when
    both sides agree and ``"target_object"`` otherwise.  All other role
    fields are copied unchanged.
    """
    by_source_uid = {item.source_uid: item for item in resolved_replacements}

    left_hit = by_source_uid.get(roles.left_target_source_uid)
    if left_hit is None:
        left_noun = roles.left_target_noun
    else:
        left_noun = left_hit.runtime_noun

    right_hit = by_source_uid.get(roles.right_target_source_uid)
    if right_hit is None:
        right_noun = roles.right_target_noun
    else:
        right_noun = right_hit.runtime_noun

    if left_noun == right_noun:
        shared_noun = left_noun
    else:
        shared_noun = "target_object"

    return _BasketTaskRoles(
        table_source_uid=roles.table_source_uid,
        container_source_uid=roles.container_source_uid,
        left_target_source_uid=roles.left_target_source_uid,
        right_target_source_uid=roles.right_target_source_uid,
        container_runtime_uid=roles.container_runtime_uid,
        left_target_runtime_uid=f"left_{left_noun}",
        right_target_runtime_uid=f"right_{right_noun}",
        target_noun=shared_noun,
        left_target_noun=left_noun,
        right_target_noun=right_noun,
        container_noun=roles.container_noun,
    )
def _refine_roles_with_llm(
    *,
    roles: _BasketTaskRoles,
    scene_objects: list[_SceneObject],
    project_name: str,
    model: str | None,
) -> _BasketTaskRoles:
    """Let an LLM confirm or correct the basket-task role assignment.

    The current ``roles`` are sent as defaults together with a summary of the
    scene.  A field the reply omits, sets to ``null`` or leaves blank keeps
    its default.  Both targets receive the same (normalised) target noun.

    Raises:
        ValueError: If the reply names a source uid that is not in the scene,
            or does not use three distinct objects for the left target, the
            right target and the container.
    """
    response = _call_role_llm(
        project_name=project_name,
        scene_summary=[
            {
                "source_uid": obj.source_uid,
                "role": obj.source_role,
                "mesh": obj.config.get("shape", {}).get("fpath"),
                "init_pos": obj.config.get("init_pos"),
            }
            for obj in scene_objects
        ],
        default_roles={
            "container_object": roles.container_source_uid,
            "left_target_object": roles.left_target_source_uid,
            "right_target_object": roles.right_target_source_uid,
            "target_noun": roles.target_noun,
            "container_runtime_uid": roles.container_runtime_uid,
        },
        model=model,
    )

    def _answer(key: str, default: str) -> str:
        # dict.get only applies its default when the key is absent, so an
        # explicit JSON null would otherwise become the string "None".
        raw = response.get(key)
        text = "" if raw is None else str(raw).strip()
        return text or default

    source_uids = {obj.source_uid for obj in scene_objects}
    left_target = _answer("left_target_object", roles.left_target_source_uid)
    right_target = _answer("right_target_object", roles.right_target_source_uid)
    container = _answer("container_object", roles.container_source_uid)
    for uid in (left_target, right_target, container):
        if uid not in source_uids:
            raise ValueError(f"LLM returned unknown source uid: {uid!r}")
    if len({left_target, right_target, container}) != 3:
        raise ValueError("LLM role mapping must use three distinct source objects.")

    target_noun = _normalize_runtime_uid(_answer("target_noun", roles.target_noun))
    container_runtime_uid = _normalize_runtime_uid(
        _answer("container_runtime_uid", roles.container_runtime_uid)
    )
    return _BasketTaskRoles(
        table_source_uid=roles.table_source_uid,
        container_source_uid=container,
        left_target_source_uid=left_target,
        right_target_source_uid=right_target,
        container_runtime_uid=container_runtime_uid,
        left_target_runtime_uid=f"left_{target_noun}",
        right_target_runtime_uid=f"right_{target_noun}",
        target_noun=target_noun,
        left_target_noun=target_noun,
        right_target_noun=target_noun,
        container_noun=_display_noun(container_runtime_uid),
    )
" + "Return only one JSON object with keys: container_object, " + "left_target_object, right_target_object, target_noun, " + "container_runtime_uid. Use only source_uid values from the scene. The " + "rotated robot-view left target starts on the negative-y side, and the " + "rotated robot-view right target starts on the positive-y side.\n\n" + f"Project: {project_name}\n" + f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}\n" + f"Default roles:\n{json.dumps(default_roles, ensure_ascii=False, indent=2)}" + ) + llm = create_chat_openai( + temperature=0.0, + model=model, + usage_stage="config_generation.role_refinement", + ) + response = llm.invoke( + [ + SystemMessage( + content=( + "You produce strict JSON role mappings for simulation config " + "generation. Do not include markdown." + ) + ), + HumanMessage(content=prompt), + ] + ) + content = getattr(response, "content", response) + return extract_json_object(content) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/scene_objects.py b/embodichain/gen_sim/action_agent_pipeline/generation/scene_objects.py new file mode 100644 index 00000000..e188d89a --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/scene_objects.py @@ -0,0 +1,262 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Mapping +import copy +from pathlib import Path +from typing import Any +import re + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _BasketTaskRoles, + _SceneObject, +) +from embodichain.gen_sim.action_agent_pipeline.generation.naming import ( + _base_name, + _container_runtime_uid, + _display_noun, + _is_container_like, + _object_text, + _target_noun, +) + +__all__ = [ + "_arm_side_for_position", + "_collect_scene_objects", + "_infer_basket_task_roles", + "_infer_project_name", + "_pick_container", + "_pick_left_right_targets", + "_pick_table", + "_position_side_axis_value", + "_resolve_gym_config_path", + "_side_axis_value", +] + +_PROJECT_NAME_RE = re.compile(r"^[0-9]+_gym_project$") +_GYM_CONFIG_FILENAMES = frozenset({"gym_config.json", "gym_config_merged.json"}) +_GYM_CONFIG_PREFERENCE = ("gym_config_merged.json", "gym_config.json") +_DUAL_UR5_SIDE_AXIS_INDEX = 1 + + +def _resolve_gym_config_path(input_path: Path) -> Path: + if input_path.is_file(): + if input_path.name not in _GYM_CONFIG_FILENAMES: + expected = ", ".join(sorted(_GYM_CONFIG_FILENAMES)) + raise ValueError(f"Expected one of {expected}, got: {input_path}") + return input_path + + direct = _preferred_gym_config_in_dir(input_path) + if direct is not None: + return direct + + formatted_scene_dirs = sorted( + { + path.parent + for filename in _GYM_CONFIG_FILENAMES + for path in input_path.glob(f"formatted_tabletop_scene/*/{filename}") + } + ) + formatted_matches = [ + path + for scene_dir in formatted_scene_dirs + if (path := _preferred_gym_config_in_dir(scene_dir)) is not None + ] + if len(formatted_matches) == 1: + return formatted_matches[0] + if len(formatted_matches) > 1: + matches = ", ".join(path.as_posix() for path in formatted_matches) + raise ValueError(f"Multiple formatted gym config files found: {matches}") 
+ + recursive_scene_dirs = sorted( + { + path.parent + for filename in _GYM_CONFIG_FILENAMES + for path in input_path.rglob(filename) + } + ) + recursive_matches = [ + path + for scene_dir in recursive_scene_dirs + if (path := _preferred_gym_config_in_dir(scene_dir)) is not None + ] + if len(recursive_matches) == 1: + return recursive_matches[0] + if not recursive_matches: + expected = " or ".join(_GYM_CONFIG_PREFERENCE) + raise FileNotFoundError(f"{expected} not found under: {input_path}") + matches = ", ".join(path.as_posix() for path in recursive_matches) + raise ValueError(f"Multiple gym config files found: {matches}") + + +def _preferred_gym_config_in_dir(scene_dir: Path) -> Path | None: + for filename in _GYM_CONFIG_PREFERENCE: + path = scene_dir / filename + if path.is_file(): + return path + return None + + +def _infer_project_name(input_path: Path, scene_dir: Path) -> str: + for part in input_path.parts: + if _PROJECT_NAME_RE.match(part): + return part + for part in scene_dir.parts: + if _PROJECT_NAME_RE.match(part): + return part + return scene_dir.name + + +def _collect_scene_objects(scene_config: Mapping[str, Any]) -> list[_SceneObject]: + scene_objects = [] + for source_role in ("background", "rigid_object"): + for obj_config in scene_config.get(source_role, []) or []: + source_uid = str(obj_config.get("uid", "")).strip() + if not source_uid: + raise ValueError(f"Scene object without uid in {source_role}.") + scene_objects.append( + _SceneObject( + source_uid=source_uid, + source_role=source_role, + config=copy.deepcopy(dict(obj_config)), + ) + ) + + if not scene_objects: + raise ValueError("No background or rigid_object entries found in gym config.") + return scene_objects + + +def _infer_basket_task_roles(scene_objects: list[_SceneObject]) -> _BasketTaskRoles: + background_objects = [ + obj for obj in scene_objects if obj.source_role == "background" + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + if not 
background_objects: + raise ValueError("UR5 basket generation requires a table/background object.") + if len(rigid_objects) < 3: + raise ValueError( + "UR5 basket generation requires at least two target objects and one " + "basket-like container." + ) + + table = _pick_table(background_objects) + container = _pick_container(rigid_objects) + target_candidates = [ + obj for obj in rigid_objects if obj.source_uid != container.source_uid + ] + if len(target_candidates) < 2: + raise ValueError("Expected at least two non-container target objects.") + + left_target, right_target = _pick_left_right_targets(target_candidates) + target_noun = _target_noun(left_target, right_target) + container_noun = _display_noun(_base_name(container)) + return _BasketTaskRoles( + table_source_uid=table.source_uid, + container_source_uid=container.source_uid, + left_target_source_uid=left_target.source_uid, + right_target_source_uid=right_target.source_uid, + container_runtime_uid=_container_runtime_uid(container), + left_target_runtime_uid=f"left_{target_noun}", + right_target_runtime_uid=f"right_{target_noun}", + target_noun=target_noun, + left_target_noun=target_noun, + right_target_noun=target_noun, + container_noun=container_noun, + ) + + +def _pick_table(background_objects: list[_SceneObject]) -> _SceneObject: + for obj in background_objects: + text = _object_text(obj) + if "table" in text: + return obj + return background_objects[0] + + +def _pick_container(rigid_objects: list[_SceneObject]) -> _SceneObject: + candidates = [obj for obj in rigid_objects if _is_container_like(obj)] + if not candidates: + names = ", ".join(obj.source_uid for obj in rigid_objects) + raise ValueError(f"No basket-like container object found among: {names}") + + def score(obj: _SceneObject) -> tuple[int, float]: + text = _object_text(obj) + keyword_score = 0 if "basket" in text else 1 + pos = _vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])) + center_distance = abs(pos[0]) + abs(pos[1]) + return 
keyword_score, center_distance + + return sorted(candidates, key=score)[0] + + +def _pick_left_right_targets( + target_candidates: list[_SceneObject], +) -> tuple[_SceneObject, _SceneObject]: + if len(target_candidates) == 2: + picked = target_candidates + else: + grouped: dict[str, list[_SceneObject]] = {} + for obj in target_candidates: + grouped.setdefault(_base_name(obj), []).append(obj) + repeated_groups = [group for group in grouped.values() if len(group) >= 2] + if repeated_groups: + picked = sorted( + repeated_groups, + key=_target_group_sort_key, + )[0] + if len(picked) > 2: + picked = sorted( + picked, + key=lambda obj: abs(_side_axis_value(obj)), + reverse=True, + )[:2] + else: + picked = sorted( + target_candidates, + key=lambda obj: abs(_side_axis_value(obj)), + reverse=True, + )[:2] + left, right = sorted(picked, key=_side_axis_value) + return left, right + + +def _target_group_sort_key(group: list[_SceneObject]) -> tuple[float, int]: + side_values = [_side_axis_value(obj) for obj in group] + side_spread = max(side_values) - min(side_values) + return -side_spread, -len(group) + + +def _side_axis_value(obj: _SceneObject) -> float: + return _position_side_axis_value( + _vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])) + ) + + +def _position_side_axis_value(position: list[float]) -> float: + return float(position[_DUAL_UR5_SIDE_AXIS_INDEX]) + + +def _arm_side_for_position(position: list[float]) -> str: + return "left" if _position_side_axis_value(position) < 0.0 else "right" + + +def _vector3(value: Any) -> list[float]: + if not isinstance(value, (list, tuple)) or len(value) != 3: + raise ValueError(f"Expected a 3-vector, got {value!r}.") + return [float(item) for item in value] diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py new file mode 100644 index 00000000..b430ae3d --- /dev/null +++ 
b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py @@ -0,0 +1,338 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Callable, Mapping, Sequence +from typing import Any + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _BasketTaskRoles, + _RelativePlacementSpec, + _RelativePlacementStepSpec, +) + +__all__ = [ + "_make_extensions_config", + "_make_relative_extensions_config", + "_object_in_container_success", + "_validate_bundle", + "_validate_relative_bundle", + "_validate_success_uids", +] + + +def _make_extensions_config(roles: _BasketTaskRoles) -> dict[str, Any]: + return { + "agent_arm_slots": { + "left": { + "arm": "right_arm", + "eef": "right_eef", + }, + "right": { + "arm": "left_arm", + "eef": "left_eef", + }, + }, + "arm_aim_yaw_offset": { + "left": 3.141592653589793, + "right": 0.0, + }, + "gripper_open_state": [0.0], + "gripper_close_state": [0.04], + "ignore_terminations_during_agent": True, + "viewer_camera_uid": "cam_high", + "agent_success": { + "op": "all", + "terms": [ + _object_in_container_success( + roles.left_target_runtime_uid, + roles.container_runtime_uid, + ), + _object_in_container_success( + 
roles.right_target_runtime_uid, + roles.container_runtime_uid, + ), + ], + }, + } + + +def _object_in_container_success(object_uid: str, container_uid: str) -> dict[str, Any]: + return { + "type": "object_in_container", + "object": object_uid, + "container": container_uid, + "radius": 0.2, + "min_z_offset": -0.05, + "max_z_offset": 0.35, + } + + +def _make_relative_extensions_config( + spec: _RelativePlacementSpec, + *, + side_relation_xy_offsets: Callable[[str], tuple[float, float]], +) -> dict[str, Any]: + return { + "agent_arm_slots": { + "left": { + "arm": "right_arm", + "eef": "right_eef", + }, + "right": { + "arm": "left_arm", + "eef": "left_eef", + }, + }, + "arm_aim_yaw_offset": { + "left": 3.141592653589793, + "right": 0.0, + }, + "gripper_open_state": [0.0], + "gripper_close_state": [0.04], + "ignore_terminations_during_agent": True, + "viewer_camera_uid": "cam_high", + "agent_success": _make_relative_success_spec( + spec, + side_relation_xy_offsets=side_relation_xy_offsets, + ), + } + + +def _make_relative_success_spec( + spec: _RelativePlacementSpec, + *, + side_relation_xy_offsets: Callable[[str], tuple[float, float]], +) -> dict[str, Any]: + if len(spec.placements) == 1: + return _make_relative_placement_success_spec( + spec.placements[0], + side_relation_xy_offsets=side_relation_xy_offsets, + ) + return { + "op": "all", + "terms": [ + _make_relative_placement_success_spec( + placement, + side_relation_xy_offsets=side_relation_xy_offsets, + ) + for placement in spec.placements + ], + } + + +def _make_relative_placement_success_spec( + placement: _RelativePlacementStepSpec, + *, + side_relation_xy_offsets: Callable[[str], tuple[float, float]], +) -> dict[str, Any]: + if placement.relation == "inside": + return _object_in_container_success( + placement.moved_runtime_uid, + placement.reference_runtime_uid, + ) + if placement.relation == "on": + return { + "type": "object_on_object", + "object": placement.moved_runtime_uid, + "support": 
placement.reference_runtime_uid, + "xy_radius": 0.08, + "min_z_offset": 0.02, + "max_z_offset": 0.35, + } + + if placement.reference_is_initial_pose: + if placement.release_position is None: + raise ValueError( + "Self-relative success requires an absolute release position." + ) + return { + "op": "all", + "terms": [ + *_absolute_xy_success_terms( + placement.moved_runtime_uid, + placement.release_position, + ), + { + "type": "object_not_fallen", + "object": placement.moved_runtime_uid, + "max_tilt": 0.9, + }, + ], + } + + return { + "op": "all", + "terms": [ + *_relative_xy_success_terms( + placement, + side_relation_xy_offsets=side_relation_xy_offsets, + ), + { + "type": "object_not_fallen", + "object": placement.moved_runtime_uid, + "max_tilt": 0.9, + }, + ], + } + + +def _absolute_xy_success_terms( + object_uid: str, + position: Sequence[float], +) -> list[dict[str, Any]]: + return [ + { + "type": "object_axis_near", + "object": object_uid, + "axis": axis, + "target": float(position[index]), + "tolerance": 0.05, + } + for index, axis in enumerate(("x", "y")) + ] + + +def _relative_xy_success_terms( + placement: _RelativePlacementStepSpec, + *, + side_relation_xy_offsets: Callable[[str], tuple[float, float]], +) -> list[dict[str, Any]]: + x_offset, y_offset = side_relation_xy_offsets(placement.relation) + return [ + { + "type": "object_axis_offset_near", + "object": placement.moved_runtime_uid, + "reference": placement.reference_runtime_uid, + "axis": axis, + "offset": offset, + "tolerance": 0.05 if offset else 0.06, + } + for axis, offset in (("x", x_offset), ("y", y_offset)) + ] + + +def _validate_bundle(bundle: Mapping[str, Any], roles: _BasketTaskRoles) -> None: + gym_config = bundle["gym_config"] + if gym_config.get("id") != "AtomicActionsAgent-v3": + raise ValueError("Generated gym config must use AtomicActionsAgent-v3.") + if gym_config.get("robot", {}).get("uid") != "DualUR5": + raise ValueError("Generated UR5 basket config must use DualUR5.") + + 
rigid_uids = {obj["uid"] for obj in gym_config.get("rigid_object", [])} + background_uids = {obj["uid"] for obj in gym_config.get("background", [])} + scene_uids = rigid_uids | background_uids + required_rigid = { + roles.left_target_runtime_uid, + roles.right_target_runtime_uid, + } + if not required_rigid.issubset(rigid_uids): + raise ValueError( + f"Generated rigid objects missing: {sorted(required_rigid - rigid_uids)}" + ) + if roles.container_runtime_uid not in scene_uids: + raise ValueError( + f"Generated scene objects missing container: {roles.container_runtime_uid}" + ) + + success = gym_config["env"]["extensions"]["agent_success"] + for term in success.get("terms", []): + if ( + term.get("object") not in rigid_uids + or term.get("container") not in scene_uids + ): + raise ValueError(f"Invalid success term uid reference: {term}") + + +def _validate_relative_bundle( + bundle: Mapping[str, Any], + spec: _RelativePlacementSpec, +) -> None: + gym_config = bundle["gym_config"] + if gym_config.get("id") != "AtomicActionsAgent-v3": + raise ValueError("Generated gym config must use AtomicActionsAgent-v3.") + if gym_config.get("robot", {}).get("uid") != "DualUR5": + raise ValueError("Generated relative placement config must use DualUR5.") + + rigid_uid_list = [obj["uid"] for obj in gym_config.get("rigid_object", [])] + if len(rigid_uid_list) != len(set(rigid_uid_list)): + raise ValueError(f"Duplicate rigid object runtime uid(s): {rigid_uid_list}") + rigid_uids = set(rigid_uid_list) + background_uids = {obj["uid"] for obj in gym_config.get("background", [])} + scene_uids = rigid_uids | background_uids + moved_required = {placement.moved_runtime_uid for placement in spec.placements} + missing_moved = moved_required - rigid_uids + if missing_moved: + raise ValueError( + f"Generated relative config missing moved rigid object(s): {missing_moved}" + ) + reference_required = { + placement.reference_runtime_uid for placement in spec.placements + } + missing_reference = 
reference_required - scene_uids + if missing_reference: + raise ValueError( + f"Generated relative config missing reference object(s): {missing_reference}" + ) + + _validate_success_uids( + gym_config["env"]["extensions"]["agent_success"], + rigid_uids=rigid_uids, + scene_uids=scene_uids, + ) + registry = gym_config["env"]["events"]["register_info_to_env"]["params"]["registry"] + registered = {entry["entity_cfg"]["uid"] for entry in registry} + required = moved_required | reference_required + if not required.issubset(registered): + raise ValueError( + f"Relative config registry missing: {sorted(required - registered)}" + ) + + +def _validate_success_uids( + success: Mapping[str, Any], + *, + rigid_uids: set[str], + scene_uids: set[str], +) -> None: + if success.get("op") in {"all", "and", "any", "or"}: + for term in success.get("terms", []): + _validate_success_uids(term, rigid_uids=rigid_uids, scene_uids=scene_uids) + return + + success_type = str(success.get("type", success.get("func", ""))).lower() + if success_type == "object_in_container": + required_keys = ("object", "container") + elif success_type in {"object_on_object", "object_on", "on_object"}: + required_keys = ("object", "support") + elif success_type in { + "object_axis_offset_near", + "object_relative_axis_near", + }: + required_keys = ("object", "reference") + elif success_type in {"object_axis_near", "object_coordinate_near"}: + required_keys = ("object",) + elif success_type in {"object_not_fallen", "not_fallen"}: + required_keys = ("object",) + else: + raise ValueError(f"Unsupported generated success term: {success_type!r}.") + + for key in required_keys: + uid = success.get(key) + valid_uids = rigid_uids if key == "object" else scene_uids + if uid not in valid_uids: + raise ValueError(f"Invalid success uid reference {key}={uid!r}.") diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_lights.json 
b/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_lights.json new file mode 100644 index 00000000..f234fafe --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_lights.json @@ -0,0 +1,12 @@ +{ + "direct": [ + { + "uid": "main_light", + "light_type": "point", + "color": [1.0, 1.0, 1.0], + "intensity": 40.0, + "init_pos": [0.0, -0.4, 2.2], + "radius": 10.0 + } + ] +} diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_sensors.json b/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_sensors.json new file mode 100644 index 00000000..3da01498 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_sensors.json @@ -0,0 +1,49 @@ +[ + { + "sensor_type": "Camera", + "uid": "cam_high", + "width": 960, + "height": 540, + "intrinsics": [420, 420, 480, 270], + "extrinsics": { + "pos": [0.4, 0.0, 2.2], + "eye": [0.6, 0.0, 3.3], + "target": [0.0, 0.0, 0.75], + "up": [1.0, 0.0, 0.0] + } + }, + { + "sensor_type": "Camera", + "uid": "cam_wrist_left", + "width": 640, + "height": 480, + "intrinsics": [600, 600, 320, 240], + "extrinsics": { + "parent": "left_ee_link", + "pos": [0.0, 0.12, 0.08], + "quat": [ + -0.0012598701, + -0.029051816664441618998, + 0.9094039177564813, + 0.41489627504330695 + ] + } + }, + { + "sensor_type": "Camera", + "uid": "cam_wrist_right", + "width": 640, + "height": 480, + "intrinsics": [600, 600, 320, 240], + "extrinsics": { + "parent": "right_ee_link", + "pos": [0.0, 0.12, 0.08], + "quat": [ + -0.0012598701, + -0.029051816664441618998, + 0.9094039177564813, + 0.41489627504330695 + ] + } + } +] diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json b/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json new file mode 100644 index 00000000..b74bf850 --- /dev/null +++ 
b/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json @@ -0,0 +1,106 @@ +{ + "uid": "DualUR5", + "urdf_cfg": { + "fname": "dual_ur5_dh_pgi_basket", + "components": [ + { + "component_type": "left_arm", + "urdf_path": "UniversalRobots/UR5/UR5.urdf", + "transform": [ + [0.0, -1.0, 0.0, -0.3], + [1.0, 0.0, 0.0, -1.45], + [0.0, 0.0, 1.0, 0.4], + [0.0, 0.0, 0.0, 1.0] + ] + }, + { + "component_type": "left_hand", + "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf" + }, + { + "component_type": "right_arm", + "urdf_path": "UniversalRobots/UR5/UR5.urdf", + "transform": [ + [0.0, -1.0, 0.0, 0.3], + [1.0, 0.0, 0.0, -1.45], + [0.0, 0.0, 1.0, 0.4], + [0.0, 0.0, 0.0, 1.0] + ] + }, + { + "component_type": "right_hand", + "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf" + } + ] + }, + "init_pos": [2.0, 0.0, 0.0], + "init_rot": [0.0, 0.0, -90.0], + "init_qpos": [ + 0, + 0, + -1.57, + -1.57, + 1.57, + 1.57, + -1.57, + -1.57, + -1.57, + -1.57, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "drive_pros": { + "stiffness": { + "LEFT_JOINT[1-6]": 10000.0, + "RIGHT_JOINT[1-6]": 10000.0, + "LEFT_GRIPPER_FINGER[1-2]_JOINT_1": 100.0, + "RIGHT_GRIPPER_FINGER[1-2]_JOINT_1": 100.0 + }, + "damping": { + "LEFT_JOINT[1-6]": 1000.0, + "RIGHT_JOINT[1-6]": 1000.0, + "LEFT_GRIPPER_FINGER[1-2]_JOINT_1": 10.0, + "RIGHT_GRIPPER_FINGER[1-2]_JOINT_1": 10.0 + }, + "max_effort": { + "LEFT_JOINT[1-6]": 100000.0, + "RIGHT_JOINT[1-6]": 100000.0, + "LEFT_GRIPPER_FINGER[1-2]_JOINT_1": 1000.0, + "RIGHT_GRIPPER_FINGER[1-2]_JOINT_1": 1000.0 + } + }, + "control_parts": { + "left_arm": ["LEFT_JOINT[1-6]"], + "left_eef": ["LEFT_GRIPPER_FINGER[1-2]_JOINT_1"], + "right_arm": ["RIGHT_JOINT[1-6]"], + "right_eef": ["RIGHT_GRIPPER_FINGER[1-2]_JOINT_1"] + }, + "solver_cfg": { + "left_arm": { + "class_type": "PytorchSolver", + "end_link_name": "left_ee_link", + "root_link_name": "left_base_link", + "tcp": [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.16], + [0.0, 0.0, 0.0, 
1.0] + ] + }, + "right_arm": { + "class_type": "PytorchSolver", + "end_link_name": "right_ee_link", + "root_link_name": "right_base_link", + "tcp": [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.16], + [0.0, 0.0, 0.0, 1.0] + ] + } + } +} diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py deleted file mode 100644 index e0290c34..00000000 --- a/embodichain/gen_sim/action_agent_pipeline/generation/ur5_basket_config.py +++ /dev/null @@ -1,3665 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ---------------------------------------------------------------------------- - -from __future__ import annotations - -from collections.abc import Mapping, Sequence -from dataclasses import dataclass -from pathlib import Path -from typing import Any -import copy -import json -import math -import re -import struct - -from embodichain.gen_sim.action_agent_pipeline.generation.mesh_frame_normalization import ( - MeshFrameNormalizer, -) -from embodichain.gen_sim.action_agent_pipeline.generation.prompt_builders import ( - make_agent_config, - make_basket_atom_actions_prompt, - make_basket_basic_background, - make_basket_task_prompt, - make_relative_atom_actions_prompt, - make_relative_basic_background, - make_relative_task_prompt, -) - -__all__ = [ - "GeneratedUR5BasketConfigPaths", - "TargetReplacementSpec", - "generate_ur5_basket_config_from_project", -] - -_DIGIT_SUFFIX_RE = re.compile(r"_[0-9]+$") -_INVALID_UID_CHARS_RE = re.compile(r"[^0-9a-zA-Z_]+") -_PROJECT_NAME_RE = re.compile(r"^[0-9]+_gym_project$") -_GYM_CONFIG_FILENAMES = frozenset({"gym_config.json", "gym_config_merged.json"}) -_GYM_CONFIG_PREFERENCE = ("gym_config_merged.json", "gym_config.json") -_TARGET_REPLACEMENT_MANIFEST_FILENAME = ".embodichain_replacement_manifest.json" - -_CONTAINER_KEYWORDS = ( - "basket", - "container", - "bowl", - "box", - "bin", - "tray", - "crate", -) - -_RELATIVE_RELATIONS = { - "inside", - "on", - "left_of", - "right_of", - "front_of", - "behind", - "front_left_of", - "back_left_of", - "front_right_of", - "back_right_of", -} - -_SIDE_RELATIONS = _RELATIVE_RELATIONS - {"inside", "on"} - -_SELF_REFERENCE_VALUES = { - "self", - "initial_self", - "initial_position", - "initial_pose", - "origin", - "itself", - "自身", - "自己", - "原位", - "初始位置", -} - -_RELATION_ALIASES = { - "in": "inside", - "into": "inside", - "inside": "inside", - "放入": "inside", - "放进": "inside", - "里面": "inside", - "on": "on", - "onto": "on", - "on_top": "on", - "on_top_of": "on", - "above": "on", - "top": 
"on", - "上": "on", - "上方": "on", - "上面": "on", - "叠放": "on", - "left": "left_of", - "left_of": "left_of", - "to_the_left_of": "left_of", - "左": "left_of", - "左边": "left_of", - "front_left": "front_left_of", - "front_left_of": "front_left_of", - "left_front": "front_left_of", - "left_front_of": "front_left_of", - "to_the_front_left_of": "front_left_of", - "左前": "front_left_of", - "左前方": "front_left_of", - "左前面": "front_left_of", - "back_left": "back_left_of", - "back_left_of": "back_left_of", - "behind_left": "back_left_of", - "left_back": "back_left_of", - "left_behind": "back_left_of", - "left_back_of": "back_left_of", - "to_the_back_left_of": "back_left_of", - "左后": "back_left_of", - "左后方": "back_left_of", - "左后面": "back_left_of", - "右": "right_of", - "右边": "right_of", - "right": "right_of", - "right_of": "right_of", - "to_the_right_of": "right_of", - "front_right": "front_right_of", - "front_right_of": "front_right_of", - "right_front": "front_right_of", - "right_front_of": "front_right_of", - "to_the_front_right_of": "front_right_of", - "右前": "front_right_of", - "右前方": "front_right_of", - "右前面": "front_right_of", - "back_right": "back_right_of", - "back_right_of": "back_right_of", - "behind_right": "back_right_of", - "right_back": "back_right_of", - "right_behind": "back_right_of", - "right_back_of": "back_right_of", - "to_the_back_right_of": "back_right_of", - "右后": "back_right_of", - "右后方": "back_right_of", - "右后面": "back_right_of", - "front": "front_of", - "front_of": "front_of", - "in_front_of": "front_of", - "前": "front_of", - "前方": "front_of", - "前面": "front_of", - "back": "behind", - "behind": "behind", - "back_of": "behind", - "后": "behind", - "后方": "behind", - "后面": "behind", -} - -_SIDE_RELATION_DISTANCE = 0.16 -_SIDE_RELEASE_Z_OFFSET = 0.12 -_STAGING_Z_DELTA = 0.10 -_ON_RELEASE_Z_OFFSET = 0.2 -_DUAL_UR5_LEGACY_INIT_Z = 0.5 -_DUAL_UR5_HIGH_TABLETOP_THRESHOLD = 1.0 -_DUAL_UR5_HIGH_TABLETOP_INIT_Z = 0.8 -_DUAL_UR5_ARM_COMPONENT_Z = 0.4 
-_DUAL_UR5_TABLETOP_CLEARANCE = 0.25 -_DUAL_UR5_SIDE_AXIS_INDEX = 1 -_DUAL_UR5_ROTATED_INIT_X = 2.0 -_DUAL_UR5_ROTATED_INIT_YAW_DEGREES = -90.0 -_ROBOT_VIEW_LEFT_WORLD_Y_SIGN = -1.0 -_ROBOT_VIEW_FRONT_WORLD_X_SIGN = -1.0 -_BACKGROUND_MAX_CONVEX_HULL_NUM = 1 -_TARGET_MAX_CONVEX_HULL_NUM = 16 -_CONTAINER_MAX_CONVEX_HULL_NUM = 8 -_EXTRA_RIGID_MAX_CONVEX_HULL_NUM = 1 -_TABLETOP_OBJECT_CLEARANCE = 0.003 -_GLB_JSON_CHUNK_TYPE = 0x4E4F534A -_GLB_BINARY_CHUNK_TYPE = 0x004E4942 -_GLTF_COMPONENT_FORMATS = { - 5120: ("b", 1), - 5121: ("B", 1), - 5122: ("h", 2), - 5123: ("H", 2), - 5125: ("I", 4), - 5126: ("f", 4), -} -_GLTF_TYPE_COMPONENT_COUNTS = { - "SCALAR": 1, - "VEC2": 2, - "VEC3": 3, - "VEC4": 4, - "MAT4": 16, -} - -_BACKGROUND_ATTRS = { - "mass": 10.0, - "static_friction": 0.95, - "dynamic_friction": 0.9, - "restitution": 0.01, -} - -_RIGID_OBJECT_ATTRS = { - "mass": 0.01, - "contact_offset": 0.003, - "rest_offset": 0.001, - "restitution": 0.01, - "max_depenetration_velocity": 10.0, - "min_position_iters": 32, - "min_velocity_iters": 8, -} - - -@dataclass(frozen=True) -class GeneratedUR5BasketConfigPaths: - """Paths written by the UR5 basket config generator.""" - - output_dir: Path - gym_config: Path - agent_config: Path - task_prompt: Path - basic_background: Path - atom_actions: Path - summary: dict[str, Any] - - -@dataclass(frozen=True) -class TargetReplacementSpec: - """Prompt-to-geometry replacement for one source target object.""" - - source_uid: str - prompt: str - output_dir_name: str - - -@dataclass(frozen=True) -class _SceneObject: - source_uid: str - source_role: str - config: dict[str, Any] - - -@dataclass(frozen=True) -class _BasketTaskRoles: - table_source_uid: str - container_source_uid: str - left_target_source_uid: str - right_target_source_uid: str - container_runtime_uid: str - left_target_runtime_uid: str - right_target_runtime_uid: str - target_noun: str - left_target_noun: str - right_target_noun: str - container_noun: str - - 
-@dataclass(frozen=True) -class _ResolvedTargetReplacement: - source_uid: str - prompt: str - output_dir_name: str - mesh_path: Path - runtime_noun: str - reused: bool = False - - -@dataclass(frozen=True) -class _RelativePlacementStepSpec: - moved_source_uid: str - reference_source_uid: str - moved_runtime_uid: str - reference_runtime_uid: str - relation: str - active_side: str - release_offset: list[float] - high_offset: list[float] - reference_is_initial_pose: bool = False - release_position: list[float] | None = None - high_position: list[float] | None = None - - -@dataclass(frozen=True) -class _RelativePlacementSpec: - table_source_uid: str - moved_source_uid: str - reference_source_uid: str - moved_runtime_uid: str - reference_runtime_uid: str - relation: str - active_side: str - task_description: str - task_prompt_summary: str - basic_background_notes: str - action_sketch: list[str] - release_offset: list[float] - high_offset: list[float] - placements: tuple[_RelativePlacementStepSpec, ...] - reference_is_initial_pose: bool = False - release_position: list[float] | None = None - high_position: list[float] | None = None - - -def generate_ur5_basket_config_from_project( - gym_project: str | Path, - output_dir: str | Path, - *, - task_name: str = "UR5BreadBasket", - task_description: str | None = None, - use_llm_roles: bool = False, - llm_model: str | None = None, - target_body_scale: float | list[float] | tuple[float, float, float] = 0.7, - target_replacements: Sequence[TargetReplacementSpec] | None = None, - sync_replacement_names: bool = False, - reuse_target_replacements: bool = True, - prewarm_coacd_cache: bool = True, - overwrite: bool = False, - max_episodes: int = 1, - max_episode_steps: int = 1000, -) -> GeneratedUR5BasketConfigPaths: - """Generate Dual-UR5 basket placement configs from an exported gym project. 
- - This first-stage generator intentionally keeps the UR5BreadBasket task - structure fixed: the left arm grasps the left target object, the right arm - grasps the right target object, and both objects are placed into one - basket-like container. - - Args: - gym_project: Project root, formatted scene folder, ``gym_config.json``, - or ``gym_config_merged.json``. - output_dir: Destination config directory. - task_name: Name passed to ``run_agent``. - task_description: Optional natural-language relative-placement task. - When provided, the generator asks the shared LLM for a constrained - config-level task spec and generates prompts from that spec. - use_llm_roles: If true, use an LLM only to refine object role mapping. - llm_model: Optional model override for role refinement. - target_body_scale: Uniform or xyz scale applied to generated target - objects. Basket-like containers keep their source ``body_scale``. - target_replacements: Optional prompt-generated GLB replacements for - selected default basket target objects. Each replacement writes to - ``/mesh_assets/`` and only affects the - generated config, not the original source mesh file. - sync_replacement_names: If true, update runtime target UIDs and prompts - from the replacement prompts. If false, only mesh paths are replaced. - reuse_target_replacements: If true, reuse an existing replacement GLB - at the expected output path when it matches the requested prompt. - prewarm_coacd_cache: If true, precompute environment-side CoACD cache - files referenced by the generated gym config before writing it. - overwrite: If false, fail when generated files already exist. - max_episodes: Value written to ``fast_gym_config.json``. - max_episode_steps: Value written to ``fast_gym_config.json``. - - Returns: - Paths of generated config files. 
- """ - - output_dir_path = Path(output_dir).expanduser().resolve() - _raise_if_generated_files_exist(output_dir_path, overwrite) - - input_path = Path(gym_project).expanduser().resolve() - gym_config_path = _resolve_gym_config_path(input_path) - scene_dir = gym_config_path.parent - source_config = _read_json(gym_config_path) - project_name = _infer_project_name(input_path, scene_dir) - replacement_specs = _normalize_target_replacements(target_replacements) - mesh_normalizer = MeshFrameNormalizer( - output_dir=output_dir_path / "mesh_assets" / "normalized" - ) - - scene_objects = _collect_scene_objects(source_config) - if task_description: - if replacement_specs: - raise ValueError( - "target_replacements are only supported by the default basket " - "template. Do not combine them with task_description." - ) - spec = _build_relative_placement_spec_with_llm( - scene_objects=scene_objects, - project_name=project_name, - task_description=task_description, - model=llm_model, - ) - bundle = _build_relative_placement_bundle( - scene_dir=scene_dir, - source_config=source_config, - spec=spec, - project_name=project_name, - task_name=task_name, - target_body_scale=target_body_scale, - max_episodes=max_episodes, - max_episode_steps=max_episode_steps, - mesh_normalizer=mesh_normalizer, - ) - _validate_relative_bundle(bundle, spec) - _attach_mesh_normalization_summary(bundle, mesh_normalizer) - if prewarm_coacd_cache: - _attach_coacd_cache_summary(bundle) - return _write_config_bundle( - output_dir=output_dir_path, - bundle=bundle, - overwrite=overwrite, - ) - - roles = _infer_basket_task_roles(scene_objects) - if use_llm_roles: - roles = _refine_roles_with_llm( - roles=roles, - scene_objects=scene_objects, - project_name=project_name, - model=llm_model, - ) - - _validate_target_replacement_sources(roles, replacement_specs) - resolved_replacements = _run_target_replacements( - scene_dir=scene_dir, - replacement_specs=replacement_specs, - 
reuse_target_replacements=reuse_target_replacements, - ) - if sync_replacement_names: - roles = _apply_replacement_names( - roles, - resolved_replacements, - ) - - bundle = _build_ur5_basket_bundle( - scene_dir=scene_dir, - source_config=source_config, - roles=roles, - project_name=project_name, - task_name=task_name, - target_body_scale=target_body_scale, - target_replacements=resolved_replacements, - max_episodes=max_episodes, - max_episode_steps=max_episode_steps, - mesh_normalizer=mesh_normalizer, - ) - _validate_bundle(bundle, roles) - _attach_mesh_normalization_summary(bundle, mesh_normalizer) - if prewarm_coacd_cache: - _attach_coacd_cache_summary(bundle) - return _write_config_bundle( - output_dir=output_dir_path, - bundle=bundle, - overwrite=overwrite, - ) - - -def _resolve_gym_config_path(input_path: Path) -> Path: - if input_path.is_file(): - if input_path.name not in _GYM_CONFIG_FILENAMES: - expected = ", ".join(sorted(_GYM_CONFIG_FILENAMES)) - raise ValueError(f"Expected one of {expected}, got: {input_path}") - return input_path - - direct = _preferred_gym_config_in_dir(input_path) - if direct is not None: - return direct - - formatted_scene_dirs = sorted( - { - path.parent - for filename in _GYM_CONFIG_FILENAMES - for path in input_path.glob(f"formatted_tabletop_scene/*/{filename}") - } - ) - formatted_matches = [ - path - for scene_dir in formatted_scene_dirs - if (path := _preferred_gym_config_in_dir(scene_dir)) is not None - ] - if len(formatted_matches) == 1: - return formatted_matches[0] - if len(formatted_matches) > 1: - matches = ", ".join(path.as_posix() for path in formatted_matches) - raise ValueError(f"Multiple formatted gym config files found: {matches}") - - recursive_scene_dirs = sorted( - { - path.parent - for filename in _GYM_CONFIG_FILENAMES - for path in input_path.rglob(filename) - } - ) - recursive_matches = [ - path - for scene_dir in recursive_scene_dirs - if (path := _preferred_gym_config_in_dir(scene_dir)) is not None - ] - if 
len(recursive_matches) == 1: - return recursive_matches[0] - if not recursive_matches: - expected = " or ".join(_GYM_CONFIG_PREFERENCE) - raise FileNotFoundError(f"{expected} not found under: {input_path}") - matches = ", ".join(path.as_posix() for path in recursive_matches) - raise ValueError(f"Multiple gym config files found: {matches}") - - -def _preferred_gym_config_in_dir(scene_dir: Path) -> Path | None: - for filename in _GYM_CONFIG_PREFERENCE: - path = scene_dir / filename - if path.is_file(): - return path - return None - - -def _infer_project_name(input_path: Path, scene_dir: Path) -> str: - for part in input_path.parts: - if _PROJECT_NAME_RE.match(part): - return part - for part in scene_dir.parts: - if _PROJECT_NAME_RE.match(part): - return part - return scene_dir.name - - -def _collect_scene_objects(scene_config: Mapping[str, Any]) -> list[_SceneObject]: - scene_objects = [] - for source_role in ("background", "rigid_object"): - for obj_config in scene_config.get(source_role, []) or []: - source_uid = str(obj_config.get("uid", "")).strip() - if not source_uid: - raise ValueError(f"Scene object without uid in {source_role}.") - scene_objects.append( - _SceneObject( - source_uid=source_uid, - source_role=source_role, - config=copy.deepcopy(dict(obj_config)), - ) - ) - - if not scene_objects: - raise ValueError("No background or rigid_object entries found in gym config.") - return scene_objects - - -def _infer_basket_task_roles(scene_objects: list[_SceneObject]) -> _BasketTaskRoles: - background_objects = [ - obj for obj in scene_objects if obj.source_role == "background" - ] - rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] - if not background_objects: - raise ValueError("UR5 basket generation requires a table/background object.") - if len(rigid_objects) < 3: - raise ValueError( - "UR5 basket generation requires at least two target objects and one " - "basket-like container." 
- ) - - table = _pick_table(background_objects) - container = _pick_container(rigid_objects) - target_candidates = [ - obj for obj in rigid_objects if obj.source_uid != container.source_uid - ] - if len(target_candidates) < 2: - raise ValueError("Expected at least two non-container target objects.") - - left_target, right_target = _pick_left_right_targets(target_candidates) - target_noun = _target_noun(left_target, right_target) - container_noun = _display_noun(_base_name(container)) - return _BasketTaskRoles( - table_source_uid=table.source_uid, - container_source_uid=container.source_uid, - left_target_source_uid=left_target.source_uid, - right_target_source_uid=right_target.source_uid, - container_runtime_uid=_container_runtime_uid(container), - left_target_runtime_uid=f"left_{target_noun}", - right_target_runtime_uid=f"right_{target_noun}", - target_noun=target_noun, - left_target_noun=target_noun, - right_target_noun=target_noun, - container_noun=container_noun, - ) - - -def _pick_table(background_objects: list[_SceneObject]) -> _SceneObject: - for obj in background_objects: - text = _object_text(obj) - if "table" in text: - return obj - return background_objects[0] - - -def _pick_container(rigid_objects: list[_SceneObject]) -> _SceneObject: - candidates = [ - obj - for obj in rigid_objects - if any(keyword in _object_text(obj) for keyword in _CONTAINER_KEYWORDS) - ] - if not candidates: - names = ", ".join(obj.source_uid for obj in rigid_objects) - raise ValueError(f"No basket-like container object found among: {names}") - - def score(obj: _SceneObject) -> tuple[int, float]: - text = _object_text(obj) - keyword_score = 0 if "basket" in text else 1 - pos = _vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])) - center_distance = abs(pos[0]) + abs(pos[1]) - return keyword_score, center_distance - - return sorted(candidates, key=score)[0] - - -def _pick_left_right_targets( - target_candidates: list[_SceneObject], -) -> tuple[_SceneObject, _SceneObject]: - if 
len(target_candidates) == 2: - picked = target_candidates - else: - grouped: dict[str, list[_SceneObject]] = {} - for obj in target_candidates: - grouped.setdefault(_base_name(obj), []).append(obj) - repeated_groups = [group for group in grouped.values() if len(group) >= 2] - if repeated_groups: - picked = sorted( - repeated_groups, - key=_target_group_sort_key, - )[0] - if len(picked) > 2: - picked = sorted( - picked, - key=lambda obj: abs(_side_axis_value(obj)), - reverse=True, - )[:2] - else: - picked = sorted( - target_candidates, - key=lambda obj: abs(_side_axis_value(obj)), - reverse=True, - )[:2] - left, right = sorted(picked, key=_side_axis_value) - return left, right - - -def _target_group_sort_key(group: list[_SceneObject]) -> tuple[float, int]: - side_values = [_side_axis_value(obj) for obj in group] - side_spread = max(side_values) - min(side_values) - return -side_spread, -len(group) - - -def _side_axis_value(obj: _SceneObject) -> float: - return _position_side_axis_value( - _vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])) - ) - - -def _position_side_axis_value(position: list[float]) -> float: - return float(position[_DUAL_UR5_SIDE_AXIS_INDEX]) - - -def _arm_side_for_position(position: list[float]) -> str: - return "left" if _position_side_axis_value(position) < 0.0 else "right" - - -def _target_noun(left_target: _SceneObject, right_target: _SceneObject) -> str: - left_base = _base_name(left_target) - right_base = _base_name(right_target) - if left_base == right_base: - return _target_runtime_suffix(left_base) - return "target_object" - - -def _object_text(obj: _SceneObject) -> str: - shape = obj.config.get("shape", {}) or {} - return f"{obj.source_uid} {shape.get('fpath', '')}".lower() - - -def _base_name(obj: _SceneObject) -> str: - base = _DIGIT_SUFFIX_RE.sub("", obj.source_uid) - if base == obj.source_uid: - fpath = str(obj.config.get("shape", {}).get("fpath", "")) - path = Path(fpath) - if len(path.parts) >= 2: - base = path.parts[-2] - 
return _normalize_runtime_uid(base) - - -def _target_runtime_suffix(base: str) -> str: - if base == "bread": - return "bread_roll" - return base - - -def _container_runtime_uid(container: _SceneObject) -> str: - base = _base_name(container) - if "basket" in base: - return "wicker_basket" - return f"target_{base}" - - -def _display_noun(uid: str) -> str: - return uid.replace("_", " ") - - -def _plural(noun: str) -> str: - if noun.endswith("s"): - return noun - if noun.endswith(("ch", "sh", "x")): - return f"{noun}es" - return f"{noun}s" - - -def _left_target_text(roles: _BasketTaskRoles) -> str: - return _display_noun(roles.left_target_noun) - - -def _right_target_text(roles: _BasketTaskRoles) -> str: - return _display_noun(roles.right_target_noun) - - -def _target_pair_text(roles: _BasketTaskRoles) -> str: - left_text = _left_target_text(roles) - right_text = _right_target_text(roles) - if left_text == right_text: - return f"two {left_text} objects" - return f"the left {left_text} and right {right_text}" - - -def _target_plural_text(roles: _BasketTaskRoles) -> str: - left_text = _left_target_text(roles) - right_text = _right_target_text(roles) - if left_text == right_text: - return _plural(left_text) - return "target objects" - - -def _generic_target_text(roles: _BasketTaskRoles) -> str: - left_text = _left_target_text(roles) - right_text = _right_target_text(roles) - if left_text == right_text: - return left_text - return "target object" - - -def _target_task_description_text(roles: _BasketTaskRoles) -> str: - left_text = _left_target_text(roles) - right_text = _right_target_text(roles) - if left_text == right_text: - return _plural(left_text) - return f"{left_text}-and-{right_text}" - - -def _normalize_runtime_uid(value: str) -> str: - uid = _INVALID_UID_CHARS_RE.sub("_", value.strip()).strip("_").lower() - if not uid: - raise ValueError(f"Invalid runtime uid: {value!r}") - return uid - - -def _normalize_target_replacements( - target_replacements: 
Sequence[TargetReplacementSpec] | None, -) -> tuple[TargetReplacementSpec, ...]: - if not target_replacements: - return () - - normalized = [] - seen_source_uids = set() - seen_output_dirs = set() - for replacement in target_replacements: - if not isinstance(replacement, TargetReplacementSpec): - raise TypeError( - "target_replacements must contain TargetReplacementSpec values." - ) - source_uid = str(replacement.source_uid).strip() - prompt = str(replacement.prompt).strip() - output_dir_name = str(replacement.output_dir_name).strip() - if not source_uid: - raise ValueError("target replacement source_uid must be non-empty.") - if not prompt: - raise ValueError("target replacement prompt must be non-empty.") - if not output_dir_name: - raise ValueError("target replacement output_dir_name must be non-empty.") - output_dir_path = Path(output_dir_name) - if ( - output_dir_path.is_absolute() - or len(output_dir_path.parts) != 1 - or output_dir_name in {".", ".."} - ): - raise ValueError( - "target replacement output_dir_name must be a single relative " - f"directory name, got: {output_dir_name!r}" - ) - if source_uid in seen_source_uids: - raise ValueError(f"Duplicate target replacement source uid: {source_uid}") - if output_dir_name in seen_output_dirs: - raise ValueError( - f"Duplicate target replacement output dir: {output_dir_name}" - ) - seen_source_uids.add(source_uid) - seen_output_dirs.add(output_dir_name) - normalized.append( - TargetReplacementSpec( - source_uid=source_uid, - prompt=prompt, - output_dir_name=output_dir_name, - ) - ) - return tuple(normalized) - - -def _validate_target_replacement_sources( - roles: _BasketTaskRoles, - replacement_specs: Sequence[TargetReplacementSpec], -) -> None: - if not replacement_specs: - return - - target_source_uids = { - roles.left_target_source_uid, - roles.right_target_source_uid, - } - unknown = [ - replacement.source_uid - for replacement in replacement_specs - if replacement.source_uid not in target_source_uids - ] 
- if unknown: - raise ValueError( - "target_replacements must reference the selected basket target " - f"source uid(s) {sorted(target_source_uids)}, got: {unknown}" - ) - - -def _run_target_replacements( - *, - scene_dir: Path, - replacement_specs: Sequence[TargetReplacementSpec], - reuse_target_replacements: bool, -) -> tuple[_ResolvedTargetReplacement, ...]: - resolved = [] - for replacement in replacement_specs: - runtime_noun = _replacement_runtime_noun(replacement.prompt) - output_root = scene_dir / "mesh_assets" / replacement.output_dir_name - output_name = f"{runtime_noun}.glb" - mesh_path = None - reused = False - if reuse_target_replacements: - mesh_path = _resolve_reusable_target_replacement_mesh_path( - output_root=output_root, - prompt=replacement.prompt, - output_name=output_name, - ) - reused = mesh_path is not None - if mesh_path is None: - result = _run_prompt2geometry_replacement( - prompt=replacement.prompt, - output_root=output_root, - output_name=output_name, - ) - mesh_path = _resolve_prompt2geometry_mesh_path(result, output_root) - _write_target_replacement_manifest( - output_root=output_root, - prompt=replacement.prompt, - output_name=output_name, - mesh_path=mesh_path, - ) - elif reused: - _write_target_replacement_manifest( - output_root=output_root, - prompt=replacement.prompt, - output_name=output_name, - mesh_path=mesh_path, - ) - resolved.append( - _ResolvedTargetReplacement( - source_uid=replacement.source_uid, - prompt=replacement.prompt, - output_dir_name=replacement.output_dir_name, - mesh_path=mesh_path, - runtime_noun=runtime_noun, - reused=reused, - ) - ) - return tuple(resolved) - - -def _resolve_reusable_target_replacement_mesh_path( - *, - output_root: Path, - prompt: str, - output_name: str, -) -> Path | None: - expected_mesh_path = (output_root / output_name).expanduser().resolve() - if not expected_mesh_path.is_file(): - return None - - manifest_path = _target_replacement_manifest_path(output_root) - if not 
manifest_path.is_file(): - return expected_mesh_path - - try: - manifest = _read_json(manifest_path) - except (OSError, json.JSONDecodeError): - return None - - if manifest.get("prompt") != prompt or manifest.get("output_name") != output_name: - return None - - manifest_mesh_path = Path( - str(manifest.get("mesh_path", expected_mesh_path)) - ).expanduser() - if not manifest_mesh_path.is_absolute(): - manifest_mesh_path = (output_root / manifest_mesh_path).resolve() - else: - manifest_mesh_path = manifest_mesh_path.resolve() - if manifest_mesh_path.is_file(): - return manifest_mesh_path - return expected_mesh_path - - -def _write_target_replacement_manifest( - *, - output_root: Path, - prompt: str, - output_name: str, - mesh_path: Path, -) -> None: - _write_json( - _target_replacement_manifest_path(output_root), - { - "prompt": prompt, - "output_name": output_name, - "mesh_path": mesh_path.expanduser().resolve().as_posix(), - }, - ) - - -def _target_replacement_manifest_path(output_root: Path) -> Path: - return output_root / _TARGET_REPLACEMENT_MANIFEST_FILENAME - - -def _run_prompt2geometry_replacement( - *, - prompt: str, - output_root: Path, - output_name: str, -) -> dict[str, Any]: - from embodichain.gen_sim.action_agent_pipeline.gym_project_api.prompt2geometry import ( - Prompt2GeometryRequest, - load_prompt2geometry_config, - run_prompt2geometry, - ) - - cfg = load_prompt2geometry_config() - return run_prompt2geometry( - Prompt2GeometryRequest( - prompt=prompt, - output_root=output_root, - output_name=output_name, - zimage_base_url=cfg.zimage_base_url, - sam3_base_url=cfg.sam3_base_url, - sam3d_base_url=cfg.sam3d_base_url, - llm_api_key=cfg.llm_api_key, - llm_model=cfg.llm_model, - llm_base_url=cfg.llm_base_url, - llm_timeout_s=cfg.llm_timeout_s, - ) - ) - - -def _resolve_prompt2geometry_mesh_path( - result: Mapping[str, Any], - output_root: Path, -) -> Path: - raw_path = result.get("scaled_mesh_path") or result.get("mesh_path") - if not raw_path: - raise 
ValueError("prompt2geometry result did not include a GLB mesh path.") - - mesh_path = Path(str(raw_path)).expanduser() - if not mesh_path.is_absolute(): - mesh_path = (output_root / mesh_path).resolve() - else: - mesh_path = mesh_path.resolve() - - if not mesh_path.is_file(): - raise FileNotFoundError(f"Generated replacement GLB not found: {mesh_path}") - return mesh_path - - -def _replacement_runtime_noun(prompt: str) -> str: - tokens = re.findall(r"[a-z0-9]+", prompt.lower()) - while tokens and tokens[0] in {"a", "an", "the"}: - tokens.pop(0) - stem = "_".join(tokens) - if not stem: - stem = "replacement_object" - return _normalize_runtime_uid(stem) - - -def _apply_replacement_names( - roles: _BasketTaskRoles, - resolved_replacements: Sequence[_ResolvedTargetReplacement], -) -> _BasketTaskRoles: - replacement_by_uid = { - replacement.source_uid: replacement for replacement in resolved_replacements - } - left_replacement = replacement_by_uid.get(roles.left_target_source_uid) - right_replacement = replacement_by_uid.get(roles.right_target_source_uid) - left_target_noun = ( - left_replacement.runtime_noun - if left_replacement is not None - else roles.left_target_noun - ) - right_target_noun = ( - right_replacement.runtime_noun - if right_replacement is not None - else roles.right_target_noun - ) - target_noun = ( - left_target_noun if left_target_noun == right_target_noun else "target_object" - ) - return _BasketTaskRoles( - table_source_uid=roles.table_source_uid, - container_source_uid=roles.container_source_uid, - left_target_source_uid=roles.left_target_source_uid, - right_target_source_uid=roles.right_target_source_uid, - container_runtime_uid=roles.container_runtime_uid, - left_target_runtime_uid=f"left_{left_target_noun}", - right_target_runtime_uid=f"right_{right_target_noun}", - target_noun=target_noun, - left_target_noun=left_target_noun, - right_target_noun=right_target_noun, - container_noun=roles.container_noun, - ) - - -def _refine_roles_with_llm( - 
*, - roles: _BasketTaskRoles, - scene_objects: list[_SceneObject], - project_name: str, - model: str | None, -) -> _BasketTaskRoles: - response = _call_role_llm( - project_name=project_name, - scene_summary=[ - { - "source_uid": obj.source_uid, - "role": obj.source_role, - "mesh": obj.config.get("shape", {}).get("fpath"), - "init_pos": obj.config.get("init_pos"), - } - for obj in scene_objects - ], - default_roles={ - "container_object": roles.container_source_uid, - "left_target_object": roles.left_target_source_uid, - "right_target_object": roles.right_target_source_uid, - "target_noun": roles.target_noun, - "container_runtime_uid": roles.container_runtime_uid, - }, - model=model, - ) - source_uids = {obj.source_uid for obj in scene_objects} - left_target = str(response.get("left_target_object", roles.left_target_source_uid)) - right_target = str( - response.get("right_target_object", roles.right_target_source_uid) - ) - container = str(response.get("container_object", roles.container_source_uid)) - for uid in (left_target, right_target, container): - if uid not in source_uids: - raise ValueError(f"LLM returned unknown source uid: {uid!r}") - if len({left_target, right_target, container}) != 3: - raise ValueError("LLM role mapping must use three distinct source objects.") - - target_noun = _normalize_runtime_uid( - str(response.get("target_noun", roles.target_noun)) - ) - container_runtime_uid = _normalize_runtime_uid( - str(response.get("container_runtime_uid", roles.container_runtime_uid)) - ) - return _BasketTaskRoles( - table_source_uid=roles.table_source_uid, - container_source_uid=container, - left_target_source_uid=left_target, - right_target_source_uid=right_target, - container_runtime_uid=container_runtime_uid, - left_target_runtime_uid=f"left_{target_noun}", - right_target_runtime_uid=f"right_{target_noun}", - target_noun=target_noun, - left_target_noun=target_noun, - right_target_noun=target_noun, - container_noun=_display_noun(container_runtime_uid), 
- ) - - -def _call_role_llm( - *, - project_name: str, - scene_summary: list[dict[str, Any]], - default_roles: dict[str, Any], - model: str | None, -) -> dict[str, Any]: - from langchain_core.messages import HumanMessage, SystemMessage - - from embodichain.gen_sim.action_agent_pipeline.utils.llm_json import ( - extract_json_object, - ) - from embodichain.gen_sim.action_agent_pipeline.utils.mllm import ( - create_chat_openai, - ) - - prompt = ( - "Identify roles for a fixed Dual-UR5 basket-placement simulation task. " - "Return only one JSON object with keys: container_object, " - "left_target_object, right_target_object, target_noun, " - "container_runtime_uid. Use only source_uid values from the scene. The " - "rotated robot-view left target starts on the negative-y side, and the " - "rotated robot-view right target starts on the positive-y side.\n\n" - f"Project: {project_name}\n" - f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}\n" - f"Default roles:\n{json.dumps(default_roles, ensure_ascii=False, indent=2)}" - ) - llm = create_chat_openai( - temperature=0.0, - model=model, - usage_stage="config_generation.role_refinement", - ) - response = llm.invoke( - [ - SystemMessage( - content=( - "You produce strict JSON role mappings for simulation config " - "generation. Do not include markdown." 
- ) - ), - HumanMessage(content=prompt), - ] - ) - content = getattr(response, "content", response) - return extract_json_object(content) - - -def _build_relative_placement_spec_with_llm( - *, - scene_objects: list[_SceneObject], - project_name: str, - task_description: str, - model: str | None, -) -> _RelativePlacementSpec: - background_objects = [ - obj for obj in scene_objects if obj.source_role == "background" - ] - rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] - if not background_objects: - raise ValueError("Relative placement generation requires a background table.") - if not rigid_objects: - raise ValueError( - "Relative placement generation requires a movable rigid object." - ) - - table = _pick_table(background_objects) - response = _call_relative_task_llm( - project_name=project_name, - task_description=task_description, - scene_summary=[ - { - "source_uid": obj.source_uid, - "role": obj.source_role, - "object_type": _base_name(obj), - "is_container_like": _is_container_like(obj), - "mesh": obj.config.get("shape", {}).get("fpath"), - "init_pos": obj.config.get("init_pos"), - } - for obj in scene_objects - ], - model=model, - ) - return _apply_relative_task_response( - response=response, - table_source_uid=table.source_uid, - scene_objects=scene_objects, - rigid_objects=rigid_objects, - task_description=task_description, - ) - - -def _call_relative_task_llm( - *, - project_name: str, - task_description: str, - scene_summary: list[dict[str, Any]], - model: str | None, -) -> dict[str, Any]: - from langchain_core.messages import HumanMessage, SystemMessage - - from embodichain.gen_sim.action_agent_pipeline.utils.llm_json import ( - extract_json_object, - ) - from embodichain.gen_sim.action_agent_pipeline.utils.mllm import ( - create_chat_openai, - ) - - prompt = ( - "Parse a simple Dual-UR5 tabletop relative-placement task and produce " - "a constrained config-level JSON spec. 
This JSON is used to generate " - "task_prompt.txt, basic_background.txt, atom_actions.txt, and " - "agent_success; a second LLM will later read those prompts to generate " - "the executable graph JSON.\n\n" - "Return exactly one JSON object with this schema:\n" - "{\n" - ' "placements": [\n' - " {\n" - ' "moved_object": "",\n' - ' "reference_object": "",\n' - ' "goal_relation": ' - '"inside|on|left_of|right_of|front_of|behind|front_left_of|back_left_of|front_right_of|back_right_of",\n' - ' "arm": "left|right|auto"\n' - " }\n" - " ],\n" - ' "task_prompt_summary": "",\n' - ' "basic_background_notes": "",\n' - ' "action_sketch": [\n' - ' "grasp moved_object",\n' - ' "move above the relation target pose",\n' - ' "place at the release pose with PlaceAction"\n' - " ]\n" - "}\n\n" - "Rules:\n" - "- Use only source_uid values from the scene objects listed below.\n" - "- Return one placement for a single-arm task and exactly two placements " - "for a dual-arm task.\n" - "- Treat the task as dual-arm when it explicitly says 双臂, 两臂, both " - "arms, two arms, or when it describes separate work for the left arm and " - "the right arm even if it does not literally say 双臂.\n" - "- Do not invent a second placement when the task only moves one object.\n" - "- moved_object is the object to grasp and move.\n" - "- reference_object is the object used as the spatial reference, " - "container, or support.\n" - "- reference_object may be a rigid_object or a background object such as " - "a pad, tray, basket, or container.\n" - "- For single-object directional tasks such as moving the only object " - "forward, left, front-left, or back-right from its initial position, set " - "reference_object to the same source_uid as moved_object (or 'self'). 
" - "This means the generator will use the object's initial position as a " - "fixed anchor, not the object's moving runtime pose.\n" - "- Within each placement, moved_object and reference_object must be " - "different unless the task is an initial-position directional move.\n" - "- For dual-arm tasks, the placements must use two different moved_object " - "values and one left arm plus one right arm. Use arm='auto' only when " - "the user did not specify which arm handles that placement.\n" - "- arm selects the single UR5 arm that should manipulate moved_object. " - "Use arm='left' for explicit left-arm instructions such as 左臂, 左机械臂, " - "left arm, or left UR5; use arm='right' for explicit right-arm " - "instructions such as 右臂, 右机械臂, right arm, or right UR5; use " - "arm='auto' when the task does not specify an arm.\n" - "- For Chinese/English left/right/front/back, use the relation enums " - "from the rotated robot-view perspective. front_of means negative " - "world-x; behind means positive world-x; left_of means negative " - "world-y; right_of means positive world-y. Diagonal relations combine " - "both axes: front_left_of, back_left_of, front_right_of, back_right_of.\n" - "- If the task says to release an object above a basket/container so it " - "falls into it, use goal_relation='inside'.\n" - "- If the task says to stack/place one object on another non-container " - "support, use goal_relation='on'.\n" - "- Do not return numeric offsets, object poses, scales, success JSON, " - "robot config, or full prompt files. 
The generator computes those " - "deterministically.\n\n" - f"Project: {project_name}\n" - f"Task description:\n{task_description}\n" - f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}" - ) - llm = create_chat_openai( - temperature=0.0, - model=model, - usage_stage="config_generation.relative_task", - ) - response = llm.invoke( - [ - SystemMessage( - content=( - "You produce strict JSON specs for simulation config " - "generation. Do not include markdown." - ) - ), - HumanMessage(content=prompt), - ] - ) - content = getattr(response, "content", response) - return extract_json_object(content) - - -def _apply_relative_task_response( - *, - response: Mapping[str, Any], - table_source_uid: str, - scene_objects: list[_SceneObject], - rigid_objects: list[_SceneObject], - task_description: str, -) -> _RelativePlacementSpec: - by_uid = {obj.source_uid: obj for obj in scene_objects} - runtime_uids = _relative_scene_runtime_uid_mapping( - scene_objects, - table_source_uid=table_source_uid, - ) - - placement_entries = _relative_placement_entries(response) - if len(placement_entries) > 2: - raise ValueError("Relative placement supports at most two arm placements.") - - forced_arm_sides = _relative_forced_arm_sides( - placement_entries, - by_uid=by_uid, - rigid_objects=rigid_objects, - ) - placements = tuple( - _build_relative_placement_step( - entry=entry, - by_uid=by_uid, - scene_objects=scene_objects, - rigid_objects=rigid_objects, - runtime_uids=runtime_uids, - forced_side=forced_side, - ) - for entry, forced_side in zip(placement_entries, forced_arm_sides) - ) - _validate_relative_placements(placements) - - summary = str(response.get("task_prompt_summary", "")).strip() - if not summary: - summary = _default_relative_plan_summary(placements) - background_notes = str(response.get("basic_background_notes", "")).strip() - action_sketch = _string_list(response.get("action_sketch")) - if not action_sketch: - action_sketch = 
_default_relative_action_sketch(placements) - - primary = placements[0] - - return _RelativePlacementSpec( - table_source_uid=table_source_uid, - moved_source_uid=primary.moved_source_uid, - reference_source_uid=primary.reference_source_uid, - moved_runtime_uid=primary.moved_runtime_uid, - reference_runtime_uid=primary.reference_runtime_uid, - relation=primary.relation, - active_side=primary.active_side, - task_description=task_description, - task_prompt_summary=summary, - basic_background_notes=background_notes, - action_sketch=action_sketch, - release_offset=primary.release_offset, - high_offset=primary.high_offset, - placements=placements, - reference_is_initial_pose=primary.reference_is_initial_pose, - release_position=primary.release_position, - high_position=primary.high_position, - ) - - -def _relative_placement_entries(response: Mapping[str, Any]) -> list[Mapping[str, Any]]: - placements = response.get("placements") - if placements is None: - return [response] - if not isinstance(placements, list) or not placements: - raise ValueError("LLM response placements must be a non-empty list.") - entries: list[Mapping[str, Any]] = [] - for index, placement in enumerate(placements): - if not isinstance(placement, Mapping): - raise ValueError(f"Placement {index} must be a JSON object.") - entries.append(placement) - return entries - - -def _relative_forced_arm_sides( - placement_entries: list[Mapping[str, Any]], - *, - by_uid: Mapping[str, _SceneObject], - rigid_objects: list[_SceneObject], -) -> list[str | None]: - if len(placement_entries) != 2: - return [None for _ in placement_entries] - - requested_sides = [ - _normalize_relative_arm(entry.get("arm")) for entry in placement_entries - ] - explicit_sides = [side for side in requested_sides if side != "auto"] - if len(explicit_sides) == 2: - return [None, None] - if len(explicit_sides) == 1: - complement = "right" if explicit_sides[0] == "left" else "left" - return [ - requested_side if requested_side != "auto" 
else complement - for requested_side in requested_sides - ] - - moved_source_uids = [ - _resolve_rigid_source_uid( - entry.get("moved_object"), - rigid_objects, - field_name="moved_object", - ) - for entry in placement_entries - ] - positions = [ - _vector3(by_uid[source_uid].config.get("init_pos", [0.0, 0.0, 0.0])) - for source_uid in moved_source_uids - ] - inferred_sides = [_arm_side_for_position(position) for position in positions] - if set(inferred_sides) == {"left", "right"}: - return inferred_sides - - side_values = [_position_side_axis_value(position) for position in positions] - if side_values[0] <= side_values[1]: - return ["left", "right"] - return ["right", "left"] - - -def _build_relative_placement_step( - *, - entry: Mapping[str, Any], - by_uid: Mapping[str, _SceneObject], - scene_objects: list[_SceneObject], - rigid_objects: list[_SceneObject], - runtime_uids: Mapping[str, str], - forced_side: str | None, -) -> _RelativePlacementStepSpec: - moved_source_uid = _resolve_rigid_source_uid( - entry.get("moved_object"), - rigid_objects, - field_name="moved_object", - ) - relation = _normalize_relative_relation(entry.get("goal_relation")) - reference_source_uid = _resolve_relative_reference_source_uid( - entry.get("reference_object"), - moved_source_uid=moved_source_uid, - scene_objects=scene_objects, - ) - reference_is_initial_pose = moved_source_uid == reference_source_uid - if reference_is_initial_pose and relation not in _SIDE_RELATIONS: - raise ValueError( - "Initial-position self-relative placement only supports directional " - "relations, not inside/on." 
- ) - - reference_obj = by_uid[reference_source_uid] - if relation == "on" and _is_container_like(reference_obj): - relation = "inside" - - moved_runtime_uid = runtime_uids[moved_source_uid] - reference_runtime_uid = runtime_uids[reference_source_uid] - if moved_runtime_uid == reference_runtime_uid and not reference_is_initial_pose: - raise ValueError( - f"Relative placement produced duplicate runtime uid {moved_runtime_uid!r}." - ) - - release_offset = _relative_release_offset(relation) - high_offset = list(release_offset) - high_offset[2] += _STAGING_Z_DELTA - moved_position = _vector3( - by_uid[moved_source_uid].config.get("init_pos", [0, 0, 0]) - ) - requested_side = _normalize_relative_arm(entry.get("arm")) - active_side = ( - forced_side - if forced_side is not None - else ( - _arm_side_for_position(moved_position) - if requested_side == "auto" - else requested_side - ) - ) - - return _RelativePlacementStepSpec( - moved_source_uid=moved_source_uid, - reference_source_uid=reference_source_uid, - moved_runtime_uid=moved_runtime_uid, - reference_runtime_uid=reference_runtime_uid, - relation=relation, - active_side=active_side, - release_offset=release_offset, - high_offset=high_offset, - reference_is_initial_pose=reference_is_initial_pose, - ) - - -def _validate_relative_placements( - placements: tuple[_RelativePlacementStepSpec, ...], -) -> None: - if not placements: - raise ValueError("Relative placement requires at least one placement.") - moved_source_uids = [placement.moved_source_uid for placement in placements] - if len(moved_source_uids) != len(set(moved_source_uids)): - raise ValueError("Relative placements must use distinct moved_object values.") - if len(placements) == 2: - active_sides = {placement.active_side for placement in placements} - if active_sides != {"left", "right"}: - raise ValueError( - "Dual-arm relative placement requires one left arm and one right arm." 
- ) - - -def _resolve_rigid_source_uid( - value: Any, - rigid_objects: list[_SceneObject], - *, - field_name: str, -) -> str: - return _resolve_scene_source_uid( - value, - rigid_objects, - field_name=field_name, - ) - - -def _resolve_relative_reference_source_uid( - value: Any, - *, - moved_source_uid: str, - scene_objects: list[_SceneObject], -) -> str: - if value is not None: - text = str(value).strip() - normalized = text.lower().replace("-", "_").replace(" ", "_") - if normalized in _SELF_REFERENCE_VALUES: - return moved_source_uid - return _resolve_scene_source_uid( - value, - scene_objects, - field_name="reference_object", - ) - - -def _resolve_scene_source_uid( - value: Any, - scene_objects: list[_SceneObject], - *, - field_name: str, -) -> str: - if value is None: - raise ValueError(f"LLM response missing required {field_name}.") - text = str(value).strip() - by_uid = {obj.source_uid: obj for obj in scene_objects} - if text in by_uid: - return text - - normalized = _normalize_runtime_uid(text) - matches = [ - obj.source_uid - for obj in scene_objects - if _normalize_runtime_uid(obj.source_uid) == normalized - or _base_name(obj) == normalized - or _candidate_relative_runtime_uid(obj) == normalized - ] - if len(matches) == 1: - return matches[0] - if not matches: - raise ValueError(f"LLM returned unknown {field_name}: {text!r}.") - raise ValueError( - f"LLM returned ambiguous {field_name}: {text!r}; candidates: {matches}." - ) - - -def _normalize_relative_relation(value: Any) -> str: - relation = str(value or "").strip().lower().replace("-", "_").replace(" ", "_") - relation = _RELATION_ALIASES.get(relation, relation) - if relation not in _RELATIVE_RELATIONS: - raise ValueError( - f"Unsupported relative placement relation {value!r}; expected one " - f"of {sorted(_RELATIVE_RELATIONS)}." 
- ) - return relation - - -def _normalize_relative_arm(value: Any) -> str: - if value is None: - return "auto" - text = str(value).strip().lower().replace("-", "_").replace(" ", "_") - if text in { - "", - "auto", - "automatic", - "unspecified", - "none", - "null", - "default", - "自动", - "默认", - "未指定", - "不指定", - }: - return "auto" - if text in { - "left", - "left_arm", - "left_ur5", - "左", - "左臂", - "左机械臂", - "左手", - "左手臂", - }: - return "left" - if text in { - "right", - "right_arm", - "right_ur5", - "右", - "右臂", - "右机械臂", - "右手", - "右手臂", - }: - return "right" - raise ValueError( - f"Unsupported relative placement arm {value!r}; expected 'left', " - "'right', or 'auto'." - ) - - -def _relative_release_offset(relation: str) -> list[float]: - relation = _normalize_relative_relation(relation) - if relation == "inside": - return [0.0, 0.0, _SIDE_RELEASE_Z_OFFSET] - if relation == "on": - return [0.0, 0.0, _ON_RELEASE_Z_OFFSET] - if relation in _SIDE_RELATIONS: - x_offset, y_offset = _side_relation_xy_offsets(relation) - return [x_offset, y_offset, _SIDE_RELEASE_Z_OFFSET] - raise ValueError(f"Unsupported relative placement relation: {relation!r}.") - - -def _side_relation_xy_offsets(relation: str) -> tuple[float, float]: - relation = _normalize_relative_relation(relation) - left_y = _ROBOT_VIEW_LEFT_WORLD_Y_SIGN * _SIDE_RELATION_DISTANCE - right_y = -_ROBOT_VIEW_LEFT_WORLD_Y_SIGN * _SIDE_RELATION_DISTANCE - front_x = _ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE - behind_x = -_ROBOT_VIEW_FRONT_WORLD_X_SIGN * _SIDE_RELATION_DISTANCE - if relation == "left_of": - return 0.0, left_y - if relation == "right_of": - return 0.0, right_y - if relation == "front_of": - return front_x, 0.0 - if relation == "behind": - return behind_x, 0.0 - if relation == "front_left_of": - return front_x, left_y - if relation == "back_left_of": - return behind_x, left_y - if relation == "front_right_of": - return front_x, right_y - if relation == "back_right_of": - return behind_x, 
right_y - raise ValueError(f"Unsupported side relation: {relation!r}.") - - -def _relative_runtime_uid_mapping( - rigid_objects: list[_SceneObject], -) -> dict[str, str]: - candidates: dict[str, str] = {} - for obj in rigid_objects: - if _is_container_like(obj): - candidates[obj.source_uid] = _container_runtime_uid(obj) - continue - - base = _target_runtime_suffix(_base_name(obj)) - base_count = sum( - 1 for other in rigid_objects if _base_name(other) == _base_name(obj) - ) - candidates[obj.source_uid] = ( - base if base_count == 1 else _normalize_runtime_uid(obj.source_uid) - ) - - counts: dict[str, int] = {} - for runtime_uid in candidates.values(): - counts[runtime_uid] = counts.get(runtime_uid, 0) + 1 - return { - source_uid: ( - runtime_uid - if counts[runtime_uid] == 1 - else _normalize_runtime_uid(source_uid) - ) - for source_uid, runtime_uid in candidates.items() - } - - -def _relative_scene_runtime_uid_mapping( - scene_objects: list[_SceneObject], - *, - table_source_uid: str, -) -> dict[str, str]: - candidates: dict[str, str] = {} - rigid_runtime_uids = _relative_runtime_uid_mapping( - [obj for obj in scene_objects if obj.source_role == "rigid_object"] - ) - for obj in scene_objects: - if obj.source_uid == table_source_uid: - candidates[obj.source_uid] = "table" - elif obj.source_role == "rigid_object": - candidates[obj.source_uid] = rigid_runtime_uids[obj.source_uid] - else: - candidates[obj.source_uid] = _candidate_relative_runtime_uid(obj) - - counts: dict[str, int] = {} - for runtime_uid in candidates.values(): - counts[runtime_uid] = counts.get(runtime_uid, 0) + 1 - return { - source_uid: ( - runtime_uid - if source_uid == table_source_uid or counts[runtime_uid] == 1 - else _normalize_runtime_uid(source_uid) - ) - for source_uid, runtime_uid in candidates.items() - } - - -def _candidate_relative_runtime_uid(obj: _SceneObject) -> str: - if _is_container_like(obj): - return _container_runtime_uid(obj) - return _target_runtime_suffix(_base_name(obj)) - 
def _is_container_like(obj: _SceneObject) -> bool:
    """Return True when any container keyword occurs in the object's text.

    The text comes from ``_object_text(obj)`` and the keywords from
    ``_CONTAINER_KEYWORDS``; both are defined elsewhere in this module.
    NOTE(review): the check is a plain substring test, so it is only as
    case-insensitive as ``_object_text`` makes it -- confirm against that helper.
    """
    return any(keyword in _object_text(obj) for keyword in _CONTAINER_KEYWORDS)


def _string_list(value: Any) -> list[str]:
    """Coerce a loosely-typed list into a list of non-empty stripped strings.

    Args:
        value: Any object; only ``list`` instances are inspected.

    Returns:
        ``[]`` when ``value`` is not a list. Otherwise every item that is not
        ``None`` is converted with ``str()``, stripped, and kept if non-empty.
    """
    if not isinstance(value, list):
        return []
    # A JSON ``null`` entry must not leak through as the literal text "None".
    stripped = (str(item).strip() for item in value if item is not None)
    return [text for text in stripped if text]


def _default_relative_task_summary(
    moved_uid: str,
    reference_uid: str,
    relation: str,
) -> str:
    """Build the fallback one-sentence summary for a single placement.

    Args:
        moved_uid: Uid of the object being moved, quoted verbatim in the text.
        reference_uid: Uid of the reference object, quoted verbatim in the text.
        relation: Relation name, rendered through ``_relative_relation_phrase``.

    Returns:
        A sentence of the form "Move `<moved>` so its final state is
        <phrase> `<reference>`."

    Raises:
        ValueError: Propagated from ``_relative_relation_phrase`` for an
            unsupported relation.
    """
    return (
        f"Move `{moved_uid}` so its final state is "
        f"{_relative_relation_phrase(relation)} `{reference_uid}`."
    )


def _default_relative_plan_summary(
    placements: Sequence[_RelativePlacementStepSpec],
) -> str:
    """Build the fallback plan summary for one or several placements.

    A single placement reuses ``_default_relative_task_summary``. Any other
    count produces one "use the <side> UR5 to move ..." clause per placement,
    joined with "; " inside a dual-arm sentence.

    Args:
        placements: Placement steps; each must expose ``moved_runtime_uid``,
            ``reference_runtime_uid``, ``relation`` and ``active_side``.

    Returns:
        The summary sentence.
    """
    if len(placements) == 1:
        placement = placements[0]
        return _default_relative_task_summary(
            placement.moved_runtime_uid,
            placement.reference_runtime_uid,
            placement.relation,
        )
    placement_text = "; ".join(
        f"use the {placement.active_side} UR5 to move "
        f"`{placement.moved_runtime_uid}` "
        f"{_relative_relation_phrase(placement.relation)} "
        f"`{placement.reference_runtime_uid}`"
        for placement in placements
    )
    return f"Use both UR5 arms for a dual-arm relative placement: {placement_text}."
def _default_relative_action_sketch(
    placements: Sequence[_RelativePlacementStepSpec],
) -> list[str]:
    """Build the fallback action sketch for one or several placements.

    Args:
        placements: Placement steps; each must expose ``moved_runtime_uid``,
            ``reference_runtime_uid``, ``relation`` and ``active_side``.

    Returns:
        Three steps (grasp, move above, place) for a single placement.
        Otherwise one shared grasp step followed by a move step and a place
        step per placement.
    """
    if len(placements) == 1:
        placement = placements[0]
        return [
            f"grasp {placement.moved_runtime_uid}",
            (
                f"move above the {placement.relation} release pose relative to "
                f"{placement.reference_runtime_uid}"
            ),
            "place at the release pose with PlaceAction",
        ]
    sketch = ["grasp both moved objects with their assigned arms"]
    for placement in placements:
        sketch.extend(
            [
                (
                    f"use {placement.active_side}_arm to move "
                    f"{placement.moved_runtime_uid} above the release pose relative "
                    f"to {placement.reference_runtime_uid}"
                ),
                f"place {placement.moved_runtime_uid} with PlaceAction",
            ]
        )
    return sketch


# English phrase used in generated prose for each normalized relation name.
_RELATIVE_RELATION_PHRASES: dict[str, str] = {
    "inside": "inside",
    "on": "on top of",
    "left_of": "to the left of",
    "right_of": "to the right of",
    "front_of": "in front of",
    "behind": "behind",
    "front_left_of": "to the front-left of",
    "back_left_of": "to the back-left of",
    "front_right_of": "to the front-right of",
    "back_right_of": "to the back-right of",
}


def _relative_relation_phrase(relation: str) -> str:
    """Return the English phrase for a relative placement relation.

    Args:
        relation: Relation name; it is passed through
            ``_normalize_relative_relation`` before the lookup.

    Returns:
        The phrase, e.g. ``"on top of"`` for ``"on"``.

    Raises:
        ValueError: If the normalized relation has no phrase, or if
            normalization itself rejects the value.
    """
    relation = _normalize_relative_relation(relation)
    try:
        return _RELATIVE_RELATION_PHRASES[relation]
    except KeyError:
        raise ValueError(
            f"Unsupported relative placement relation: {relation!r}."
        ) from None


def _build_ur5_basket_bundle(
    *,
    scene_dir: Path,
    source_config: Mapping[str, Any],
    roles: _BasketTaskRoles,
    project_name: str,
    task_name: str,
    target_body_scale: float | list[float] | tuple[float, float, float],
    target_replacements: Sequence[_ResolvedTargetReplacement],
    max_episodes: int,
    max_episode_steps: int,
    mesh_normalizer: MeshFrameNormalizer,
) -> dict[str, Any]:
    """Assemble the generated bundle for the basket template task.

    The table and the container go into ``background`` together with every
    other background object. The right and left targets (in that order) go
    into ``rigid_object``, followed by every remaining rigid object.

    Args:
        scene_dir: Scene directory forwarded to the per-object config builders.
        source_config: Source scene config the scene objects are collected from.
        roles: Source/runtime uids of the table, container and both targets.
        project_name: Project name forwarded to the dataset and prompt builders.
        task_name: Task name forwarded to the task prompt builder.
        target_body_scale: Scalar or 3-vector scale applied to both targets.
        target_replacements: Replacements looked up by target source uid and
            echoed in the summary.
        max_episodes: Stored as ``int`` in the gym config.
        max_episode_steps: Stored as ``int`` in the gym config.
        mesh_normalizer: Forwarded to every mesh config builder.

    Returns:
        A dict with keys ``gym_config``, ``agent_config``, ``task_prompt``,
        ``basic_background``, ``atom_actions`` and ``summary``.
    """
    scene_objects = _collect_scene_objects(source_config)
    by_uid = {obj.source_uid: obj for obj in scene_objects}
    replacement_by_source_uid = {
        replacement.source_uid: replacement for replacement in target_replacements
    }
    object_scale = _target_body_scale_vector(target_body_scale)
    # The container keeps the scale from its source config, unlike the targets.
    container_scale = _source_body_scale(by_uid[roles.container_source_uid])
    task_source_uids = {
        roles.container_source_uid,
        roles.left_target_source_uid,
        roles.right_target_source_uid,
    }
    extra_rigid_objects = [
        obj
        for obj in scene_objects
        if obj.source_role == "rigid_object" and obj.source_uid not in task_source_uids
    ]
    extra_background_objects = [
        obj
        for obj in scene_objects
        if obj.source_role == "background" and obj.source_uid != roles.table_source_uid
    ]
    table_config = _make_background_config(
        scene_dir,
        by_uid[roles.table_source_uid],
        mesh_normalizer,
    )
    # The table top drives both the robot base height and the object z placement.
    table_top_z = _mesh_config_world_zmax(table_config)
    robot_init_z = _dual_ur5_init_z_from_table_top(table_top_z)

    gym_config = {
        "id": "AtomicActionsAgent-v3",
        "max_episodes": int(max_episodes),
        "max_episode_steps": int(max_episode_steps),
        "env": {
            "extensions": _make_extensions_config(roles),
            "events": _make_events_config(roles),
            "observations": _make_observations_config(),
            "dataset": _make_dataset_config(project_name, roles),
        },
        "robot": _make_dual_ur5_robot_config(robot_init_z=robot_init_z),
        "sensor": _make_sensor_config(),
        "light": _make_light_config(),
        "background": [
            table_config,
            _make_container_background_config(
                scene_dir,
                by_uid[roles.container_source_uid],
                roles.container_runtime_uid,
                container_scale,
                mesh_normalizer,
            ),
            *[
                _make_extra_background_config(scene_dir, obj, mesh_normalizer)
                for obj in extra_background_objects
            ],
        ],
        "rigid_object": [
            _make_target_object_config(
                scene_dir,
                by_uid[roles.right_target_source_uid],
                roles.right_target_runtime_uid,
                object_scale,
                mesh_normalizer,
                replacement_by_source_uid.get(roles.right_target_source_uid),
            ),
            _make_target_object_config(
                scene_dir,
                by_uid[roles.left_target_source_uid],
                roles.left_target_runtime_uid,
                object_scale,
                mesh_normalizer,
                replacement_by_source_uid.get(roles.left_target_source_uid),
            ),
            *[
                _make_extra_rigid_object_config(
                    scene_dir,
                    obj,
                    _source_body_scale(obj),
                    mesh_normalizer,
                )
                for obj in extra_rigid_objects
            ],
        ],
    }
    _apply_tabletop_z_placement(gym_config, table_top_z)
    return {
        "gym_config": gym_config,
        "agent_config": make_agent_config(),
        "task_prompt": make_basket_task_prompt(task_name, project_name, roles),
        "basic_background": make_basket_basic_background(project_name, roles),
        "atom_actions": make_basket_atom_actions_prompt(roles),
        "summary": {
            "mode": "basket_template",
            "left_target": roles.left_target_runtime_uid,
            "right_target": roles.right_target_runtime_uid,
            "container": roles.container_runtime_uid,
            "target_replacements": [
                {
                    "source_uid": replacement.source_uid,
                    "prompt": replacement.prompt,
                    "output_dir_name": replacement.output_dir_name,
                    "mesh_path": replacement.mesh_path.as_posix(),
                    "runtime_noun": replacement.runtime_noun,
                    "reused": replacement.reused,
                }
                for replacement in target_replacements
            ],
        },
    }


def _attach_coacd_cache_summary(bundle: dict[str, Any]) -> None:
    """Prewarm the CoACD cache and record the result in the bundle summary.

    The value returned by ``prewarm_coacd_cache_for_gym_config`` for
    ``bundle["gym_config"]`` is stored under ``bundle["summary"]["coacd_cache"]``;
    the ``summary`` dict is created when missing.

    Args:
        bundle: Generated bundle; mutated in place.
    """
    # NOTE(review): imported locally, presumably to defer loading the cache
    # module until it is needed -- confirm before hoisting to module level.
    from embodichain.gen_sim.action_agent_pipeline.generation.coacd_cache import (
        prewarm_coacd_cache_for_gym_config,
    )

    bundle.setdefault("summary", {})["coacd_cache"] = (
        prewarm_coacd_cache_for_gym_config(bundle["gym_config"])
    )


def _attach_mesh_normalization_summary(
    bundle: dict[str, Any],
    mesh_normalizer: MeshFrameNormalizer,
) -> None:
    """Record the normalizer's reports in the bundle summary when there are any.

    Args:
        bundle: Generated bundle; mutated in place only when
            ``mesh_normalizer.reports`` is truthy.
        mesh_normalizer: Object exposing a ``reports`` attribute.
    """
    reports = mesh_normalizer.reports
    if reports:
        bundle.setdefault("summary", {})["normalized_meshes"] = reports


def _build_relative_placement_bundle(
    *,
    scene_dir: Path,
    source_config: Mapping[str, Any],
    spec: _RelativePlacementSpec,
    project_name: str,
    task_name: str,
    target_body_scale: float | list[float] | tuple[float, float, float],
    max_episodes: int,
    max_episode_steps: int,
    mesh_normalizer: MeshFrameNormalizer,
) -> dict[str, Any]:
    """Assemble the generated bundle for a relative placement task.

    Rigid objects named as ``moved_source_uid`` by a placement are emitted
    under ``rigid_object``. All other rigid objects are emitted under
    ``background`` next to the table and the remaining background objects.

    Args:
        scene_dir: Scene directory forwarded to the per-object config builders.
        source_config: Source scene config the scene objects are collected from.
        spec: Relative placement spec; its placements select the moved objects.
        project_name: Project name forwarded to the dataset and prompt builders.
        task_name: Task name forwarded to the task prompt builder.
        target_body_scale: Scalar or 3-vector scale applied to the moved objects.
        max_episodes: Stored as ``int`` in the gym config.
        max_episode_steps: Stored as ``int`` in the gym config.
        mesh_normalizer: Forwarded to every mesh config builder.

    Returns:
        A dict with keys ``gym_config``, ``agent_config``, ``task_prompt``,
        ``basic_background``, ``atom_actions`` and ``summary``.
    """
    scene_objects = _collect_scene_objects(source_config)
    background_objects = [
        obj for obj in scene_objects if obj.source_role == "background"
    ]
    rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"]
    by_uid = {obj.source_uid: obj for obj in scene_objects}
    runtime_uids = _relative_scene_runtime_uid_mapping(
        scene_objects,
        table_source_uid=spec.table_source_uid,
    )
    moved_source_uids = {placement.moved_source_uid for placement in spec.placements}
    reference_runtime_uids = {
        placement.reference_runtime_uid for placement in spec.placements
    }
    # Every rigid object plus every reference object, sorted for stable output.
    registered_runtime_uids = sorted(
        {runtime_uids[obj.source_uid] for obj in rigid_objects} | reference_runtime_uids
    )
    dynamic_rigid_objects = [
        obj for obj in rigid_objects if obj.source_uid in moved_source_uids
    ]
    static_scene_objects = [
        obj for obj in rigid_objects if obj.source_uid not in moved_source_uids
    ]
    object_scale = _target_body_scale_vector(target_body_scale)
    table_config = _make_background_config(
        scene_dir,
        by_uid[spec.table_source_uid],
        mesh_normalizer,
    )
    # The table top drives both the robot base height and the object z placement.
    table_top_z = _mesh_config_world_zmax(table_config)
    robot_init_z = _dual_ur5_init_z_from_table_top(table_top_z)

    gym_config = {
        "id": "AtomicActionsAgent-v3",
        "max_episodes": int(max_episodes),
        "max_episode_steps": int(max_episode_steps),
        "env": {
            # Filled in below, once self-relative targets have been resolved.
            "extensions": {},
            "events": _make_relative_events_config(spec, registered_runtime_uids),
            "observations": _make_observations_config(),
            # Filled in below, once self-relative targets have been resolved.
            "dataset": {},
        },
        "robot": _make_dual_ur5_robot_config(robot_init_z=robot_init_z),
        "sensor": _make_sensor_config(),
        "light": _make_light_config(),
        "background": [
            table_config,
            *[
                _make_relative_background_object_config(
                    scene_dir,
                    obj,
                    runtime_uids[obj.source_uid],
                    max_convex_hull_num=_relative_static_background_max_convex_hull_num(
                        runtime_uids[obj.source_uid],
                        spec,
                    ),
                    mesh_normalizer=mesh_normalizer,
                )
                for obj in static_scene_objects
            ],
            *[
                _make_extra_background_config(
                    scene_dir,
                    obj,
                    mesh_normalizer,
                    runtime_uid=runtime_uids[obj.source_uid],
                )
                for obj in background_objects
                if obj.source_uid != spec.table_source_uid
            ],
        ],
        "rigid_object": [
            _make_relative_rigid_object_config(
                scene_dir=scene_dir,
                obj=obj,
                runtime_uid=runtime_uids[obj.source_uid],
                body_scale=object_scale,
                max_convex_hull_num=_relative_rigid_object_max_convex_hull_num(
                    runtime_uids[obj.source_uid],
                    spec,
                ),
                mesh_normalizer=mesh_normalizer,
            )
            for obj in dynamic_rigid_objects
        ],
    }
    _apply_tabletop_z_placement(gym_config, table_top_z)
    # Self-relative targets need the final init_pos values, so they are resolved
    # only after the tabletop z placement has rewritten them.
    spec = _with_self_relative_absolute_targets(spec, gym_config)
    gym_config["env"]["extensions"] = _make_relative_extensions_config(spec)
    gym_config["env"]["dataset"] = _make_relative_dataset_config(project_name, spec)
    return {
        "gym_config": gym_config,
        "agent_config": make_agent_config(),
        "task_prompt": make_relative_task_prompt(task_name, project_name, spec),
        "basic_background": make_relative_basic_background(project_name, spec),
        "atom_actions": make_relative_atom_actions_prompt(spec),
        "summary": _make_relative_summary(spec),
    }


def _with_self_relative_absolute_targets(
    spec: _RelativePlacementSpec,
    gym_config: Mapping[str, Any],
) -> _RelativePlacementSpec:
    """Resolve absolute release/high positions for self-relative placements.

    Args:
        spec: Relative placement spec.
        gym_config: Generated gym config; the ``uid`` and ``init_pos`` of its
            ``rigid_object`` entries supply the initial positions.

    Returns:
        ``spec`` itself when no placement has ``reference_is_initial_pose`` set.
        Otherwise a new spec whose placements are updated and whose top-level
        placement fields mirror the first updated placement.

    Raises:
        ValueError: If a self-relative moved object is absent from the
            generated rigid objects.
    """
    if not any(placement.reference_is_initial_pose for placement in spec.placements):
        return spec

    generated_positions = {
        str(obj.get("uid")): _clean_vector3(obj.get("init_pos", [0.0, 0.0, 0.0]))
        for obj in gym_config.get("rigid_object", [])
    }
    placements = tuple(
        _with_self_relative_absolute_target(placement, generated_positions)
        for placement in spec.placements
    )
    primary = placements[0]
    return _RelativePlacementSpec(
        table_source_uid=spec.table_source_uid,
        moved_source_uid=primary.moved_source_uid,
        reference_source_uid=primary.reference_source_uid,
        moved_runtime_uid=primary.moved_runtime_uid,
        reference_runtime_uid=primary.reference_runtime_uid,
        relation=primary.relation,
        active_side=primary.active_side,
        task_description=spec.task_description,
        task_prompt_summary=spec.task_prompt_summary,
        basic_background_notes=spec.basic_background_notes,
        action_sketch=spec.action_sketch,
        release_offset=primary.release_offset,
        high_offset=primary.high_offset,
        placements=placements,
        reference_is_initial_pose=primary.reference_is_initial_pose,
        release_position=primary.release_position,
        high_position=primary.high_position,
    )


def _with_self_relative_absolute_target(
    placement: _RelativePlacementStepSpec,
    generated_positions: Mapping[str, list[float]],
) -> _RelativePlacementStepSpec:
    """Resolve absolute release/high positions for one self-relative placement.

    Args:
        placement: Placement step to resolve.
        generated_positions: Initial positions keyed by runtime uid.

    Returns:
        ``placement`` itself when ``reference_is_initial_pose`` is falsy.
        Otherwise a copy whose ``release_position`` and ``high_position`` are
        the moved object's initial position plus the respective offsets.

    Raises:
        ValueError: If ``generated_positions`` has no entry for the moved object.
    """
    if not placement.reference_is_initial_pose:
        return placement
    initial_position = generated_positions.get(placement.moved_runtime_uid)
    if initial_position is None:
        raise ValueError(
            "Generated relative config missing self-relative moved object "
            f"{placement.moved_runtime_uid!r}."
        )
    release_position = _offset_position(initial_position, placement.release_offset)
    high_position = _offset_position(initial_position, placement.high_offset)
    return _RelativePlacementStepSpec(
        moved_source_uid=placement.moved_source_uid,
        reference_source_uid=placement.reference_source_uid,
        moved_runtime_uid=placement.moved_runtime_uid,
        reference_runtime_uid=placement.reference_runtime_uid,
        relation=placement.relation,
        active_side=placement.active_side,
        release_offset=placement.release_offset,
        high_offset=placement.high_offset,
        reference_is_initial_pose=True,
        release_position=release_position,
        high_position=high_position,
    )


def _offset_position(
    position: Sequence[float],
    offset: Sequence[float],
) -> list[float]:
    """Add the first three components of two sequences, rounded to 6 decimals.

    Args:
        position: Sequence with at least three numeric components.
        offset: Sequence with at least three numeric components.

    Returns:
        A new three-element list of floats.

    Raises:
        IndexError: If either sequence has fewer than three components.
    """
    return [
        round(float(position[index]) + float(offset[index]), 6) for index in range(3)
    ]


def _target_body_scale_vector(
    target_body_scale: float | list[float] | tuple[float, float, float],
) -> list[float]:
    """Expand a scalar scale to a 3-vector, or clean a vector scale.

    Args:
        target_body_scale: An ``int``/``float`` applied to all three axes, or a
            vector handed to ``_clean_vector3``.

    Returns:
        A three-element list of floats.
    """
    if isinstance(target_body_scale, (int, float)):
        value = float(target_body_scale)
        return [value, value, value]
    return _clean_vector3(target_body_scale)


def _source_body_scale(obj: _SceneObject) -> list[float]:
    """Return the object's configured ``body_scale``, defaulting to unit scale."""
    return _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0]))


def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]:
    """Build the ``summary`` entry of a relative placement bundle.

    Args:
        spec: Relative placement spec.

    Returns:
        With exactly one placement, a flat ``relative_placement`` dict read
        from the spec's top-level fields. Otherwise a
        ``dual_arm_relative_placement`` dict with one entry per placement.
    """
    if len(spec.placements) == 1:
        return {
            "mode": "relative_placement",
            "moved_object": spec.moved_runtime_uid,
            "reference_object": spec.reference_runtime_uid,
            "relation": spec.relation,
            "active_arm": f"{spec.active_side}_arm",
            "release_offset": spec.release_offset,
        }
    return {
        "mode": "dual_arm_relative_placement",
        "placements": [
            {
                "moved_object": placement.moved_runtime_uid,
                "reference_object": placement.reference_runtime_uid,
                "relation": placement.relation,
                "active_arm": f"{placement.active_side}_arm",
                "release_offset": placement.release_offset,
            }
            for placement in spec.placements
        ],
    }
-def _dual_ur5_init_z_from_table_top(table_top_z: float | None) -> float: - if table_top_z is None: - return _DUAL_UR5_LEGACY_INIT_Z - - init_z = table_top_z + _DUAL_UR5_TABLETOP_CLEARANCE - _DUAL_UR5_ARM_COMPONENT_Z - return round(init_z, 6) - - -def _apply_tabletop_z_placement( - gym_config: dict[str, Any], - table_top_z: float | None, -) -> None: - if table_top_z is None: - return - target_bottom_z = float(table_top_z) + _TABLETOP_OBJECT_CLEARANCE - for obj in _iter_generated_scene_object_configs(gym_config): - if obj.get("uid") == "table": - continue - mesh_min_z = _mesh_config_local_zmin_after_rotation(obj) - if mesh_min_z is None: - continue - init_pos = _clean_vector3(obj.get("init_pos", [0.0, 0.0, 0.0])) - init_pos[2] = round(target_bottom_z - mesh_min_z, 6) - obj["init_pos"] = init_pos - - -def _iter_generated_scene_object_configs( - gym_config: Mapping[str, Any], -) -> list[dict[str, Any]]: - objects: list[dict[str, Any]] = [] - for section in ("background", "rigid_object"): - value = gym_config.get(section, []) - if isinstance(value, Mapping): - value = [value] - if not isinstance(value, list): - continue - objects.extend(obj for obj in value if isinstance(obj, dict)) - return objects - - -def _mesh_config_world_zmax(obj_config: Mapping[str, Any]) -> float | None: - bounds = _mesh_config_world_z_bounds(obj_config) - if bounds is None: - return None - return bounds[1] - - -def _mesh_config_local_zmin_after_rotation( - obj_config: Mapping[str, Any], -) -> float | None: - shape = obj_config.get("shape", {}) - if not isinstance(shape, Mapping): - return None - mesh_path = shape.get("fpath") - if not isinstance(mesh_path, str): - return None - vertices = _load_mesh_vertices(Path(mesh_path).expanduser().resolve()) - if not vertices: - return None - - matrix = _mesh_config_transform_matrix( - obj_config, - translation=[0.0, 0.0, 0.0], - ) - return min(_transform_point(matrix, vertex)[2] for vertex in vertices) - - -def _mesh_config_world_z_bounds( - obj_config: 
Mapping[str, Any], -) -> tuple[float, float] | None: - shape = obj_config.get("shape", {}) - if not isinstance(shape, Mapping): - return None - mesh_path = shape.get("fpath") - if not isinstance(mesh_path, str): - return None - vertices = _load_mesh_vertices(Path(mesh_path).expanduser().resolve()) - if not vertices: - return None - - matrix = _mesh_config_transform_matrix(obj_config) - z_values = [_transform_point(matrix, vertex)[2] for vertex in vertices] - return (min(z_values), max(z_values)) - - -def _mesh_config_transform_matrix( - obj_config: Mapping[str, Any], - *, - translation: list[float] | None = None, -) -> list[list[float]]: - scale = _vector3(obj_config.get("body_scale", [1.0, 1.0, 1.0])) - init_local_pose = obj_config.get("init_local_pose") - if init_local_pose is not None and translation is None: - root_matrix = _matrix4(init_local_pose) - else: - root_matrix = _euler_xyz_degrees_matrix( - _vector3(obj_config.get("init_rot", [0.0, 0.0, 0.0])), - ( - _vector3(obj_config.get("init_pos", [0.0, 0.0, 0.0])) - if translation is None - else translation - ), - ) - return _matrix_multiply(root_matrix, _scale_matrix4(scale)) - - -def _resolve_table_mesh_world_zmax( - scene_dir: Path, - table_obj: _SceneObject, -) -> float | None: - shape = table_obj.config.get("shape", {}) - if not isinstance(shape, Mapping): - return None - if shape.get("shape_type") != "Mesh" or not shape.get("fpath"): - return None - - mesh_path = _source_asset_path(scene_dir, str(shape["fpath"])) - try: - vertices = _load_mesh_vertices(mesh_path) - except ( - OSError, - ValueError, - json.JSONDecodeError, - UnicodeDecodeError, - struct.error, - ): - return None - if not vertices: - return None - - world_matrix = _table_mesh_world_matrix(table_obj.config) - return max(_transform_point(world_matrix, vertex)[2] for vertex in vertices) - - -def _source_asset_path(scene_dir: Path, fpath: str) -> Path: - raw_path = Path(fpath) - if raw_path.is_absolute(): - return raw_path.resolve() - - 
scene_candidate = (scene_dir / raw_path).resolve() - if scene_candidate.exists(): - return scene_candidate - - repo_candidate = (_repo_root() / raw_path).resolve() - if repo_candidate.exists(): - return repo_candidate - return scene_candidate - - -def _load_mesh_vertices(mesh_path: Path) -> list[tuple[float, float, float]] | None: - if mesh_path.suffix.lower() == ".glb": - try: - return list(_iter_glb_world_position_vertices(mesh_path)) - except ( - OSError, - ValueError, - json.JSONDecodeError, - UnicodeDecodeError, - struct.error, - ): - return _load_mesh_vertices_with_trimesh(mesh_path) - return _load_mesh_vertices_with_trimesh(mesh_path) - - -def _load_mesh_vertices_with_trimesh( - mesh_path: Path, -) -> list[tuple[float, float, float]] | None: - try: - import trimesh - except ImportError: - return None - - try: - scene_or_mesh = trimesh.load(str(mesh_path), force="scene") - if hasattr(scene_or_mesh, "to_geometry"): - mesh = scene_or_mesh.to_geometry() - elif hasattr(scene_or_mesh, "dump"): - mesh = scene_or_mesh.dump(concatenate=True) - else: - mesh = scene_or_mesh - except Exception: - return None - vertices = getattr(mesh, "vertices", None) - if vertices is None or len(vertices) == 0: - return None - return [ - (float(vertex[0]), float(vertex[1]), float(vertex[2])) for vertex in vertices - ] - - -def _iter_glb_world_position_vertices( - mesh_path: Path, -): - doc, binary_chunk = _read_glb(mesh_path) - nodes = doc.get("nodes", []) - if not isinstance(nodes, list): - raise ValueError("GLB nodes must be a list.") - - scenes = doc.get("scenes", []) - if scenes: - scene_index = int(doc.get("scene", 0)) - root_node_ids = scenes[scene_index].get("nodes", []) - else: - root_node_ids = list(range(len(nodes))) - - stack = [(int(node_id), _identity_matrix4()) for node_id in root_node_ids] - while stack: - node_id, parent_matrix = stack.pop() - node = nodes[node_id] - node_matrix = _matrix_multiply(parent_matrix, _gltf_node_matrix(node)) - mesh_index = node.get("mesh") 
- if mesh_index is not None: - for vertex in _iter_gltf_mesh_position_vertices( - doc, - binary_chunk, - int(mesh_index), - ): - yield _transform_point(node_matrix, vertex) - for child_id in node.get("children", []) or []: - stack.append((int(child_id), node_matrix)) - - -def _read_glb(mesh_path: Path) -> tuple[dict[str, Any], bytes]: - data = mesh_path.read_bytes() - if len(data) < 20: - raise ValueError("GLB file is too small.") - - magic, version, total_length = struct.unpack_from("<4sII", data, 0) - if magic != b"glTF" or version != 2: - raise ValueError("Only GLB version 2 files are supported.") - if total_length > len(data): - raise ValueError("GLB length header exceeds file size.") - - doc: dict[str, Any] | None = None - binary_chunk = b"" - offset = 12 - while offset + 8 <= total_length: - chunk_length, chunk_type = struct.unpack_from(" total_length: - raise ValueError("GLB chunk exceeds file size.") - chunk = data[offset:chunk_end] - offset = chunk_end - if chunk_type == _GLB_JSON_CHUNK_TYPE: - doc = json.loads(chunk.decode("utf-8").rstrip("\x00 ")) - elif chunk_type == _GLB_BINARY_CHUNK_TYPE: - binary_chunk = chunk - - if doc is None: - raise ValueError("GLB file does not contain a JSON chunk.") - return doc, binary_chunk - - -def _iter_gltf_mesh_position_vertices( - doc: Mapping[str, Any], - binary_chunk: bytes, - mesh_index: int, -): - meshes = doc.get("meshes", []) - accessors = doc.get("accessors", []) - mesh = meshes[mesh_index] - for primitive in mesh.get("primitives", []) or []: - attributes = primitive.get("attributes", {}) - position_accessor = attributes.get("POSITION") - if position_accessor is None: - continue - if int(position_accessor) >= len(accessors): - raise ValueError("POSITION accessor index is out of range.") - yield from _iter_gltf_accessor_vec3(doc, binary_chunk, int(position_accessor)) - - -def _iter_gltf_accessor_vec3( - doc: Mapping[str, Any], - binary_chunk: bytes, - accessor_index: int, -): - accessor = 
doc["accessors"][accessor_index] - if accessor.get("sparse"): - raise ValueError("Sparse GLB accessors are not supported.") - if accessor.get("type") != "VEC3": - raise ValueError("POSITION accessor must be VEC3.") - if "bufferView" not in accessor: - raise ValueError("POSITION accessor must reference a bufferView.") - - component_type = int(accessor["componentType"]) - if component_type not in _GLTF_COMPONENT_FORMATS: - raise ValueError(f"Unsupported GLB component type: {component_type}.") - component_format, component_size = _GLTF_COMPONENT_FORMATS[component_type] - component_count = _GLTF_TYPE_COMPONENT_COUNTS[accessor["type"]] - buffer_view = doc["bufferViews"][int(accessor["bufferView"])] - if int(buffer_view.get("buffer", 0)) != 0: - raise ValueError("Only GLB embedded binary buffers are supported.") - - stride = int(buffer_view.get("byteStride", component_size * component_count)) - offset = int(buffer_view.get("byteOffset", 0)) + int(accessor.get("byteOffset", 0)) - element_format = "<" + component_format * component_count - for index in range(int(accessor["count"])): - values = struct.unpack_from( - element_format, - binary_chunk, - offset + index * stride, - ) - yield (float(values[0]), float(values[1]), float(values[2])) - - -def _table_mesh_world_matrix(table_config: Mapping[str, Any]) -> list[list[float]]: - scale = _vector3(table_config.get("body_scale", [1.0, 1.0, 1.0])) - init_local_pose = table_config.get("init_local_pose") - if init_local_pose is not None: - root_matrix = _matrix4(init_local_pose) - else: - root_matrix = _euler_xyz_degrees_matrix( - _vector3(table_config.get("init_rot", [0.0, 0.0, 0.0])), - _vector3(table_config.get("init_pos", [0.0, 0.0, 0.0])), - ) - return _matrix_multiply(root_matrix, _scale_matrix4(scale)) - - -def _gltf_node_matrix(node: Mapping[str, Any]) -> list[list[float]]: - if "matrix" in node: - values = [float(value) for value in node["matrix"]] - if len(values) != 16: - raise ValueError("GLB node matrix must contain 
16 values.") - return [[values[column * 4 + row] for column in range(4)] for row in range(4)] - - translation = [float(value) for value in node.get("translation", [0.0, 0.0, 0.0])] - scale = [float(value) for value in node.get("scale", [1.0, 1.0, 1.0])] - rotation = [float(value) for value in node.get("rotation", [0.0, 0.0, 0.0, 1.0])] - if len(translation) != 3 or len(scale) != 3 or len(rotation) != 4: - raise ValueError("Invalid GLB node TRS transform.") - - x, y, z, w = rotation - xx, yy, zz = x * x, y * y, z * z - xy, xz, yz = x * y, x * z, y * z - wx, wy, wz = w * x, w * y, w * z - matrix = [ - [ - (1.0 - 2.0 * (yy + zz)) * scale[0], - (2.0 * (xy - wz)) * scale[1], - (2.0 * (xz + wy)) * scale[2], - translation[0], - ], - [ - (2.0 * (xy + wz)) * scale[0], - (1.0 - 2.0 * (xx + zz)) * scale[1], - (2.0 * (yz - wx)) * scale[2], - translation[1], - ], - [ - (2.0 * (xz - wy)) * scale[0], - (2.0 * (yz + wx)) * scale[1], - (1.0 - 2.0 * (xx + yy)) * scale[2], - translation[2], - ], - [0.0, 0.0, 0.0, 1.0], - ] - return matrix - - -def _euler_xyz_degrees_matrix( - rotation_deg: Sequence[float], - translation: Sequence[float], -) -> list[list[float]]: - rx, ry, rz = (math.radians(float(value)) for value in rotation_deg) - cx, sx = math.cos(rx), math.sin(rx) - cy, sy = math.cos(ry), math.sin(ry) - cz, sz = math.cos(rz), math.sin(rz) - rot_x = [ - [1.0, 0.0, 0.0, 0.0], - [0.0, cx, -sx, 0.0], - [0.0, sx, cx, 0.0], - [0.0, 0.0, 0.0, 1.0], - ] - rot_y = [ - [cy, 0.0, sy, 0.0], - [0.0, 1.0, 0.0, 0.0], - [-sy, 0.0, cy, 0.0], - [0.0, 0.0, 0.0, 1.0], - ] - rot_z = [ - [cz, -sz, 0.0, 0.0], - [sz, cz, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0], - [0.0, 0.0, 0.0, 1.0], - ] - matrix = _matrix_multiply(_matrix_multiply(rot_z, rot_y), rot_x) - matrix[0][3] = float(translation[0]) - matrix[1][3] = float(translation[1]) - matrix[2][3] = float(translation[2]) - return matrix - - -def _identity_matrix4() -> list[list[float]]: - return [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 
1.0, 0.0], - [0.0, 0.0, 0.0, 1.0], - ] - - -def _scale_matrix4(scale: Sequence[float]) -> list[list[float]]: - return [ - [float(scale[0]), 0.0, 0.0, 0.0], - [0.0, float(scale[1]), 0.0, 0.0], - [0.0, 0.0, float(scale[2]), 0.0], - [0.0, 0.0, 0.0, 1.0], - ] - - -def _matrix4(value: Any) -> list[list[float]]: - if not isinstance(value, (list, tuple)) or len(value) != 4: - raise ValueError(f"Expected a 4x4 matrix, got {value!r}.") - matrix = [] - for row in value: - if not isinstance(row, (list, tuple)) or len(row) != 4: - raise ValueError(f"Expected a 4x4 matrix, got {value!r}.") - matrix.append([float(item) for item in row]) - return matrix - - -def _matrix_multiply( - left: Sequence[Sequence[float]], - right: Sequence[Sequence[float]], -) -> list[list[float]]: - return [ - [ - sum( - float(left[row][inner]) * float(right[inner][column]) - for inner in range(4) - ) - for column in range(4) - ] - for row in range(4) - ] - - -def _transform_point( - matrix: Sequence[Sequence[float]], - point: Sequence[float], -) -> tuple[float, float, float]: - x, y, z = (float(point[0]), float(point[1]), float(point[2])) - return ( - float(matrix[0][0]) * x - + float(matrix[0][1]) * y - + float(matrix[0][2]) * z - + float(matrix[0][3]), - float(matrix[1][0]) * x - + float(matrix[1][1]) * y - + float(matrix[1][2]) * z - + float(matrix[1][3]), - float(matrix[2][0]) * x - + float(matrix[2][1]) * y - + float(matrix[2][2]) * z - + float(matrix[2][3]), - ) - - -def _make_extensions_config(roles: _BasketTaskRoles) -> dict[str, Any]: - return { - "agent_arm_slots": { - "left": { - "arm": "right_arm", - "eef": "right_eef", - }, - "right": { - "arm": "left_arm", - "eef": "left_eef", - }, - }, - "arm_aim_yaw_offset": { - "left": 3.141592653589793, - "right": 0.0, - }, - "gripper_open_state": [0.0], - "gripper_close_state": [0.04], - "ignore_terminations_during_agent": True, - "viewer_camera_uid": "cam_high", - "agent_success": { - "op": "all", - "terms": [ - _object_in_container_success( - 
roles.left_target_runtime_uid, - roles.container_runtime_uid, - ), - _object_in_container_success( - roles.right_target_runtime_uid, - roles.container_runtime_uid, - ), - ], - }, - } - - -def _object_in_container_success(object_uid: str, container_uid: str) -> dict[str, Any]: - return { - "type": "object_in_container", - "object": object_uid, - "container": container_uid, - "radius": 0.2, - "min_z_offset": -0.05, - "max_z_offset": 0.35, - } - - -def _make_relative_extensions_config(spec: _RelativePlacementSpec) -> dict[str, Any]: - return { - "agent_arm_slots": { - "left": { - "arm": "right_arm", - "eef": "right_eef", - }, - "right": { - "arm": "left_arm", - "eef": "left_eef", - }, - }, - "arm_aim_yaw_offset": { - "left": 3.141592653589793, - "right": 0.0, - }, - "gripper_open_state": [0.0], - "gripper_close_state": [0.04], - "ignore_terminations_during_agent": True, - "viewer_camera_uid": "cam_high", - "agent_success": _make_relative_success_spec(spec), - } - - -def _make_relative_success_spec(spec: _RelativePlacementSpec) -> dict[str, Any]: - if len(spec.placements) == 1: - return _make_relative_placement_success_spec(spec.placements[0]) - return { - "op": "all", - "terms": [ - _make_relative_placement_success_spec(placement) - for placement in spec.placements - ], - } - - -def _make_relative_placement_success_spec( - placement: _RelativePlacementStepSpec, -) -> dict[str, Any]: - if placement.relation == "inside": - return _object_in_container_success( - placement.moved_runtime_uid, - placement.reference_runtime_uid, - ) - if placement.relation == "on": - return { - "type": "object_on_object", - "object": placement.moved_runtime_uid, - "support": placement.reference_runtime_uid, - "xy_radius": 0.08, - "min_z_offset": 0.02, - "max_z_offset": 0.35, - } - - if placement.reference_is_initial_pose: - if placement.release_position is None: - raise ValueError( - "Self-relative success requires an absolute release position." 
- ) - return { - "op": "all", - "terms": [ - *_absolute_xy_success_terms( - placement.moved_runtime_uid, - placement.release_position, - ), - { - "type": "object_not_fallen", - "object": placement.moved_runtime_uid, - "max_tilt": 0.9, - }, - ], - } - - return { - "op": "all", - "terms": [ - *_relative_xy_success_terms(placement), - { - "type": "object_not_fallen", - "object": placement.moved_runtime_uid, - "max_tilt": 0.9, - }, - ], - } - - -def _absolute_xy_success_terms( - object_uid: str, - position: Sequence[float], -) -> list[dict[str, Any]]: - return [ - { - "type": "object_axis_near", - "object": object_uid, - "axis": axis, - "target": float(position[index]), - "tolerance": 0.05, - } - for index, axis in enumerate(("x", "y")) - ] - - -def _relative_xy_success_terms( - placement: _RelativePlacementStepSpec, -) -> list[dict[str, Any]]: - x_offset, y_offset = _side_relation_xy_offsets(placement.relation) - return [ - { - "type": "object_axis_offset_near", - "object": placement.moved_runtime_uid, - "reference": placement.reference_runtime_uid, - "axis": axis, - "offset": offset, - "tolerance": 0.05 if offset else 0.06, - } - for axis, offset in (("x", x_offset), ("y", y_offset)) - ] - - -def _make_relative_events_config( - spec: _RelativePlacementSpec, - registered_runtime_uids: list[str], -) -> dict[str, Any]: - return { - "record_camera": _record_camera_event_config(), - "validation_cameras": _validation_cameras_event_config(), - "prepare_extra_attr": { - "func": "prepare_extra_attr", - "mode": "reset", - "params": { - "attrs": [ - { - "name": "object_lengths", - "mode": "callable", - "entity_uids": "all_objects", - "func_name": "compute_object_length", - "func_kwargs": { - "is_svd_frame": True, - "sample_points": 5000, - }, - }, - ] - }, - }, - "register_info_to_env": { - "func": "register_info_to_env", - "mode": "reset", - "params": { - "registry": [ - _object_registry_entry(uid) - for uid in sorted(registered_runtime_uids) - ], - "registration": 
"affordance_datas", - "sim_update": True, - }, - }, - } - - -def _make_events_config(roles: _BasketTaskRoles) -> dict[str, Any]: - return { - "record_camera": _record_camera_event_config(), - "validation_cameras": _validation_cameras_event_config(), - "prepare_extra_attr": { - "func": "prepare_extra_attr", - "mode": "reset", - "params": { - "attrs": [ - { - "name": "object_lengths", - "mode": "callable", - "entity_uids": "all_objects", - "func_name": "compute_object_length", - "func_kwargs": { - "is_svd_frame": True, - "sample_points": 5000, - }, - }, - ] - }, - }, - "register_info_to_env": { - "func": "register_info_to_env", - "mode": "reset", - "params": { - "registry": [ - _object_registry_entry(roles.left_target_runtime_uid), - _object_registry_entry(roles.right_target_runtime_uid), - _object_registry_entry(roles.container_runtime_uid), - ], - "registration": "affordance_datas", - "sim_update": True, - }, - }, - } - - -def _record_camera_event_config() -> dict[str, Any]: - camera = _make_sensor_config()[0] - extrinsics = camera["extrinsics"] - return { - "func": "record_camera_data", - "mode": "interval", - "interval_step": 1, - "params": { - "name": "record_cam_high", - "resolution": [camera["width"], camera["height"]], - "intrinsics": camera["intrinsics"], - "eye": extrinsics["eye"], - "target": extrinsics["target"], - "up": extrinsics["up"], - }, - } - - -def _validation_cameras_event_config() -> dict[str, Any]: - return { - "func": "validation_cameras", - "mode": "trigger", - "params": {}, - } - - -def _object_registry_entry(uid: str) -> dict[str, Any]: - return { - "entity_cfg": { - "uid": uid, - }, - "pose_register_params": { - "compute_relative": False, - "compute_pose_object_to_arena": True, - "to_matrix": True, - }, - } - - -def _make_observations_config() -> dict[str, Any]: - return { - "norm_robot_eef_joint": { - "func": "normalize_robot_joint_data", - "mode": "modify", - "name": "robot/qpos", - "params": { - "joint_ids": [12, 13, 14, 15], - }, - } - 
} - - -def _make_dataset_config( - project_name: str, - roles: _BasketTaskRoles, -) -> dict[str, Any]: - left_target_text = _left_target_text(roles) - right_target_text = _right_target_text(roles) - target_description = _target_task_description_text(roles) - return { - "lerobot": { - "func": "LeRobotRecorder", - "mode": "save", - "params": { - "robot_meta": { - "robot_type": "DualUR5", - "control_freq": 25, - }, - "instruction": { - "lang": ( - f"Use the left UR5 to place the left {left_target_text} into " - f"the {roles.container_runtime_uid}, then use the right " - f"UR5 to place the right {right_target_text} into the " - f"{roles.container_runtime_uid}." - ), - }, - "extra": { - "scene_type": project_name, - "task_description": ( - f"Dual UR5 {target_description}-to-container placement" - ), - "data_type": "sim", - }, - "use_videos": True, - }, - } - } - - -def _make_relative_dataset_config( - project_name: str, - spec: _RelativePlacementSpec, -) -> dict[str, Any]: - return { - "lerobot": { - "func": "LeRobotRecorder", - "mode": "save", - "params": { - "robot_meta": { - "robot_type": "DualUR5", - "control_freq": 25, - }, - "instruction": { - "lang": _relative_dataset_instruction(spec), - }, - "extra": { - "scene_type": project_name, - "task_description": spec.task_description, - "data_type": "sim", - }, - "use_videos": True, - }, - } - } - - -def _relative_dataset_instruction(spec: _RelativePlacementSpec) -> str: - if len(spec.placements) == 1: - placement = spec.placements[0] - return ( - f"Use the {placement.active_side} UR5 to move " - f"{placement.moved_runtime_uid} " - f"{_relative_relation_phrase(placement.relation)} " - f"{placement.reference_runtime_uid}." - ) - return " ".join( - f"Use the {placement.active_side} UR5 to move " - f"{placement.moved_runtime_uid} " - f"{_relative_relation_phrase(placement.relation)} " - f"{placement.reference_runtime_uid}." 
- for placement in spec.placements - ) - - -def _make_dual_ur5_robot_config(*, robot_init_z: float) -> dict[str, Any]: - return { - "uid": "DualUR5", - "urdf_cfg": { - "fname": "dual_ur5_dh_pgi_basket", - "components": [ - { - "component_type": "left_arm", - "urdf_path": "UniversalRobots/UR5/UR5.urdf", - "transform": [ - [0.0, -1.0, 0.0, -0.3], - [1.0, 0.0, 0.0, -1.45], - [0.0, 0.0, 1.0, 0.4], - [0.0, 0.0, 0.0, 1.0], - ], - }, - { - "component_type": "left_hand", - "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf", - }, - { - "component_type": "right_arm", - "urdf_path": "UniversalRobots/UR5/UR5.urdf", - "transform": [ - [0.0, -1.0, 0.0, 0.3], - [1.0, 0.0, 0.0, -1.45], - [0.0, 0.0, 1.0, 0.4], - [0.0, 0.0, 0.0, 1.0], - ], - }, - { - "component_type": "right_hand", - "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf", - }, - ], - }, - "init_pos": [_DUAL_UR5_ROTATED_INIT_X, 0.0, float(robot_init_z)], - "init_rot": [0.0, 0.0, _DUAL_UR5_ROTATED_INIT_YAW_DEGREES], - "init_qpos": [ - 0, - 0, - -1.57, - -1.57, - 1.57, - 1.57, - -1.57, - -1.57, - -1.57, - -1.57, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - ], - "drive_pros": { - "stiffness": { - "LEFT_JOINT[1-6]": 10000.0, - "RIGHT_JOINT[1-6]": 10000.0, - "LEFT_GRIPPER_FINGER[1-2]_JOINT_1": 100.0, - "RIGHT_GRIPPER_FINGER[1-2]_JOINT_1": 100.0, - }, - "damping": { - "LEFT_JOINT[1-6]": 1000.0, - "RIGHT_JOINT[1-6]": 1000.0, - "LEFT_GRIPPER_FINGER[1-2]_JOINT_1": 10.0, - "RIGHT_GRIPPER_FINGER[1-2]_JOINT_1": 10.0, - }, - "max_effort": { - "LEFT_JOINT[1-6]": 100000.0, - "RIGHT_JOINT[1-6]": 100000.0, - "LEFT_GRIPPER_FINGER[1-2]_JOINT_1": 1000.0, - "RIGHT_GRIPPER_FINGER[1-2]_JOINT_1": 1000.0, - }, - }, - "control_parts": { - "left_arm": ["LEFT_JOINT[1-6]"], - "left_eef": ["LEFT_GRIPPER_FINGER[1-2]_JOINT_1"], - "right_arm": ["RIGHT_JOINT[1-6]"], - "right_eef": ["RIGHT_GRIPPER_FINGER[1-2]_JOINT_1"], - }, - "solver_cfg": { - "left_arm": _ur5_solver_config("left"), - "right_arm": _ur5_solver_config("right"), - }, - } - - -def 
_ur5_solver_config(side: str) -> dict[str, Any]: - return { - "class_type": "PytorchSolver", - "end_link_name": f"{side}_ee_link", - "root_link_name": f"{side}_base_link", - "tcp": [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.16], - [0.0, 0.0, 0.0, 1.0], - ], - } - - -def _make_sensor_config() -> list[dict[str, Any]]: - return [ - { - "sensor_type": "Camera", - "uid": "cam_high", - "width": 960, - "height": 540, - "intrinsics": [420, 420, 480, 270], - "extrinsics": { - "pos": [0.4, 0.0, 2.2], - "eye": [0.6, 0.0, 3.3], - "target": [0.0, 0.0, 0.75], - "up": [1.0, 0.0, 0.0], - }, - }, - { - "sensor_type": "Camera", - "uid": "cam_wrist_left", - "width": 640, - "height": 480, - "intrinsics": [600, 600, 320, 240], - "extrinsics": { - "parent": "left_ee_link", - "pos": [0.0, 0.12, 0.08], - "quat": [ - -0.0012598701, - -0.029051816664441618998, - 0.9094039177564813, - 0.41489627504330695, - ], - }, - }, - { - "sensor_type": "Camera", - "uid": "cam_wrist_right", - "width": 640, - "height": 480, - "intrinsics": [600, 600, 320, 240], - "extrinsics": { - "parent": "right_ee_link", - "pos": [0.0, 0.12, 0.08], - "quat": [ - -0.0012598701, - -0.029051816664441618998, - 0.9094039177564813, - 0.41489627504330695, - ], - }, - }, - ] - - -def _make_light_config() -> dict[str, Any]: - return { - "direct": [ - { - "uid": "main_light", - "light_type": "point", - "color": [1.0, 1.0, 1.0], - "intensity": 40.0, - "init_pos": [0.0, -0.4, 2.2], - "radius": 10.0, - } - ] - } - - -def _make_background_config( - scene_dir: Path, - obj: _SceneObject, - mesh_normalizer: MeshFrameNormalizer, -) -> dict[str, Any]: - shape = _make_shape_config(scene_dir, obj.config, mesh_normalizer=mesh_normalizer) - return { - "uid": "table", - "shape": shape, - "attrs": dict(_BACKGROUND_ATTRS), - "body_scale": _clean_vector3(obj.config.get("body_scale", [1.0, 1.0, 1.0])), - "body_type": "kinematic", - "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), - "init_rot": 
_clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), - "max_convex_hull_num": _role_limited_max_convex_hull_num( - obj, - _BACKGROUND_MAX_CONVEX_HULL_NUM, - ), - } - - -def _make_extra_background_config( - scene_dir: Path, - obj: _SceneObject, - mesh_normalizer: MeshFrameNormalizer, - body_scale: Any | None = None, - runtime_uid: str | None = None, -) -> dict[str, Any]: - shape = _make_shape_config(scene_dir, obj.config, mesh_normalizer=mesh_normalizer) - config = { - "uid": runtime_uid or _normalize_runtime_uid(obj.source_uid), - "shape": shape, - "attrs": copy.deepcopy(dict(obj.config.get("attrs", _BACKGROUND_ATTRS))), - "body_scale": _clean_vector3( - obj.config.get("body_scale", [1.0, 1.0, 1.0]) - if body_scale is None - else body_scale - ), - "body_type": str(obj.config.get("body_type", "static")), - "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), - "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), - "max_convex_hull_num": _role_limited_max_convex_hull_num( - obj, - _BACKGROUND_MAX_CONVEX_HULL_NUM, - ), - } - return config - - -def _make_target_object_config( - scene_dir: Path, - obj: _SceneObject, - runtime_uid: str, - target_scale: list[float], - mesh_normalizer: MeshFrameNormalizer, - replacement: _ResolvedTargetReplacement | None = None, -) -> dict[str, Any]: - config = _make_rigid_object_config( - scene_dir, - obj, - runtime_uid, - target_scale, - max_convex_hull_num=_TARGET_MAX_CONVEX_HULL_NUM, - mesh_fpath=replacement.mesh_path if replacement else None, - mesh_normalizer=mesh_normalizer, - ) - config["body_type"] = "dynamic" - return config - - -def _make_container_object_config( - scene_dir: Path, - obj: _SceneObject, - runtime_uid: str, - body_scale: Any, - mesh_normalizer: MeshFrameNormalizer, -) -> dict[str, Any]: - return _make_rigid_object_config( - scene_dir, - obj, - runtime_uid, - body_scale, - max_convex_hull_num=_role_limited_max_convex_hull_num( - obj, - _CONTAINER_MAX_CONVEX_HULL_NUM, 
- ), - mesh_normalizer=mesh_normalizer, - ) - - -def _make_container_background_config( - scene_dir: Path, - obj: _SceneObject, - runtime_uid: str, - body_scale: Any, - mesh_normalizer: MeshFrameNormalizer, -) -> dict[str, Any]: - config = _make_container_object_config( - scene_dir, - obj, - runtime_uid, - body_scale, - mesh_normalizer, - ) - config["body_type"] = "kinematic" - return config - - -def _make_relative_background_object_config( - scene_dir: Path, - obj: _SceneObject, - runtime_uid: str, - *, - max_convex_hull_num: int, - mesh_normalizer: MeshFrameNormalizer, -) -> dict[str, Any]: - config = _make_rigid_object_config( - scene_dir, - obj, - runtime_uid, - _source_body_scale(obj), - max_convex_hull_num=max_convex_hull_num, - mesh_normalizer=mesh_normalizer, - ) - config["body_type"] = "kinematic" - return config - - -def _make_extra_rigid_object_config( - scene_dir: Path, - obj: _SceneObject, - body_scale: Any, - mesh_normalizer: MeshFrameNormalizer, -) -> dict[str, Any]: - return _make_rigid_object_config( - scene_dir, - obj, - _normalize_runtime_uid(obj.source_uid), - body_scale, - max_convex_hull_num=_role_limited_max_convex_hull_num( - obj, - _EXTRA_RIGID_MAX_CONVEX_HULL_NUM, - ), - mesh_normalizer=mesh_normalizer, - ) - - -def _make_relative_rigid_object_config( - *, - scene_dir: Path, - obj: _SceneObject, - runtime_uid: str, - body_scale: Any, - max_convex_hull_num: int, - mesh_normalizer: MeshFrameNormalizer, -) -> dict[str, Any]: - if max_convex_hull_num == _TARGET_MAX_CONVEX_HULL_NUM: - resolved_max_convex_hull_num = max_convex_hull_num - else: - resolved_max_convex_hull_num = _role_limited_max_convex_hull_num( - obj, - max_convex_hull_num, - ) - config = _make_rigid_object_config( - scene_dir, - obj, - runtime_uid, - body_scale, - max_convex_hull_num=resolved_max_convex_hull_num, - mesh_normalizer=mesh_normalizer, - ) - config["body_type"] = "dynamic" - return config - - -def _make_rigid_object_config( - scene_dir: Path, - obj: _SceneObject, - 
runtime_uid: str, - body_scale: Any, - max_convex_hull_num: int, - mesh_fpath: str | Path | None = None, - mesh_normalizer: MeshFrameNormalizer | None = None, -) -> dict[str, Any]: - shape = _make_shape_config( - scene_dir, - obj.config, - mesh_fpath=mesh_fpath, - mesh_normalizer=mesh_normalizer, - ) - config = { - "uid": runtime_uid, - "shape": shape, - "attrs": dict(_RIGID_OBJECT_ATTRS), - "init_pos": _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])), - "init_rot": _clean_vector3(obj.config.get("init_rot", [0.0, 0.0, 0.0])), - "body_scale": _clean_vector3(body_scale), - "max_convex_hull_num": int(max_convex_hull_num), - } - if "body_type" in obj.config: - config["body_type"] = str(obj.config["body_type"]) - return config - - -def _role_limited_max_convex_hull_num( - obj: _SceneObject, - role_max_convex_hull_num: int, -) -> int: - source_max_convex_hull_num = obj.config.get("max_convex_hull_num") - if source_max_convex_hull_num is None: - return role_max_convex_hull_num - return max(1, min(int(source_max_convex_hull_num), role_max_convex_hull_num)) - - -def _relative_rigid_object_max_convex_hull_num( - runtime_uid: str, - spec: _RelativePlacementSpec, -) -> int: - for placement in spec.placements: - if ( - placement.relation == "inside" - and runtime_uid == placement.reference_runtime_uid - ): - return _CONTAINER_MAX_CONVEX_HULL_NUM - task_uids = { - uid - for placement in spec.placements - for uid in (placement.moved_runtime_uid, placement.reference_runtime_uid) - } - if runtime_uid in task_uids: - return _TARGET_MAX_CONVEX_HULL_NUM - return _EXTRA_RIGID_MAX_CONVEX_HULL_NUM - - -def _relative_static_background_max_convex_hull_num( - runtime_uid: str, - spec: _RelativePlacementSpec, -) -> int: - for placement in spec.placements: - if ( - placement.relation == "inside" - and runtime_uid == placement.reference_runtime_uid - ): - return _CONTAINER_MAX_CONVEX_HULL_NUM - return _BACKGROUND_MAX_CONVEX_HULL_NUM - - -def _make_shape_config( - scene_dir: Path, - 
source_config: Mapping[str, Any], - *, - mesh_fpath: str | Path | None = None, - mesh_normalizer: MeshFrameNormalizer | None = None, -) -> dict[str, Any]: - shape = copy.deepcopy(dict(source_config.get("shape", {}))) - if mesh_fpath is not None: - shape["shape_type"] = "Mesh" - shape["fpath"] = str(mesh_fpath) - if shape.get("shape_type") == "Mesh" and "fpath" in shape: - mesh_path = Path(_asset_path_for_config(scene_dir, str(shape["fpath"]))) - if mesh_normalizer is not None: - mesh_path = mesh_normalizer.normalize_path(mesh_path) - shape["fpath"] = mesh_path.as_posix() - shape.setdefault("compute_uv", False) - return shape - - -def _asset_path_for_config(scene_dir: Path, fpath: str) -> str: - raw_path = Path(fpath) - if raw_path.is_absolute(): - return raw_path.resolve().as_posix() - return (scene_dir / raw_path).resolve().as_posix() - - -def _repo_root() -> Path: - current = Path(__file__).resolve() - for parent in current.parents: - if (parent / "setup.py").exists() and (parent / "embodichain").exists(): - return parent - return Path.cwd().resolve() - - -def _validate_bundle(bundle: Mapping[str, Any], roles: _BasketTaskRoles) -> None: - gym_config = bundle["gym_config"] - if gym_config.get("id") != "AtomicActionsAgent-v3": - raise ValueError("Generated gym config must use AtomicActionsAgent-v3.") - if gym_config.get("robot", {}).get("uid") != "DualUR5": - raise ValueError("Generated UR5 basket config must use DualUR5.") - - rigid_uids = {obj["uid"] for obj in gym_config.get("rigid_object", [])} - background_uids = {obj["uid"] for obj in gym_config.get("background", [])} - scene_uids = rigid_uids | background_uids - required_rigid = { - roles.left_target_runtime_uid, - roles.right_target_runtime_uid, - } - if not required_rigid.issubset(rigid_uids): - raise ValueError( - f"Generated rigid objects missing: {sorted(required_rigid - rigid_uids)}" - ) - if roles.container_runtime_uid not in scene_uids: - raise ValueError( - f"Generated scene objects missing 
container: {roles.container_runtime_uid}" - ) - - success = gym_config["env"]["extensions"]["agent_success"] - for term in success.get("terms", []): - if ( - term.get("object") not in rigid_uids - or term.get("container") not in scene_uids - ): - raise ValueError(f"Invalid success term uid reference: {term}") - - -def _validate_relative_bundle( - bundle: Mapping[str, Any], - spec: _RelativePlacementSpec, -) -> None: - gym_config = bundle["gym_config"] - if gym_config.get("id") != "AtomicActionsAgent-v3": - raise ValueError("Generated gym config must use AtomicActionsAgent-v3.") - if gym_config.get("robot", {}).get("uid") != "DualUR5": - raise ValueError("Generated relative placement config must use DualUR5.") - - rigid_uid_list = [obj["uid"] for obj in gym_config.get("rigid_object", [])] - if len(rigid_uid_list) != len(set(rigid_uid_list)): - raise ValueError(f"Duplicate rigid object runtime uid(s): {rigid_uid_list}") - rigid_uids = set(rigid_uid_list) - background_uids = {obj["uid"] for obj in gym_config.get("background", [])} - scene_uids = rigid_uids | background_uids - moved_required = {placement.moved_runtime_uid for placement in spec.placements} - missing_moved = moved_required - rigid_uids - if missing_moved: - raise ValueError( - f"Generated relative config missing moved rigid object(s): {missing_moved}" - ) - reference_required = { - placement.reference_runtime_uid for placement in spec.placements - } - missing_reference = reference_required - scene_uids - if missing_reference: - raise ValueError( - f"Generated relative config missing reference object(s): {missing_reference}" - ) - - _validate_success_uids( - gym_config["env"]["extensions"]["agent_success"], - rigid_uids=rigid_uids, - scene_uids=scene_uids, - ) - registry = gym_config["env"]["events"]["register_info_to_env"]["params"]["registry"] - registered = {entry["entity_cfg"]["uid"] for entry in registry} - required = moved_required | reference_required - if not required.issubset(registered): - raise 
ValueError( - f"Relative config registry missing: {sorted(required - registered)}" - ) - - -def _validate_success_uids( - success: Mapping[str, Any], - *, - rigid_uids: set[str], - scene_uids: set[str], -) -> None: - if success.get("op") in {"all", "and", "any", "or"}: - for term in success.get("terms", []): - _validate_success_uids(term, rigid_uids=rigid_uids, scene_uids=scene_uids) - return - - success_type = str(success.get("type", success.get("func", ""))).lower() - if success_type == "object_in_container": - required_keys = ("object", "container") - elif success_type in {"object_on_object", "object_on", "on_object"}: - required_keys = ("object", "support") - elif success_type in { - "object_axis_offset_near", - "object_relative_axis_near", - }: - required_keys = ("object", "reference") - elif success_type in {"object_axis_near", "object_coordinate_near"}: - required_keys = ("object",) - elif success_type in {"object_not_fallen", "not_fallen"}: - required_keys = ("object",) - else: - raise ValueError(f"Unsupported generated success term: {success_type!r}.") - - for key in required_keys: - uid = success.get(key) - valid_uids = rigid_uids if key == "object" else scene_uids - if uid not in valid_uids: - raise ValueError(f"Invalid success uid reference {key}={uid!r}.") - - -def _write_config_bundle( - *, - output_dir: Path, - bundle: Mapping[str, Any], - overwrite: bool, -) -> GeneratedUR5BasketConfigPaths: - paths = GeneratedUR5BasketConfigPaths( - output_dir=output_dir, - gym_config=output_dir / "fast_gym_config.json", - agent_config=output_dir / "agent_config.json", - task_prompt=output_dir / "task_prompt.txt", - basic_background=output_dir / "basic_background.txt", - atom_actions=output_dir / "atom_actions.txt", - summary=dict(bundle.get("summary", {})), - ) - _raise_if_generated_files_exist(output_dir, overwrite) - - output_dir.mkdir(parents=True, exist_ok=True) - _write_json(paths.gym_config, bundle["gym_config"]) - _write_json(paths.agent_config, 
bundle["agent_config"]) - _write_text(paths.task_prompt, bundle["task_prompt"]) - _write_text(paths.basic_background, bundle["basic_background"]) - _write_text(paths.atom_actions, bundle["atom_actions"]) - return paths - - -def _raise_if_generated_files_exist(output_dir: Path, overwrite: bool) -> None: - if overwrite: - return - output_files = [ - output_dir / "fast_gym_config.json", - output_dir / "agent_config.json", - output_dir / "task_prompt.txt", - output_dir / "basic_background.txt", - output_dir / "atom_actions.txt", - ] - existing = [path for path in output_files if path.exists()] - if existing: - existing_text = ", ".join(path.as_posix() for path in existing) - raise FileExistsError( - f"Generated file(s) already exist: {existing_text}. " - "Pass overwrite=True or --overwrite to replace them." - ) - - -def _write_json(path: Path, data: Mapping[str, Any]) -> None: - path.write_text( - json.dumps(data, ensure_ascii=False, indent=4) + "\n", - encoding="utf-8", - ) - - -def _write_text(path: Path, content: str) -> None: - path.write_text(content.rstrip() + "\n", encoding="utf-8") - - -def _read_json(path: Path) -> dict[str, Any]: - with path.open("r", encoding="utf-8") as file: - return json.load(file) - - -def _vector3(value: Any) -> list[float]: - if not isinstance(value, (list, tuple)) or len(value) != 3: - raise ValueError(f"Expected a 3-vector, got {value!r}.") - return [float(item) for item in value] - - -def _clean_vector3(value: Any) -> list[float]: - cleaned = [] - for item in _vector3(value): - if abs(item - 1.0) < 1e-9: - cleaned.append(1.0) - elif abs(item) < 1e-12: - cleaned.append(0.0) - else: - cleaned.append(item) - return cleaned diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/image2tabletop_client.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/image2tabletop_client.py index 0da04292..cccbe35a 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/image2tabletop_client.py +++ 
b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/image2tabletop_client.py @@ -21,6 +21,7 @@ import argparse import json +import os import re import shutil import sys @@ -32,10 +33,21 @@ import requests from requests import exceptions as request_exceptions +__all__ = [ + "check_health", + "collect_image_paths", + "download_zip", + "extract_gym_project", + "main", + "process_image", + "submit_job", + "wait_for_job", +] + _IMAGE_SUFFIXES = frozenset({".bmp", ".jpeg", ".jpg", ".png", ".webp"}) _PROJECT_NAME_RE = re.compile(r"^[0-9]+_gym_project$") _PROJECT_ID_RE = re.compile(r"Image2Tabletop-([0-9]+)-v[0-9]+") -_DEFAULT_SERVER = "http://192.168.3.23:4523" +_DEFAULT_JOB_TIMEOUT_S = 1800.0 def _repo_root() -> Path: @@ -51,8 +63,20 @@ def _repo_root() -> Path: _DEFAULT_IMAGE_INPUT = _DEFAULT_OUTPUT_ROOT / "action_agent_pipeline/images" +def _require_server(server: str | None) -> str: + resolved = ( + str(server or os.getenv("IMAGE2TABLETOP_SERVER") or "").strip().rstrip("/") + ) + if not resolved: + raise ValueError( + "Image2Tabletop API server is required. Pass --server or set " + "IMAGE2TABLETOP_SERVER." 
+ ) + return resolved + + def _server_url(base_url: str, path: str) -> str: - return f"{base_url.rstrip('/')}{path}" + return f"{_require_server(base_url)}{path}" def check_health(server: str) -> None: @@ -88,10 +112,21 @@ def submit_job(server: str, image_path: Path) -> str: return str(job_id) -def wait_for_job(server: str, job_id: str, poll_interval: float) -> dict: +def wait_for_job( + server: str, + job_id: str, + poll_interval: float, + timeout_s: float = _DEFAULT_JOB_TIMEOUT_S, +) -> dict: status_url = _server_url(server, f"/api/image2tabletop/status/{job_id}") + deadline = time.monotonic() + timeout_s while True: - response = requests.get(status_url, timeout=30) + remaining_s = deadline - time.monotonic() + if remaining_s <= 0: + raise TimeoutError( + f"job {job_id} did not complete within {timeout_s}s: {status_url}" + ) + response = requests.get(status_url, timeout=min(30, max(0.001, remaining_s))) response.raise_for_status() data = response.json() status = data.get("status") @@ -100,7 +135,7 @@ def wait_for_job(server: str, job_id: str, poll_interval: float) -> dict: return data if status == "failed": raise RuntimeError(f"job failed: {data}") - time.sleep(poll_interval) + time.sleep(min(poll_interval, max(0.0, deadline - time.monotonic()))) def download_zip(server: str, job_id: str, output_dir: Path) -> Path: @@ -216,10 +251,11 @@ def process_image( output_root: Path, poll_interval: float, overwrite: bool, + job_timeout_s: float = _DEFAULT_JOB_TIMEOUT_S, ) -> Path: job_id = submit_job(server, image_path) print(f"submitted job: {job_id} image={image_path}", flush=True) - wait_for_job(server, job_id, poll_interval) + wait_for_job(server, job_id, poll_interval, timeout_s=job_timeout_s) with TemporaryDirectory( prefix=f"{job_id}_image2tabletop_download_" ) as temp_dir_name: @@ -235,8 +271,8 @@ def main() -> int: ) parser.add_argument( "--server", - default=_DEFAULT_SERVER, - help=f"Image2Tabletop demo API server. 
Defaults to {_DEFAULT_SERVER}", + default=None, + help="Image2Tabletop demo API server. Defaults to IMAGE2TABLETOP_SERVER.", ) parser.add_argument( "--image", @@ -258,6 +294,7 @@ def main() -> int: help=argparse.SUPPRESS, ) parser.add_argument("--poll-interval", type=float, default=10.0) + parser.add_argument("--job-timeout-s", type=float, default=_DEFAULT_JOB_TIMEOUT_S) parser.add_argument( "--skip-health-check", action="store_true", @@ -273,18 +310,20 @@ def main() -> int: args = parser.parse_args() image_paths = collect_image_paths(Path(args.image)) + server = _require_server(args.server) if not args.skip_health_check: - check_health(args.server) + check_health(server) project_paths = [] for image_path in image_paths: project_paths.append( process_image( - server=args.server, + server=server, image_path=image_path, output_root=Path(args.output_root or _DEFAULT_OUTPUT_ROOT), poll_interval=args.poll_interval, overwrite=args.overwrite, + job_timeout_s=args.job_timeout_s, ) ) diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/.gitignore b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/.gitignore index ede6bbf2..89f30140 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/.gitignore +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/.gitignore @@ -1,4 +1,3 @@ # Python cache __pycache__/ *.py[cod] - diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.json b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.json index 740a5710..9aeb1c93 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.json +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.json @@ -1,20 +1,17 @@ { "services": { "zimage": { - "base_url": "http://192.168.3.23:5013" + "base_url": "" }, "sam3": { - "base_url": 
"http://192.168.3.23:5015" + "base_url": "" }, "sam3d": { - "base_url": "http://192.168.3.23:5016" + "base_url": "" } }, "llm": { "openai_compatible": { - "api_key": "sk-7hjyRgBLrhUYUSCpLgPSARk8sz1Sc2vZ2bnt3fy1bkHsI7ak", - "model": "gpt-5.5", - "base_url": "https://airouter.cloud/v1", "timeout_s": 120 } } diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.py index cf7dda1d..b371bdd9 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.py +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/config.py @@ -53,8 +53,8 @@ def load_prompt2geometry_config( raise FileNotFoundError(f"Prompt2Geometry config not found: {path}") raw = json.loads(path.read_text(encoding="utf-8")) services = _mapping(raw.get("services"), "services") - llm = _mapping( - _mapping(raw.get("llm"), "llm").get("openai_compatible"), + llm = _optional_mapping( + _optional_mapping(raw.get("llm"), "llm").get("openai_compatible"), "llm.openai_compatible", ) shared_llm = get_openai_compatible_llm_config( @@ -88,8 +88,7 @@ def load_prompt2geometry_config( str(shared_llm.get("base_url") or llm.get("base_url", "")), ).rstrip("/"), llm_timeout_s=float( - os.getenv("PROMPT2GEOMETRY_LLM_TIMEOUT_S") - or llm.get("timeout_s", 120.0) + os.getenv("PROMPT2GEOMETRY_LLM_TIMEOUT_S") or llm.get("timeout_s", 120.0) ), ) @@ -107,3 +106,9 @@ def _mapping(value: Any, name: str) -> dict[str, Any]: if not isinstance(value, dict): raise ValueError(f"Prompt2Geometry config key {name} must be an object.") return value + + +def _optional_mapping(value: Any, name: str) -> dict[str, Any]: + if value is None: + return {} + return _mapping(value, name) diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/dimensions.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/dimensions.py 
index 3c0dec17..4ce40f47 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/dimensions.py +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/dimensions.py @@ -95,8 +95,7 @@ def estimate_real_dimensions( time.sleep(1.0) continue raise ValueError( - "Failed to estimate object dimensions after " - f"{max_attempts} attempts." + "Failed to estimate object dimensions after " f"{max_attempts} attempts." ) diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/pipeline.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/pipeline.py index 2154da48..9e066da4 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/pipeline.py +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/pipeline.py @@ -56,14 +56,14 @@ class Prompt2GeometryRequest: target_id: str = "asset_0" request_id: str = "prompt2geometry_asset_0" output_name: str | None = None - zimage_base_url: str = "http://192.168.3.23:5013" + zimage_base_url: str = "" zimage_width: int = 1024 zimage_height: int = 1024 zimage_seed: int = 42 zimage_num_inference_steps: int = 8 zimage_prompt_suffix: str = "a complete single object, with pure-black background" - sam3_base_url: str = "http://192.168.3.23:5015" - sam3d_base_url: str = "http://192.168.3.23:5016" + sam3_base_url: str = "" + sam3d_base_url: str = "" sam3d_seed: int = 42 llm_api_key: str | None = None llm_model: str | None = None @@ -148,7 +148,13 @@ def _generate_image( output_root: Path, ) -> tuple[Path, dict[str, Any]]: image_path = output_root / "zimage" / "zimage.png" - client = ZImageClient(base_url=request.zimage_base_url) + client = ZImageClient( + base_url=_required_service_base_url( + os.getenv("PROMPT2GEOMETRY_ZIMAGE_BASE_URL") or request.zimage_base_url, + "zimage", + "PROMPT2GEOMETRY_ZIMAGE_BASE_URL or --zimage-base-url", + ) + ) manifest = client.generate_png( 
prompt=_zimage_prompt(request), output_path=image_path, @@ -176,7 +182,11 @@ def _segment_image( selection_reason="Use the full generated image as a bbox prompt.", ) sam3_client = SAM3Client( - base_url=os.getenv("PROMPT2GEOMETRY_SAM3_BASE_URL") or request.sam3_base_url, + base_url=_required_service_base_url( + os.getenv("PROMPT2GEOMETRY_SAM3_BASE_URL") or request.sam3_base_url, + "SAM3", + "PROMPT2GEOMETRY_SAM3_BASE_URL or --sam3-base-url", + ), ) health = sam3_client.health() _write_json(output_root / "sam3_health.json", health) @@ -228,7 +238,11 @@ def _generate_geometry( ) client = SAM3DClient( - base_url=os.getenv("PROMPT2GEOMETRY_SAM3D_BASE_URL") or request.sam3d_base_url, + base_url=_required_service_base_url( + os.getenv("PROMPT2GEOMETRY_SAM3D_BASE_URL") or request.sam3d_base_url, + "SAM3D", + "PROMPT2GEOMETRY_SAM3D_BASE_URL or --sam3d-base-url", + ), ) health = client.health() _write_json(output_root / "sam3d_health.json", health) @@ -303,6 +317,9 @@ def _final_scaled_glb_path( def _extract_glb_stem_from_prompt( prompt: str, client: OpenAICompatibleClient, + *, + max_attempts: int = 3, + retry_delay_s: float = 1.0, ) -> str: system_prompt = """ @@ -332,13 +349,30 @@ def _extract_glb_stem_from_prompt( ), }, ] - while True: + last_error: Exception | None = None + for attempt in range(1, max_attempts + 1): try: raw = client.chat_json(messages=messages) return _validate_glb_stem_output(raw) - except Exception: - time.sleep(1.0) - continue + except Exception as exc: + last_error = exc + if attempt >= max_attempts: + break + time.sleep(retry_delay_s) + raise RuntimeError( + f"Failed to extract GLB file name from prompt after {max_attempts} attempts." + ) from last_error + + +def _required_service_base_url( + value: str | None, service_name: str, source: str +) -> str: + base_url = str(value or "").strip().rstrip("/") + if not base_url: + raise ValueError( + f"Missing Prompt2Geometry {service_name} base URL. Set {source}." 
+ ) + return base_url def _validate_glb_stem_output(raw: dict[str, Any]) -> str: diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3_client.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3_client.py index 7bc60abe..03d8f2d4 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3_client.py +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3_client.py @@ -49,7 +49,9 @@ def __init__( timeout_s: float = 120.0, poll_interval_s: float = 2.0, ): - self.base_url = base_url.rstrip("/") + self.base_url = base_url.strip().rstrip("/") + if not self.base_url: + raise ValueError("SAM3 base_url must be non-empty.") self.boxes_path = boxes_path self.health_path = health_path self.timeout_s = timeout_s @@ -165,8 +167,14 @@ def _resolve_async_result( _append_progress(progress_path, result) _print_progress("segmentation", result, verbose=verbose) + deadline = time.monotonic() + self.timeout_s while True: - time.sleep(self.poll_interval_s) + remaining_s = deadline - time.monotonic() + if remaining_s <= 0: + raise SAM3ClientError( + f"SAM3 async job timed out after {self.timeout_s}s: {result}" + ) + time.sleep(min(self.poll_interval_s, remaining_s)) job = self._get_json(status_url) _append_progress(progress_path, job) _print_progress("segmentation", job, verbose=verbose) @@ -261,6 +269,4 @@ def _validate_segmentation_result(result: dict[str, Any]) -> None: raise SAM3ClientError(f"SAM3 segmentation {index} must be an object.") target_id = segmentation.get("target_id") if not isinstance(target_id, str) or not target_id.strip(): - raise SAM3ClientError( - f"SAM3 segmentation {index} must contain target_id." 
- ) + raise SAM3ClientError(f"SAM3 segmentation {index} must contain target_id.") diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3d_client.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3d_client.py index d8e4d8f8..c434d0f6 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3d_client.py +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/sam3d_client.py @@ -44,7 +44,9 @@ def __init__( timeout_s: float = 1800.0, poll_interval_s: float = 5.0, ): - self.base_url = base_url.rstrip("/") + self.base_url = base_url.strip().rstrip("/") + if not self.base_url: + raise ValueError("SAM3D base_url must be non-empty.") self.generation_path = generation_path self.health_path = health_path self.timeout_s = timeout_s @@ -205,8 +207,14 @@ def _resolve_async_result( _append_progress(progress_path, result) _print_progress("3D-generation", result, verbose=verbose) + deadline = time.monotonic() + self.timeout_s while True: - time.sleep(self.poll_interval_s) + remaining_s = deadline - time.monotonic() + if remaining_s <= 0: + raise SAM3DClientError( + f"SAM3D async job timed out after {self.timeout_s}s: {result}" + ) + time.sleep(min(self.poll_interval_s, remaining_s)) job = self._get_json(status_url) _append_progress(progress_path, job) _print_progress("3D-generation", job, verbose=verbose) diff --git a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/zimage_client.py b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/zimage_client.py index e9d7b287..3fee0863 100644 --- a/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/zimage_client.py +++ b/embodichain/gen_sim/action_agent_pipeline/gym_project_api/prompt2geometry/zimage_client.py @@ -35,12 +35,14 @@ class ZImageClient: def __init__( self, *, - base_url: str = "http://192.168.3.23:5013", + base_url: str, 
generation_path: str = "/generate.png", timeout_s: float = 300.0, ): """Initialize the z-image client.""" - self.base_url = base_url.rstrip("/") + self.base_url = base_url.strip().rstrip("/") + if not self.base_url: + raise ValueError("ZImage base_url must be non-empty.") self.generation_path = generation_path self.timeout_s = timeout_s self._opener = build_opener(ProxyHandler({})) diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py index 806ad2f0..71eee571 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_action_utils.py @@ -16,7 +16,7 @@ from __future__ import annotations -from embodichain.utils.logger import log_error +__all__ = ["get_arm_states", "resolve_arm_side"] def _available_arm_sides(env) -> list[str]: @@ -30,19 +30,23 @@ def _available_arm_sides(env) -> list[str]: def resolve_arm_side(env, robot_name: str) -> str: """Resolve robot_name to an available left/right graph slot.""" name = robot_name or "" + available_sides = _available_arm_sides(env) if "right" in name: side = "right" elif "left" in name: side = "left" else: - sides = _available_arm_sides(env) - side = "right" if sides == ["right"] else "left" + if not available_sides: + raise ValueError( + "No available arm control parts were found for action-agent runtime. " + f"Robot control parts are {getattr(env.robot, 'control_parts', None)}." 
+ ) + side = "right" if available_sides == ["right"] else "left" - if side not in _available_arm_sides(env): - log_error( - f"Requested {side}_arm for robot_name='{robot_name}', but available " + if side not in available_sides: + raise ValueError( + f"Requested {side}_arm for robot_name={robot_name!r}, but available " f"control parts are {getattr(env.robot, 'control_parts', None)}.", - error_type=ValueError, ) return side diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py index ec6733a5..2db470dc 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -30,6 +30,10 @@ get_arm_states, resolve_arm_side, ) +from embodichain.gen_sim.action_agent_pipeline.runtime.coacd_cache_bridge import ( + GraspCollisionCachePreparationError, + ensure_grasp_collision_cache_from_env_coacd, +) from embodichain.lab.sim.atomic_actions import ( AntipodalAffordance, MoveAction, @@ -49,14 +53,16 @@ from embodichain.toolkits.graspkit.pg_grasp.antipodal_generator import ( GRASP_ANNOTATOR_CACHE_DIR, ) -from embodichain.utils.logger import log_info +from embodichain.utils.logger import log_info, log_warning from embodichain.utils.math import get_offset_pose __all__ = [ "AtomicActionSpec", + "build_parallel_action_stream", "execute_atomic_action", "execute_parallel_atomic_actions", "normalize_atomic_action_spec", + "step_env_with_actions", ] @@ -103,14 +109,19 @@ class AtomicActionSpec: @classmethod def from_mapping(cls, spec: Mapping[str, Any]) -> "AtomicActionSpec": normalized = normalize_atomic_action_spec(spec) + return cls.from_normalized(normalized) + + @classmethod + def from_normalized(cls, normalized: Mapping[str, Any]) -> "AtomicActionSpec": + """Build an atomic action spec from already-normalized data.""" return cls( atomic_action_class=normalized["atomic_action_class"], 
robot_name=normalized["robot_name"], control=normalized["control"], - target_object=normalized.get("target_object", {}), - target_pose=normalized.get("target_pose", {}), - target_qpos=normalized.get("target_qpos", {}), - cfg=normalized["cfg"], + target_object=dict(normalized.get("target_object", {})), + target_pose=dict(normalized.get("target_pose", {})), + target_qpos=dict(normalized.get("target_qpos", {})), + cfg=dict(normalized["cfg"]), ) def to_dict(self) -> dict[str, Any]: @@ -434,11 +445,36 @@ def execute_atomic_action( def execute_parallel_atomic_actions( left_arm_action=None, right_arm_action=None, - env=None, + *, + env, return_result: bool = False, **runtime_kwargs, ): """Execute left/right atomic action specs as one synchronized stream.""" + actions = build_parallel_action_stream( + left_arm_action=left_arm_action, + right_arm_action=right_arm_action, + env=env, + **runtime_kwargs, + ) + step_env_with_actions(env, actions) + if return_result: + return { + "actions": actions, + } + return actions + + +def build_parallel_action_stream( + left_arm_action=None, + right_arm_action=None, + *, + env, + **runtime_kwargs, +) -> list[torch.Tensor]: + """Build a synchronized left/right atomic action stream without stepping env.""" + if env is None: + raise ValueError("env is required to build parallel atomic actions.") left_arm_action = _resolve_action_spec(left_arm_action, env, runtime_kwargs) right_arm_action = _resolve_action_spec(right_arm_action, env, runtime_kwargs) @@ -483,17 +519,22 @@ def execute_parallel_atomic_actions( actions[:, arm_index] = action actions = torch.from_numpy(actions).to(dtype=torch.float32).unsqueeze(1) - actions = list(actions.unbind(dim=0)) + return list(actions.unbind(dim=0)) + +def step_env_with_actions( + env, + actions: list[torch.Tensor], + *, + update_obj_info: bool = True, +) -> None: + """Step an environment through a prebuilt action stream.""" + if env is None: + raise ValueError("env is required to step action stream.") for 
action in tqdm(actions): env.step(action) - env.update_obj_info() - - if return_result: - return { - "actions": actions, - } - return actions + if update_obj_info: + env.update_obj_info() def _resolve_action_spec(action_spec, env, runtime_kwargs: dict[str, Any]): @@ -943,10 +984,6 @@ def _prepare_grasp_collision_cache_from_env_coacd( return try: - from embodichain.gen_sim.action_agent_pipeline.runtime.coacd_cache_bridge import ( - ensure_grasp_collision_cache_from_env_coacd, - ) - result = ensure_grasp_collision_cache_from_env_coacd( mesh_vertices=mesh_vertices, mesh_triangles=mesh_triangles, @@ -954,7 +991,16 @@ def _prepare_grasp_collision_cache_from_env_coacd( max_decomposition_hulls=max_decomposition_hulls, body_scale=body_scale, ) - except Exception: + except ( + ImportError, + ModuleNotFoundError, + OSError, + GraspCollisionCachePreparationError, + ) as exc: + log_warning( + "Failed to prepare grasp collision cache from environment CoACD cache; " + f"falling back to the default grasp collision path: {exc}" + ) return if result.get("status") == "generated": diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/coacd_cache_bridge.py b/embodichain/gen_sim/action_agent_pipeline/runtime/coacd_cache_bridge.py index b0212fec..2206c0a7 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/coacd_cache_bridge.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/coacd_cache_bridge.py @@ -17,6 +17,7 @@ from __future__ import annotations import hashlib +import os import pickle from pathlib import Path from typing import Any @@ -29,6 +30,7 @@ ) __all__ = [ + "GraspCollisionCachePreparationError", "ensure_grasp_collision_cache_from_env_coacd", "grasp_collision_cache_path", ] @@ -39,6 +41,10 @@ ) +class GraspCollisionCachePreparationError(RuntimeError): + """Raised when env CoACD cache cannot be converted for grasp collision.""" + + def grasp_collision_cache_path( mesh_vertices: torch.Tensor | np.ndarray, mesh_triangles: torch.Tensor | np.ndarray, @@ 
-105,13 +111,10 @@ def ensure_grasp_collision_cache_from_env_coacd( try: plane_equations = _plane_equations_from_env_cache(env_cache_path, body_scale) _write_grasp_collision_cache(grasp_cache_path, plane_equations) - except Exception as exc: - return { - "status": "skipped", - "reason": str(exc), - "env_cache_path": env_cache_path.as_posix(), - "grasp_cache_path": grasp_cache_path.as_posix(), - } + except (ImportError, OSError, ValueError, TypeError) as exc: + raise GraspCollisionCachePreparationError( + f"Failed to convert env CoACD cache {env_cache_path}: {exc}" + ) from exc return { "status": "generated", @@ -172,14 +175,20 @@ def _write_grasp_collision_cache( ) plane_equation_counts[index] = n_equation - with cache_path.open("wb") as cache_file: - pickle.dump( - { - "plane_equations": plane_equations, - "plane_equation_counts": plane_equation_counts, - }, - cache_file, - ) + temp_path = cache_path.with_name(f"{cache_path.name}.tmp.{os.getpid()}") + try: + with temp_path.open("wb") as cache_file: + pickle.dump( + { + "plane_equations": plane_equations, + "plane_equation_counts": plane_equation_counts, + }, + cache_file, + ) + os.replace(temp_path, cache_path) + finally: + if temp_path.exists(): + temp_path.unlink() def _resolve_cache_dir(cache_dir: str | Path | None) -> Path: diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py index 4393c37c..b60be991 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py @@ -154,7 +154,9 @@ def _validate_task_spec(task_spec: Mapping[str, Any]) -> None: if edge_id in edge_ids: raise ValueError(f"Duplicate graph edge id '{edge_id}'.") edge_ids.add(edge_id) - if edge.get("left_arm_action") is None and edge.get("right_arm_action") is None: + if _is_empty_action_spec(edge.get("left_arm_action")) and _is_empty_action_spec( + 
edge.get("right_arm_action") + ): raise ValueError(f"Nominal edge '{edge_id}' must define an arm action.") for node_key in ("source", "target"): @@ -212,9 +214,7 @@ def _validate_nominal_path( def _compile_action(spec: Any, action_module: Any) -> Any: - if spec is None: - return None - if isinstance(spec, str) and spec.strip().lower() in {"", "none", "null"}: + if _is_empty_action_spec(spec): return None if not isinstance(spec, Mapping): raise TypeError(f"Action spec must be a mapping or null, but got {type(spec)}.") @@ -236,7 +236,17 @@ def _compile_action(spec: Any, action_module: Any) -> Any: "target_qpos." ) - return action_module.normalize_atomic_action_spec(spec) + normalized = action_module.normalize_atomic_action_spec(spec) + spec_cls = getattr(action_module, "AtomicActionSpec", None) + if spec_cls is None: + return normalized + return spec_cls.from_normalized(normalized) + + +def _is_empty_action_spec(spec: Any) -> bool: + return spec is None or ( + isinstance(spec, str) and spec.strip().lower() in {"", "none", "null"} + ) def _reject_recovery_keys(task_spec: Mapping[str, Any]) -> None: diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py b/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py index 53ea5a1f..51cff7c3 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py @@ -103,7 +103,9 @@ def add_edge( self.outgoing[source].append(edge_id) return self - def run(self, env=None, **kwargs) -> ExecutedActionList: + def run(self, *, env, **kwargs) -> ExecutedActionList: + if env is None: + raise ValueError("env is required to run an agent task graph.") current = self.start executed_actions: list[Any] = [] transitions = 0 diff --git a/embodichain/gen_sim/action_agent_pipeline/utils/__init__.py b/embodichain/gen_sim/action_agent_pipeline/utils/__init__.py index 9cfdb173..015c4151 100644 --- 
a/embodichain/gen_sim/action_agent_pipeline/utils/__init__.py +++ b/embodichain/gen_sim/action_agent_pipeline/utils/__init__.py @@ -16,3 +16,4 @@ from __future__ import annotations +__all__: list[str] = [] diff --git a/embodichain/gen_sim/action_agent_pipeline/utils/llm_config.py b/embodichain/gen_sim/action_agent_pipeline/utils/llm_config.py index c267a1c8..1bb6e63b 100644 --- a/embodichain/gen_sim/action_agent_pipeline/utils/llm_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/utils/llm_config.py @@ -75,9 +75,9 @@ def _load_env_files(paths: tuple[Path, ...] | None = None) -> dict[str, str]: """Read local env files, with later paths taking precedence.""" env_values: dict[str, str] = {} for path in paths or ( - SIMREADY_LLM_ENV_PATH, - ACTION_PIPELINE_LLM_ENV_PATH, LLM_ENV_PATH, + ACTION_PIPELINE_LLM_ENV_PATH, + SIMREADY_LLM_ENV_PATH, ): env_values.update(_load_env_file(path)) return env_values @@ -116,7 +116,10 @@ def get_openai_compatible_llm_config( ) -> dict[str, Any]: """Return shared OpenAI-compatible LLM config for agents and gen-sim.""" local_env = _load_env_files() - json_cfg = _load_gen_config() + try: + json_cfg = _load_gen_config() + except FileNotFoundError: + json_cfg = {} cfg = { "api_key": _get_first_value(local_env, "OPENAI_API_KEY") diff --git a/embodichain/gen_sim/action_agent_pipeline/utils/mllm.py b/embodichain/gen_sim/action_agent_pipeline/utils/mllm.py index f39f2d0f..0d1db025 100644 --- a/embodichain/gen_sim/action_agent_pipeline/utils/mllm.py +++ b/embodichain/gen_sim/action_agent_pipeline/utils/mllm.py @@ -16,7 +16,6 @@ from __future__ import annotations -import os from collections.abc import Mapping from typing import Any @@ -38,11 +37,12 @@ def apply_proxy_env(proxy_url: str | None) -> None: - """Apply an optional proxy URL for OpenAI-compatible clients.""" - if not proxy_url: - return - os.environ["HTTP_PROXY"] = proxy_url - os.environ["HTTPS_PROXY"] = proxy_url + """Deprecated compatibility shim for older callers. 
+ + Proxy configuration is now passed directly to each client instance to avoid + mutating process-global environment variables. + """ + return None def _resolve_llm_config( @@ -73,14 +73,15 @@ def create_openai_client( required=required, require_base_url=require_base_url, ) - apply_proxy_env(cfg.get("proxy_url")) - kwargs: dict[str, Any] = { "api_key": cfg["api_key"], "default_query": cfg.get("default_query") or None, } if cfg.get("base_url"): kwargs["base_url"] = cfg["base_url"] + http_client = _proxy_http_client(cfg.get("proxy_url")) + if http_client is not None: + kwargs["http_client"] = http_client return OpenAI(**kwargs) @@ -100,8 +101,6 @@ def create_chat_openai( required=required, require_base_url=False, ) - apply_proxy_env(cfg.get("proxy_url")) - kwargs: dict[str, Any] = { "temperature": temperature, "model": model or cfg.get("model") or DEFAULT_LLM_MODEL, @@ -109,7 +108,23 @@ def create_chat_openai( } if cfg.get("base_url"): kwargs["base_url"] = cfg["base_url"] + http_client = _proxy_http_client(cfg.get("proxy_url")) + if http_client is not None: + kwargs["http_client"] = http_client return UsageTrackedChatModel( ChatOpenAI(**kwargs), stage=usage_stage, ) + + +def _proxy_http_client(proxy_url: str | None) -> Any | None: + proxy = str(proxy_url or "").strip() + if not proxy: + return None + + import httpx + + try: + return httpx.Client(proxy=proxy, trust_env=False) + except TypeError: + return httpx.Client(proxies=proxy, trust_env=False) diff --git a/tests/gen_sim/action_agent_pipeline/test_action_agent_cli_and_clients.py b/tests/gen_sim/action_agent_pipeline/test_action_agent_cli_and_clients.py new file mode 100644 index 00000000..41049b9a --- /dev/null +++ b/tests/gen_sim/action_agent_pipeline/test_action_agent_cli_and_clients.py @@ -0,0 +1,314 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import ast +import json +import os +import sys +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from embodichain.gen_sim.action_agent_pipeline.gym_project_api.image2tabletop_client import ( + wait_for_job, +) + +_ACTION_AGENT_PACKAGE_ROOT = ( + Path(__file__).resolve().parents[3] / "embodichain/gen_sim/action_agent_pipeline" +) + + +def test_image2tabletop_wait_for_job_times_out_without_polling() -> None: + with pytest.raises(TimeoutError, match="did not complete"): + wait_for_job("http://example.test", "job-1", poll_interval=0.1, timeout_s=0.0) + + +def test_action_agent_config_cli_imports() -> None: + from embodichain.gen_sim.action_agent_pipeline.cli import ( + generate_action_agent_config, + ) + + assert callable(generate_action_agent_config.cli) + + +def test_action_agent_config_generation_imports() -> None: + from embodichain.gen_sim.action_agent_pipeline.generation import action_agent_config + + assert callable(action_agent_config.generate_action_agent_config_from_project) + assert action_agent_config.GeneratedActionAgentConfigPaths.__name__ == ( + "GeneratedActionAgentConfigPaths" + ) + + +def test_action_agent_python_modules_declare_all() -> None: + missing_all = [] + for path in sorted(_ACTION_AGENT_PACKAGE_ROOT.rglob("*.py")): + tree = 
ast.parse(path.read_text(encoding="utf-8"), filename=path.as_posix()) + has_all = any( + isinstance(node, (ast.Assign, ast.AnnAssign)) + and ( + any( + isinstance(target, ast.Name) and target.id == "__all__" + for target in getattr(node, "targets", []) + ) + or ( + isinstance(getattr(node, "target", None), ast.Name) + and node.target.id == "__all__" + ) + ) + for node in tree.body + ) + if not has_all: + missing_all.append(path.relative_to(_ACTION_AGENT_PACKAGE_ROOT).as_posix()) + + assert missing_all == [] + + +def test_glb_io_is_shared_by_generation_modules() -> None: + from embodichain.gen_sim.action_agent_pipeline.generation import ( + action_agent_config, + glb_io, + mesh_frame_normalization, + ) + + assert action_agent_config.read_glb is glb_io.read_glb + assert mesh_frame_normalization.read_glb is glb_io.read_glb + + +def test_create_openai_client_uses_per_client_proxy(monkeypatch) -> None: + from embodichain.gen_sim.action_agent_pipeline.utils.mllm import ( + create_openai_client, + ) + + class _FakeHttpClient: + def __init__(self, **kwargs) -> None: + self.kwargs = kwargs + + class _FakeOpenAI: + last_kwargs = None + + def __init__(self, **kwargs) -> None: + _FakeOpenAI.last_kwargs = kwargs + + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.delenv("HTTPS_PROXY", raising=False) + monkeypatch.setitem(sys.modules, "httpx", SimpleNamespace(Client=_FakeHttpClient)) + monkeypatch.setitem(sys.modules, "openai", SimpleNamespace(OpenAI=_FakeOpenAI)) + + create_openai_client( + config={ + "api_key": "test-key", + "base_url": "https://example.test/v1", + "proxy_url": "http://proxy.test:8080", + } + ) + + assert "HTTP_PROXY" not in os.environ + assert "HTTPS_PROXY" not in os.environ + http_client = _FakeOpenAI.last_kwargs["http_client"] + assert http_client.kwargs == { + "proxy": "http://proxy.test:8080", + "trust_env": False, + } + + +def test_create_chat_openai_uses_per_client_proxy(monkeypatch) -> None: + from 
embodichain.gen_sim.action_agent_pipeline.utils.mllm import create_chat_openai + + class _FakeHttpClient: + def __init__(self, **kwargs) -> None: + self.kwargs = kwargs + + class _FakeChatOpenAI: + def __init__(self, **kwargs) -> None: + self.kwargs = kwargs + + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.delenv("HTTPS_PROXY", raising=False) + monkeypatch.setitem(sys.modules, "httpx", SimpleNamespace(Client=_FakeHttpClient)) + monkeypatch.setitem( + sys.modules, + "langchain_openai", + SimpleNamespace(ChatOpenAI=_FakeChatOpenAI), + ) + + chat_model = create_chat_openai( + config={ + "api_key": "test-key", + "model": "test-model", + "proxy_url": "http://proxy.test:8080", + }, + usage_stage="test", + ) + + assert "HTTP_PROXY" not in os.environ + assert "HTTPS_PROXY" not in os.environ + http_client = chat_model._inner.kwargs["http_client"] + assert http_client.kwargs == { + "proxy": "http://proxy.test:8080", + "trust_env": False, + } + + +def test_image2scene_pipeline_passes_client_url(monkeypatch, tmp_path) -> None: + from embodichain.gen_sim.action_agent_pipeline.cli.image2scene_stage import ( + run_image2scene_pipeline, + ) + + root = tmp_path / "image2scene" + script = root / "demo_api/client/image2scene_pipeline.py" + script.parent.mkdir(parents=True) + script.write_text("pass\n", encoding="utf-8") + image = tmp_path / "demo.jpg" + image.write_bytes(b"image") + gen_config = root / "gen_config.json" + gen_config.write_text( + json.dumps( + { + "DEFAULT_TABLE_TYPE": "", + "DEFAULT_API_KEY": "key", + "DEFAULT_MODEL": "model", + "DEFAULT_BASE_URL": "https://llm.test/v1", + "DEFAULT_CLIENT_URL": "", + } + ), + encoding="utf-8", + ) + merged_output = root / "merged.json" + merged_output.write_text("{}", encoding="utf-8") + captured = {} + + def fake_run(command, **kwargs): + captured["command"] = command + captured["kwargs"] = kwargs + return SimpleNamespace(returncode=0) + + monkeypatch.setattr( + 
"embodichain.gen_sim.action_agent_pipeline.cli.image2scene_stage.subprocess.run", + fake_run, + ) + + run_image2scene_pipeline( + SimpleNamespace( + background="a vase", + image2scene_root=str(root), + image=str(image), + image_name=None, + image2scene_download_dir="./downloads", + image2scene_output_root="./generated", + image2scene_gen_config="./gen_config.json", + image2scene_llm_config="./gen_config.json", + image2scene_extract_dir=None, + image2scene_merged_output="./merged.json", + server="http://stage-a.test:4523", + image2scene_client_url=None, + poll_interval=0.1, + ) + ) + + command = captured["command"] + gen_config_index = command.index("--gen-config") + 1 + runtime_config = Path(command[gen_config_index]) + assert runtime_config != gen_config + assert runtime_config.parent.name == ".image2scene_runtime" + assert ( + json.loads(runtime_config.read_text(encoding="utf-8"))["DEFAULT_CLIENT_URL"] + == "http://stage-a.test:4523" + ) + + +def test_image2scene_runtime_gen_config_injects_client_url(tmp_path) -> None: + from embodichain.gen_sim.action_agent_pipeline.cli.image2scene_stage import ( + _stage_b_gen_config_with_client_url, + ) + + gen_config = tmp_path / "gen_config.json" + gen_config.write_text( + json.dumps( + { + "DEFAULT_TABLE_TYPE": "", + "DEFAULT_API_KEY": "key", + "DEFAULT_MODEL": "model", + "DEFAULT_BASE_URL": "https://llm.test/v1", + "DEFAULT_CLIENT_URL": "", + } + ), + encoding="utf-8", + ) + + runtime_config = _stage_b_gen_config_with_client_url( + gen_config, + "http://mesatask.test:4523/", + tmp_path, + ) + + assert runtime_config != gen_config + assert ( + json.loads(gen_config.read_text(encoding="utf-8"))["DEFAULT_CLIENT_URL"] == "" + ) + assert ( + json.loads(runtime_config.read_text(encoding="utf-8"))["DEFAULT_CLIENT_URL"] + == "http://mesatask.test:4523" + ) + + +def test_agentic_gen_sim_env_api_and_compat_alias() -> None: + from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware import ( + agent_env, + ) + + assert 
agent_env.AtomicActionsAgentEnv is agent_env.AgenticGenSimEnv + assert agent_env.EmbodiedEnv in agent_env.AgenticGenSimEnv.__mro__ + assert len(agent_env.AgenticGenSimEnv.__bases__) == 1 + assert agent_env.AgenticGenSimEnv.__bases__[0] is agent_env.EmbodiedEnv + + +def test_agentic_gen_sim_env_splits_agent_kwargs(monkeypatch) -> None: + from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware import ( + agent_env, + ) + + captured = {} + + def fake_env_init(self, cfg=None, **kwargs) -> None: + captured["cfg"] = cfg + captured["env_kwargs"] = kwargs + self.cfg = SimpleNamespace(ignore_terminations=False) + + def fake_init_agents(self, **kwargs) -> None: + captured["agent_kwargs"] = kwargs + + monkeypatch.setattr(agent_env.EmbodiedEnv, "__init__", fake_env_init) + monkeypatch.setattr(agent_env.AgenticGenSimEnv, "_init_agents", fake_init_agents) + + agent_env.AgenticGenSimEnv( + cfg="cfg", + agent_config={"Agent": {}}, + task_name="Task", + agent_config_path="agent_config.json", + num_envs=1, + ) + + assert captured["cfg"] == "cfg" + assert captured["env_kwargs"] == {"num_envs": 1} + assert captured["agent_kwargs"] == { + "agent_config": {"Agent": {}}, + "task_name": "Task", + "agent_config_path": "agent_config.json", + } diff --git a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py index 2cdbe9aa..58cea8fa 100644 --- a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py +++ b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py @@ -23,9 +23,19 @@ from embodichain.gen_sim.action_agent_pipeline.runtime import atom_actions from embodichain.gen_sim.action_agent_pipeline.runtime.atom_actions import ( + build_parallel_action_stream, execute_atomic_action, + execute_parallel_atomic_actions, normalize_atomic_action_spec, + step_env_with_actions, ) +from embodichain.gen_sim.action_agent_pipeline.runtime.atom_action_utils import ( + resolve_arm_side, 
+) +from embodichain.gen_sim.action_agent_pipeline.runtime.coacd_cache_bridge import ( + GraspCollisionCachePreparationError, +) +from embodichain.gen_sim.action_agent_pipeline.runtime.task_graph import AgentTaskGraph from embodichain.lab.sim.atomic_actions import ( MoveActionCfg, PickUpActionCfg, @@ -160,6 +170,13 @@ def execute(self, target, start_qpos=None, **kwargs): return True, trajectory, [0, 1] if "left" in self.cfg.control_part else [3, 4] +@pytest.fixture(autouse=True) +def _reset_fake_backend_capture(): + _FakeBackendAction.capture = None + yield + _FakeBackendAction.capture = None + + def test_normalize_atomic_action_spec_rejects_legacy_schema() -> None: with pytest.raises(ValueError, match="Legacy action schema"): normalize_atomic_action_spec({"action": "move", "robot_name": "left_arm"}) @@ -249,6 +266,94 @@ def test_atom_actions_module_exposes_atomic_runtime_entrypoints() -> None: assert callable(atom_actions.execute_parallel_atomic_actions) +def test_execute_parallel_atomic_actions_requires_env() -> None: + with pytest.raises(TypeError, match="env"): + execute_parallel_atomic_actions(left_arm_action=torch.zeros((1, 3))) + + +def test_execute_parallel_atomic_actions_rejects_none_env() -> None: + with pytest.raises(ValueError, match="env is required"): + execute_parallel_atomic_actions( + left_arm_action=torch.zeros((1, 3)), + env=None, + ) + + +def test_build_parallel_action_stream_does_not_step_env() -> None: + env = _FakeEnv() + action_stream = build_parallel_action_stream( + left_arm_action=torch.zeros((2, 3)), + env=env, + ) + + assert len(action_stream) == 2 + assert not hasattr(env, "stepped_actions") + + +def test_step_env_with_actions_steps_and_updates_env() -> None: + class _StepEnv: + def __init__(self) -> None: + self.stepped_actions = [] + self.update_count = 0 + + def step(self, action): + self.stepped_actions.append(action) + + def update_obj_info(self) -> None: + self.update_count += 1 + + env = _StepEnv() + actions = [torch.zeros(1, 
1), torch.ones(1, 1)] + + step_env_with_actions(env, actions) + + assert env.stepped_actions == actions + assert env.update_count == 2 + + +def test_agent_task_graph_run_requires_env() -> None: + graph = AgentTaskGraph(start="start", goal="goal") + with pytest.raises(TypeError, match="env"): + graph.run() + + +def test_agent_task_graph_run_rejects_none_env() -> None: + graph = AgentTaskGraph(start="start", goal="goal") + with pytest.raises(ValueError, match="env is required"): + graph.run(env=None) + + +def test_resolve_arm_side_rejects_unavailable_requested_arm() -> None: + env = _FakeEnv() + env.right_arm_joints = [] + env.right_eef_joints = [] + env.robot.control_parts = {"left_arm": [0, 1], "left_eef": [2]} + + with pytest.raises(ValueError, match="Requested right_arm"): + resolve_arm_side(env, "right_arm") + + +def test_resolve_arm_side_uses_only_available_arm_for_unspecified_name() -> None: + env = _FakeEnv() + env.left_arm_joints = [] + env.left_eef_joints = [] + env.robot.control_parts = {"right_arm": [3, 4], "right_eef": [5]} + + assert resolve_arm_side(env, "ur5") == "right" + + +def test_resolve_arm_side_rejects_env_without_available_arms() -> None: + env = _FakeEnv() + env.left_arm_joints = [] + env.left_eef_joints = [] + env.right_arm_joints = [] + env.right_eef_joints = [] + env.robot.control_parts = {} + + with pytest.raises(ValueError, match="No available arm control parts"): + resolve_arm_side(env, "ur5") + + def test_object_referenced_pose_builds_move_cfg_and_pose_target(monkeypatch) -> None: env = _FakeEnv() capture = [] @@ -461,3 +566,53 @@ def test_place_action_rejects_qpos_target(monkeypatch) -> None: }, env=env, ) + + +def test_grasp_collision_cache_bridge_error_falls_back(monkeypatch) -> None: + warnings = [] + + def raise_cache_error(**kwargs): + raise GraspCollisionCachePreparationError("cache conversion failed") + + monkeypatch.setattr( + atom_actions, + "ensure_grasp_collision_cache_from_env_coacd", + raise_cache_error, + ) + 
monkeypatch.setattr(atom_actions, "log_warning", warnings.append) + + atom_actions._prepare_grasp_collision_cache_from_env_coacd( + obj_name="apple", + mesh_vertices=torch.zeros(1, 3), + mesh_triangles=torch.zeros(1, 3, dtype=torch.int64), + source_mesh_path="/tmp/fake.obj", + max_decomposition_hulls=4, + body_scale=None, + runtime_kwargs={}, + ) + + assert len(warnings) == 1 + assert "falling back to the default grasp collision path" in warnings[0] + assert "cache conversion failed" in warnings[0] + + +def test_grasp_collision_cache_unexpected_error_propagates(monkeypatch) -> None: + def raise_unexpected_error(**kwargs): + raise AssertionError("unexpected bug") + + monkeypatch.setattr( + atom_actions, + "ensure_grasp_collision_cache_from_env_coacd", + raise_unexpected_error, + ) + + with pytest.raises(AssertionError, match="unexpected bug"): + atom_actions._prepare_grasp_collision_cache_from_env_coacd( + obj_name="apple", + mesh_vertices=torch.zeros(1, 3), + mesh_triangles=torch.zeros(1, 3, dtype=torch.int64), + source_mesh_path="/tmp/fake.obj", + max_decomposition_hulls=4, + body_scale=None, + runtime_kwargs={}, + ) diff --git a/tests/gen_sim/action_agent_pipeline/test_base_agent_env_config.py b/tests/gen_sim/action_agent_pipeline/test_base_agent_env_config.py new file mode 100644 index 00000000..3af6f926 --- /dev/null +++ b/tests/gen_sim/action_agent_pipeline/test_base_agent_env_config.py @@ -0,0 +1,44 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import pytest + +from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.agent_env import ( + AgenticGenSimEnv, +) + + +def test_agentic_gen_sim_env_rejects_reserved_agent_config_keys() -> None: + env = AgenticGenSimEnv.__new__(AgenticGenSimEnv) + + with pytest.raises(ValueError, match="reserved keys: task_name"): + env._validate_agent_config_keys("TaskAgent", {"task_name": "bad"}) + + +def test_agentic_gen_sim_env_rejects_reserved_common_agent_config_keys() -> None: + env = AgenticGenSimEnv.__new__(AgenticGenSimEnv) + agent_config = { + "Agent": {"prompt_kwargs": {}, "task_name": "bad"}, + "TaskAgent": {}, + "CompileAgent": {}, + } + + with pytest.raises( + ValueError, match="Agent config contains reserved keys: task_name" + ): + env._init_agents(agent_config, task_name="UnitTask") diff --git a/tests/gen_sim/action_agent_pipeline/test_coacd_cache.py b/tests/gen_sim/action_agent_pipeline/test_coacd_cache.py index 8a974071..bb76991d 100644 --- a/tests/gen_sim/action_agent_pipeline/test_coacd_cache.py +++ b/tests/gen_sim/action_agent_pipeline/test_coacd_cache.py @@ -98,6 +98,7 @@ def test_grasp_cache_bridge_uses_existing_env_coacd_obj(tmp_path) -> None: assert set(cache) == {"plane_equations", "plane_equation_counts"} assert cache["plane_equations"].shape[-1] == 4 assert cache["plane_equation_counts"].numel() == 1 + assert not list(env_cache_path.parent.glob("*.tmp.*")) second_result = 
ensure_grasp_collision_cache_from_env_coacd( mesh_vertices=mesh_vertices, diff --git a/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py index 122b188b..1878b104 100644 --- a/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py +++ b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py @@ -21,6 +21,9 @@ from embodichain.gen_sim.action_agent_pipeline.runtime.graph_compiler import ( compile_agent_graph_spec, ) +from embodichain.gen_sim.action_agent_pipeline.runtime.atom_actions import ( + AtomicActionSpec, +) class _FakeGraph: @@ -97,7 +100,9 @@ def test_compile_agent_graph_accepts_atomic_action_class_spec() -> None: graph_cls=_FakeGraph, ) - assert graph.edges["e01"]["left_arm_action"] == action + left_arm_action = graph.edges["e01"]["left_arm_action"] + assert isinstance(left_arm_action, AtomicActionSpec) + assert left_arm_action.to_dict() == action def test_compile_agent_graph_rejects_legacy_action_schema() -> None: @@ -119,3 +124,15 @@ def test_compile_agent_graph_rejects_extra_edge_fields() -> None: task_graph, graph_cls=_FakeGraph, ) + + +def test_compile_agent_graph_rejects_empty_string_arm_actions() -> None: + task_graph = _task_graph(_pick_up_spec("left_arm", "apple")) + task_graph["edges"][0]["left_arm_action"] = "" + task_graph["edges"][0]["right_arm_action"] = "none" + + with pytest.raises(ValueError, match="must define an arm action"): + compile_agent_graph_spec( + task_graph, + graph_cls=_FakeGraph, + ) diff --git a/tests/gen_sim/action_agent_pipeline/test_tableware_success.py b/tests/gen_sim/action_agent_pipeline/test_tableware_success.py new file mode 100644 index 00000000..2b5c4b8d --- /dev/null +++ b/tests/gen_sim/action_agent_pipeline/test_tableware_success.py @@ -0,0 +1,47 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import pytest +import torch + +from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.success import ( + evaluate_configured_success, +) + + +class _FakeSim: + def get_rigid_object(self, uid: str): + return None + + +class _FakeEnv: + num_envs = 1 + device = torch.device("cpu") + sim = _FakeSim() + + +def test_success_unknown_rigid_object_uid_raises_clear_error() -> None: + with pytest.raises(ValueError, match="Unknown rigid object uid: 'missing'"): + evaluate_configured_success( + _FakeEnv(), + { + "type": "object_xy_near", + "object": "missing", + "target_xy": [0.0, 0.0], + }, + ) diff --git a/tests/gen_sim/action_agent_pipeline/test_task_agent_cache.py b/tests/gen_sim/action_agent_pipeline/test_task_agent_cache.py new file mode 100644 index 00000000..28acfe2a --- /dev/null +++ b/tests/gen_sim/action_agent_pipeline/test_task_agent_cache.py @@ -0,0 +1,90 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from types import SimpleNamespace + +from embodichain.gen_sim.action_agent_pipeline.agents import ( + task_agent as task_agent_module, +) +from embodichain.gen_sim.action_agent_pipeline.agents.task_agent import TaskAgent + + +class _FakeLLM: + def __init__(self) -> None: + self.calls = 0 + + def invoke(self, prompt: str): + self.calls += 1 + return SimpleNamespace(content=f'{{"prompt": "{prompt}"}}') + + +class _FakePromptValue: + def __init__(self, value: str) -> None: + self.value = value + + def to_string(self) -> str: + return self.value + + def __str__(self) -> str: + return f"unstable:{self.value}" + + +def test_task_agent_cache_uses_prompt_hash_metadata(tmp_path, monkeypatch) -> None: + monkeypatch.setattr( + task_agent_module.TaskPrompt, + "unit_prompt", + staticmethod(lambda **kwargs: f"task={kwargs['task']}"), + raising=False, + ) + llm = _FakeLLM() + agent = TaskAgent( + llm, + prompt_name="unit_prompt", + prompt_kwargs={}, + task_name="UnitTask", + ) + + first = agent.generate(log_dir=tmp_path, task="a") + second = agent.generate(log_dir=tmp_path, task="a") + third = agent.generate(log_dir=tmp_path, task="b") + + assert first == second + assert first != third + assert llm.calls == 2 + assert (tmp_path / "agent_task_graph.metadata.json").is_file() + + +def test_task_agent_cache_hashes_prompt_value_objects(tmp_path, monkeypatch) -> None: + monkeypatch.setattr( + task_agent_module.TaskPrompt, + "unit_prompt_value", + 
staticmethod(lambda **kwargs: _FakePromptValue(f"task={kwargs['task']}")), + raising=False, + ) + llm = _FakeLLM() + agent = TaskAgent( + llm, + prompt_name="unit_prompt_value", + prompt_kwargs={}, + task_name="UnitTask", + ) + + agent.generate(log_dir=tmp_path, task="a") + agent.generate(log_dir=tmp_path, task="a") + + assert llm.calls == 1 diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index fc475160..7872cf66 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -25,32 +25,56 @@ import pytest import torch +from embodichain.gen_sim.action_agent_pipeline.cli import ( + target_replacements as target_replacements_cli, +) from embodichain.gen_sim.action_agent_pipeline.generation import ( - ur5_basket_config as ur5_basket_config_generation, + action_agent_config as action_agent_config_generation, ) -from embodichain.gen_sim.action_agent_pipeline.cli import ( - run_agent_pipeline as run_agent_pipeline_cli, +from embodichain.gen_sim.action_agent_pipeline.generation.action_agent_templates import ( + make_dual_ur5_robot_config, + make_light_config, + make_sensor_config, ) from embodichain.gen_sim.action_agent_pipeline.generation.mesh_frame_normalization import ( MESH_FRAME_NORMALIZATION_POLICY_VERSION, MeshFrameNormalizer, ) -from embodichain.gen_sim.action_agent_pipeline.generation.ur5_basket_config import ( +from embodichain.gen_sim.action_agent_pipeline.generation.action_agent_config import ( TargetReplacementSpec, - generate_ur5_basket_config_from_project, + generate_action_agent_config_from_project, ) from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.success import ( evaluate_configured_success, ) -def test_ur5_basket_generator_uses_parallel_handoff( +def test_action_agent_templates_load_fresh_json_copies() -> None: + 
first_robot = make_dual_ur5_robot_config(robot_init_z=0.42) + second_robot = make_dual_ur5_robot_config(robot_init_z=0.84) + first_sensors = make_sensor_config() + second_sensors = make_sensor_config() + first_lights = make_light_config() + second_lights = make_light_config() + + first_robot["control_parts"]["left_arm"].append("MUTATED_JOINT") + first_sensors[0]["uid"] = "mutated_camera" + first_lights["direct"][0]["uid"] = "mutated_light" + + assert second_robot["init_pos"] == pytest.approx([2.0, 0.0, 0.84]) + assert first_robot["init_pos"] == pytest.approx([2.0, 0.0, 0.42]) + assert second_robot["control_parts"]["left_arm"] == ["LEFT_JOINT[1-6]"] + assert second_sensors[0]["uid"] == "cam_high" + assert second_lights["direct"][0]["uid"] == "main_light" + + +def test_action_agent_config_generator_uses_parallel_handoff( tmp_path: Path, ) -> None: project_dir = tmp_path / "1790000000_gym_project" _write_project(project_dir) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_agent", target_body_scale=0.6, @@ -72,13 +96,13 @@ def test_ur5_basket_generator_uses_parallel_handoff( _assert_normalized_obj_path(rigid_objects["right_apple"]["shape"]["fpath"]) _assert_normalized_obj_path(background_objects["table"]["shape"]["fpath"]) _assert_normalized_obj_path(background_objects["wicker_basket"]["shape"]["fpath"]) - table_top_z = ur5_basket_config_generation._mesh_config_world_zmax( + table_top_z = action_agent_config_generation._mesh_config_world_zmax( background_objects["table"] ) expected_robot_init_z = ( table_top_z - + ur5_basket_config_generation._DUAL_UR5_TABLETOP_CLEARANCE - - ur5_basket_config_generation._DUAL_UR5_ARM_COMPONENT_Z + + action_agent_config_generation._DUAL_UR5_TABLETOP_CLEARANCE + - action_agent_config_generation._DUAL_UR5_ARM_COMPONENT_Z ) assert gym_config["robot"]["init_pos"] == pytest.approx( [2.0, 0.0, expected_robot_init_z] @@ -167,7 +191,7 @@ def 
test_generator_normalizes_glb_meshes_and_preserves_source_rot( project_dir = tmp_path / "1790000000_gym_project" _write_project(project_dir) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_agent", ) @@ -310,7 +334,7 @@ def test_target_replacements_generate_meshes_and_replace_paths( _write_project(project_dir) calls = _patch_prompt2geometry(monkeypatch) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_agent", target_replacements=[ @@ -356,7 +380,7 @@ def test_target_replacements_can_sync_runtime_names( _write_project(project_dir) _patch_prompt2geometry(monkeypatch) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_agent", target_replacements=[ @@ -400,7 +424,7 @@ def test_pipeline_auto_replacement_uses_rotated_robot_view_order() -> None: } assert ( - run_agent_pipeline_cli._auto_replacement_source_uid( + target_replacements_cli._auto_replacement_source_uid( gym_config, replacement_number=1, option_name="--target_replacement1", @@ -408,7 +432,7 @@ def test_pipeline_auto_replacement_uses_rotated_robot_view_order() -> None: == "bread_2" ) assert ( - run_agent_pipeline_cli._auto_replacement_source_uid( + target_replacements_cli._auto_replacement_source_uid( gym_config, replacement_number=2, option_name="--target_replacement2", @@ -442,7 +466,7 @@ def test_directory_input_prefers_merged_config_and_preserves_extra_scene_scale( encoding="utf-8", ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_agent", target_body_scale=0.8, @@ -488,17 +512,17 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", 
fake_call_relative_task_llm, ) monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_resolve_table_mesh_world_zmax", lambda scene_dir, table_obj: None, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_relative_agent", task_name="AppleLeftOfBasket", @@ -573,17 +597,17 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_resolve_table_mesh_world_zmax", lambda scene_dir, table_obj: None, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_front_relative_agent", task_name="AppleFrontOfApple", @@ -669,12 +693,12 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_self_relative_agent", task_description="用左臂把薯片袋子往左前移动", @@ -732,17 +756,17 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_resolve_table_mesh_world_zmax", lambda scene_dir, table_obj: None, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_front_right_relative_agent", task_description="用右臂把 apple_1 放到 basket_3 右前", @@ -765,19 +789,23 @@ def fake_call_relative_task_llm(**kwargs): 
def test_side_relation_offsets_use_robot_view_front_back_convention() -> None: - assert ur5_basket_config_generation._side_relation_xy_offsets("front_of") == ( + assert action_agent_config_generation._side_relation_xy_offsets("front_of") == ( -0.16, 0.0, ) - assert ur5_basket_config_generation._side_relation_xy_offsets("behind") == ( + assert action_agent_config_generation._side_relation_xy_offsets("behind") == ( 0.16, 0.0, ) - assert ur5_basket_config_generation._side_relation_xy_offsets("front_left_of") == ( + assert action_agent_config_generation._side_relation_xy_offsets( + "front_left_of" + ) == ( -0.16, -0.16, ) - assert ur5_basket_config_generation._side_relation_xy_offsets("back_right_of") == ( + assert action_agent_config_generation._side_relation_xy_offsets( + "back_right_of" + ) == ( 0.16, 0.16, ) @@ -796,9 +824,9 @@ def test_relative_relation_aliases_include_diagonal_chinese_directions( raw_relation: str, normalized: str, ) -> None: - assert ur5_basket_config_generation._normalize_relative_relation(raw_relation) == ( - normalized - ) + assert action_agent_config_generation._normalize_relative_relation( + raw_relation + ) == (normalized) def test_task_description_on_container_is_compiled_as_inside( @@ -817,17 +845,17 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_resolve_table_mesh_world_zmax", lambda scene_dir, table_obj: None, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_above_container_agent", task_description="把 apple_1 放到 basket_3 上方然后松手", @@ -861,17 +889,17 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", 
fake_call_relative_task_llm, ) monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_resolve_table_mesh_world_zmax", lambda scene_dir, table_obj: None, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_left_arm_agent", task_description="左臂把 apple_1 放到 basket_3 左边", @@ -911,12 +939,12 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_right_arm_agent", task_description="右臂把 apple_2 放到 basket_3 右边", @@ -958,12 +986,12 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_demo3_relative_agent", task_description="用右臂把咖啡杯子放到垫子上", @@ -1053,12 +1081,12 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_single_rigid_agent", task_description="用左臂抓薯片袋子放到垫子上", @@ -1115,17 +1143,17 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_resolve_table_mesh_world_zmax", lambda scene_dir, table_obj: None, ) - 
paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_dual_relative_agent", task_description=( @@ -1216,6 +1244,99 @@ def fake_call_relative_task_llm(**kwargs): assert '"obj_name":"apple_1"' in atom_actions +def test_dual_inside_same_container_uses_container_long_axis_slots( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "placements": [ + { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "inside", + "arm": "left", + }, + { + "moved_object": "apple_1", + "reference_object": "basket_3", + "goal_relation": "inside", + "arm": "right", + }, + ], + "task_prompt_summary": "Use both arms to put both apples into basket_3.", + "basic_background_notes": "Both apples share the same target container.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + monkeypatch.setattr( + action_agent_config_generation, + "_resolve_table_mesh_world_zmax", + lambda scene_dir, table_obj: None, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_dual_inside_agent", + task_description="双臂把两个 apple 放进 basket_3", + prewarm_coacd_cache=False, + ) + + assert _stable_summary(paths.summary) == { + "mode": "dual_arm_relative_placement", + "placements": [ + { + "moved_object": "apple_2", + "reference_object": "wicker_basket", + "relation": "inside", + "active_arm": "left_arm", + "release_offset": [-0.04, 0.0, 0.12], + }, + { + "moved_object": "apple_1", + "reference_object": "wicker_basket", + "relation": "inside", + "active_arm": "right_arm", + "release_offset": [0.04, 0.0, 0.12], + }, + ], + } + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + success = 
gym_config["env"]["extensions"]["agent_success"] + assert success["op"] == "all" + assert { + (term["type"], term["object"], term["container"]) for term in success["terms"] + } == { + ("object_in_container", "apple_2", "wicker_basket"), + ("object_in_container", "apple_1", "wicker_basket"), + } + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + for text in (task_prompt, atom_actions): + assert '"offset":[-0.04,0.0,0.22]' in text + assert '"offset":[0.04,0.0,0.22]' in text + assert ( + '"atomic_action_class":"PlaceAction","robot_name":"left_arm",' + '"control":"arm","target_pose":{"reference":"object",' + '"obj_name":"wicker_basket","offset":[-0.04,0.0,0.12]}' in text + ) + assert ( + '"atomic_action_class":"PlaceAction","robot_name":"right_arm",' + '"control":"arm","target_pose":{"reference":"object",' + '"obj_name":"wicker_basket","offset":[0.04,0.0,0.12]}' in text + ) + assert "container XY long axis" in task_prompt + + def test_task_description_rejects_dual_relative_same_arm( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -1242,13 +1363,13 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) with pytest.raises(ValueError, match="one left arm and one right arm"): - generate_ur5_basket_config_from_project( + generate_action_agent_config_from_project( project_dir, tmp_path / "bad_dual_relative_agent", task_description="双臂分别移动两个苹果", @@ -1291,12 +1412,12 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_dual_auto_relative_agent", task_description="双臂分别移动两个苹果", @@ 
-1326,12 +1447,12 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_stack_agent", task_description="把 apple_2 放到 apple_1 上方并松手", @@ -1373,13 +1494,13 @@ def fake_call_relative_task_llm(**kwargs): } monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_call_relative_task_llm", fake_call_relative_task_llm, ) with pytest.raises(ValueError, match="unknown moved_object"): - generate_ur5_basket_config_from_project( + generate_action_agent_config_from_project( project_dir, tmp_path / "bad_agent", task_description="把 missing_bread 放到 basket_3 左边", @@ -1405,7 +1526,7 @@ def test_high_tabletop_scene_adjusts_robot_height_and_light( encoding="utf-8", ) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_high_table_agent", ) @@ -1413,8 +1534,8 @@ def test_high_tabletop_scene_adjusts_robot_height_and_light( gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) expected_init_z = ( 1.18 - + ur5_basket_config_generation._DUAL_UR5_TABLETOP_CLEARANCE - - ur5_basket_config_generation._DUAL_UR5_ARM_COMPONENT_Z + + action_agent_config_generation._DUAL_UR5_TABLETOP_CLEARANCE + - action_agent_config_generation._DUAL_UR5_ARM_COMPONENT_Z ) assert gym_config["robot"]["init_pos"][2] == pytest.approx(expected_init_z) assert gym_config["light"]["direct"][0]["intensity"] == 40.0 @@ -1426,7 +1547,7 @@ def test_tabletop_z_placement_uses_normalized_mesh_bounds( project_dir = tmp_path / "1790000000_gym_project" _write_project(project_dir) - paths = generate_ur5_basket_config_from_project( + paths = generate_action_agent_config_from_project( project_dir, tmp_path / "generated_z_agent", 
target_body_scale=0.8, @@ -1437,15 +1558,17 @@ def test_tabletop_z_placement_uses_normalized_mesh_bounds( table_config = next( obj for obj in gym_config["background"] if obj["uid"] == "table" ) - table_top_z = ur5_basket_config_generation._mesh_config_world_zmax(table_config) + table_top_z = action_agent_config_generation._mesh_config_world_zmax(table_config) expected_min_z = ( - table_top_z + ur5_basket_config_generation._TABLETOP_OBJECT_CLEARANCE + table_top_z + action_agent_config_generation._TABLETOP_OBJECT_CLEARANCE ) for obj_config in [ *[obj for obj in gym_config["background"] if obj["uid"] != "table"], *gym_config["rigid_object"], ]: - min_z, _ = ur5_basket_config_generation._mesh_config_world_z_bounds(obj_config) + min_z, _ = action_agent_config_generation._mesh_config_world_z_bounds( + obj_config + ) assert min_z == pytest.approx(expected_min_z) @@ -1456,7 +1579,7 @@ def test_table_mesh_world_zmax_reads_glb_vertices(tmp_path: Path) -> None: mesh_path, [(-0.5, -0.5, 0.0), (0.5, -0.5, 1.2), (0.0, 0.5, 0.4)], ) - table_obj = ur5_basket_config_generation._SceneObject( + table_obj = action_agent_config_generation._SceneObject( source_uid="table", source_role="background", config=_mesh_object( @@ -1468,7 +1591,7 @@ def test_table_mesh_world_zmax_reads_glb_vertices(tmp_path: Path) -> None: ) table_obj.config["body_scale"] = [1.0, 1.0, 2.0] - assert ur5_basket_config_generation._resolve_table_mesh_world_zmax( + assert action_agent_config_generation._resolve_table_mesh_world_zmax( scene_dir, table_obj, ) == pytest.approx(2.5) @@ -1831,7 +1954,7 @@ def fake_run_prompt2geometry_replacement( return {"scaled_mesh_path": str(mesh_path)} monkeypatch.setattr( - ur5_basket_config_generation, + action_agent_config_generation, "_run_prompt2geometry_replacement", fake_run_prompt2geometry_replacement, ) From bdd24df121c197afd791f950567ed4c5089718b5 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Wed, 24 Jun 2026 15:07:54 
+0800 Subject: [PATCH 18/33] Add line arrangement config generation --- .../generation/action_agent_config.py | 171 +++++ .../generation/arrangement_spec.py | 691 ++++++++++++++++++ .../generation/config_blocks.py | 79 ++ .../generation/config_types.py | 28 + .../generation/prompt_builders.py | 159 ++++ .../generation/success_specs.py | 86 +++ .../test_ur5_basket_config_generation.py | 254 +++++++ 7 files changed, 1468 insertions(+) create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py index 3966814f..63321003 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py @@ -26,6 +26,7 @@ write_config_bundle as _write_config_bundle, ) from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _ArrangementLineSpec, GeneratedActionAgentConfigPaths, TargetReplacementSpec, _BasketTaskRoles, @@ -45,6 +46,9 @@ from embodichain.gen_sim.action_agent_pipeline.generation.glb_io import read_glb from embodichain.gen_sim.action_agent_pipeline.generation.prompt_builders import ( make_agent_config, + make_arrangement_atom_actions_prompt, + make_arrangement_basic_background, + make_arrangement_task_prompt, make_basket_atom_actions_prompt, make_basket_basic_background, make_basket_task_prompt, @@ -52,6 +56,12 @@ make_relative_basic_background, make_relative_task_prompt, ) +from embodichain.gen_sim.action_agent_pipeline.generation.arrangement_spec import ( + _build_arrangement_line_spec_with_llm, + _call_arrangement_task_llm, + _is_arrangement_task_description, + _with_arrangement_generated_z_targets, +) from embodichain.gen_sim.action_agent_pipeline.generation.action_agent_templates import ( make_dual_ur5_robot_config as _make_dual_ur5_robot_config, 
make_light_config as _make_light_config, @@ -59,6 +69,8 @@ ) from embodichain.gen_sim.action_agent_pipeline.generation.config_blocks import ( _make_background_config, + _make_arrangement_dataset_config, + _make_arrangement_events_config, _make_container_background_config, _make_dataset_config, _make_events_config, @@ -114,9 +126,11 @@ _refine_roles_with_llm, ) from embodichain.gen_sim.action_agent_pipeline.generation.success_specs import ( + _make_arrangement_extensions_config, _make_extensions_config, _make_relative_extensions_config, _object_in_container_success, + _validate_arrangement_bundle, _validate_bundle, _validate_relative_bundle, _validate_success_uids, @@ -203,6 +217,34 @@ def generate_action_agent_config_from_project( "target_replacements are only supported by the default basket " "template. Do not combine them with task_description." ) + if _is_arrangement_task_description(task_description): + spec = _build_arrangement_line_spec_with_llm( + scene_objects=scene_objects, + project_name=project_name, + scene_dir=scene_dir, + task_description=task_description, + model=llm_model, + task_llm_caller=_call_arrangement_task_llm, + ) + bundle = _build_arrangement_line_bundle( + scene_dir=scene_dir, + source_config=source_config, + spec=spec, + project_name=project_name, + task_name=task_name, + max_episodes=max_episodes, + max_episode_steps=max_episode_steps, + mesh_normalizer=mesh_normalizer, + ) + _validate_arrangement_bundle(bundle, spec) + _attach_mesh_normalization_summary(bundle, mesh_normalizer) + if prewarm_coacd_cache: + _attach_coacd_cache_summary(bundle) + return _write_config_bundle( + output_dir=output_dir_path, + bundle=bundle, + overwrite=overwrite, + ) spec = _build_relative_placement_spec_with_llm( scene_objects=scene_objects, project_name=project_name, @@ -406,6 +448,135 @@ def _build_ur5_basket_bundle( } +def _build_arrangement_line_bundle( + *, + scene_dir: Path, + source_config: Mapping[str, Any], + spec: _ArrangementLineSpec, + 
project_name: str, + task_name: str, + max_episodes: int, + max_episode_steps: int, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + scene_objects = _collect_scene_objects(source_config) + background_objects = [ + obj for obj in scene_objects if obj.source_role == "background" + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + by_uid = {obj.source_uid: obj for obj in scene_objects} + runtime_uids = _relative_scene_runtime_uid_mapping( + scene_objects, + table_source_uid=spec.table_source_uid, + ) + moved_source_uids = {step.source_uid for step in spec.steps} + for step in spec.steps: + runtime_uids[step.source_uid] = step.runtime_uid + + dynamic_rigid_objects = [ + obj for obj in rigid_objects if obj.source_uid in moved_source_uids + ] + static_scene_objects = [ + obj for obj in rigid_objects if obj.source_uid not in moved_source_uids + ] + table_config = _make_background_config( + scene_dir, + by_uid[spec.table_source_uid], + mesh_normalizer, + ) + table_top_z = _mesh_config_world_zmax(table_config) + robot_init_z = _dual_ur5_init_z_from_table_top(table_top_z) + + gym_config = { + "id": "AtomicActionsAgent-v3", + "max_episodes": int(max_episodes), + "max_episode_steps": int(max_episode_steps), + "env": { + "extensions": {}, + "events": _make_arrangement_events_config( + [step.runtime_uid for step in spec.steps], + sensor_config_factory=_make_sensor_config, + ), + "observations": _make_observations_config(), + "dataset": {}, + }, + "robot": _make_dual_ur5_robot_config(robot_init_z=robot_init_z), + "sensor": _make_sensor_config(), + "light": _make_light_config(), + "background": [ + table_config, + *[ + _make_relative_background_object_config( + scene_dir, + obj, + runtime_uids[obj.source_uid], + max_convex_hull_num=1, + mesh_normalizer=mesh_normalizer, + ) + for obj in static_scene_objects + ], + *[ + _make_extra_background_config( + scene_dir, + obj, + mesh_normalizer, + 
runtime_uid=runtime_uids[obj.source_uid], + ) + for obj in background_objects + if obj.source_uid != spec.table_source_uid + ], + ], + "rigid_object": [ + _make_relative_rigid_object_config( + scene_dir=scene_dir, + obj=obj, + runtime_uid=runtime_uids[obj.source_uid], + body_scale=_source_body_scale(obj), + max_convex_hull_num=16, + mesh_normalizer=mesh_normalizer, + ) + for obj in dynamic_rigid_objects + ], + } + _apply_tabletop_z_placement(gym_config, table_top_z) + spec = _with_arrangement_generated_z_targets(spec, gym_config) + gym_config["env"]["extensions"] = _make_arrangement_extensions_config(spec) + gym_config["env"]["dataset"] = _make_arrangement_dataset_config( + project_name, + spec, + ) + return { + "gym_config": gym_config, + "agent_config": make_agent_config(), + "task_prompt": make_arrangement_task_prompt(task_name, project_name, spec), + "basic_background": make_arrangement_basic_background(project_name, spec), + "atom_actions": make_arrangement_atom_actions_prompt(spec), + "summary": { + **_make_arrangement_summary(spec), + }, + } + + +def _make_arrangement_summary(spec: _ArrangementLineSpec) -> dict[str, Any]: + return { + "mode": "arrangement_line", + "axis": spec.axis, + "anchor": spec.anchor, + "order_by": spec.order_by, + "order_direction": spec.order_direction, + "placements": [ + { + "object": step.runtime_uid, + "source_uid": step.source_uid, + "slot_index": step.slot_index, + "active_arm": f"{step.active_side}_arm", + "target_xy": [float(step.target_xy[0]), float(step.target_xy[1])], + } + for step in spec.steps + ], + } + + def _attach_coacd_cache_summary(bundle: dict[str, Any]) -> None: from embodichain.gen_sim.action_agent_pipeline.generation.coacd_cache import ( prewarm_coacd_cache_for_gym_config, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py new file mode 100644 index 00000000..cd38d7fb --- /dev/null +++ 
b/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py @@ -0,0 +1,691 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Callable, Mapping, Sequence +from dataclasses import replace +from pathlib import Path +from typing import Any +import json + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _ArrangementLineSpec, + _ArrangementLineStepSpec, + _SceneObject, +) +from embodichain.gen_sim.action_agent_pipeline.generation.mesh_bounds import ( + _clean_vector3, + _mesh_config_world_xy_extents, + _mesh_config_world_z_bounds, +) +from embodichain.gen_sim.action_agent_pipeline.generation.naming import ( + _base_name, + _normalize_runtime_uid, + _string_list, +) +from embodichain.gen_sim.action_agent_pipeline.generation.scene_objects import ( + _arm_side_for_position, + _pick_table, +) + +__all__ = [ + "_apply_arrangement_task_response", + "_arrangement_line_slot_positions", + "_build_arrangement_line_spec_with_llm", + "_call_arrangement_task_llm", + "_is_arrangement_task_description", + "_make_arrangement_scene_summary", + "_with_arrangement_generated_z_targets", +] + +_ARRANGEMENT_KEYWORDS = ( + "arrange", + "sort", + "order", + "line", + 
"left to right", + "left-to-right", + "从左到右", + "由大到小", + "从大到小", + "由小到大", + "从小到大", + "排序", + "排列", + "排成", + "一行", +) +_DEFAULT_RELEASE_Z = 0.12 +_DEFAULT_STAGING_Z_DELTA = 0.10 +_SLOT_MARGIN = 0.01 +_MIN_SLOT_SPACING = 0.07 +_MAX_SLOT_SPACING = 0.12 +_SUPPORTED_ORDER_BY = {"size", "color", "explicit"} +_SUPPORTED_ORDER_DIRECTIONS = {"ascending", "descending", "given"} +_SUPPORTED_AXES = {"left_to_right"} + + +def _is_arrangement_task_description(task_description: str) -> bool: + text = task_description.strip().lower() + return any(keyword in text for keyword in _ARRANGEMENT_KEYWORDS) + + +def _build_arrangement_line_spec_with_llm( + *, + scene_objects: list[_SceneObject], + project_name: str, + scene_dir: Path, + task_description: str, + model: str | None, + task_llm_caller: Callable[..., Mapping[str, Any]] | None = None, +) -> _ArrangementLineSpec: + background_objects = [ + obj for obj in scene_objects if obj.source_role == "background" + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + if not background_objects: + raise ValueError("Arrangement generation requires a background table.") + if len(rigid_objects) < 2: + raise ValueError( + "Arrangement generation requires at least two movable objects." 
+ ) + + table = _pick_table(background_objects) + scene_summary = _make_arrangement_scene_summary( + scene_objects, + scene_dir=scene_dir, + ) + if task_llm_caller is None: + task_llm_caller = _call_arrangement_task_llm + response = task_llm_caller( + project_name=project_name, + task_description=task_description, + scene_summary=scene_summary, + model=model, + ) + return _apply_arrangement_task_response( + response=response, + table_source_uid=table.source_uid, + scene_objects=scene_objects, + rigid_objects=rigid_objects, + scene_dir=scene_dir, + task_description=task_description, + ) + + +def _call_arrangement_task_llm( + *, + project_name: str, + task_description: str, + scene_summary: list[dict[str, Any]], + model: str | None, +) -> dict[str, Any]: + from langchain_core.messages import HumanMessage, SystemMessage + + from embodichain.gen_sim.action_agent_pipeline.utils.llm_json import ( + extract_json_object, + ) + from embodichain.gen_sim.action_agent_pipeline.utils.mllm import ( + create_chat_openai, + ) + + prompt = ( + "Parse a tabletop multi-object line arrangement task and produce one " + "strict config-level JSON spec. The generator computes all target slot " + "coordinates deterministically from this spec.\n\n" + "Return exactly one JSON object with this schema:\n" + "{\n" + ' "objects": ["", "..."],\n' + ' "order_by": "size|color|explicit",\n' + ' "order_direction": "ascending|descending|given",\n' + ' "ordered_attributes": ["red", "green", "blue"],\n' + ' "object_attributes": {"": {"color": "red"}},\n' + ' "anchor": "table_center",\n' + ' "line_axis": "left_to_right",\n' + ' "task_prompt_summary": "",\n' + ' "basic_background_notes": ""\n' + "}\n\n" + "Rules:\n" + "- Use only source_uid values from rigid_object scene items.\n" + "- Include every object that must be moved and sorted.\n" + "- Use order_by='size' for large/small ordering. 
Use " + "order_direction='descending' for large-to-small and 'ascending' for " + "small-to-large.\n" + "- Use order_by='color' when the task specifies a color sequence such as " + "red-green-blue. Put that sequence in ordered_attributes and include a " + "color attribute for each object.\n" + "- Use line_axis='left_to_right' for left-to-right tabletop rows.\n" + "- Do not return target positions, robot config, success JSON, or action " + "graphs.\n\n" + f"Project: {project_name}\n" + f"Task description:\n{task_description}\n" + f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}" + ) + llm = create_chat_openai( + temperature=0.0, + model=model, + usage_stage="config_generation.arrangement_task", + ) + response = llm.invoke( + [ + SystemMessage( + content=( + "You produce strict JSON specs for simulation config " + "generation. Do not include markdown." + ) + ), + HumanMessage(content=prompt), + ] + ) + content = getattr(response, "content", response) + return extract_json_object(content) + + +def _make_arrangement_scene_summary( + scene_objects: Sequence[_SceneObject], + *, + scene_dir: Path, +) -> list[dict[str, Any]]: + return [ + { + "source_uid": obj.source_uid, + "role": obj.source_role, + "object_type": _base_name(obj), + "mesh": obj.config.get("shape", {}).get("fpath"), + "init_pos": obj.config.get("init_pos"), + "body_scale": obj.config.get("body_scale"), + "color_hint": _color_hint_for_object(obj), + "size_score": _arrangement_object_size_score( + obj, + scene_dir=scene_dir, + ), + } + for obj in scene_objects + ] + + +def _apply_arrangement_task_response( + *, + response: Mapping[str, Any], + table_source_uid: str, + scene_objects: list[_SceneObject], + rigid_objects: list[_SceneObject], + scene_dir: Path, + task_description: str, +) -> _ArrangementLineSpec: + by_uid = {obj.source_uid: obj for obj in scene_objects} + table_obj = by_uid[table_source_uid] + rigid_by_uid = {obj.source_uid: obj for obj in rigid_objects} + runtime_uids 
= _arrangement_runtime_uid_mapping(rigid_objects) + + object_source_uids = _resolve_arrangement_object_uids( + response.get("objects"), + rigid_by_uid, + ) + object_attributes = _object_attributes(response.get("object_attributes")) + order_by = _normalize_order_by(response.get("order_by")) + order_direction = _normalize_order_direction(response.get("order_direction")) + axis = _normalize_axis(response.get("line_axis", response.get("axis"))) + anchor = _normalize_anchor(response.get("anchor")) + + if order_by == "size": + ordered_source_uids = _order_uids_by_size( + object_source_uids, + rigid_by_uid=rigid_by_uid, + scene_dir=scene_dir, + descending=order_direction != "ascending", + ) + order_direction = ( + "descending" if order_direction == "given" else order_direction + ) + elif order_by == "color": + ordered_source_uids = _order_uids_by_color( + object_source_uids, + rigid_by_uid=rigid_by_uid, + object_attributes=object_attributes, + ordered_colors=_string_list(response.get("ordered_attributes")), + ) + order_direction = "given" + else: + ordered_source_uids = object_source_uids + order_direction = "given" + + anchor_xy = _table_anchor_xy(table_obj, anchor) + spacing = _arrangement_spacing( + [rigid_by_uid[uid] for uid in object_source_uids], + scene_dir=scene_dir, + ) + slots = _arrangement_line_slot_positions( + anchor_xy=anchor_xy, + count=len(ordered_source_uids), + spacing=spacing, + line_axis=axis, + ) + + steps = [] + for slot_index, (source_uid, target_xy) in enumerate( + zip(ordered_source_uids, slots) + ): + obj = rigid_by_uid[source_uid] + release_z = _release_z_for_object(obj) + release_position = [ + round(float(target_xy[0]), 6), + round(float(target_xy[1]), 6), + release_z, + ] + high_position = list(release_position) + high_position[2] = round(high_position[2] + _DEFAULT_STAGING_Z_DELTA, 6) + steps.append( + _ArrangementLineStepSpec( + source_uid=source_uid, + runtime_uid=runtime_uids[source_uid], + slot_index=slot_index, + 
active_side=_arm_side_for_position( + _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])) + ), + target_xy=[ + round(float(target_xy[0]), 6), + round(float(target_xy[1]), 6), + ], + release_position=release_position, + high_position=high_position, + size_score=_arrangement_object_size_score(obj, scene_dir=scene_dir), + color=_object_color(source_uid, object_attributes), + ) + ) + + summary = str(response.get("task_prompt_summary", "")).strip() + if not summary: + summary = "Arrange the selected objects in one left-to-right line." + notes = str(response.get("basic_background_notes", "")).strip() + + return _ArrangementLineSpec( + table_source_uid=table_source_uid, + task_description=task_description, + task_prompt_summary=summary, + basic_background_notes=notes, + order_by=order_by, + order_direction=order_direction, + axis=axis, + anchor=anchor, + steps=tuple(steps), + ) + + +def _arrangement_line_slot_positions( + *, + anchor_xy: Sequence[float], + count: int, + spacing: float, + line_axis: str, +) -> list[list[float]]: + if count < 1: + raise ValueError("Arrangement line requires at least one slot.") + axis = _normalize_axis(line_axis) + anchor = [float(anchor_xy[0]), float(anchor_xy[1])] + center = (count - 1) / 2.0 + slots: list[list[float]] = [] + for index in range(count): + axis_offset = (index - center) * float(spacing) + if axis == "left_to_right": + slots.append( + [ + round(anchor[0], 6), + round(anchor[1] + axis_offset, 6), + ] + ) + continue + raise ValueError(f"Unsupported arrangement line axis: {line_axis!r}.") + return slots + + +def _with_arrangement_generated_z_targets( + spec: _ArrangementLineSpec, + gym_config: Mapping[str, Any], +) -> _ArrangementLineSpec: + init_z_by_uid = { + str(obj.get("uid")): _clean_vector3(obj.get("init_pos", [0.0, 0.0, 0.0]))[2] + for obj in gym_config.get("rigid_object", []) + if isinstance(obj, Mapping) and obj.get("uid") is not None + } + steps = [] + for step in spec.steps: + init_z = 
init_z_by_uid.get(step.runtime_uid) + if init_z is None: + steps.append(step) + continue + release_position = [ + float(step.target_xy[0]), + float(step.target_xy[1]), + round(float(init_z) + _DEFAULT_RELEASE_Z, 6), + ] + high_position = list(release_position) + high_position[2] = round(high_position[2] + _DEFAULT_STAGING_Z_DELTA, 6) + steps.append( + replace( + step, + release_position=release_position, + high_position=high_position, + ) + ) + return replace(spec, steps=tuple(steps)) + + +def _resolve_arrangement_object_uids( + value: Any, + rigid_by_uid: Mapping[str, _SceneObject], +) -> list[str]: + values = _string_list(value) + if not values: + raise ValueError("Arrangement response requires non-empty objects.") + + resolved = [] + for raw_value in values: + resolved.append( + _resolve_rigid_uid(raw_value, rigid_by_uid, field_name="objects") + ) + if len(resolved) != len(set(resolved)): + raise ValueError("Arrangement objects must be distinct.") + return resolved + + +def _resolve_rigid_uid( + value: str, + rigid_by_uid: Mapping[str, _SceneObject], + *, + field_name: str, +) -> str: + if value in rigid_by_uid: + return value + normalized = _normalize_runtime_uid(value) + matches = [ + source_uid + for source_uid, obj in rigid_by_uid.items() + if _normalize_runtime_uid(source_uid) == normalized + or _base_name(obj) == normalized + ] + if len(matches) == 1: + return matches[0] + if not matches: + raise ValueError(f"LLM returned unknown arrangement {field_name}: {value!r}.") + raise ValueError( + f"LLM returned ambiguous arrangement {field_name}: {value!r}; " + f"candidates: {matches}." 
+ ) + + +def _normalize_order_by(value: Any) -> str: + text = str(value or "explicit").strip().lower().replace("-", "_").replace(" ", "_") + aliases = { + "largest": "size", + "smallest": "size", + "big_to_small": "size", + "large_to_small": "size", + "color_sequence": "color", + "given_order": "explicit", + } + text = aliases.get(text, text) + if text not in _SUPPORTED_ORDER_BY: + raise ValueError( + f"Unsupported arrangement order_by {value!r}; expected one of " + f"{sorted(_SUPPORTED_ORDER_BY)}." + ) + return text + + +def _normalize_order_direction(value: Any) -> str: + text = str(value or "given").strip().lower().replace("-", "_").replace(" ", "_") + aliases = { + "large_to_small": "descending", + "largest_first": "descending", + "big_to_small": "descending", + "small_to_large": "ascending", + "smallest_first": "ascending", + "increasing": "ascending", + "decreasing": "descending", + } + text = aliases.get(text, text) + if text not in _SUPPORTED_ORDER_DIRECTIONS: + raise ValueError( + f"Unsupported arrangement order_direction {value!r}; expected one of " + f"{sorted(_SUPPORTED_ORDER_DIRECTIONS)}." + ) + return text + + +def _normalize_axis(value: Any) -> str: + text = ( + str(value or "left_to_right") + .strip() + .lower() + .replace("-", "_") + .replace(" ", "_") + ) + aliases = { + "left_right": "left_to_right", + "robot_left_to_right": "left_to_right", + "y": "left_to_right", + "world_y": "left_to_right", + } + text = aliases.get(text, text) + if text not in _SUPPORTED_AXES: + raise ValueError( + f"Unsupported arrangement line axis {value!r}; expected one of " + f"{sorted(_SUPPORTED_AXES)}." 
+ ) + return text + + +def _normalize_anchor(value: Any) -> str: + text = str(value or "table_center").strip().lower().replace("-", "_") + aliases = { + "center": "table_center", + "table_centre": "table_center", + "桌子中央": "table_center", + "桌面中央": "table_center", + } + text = aliases.get(text, text) + if text != "table_center": + raise ValueError("Arrangement only supports anchor='table_center'.") + return text + + +def _object_attributes(value: Any) -> dict[str, dict[str, str]]: + if not isinstance(value, Mapping): + return {} + attributes: dict[str, dict[str, str]] = {} + for source_uid, raw_attrs in value.items(): + if not isinstance(raw_attrs, Mapping): + continue + attributes[str(source_uid)] = { + str(key): str(attr_value).strip().lower() + for key, attr_value in raw_attrs.items() + if str(attr_value).strip() + } + return attributes + + +def _order_uids_by_size( + source_uids: list[str], + *, + rigid_by_uid: Mapping[str, _SceneObject], + scene_dir: Path, + descending: bool, +) -> list[str]: + return sorted( + source_uids, + key=lambda uid: ( + _arrangement_object_size_score(rigid_by_uid[uid], scene_dir=scene_dir) + or 0.0 + ), + reverse=descending, + ) + + +def _order_uids_by_color( + source_uids: list[str], + *, + rigid_by_uid: Mapping[str, _SceneObject], + object_attributes: Mapping[str, Mapping[str, str]], + ordered_colors: list[str], +) -> list[str]: + if not ordered_colors: + raise ValueError("Color arrangement requires ordered_attributes colors.") + color_rank = { + color.strip().lower(): index for index, color in enumerate(ordered_colors) + } + missing = [] + ranked: list[tuple[int, str]] = [] + for source_uid in source_uids: + color = _object_color(source_uid, object_attributes) or _color_hint_for_object( + rigid_by_uid[source_uid] + ) + if color is None or color not in color_rank: + missing.append(source_uid) + continue + ranked.append((color_rank[color], source_uid)) + if missing: + raise ValueError( + "Color arrangement requires colors for every 
object; missing or " + f"unranked: {missing}." + ) + return [source_uid for _, source_uid in sorted(ranked)] + + +def _object_color( + source_uid: str, + object_attributes: Mapping[str, Mapping[str, str]], +) -> str | None: + attrs = object_attributes.get(source_uid, {}) + color = attrs.get("color") + return color.strip().lower() if isinstance(color, str) and color.strip() else None + + +def _color_hint_for_object(obj: _SceneObject) -> str | None: + text = (f"{obj.source_uid} {obj.config.get('shape', {}).get('fpath', '')}").lower() + color_aliases = { + "red": ("red", "红"), + "green": ("green", "绿"), + "blue": ("blue", "蓝"), + "yellow": ("yellow", "黄"), + "orange": ("orange", "橙"), + "purple": ("purple", "紫"), + "black": ("black", "黑"), + "white": ("white", "白"), + } + for canonical, aliases in color_aliases.items(): + if any(alias in text for alias in aliases): + return canonical + return None + + +def _arrangement_runtime_uid_mapping( + rigid_objects: Sequence[_SceneObject], +) -> dict[str, str]: + candidates = {obj.source_uid: _base_name(obj) for obj in rigid_objects} + counts: dict[str, int] = {} + for runtime_uid in candidates.values(): + counts[runtime_uid] = counts.get(runtime_uid, 0) + 1 + return { + source_uid: ( + runtime_uid + if counts[runtime_uid] == 1 + else _normalize_runtime_uid(source_uid) + ) + for source_uid, runtime_uid in candidates.items() + } + + +def _table_anchor_xy(table_obj: _SceneObject, anchor: str) -> list[float]: + _normalize_anchor(anchor) + init_pos = _clean_vector3(table_obj.config.get("init_pos", [0.0, 0.0, 0.0])) + return [round(init_pos[0], 6), round(init_pos[1], 6)] + + +def _arrangement_spacing( + objects: Sequence[_SceneObject], + *, + scene_dir: Path, +) -> float: + max_extent = max( + (_arrangement_object_xy_extent(obj, scene_dir=scene_dir) or 0.0) + for obj in objects + ) + spacing = max(max_extent + _SLOT_MARGIN, _MIN_SLOT_SPACING) + spacing = min(spacing, _MAX_SLOT_SPACING) + return round(float(spacing), 6) + + +def 
_arrangement_object_size_score( + obj: _SceneObject, + *, + scene_dir: Path, +) -> float | None: + bounds = _source_mesh_world_bounds(obj, scene_dir=scene_dir) + if bounds is None: + return None + mins, maxs = bounds + extents = [maxs[index] - mins[index] for index in range(3)] + return round(float(max(extents)), 6) + + +def _arrangement_object_xy_extent( + obj: _SceneObject, + *, + scene_dir: Path, +) -> float | None: + config = _resolved_mesh_config(obj, scene_dir=scene_dir) + extents = _mesh_config_world_xy_extents(config) + if extents is None: + return None + return max(extents) + + +def _release_z_for_object(obj: _SceneObject) -> float: + init_pos = obj.config.get("init_pos") + if isinstance(init_pos, Sequence) and len(init_pos) == 3: + return round(float(init_pos[2]) + _DEFAULT_RELEASE_Z, 6) + return _DEFAULT_RELEASE_Z + + +def _source_mesh_world_bounds( + obj: _SceneObject, + *, + scene_dir: Path, +) -> tuple[list[float], list[float]] | None: + config = _resolved_mesh_config(obj, scene_dir=scene_dir) + z_bounds = _mesh_config_world_z_bounds(config) + xy_extents = _mesh_config_world_xy_extents(config) + if z_bounds is None or xy_extents is None: + return None + return [0.0, 0.0, z_bounds[0]], [xy_extents[0], xy_extents[1], z_bounds[1]] + + +def _resolved_mesh_config( + obj: _SceneObject, + *, + scene_dir: Path, +) -> dict[str, Any]: + config = dict(obj.config) + shape = dict(config.get("shape", {}) or {}) + fpath = shape.get("fpath") + if isinstance(fpath, str): + raw_path = Path(fpath) + if not raw_path.is_absolute(): + shape["fpath"] = (scene_dir / raw_path).resolve().as_posix() + config["shape"] = shape + return config diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py index 605d1045..97d4ec48 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py @@ 
-22,6 +22,7 @@ import copy from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _ArrangementLineSpec, _BasketTaskRoles, _RelativePlacementSpec, _ResolvedTargetReplacement, @@ -42,6 +43,8 @@ __all__ = [ "_make_background_config", + "_make_arrangement_dataset_config", + "_make_arrangement_events_config", "_make_dataset_config", "_make_events_config", "_make_extra_background_config", @@ -137,6 +140,47 @@ def _make_relative_events_config( } +def _make_arrangement_events_config( + registered_runtime_uids: list[str], + *, + sensor_config_factory: Callable[[], list[dict[str, Any]]], +) -> dict[str, Any]: + return { + "record_camera": _record_camera_event_config(sensor_config_factory), + "validation_cameras": _validation_cameras_event_config(), + "prepare_extra_attr": { + "func": "prepare_extra_attr", + "mode": "reset", + "params": { + "attrs": [ + { + "name": "object_lengths", + "mode": "callable", + "entity_uids": "all_objects", + "func_name": "compute_object_length", + "func_kwargs": { + "is_svd_frame": True, + "sample_points": 5000, + }, + }, + ] + }, + }, + "register_info_to_env": { + "func": "register_info_to_env", + "mode": "reset", + "params": { + "registry": [ + _object_registry_entry(uid) + for uid in sorted(registered_runtime_uids) + ], + "registration": "affordance_datas", + "sim_update": True, + }, + }, + } + + def _make_events_config( roles: _BasketTaskRoles, *, @@ -302,6 +346,41 @@ def _make_relative_dataset_config( } +def _make_arrangement_dataset_config( + project_name: str, + spec: _ArrangementLineSpec, +) -> dict[str, Any]: + return { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "DualUR5", + "control_freq": 25, + }, + "instruction": { + "lang": _arrangement_dataset_instruction(spec), + }, + "extra": { + "scene_type": project_name, + "task_description": spec.task_description, + "data_type": "sim", + }, + "use_videos": True, + }, + } + } + + +def 
_arrangement_dataset_instruction(spec: _ArrangementLineSpec) -> str: + ordered = ", ".join(step.runtime_uid for step in spec.steps) + return ( + "Move the selected objects to the table center and arrange them " + f"left-to-right as: {ordered}." + ) + + def _relative_dataset_instruction( spec: _RelativePlacementSpec, *, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py index a0a6dcfd..ae46bc4c 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py @@ -21,6 +21,8 @@ from typing import Any __all__ = [ + "_ArrangementLineSpec", + "_ArrangementLineStepSpec", "GeneratedActionAgentConfigPaths", "TargetReplacementSpec", "_BasketTaskRoles", @@ -119,3 +121,29 @@ class _RelativePlacementSpec: reference_is_initial_pose: bool = False release_position: list[float] | None = None high_position: list[float] | None = None + + +@dataclass(frozen=True) +class _ArrangementLineStepSpec: + source_uid: str + runtime_uid: str + slot_index: int + active_side: str + target_xy: list[float] + release_position: list[float] + high_position: list[float] + size_score: float | None = None + color: str | None = None + + +@dataclass(frozen=True) +class _ArrangementLineSpec: + table_source_uid: str + task_description: str + task_prompt_summary: str + basic_background_notes: str + order_by: str + order_direction: str + axis: str + anchor: str + steps: tuple[_ArrangementLineStepSpec, ...] 
diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 5cfa8322..e5d3f61e 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -24,6 +24,9 @@ __all__ = [ "make_agent_config", + "make_arrangement_atom_actions_prompt", + "make_arrangement_basic_background", + "make_arrangement_task_prompt", "make_basket_atom_actions_prompt", "make_basket_basic_background", "make_basket_task_prompt", @@ -82,6 +85,29 @@ class _RelativeSpecLike(_RelativePlacementLike, Protocol): basic_background_notes: str +class _ArrangementStepLike(Protocol): + source_uid: str + runtime_uid: str + slot_index: int + active_side: str + target_xy: Sequence[float] + release_position: Sequence[float] + high_position: Sequence[float] + size_score: float | None + color: str | None + + +class _ArrangementSpecLike(Protocol): + task_description: str + task_prompt_summary: str + basic_background_notes: str + order_by: str + order_direction: str + axis: str + anchor: str + steps: Sequence[_ArrangementStepLike] + + def make_agent_config() -> dict[str, Any]: return { "TaskAgent": { @@ -107,6 +133,139 @@ def make_agent_config() -> dict[str, Any]: } +def make_arrangement_task_prompt( + task_name: str, + project_name: str, + spec: _ArrangementSpecLike, +) -> str: + edge_count = len(spec.steps) * 4 + step_blocks = "\n\n".join( + _arrangement_step_prompt_block(index, step) + for index, step in enumerate(spec.steps, start=1) + ) + final_order = ", ".join( + f"`{step.runtime_uid}` at slot {step.slot_index}" for step in spec.steps + ) + return f"""Task: +{task_name}: {spec.task_prompt_summary} + +This config was generated from a simple task description by the config-stage +LLM. The execution-stage LLM must now generate the graph JSON from this prompt. 
+ +Original simple task description: +{spec.task_description} + +Arrangement plan: +- Layout axis: `{spec.axis}`. Slot 0 is the robot-view leftmost slot, and later + slots move monotonically toward robot-view right. +- Anchor: `{spec.anchor}` in the exported {project_name} environment. +- Ordering rule: `{spec.order_by}` with direction `{spec.order_direction}`. +- Final order: {final_order}. + +Generate one deterministic nominal graph with exactly {edge_count} nominal edges. +Use only the atomic action class JSON specs shown below. Do not add recovery, +monitor, search, alignment, or extra lift edges. Use `PlaceAction` for each +release-place step so lowering, gripper opening, and upward retreat remain one +atomic action. The arm not listed for a step must remain null. + +{step_blocks} + +Final state: all listed objects must rest near their assigned absolute XY slots +and remain upright. Use the exact absolute target_pose JSON specs shown above; +do not rewrite slot placement as object-referenced poses. +""" + + +def _arrangement_step_prompt_block(index: int, step: _ArrangementStepLike) -> str: + active_arm = f"{step.active_side}_arm" + active_slot = f"{step.active_side}_arm_action" + inactive_slot = f"{'right' if step.active_side == 'left' else 'left'}_arm_action" + base_edge = (index - 1) * 4 + return f"""{base_edge + 1}. Pick up `{step.runtime_uid}` for slot {step.slot_index}: + - {active_slot}: {_format_pick_up_spec(active_arm, step.runtime_uid)} + - {inactive_slot}: null + +{base_edge + 2}. Move `{step.runtime_uid}` to the high staging pose above slot {step.slot_index}: + - {active_slot}: {_format_pose_absolute_spec(active_arm, step.high_position, sample_interval=45)} + - {inactive_slot}: null + +{base_edge + 3}. Place `{step.runtime_uid}` at slot {step.slot_index}: + - {active_slot}: {_format_place_absolute_spec(active_arm, step.release_position, sample_interval=80, lift_height=_PLACE_LIFT_HEIGHT)} + - {inactive_slot}: null + +{base_edge + 4}. 
Return `{active_arm}` to its initial pose: + - {active_slot}: {_format_initial_qpos_spec(active_arm, sample_interval=30)} + - {inactive_slot}: null""" + + +def make_arrangement_basic_background( + project_name: str, + spec: _ArrangementSpecLike, +) -> str: + notes = spec.basic_background_notes or ( + "No extra scene notes were provided by the config-stage LLM." + ) + object_lines = "\n".join( + _arrangement_object_background_line(step) for step in spec.steps + ) + return f"""The scene comes from the exported {project_name} mesh environment. + +This configuration directory is for a Dual-UR5 multi-object line arrangement +task generated from a simple natural-language task description. + +The robot is a dual-UR5 composite robot with DH_PGI_140_80 parallel grippers: +- left_arm is the semantic robot-view left slot, mapped to the physical + right_arm control part. +- right_arm is the semantic robot-view right slot, mapped to the physical + left_arm control part. + +Interactive task objects and target slots: +{object_lines} + +Config-stage LLM notes: +{notes} +""" + + +def _arrangement_object_background_line(step: _ArrangementStepLike) -> str: + attrs = [] + if step.color: + attrs.append(f"color={step.color}") + if step.size_score is not None: + attrs.append(f"size_score={float(step.size_score):.6g}") + attr_text = f" ({', '.join(attrs)})" if attrs else "" + return ( + f"- {step.runtime_uid}: source `{step.source_uid}`{attr_text}, " + f"slot {step.slot_index} at xy={list(step.target_xy)}, " + f"handled by {step.active_side}_arm." + ) + + +def make_arrangement_atom_actions_prompt(spec: _ArrangementSpecLike) -> str: + blocks = "\n\n".join(_arrangement_atom_action_block(step) for step in spec.steps) + return f"""### Atomic Action Class JSON Specs for Dual-UR5 Line Arrangement + +Use only atomic action class JSON specs backed by `PickUpAction`, `MoveAction`, and +`PlaceAction`. Each object is moved to an absolute slot pose computed by the +config-stage generator. 
Keep the non-active arm null for each listed object. + +{blocks} +""" + + +def _arrangement_atom_action_block(step: _ArrangementStepLike) -> str: + active_arm = f"{step.active_side}_arm" + return f"""Object `{step.runtime_uid}` to slot {step.slot_index}: +- Pick up: + {_format_pick_up_spec(active_arm, step.runtime_uid)} +- High staging: + {_format_pose_absolute_spec(active_arm, step.high_position, sample_interval=45)} +- Place: + {_format_place_absolute_spec(active_arm, step.release_position, sample_interval=80, lift_height=_PLACE_LIFT_HEIGHT)} +- Return: + {_format_initial_qpos_spec(active_arm, sample_interval=30)}""" + + def make_relative_task_prompt( task_name: str, project_name: str, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py index b430ae3d..7204c534 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py @@ -20,15 +20,18 @@ from typing import Any from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _ArrangementLineSpec, _BasketTaskRoles, _RelativePlacementSpec, _RelativePlacementStepSpec, ) __all__ = [ + "_make_arrangement_extensions_config", "_make_extensions_config", "_make_relative_extensions_config", "_object_in_container_success", + "_validate_arrangement_bundle", "_validate_bundle", "_validate_relative_bundle", "_validate_success_uids", @@ -113,6 +116,51 @@ def _make_relative_extensions_config( } +def _make_arrangement_extensions_config(spec: _ArrangementLineSpec) -> dict[str, Any]: + return { + "agent_arm_slots": { + "left": { + "arm": "right_arm", + "eef": "right_eef", + }, + "right": { + "arm": "left_arm", + "eef": "left_eef", + }, + }, + "arm_aim_yaw_offset": { + "left": 3.141592653589793, + "right": 0.0, + }, + "gripper_open_state": [0.0], + "gripper_close_state": [0.04], + "ignore_terminations_during_agent": 
True, + "viewer_camera_uid": "cam_high", + "agent_success": _make_arrangement_success_spec(spec), + } + + +def _make_arrangement_success_spec(spec: _ArrangementLineSpec) -> dict[str, Any]: + terms: list[dict[str, Any]] = [] + for step in spec.steps: + terms.extend( + [ + { + "type": "object_xy_near", + "object": step.runtime_uid, + "target_xy": [float(step.target_xy[0]), float(step.target_xy[1])], + "tolerance": 0.05, + }, + { + "type": "object_not_fallen", + "object": step.runtime_uid, + "max_tilt": 0.9, + }, + ] + ) + return {"op": "all", "terms": terms} + + def _make_relative_success_spec( spec: _RelativePlacementSpec, *, @@ -303,6 +351,42 @@ def _validate_relative_bundle( ) +def _validate_arrangement_bundle( + bundle: Mapping[str, Any], + spec: _ArrangementLineSpec, +) -> None: + gym_config = bundle["gym_config"] + if gym_config.get("id") != "AtomicActionsAgent-v3": + raise ValueError("Generated gym config must use AtomicActionsAgent-v3.") + if gym_config.get("robot", {}).get("uid") != "DualUR5": + raise ValueError("Generated arrangement config must use DualUR5.") + + rigid_uid_list = [obj["uid"] for obj in gym_config.get("rigid_object", [])] + if len(rigid_uid_list) != len(set(rigid_uid_list)): + raise ValueError(f"Duplicate rigid object runtime uid(s): {rigid_uid_list}") + rigid_uids = set(rigid_uid_list) + background_uids = {obj["uid"] for obj in gym_config.get("background", [])} + scene_uids = rigid_uids | background_uids + required = {step.runtime_uid for step in spec.steps} + missing = required - rigid_uids + if missing: + raise ValueError( + f"Generated arrangement config missing moved rigid object(s): {missing}" + ) + + _validate_success_uids( + gym_config["env"]["extensions"]["agent_success"], + rigid_uids=rigid_uids, + scene_uids=scene_uids, + ) + registry = gym_config["env"]["events"]["register_info_to_env"]["params"]["registry"] + registered = {entry["entity_cfg"]["uid"] for entry in registry} + if not required.issubset(registered): + raise 
ValueError( + f"Arrangement config registry missing: {sorted(required - registered)}" + ) + + def _validate_success_uids( success: Mapping[str, Any], *, @@ -326,6 +410,8 @@ def _validate_success_uids( required_keys = ("object", "reference") elif success_type in {"object_axis_near", "object_coordinate_near"}: required_keys = ("object",) + elif success_type in {"object_xy_near", "object_near_xy"}: + required_keys = ("object",) elif success_type in {"object_not_fallen", "not_fallen"}: required_keys = ("object",) else: diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 7872cf66..096b530a 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -44,6 +44,10 @@ TargetReplacementSpec, generate_action_agent_config_from_project, ) +from embodichain.gen_sim.action_agent_pipeline.generation.arrangement_spec import ( + _apply_arrangement_task_response, + _arrangement_line_slot_positions, +) from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.success import ( evaluate_configured_success, ) @@ -1244,6 +1248,200 @@ def fake_call_relative_task_llm(**kwargs): assert '"obj_name":"apple_1"' in atom_actions +def test_arrangement_response_orders_explicit_color_sequence(tmp_path: Path) -> None: + scene_objects = [ + action_agent_config_generation._SceneObject( + source_uid="table", + source_role="background", + config=_mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 0.0], + ), + ), + action_agent_config_generation._SceneObject( + source_uid="cube_red", + source_role="rigid_object", + config=_mesh_object( + "cube_red", + "mesh_assets/cube/cube_red.glb", + [0.0, 0.20, 0.76], + [0.0, 0.0, 0.0], + ), + ), + action_agent_config_generation._SceneObject( + source_uid="cube_blue", + source_role="rigid_object", 
+ config=_mesh_object( + "cube_blue", + "mesh_assets/cube/cube_blue.glb", + [0.0, -0.10, 0.76], + [0.0, 0.0, 0.0], + ), + ), + action_agent_config_generation._SceneObject( + source_uid="cube_green", + source_role="rigid_object", + config=_mesh_object( + "cube_green", + "mesh_assets/cube/cube_green.glb", + [0.0, 0.00, 0.76], + [0.0, 0.0, 0.0], + ), + ), + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + + spec = _apply_arrangement_task_response( + response={ + "objects": ["cube_red", "cube_green", "cube_blue"], + "order_by": "color", + "ordered_attributes": ["red", "green", "blue"], + "object_attributes": { + "cube_blue": {"color": "blue"}, + "cube_red": {"color": "red"}, + "cube_green": {"color": "green"}, + }, + "task_prompt_summary": "Arrange the cubes red, green, blue.", + }, + table_source_uid="table", + scene_objects=scene_objects, + rigid_objects=rigid_objects, + scene_dir=tmp_path, + task_description="将红、绿、蓝三个方块按从左到右红、绿、蓝的顺序排成一行", + ) + + assert [step.source_uid for step in spec.steps] == [ + "cube_red", + "cube_green", + "cube_blue", + ] + assert [step.color for step in spec.steps] == ["red", "green", "blue"] + assert [step.slot_index for step in spec.steps] == [0, 1, 2] + assert [step.target_xy[1] for step in spec.steps] == sorted( + step.target_xy[1] for step in spec.steps + ) + + +def test_arrangement_line_slot_positions_are_centered_left_to_right() -> None: + slots = _arrangement_line_slot_positions( + anchor_xy=[0.10, -0.20], + count=3, + spacing=0.08, + line_axis="left_to_right", + ) + + assert slots == [ + [0.10, -0.28], + [0.10, -0.20], + [0.10, -0.12], + ] + + +def test_task_description_generates_size_order_arrangement_config( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_arrangement_project(project_dir) + + def fake_call_arrangement_task_llm(**kwargs): + size_by_uid = { + item["source_uid"]: item["size_score"] + for item in 
kwargs["scene_summary"] + if item["role"] == "rigid_object" + } + assert size_by_uid["cube_2"] > size_by_uid["cube_1"] > size_by_uid["cube_3"] + return { + "objects": ["cube_1", "cube_2", "cube_3"], + "order_by": "size", + "order_direction": "descending", + "anchor": "table_center", + "line_axis": "left_to_right", + "task_prompt_summary": ( + "Move the three cubes to the table center and arrange them " + "from large to small left-to-right." + ), + "basic_background_notes": "All three cubes are movable task objects.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_arrangement_task_llm", + fake_call_arrangement_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_arrangement_agent", + task_name="BlocksRankingSize", + task_description="桌上有三个颜色随机的方块,将它们移动到桌子中央,并按从左到右由大到小排列。", + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} + assert set(rigid_objects) == {"cube_1", "cube_2", "cube_3"} + assert {obj["body_type"] for obj in rigid_objects.values()} == {"dynamic"} + assert rigid_objects["cube_2"]["body_scale"] == [1.0, 1.0, 1.0] + + assert _stable_summary(paths.summary) == { + "mode": "arrangement_line", + "axis": "left_to_right", + "anchor": "table_center", + "order_by": "size", + "order_direction": "descending", + "placements": [ + { + "object": "cube_2", + "source_uid": "cube_2", + "slot_index": 0, + "active_arm": "left_arm", + "target_xy": [0.0, -0.07], + }, + { + "object": "cube_1", + "source_uid": "cube_1", + "slot_index": 1, + "active_arm": "right_arm", + "target_xy": [0.0, 0.0], + }, + { + "object": "cube_3", + "source_uid": "cube_3", + "slot_index": 2, + "active_arm": "right_arm", + "target_xy": [0.0, 0.07], + }, + ], + } + + success = gym_config["env"]["extensions"]["agent_success"] + assert success["op"] == "all" + xy_targets = { + (term["object"], 
tuple(term["target_xy"])) + for term in success["terms"] + if term["type"] == "object_xy_near" + } + assert xy_targets == { + ("cube_2", (0.0, -0.07)), + ("cube_1", (0.0, 0.0)), + ("cube_3", (0.0, 0.07)), + } + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert "Generate one deterministic nominal graph with exactly 12 nominal edges" in ( + task_prompt + ) + assert task_prompt.count('"atomic_action_class":"PickUpAction"') == 3 + assert task_prompt.count('"atomic_action_class":"PlaceAction"') == 3 + assert task_prompt.count('"reference":"absolute"') >= 6 + assert atom_actions.count('"atomic_action_class":"PickUpAction"') == 3 + assert atom_actions.count('"atomic_action_class":"PlaceAction"') == 3 + + def test_dual_inside_same_container_uses_container_long_axis_slots( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -1715,6 +1913,62 @@ def _write_demo3_role_project(project_dir: Path) -> None: ) +def _write_arrangement_project(project_dir: Path) -> None: + _write_minimal_glb( + project_dir / "mesh_assets/table/table_0.glb", + [(-0.60, -0.40, 0.0), (0.60, -0.40, 0.0), (0.0, 0.40, 0.0)], + ) + for uid, size in { + "cube_1": 0.04, + "cube_2": 0.06, + "cube_3": 0.03, + }.items(): + _write_minimal_glb( + project_dir / f"mesh_assets/cube/{uid}/{uid}.glb", + [ + (-size / 2.0, -size / 2.0, 0.0), + (size / 2.0, -size / 2.0, 0.0), + (0.0, size / 2.0, size), + ], + ) + + gym_config = { + "id": "Image2Tabletop-1790000000-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 0.0], + ) + ], + "rigid_object": [ + _mesh_object( + "cube_1", + "mesh_assets/cube/cube_1/cube_1.glb", + [0.0, 0.08, 0.76], + [0.0, 0.0, 0.0], + ), + _mesh_object( + "cube_2", + "mesh_assets/cube/cube_2/cube_2.glb", + [0.0, -0.08, 0.76], + [0.0, 0.0, 0.0], + ), + _mesh_object( + "cube_3", + "mesh_assets/cube/cube_3/cube_3.glb", + [0.0, 0.16, 0.76], + [0.0, 0.0, 
0.0], + ), + ], + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + def _mesh_object( uid: str, fpath: str, From e75e3f278c689eff2ec26b850fb8624f421c18e5 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Fri, 26 Jun 2026 10:37:08 +0800 Subject: [PATCH 19/33] Fix dual UR5 robot view semantics --- .../cli/target_replacements.py | 2 +- .../generation/prompt_builders.py | 24 ++-- .../generation/relative_geometry.py | 4 +- .../generation/relative_spec.py | 6 +- .../generation/role_refinement.py | 4 +- .../generation/scene_objects.py | 2 +- .../generation/success_specs.py | 46 +++----- .../generation/templates/dual_ur5_robot.json | 4 +- .../runtime/task_graph.py | 3 +- .../test_backend_atomic_runtime.py | 17 ++- .../test_ur5_basket_config_generation.py | 106 +++++++++--------- 11 files changed, 107 insertions(+), 111 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/cli/target_replacements.py b/embodichain/gen_sim/action_agent_pipeline/cli/target_replacements.py index c12ca8ec..f2a9e36e 100644 --- a/embodichain/gen_sim/action_agent_pipeline/cli/target_replacements.py +++ b/embodichain/gen_sim/action_agent_pipeline/cli/target_replacements.py @@ -179,7 +179,7 @@ def _auto_replacement_source_uid( "to disambiguate." 
) - selected = positioned_objects[replacement_number - 1] + selected = positioned_objects[-replacement_number] source_uid = selected["object"]["uid"] print( f"Resolved {option_name} auto source -> {source_uid!r} " diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index e5d3f61e..b329df35 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -35,18 +35,18 @@ "make_relative_task_prompt", ] -_BASKET_LEFT_RELEASE_OFFSET_Y = -0.04 -_BASKET_RIGHT_RELEASE_OFFSET_Y = 0.04 +_BASKET_LEFT_RELEASE_OFFSET_Y = 0.04 +_BASKET_RIGHT_RELEASE_OFFSET_Y = -0.04 _PLACE_LIFT_HEIGHT = 0.10 _RELATIVE_COORDINATE_CONVENTION = """Coordinate convention for relative placement: -- `left_of` means negative world y relative to the reference object. -- `right_of` means positive world y relative to the reference object. -- `front_of` means negative world x relative to the reference object. -- `behind` means positive world x relative to the reference object. -- `front_left_of` combines negative world x and negative world y. -- `back_left_of` combines positive world x and negative world y. -- `front_right_of` combines negative world x and positive world y. -- `back_right_of` combines positive world x and positive world y. +- `left_of` means positive world y relative to the reference object. +- `right_of` means negative world y relative to the reference object. +- `front_of` means positive world x relative to the reference object. +- `behind` means negative world x relative to the reference object. +- `front_left_of` combines positive world x and positive world y. +- `back_left_of` combines negative world x and positive world y. +- `front_right_of` combines positive world x and negative world y. +- `back_right_of` combines negative world x and negative world y. 
- `inside` uses generated container slot offsets; multiple objects sharing a container are distributed along the container XY long axis. - `on` uses the reference object's xy center.""" @@ -793,9 +793,9 @@ def make_basket_basic_background( The interactive objects are: - {roles.left_target_runtime_uid}: the {left_target_text} mesh initially on the - negative-y side (source object {roles.left_target_source_uid}). + positive-y side (source object {roles.left_target_source_uid}). - {roles.right_target_runtime_uid}: the {right_target_text} mesh initially on the - positive-y side (source object {roles.right_target_source_uid}). + negative-y side (source object {roles.right_target_source_uid}). - {roles.container_runtime_uid}: the target container near the center of the table (source object {roles.container_source_uid}). diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py index 720f189e..fcf7f529 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py @@ -54,8 +54,8 @@ _CONTAINER_SLOT_AXIS_TIE_RATIO = 0.10 _STAGING_Z_DELTA = 0.10 _ON_RELEASE_Z_OFFSET = 0.2 -_ROBOT_VIEW_LEFT_WORLD_Y_SIGN = -1.0 -_ROBOT_VIEW_FRONT_WORLD_X_SIGN = -1.0 +_ROBOT_VIEW_LEFT_WORLD_Y_SIGN = 1.0 +_ROBOT_VIEW_FRONT_WORLD_X_SIGN = 1.0 def _relative_release_offset(relation: str) -> list[float]: diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py index ad182c97..e539b90f 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py @@ -275,9 +275,9 @@ def _call_relative_task_llm( "instructions such as 右臂, 右机械臂, right arm, or right UR5; use " "arm='auto' when the task does not specify an 
arm.\n" "- For Chinese/English left/right/front/back, use the relation enums " - "from the rotated robot-view perspective. front_of means negative " - "world-x; behind means positive world-x; left_of means negative " - "world-y; right_of means positive world-y. Diagonal relations combine " + "from the rotated robot-view perspective. front_of means positive " + "world-x; behind means negative world-x; left_of means positive " + "world-y; right_of means negative world-y. Diagonal relations combine " "both axes: front_left_of, back_left_of, front_right_of, back_right_of.\n" "- If the task says to release an object above a basket/container so it " "falls into it, use goal_relation='inside'.\n" diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/role_refinement.py b/embodichain/gen_sim/action_agent_pipeline/generation/role_refinement.py index 70d20cfa..48ea8b5a 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/role_refinement.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/role_refinement.py @@ -115,8 +115,8 @@ def _call_role_llm( "Return only one JSON object with keys: container_object, " "left_target_object, right_target_object, target_noun, " "container_runtime_uid. Use only source_uid values from the scene. 
The " - "rotated robot-view left target starts on the negative-y side, and the " - "rotated robot-view right target starts on the positive-y side.\n\n" + "rotated robot-view left target starts on the positive-y side, and the " + "rotated robot-view right target starts on the negative-y side.\n\n" f"Project: {project_name}\n" f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}\n" f"Default roles:\n{json.dumps(default_roles, ensure_ascii=False, indent=2)}" diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/scene_objects.py b/embodichain/gen_sim/action_agent_pipeline/generation/scene_objects.py index e188d89a..dac184cc 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/scene_objects.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/scene_objects.py @@ -249,7 +249,7 @@ def _side_axis_value(obj: _SceneObject) -> float: def _position_side_axis_value(position: list[float]) -> float: - return float(position[_DUAL_UR5_SIDE_AXIS_INDEX]) + return -float(position[_DUAL_UR5_SIDE_AXIS_INDEX]) def _arm_side_for_position(position: list[float]) -> str: diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py index 7204c534..a8a6a365 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py @@ -40,20 +40,7 @@ def _make_extensions_config(roles: _BasketTaskRoles) -> dict[str, Any]: return { - "agent_arm_slots": { - "left": { - "arm": "right_arm", - "eef": "right_eef", - }, - "right": { - "arm": "left_arm", - "eef": "left_eef", - }, - }, - "arm_aim_yaw_offset": { - "left": 3.141592653589793, - "right": 0.0, - }, + **_make_dual_ur5_arm_slot_config(), "gripper_open_state": [0.0], "gripper_close_state": [0.04], "ignore_terminations_during_agent": True, @@ -85,11 +72,7 @@ def _object_in_container_success(object_uid: str, 
container_uid: str) -> dict[st } -def _make_relative_extensions_config( - spec: _RelativePlacementSpec, - *, - side_relation_xy_offsets: Callable[[str], tuple[float, float]], -) -> dict[str, Any]: +def _make_dual_ur5_arm_slot_config() -> dict[str, Any]: return { "agent_arm_slots": { "left": { @@ -105,6 +88,16 @@ def _make_relative_extensions_config( "left": 3.141592653589793, "right": 0.0, }, + } + + +def _make_relative_extensions_config( + spec: _RelativePlacementSpec, + *, + side_relation_xy_offsets: Callable[[str], tuple[float, float]], +) -> dict[str, Any]: + return { + **_make_dual_ur5_arm_slot_config(), "gripper_open_state": [0.0], "gripper_close_state": [0.04], "ignore_terminations_during_agent": True, @@ -118,20 +111,7 @@ def _make_relative_extensions_config( def _make_arrangement_extensions_config(spec: _ArrangementLineSpec) -> dict[str, Any]: return { - "agent_arm_slots": { - "left": { - "arm": "right_arm", - "eef": "right_eef", - }, - "right": { - "arm": "left_arm", - "eef": "left_eef", - }, - }, - "arm_aim_yaw_offset": { - "left": 3.141592653589793, - "right": 0.0, - }, + **_make_dual_ur5_arm_slot_config(), "gripper_open_state": [0.0], "gripper_close_state": [0.04], "ignore_terminations_during_agent": True, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json b/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json index b74bf850..d3c6abc9 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json +++ b/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json @@ -33,8 +33,8 @@ } ] }, - "init_pos": [2.0, 0.0, 0.0], - "init_rot": [0.0, 0.0, -90.0], + "init_pos": [-2.0, 0.0, 0.0], + "init_rot": [0.0, 0.0, 90.0], "init_qpos": [ 0, 0, diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py b/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py index 51cff7c3..96e61155 100644 --- 
a/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py @@ -17,6 +17,7 @@ from __future__ import annotations from collections import defaultdict +from collections.abc import Sequence from dataclasses import dataclass from typing import Any @@ -51,7 +52,7 @@ class AgentGraphEdge: right_arm_action: Any = None -class ExecutedActionList: +class ExecutedActionList(Sequence[Any]): """Action sequence already executed online by the graph runtime.""" already_executed = True diff --git a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py index 58cea8fa..b74a18c5 100644 --- a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py +++ b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py @@ -16,6 +16,7 @@ from __future__ import annotations +from collections.abc import Sequence from types import SimpleNamespace import pytest @@ -35,7 +36,10 @@ from embodichain.gen_sim.action_agent_pipeline.runtime.coacd_cache_bridge import ( GraspCollisionCachePreparationError, ) -from embodichain.gen_sim.action_agent_pipeline.runtime.task_graph import AgentTaskGraph +from embodichain.gen_sim.action_agent_pipeline.runtime.task_graph import ( + AgentTaskGraph, + ExecutedActionList, +) from embodichain.lab.sim.atomic_actions import ( MoveActionCfg, PickUpActionCfg, @@ -323,6 +327,17 @@ def test_agent_task_graph_run_rejects_none_env() -> None: graph.run(env=None) +def test_executed_action_list_is_sequence() -> None: + actions = [torch.zeros(1, 2), torch.ones(1, 2)] + action_list = ExecutedActionList(actions) + + assert isinstance(action_list, Sequence) + assert action_list.already_executed + assert len(action_list) == 2 + assert action_list[1] is actions[1] + assert list(action_list) == actions + + def test_resolve_arm_side_rejects_unavailable_requested_arm() -> None: env = _FakeEnv() env.right_arm_joints = 
[] diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 096b530a..08cead9a 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -65,8 +65,8 @@ def test_action_agent_templates_load_fresh_json_copies() -> None: first_sensors[0]["uid"] = "mutated_camera" first_lights["direct"][0]["uid"] = "mutated_light" - assert second_robot["init_pos"] == pytest.approx([2.0, 0.0, 0.84]) - assert first_robot["init_pos"] == pytest.approx([2.0, 0.0, 0.42]) + assert second_robot["init_pos"] == pytest.approx([-2.0, 0.0, 0.84]) + assert first_robot["init_pos"] == pytest.approx([-2.0, 0.0, 0.42]) assert second_robot["control_parts"]["left_arm"] == ["LEFT_JOINT[1-6]"] assert second_sensors[0]["uid"] == "cam_high" assert second_lights["direct"][0]["uid"] == "main_light" @@ -109,9 +109,9 @@ def test_action_agent_config_generator_uses_parallel_handoff( - action_agent_config_generation._DUAL_UR5_ARM_COMPONENT_Z ) assert gym_config["robot"]["init_pos"] == pytest.approx( - [2.0, 0.0, expected_robot_init_z] + [-2.0, 0.0, expected_robot_init_z] ) - assert gym_config["robot"]["init_rot"] == [0.0, 0.0, -90.0] + assert gym_config["robot"]["init_rot"] == [0.0, 0.0, 90.0] extensions = gym_config["env"]["extensions"] assert extensions["agent_arm_slots"]["left"] == { "arm": "right_arm", @@ -145,23 +145,23 @@ def test_action_agent_config_generator_uses_parallel_handoff( assert "positive-x side" not in basic_background left_high_offset_spec = ( '"robot_name":"left_arm","control":"arm","target_pose":{"reference":"object",' - '"obj_name":"wicker_basket","offset":[0.0,-0.04,0.22]' + '"obj_name":"wicker_basket","offset":[0.0,0.04,0.22]' ) right_high_offset_spec = ( '"robot_name":"right_arm","control":"arm","target_pose":{"reference":"object",' - 
'"obj_name":"wicker_basket","offset":[0.0,0.04,0.22]' + '"obj_name":"wicker_basket","offset":[0.0,-0.04,0.22]' ) assert left_high_offset_spec in task_prompt assert right_high_offset_spec in task_prompt assert ( '"atomic_action_class":"PlaceAction","robot_name":"left_arm","control":"arm",' '"target_pose":{"reference":"object","obj_name":"wicker_basket",' - '"offset":[0.0,-0.04,0.12]}' in task_prompt + '"offset":[0.0,0.04,0.12]}' in task_prompt ) assert ( '"atomic_action_class":"PlaceAction","robot_name":"right_arm","control":"arm",' '"target_pose":{"reference":"object","obj_name":"wicker_basket",' - '"offset":[0.0,0.04,0.12]}' in task_prompt + '"offset":[0.0,-0.04,0.12]}' in task_prompt ) assert '"offset":[-0.04,0.0,0.22]' not in task_prompt assert '"offset":[0.04,0.0,0.22]' not in task_prompt @@ -206,8 +206,8 @@ def test_generator_normalizes_glb_meshes_and_preserves_source_rot( assert background_objects["table"]["init_rot"] == [0.0, 0.0, 180.0] assert background_objects["wicker_basket"]["init_rot"] == [0.0, 0.0, 180.0] - assert rigid_objects["right_apple"]["init_rot"] == [0.0, 0.0, 140.0] - assert rigid_objects["left_apple"]["init_rot"] == [0.0, 0.0, 160.0] + assert rigid_objects["left_apple"]["init_rot"] == [0.0, 0.0, 140.0] + assert rigid_objects["right_apple"]["init_rot"] == [0.0, 0.0, 160.0] for obj_config in [ background_objects["table"], background_objects["wicker_basket"], @@ -399,24 +399,24 @@ def test_target_replacements_can_sync_runtime_names( background_objects = {obj["uid"]: obj for obj in gym_config["background"]} - assert set(rigid_objects) == {"left_orange", "right_apple"} + assert set(rigid_objects) == {"left_apple", "right_orange"} assert "wicker_basket" in background_objects assert background_objects["wicker_basket"]["body_type"] == "kinematic" - _assert_normalized_obj_path(rigid_objects["left_orange"]["shape"]["fpath"]) - _assert_normalized_obj_path(rigid_objects["right_apple"]["shape"]["fpath"]) + 
_assert_normalized_obj_path(rigid_objects["left_apple"]["shape"]["fpath"]) + _assert_normalized_obj_path(rigid_objects["right_orange"]["shape"]["fpath"]) success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] assert {term["object"] for term in success_terms} == { - "left_orange", - "right_apple", + "left_apple", + "right_orange", } task_prompt = paths.task_prompt.read_text(encoding="utf-8") basic_background = paths.basic_background.read_text(encoding="utf-8") - assert "the left orange and right apple into the wicker_basket" in task_prompt - assert "left_arm must only manipulate `left_orange`" in task_prompt - assert "- left_orange: the orange mesh initially" in basic_background - assert "- right_apple: the apple mesh initially" in basic_background + assert "the left apple and right orange into the wicker_basket" in task_prompt + assert "right_arm must only manipulate `right_orange`" in task_prompt + assert "- left_apple: the apple mesh initially" in basic_background + assert "- right_orange: the orange mesh initially" in basic_background def test_pipeline_auto_replacement_uses_rotated_robot_view_order() -> None: @@ -433,7 +433,7 @@ def test_pipeline_auto_replacement_uses_rotated_robot_view_order() -> None: replacement_number=1, option_name="--target_replacement1", ) - == "bread_2" + == "bread_1" ) assert ( target_replacements_cli._auto_replacement_source_uid( @@ -441,7 +441,7 @@ def test_pipeline_auto_replacement_uses_rotated_robot_view_order() -> None: replacement_number=2, option_name="--target_replacement2", ) - == "bread_1" + == "bread_2" ) @@ -555,7 +555,7 @@ def fake_call_relative_task_llm(**kwargs): for term in success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("y", -0.16) in axis_terms + assert ("y", 0.16) in axis_terms assert ("x", 0.0) in axis_terms assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] @@ -566,10 +566,10 @@ def fake_call_relative_task_llm(**kwargs): "Generate one deterministic 
nominal graph with exactly 4 nominal edges" in task_prompt ) - assert '"atomic_action_class":"PickUpAction","robot_name":"left_arm"' in task_prompt - assert '"atomic_action_class":"PlaceAction","robot_name":"left_arm"' in task_prompt + assert '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in task_prompt + assert '"atomic_action_class":"PlaceAction","robot_name":"right_arm"' in task_prompt assert '"obj_name":"apple_2"' in task_prompt - assert "right_arm_action: null" in task_prompt + assert "left_arm_action: null" in task_prompt assert "Generate exactly 10 nominal edges" not in task_prompt assert _stable_summary(paths.summary) == { @@ -577,8 +577,8 @@ def fake_call_relative_task_llm(**kwargs): "moved_object": "apple_2", "reference_object": "wicker_basket", "relation": "left_of", - "active_arm": "left_arm", - "release_offset": [0.0, -0.16, 0.12], + "active_arm": "right_arm", + "release_offset": [0.0, 0.16, 0.12], } @@ -634,13 +634,13 @@ def fake_call_relative_task_llm(**kwargs): for term in success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("x", -0.16) in axis_terms + assert ("x", 0.16) in axis_terms assert ("y", 0.0) in axis_terms task_prompt = paths.task_prompt.read_text(encoding="utf-8") atom_actions = paths.atom_actions.read_text(encoding="utf-8") - assert '"offset":[-0.16,0.0,0.22]' in task_prompt - assert '"offset":[-0.16,0.0,0.22]' in atom_actions + assert '"offset":[0.16,0.0,0.22]' in task_prompt + assert '"offset":[0.16,0.0,0.22]' in atom_actions assert _stable_summary(paths.summary) == { "mode": "relative_placement", @@ -648,7 +648,7 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "apple_2", "relation": "front_of", "active_arm": "right_arm", - "release_offset": [-0.16, 0.0, 0.12], + "release_offset": [0.16, 0.0, 0.12], } @@ -714,8 +714,8 @@ def fake_call_relative_task_llm(**kwargs): rigid_objects = {obj["uid"]: obj for obj in gym_config["rigid_object"]} assert set(rigid_objects) == {"chip_bag"} 
initial_position = rigid_objects["chip_bag"]["init_pos"] - expected_x = round(initial_position[0] - 0.16, 6) - expected_y = round(initial_position[1] - 0.16, 6) + expected_x = round(initial_position[0] + 0.16, 6) + expected_y = round(initial_position[1] + 0.16, 6) success = gym_config["env"]["extensions"]["agent_success"] assert success["op"] == "all" @@ -739,7 +739,7 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "chip_bag", "relation": "front_left_of", "active_arm": "left_arm", - "release_offset": [-0.16, -0.16, 0.12], + "release_offset": [0.16, 0.16, 0.12], } @@ -784,34 +784,34 @@ def fake_call_relative_task_llm(**kwargs): for term in success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("x", -0.16) in axis_terms - assert ("y", 0.16) in axis_terms + assert ("x", 0.16) in axis_terms + assert ("y", -0.16) in axis_terms task_prompt = paths.task_prompt.read_text(encoding="utf-8") - assert '"offset":[-0.16,0.16,0.12]' in task_prompt - assert _stable_summary(paths.summary)["release_offset"] == [-0.16, 0.16, 0.12] + assert '"offset":[0.16,-0.16,0.12]' in task_prompt + assert _stable_summary(paths.summary)["release_offset"] == [0.16, -0.16, 0.12] def test_side_relation_offsets_use_robot_view_front_back_convention() -> None: assert action_agent_config_generation._side_relation_xy_offsets("front_of") == ( - -0.16, + 0.16, 0.0, ) assert action_agent_config_generation._side_relation_xy_offsets("behind") == ( - 0.16, + -0.16, 0.0, ) assert action_agent_config_generation._side_relation_xy_offsets( "front_left_of" ) == ( - -0.16, - -0.16, + 0.16, + 0.16, ) assert action_agent_config_generation._side_relation_xy_offsets( "back_right_of" ) == ( - 0.16, - 0.16, + -0.16, + -0.16, ) @@ -871,7 +871,7 @@ def fake_call_relative_task_llm(**kwargs): assert success["object"] == "apple_1" assert success["container"] == "wicker_basket" assert paths.summary["relation"] == "inside" - assert paths.summary["active_arm"] == "right_arm" + assert 
paths.summary["active_arm"] == "left_arm" assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] @@ -1185,8 +1185,8 @@ def fake_call_relative_task_llm(**kwargs): for term in placement_success["terms"] if term["type"] == "object_axis_offset_near" } - assert ("apple_2", "y", -0.16) in axis_terms - assert ("apple_1", "y", 0.16) in axis_terms + assert ("apple_2", "y", 0.16) in axis_terms + assert ("apple_1", "y", -0.16) in axis_terms attr_names = { attr["name"] @@ -1202,14 +1202,14 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "wicker_basket", "relation": "left_of", "active_arm": "left_arm", - "release_offset": [0.0, -0.16, 0.12], + "release_offset": [0.0, 0.16, 0.12], }, { "moved_object": "apple_1", "reference_object": "wicker_basket", "relation": "right_of", "active_arm": "right_arm", - "release_offset": [0.0, 0.16, 0.12], + "release_offset": [0.0, -0.16, 0.12], }, ], } @@ -1397,21 +1397,21 @@ def fake_call_arrangement_task_llm(**kwargs): "object": "cube_2", "source_uid": "cube_2", "slot_index": 0, - "active_arm": "left_arm", + "active_arm": "right_arm", "target_xy": [0.0, -0.07], }, { "object": "cube_1", "source_uid": "cube_1", "slot_index": 1, - "active_arm": "right_arm", + "active_arm": "left_arm", "target_xy": [0.0, 0.0], }, { "object": "cube_3", "source_uid": "cube_3", "slot_index": 2, - "active_arm": "right_arm", + "active_arm": "left_arm", "target_xy": [0.0, 0.07], }, ], @@ -1623,7 +1623,7 @@ def fake_call_relative_task_llm(**kwargs): ) active_arms = [placement["active_arm"] for placement in paths.summary["placements"]] - assert active_arms == ["left_arm", "right_arm"] + assert active_arms == ["right_arm", "left_arm"] gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) assert "agent_grasp_pose_overrides" not in gym_config["env"]["extensions"] From adcd3ba800c3ee12da9fe1f0e1559508a4f53621 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Fri, 26 Jun 2026 
11:28:32 +0800 Subject: [PATCH 20/33] fix: adapt action-agent runtime to typed atomic actions --- .../runtime/atom_actions.py | 80 +++++++++---------- .../test_backend_atomic_runtime.py | 74 ++++++++++++----- 2 files changed, 91 insertions(+), 63 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py index 2db470dc..3157dbf6 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -36,13 +36,16 @@ ) from embodichain.lab.sim.atomic_actions import ( AntipodalAffordance, - MoveAction, - MoveActionCfg, + EndEffectorPoseTarget, + GraspTarget, + MoveEndEffector, + MoveEndEffectorCfg, ObjectSemantics, - PickUpAction, - PickUpActionCfg, - PlaceAction, - PlaceActionCfg, + PickUp, + PickUpCfg, + Place, + PlaceCfg, + WorldState, ) from embodichain.lab.sim.planners import MotionGenerator, MotionGenCfg, ToppraPlannerCfg from embodichain.toolkits.graspkit.pg_grasp import ( @@ -88,9 +91,9 @@ ATOMIC_ACTION_REGISTRY = { - "PickUpAction": (PickUpAction, PickUpActionCfg), - "MoveAction": (MoveAction, MoveActionCfg), - "PlaceAction": (PlaceAction, PlaceActionCfg), + "PickUpAction": (PickUp, PickUpCfg), + "MoveAction": (MoveEndEffector, MoveEndEffectorCfg), + "PlaceAction": (Place, PlaceCfg), } @@ -397,28 +400,25 @@ def execute_atomic_action( return action_np target = _resolve_target(env, spec, runtime_kwargs) - is_left, arm_part, hand_part, arm_joints, eef_joints = _select_arm_parts( + _, arm_part, hand_part, arm_joints, eef_joints = _select_arm_parts( env, spec.robot_name ) cfg = _build_action_cfg(env, spec, arm_part, hand_part, len(eef_joints)) - start_qpos = _resolve_action_start_qpos( - env, - spec, - is_left=is_left, - arm_joints=arm_joints, - eef_joints=eef_joints, - ) + target = _build_typed_target(spec, target) + state = WorldState(last_qpos=env.robot.get_qpos().clone()) 
action_cls = _get_atomic_action_class(spec.atomic_action_class) action = action_cls(motion_generator=_make_motion_generator(env), cfg=cfg) - is_success, trajectory, joint_ids = action.execute( + result = action.execute( target=target, - start_qpos=start_qpos, + state=state, ) - if not is_success: + if not result.success: raise RuntimeError( f"Atomic action failed: atomic_action_class={spec.atomic_action_class}, " f"robot_name={spec.robot_name}, target={_target_summary(spec)}." ) + trajectory = result.trajectory[:, :, arm_joints + eef_joints] + joint_ids = arm_joints + eef_joints action_np = _trajectory_to_agent_action( env, @@ -656,6 +656,16 @@ def _get_atomic_action_class(atomic_action_class: str): return action_class +def _build_typed_target(spec: AtomicActionSpec, target): + if spec.atomic_action_class == "PickUpAction": + return GraspTarget(semantics=target) + if spec.atomic_action_class == "PlaceAction": + return EndEffectorPoseTarget(xpos=target) + if spec.atomic_action_class == "MoveAction": + return EndEffectorPoseTarget(xpos=target) + raise ValueError(f"Unsupported atomic action class: {spec.atomic_action_class}.") + + def _build_action_cfg( env, spec: AtomicActionSpec, @@ -670,50 +680,32 @@ def _build_action_cfg( if spec.atomic_action_class == "PickUpAction": if spec.control != "arm": raise ValueError("PickUpAction atomic action requires control='arm'.") - return PickUpActionCfg( + return PickUpCfg( control_part=arm_part, hand_control_part=hand_part, hand_open_qpos=_state_to_hand_qpos(env.open_state, hand_dof, device), hand_close_qpos=_state_to_hand_qpos(env.close_state, hand_dof, device), - **_cfg_supported_kwargs(PickUpActionCfg, cfg_values), + **_cfg_supported_kwargs(PickUpCfg, cfg_values), ) if spec.atomic_action_class == "PlaceAction": if spec.control != "arm": raise ValueError("PlaceAction atomic action requires control='arm'.") - return PlaceActionCfg( + return PlaceCfg( control_part=arm_part, hand_control_part=hand_part, 
hand_open_qpos=_state_to_hand_qpos(env.open_state, hand_dof, device), hand_close_qpos=_state_to_hand_qpos(env.close_state, hand_dof, device), - **_cfg_supported_kwargs(PlaceActionCfg, cfg_values), + **_cfg_supported_kwargs(PlaceCfg, cfg_values), ) control_part = arm_part if spec.control == "arm" else hand_part - return MoveActionCfg( + return MoveEndEffectorCfg( control_part=control_part, - **_cfg_supported_kwargs(MoveActionCfg, cfg_values), + **_cfg_supported_kwargs(MoveEndEffectorCfg, cfg_values), ) -def _resolve_action_start_qpos( - env, - spec: AtomicActionSpec, - *, - is_left: bool, - arm_joints: list[int], - eef_joints: list[int], -): - if spec.control == "hand": - _, _, _, _, current_gripper_state = get_arm_states(env, spec.robot_name) - return _state_to_hand_qpos( - current_gripper_state, - len(eef_joints), - env.robot.device, - ).reshape(1, len(eef_joints)) - return _current_arm_qpos(env, is_left, arm_joints) - - def _resolve_target(env, spec: AtomicActionSpec, runtime_kwargs: dict[str, Any]): if spec.atomic_action_class == "PickUpAction": return _resolve_pickup_target(env, spec, runtime_kwargs) diff --git a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py index b74a18c5..f6b79445 100644 --- a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py +++ b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py @@ -41,9 +41,13 @@ ExecutedActionList, ) from embodichain.lab.sim.atomic_actions import ( - MoveActionCfg, - PickUpActionCfg, - PlaceActionCfg, + ActionResult, + EndEffectorPoseTarget, + GraspTarget, + MoveEndEffectorCfg, + PickUpCfg, + PlaceCfg, + WorldState, ) @@ -155,23 +159,53 @@ def __init__(self, motion_generator, cfg): } ) - def execute(self, target, start_qpos=None, **kwargs): + def execute(self, target, state, **kwargs): if self.capture is not None: - self.capture[-1].update({"target": target, "start_qpos": start_qpos}) + 
self.capture[-1].update({"target": target, "state": state}) if self.cfg.name in {"pick_up", "place"}: trajectory = torch.tensor( - [[[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]], dtype=torch.float32 + [ + [ + [0.1, 0.2, 0.3, 0.0, 0.0, 0.0], + [0.2, 0.3, 0.4, 0.0, 0.0, 0.0], + ] + ], + dtype=torch.float32, ) - return ( - True, - trajectory, - [0, 1, 2] if "left" in self.cfg.control_part else [3, 4, 5], + return ActionResult( + success=True, + trajectory=trajectory, + next_state=WorldState(last_qpos=trajectory[:, -1, :]), ) if self.cfg.control_part.endswith("eef"): - trajectory = torch.tensor([[[0.0], [0.05]]], dtype=torch.float32) - return True, trajectory, [2 if "left" in self.cfg.control_part else 5] - trajectory = torch.tensor([[[0.1, 0.2], [0.2, 0.3]]], dtype=torch.float32) - return True, trajectory, [0, 1] if "left" in self.cfg.control_part else [3, 4] + trajectory = torch.tensor( + [ + [ + [0.1, 0.2, 0.0, 0.0, 0.0, 0.0], + [0.1, 0.2, 0.05, 0.0, 0.0, 0.0], + ] + ], + dtype=torch.float32, + ) + return ActionResult( + success=True, + trajectory=trajectory, + next_state=WorldState(last_qpos=trajectory[:, -1, :]), + ) + trajectory = torch.tensor( + [ + [ + [0.1, 0.2, 0.0, 0.0, 0.0, 0.0], + [0.2, 0.3, 0.0, 0.0, 0.0, 0.0], + ] + ], + dtype=torch.float32, + ) + return ActionResult( + success=True, + trajectory=trajectory, + next_state=WorldState(last_qpos=trajectory[:, -1, :]), + ) @pytest.fixture(autouse=True) @@ -401,10 +435,11 @@ def test_object_referenced_pose_builds_move_cfg_and_pose_target(monkeypatch) -> ) assert action.shape == (2, 3) - assert isinstance(capture[0]["cfg"], MoveActionCfg) + assert isinstance(capture[0]["cfg"], MoveEndEffectorCfg) assert capture[0]["cfg"].control_part == "left_arm" assert capture[0]["cfg"].sample_interval == 12 - assert capture[0]["target"][:3, 3].tolist() == pytest.approx([0.5, 0.0, 0.4]) + assert isinstance(capture[0]["target"], EndEffectorPoseTarget) + assert capture[0]["target"].xpos[:3, 3].tolist() == pytest.approx([0.5, 0.0, 0.4]) 
def test_gripper_state_qpos_target_interpolates_hand_action(monkeypatch) -> None: @@ -510,11 +545,12 @@ def test_target_object_builds_pick_up_cfg(monkeypatch) -> None: allow_grasp_annotation=True, ) - assert isinstance(capture[0]["cfg"], PickUpActionCfg) + assert isinstance(capture[0]["cfg"], PickUpCfg) assert capture[0]["cfg"].control_part == "left_arm" assert capture[0]["cfg"].hand_control_part == "left_eef" assert capture[0]["cfg"].pre_grasp_distance == pytest.approx(0.07) - assert capture[0]["target"].label == "apple" + assert isinstance(capture[0]["target"], GraspTarget) + assert capture[0]["target"].semantics.label == "apple" def test_place_action_builds_place_cfg(monkeypatch) -> None: @@ -549,7 +585,7 @@ def test_place_action_builds_place_cfg(monkeypatch) -> None: ) assert action.shape == (2, 3) - assert isinstance(capture[0]["cfg"], PlaceActionCfg) + assert isinstance(capture[0]["cfg"], PlaceCfg) assert capture[0]["cfg"].control_part == "left_arm" assert capture[0]["cfg"].lift_height == pytest.approx(0.06) From c5a23a7fc1e9686b097b2462d3315684fc43e3e5 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Fri, 26 Jun 2026 11:48:08 +0800 Subject: [PATCH 21/33] style: format action-agent config generation test --- .../test_ur5_basket_config_generation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 08cead9a..00bd9df9 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -566,7 +566,9 @@ def fake_call_relative_task_llm(**kwargs): "Generate one deterministic nominal graph with exactly 4 nominal edges" in task_prompt ) - assert '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in task_prompt + assert ( + 
'"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in task_prompt + ) assert '"atomic_action_class":"PlaceAction","robot_name":"right_arm"' in task_prompt assert '"obj_name":"apple_2"' in task_prompt assert "left_arm_action: null" in task_prompt From 0daf471a6fe7a268377985e59a0e924988500c09 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Fri, 26 Jun 2026 12:22:48 +0800 Subject: [PATCH 22/33] fix: pass grasp mesh data to typed affordance --- .../gen_sim/action_agent_pipeline/runtime/atom_actions.py | 8 ++++---- .../action_agent_pipeline/test_backend_atomic_runtime.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py index 3157dbf6..f2ba4cf2 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -945,11 +945,11 @@ def _build_object_semantics( ) affordance = AntipodalAffordance( object_label=obj_name, + mesh_vertices=mesh_vertices, + mesh_triangles=mesh_triangles, + generator_cfg=generator_cfg, + gripper_collision_cfg=gripper_collision_cfg, force_reannotate=force_reannotate, - custom_config={ - "gripper_collision_cfg": gripper_collision_cfg, - "generator_cfg": generator_cfg, - }, ) return ObjectSemantics( label=obj_name, diff --git a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py index f6b79445..31a0205d 100644 --- a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py +++ b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py @@ -551,6 +551,8 @@ def test_target_object_builds_pick_up_cfg(monkeypatch) -> None: assert capture[0]["cfg"].pre_grasp_distance == pytest.approx(0.07) assert isinstance(capture[0]["target"], GraspTarget) assert 
capture[0]["target"].semantics.label == "apple" + assert capture[0]["target"].semantics.affordance.mesh_vertices is not None + assert capture[0]["target"].semantics.affordance.mesh_triangles is not None def test_place_action_builds_place_cfg(monkeypatch) -> None: From 557ab2f91d66e4d766f4e42e456500629b6dd57d Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Fri, 26 Jun 2026 14:52:45 +0800 Subject: [PATCH 23/33] Native action-agent atomic actions --- .../generation/config_types.py | 4 + .../generation/prompt_builders.py | 97 ++- .../generation/relative_geometry.py | 10 + .../generation/relative_spec.py | 70 +- .../prompts/atom_actions.txt | 47 +- .../prompts/task_prompt.py | 2 +- .../runtime/atom_actions.py | 698 ++++++++++++++---- .../runtime/graph_compiler.py | 7 +- .../runtime/task_graph.py | 8 +- .../test_backend_atomic_runtime.py | 295 +++++++- .../test_demo3_semantic_grasp_integration.py | 4 +- .../test_graph_spec_backend_atomic.py | 2 +- .../test_ur5_basket_config_generation.py | 237 +++++- 13 files changed, 1192 insertions(+), 289 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py index ae46bc4c..6fde7964 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py @@ -100,6 +100,8 @@ class _RelativePlacementStepSpec: reference_is_initial_pose: bool = False release_position: list[float] | None = None high_position: list[float] | None = None + orientation_goal: str = "preserve" + orientation_align_to_runtime_uid: str | None = None @dataclass(frozen=True) @@ -121,6 +123,8 @@ class _RelativePlacementSpec: reference_is_initial_pose: bool = False release_position: list[float] | None = None high_position: list[float] | None = None + orientation_goal: str = "preserve" + orientation_align_to_runtime_uid: str | 
None = None @dataclass(frozen=True) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index b329df35..6e1ef0f3 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -75,6 +75,8 @@ class _RelativePlacementLike(Protocol): reference_is_initial_pose: bool high_position: Sequence[float] | None release_position: Sequence[float] | None + orientation_goal: str + orientation_align_to_runtime_uid: str | None class _RelativeSpecLike(_RelativePlacementLike, Protocol): @@ -164,7 +166,7 @@ def make_arrangement_task_prompt( Generate one deterministic nominal graph with exactly {edge_count} nominal edges. Use only the atomic action class JSON specs shown below. Do not add recovery, -monitor, search, alignment, or extra lift edges. Use `PlaceAction` for each +monitor, search, alignment, or extra lift edges. Use `Place` for each release-place step so lowering, gripper opening, and upward retreat remain one atomic action. The arm not listed for a step must remain null. @@ -245,8 +247,8 @@ def make_arrangement_atom_actions_prompt(spec: _ArrangementSpecLike) -> str: blocks = "\n\n".join(_arrangement_atom_action_block(step) for step in spec.steps) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Line Arrangement -Use only atomic action class JSON specs backed by `PickUpAction`, `MoveAction`, and -`PlaceAction`. Each object is moved to an absolute slot pose computed by the +Use only the native atomic action class JSON specs shown below. Each object is +moved to an absolute slot pose computed by the config-stage generator. Keep the non-active arm null for each listed object. {blocks} @@ -321,7 +323,7 @@ def make_relative_task_prompt( Generate one deterministic nominal graph with exactly 4 nominal edges. Use only the atomic action class JSON specs shown below. 
Do not add recovery, monitor, search, -alignment, or extra lift edges. Use `PlaceAction` for the release-place step so +alignment, or extra lift edges. Use `Place` for the release-place step so lowering, gripper opening, and upward retreat remain one atomic action. The inactive arm must remain null in every edge. @@ -431,7 +433,7 @@ def _make_dual_relative_task_prompt( Generate one deterministic nominal graph with exactly 6 nominal edges. Use only the atomic action class JSON specs shown below. Do not add recovery, monitor, search, -alignment, or extra lift edges. Use `PlaceAction` for each release-place step so +alignment, or extra lift edges. Use `Place` for each release-place step so lowering, gripper opening, and upward retreat remain one atomic action. 1. Pick up both moved objects simultaneously: @@ -503,7 +505,7 @@ def make_relative_basic_background( The execution-stage LLM should generate graph JSON that grasps the moved object, moves it to the configured high staging pose, places it at the release pose with -one `PlaceAction`, and returns the active arm to its initial pose. +one `Place`, and returns the active arm to its initial pose. """ @@ -538,7 +540,7 @@ def _make_dual_relative_basic_background( {notes} The execution-stage LLM should generate graph JSON that grasps both moved -objects, stages and places the first moved object with one `PlaceAction`, then +objects, stages and places the first moved object with one `Place`, then stages and places the second moved object while the first arm returns to its initial pose. Each arm must release its moved object before returning to its initial pose. @@ -565,8 +567,8 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: ) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Relative Placement -Use only atomic action class JSON specs backed by `PickUpAction`, `MoveAction`, and -`PlaceAction`. The active arm is `{active_arm}`. 
Keep `{inactive_arm}` null in +Use only the native atomic action class JSON specs shown below. The active arm +is `{active_arm}`. Keep `{inactive_arm}` null in the nominal graph. Use exactly these action patterns: @@ -611,8 +613,7 @@ def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: ) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Dual-Arm Relative Placement -Use only atomic action class JSON specs backed by `PickUpAction`, `MoveAction`, and -`PlaceAction`. +Use only the native atomic action class JSON specs shown below. - `{first_arm}` manipulates `{first.moved_runtime_uid}`. - `{second_arm}` manipulates `{second.moved_runtime_uid}`. @@ -712,20 +713,20 @@ def make_basket_task_prompt( - Both target objects must be released into `{roles.container_runtime_uid}`. Generate one deterministic nominal graph with the following semantic sequence. -Do not add extra alignment, search, recovery, or monitor steps. Use `PlaceAction` +Do not add extra alignment, search, recovery, or monitor steps. Use `Place` for each release-place step so lowering, gripper opening, and upward retreat -remain one atomic action. The left arm must finish its `PlaceAction` retreat +remain one atomic action. The left arm must finish its `Place` retreat before the right arm enters the shared container workspace, but the left return-to-initial action and the right high-staging action must execute simultaneously in one graph edge. Generate exactly 6 nominal edges, one edge for each numbered step below. Do not split the simultaneous grasp or the simultaneous left-return/right-staging action into -separate edges. Do not split a `PlaceAction` into separate lower-to-release, +separate edges. Do not split a `Place` into separate lower-to-release, open-gripper, or upward-retreat edges. A target object is not considered placed when it is only above the {roles.container_runtime_uid}. 
For each arm, the placement order must be: move -to a high staging pose above the container, then execute one `PlaceAction` at +to a high staging pose above the container, then execute one `Place` at the release pose inside the container, then return the arm to its initial pose. Never use `target_qpos` source `initial` for an arm that has not already released its held target object. @@ -804,24 +805,24 @@ def make_basket_basic_background( {roles.right_target_runtime_uid} in the same graph edge. After both {target_plural} are held, the left UR5 places {roles.left_target_runtime_uid} into {roles.container_runtime_uid} with one -`PlaceAction`. The next graph edge is a parallel handoff: the left UR5 returns +`Place`. The next graph edge is a parallel handoff: the left UR5 returns to its initial pose while the right UR5 simultaneously moves its already-grasped {roles.right_target_runtime_uid} to the high staging pose above {roles.container_runtime_uid}. The right UR5 then places -{roles.right_target_runtime_uid} with one `PlaceAction` and returns to its +{roles.right_target_runtime_uid} with one `Place` and returns to its initial pose. To change the insertion order later, edit the task prompt sequence and keep the same atomic action API. The {roles.container_runtime_uid} area is a shared workspace. A UR5 should -complete its `PlaceAction` retreat before the other UR5 moves to the container, +complete its `Place` retreat before the other UR5 moves to the container, otherwise the two arms may collide near the container. The right UR5 should keep holding {roles.right_target_runtime_uid} while the left UR5 performs its -placement. Once that `PlaceAction` is complete, the right UR5 may move toward +placement. Once that `Place` is complete, the right UR5 may move toward the container while the left UR5 simultaneously returns to its initial pose; it must not wait for the left return-to-initial motion to finish. 
A target object at a high pose above `{roles.container_runtime_uid}` is only -staged, not placed. Each arm must execute a `PlaceAction` at the container +staged, not placed. Each arm must execute a `Place` at the container release pose before any return-to-initial motion. Always plan to the current `{roles.container_runtime_uid}` object pose from the @@ -859,20 +860,20 @@ def make_basket_atom_actions_prompt(roles: _BasketRolesLike) -> str: ) return f"""### Atomic Action Class JSON Specs for UR5BreadBasket Dual-UR5 Placement -Use only atomic action class JSON specs backed by `PickUpAction`, `MoveAction`, and -`PlaceAction`. Use `robot_name="left_arm"` only for +Use only the native atomic action class JSON specs shown below. Use +`robot_name="left_arm"` only for `{roles.left_target_runtime_uid}` and `robot_name="right_arm"` only for `{roles.right_target_runtime_uid}`. The nominal task starts with simultaneous dual-arm pick-up, followed by a left-first placement with an overlapped handoff to the right arm: -- The first nominal edge must use `atomic_action_class:"PickUpAction"` for both arms. +- The first nominal edge must use `atomic_action_class:"PickUp"` for both arms. - While the left arm places its target, keep the right hand closed with a `target_qpos` whose source is `gripper_state` and state is `close`. - After the left arm releases `{roles.left_target_runtime_uid}`, first move it - upward to clear the container as part of the same `PlaceAction`. + upward to clear the container as part of the same `Place`. - The next nominal edge must pair the left arm's initial `target_qpos` move with - the right arm's object-referenced `target_pose` high-staging move. Do not split this + the right arm's object-referenced `target_object_pose` high-staging move. Do not split this parallel handoff into separate edges. - After the parallel handoff edge, the remaining right-side placement steps put the actual action in `right_arm_action` and set `left_arm_action` to null. 
@@ -906,7 +907,7 @@ def _format_pick_up_spec( ) -> str: return _compact_json( { - "atomic_action_class": "PickUpAction", + "atomic_action_class": "PickUp", "robot_name": robot_name, "control": "arm", "target_object": { @@ -927,18 +928,24 @@ def _format_pose_object_spec( offset: tuple[float, float, float] | list[float], *, sample_interval: int, + orientation_goal: str = "preserve", + align_to: str | None = None, ) -> str: x, y, z = offset + target_object_pose = { + "reference": "object", + "obj_name": obj_name, + "offset": [float(x), float(y), float(z)], + "orientation_goal": orientation_goal, + } + if align_to is not None: + target_object_pose["align_to"] = align_to return _compact_json( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveHeldObject", "robot_name": robot_name, "control": "arm", - "target_pose": { - "reference": "object", - "obj_name": obj_name, - "offset": [float(x), float(y), float(z)], - }, + "target_object_pose": target_object_pose, "cfg": {"sample_interval": sample_interval}, } ) @@ -986,6 +993,8 @@ def _format_relative_pose_spec( robot_name, position, sample_interval=sample_interval, + orientation_goal=placement.orientation_goal, + align_to=placement.orientation_align_to_runtime_uid, ) offset = placement.high_offset if pose_kind == "high" else placement.release_offset @@ -994,6 +1003,8 @@ def _format_relative_pose_spec( placement.reference_runtime_uid, offset, sample_interval=sample_interval, + orientation_goal=placement.orientation_goal, + align_to=placement.orientation_align_to_runtime_uid, ) @@ -1028,16 +1039,22 @@ def _format_pose_absolute_spec( position: Sequence[float], *, sample_interval: int, + orientation_goal: str = "preserve", + align_to: str | None = None, ) -> str: + target_object_pose = { + "reference": "absolute", + "position": [float(value) for value in position], + "orientation_goal": orientation_goal, + } + if align_to is not None: + target_object_pose["align_to"] = align_to return _compact_json( { - 
"atomic_action_class": "MoveAction", + "atomic_action_class": "MoveHeldObject", "robot_name": robot_name, "control": "arm", - "target_pose": { - "reference": "absolute", - "position": [float(value) for value in position], - }, + "target_object_pose": target_object_pose, "cfg": {"sample_interval": sample_interval}, } ) @@ -1070,7 +1087,7 @@ def _format_place_spec( ) -> str: return _compact_json( { - "atomic_action_class": "PlaceAction", + "atomic_action_class": "Place", "robot_name": robot_name, "control": "arm", "target_pose": dict(target_pose), @@ -1094,7 +1111,7 @@ def _format_gripper_spec( cfg["post_hold_steps"] = post_hold_steps return _compact_json( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveJoints", "robot_name": robot_name, "control": "hand", "target_qpos": {"source": "gripper_state", "state": state}, @@ -1110,7 +1127,7 @@ def _format_initial_qpos_spec( ) -> str: return _compact_json( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveJoints", "robot_name": robot_name, "control": "arm", "target_qpos": {"source": "initial"}, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py index fcf7f529..f64152ea 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py @@ -129,6 +129,8 @@ def _with_self_relative_absolute_targets( reference_is_initial_pose=primary.reference_is_initial_pose, release_position=primary.release_position, high_position=primary.high_position, + orientation_goal=primary.orientation_goal, + orientation_align_to_runtime_uid=primary.orientation_align_to_runtime_uid, ) @@ -158,6 +160,8 @@ def _with_self_relative_absolute_target( reference_is_initial_pose=True, release_position=release_position, high_position=high_position, + orientation_goal=placement.orientation_goal, + 
orientation_align_to_runtime_uid=placement.orientation_align_to_runtime_uid, ) @@ -252,6 +256,8 @@ def _replace_relative_spec_placements( reference_is_initial_pose=primary.reference_is_initial_pose, release_position=primary.release_position, high_position=primary.high_position, + orientation_goal=primary.orientation_goal, + orientation_align_to_runtime_uid=primary.orientation_align_to_runtime_uid, ) @@ -380,6 +386,8 @@ def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]: "relation": spec.relation, "active_arm": f"{spec.active_side}_arm", "release_offset": spec.release_offset, + "orientation_goal": spec.orientation_goal, + "orientation_align_to": spec.orientation_align_to_runtime_uid, } return { "mode": "dual_arm_relative_placement", @@ -390,6 +398,8 @@ def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]: "relation": placement.relation, "active_arm": f"{placement.active_side}_arm", "release_offset": placement.release_offset, + "orientation_goal": placement.orientation_goal, + "orientation_align_to": placement.orientation_align_to_runtime_uid, } for placement in spec.placements ], diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py index e539b90f..839e3bf6 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py @@ -235,7 +235,9 @@ def _call_relative_task_llm( ' "reference_object": "",\n' ' "goal_relation": ' '"inside|on|left_of|right_of|front_of|behind|front_left_of|back_left_of|front_right_of|back_right_of",\n' - ' "arm": "left|right|auto"\n' + ' "arm": "left|right|auto",\n' + ' "orientation_goal": "preserve|upright|horizontal",\n' + ' "orientation_reference": "none|reference_object"\n' " }\n" " ],\n" ' "task_prompt_summary": "",\n' @@ -243,7 +245,7 @@ def _call_relative_task_llm( ' "action_sketch": [\n' ' "grasp 
moved_object",\n' ' "move above the relation target pose",\n' - ' "place at the release pose with PlaceAction"\n' + ' "place at the release pose with Place"\n' " ]\n" "}\n\n" "Rules:\n" @@ -283,6 +285,13 @@ def _call_relative_task_llm( "falls into it, use goal_relation='inside'.\n" "- If the task says to stack/place one object on another non-container " "support, use goal_relation='on'.\n" + "- orientation_goal captures the held object's intended pose before " + "release. Use 'horizontal' for tasks like 水平摆正, 平放, 横放, or lay " + "flat. Use 'upright' for tasks like 扶正, 竖起来, or stand upright. " + "Use 'preserve' when no orientation change is requested.\n" + "- orientation_reference should be 'reference_object' when the object " + "should be aligned to the pad, box, container, or target support; " + "otherwise use 'none'.\n" "- Do not return numeric offsets, object poses, scales, success JSON, " "robot config, or full prompt files. The generator computes those " "deterministically.\n\n" @@ -378,6 +387,8 @@ def _apply_relative_task_response( reference_is_initial_pose=primary.reference_is_initial_pose, release_position=primary.release_position, high_position=primary.high_position, + orientation_goal=primary.orientation_goal, + orientation_align_to_runtime_uid=primary.orientation_align_to_runtime_uid, ) @@ -478,6 +489,15 @@ def _build_relative_placement_step( raise ValueError( f"Relative placement produced duplicate runtime uid {moved_runtime_uid!r}." 
) + orientation_goal = _normalize_orientation_goal(entry.get("orientation_goal")) + orientation_reference = _normalize_orientation_reference( + entry.get("orientation_reference") + ) + orientation_align_to_runtime_uid = ( + reference_runtime_uid + if orientation_reference == "reference_object" and not reference_is_initial_pose + else None + ) release_offset = [float(value) for value in release_offset_fn(relation)] high_offset = list(release_offset) @@ -506,6 +526,8 @@ def _build_relative_placement_step( release_offset=release_offset, high_offset=high_offset, reference_is_initial_pose=reference_is_initial_pose, + orientation_goal=orientation_goal, + orientation_align_to_runtime_uid=orientation_align_to_runtime_uid, ) @@ -643,6 +665,46 @@ def _normalize_relative_arm(value: Any) -> str: ) +def _normalize_orientation_goal(value: Any) -> str: + if value is None: + return "preserve" + text = str(value).strip().lower().replace("-", "_").replace(" ", "_") + if text in {"", "none", "null", "default", "preserve", "keep", "保持"}: + return "preserve" + if text in {"upright", "vertical", "stand_upright", "扶正", "竖直", "竖起来"}: + return "upright" + if text in {"horizontal", "flat", "lay_flat", "level", "水平", "平放", "横放"}: + return "horizontal" + raise ValueError( + f"Unsupported orientation_goal {value!r}; expected 'preserve', " + "'upright', or 'horizontal'." + ) + + +def _normalize_orientation_reference(value: Any) -> str: + if value is None: + return "none" + text = str(value).strip().lower().replace("-", "_").replace(" ", "_") + if text in {"", "none", "null", "default", "no", "false", "无"}: + return "none" + if text in { + "reference_object", + "reference", + "target", + "support", + "container", + "pad", + "box", + "参考物体", + "目标物体", + }: + return "reference_object" + raise ValueError( + f"Unsupported orientation_reference {value!r}; expected 'none' or " + "'reference_object'." 
+ ) + + def _relative_runtime_uid_mapping( rigid_objects: list[_SceneObject], ) -> dict[str, str]: @@ -745,7 +807,7 @@ def _default_relative_action_sketch( f"move above the {placement.relation} release pose relative to " f"{placement.reference_runtime_uid}" ), - "place at the release pose with PlaceAction", + "place at the release pose with Place", ] sketch = ["grasp both moved objects with their assigned arms"] for placement in placements: @@ -756,7 +818,7 @@ def _default_relative_action_sketch( f"{placement.moved_runtime_uid} above the release pose relative " f"to {placement.reference_runtime_uid}" ), - f"place {placement.moved_runtime_uid} with PlaceAction", + f"place {placement.moved_runtime_uid} with Place", ] ) return sketch diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt index 596fc455..26f2f890 100644 --- a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt @@ -4,27 +4,44 @@ Each non-null graph edge action must be a JSON object with these common fields and exactly one target field: { - "atomic_action_class": "PickUpAction|MoveAction|PlaceAction", + "atomic_action_class": "PickUp|MoveEndEffector|MoveJoints|MoveHeldObject|Place", "robot_name": "left_arm|right_arm", "control": "arm|hand", "cfg": {} } -Use only these atomic action classes: +Use only these native atomic action classes: -1. `PickUpAction` +1. `PickUp` - Required target_object: {"obj_name": "", "affordance": "antipodal"} - Typical cfg: {"pre_grasp_distance": 0.08, "sample_interval": 45} -2. `MoveAction` - - Use `control: "arm"` with target_pose or arm target_qpos. - - Use `control: "hand"` with gripper target_qpos. - - Supported target_pose objects: +2. `MoveHeldObject` + - Use only after the same arm has successfully executed `PickUp`. + - Moves the already-held object in the air without releasing it. 
+ - Required target_object_pose: + {"reference": "object", "obj_name": "", "offset": [x, y, z], "orientation_goal": "preserve|upright|horizontal"} + {"reference": "absolute", "position": [x, y, z], "orientation_goal": "preserve|upright|horizontal"} + {"reference": "relative", "offset": [dx, dy, dz], "frame": "world|eef", "orientation_goal": "preserve|upright|horizontal"} + - For horizontal insertion into a container, add `"align_to": ""`. + - Typical cfg: + {"sample_interval": 45} + +3. `Place` + - Prefer this for placement because one action lowers, opens the gripper, + and retreats upward. + - Required target_pose: {"reference": "object", "obj_name": "", "offset": [x, y, z]} {"reference": "absolute", "position": [x, y, z]} {"reference": "relative", "offset": [dx, dy, dz], "frame": "world|eef"} + - Typical cfg: + {"sample_interval": 80, "lift_height": 0.1} + +4. `MoveJoints` + - Use `control: "arm"` for arm joint targets. + - Use `control: "hand"` for gripper targets. - Supported target_qpos objects: {"source": "initial"} {"source": "gripper_state", "state": "open|close"} @@ -32,24 +49,24 @@ Use only these atomic action classes: - Typical cfg: {"sample_interval": 30} -3. `PlaceAction` - - Prefer this for placement because one action lowers, opens the gripper, - and retreats upward. - - Required target_pose. Supported pose targets are the same target_pose objects - accepted by `MoveAction`. +5. `MoveEndEffector` + - Use for moving an empty end-effector to a pose. + - Do not use it to move a held object; use `MoveHeldObject` instead. + - Required target_pose. Supported pose targets are the same as `Place`. - Typical cfg: - {"sample_interval": 80, "lift_height": 0.1} + {"sample_interval": 30} Rules: - Do not output Python code, function calls, or `fn`/`kwargs` action objects. - Do not output legacy `action`-based specs. +- Do not output old action names `PickUpAction`, `MoveAction`, or `PlaceAction`. - Use `null` for an idle arm. - Keep all values JSON primitives. 
- Each non-null action must contain exactly one of `target_object`, `target_pose`, - or `target_qpos`. + `target_qpos`, or `target_object_pose`. - To keep a holding arm closed while the other arm moves, use: { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveJoints", "robot_name": "", "control": "hand", "cfg": {"sample_interval": 10}, diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/task_prompt.py b/embodichain/gen_sim/action_agent_pipeline/prompts/task_prompt.py index 8f512216..5ffe0844 100644 --- a/embodichain/gen_sim/action_agent_pipeline/prompts/task_prompt.py +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/task_prompt.py @@ -47,7 +47,7 @@ def generate_task_graph(observations: dict[str, Any], **kwargs: Any) -> Any: "source": "v0_start", "target": "v1_", "left_arm_action": { - "atomic_action_class": "PickUpAction|MoveAction|PlaceAction", + "atomic_action_class": "PickUp|MoveEndEffector|MoveJoints|MoveHeldObject|Place", "robot_name": "left_arm|right_arm", "control": "arm|hand", "target_object": {"obj_name": "", "affordance": "antipodal"}, diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py index f2ba4cf2..8d047231 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -38,8 +38,14 @@ AntipodalAffordance, EndEffectorPoseTarget, GraspTarget, + HeldObjectPoseTarget, + JointPositionTarget, MoveEndEffector, MoveEndEffectorCfg, + MoveHeldObject, + MoveHeldObjectCfg, + MoveJoints, + MoveJointsCfg, ObjectSemantics, PickUp, PickUpCfg, @@ -64,14 +70,26 @@ "build_parallel_action_stream", "execute_atomic_action", "execute_parallel_atomic_actions", + "init_parallel_world_states", "normalize_atomic_action_spec", "step_env_with_actions", ] -SUPPORTED_ATOMIC_ACTION_CLASSES = {"PickUpAction", "MoveAction", "PlaceAction"} 
+SUPPORTED_ATOMIC_ACTION_CLASSES = { + "PickUp", + "MoveEndEffector", + "MoveJoints", + "MoveHeldObject", + "Place", +} SUPPORTED_CONTROLS = {"arm", "hand"} -TARGET_SPEC_FIELDS = ("target_object", "target_pose", "target_qpos") +TARGET_SPEC_FIELDS = ( + "target_object", + "target_pose", + "target_qpos", + "target_object_pose", +) ACTION_SPEC_FIELDS = { "atomic_action_class", "robot_name", @@ -80,6 +98,7 @@ *TARGET_SPEC_FIELDS, } SUPPORTED_POSE_REFERENCES = {"object", "absolute", "relative"} +SUPPORTED_OBJECT_ORIENTATION_GOALS = {"preserve", "upright", "horizontal"} SUPPORTED_QPOS_SOURCES = {"initial", "gripper_state", "joint_delta"} SUPPORTED_CFG_KEYS = { "sample_interval", @@ -91,9 +110,11 @@ ATOMIC_ACTION_REGISTRY = { - "PickUpAction": (PickUp, PickUpCfg), - "MoveAction": (MoveEndEffector, MoveEndEffectorCfg), - "PlaceAction": (Place, PlaceCfg), + "PickUp": (PickUp, PickUpCfg), + "MoveEndEffector": (MoveEndEffector, MoveEndEffectorCfg), + "MoveJoints": (MoveJoints, MoveJointsCfg), + "MoveHeldObject": (MoveHeldObject, MoveHeldObjectCfg), + "Place": (Place, PlaceCfg), } @@ -107,6 +128,7 @@ class AtomicActionSpec: target_object: dict[str, Any] = field(default_factory=dict) target_pose: dict[str, Any] = field(default_factory=dict) target_qpos: dict[str, Any] = field(default_factory=dict) + target_object_pose: dict[str, Any] = field(default_factory=dict) cfg: dict[str, Any] = field(default_factory=dict) @classmethod @@ -124,6 +146,7 @@ def from_normalized(cls, normalized: Mapping[str, Any]) -> "AtomicActionSpec": target_object=dict(normalized.get("target_object", {})), target_pose=dict(normalized.get("target_pose", {})), target_qpos=dict(normalized.get("target_qpos", {})), + target_object_pose=dict(normalized.get("target_object_pose", {})), cfg=dict(normalized["cfg"]), ) @@ -140,9 +163,19 @@ def to_dict(self) -> dict[str, Any]: spec["target_pose"] = deepcopy(self.target_pose) if self.target_qpos: spec["target_qpos"] = deepcopy(self.target_qpos) + if 
self.target_object_pose: + spec["target_object_pose"] = deepcopy(self.target_object_pose) return spec +@dataclass(frozen=True) +class _ExecutedAtomicAction: + action: np.ndarray + next_state: WorldState | None + robot_name: str | None + control: str | None + + def normalize_atomic_action_spec(spec: Mapping[str, Any]) -> dict[str, Any]: """Validate and normalize an atomic action JSON spec.""" if not isinstance(spec, Mapping): @@ -157,12 +190,12 @@ def normalize_atomic_action_spec(spec: Mapping[str, Any]) -> dict[str, Any]: if "action" in spec: raise ValueError( "Legacy action schema is not supported. Use atomic_action_class with " - "PickUpAction, MoveAction, or PlaceAction." + "PickUp, MoveEndEffector, MoveJoints, MoveHeldObject, or Place." ) if "target" in spec: raise ValueError( "Legacy target.kind schema is not supported. Use exactly one of " - "target_object, target_pose, or target_qpos." + "target_object, target_pose, target_qpos, or target_object_pose." ) unknown_fields = set(spec) - ACTION_SPEC_FIELDS if unknown_fields: @@ -222,7 +255,7 @@ def _normalize_action_target( if len(target_fields) != 1: raise ValueError( "Atomic action spec requires exactly one of target_object, target_pose, " - f"or target_qpos; got {target_fields}." + f"target_qpos, or target_object_pose; got {target_fields}." 
) target_field = target_fields[0] @@ -231,29 +264,41 @@ def _normalize_action_target( raise ValueError(f"{target_field} must be a non-empty object.") target_spec = dict(target_spec) - if atomic_action_class == "PickUpAction": + if atomic_action_class == "PickUp": if control != "arm" or target_field != "target_object": - raise ValueError("PickUpAction requires control='arm' and target_object.") + raise ValueError("PickUp requires control='arm' and target_object.") _validate_target_object(target_spec) return target_field, target_spec - if atomic_action_class == "PlaceAction": + if atomic_action_class == "Place": if control != "arm" or target_field != "target_pose": - raise ValueError("PlaceAction requires control='arm' and target_pose.") + raise ValueError("Place requires control='arm' and target_pose.") _validate_target_pose(target_spec) return target_field, target_spec - if target_field == "target_pose": + if atomic_action_class == "MoveEndEffector": if control != "arm": - raise ValueError("MoveAction target_pose requires control='arm'.") + raise ValueError("MoveEndEffector requires control='arm'.") + if target_field != "target_pose": + raise ValueError("MoveEndEffector requires target_pose.") _validate_target_pose(target_spec) return target_field, target_spec - if target_field == "target_qpos": + if atomic_action_class == "MoveJoints": + if target_field != "target_qpos": + raise ValueError("MoveJoints requires target_qpos.") _validate_target_qpos(target_spec, control=control) return target_field, target_spec - raise ValueError("MoveAction requires target_pose or target_qpos.") + if atomic_action_class == "MoveHeldObject": + if control != "arm" or target_field != "target_object_pose": + raise ValueError( + "MoveHeldObject requires control='arm' and target_object_pose." 
+ ) + _validate_target_object_pose(target_spec) + return target_field, target_spec + + raise ValueError(f"Unsupported atomic action class: {atomic_action_class}.") def _validate_target_object(target_object: Mapping[str, Any]) -> None: @@ -313,6 +358,66 @@ def _validate_target_pose(target_pose: Mapping[str, Any]) -> None: raise ValueError("relative target_pose frame must be 'world' or 'eef'.") +def _validate_target_object_pose(target_object_pose: Mapping[str, Any]) -> None: + _validate_target_pose_like(target_object_pose, "target_object_pose") + orientation_goal = target_object_pose.get("orientation_goal", "preserve") + if orientation_goal not in SUPPORTED_OBJECT_ORIENTATION_GOALS: + raise ValueError( + "target_object_pose orientation_goal must be one of " + f"{sorted(SUPPORTED_OBJECT_ORIENTATION_GOALS)}." + ) + align_to = target_object_pose.get("align_to") + if align_to is not None and (not isinstance(align_to, str) or not align_to): + raise ValueError("target_object_pose align_to must be a non-empty string.") + + +def _validate_target_pose_like( + target_pose: Mapping[str, Any], + target_name: str, +) -> None: + reference = target_pose.get("reference") + allowed_common = {"orientation_goal", "align_to"} + if reference not in SUPPORTED_POSE_REFERENCES: + raise ValueError( + f"{target_name} reference must be one of {sorted(SUPPORTED_POSE_REFERENCES)}." 
+ ) + + if reference == "object": + _validate_target_fields( + target_pose, + {"reference", "obj_name", "offset"} | allowed_common, + target_name, + ) + obj_name = target_pose.get("obj_name") + if not isinstance(obj_name, str) or not obj_name: + raise ValueError(f"object {target_name} requires non-empty obj_name.") + _xyz(target_pose.get("offset", [0.0, 0.0, 0.0]), "offset") + return + + if reference == "absolute": + _validate_target_fields( + target_pose, + {"reference", "position"} | allowed_common, + target_name, + ) + position = target_pose.get("position") + if not isinstance(position, list) or len(position) != 3: + raise ValueError( + f"absolute {target_name} requires position with three entries." + ) + return + + _validate_target_fields( + target_pose, + {"reference", "offset", "frame"} | allowed_common, + target_name, + ) + _xyz(target_pose.get("offset", [0.0, 0.0, 0.0]), "offset") + frame = target_pose.get("frame", "world") + if frame not in {"world", "eef"}: + raise ValueError(f"relative {target_name} frame must be 'world' or 'eef'.") + + def _validate_target_qpos( target_qpos: Mapping[str, Any], *, @@ -370,42 +475,48 @@ def execute_atomic_action( action_spec: Mapping[str, Any] | AtomicActionSpec, *, env, + state: WorldState | None = None, **runtime_kwargs, ) -> np.ndarray: """Execute one atomic action spec and return local arm+eef qpos actions.""" + executed = _execute_atomic_action_result( + action_spec, + env=env, + state=state, + **runtime_kwargs, + ) + _sync_agent_state_from_atomic_action( + env, + executed.robot_name, + executed.action, + executed.control, + ) + return executed.action + + +def _execute_atomic_action_result( + action_spec: Mapping[str, Any] | AtomicActionSpec, + *, + env, + state: WorldState | None = None, + **runtime_kwargs, +) -> _ExecutedAtomicAction: + """Execute one atomic action spec and keep the typed WorldState result.""" spec = ( action_spec if isinstance(action_spec, AtomicActionSpec) else 
AtomicActionSpec.from_mapping(action_spec) ) - if spec.atomic_action_class == "MoveAction" and spec.target_qpos: - action_np = _execute_move_qpos_action(env, spec) - action_np = _append_hold_steps( - action_np, - int(spec.cfg.get("post_hold_steps", 0)), - "atomic qpos action", - ) - _sync_agent_state_from_atomic_action( - env, - spec.robot_name, - action_np, - spec.control, - ) - log_info( - "Using action-agent qpos action: " - f"control={spec.control}, target={_target_summary(spec)}, " - f"steps={len(action_np)}.", - color="green", - ) - return action_np - target = _resolve_target(env, spec, runtime_kwargs) + target = _resolve_target(env, spec, runtime_kwargs, state=state) _, arm_part, hand_part, arm_joints, eef_joints = _select_arm_parts( env, spec.robot_name ) cfg = _build_action_cfg(env, spec, arm_part, hand_part, len(eef_joints)) target = _build_typed_target(spec, target) - state = WorldState(last_qpos=env.robot.get_qpos().clone()) + if state is None: + state = WorldState(last_qpos=env.robot.get_qpos().clone()) + state = _state_with_current_agent_qpos(env, spec, state) action_cls = _get_atomic_action_class(spec.atomic_action_class) action = action_cls(motion_generator=_make_motion_generator(env), cfg=cfg) result = action.execute( @@ -417,8 +528,11 @@ def execute_atomic_action( f"Atomic action failed: atomic_action_class={spec.atomic_action_class}, " f"robot_name={spec.robot_name}, target={_target_summary(spec)}." 
) - trajectory = result.trajectory[:, :, arm_joints + eef_joints] - joint_ids = arm_joints + eef_joints + if spec.atomic_action_class == "MoveJoints": + joint_ids = arm_joints if spec.control == "arm" else eef_joints + else: + joint_ids = arm_joints + eef_joints + trajectory = result.trajectory[:, :, joint_ids] action_np = _trajectory_to_agent_action( env, @@ -431,7 +545,6 @@ def execute_atomic_action( int(spec.cfg.get("post_hold_steps", 0)), "atomic action", ) - _sync_agent_state_from_atomic_action(env, spec.robot_name, action_np, spec.control) log_info( "Using atomic action: " f"atomic_action_class={spec.atomic_action_class}, cfg={cfg.__class__.__name__}, " @@ -439,7 +552,18 @@ def execute_atomic_action( f"steps={len(action_np)}.", color="green", ) - return action_np + next_state = result.next_state + if int(spec.cfg.get("post_hold_steps", 0)) > 0: + next_state = WorldState( + last_qpos=next_state.last_qpos.clone(), + held_object=next_state.held_object, + ) + return _ExecutedAtomicAction( + action=action_np, + next_state=next_state, + robot_name=spec.robot_name, + control=spec.control, + ) def execute_parallel_atomic_actions( @@ -447,21 +571,24 @@ def execute_parallel_atomic_actions( right_arm_action=None, *, env, + world_states: dict[str, WorldState] | None = None, return_result: bool = False, **runtime_kwargs, ): """Execute left/right atomic action specs as one synchronized stream.""" - actions = build_parallel_action_stream( + result = build_parallel_action_stream( left_arm_action=left_arm_action, right_arm_action=right_arm_action, env=env, + world_states=world_states, + return_result=True, **runtime_kwargs, ) + actions = result["actions"] step_env_with_actions(env, actions) + _sync_agent_states_from_parallel_actions(env, result["arm_actions"]) if return_result: - return { - "actions": actions, - } + return result return actions @@ -470,17 +597,37 @@ def build_parallel_action_stream( right_arm_action=None, *, env, + world_states: dict[str, WorldState] | None = 
None, + return_result: bool = False, **runtime_kwargs, -) -> list[torch.Tensor]: +) -> list[torch.Tensor] | dict[str, Any]: """Build a synchronized left/right atomic action stream without stepping env.""" if env is None: raise ValueError("env is required to build parallel atomic actions.") - left_arm_action = _resolve_action_spec(left_arm_action, env, runtime_kwargs) - right_arm_action = _resolve_action_spec(right_arm_action, env, runtime_kwargs) + if world_states is None: + world_states = init_parallel_world_states(env) + left_arm_action = _resolve_action_spec( + left_arm_action, + env, + runtime_kwargs, + state=world_states.get("left"), + ) + right_arm_action = _resolve_action_spec( + right_arm_action, + env, + runtime_kwargs, + state=world_states.get("right"), + ) - left_arm_action = _as_2d_action(left_arm_action, "left_arm_action") - right_arm_action = _as_2d_action(right_arm_action, "right_arm_action") - arm_actions = {"left": left_arm_action, "right": right_arm_action} + left_action_np = _as_2d_action( + _executed_action_array(left_arm_action), + "left_arm_action", + ) + right_action_np = _as_2d_action( + _executed_action_array(right_arm_action), + "right_arm_action", + ) + arm_actions = {"left": left_action_np, "right": right_action_np} if all(action is None for action in arm_actions.values()): raise ValueError("At least one atomic arm action must be provided.") @@ -519,7 +666,36 @@ def build_parallel_action_stream( actions[:, arm_index] = action actions = torch.from_numpy(actions).to(dtype=torch.float32).unsqueeze(1) - return list(actions.unbind(dim=0)) + actions = list(actions.unbind(dim=0)) + if not return_result: + return actions + next_world_states = dict(world_states) + for side, executed in { + "left": left_arm_action, + "right": right_arm_action, + }.items(): + if ( + isinstance(executed, _ExecutedAtomicAction) + and executed.next_state is not None + ): + next_world_states[side] = executed.next_state + return { + "actions": actions, + "world_states": 
next_world_states, + "arm_actions": { + "left": left_arm_action, + "right": right_arm_action, + }, + } + + +def init_parallel_world_states(env) -> dict[str, WorldState]: + """Seed independent per-arm WorldState slots from the current robot qpos.""" + qpos = env.robot.get_qpos().clone() + return { + "left": WorldState(last_qpos=qpos.clone()), + "right": WorldState(last_qpos=qpos.clone()), + } def step_env_with_actions( @@ -537,99 +713,46 @@ def step_env_with_actions( env.update_obj_info() -def _resolve_action_spec(action_spec, env, runtime_kwargs: dict[str, Any]): +def _resolve_action_spec( + action_spec, + env, + runtime_kwargs: dict[str, Any], + *, + state: WorldState | None, +): if action_spec is None: return None if isinstance(action_spec, np.ndarray): return action_spec if isinstance(action_spec, torch.Tensor): return action_spec - return execute_atomic_action(action_spec, env=env, **runtime_kwargs) - - -def _execute_move_qpos_action(env, spec: AtomicActionSpec) -> np.ndarray: - """Execute MoveAction target_qpos locally without extending core MoveAction.""" - target_qpos = _resolve_qpos_target(env, spec) - start_qpos, joint_ids = _qpos_start_and_joint_ids(env, spec) - target_qpos = _resolve_batched_qpos( - target_qpos, - expected_dof=len(joint_ids), - device=env.robot.device, - name="target_qpos", - ) - sample_interval = int(spec.cfg.get("sample_interval", 80)) - trajectory = _interpolate_qpos_trajectory( - start_qpos, - target_qpos, - sample_interval, - ) - return _trajectory_to_agent_action( - env, - spec.robot_name, - trajectory, - joint_ids, + return _execute_atomic_action_result( + action_spec, + env=env, + state=state, + **runtime_kwargs, ) -def _qpos_start_and_joint_ids( - env, - spec: AtomicActionSpec, -) -> tuple[torch.Tensor, list[int]]: - is_left, _, _, arm_joints, eef_joints = _select_arm_parts(env, spec.robot_name) - if spec.control == "hand": - _, _, _, _, current_gripper_state = get_arm_states(env, spec.robot_name) - start_qpos = 
_state_to_hand_qpos( - current_gripper_state, - len(eef_joints), - env.robot.device, - ) - return start_qpos.reshape(1, len(eef_joints)), eef_joints - return _current_arm_qpos(env, is_left, arm_joints), arm_joints - - -def _resolve_batched_qpos( - qpos, - *, - expected_dof: int, - device, - name: str, -) -> torch.Tensor: - qpos = torch.as_tensor(qpos, dtype=torch.float32, device=device) - if qpos.shape == (expected_dof,): - qpos = qpos.reshape(1, expected_dof) - if qpos.ndim != 2 or qpos.shape[1] != expected_dof: - raise ValueError( - f"{name} must have shape ({expected_dof},) or (num_envs, {expected_dof}), " - f"got {tuple(qpos.shape)}." - ) - return qpos +def _executed_action_array(action): + if isinstance(action, _ExecutedAtomicAction): + return action.action + return action -def _interpolate_qpos_trajectory( - start_qpos: torch.Tensor, - target_qpos: torch.Tensor, - sample_interval: int, -) -> torch.Tensor: - if sample_interval < 2: - raise ValueError("sample_interval must be at least 2 for qpos interpolation.") - if target_qpos.shape[0] == 1 and start_qpos.shape[0] > 1: - target_qpos = target_qpos.repeat(start_qpos.shape[0], 1) - if start_qpos.shape != target_qpos.shape: - raise ValueError( - f"start_qpos and target_qpos must have matching shapes, got " - f"{tuple(start_qpos.shape)} and {tuple(target_qpos.shape)}." 
+def _sync_agent_states_from_parallel_actions( + env, + arm_actions: Mapping[str, Any], +) -> None: + for executed in arm_actions.values(): + if not isinstance(executed, _ExecutedAtomicAction): + continue + _sync_agent_state_from_atomic_action( + env, + executed.robot_name, + executed.action, + executed.control, ) - weights = torch.linspace( - 0.0, - 1.0, - steps=sample_interval, - dtype=start_qpos.dtype, - device=start_qpos.device, - ).reshape(1, sample_interval, 1) - return ( - start_qpos.unsqueeze(1) - + (target_qpos.unsqueeze(1) - start_qpos.unsqueeze(1)) * weights - ) def _select_arm_parts(env, robot_name: str): @@ -645,6 +768,32 @@ def _select_arm_parts(env, robot_name: str): return is_left, arm_part, hand_part, list(arm_joints), list(eef_joints) +def _state_with_current_agent_qpos( + env, + spec: AtomicActionSpec, + state: WorldState, +) -> WorldState: + qpos = state.last_qpos.clone() + _, _, current_arm_qpos, _, current_gripper_state = get_arm_states( + env, + spec.robot_name, + ) + _, _, _, arm_joints, eef_joints = _select_arm_parts(env, spec.robot_name) + if arm_joints: + qpos[:, arm_joints] = torch.as_tensor( + current_arm_qpos, + dtype=torch.float32, + device=qpos.device, + ).reshape(1, len(arm_joints)) + if eef_joints: + qpos[:, eef_joints] = _state_to_hand_qpos( + current_gripper_state, + len(eef_joints), + qpos.device, + ).reshape(1, len(eef_joints)) + return WorldState(last_qpos=qpos, held_object=state.held_object) + + def _make_motion_generator(env): return MotionGenerator( cfg=MotionGenCfg(planner_cfg=ToppraPlannerCfg(robot_uid=env.robot.uid)) @@ -657,12 +806,14 @@ def _get_atomic_action_class(atomic_action_class: str): def _build_typed_target(spec: AtomicActionSpec, target): - if spec.atomic_action_class == "PickUpAction": + if spec.atomic_action_class == "PickUp": return GraspTarget(semantics=target) - if spec.atomic_action_class == "PlaceAction": - return EndEffectorPoseTarget(xpos=target) - if spec.atomic_action_class == "MoveAction": + if 
spec.atomic_action_class in {"MoveEndEffector", "Place"}: return EndEffectorPoseTarget(xpos=target) + if spec.atomic_action_class == "MoveJoints": + return JointPositionTarget(qpos=target) + if spec.atomic_action_class == "MoveHeldObject": + return HeldObjectPoseTarget(object_target_pose=target) raise ValueError(f"Unsupported atomic action class: {spec.atomic_action_class}.") @@ -677,9 +828,9 @@ def _build_action_cfg( cfg_values.pop("post_hold_steps", None) device = env.robot.device - if spec.atomic_action_class == "PickUpAction": + if spec.atomic_action_class == "PickUp": if spec.control != "arm": - raise ValueError("PickUpAction atomic action requires control='arm'.") + raise ValueError("PickUp atomic action requires control='arm'.") return PickUpCfg( control_part=arm_part, hand_control_part=hand_part, @@ -688,9 +839,9 @@ def _build_action_cfg( **_cfg_supported_kwargs(PickUpCfg, cfg_values), ) - if spec.atomic_action_class == "PlaceAction": + if spec.atomic_action_class == "Place": if spec.control != "arm": - raise ValueError("PlaceAction atomic action requires control='arm'.") + raise ValueError("Place atomic action requires control='arm'.") return PlaceCfg( control_part=arm_part, hand_control_part=hand_part, @@ -699,19 +850,46 @@ def _build_action_cfg( **_cfg_supported_kwargs(PlaceCfg, cfg_values), ) + if spec.atomic_action_class == "MoveHeldObject": + if spec.control != "arm": + raise ValueError("MoveHeldObject atomic action requires control='arm'.") + return MoveHeldObjectCfg( + control_part=arm_part, + hand_control_part=hand_part, + hand_close_qpos=_state_to_hand_qpos(env.close_state, hand_dof, device), + **_cfg_supported_kwargs(MoveHeldObjectCfg, cfg_values), + ) + control_part = arm_part if spec.control == "arm" else hand_part - return MoveEndEffectorCfg( - control_part=control_part, - **_cfg_supported_kwargs(MoveEndEffectorCfg, cfg_values), - ) + if spec.atomic_action_class == "MoveJoints": + return MoveJointsCfg( + control_part=control_part, + 
**_cfg_supported_kwargs(MoveJointsCfg, cfg_values), + ) + if spec.atomic_action_class == "MoveEndEffector": + return MoveEndEffectorCfg( + control_part=control_part, + **_cfg_supported_kwargs(MoveEndEffectorCfg, cfg_values), + ) + raise ValueError(f"Unsupported atomic action class: {spec.atomic_action_class}.") -def _resolve_target(env, spec: AtomicActionSpec, runtime_kwargs: dict[str, Any]): - if spec.atomic_action_class == "PickUpAction": +def _resolve_target( + env, + spec: AtomicActionSpec, + runtime_kwargs: dict[str, Any], + *, + state: WorldState | None, +): + if spec.atomic_action_class == "PickUp": return _resolve_pickup_target(env, spec, runtime_kwargs) - if spec.atomic_action_class == "MoveAction": - return _resolve_move_target(env, spec) - if spec.atomic_action_class == "PlaceAction": + if spec.atomic_action_class == "MoveEndEffector": + return _resolve_move_end_effector_target(env, spec) + if spec.atomic_action_class == "MoveJoints": + return _resolve_move_joints_target(env, spec) + if spec.atomic_action_class == "MoveHeldObject": + return _resolve_move_held_object_target(env, spec, state) + if spec.atomic_action_class == "Place": return _resolve_place_target(env, spec) raise ValueError(f"Unsupported atomic action class: {spec.atomic_action_class}.") @@ -722,21 +900,37 @@ def _resolve_pickup_target( runtime_kwargs: dict[str, Any], ): if not spec.target_object: - raise ValueError("PickUpAction requires target_object.") + raise ValueError("PickUp requires target_object.") return _build_object_semantics(env, spec.target_object, runtime_kwargs) -def _resolve_move_target(env, spec: AtomicActionSpec): - if spec.target_pose: - return _resolve_pose_target(env, spec) - if spec.target_qpos: - return _resolve_qpos_target(env, spec) - raise ValueError("MoveAction requires target_pose or target_qpos.") +def _resolve_move_end_effector_target(env, spec: AtomicActionSpec): + if not spec.target_pose: + raise ValueError("MoveEndEffector requires target_pose.") + return 
_resolve_pose_target(env, spec) + + +def _resolve_move_joints_target(env, spec: AtomicActionSpec): + if not spec.target_qpos: + raise ValueError("MoveJoints requires target_qpos.") + return _resolve_qpos_target(env, spec) + + +def _resolve_move_held_object_target( + env, + spec: AtomicActionSpec, + state: WorldState | None, +): + if not spec.target_object_pose: + raise ValueError("MoveHeldObject requires target_object_pose.") + if state is None or state.held_object is None: + raise ValueError("MoveHeldObject requires a held object from a prior PickUp.") + return _resolve_held_object_pose_target(env, spec, state) def _resolve_place_target(env, spec: AtomicActionSpec): if not spec.target_pose: - raise ValueError("PlaceAction requires target_pose.") + raise ValueError("Place requires target_pose.") return _resolve_pose_target(env, spec) @@ -751,6 +945,192 @@ def _resolve_pose_target(env, spec: AtomicActionSpec): raise ValueError(f"Unsupported target_pose reference: {reference}.") +def _resolve_held_object_pose_target( + env, + spec: AtomicActionSpec, + state: WorldState, +) -> torch.Tensor: + target_pose_spec = spec.target_object_pose + pose_spec = AtomicActionSpec( + atomic_action_class="MoveEndEffector", + robot_name=spec.robot_name, + control="arm", + target_pose={ + key: deepcopy(value) + for key, value in target_pose_spec.items() + if key not in {"orientation_goal", "align_to"} + }, + cfg={}, + ) + target_pose = _resolve_pose_target(env, pose_spec) + target_pose = _ensure_pose_tensor(target_pose, env.robot.device) + current_object_pose = _held_object_current_pose(state, env.robot.device) + target_pose[:3, :3] = _resolve_object_orientation( + env, + target_pose_spec, + current_object_pose, + state, + ) + return target_pose + + +def _held_object_current_pose(state: WorldState, device) -> torch.Tensor: + held = state.held_object + if held is None: + raise ValueError("Held object state is required.") + entity = held.semantics.entity + if entity is not None and 
hasattr(entity, "get_local_pose"): + return _ensure_pose_tensor(entity.get_local_pose(to_matrix=True), device) + return held.grasp_xpos.to(device=device, dtype=torch.float32).squeeze(0) + + +def _resolve_object_orientation( + env, + target_pose_spec: Mapping[str, Any], + current_object_pose: torch.Tensor, + state: WorldState, +) -> torch.Tensor: + orientation_goal = target_pose_spec.get("orientation_goal", "preserve") + current_rotation = current_object_pose[:3, :3].clone() + if orientation_goal == "preserve": + return current_rotation + + mesh_vertices = _held_object_mesh_vertices(state, env.robot.device) + local_axes = _principal_local_axes(mesh_vertices) + long_axis = local_axes[:, 0] + up_axis = local_axes[:, 2] + if orientation_goal == "upright": + return _rotation_from_axis_targets( + local_primary=long_axis, + world_primary=torch.tensor([0.0, 0.0, 1.0], device=env.robot.device), + local_secondary=up_axis, + world_secondary=torch.tensor([1.0, 0.0, 0.0], device=env.robot.device), + ) + if orientation_goal == "horizontal": + align_direction = _horizontal_alignment_direction( + env, + target_pose_spec.get("align_to"), + env.robot.device, + ) + return _rotation_from_axis_targets( + local_primary=long_axis, + world_primary=align_direction, + local_secondary=up_axis, + world_secondary=torch.tensor([0.0, 0.0, 1.0], device=env.robot.device), + ) + raise ValueError(f"Unsupported orientation_goal: {orientation_goal}.") + + +def _held_object_mesh_vertices(state: WorldState, device) -> torch.Tensor: + held = state.held_object + if held is None: + raise ValueError("Held object state is required.") + vertices = held.semantics.geometry.get("mesh_vertices") + if vertices is None and held.semantics.entity is not None: + vertices = held.semantics.entity.get_vertices(env_ids=[0], scale=True)[0] + vertices = torch.as_tensor(vertices, dtype=torch.float32, device=device) + if vertices.ndim != 2 or vertices.shape[-1] != 3 or vertices.numel() == 0: + raise ValueError("Held object 
mesh_vertices must have shape (N, 3).") + return vertices + + +def _principal_local_axes(vertices: torch.Tensor) -> torch.Tensor: + mins = vertices.min(dim=0).values + maxs = vertices.max(dim=0).values + extents = maxs - mins + order = torch.argsort(extents, descending=True) + axes = torch.eye(3, dtype=torch.float32, device=vertices.device)[:, order] + return axes + + +def _horizontal_alignment_direction(env, align_to: str | None, device) -> torch.Tensor: + if align_to: + target_obj = env.sim.get_rigid_object(align_to) + if target_obj is None: + raise ValueError(f"No rigid object found for align_to={align_to}.") + vertices = torch.as_tensor( + target_obj.get_vertices(env_ids=[0], scale=True)[0], + dtype=torch.float32, + device=device, + ) + extents = vertices.max(dim=0).values - vertices.min(dim=0).values + axis_index = 0 if extents[0] >= extents[1] else 1 + pose = _ensure_pose_tensor(target_obj.get_local_pose(to_matrix=True), device) + direction = pose[:3, axis_index] + direction = direction.clone() + direction[2] = 0.0 + norm = torch.linalg.norm(direction) + if float(norm) > 1e-6: + return direction / norm + return torch.tensor([1.0, 0.0, 0.0], dtype=torch.float32, device=device) + + +def _rotation_from_axis_targets( + *, + local_primary: torch.Tensor, + world_primary: torch.Tensor, + local_secondary: torch.Tensor, + world_secondary: torch.Tensor, +) -> torch.Tensor: + device = world_primary.device + dtype = torch.float32 + local_primary = _normalize_vector(local_primary.to(device=device, dtype=dtype)) + world_primary = _normalize_vector(world_primary.to(device=device, dtype=dtype)) + local_secondary = _orthogonalized_axis( + local_secondary.to(device=device, dtype=dtype), + local_primary, + ) + world_secondary = _orthogonalized_axis( + world_secondary.to(device=device, dtype=dtype), + world_primary, + ) + local_basis = torch.stack( + [ + local_primary, + local_secondary, + _normalize_vector(torch.linalg.cross(local_primary, local_secondary)), + ], + dim=1, + ) + 
world_basis = torch.stack( + [ + world_primary, + world_secondary, + _normalize_vector(torch.linalg.cross(world_primary, world_secondary)), + ], + dim=1, + ) + return world_basis @ local_basis.transpose(0, 1) + + +def _orthogonalized_axis(axis: torch.Tensor, reference: torch.Tensor) -> torch.Tensor: + axis = axis - torch.dot(axis, reference) * reference + if float(torch.linalg.norm(axis)) < 1e-6: + fallback = torch.tensor([1.0, 0.0, 0.0], device=reference.device) + if float(torch.abs(torch.dot(fallback, reference))) > 0.9: + fallback = torch.tensor([0.0, 1.0, 0.0], device=reference.device) + axis = fallback - torch.dot(fallback, reference) * reference + return _normalize_vector(axis) + + +def _normalize_vector(vector: torch.Tensor) -> torch.Tensor: + norm = torch.linalg.norm(vector) + if float(norm) < 1e-6: + raise ValueError("Cannot normalize a near-zero vector.") + return vector / norm + + +def _ensure_pose_tensor(pose, device) -> torch.Tensor: + pose = torch.as_tensor(pose, dtype=torch.float32, device=device) + if pose.shape == (1, 4, 4): + pose = pose.squeeze(0) + if pose.shape != (4, 4): + raise ValueError( + f"Pose target must have shape (4, 4), got {tuple(pose.shape)}." + ) + return pose.clone() + + def _resolve_qpos_target(env, spec: AtomicActionSpec): source = spec.target_qpos["source"] if source == "initial": diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py index b60be991..fced5e31 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/graph_compiler.py @@ -222,18 +222,19 @@ def _compile_action(spec: Any, action_module: Any) -> Any: raise ValueError( "Legacy fn/kwargs action schema is not supported. Use atomic action " "class JSON spec with atomic_action_class, robot_name, control, cfg, " - "and exactly one of target_object, target_pose, or target_qpos." 
+ "and exactly one of target_object, target_pose, target_qpos, or " + "target_object_pose." ) if "action" in spec: raise ValueError( "Legacy action schema is not supported. Use atomic_action_class with " - "PickUpAction, MoveAction, or PlaceAction." + "PickUp, MoveEndEffector, MoveJoints, MoveHeldObject, or Place." ) if spec.get("atomic_action_class") is None: raise ValueError( "Atomic action class schema requires atomic_action_class, robot_name, " "control, cfg, and exactly one of target_object, target_pose, or " - "target_qpos." + "target_qpos, or target_object_pose." ) normalized = action_module.normalize_atomic_action_spec(spec) diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py b/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py index 96e61155..7146109b 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/task_graph.py @@ -23,6 +23,7 @@ from embodichain.gen_sim.action_agent_pipeline.runtime.atom_actions import ( execute_parallel_atomic_actions, + init_parallel_world_states, ) __all__ = [ @@ -110,6 +111,7 @@ def run(self, *, env, **kwargs) -> ExecutedActionList: current = self.start executed_actions: list[Any] = [] transitions = 0 + world_states = init_parallel_world_states(env) while current != self.goal: transitions += 1 @@ -117,12 +119,16 @@ def run(self, *, env, **kwargs) -> ExecutedActionList: raise RuntimeError("Agent task graph exceeded max_transitions.") edge = self.edges[self._next_edge(current)] - actions = execute_parallel_atomic_actions( + result = execute_parallel_atomic_actions( left_arm_action=edge.left_arm_action, right_arm_action=edge.right_arm_action, env=env, + world_states=world_states, + return_result=True, **kwargs, ) + actions = result["actions"] + world_states = result["world_states"] executed_actions.extend(actions) current = edge.target diff --git a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py 
b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py index 31a0205d..34d5c046 100644 --- a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py +++ b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py @@ -44,7 +44,12 @@ ActionResult, EndEffectorPoseTarget, GraspTarget, + HeldObjectPoseTarget, + HeldObjectState, + JointPositionTarget, MoveEndEffectorCfg, + MoveHeldObjectCfg, + MoveJointsCfg, PickUpCfg, PlaceCfg, WorldState, @@ -54,6 +59,7 @@ class _FakeRobot: uid = "fake_robot" device = torch.device("cpu") + dof = 6 control_parts = { "left_arm": [0, 1], "left_eef": [2], @@ -64,6 +70,9 @@ class _FakeRobot: def get_qpos(self): return torch.zeros(1, 6) + def get_joint_ids(self, name: str): + return list(self.control_parts[name]) + class _FakeObject: cfg = SimpleNamespace(shape=SimpleNamespace(fpath="/tmp/fake.obj")) @@ -76,10 +85,19 @@ def get_local_pose(self, to_matrix: bool = True): return self._pose.unsqueeze(0) def get_vertices(self, env_ids=None, scale: bool = True): - return [torch.tensor([[0.0, 0.0, 0.0], [0.01, 0.0, 0.0], [0.0, 0.01, 0.0]])] + return [ + torch.tensor( + [ + [0.0, 0.0, 0.0], + [0.3, 0.0, 0.0], + [0.0, 0.1, 0.0], + [0.0, 0.0, 0.05], + ] + ) + ] def get_triangles(self, env_ids=None): - return [torch.tensor([[0, 1, 2]])] + return [torch.tensor([[0, 1, 2], [0, 1, 3]])] def get_body_scale(self, env_ids=None): return torch.ones(1, 3) @@ -111,6 +129,14 @@ def __init__(self): self.right_arm_current_gripper_state = torch.tensor([0.0]) self.open_state = torch.tensor([0.05]) self.close_state = torch.tensor([0.0]) + self.stepped_actions = [] + self.update_count = 0 + + def step(self, action): + self.stepped_actions.append(action) + + def update_obj_info(self) -> None: + self.update_count += 1 def get_current_qpos_agent(self): return self.left_arm_current_qpos, self.right_arm_current_qpos @@ -162,7 +188,57 @@ def __init__(self, motion_generator, cfg): def execute(self, target, state, **kwargs): if self.capture is 
not None: self.capture[-1].update({"target": target, "state": state}) - if self.cfg.name in {"pick_up", "place"}: + if self.cfg.name == "move_joints": + joint_ids = self.motion_generator.robot.get_joint_ids(self.cfg.control_part) + trajectory = state.last_qpos.unsqueeze(1).repeat(1, 2, 1) + trajectory[:, -1, joint_ids] = target.qpos.reshape(1, -1) + return ActionResult( + success=True, + trajectory=trajectory, + next_state=WorldState( + last_qpos=trajectory[:, -1, :], + held_object=state.held_object, + ), + ) + if self.cfg.name == "move_held_object": + trajectory = torch.tensor( + [ + [ + [0.1, 0.2, 0.0, 0.0, 0.0, 0.0], + [0.25, 0.35, 0.0, 0.0, 0.0, 0.0], + ] + ], + dtype=torch.float32, + ) + return ActionResult( + success=True, + trajectory=trajectory, + next_state=WorldState( + last_qpos=trajectory[:, -1, :], + held_object=state.held_object, + ), + ) + if self.cfg.name == "pick_up": + trajectory = torch.tensor( + [ + [ + [0.1, 0.2, 0.3, 0.0, 0.0, 0.0], + [0.2, 0.3, 0.4, 0.0, 0.0, 0.0], + ] + ], + dtype=torch.float32, + ) + held = HeldObjectState( + semantics=target.semantics, + object_to_eef=torch.eye(4).unsqueeze(0), + grasp_xpos=torch.eye(4).unsqueeze(0), + ) + return ActionResult( + success=True, + trajectory=trajectory, + next_state=WorldState(last_qpos=trajectory[:, -1, :], held_object=held), + ) + if self.cfg.name == "place": trajectory = torch.tensor( [ [ @@ -220,11 +296,27 @@ def test_normalize_atomic_action_spec_rejects_legacy_schema() -> None: normalize_atomic_action_spec({"action": "move", "robot_name": "left_arm"}) +def test_normalize_atomic_action_spec_rejects_old_action_names() -> None: + with pytest.raises(ValueError, match="Unsupported atomic action class"): + normalize_atomic_action_spec( + { + "atomic_action_class": "MoveAction", + "robot_name": "left_arm", + "control": "arm", + "target_pose": { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + }, + "cfg": {}, + } + ) + + def 
test_normalize_atomic_action_spec_rejects_legacy_target_kind_schema() -> None: with pytest.raises(ValueError, match="Legacy target.kind schema"): normalize_atomic_action_spec( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveEndEffector", "robot_name": "left_arm", "control": "arm", "target": {"kind": "pose_relative_to_object", "obj_name": "apple"}, @@ -237,7 +329,7 @@ def test_normalize_atomic_action_spec_rejects_unknown_fields() -> None: with pytest.raises(ValueError, match="Unsupported atomic action spec fields"): normalize_atomic_action_spec( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveJoints", "robot_name": "left_arm", "control": "arm", "target_qpos": {"source": "initial"}, @@ -251,7 +343,7 @@ def test_normalize_atomic_action_spec_rejects_multiple_target_fields() -> None: with pytest.raises(ValueError, match="exactly one of target_object"): normalize_atomic_action_spec( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveJoints", "robot_name": "left_arm", "control": "arm", "target_pose": { @@ -268,7 +360,7 @@ def test_normalize_atomic_action_spec_rejects_orientation_field() -> None: with pytest.raises(ValueError, match="Unsupported target_pose fields"): normalize_atomic_action_spec( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveEndEffector", "robot_name": "left_arm", "control": "arm", "target_pose": { @@ -283,10 +375,10 @@ def test_normalize_atomic_action_spec_rejects_orientation_field() -> None: def test_normalize_atomic_action_spec_rejects_pickup_pose_target() -> None: - with pytest.raises(ValueError, match="PickUpAction requires control='arm'"): + with pytest.raises(ValueError, match="PickUp requires control='arm'"): normalize_atomic_action_spec( { - "atomic_action_class": "PickUpAction", + "atomic_action_class": "PickUp", "robot_name": "left_arm", "control": "arm", "target_pose": { @@ -325,7 +417,7 @@ def test_build_parallel_action_stream_does_not_step_env() -> 
None: ) assert len(action_stream) == 2 - assert not hasattr(env, "stepped_actions") + assert env.stepped_actions == [] def test_step_env_with_actions_steps_and_updates_env() -> None: @@ -372,6 +464,79 @@ def test_executed_action_list_is_sequence() -> None: assert list(action_list) == actions +def test_agent_task_graph_threads_world_state_between_edges(monkeypatch) -> None: + env = _FakeEnv() + capture = [] + _FakeBackendAction.capture = capture + + monkeypatch.setattr( + atom_actions, + "_make_motion_generator", + lambda env: SimpleNamespace(robot=env.robot, device=env.robot.device), + ) + monkeypatch.setattr( + atom_actions, + "_get_atomic_action_class", + lambda atomic_action_class: _FakeBackendAction, + ) + + graph = AgentTaskGraph(start="v0", goal="v3") + graph.add_node("v0").add_node("v1").add_node("v2").add_node("v3") + graph.add_edge( + "e01", + "v0", + "v1", + left_arm_action={ + "atomic_action_class": "PickUp", + "robot_name": "left_arm", + "control": "arm", + "target_object": {"obj_name": "apple", "affordance": "antipodal"}, + "cfg": {}, + }, + ) + graph.add_edge( + "e12", + "v1", + "v2", + left_arm_action={ + "atomic_action_class": "MoveHeldObject", + "robot_name": "left_arm", + "control": "arm", + "target_object_pose": { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + "orientation_goal": "preserve", + }, + "cfg": {}, + }, + ) + graph.add_edge( + "e23", + "v2", + "v3", + left_arm_action={ + "atomic_action_class": "Place", + "robot_name": "left_arm", + "control": "arm", + "target_pose": { + "reference": "relative", + "offset": [0.0, 0.0, -0.1], + "frame": "world", + }, + "cfg": {}, + }, + ) + + actions = graph.run(env=env, allow_grasp_annotation=True) + + assert isinstance(actions, ExecutedActionList) + assert len(capture) == 3 + assert capture[0]["state"].held_object is None + assert capture[1]["state"].held_object is not None + assert capture[2]["state"].held_object is not None + + def 
test_resolve_arm_side_rejects_unavailable_requested_arm() -> None: env = _FakeEnv() env.right_arm_joints = [] @@ -421,7 +586,7 @@ def test_object_referenced_pose_builds_move_cfg_and_pose_target(monkeypatch) -> action = execute_atomic_action( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveEndEffector", "robot_name": "left_arm", "control": "arm", "target_pose": { @@ -442,7 +607,7 @@ def test_object_referenced_pose_builds_move_cfg_and_pose_target(monkeypatch) -> assert capture[0]["target"].xpos[:3, 3].tolist() == pytest.approx([0.5, 0.0, 0.4]) -def test_gripper_state_qpos_target_interpolates_hand_action(monkeypatch) -> None: +def test_gripper_state_qpos_target_uses_move_joints(monkeypatch) -> None: env = _FakeEnv() capture = [] _FakeBackendAction.capture = capture @@ -460,7 +625,7 @@ def test_gripper_state_qpos_target_interpolates_hand_action(monkeypatch) -> None action = execute_atomic_action( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveJoints", "robot_name": "left_arm", "control": "hand", "target_qpos": {"source": "gripper_state", "state": "open"}, @@ -469,15 +634,18 @@ def test_gripper_state_qpos_target_interpolates_hand_action(monkeypatch) -> None env=env, ) - assert action.shape == (7, 3) - assert capture == [] + assert action.shape == (4, 3) + assert isinstance(capture[0]["cfg"], MoveJointsCfg) + assert capture[0]["cfg"].control_part == "left_eef" + assert isinstance(capture[0]["target"], JointPositionTarget) + assert capture[0]["target"].qpos.tolist() == pytest.approx([0.05]) assert action[0].tolist() == pytest.approx([0.1, 0.2, 0.0]) - assert action[4].tolist() == pytest.approx([0.1, 0.2, 0.05]) + assert action[1].tolist() == pytest.approx([0.1, 0.2, 0.05]) assert action[-1].tolist() == pytest.approx([0.1, 0.2, 0.05]) assert env.left_arm_current_gripper_state.tolist() == pytest.approx([0.05]) -def test_initial_qpos_target_interpolates_arm_action(monkeypatch) -> None: +def 
test_initial_qpos_target_uses_move_joints(monkeypatch) -> None: env = _FakeEnv() capture = [] _FakeBackendAction.capture = capture @@ -495,7 +663,7 @@ def test_initial_qpos_target_interpolates_arm_action(monkeypatch) -> None: action = execute_atomic_action( { - "atomic_action_class": "MoveAction", + "atomic_action_class": "MoveJoints", "robot_name": "right_arm", "control": "arm", "target_qpos": {"source": "initial"}, @@ -504,8 +672,11 @@ def test_initial_qpos_target_interpolates_arm_action(monkeypatch) -> None: env=env, ) - assert action.shape == (4, 3) - assert capture == [] + assert action.shape == (2, 3) + assert isinstance(capture[0]["cfg"], MoveJointsCfg) + assert capture[0]["cfg"].control_part == "right_arm" + assert isinstance(capture[0]["target"], JointPositionTarget) + assert capture[0]["target"].qpos.tolist() == pytest.approx([-0.3, -0.4]) assert action[0].tolist() == pytest.approx([0.3, 0.4, 0.0]) assert action[-1].tolist() == pytest.approx([-0.3, -0.4, 0.0]) assert env.right_arm_current_qpos.tolist() == pytest.approx([-0.3, -0.4]) @@ -529,7 +700,7 @@ def test_target_object_builds_pick_up_cfg(monkeypatch) -> None: execute_atomic_action( { - "atomic_action_class": "PickUpAction", + "atomic_action_class": "PickUp", "robot_name": "left_arm", "control": "arm", "target_object": { @@ -573,7 +744,7 @@ def test_place_action_builds_place_cfg(monkeypatch) -> None: action = execute_atomic_action( { - "atomic_action_class": "PlaceAction", + "atomic_action_class": "Place", "robot_name": "left_arm", "control": "arm", "target_pose": { @@ -592,6 +763,82 @@ def test_place_action_builds_place_cfg(monkeypatch) -> None: assert capture[0]["cfg"].lift_height == pytest.approx(0.06) +def test_move_held_object_builds_cfg_and_object_pose_target(monkeypatch) -> None: + env = _FakeEnv() + capture = [] + _FakeBackendAction.capture = capture + semantics = atom_actions._build_object_semantics( + env, + {"obj_name": "apple", "affordance": "antipodal"}, + {"allow_grasp_annotation": 
True}, + ) + state = WorldState( + last_qpos=env.robot.get_qpos().clone(), + held_object=HeldObjectState( + semantics=semantics, + object_to_eef=torch.eye(4).unsqueeze(0), + grasp_xpos=torch.eye(4).unsqueeze(0), + ), + ) + + monkeypatch.setattr( + atom_actions, + "_make_motion_generator", + lambda env: SimpleNamespace(robot=env.robot, device=env.robot.device), + ) + monkeypatch.setattr( + atom_actions, + "_get_atomic_action_class", + lambda atomic_action_class: _FakeBackendAction, + ) + + action = execute_atomic_action( + { + "atomic_action_class": "MoveHeldObject", + "robot_name": "left_arm", + "control": "arm", + "target_object_pose": { + "reference": "object", + "obj_name": "apple", + "offset": [0.0, 0.0, 0.2], + "orientation_goal": "upright", + }, + "cfg": {"sample_interval": 13}, + }, + env=env, + state=state, + ) + + assert action.shape == (2, 3) + assert isinstance(capture[0]["cfg"], MoveHeldObjectCfg) + assert capture[0]["cfg"].control_part == "left_arm" + assert capture[0]["cfg"].hand_control_part == "left_eef" + assert isinstance(capture[0]["target"], HeldObjectPoseTarget) + assert capture[0]["target"].object_target_pose[:3, 3].tolist() == pytest.approx( + [0.4, -0.2, 0.3] + ) + + +def test_move_held_object_requires_prior_pickup() -> None: + env = _FakeEnv() + with pytest.raises(ValueError, match="requires a held object"): + execute_atomic_action( + { + "atomic_action_class": "MoveHeldObject", + "robot_name": "left_arm", + "control": "arm", + "target_object_pose": { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + "orientation_goal": "preserve", + }, + "cfg": {}, + }, + env=env, + ) + + def test_place_action_rejects_qpos_target(monkeypatch) -> None: env = _FakeEnv() monkeypatch.setattr( @@ -607,11 +854,11 @@ def test_place_action_rejects_qpos_target(monkeypatch) -> None: with pytest.raises( ValueError, - match="PlaceAction requires control='arm' and target_pose", + match="Place requires control='arm' and target_pose", ): 
execute_atomic_action( { - "atomic_action_class": "PlaceAction", + "atomic_action_class": "Place", "robot_name": "left_arm", "control": "arm", "target_qpos": {"source": "initial"}, diff --git a/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py b/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py index 1c55ef74..ffa41f90 100644 --- a/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py +++ b/tests/gen_sim/action_agent_pipeline/test_demo3_semantic_grasp_integration.py @@ -54,7 +54,7 @@ ) _POST_GRASP_HOLD_STEPS = int(os.environ.get("RUN_DEXSIM_GRASP_HOLD_STEPS", "10")) _PICK_UP_SPEC_RE = re.compile( - r'"atomic_action_class"\s*:\s*"PickUpAction".*?' + r'"atomic_action_class"\s*:\s*"PickUp".*?' r'"robot_name"\s*:\s*"(?P[^"]+)".*?' r'"obj_name"\s*:\s*"(?P[^"]+)"', re.DOTALL, @@ -199,7 +199,7 @@ def _assert_semantic_grasp_lifts_object( gym_env.reset() z_before = float(_object_xyz(env, obj_name)[2]) action_spec = { - "atomic_action_class": "PickUpAction", + "atomic_action_class": "PickUp", "robot_name": robot_name, "control": "arm", "target_object": { diff --git a/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py index 1878b104..4ca7a730 100644 --- a/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py +++ b/tests/gen_sim/action_agent_pipeline/test_graph_spec_backend_atomic.py @@ -58,7 +58,7 @@ def add_edge( def _pick_up_spec(robot_name: str, obj_name: str) -> dict: return { - "atomic_action_class": "PickUpAction", + "atomic_action_class": "PickUp", "robot_name": robot_name, "control": "arm", "target_object": { diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 00bd9df9..1bbb0f33 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ 
b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -144,22 +144,26 @@ def test_action_agent_config_generator_uses_parallel_handoff( assert "negative-x side" not in basic_background assert "positive-x side" not in basic_background left_high_offset_spec = ( - '"robot_name":"left_arm","control":"arm","target_pose":{"reference":"object",' - '"obj_name":"wicker_basket","offset":[0.0,0.04,0.22]' + '"atomic_action_class":"MoveHeldObject","robot_name":"left_arm",' + '"control":"arm","target_object_pose":{"reference":"object",' + '"obj_name":"wicker_basket","offset":[0.0,0.04,0.22],' + '"orientation_goal":"preserve"}' ) right_high_offset_spec = ( - '"robot_name":"right_arm","control":"arm","target_pose":{"reference":"object",' - '"obj_name":"wicker_basket","offset":[0.0,-0.04,0.22]' + '"atomic_action_class":"MoveHeldObject","robot_name":"right_arm",' + '"control":"arm","target_object_pose":{"reference":"object",' + '"obj_name":"wicker_basket","offset":[0.0,-0.04,0.22],' + '"orientation_goal":"preserve"}' ) assert left_high_offset_spec in task_prompt assert right_high_offset_spec in task_prompt assert ( - '"atomic_action_class":"PlaceAction","robot_name":"left_arm","control":"arm",' + '"atomic_action_class":"Place","robot_name":"left_arm","control":"arm",' '"target_pose":{"reference":"object","obj_name":"wicker_basket",' '"offset":[0.0,0.04,0.12]}' in task_prompt ) assert ( - '"atomic_action_class":"PlaceAction","robot_name":"right_arm","control":"arm",' + '"atomic_action_class":"Place","robot_name":"right_arm","control":"arm",' '"target_pose":{"reference":"object","obj_name":"wicker_basket",' '"offset":[0.0,-0.04,0.12]}' in task_prompt ) @@ -177,12 +181,12 @@ def test_action_agent_config_generator_uses_parallel_handoff( maxsplit=1, )[0] assert ( - '"robot_name":"left_arm","control":"arm","target_qpos":{"source":"initial"}' - in handoff_edge + '"atomic_action_class":"MoveJoints","robot_name":"left_arm","control":"arm",' + 
'"target_qpos":{"source":"initial"}' in handoff_edge ) assert ( - '"robot_name":"right_arm","control":"arm","target_pose":{"reference":"object"' - in handoff_edge + '"atomic_action_class":"MoveHeldObject","robot_name":"right_arm",' + '"control":"arm","target_object_pose":{"reference":"object"' in handoff_edge ) assert '"state":"close"' not in handoff_edge assert "left_arm_action: null" not in handoff_edge @@ -566,10 +570,8 @@ def fake_call_relative_task_llm(**kwargs): "Generate one deterministic nominal graph with exactly 4 nominal edges" in task_prompt ) - assert ( - '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in task_prompt - ) - assert '"atomic_action_class":"PlaceAction","robot_name":"right_arm"' in task_prompt + assert '"atomic_action_class":"PickUp","robot_name":"right_arm"' in task_prompt + assert '"atomic_action_class":"Place","robot_name":"right_arm"' in task_prompt assert '"obj_name":"apple_2"' in task_prompt assert "left_arm_action: null" in task_prompt assert "Generate exactly 10 nominal edges" not in task_prompt @@ -923,7 +925,7 @@ def fake_call_relative_task_llm(**kwargs): assert paths.summary["active_arm"] == "left_arm" task_prompt = paths.task_prompt.read_text(encoding="utf-8") - assert '"atomic_action_class":"PickUpAction","robot_name":"left_arm"' in task_prompt + assert '"atomic_action_class":"PickUp","robot_name":"left_arm"' in task_prompt assert '"obj_name":"apple_1"' in task_prompt assert "right_arm_action: null" in task_prompt @@ -967,9 +969,7 @@ def fake_call_relative_task_llm(**kwargs): assert paths.summary["active_arm"] == "right_arm" task_prompt = paths.task_prompt.read_text(encoding="utf-8") - assert ( - '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in task_prompt - ) + assert '"atomic_action_class":"PickUp","robot_name":"right_arm"' in task_prompt assert '"obj_name":"apple_2"' in task_prompt assert "left_arm_action: null" in task_prompt @@ -1022,14 +1022,167 @@ def 
fake_call_relative_task_llm(**kwargs): assert success["support"] == "pad" atom_actions = paths.atom_actions.read_text(encoding="utf-8") - assert atom_actions.count('"atomic_action_class":"PickUpAction"') == 1 - assert ( - '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in atom_actions - ) + assert atom_actions.count('"atomic_action_class":"PickUp"') == 1 + assert '"atomic_action_class":"PickUp","robot_name":"right_arm"' in atom_actions assert '"obj_name":"cup"' in atom_actions assert _stable_summary(paths.summary)["relation"] == "on" +def test_relative_orientation_intent_generates_horizontal_move_held_object( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "15_Move Stapler Pad_gym_project" + for rel_path in ( + "mesh_assets/table/table_0.glb", + "mesh_assets/pad/colored_pad_1.glb", + "mesh_assets/stapler/stapler_1.glb", + ): + _write_minimal_glb(project_dir / rel_path, _default_mesh_vertices()) + + gym_config = { + "id": "Image2Tabletop-15-v0", + "background": [ + _mesh_object( + "table_0", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, -0.05], + [0.0, 0.0, 0.0], + ), + _mesh_object( + "colored_pad_1", + "mesh_assets/pad/colored_pad_1.glb", + [0.3, 0.0, 0.0], + [0.0, 0.0, 0.0], + ), + ], + "rigid_object": [ + _mesh_object( + "stapler_1", + "mesh_assets/stapler/stapler_1.glb", + [0.0, 0.1, 0.0], + [0.0, 0.0, 0.0], + ) + ], + } + gym_config["background"][1]["body_scale"] = [1.2, 1.0, 0.3] + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + def fake_call_relative_task_llm(**kwargs): + assert ( + kwargs["task_description"] + == "使用合适的机械臂将订书机水平摆正到彩色垫子上。" + ) + return { + "moved_object": "stapler_1", + "reference_object": "colored_pad_1", + "goal_relation": "on", + "arm": "auto", + "orientation_goal": "horizontal", + "orientation_reference": "reference_object", + "task_prompt_summary": "Place the stapler horizontally on the colored pad.", + } + + 
monkeypatch.setattr( + action_agent_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_stapler_pad_agent", + task_name="Demo3_Text", + task_description="使用合适的机械臂将订书机水平摆正到彩色垫子上。", + target_body_scale=0.8, + prewarm_coacd_cache=False, + ) + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + for text in (task_prompt, atom_actions): + assert '"atomic_action_class":"MoveHeldObject"' in text + assert '"target_object_pose":{"reference":"object"' in text + assert '"obj_name":"colored_pad"' in text + assert '"orientation_goal":"horizontal"' in text + assert '"align_to":"colored_pad"' in text + + summary = _stable_summary(paths.summary) + assert summary["orientation_goal"] == "horizontal" + assert summary["orientation_align_to"] == "colored_pad" + + +def test_relative_orientation_upright_does_not_emit_align_to( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "left_of", + "orientation_goal": "upright", + "orientation_reference": "none", + "task_prompt_summary": "Move apple_2 upright to the left of basket_3.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_upright_relative_agent", + task_description="把 apple_2 扶正后放到 basket_3 左边", + prewarm_coacd_cache=False, + ) + + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert '"orientation_goal":"upright"' in atom_actions + assert '"align_to"' not in atom_actions + assert 
_stable_summary(paths.summary)["orientation_goal"] == "upright" + assert paths.summary["orientation_align_to"] is None + + +def test_relative_orientation_rejects_invalid_enum( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "left_of", + "orientation_goal": "diagonal", + "task_prompt_summary": "Invalid orientation.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + with pytest.raises(ValueError, match="Unsupported orientation_goal"): + generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_invalid_orientation_agent", + task_description="把 apple_2 斜着放到 basket_3 左边", + prewarm_coacd_cache=False, + ) + + def test_task_description_allows_single_rigid_with_background_reference( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -1223,30 +1376,26 @@ def fake_call_relative_task_llm(**kwargs): task_prompt ) assert ( - 'left_arm_action: {"atomic_action_class":"PickUpAction","robot_name":"left_arm"' + 'left_arm_action: {"atomic_action_class":"PickUp","robot_name":"left_arm"' in task_prompt ) assert ( - 'right_arm_action: {"atomic_action_class":"PickUpAction","robot_name":"right_arm"' + 'right_arm_action: {"atomic_action_class":"PickUp","robot_name":"right_arm"' in task_prompt ) assert ( '"robot_name":"right_arm","control":"hand","target_qpos":{"source":"gripper_state","state":"close"}' in task_prompt ) - assert '"atomic_action_class":"PlaceAction","robot_name":"left_arm"' in task_prompt - assert '"atomic_action_class":"PlaceAction","robot_name":"right_arm"' in task_prompt + assert '"atomic_action_class":"Place","robot_name":"left_arm"' in task_prompt + assert '"atomic_action_class":"Place","robot_name":"right_arm"' in task_prompt 
assert "The inactive arm must remain null" not in task_prompt assert "Both arms participate" in basic_background assert "left_arm moves `apple_2`" in basic_background assert "right_arm moves `apple_1`" in basic_background - assert ( - '"atomic_action_class":"PickUpAction","robot_name":"left_arm"' in atom_actions - ) + assert '"atomic_action_class":"PickUp","robot_name":"left_arm"' in atom_actions assert '"obj_name":"apple_2"' in atom_actions - assert ( - '"atomic_action_class":"PickUpAction","robot_name":"right_arm"' in atom_actions - ) + assert '"atomic_action_class":"PickUp","robot_name":"right_arm"' in atom_actions assert '"obj_name":"apple_1"' in atom_actions @@ -1437,11 +1586,11 @@ def fake_call_arrangement_task_llm(**kwargs): assert "Generate one deterministic nominal graph with exactly 12 nominal edges" in ( task_prompt ) - assert task_prompt.count('"atomic_action_class":"PickUpAction"') == 3 - assert task_prompt.count('"atomic_action_class":"PlaceAction"') == 3 + assert task_prompt.count('"atomic_action_class":"PickUp"') == 3 + assert task_prompt.count('"atomic_action_class":"Place"') == 3 assert task_prompt.count('"reference":"absolute"') >= 6 - assert atom_actions.count('"atomic_action_class":"PickUpAction"') == 3 - assert atom_actions.count('"atomic_action_class":"PlaceAction"') == 3 + assert atom_actions.count('"atomic_action_class":"PickUp"') == 3 + assert atom_actions.count('"atomic_action_class":"Place"') == 3 def test_dual_inside_same_container_uses_container_long_axis_slots( @@ -1525,12 +1674,12 @@ def fake_call_relative_task_llm(**kwargs): assert '"offset":[-0.04,0.0,0.22]' in text assert '"offset":[0.04,0.0,0.22]' in text assert ( - '"atomic_action_class":"PlaceAction","robot_name":"left_arm",' + '"atomic_action_class":"Place","robot_name":"left_arm",' '"control":"arm","target_pose":{"reference":"object",' '"obj_name":"wicker_basket","offset":[-0.04,0.0,0.12]}' in text ) assert ( - '"atomic_action_class":"PlaceAction","robot_name":"right_arm",' + 
'"atomic_action_class":"Place","robot_name":"right_arm",' '"control":"arm","target_pose":{"reference":"object",' '"obj_name":"wicker_basket","offset":[0.04,0.0,0.12]}' in text ) @@ -2001,11 +2150,21 @@ def _assert_normalized_obj_path(fpath: str) -> None: def _stable_summary(summary: dict) -> dict: - return { + stable = { key: value for key, value in summary.items() if key not in {"normalized_meshes", "coacd_cache"} } + if stable.get("orientation_goal") == "preserve": + stable.pop("orientation_goal", None) + if stable.get("orientation_align_to") is None: + stable.pop("orientation_align_to", None) + for placement in stable.get("placements", []): + if placement.get("orientation_goal") == "preserve": + placement.pop("orientation_goal", None) + if placement.get("orientation_align_to") is None: + placement.pop("orientation_align_to", None) + return stable def _obj_vertices(path: Path) -> list[tuple[float, float, float]]: From 59e5100d7c36c5f7213be1a624e2f805c46ce071 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Fri, 26 Jun 2026 21:56:18 +0800 Subject: [PATCH 24/33] Native relative pose-sensitive release flow --- .../generation/config_types.py | 2 + .../generation/prompt_builders.py | 367 +++++++++++++----- .../generation/relative_geometry.py | 5 + .../generation/relative_spec.py | 98 ++++- .../prompts/atom_actions.txt | 20 +- .../runtime/atom_actions.py | 153 ++++++-- .../test_backend_atomic_runtime.py | 283 +++++++++++++- .../test_ur5_basket_config_generation.py | 120 +++++- 8 files changed, 894 insertions(+), 154 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py index 6fde7964..a3db87d5 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py @@ -101,6 +101,7 @@ class _RelativePlacementStepSpec: 
release_position: list[float] | None = None high_position: list[float] | None = None orientation_goal: str = "preserve" + orientation_axis: str = "none" orientation_align_to_runtime_uid: str | None = None @@ -124,6 +125,7 @@ class _RelativePlacementSpec: release_position: list[float] | None = None high_position: list[float] | None = None orientation_goal: str = "preserve" + orientation_axis: str = "none" orientation_align_to_runtime_uid: str | None = None diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 6e1ef0f3..23552915 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -38,6 +38,8 @@ _BASKET_LEFT_RELEASE_OFFSET_Y = 0.04 _BASKET_RIGHT_RELEASE_OFFSET_Y = -0.04 _PLACE_LIFT_HEIGHT = 0.10 +_RELEASE_ONLY_PLACE_SAMPLE_INTERVAL = 10 +_EMPTY_HAND_RETREAT_SAMPLE_INTERVAL = 30 _RELATIVE_COORDINATE_CONVENTION = """Coordinate convention for relative placement: - `left_of` means positive world y relative to the reference object. - `right_of` means negative world y relative to the reference object. 
@@ -76,6 +78,7 @@ class _RelativePlacementLike(Protocol): high_position: Sequence[float] | None release_position: Sequence[float] | None orientation_goal: str + orientation_axis: str orientation_align_to_runtime_uid: str | None @@ -283,23 +286,69 @@ def make_relative_task_prompt( active_slot = f"{spec.active_side}_arm_action" action_sketch = _format_action_sketch(spec.action_sketch) pick_spec = _format_pick_up_spec(active_arm, spec.moved_runtime_uid) + initial_spec = _format_initial_qpos_spec(active_arm, sample_interval=30) + reference_line = _relative_reference_line(spec) + final_planning_rule = _relative_final_planning_rule(project_name, spec) + high_step_label = _relative_pose_step_label(spec, "high staging") + release_step_label = _relative_pose_step_label(spec, "release") high_spec = _format_relative_pose_spec( active_arm, spec, pose_kind="high", sample_interval=45, ) - place_spec = _format_relative_place_spec( - active_arm, - spec, - sample_interval=80, - lift_height=_PLACE_LIFT_HEIGHT, - ) - initial_spec = _format_initial_qpos_spec(active_arm, sample_interval=30) - reference_line = _relative_reference_line(spec) - final_planning_rule = _relative_final_planning_rule(project_name, spec) - high_step_label = _relative_pose_step_label(spec, "high staging") - release_step_label = _relative_pose_step_label(spec, "release") + pose_sensitive = _is_pose_sensitive_placement(spec) + if pose_sensitive: + release_move_spec = _format_relative_pose_spec( + active_arm, + spec, + pose_kind="release", + sample_interval=45, + ) + place_spec = _format_release_only_place_spec(active_arm) + retreat_spec = _format_empty_hand_retreat_spec(active_arm) + edge_count = 6 + release_instruction = f"""3. Move the held object down to the {release_step_label} object pose: + - {active_slot}: {release_move_spec} + - {inactive_slot}: null + +4. Release the held object in-place without moving the object pose: + - {active_slot}: {place_spec} + - {inactive_slot}: null + +5. 
Retreat the now-empty end-effector upward: + - {active_slot}: {retreat_spec} + - {inactive_slot}: null + +6. Return the active arm to its initial pose: + - {active_slot}: {initial_spec} + - {inactive_slot}: null""" + release_rule = ( + "For this pose-sensitive placement, `MoveHeldObject` must move the " + "object all the way down to the final release object pose. The " + "following `Place` must be the exact relative-zero release-only spec " + "shown below so it opens the gripper without re-planning a new " + "placement pose." + ) + else: + place_spec = _format_relative_place_spec( + active_arm, + spec, + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, + ) + edge_count = 4 + release_instruction = f"""3. Place the held object at the {release_step_label} pose: + - {active_slot}: {place_spec} + - {inactive_slot}: null + +4. Return the active arm to its initial pose: + - {active_slot}: {initial_spec} + - {inactive_slot}: null""" + release_rule = ( + "Use `Place` for the release-place step so lowering, gripper " + "opening, and upward retreat remain one atomic action." + ) return f"""Task: {task_name}: {spec.task_prompt_summary} @@ -321,11 +370,10 @@ def make_relative_task_prompt( {_RELATIVE_COORDINATE_CONVENTION} -Generate one deterministic nominal graph with exactly 4 nominal edges. Use only -the atomic action class JSON specs shown below. Do not add recovery, monitor, search, -alignment, or extra lift edges. Use `Place` for the release-place step so -lowering, gripper opening, and upward retreat remain one atomic action. The -inactive arm must remain null in every edge. +Generate one deterministic nominal graph with exactly {edge_count} nominal edges. +Use only the atomic action class JSON specs shown below. Do not add recovery, +monitor, search, alignment, or extra lift edges. {release_rule} The inactive arm +must remain null in every edge. 1. 
Pick up the moved object: - {active_slot}: {pick_spec} @@ -335,13 +383,7 @@ def make_relative_task_prompt( - {active_slot}: {high_spec} - {inactive_slot}: null -3. Place the held object at the {release_step_label} pose: - - {active_slot}: {place_spec} - - {inactive_slot}: null - -4. Return the active arm to its initial pose: - - {active_slot}: {initial_spec} - - {inactive_slot}: null +{release_instruction} Final state: `{spec.moved_runtime_uid}` must be {_relative_relation_phrase(spec.relation)} `{spec.reference_runtime_uid}`. @@ -374,18 +416,6 @@ def _make_dual_relative_task_prompt( pose_kind="high", sample_interval=45, ) - first_place_spec = _format_relative_place_spec( - first_arm, - first, - sample_interval=80, - lift_height=_PLACE_LIFT_HEIGHT, - ) - second_place_spec = _format_relative_place_spec( - second_arm, - second, - sample_interval=80, - lift_height=_PLACE_LIFT_HEIGHT, - ) first_close_spec = _format_gripper_spec( first_arm, "close", @@ -407,6 +437,57 @@ def _make_dual_relative_task_prompt( first_reference_line = _relative_reference_line(first) second_reference_line = _relative_reference_line(second) final_planning_rule = _dual_relative_final_planning_rule(project_name, spec) + first_release_edges = _dual_relative_release_edge_blocks( + placement=first, + active_arm=first_arm, + active_slot=first_slot, + waiting_slot=second_slot, + waiting_action=second_close_spec, + ) + second_release_edges = _dual_relative_release_edge_blocks( + placement=second, + active_arm=second_arm, + active_slot=second_slot, + waiting_slot=first_slot, + waiting_action=None, + ) + edge_blocks = [ + ( + "Pick up both moved objects simultaneously", + { + first_slot: first_pick_spec, + second_slot: second_pick_spec, + }, + ), + ( + f"Move `{first.moved_runtime_uid}` to the high staging pose while " + f"the other arm keeps holding `{second.moved_runtime_uid}`", + { + first_slot: first_high_spec, + second_slot: second_close_spec, + }, + ), + *first_release_edges, + ( + f"Return 
`{first_arm}` to its initial pose while moving " + f"`{second.moved_runtime_uid}` to the high staging pose", + { + first_slot: first_initial_spec, + second_slot: second_high_spec, + }, + ), + *second_release_edges, + ( + f"Return `{second_arm}` to its initial pose", + { + first_slot: None, + second_slot: second_initial_spec, + }, + ), + ] + edge_count = len(edge_blocks) + numbered_edges = _format_numbered_edge_blocks(edge_blocks) + release_rule = _dual_relative_release_rule(spec) return f"""Task: {task_name}: {spec.task_prompt_summary} @@ -431,36 +512,11 @@ def _make_dual_relative_task_prompt( {_RELATIVE_COORDINATE_CONVENTION} -Generate one deterministic nominal graph with exactly 6 nominal edges. Use only -the atomic action class JSON specs shown below. Do not add recovery, monitor, search, -alignment, or extra lift edges. Use `Place` for each release-place step so -lowering, gripper opening, and upward retreat remain one atomic action. - -1. Pick up both moved objects simultaneously: - - {first_slot}: {first_pick_spec} - - {second_slot}: {second_pick_spec} - -2. Move `{first.moved_runtime_uid}` to the high staging pose while the other arm - keeps holding `{second.moved_runtime_uid}`: - - {first_slot}: {first_high_spec} - - {second_slot}: {second_close_spec} - -3. Place `{first.moved_runtime_uid}` at the release pose: - - {first_slot}: {first_place_spec} - - {second_slot}: {second_close_spec} - -4. Return `{first_arm}` to its initial pose while moving `{second.moved_runtime_uid}` - to the high staging pose: - - {first_slot}: {first_initial_spec} - - {second_slot}: {second_high_spec} - -5. Place `{second.moved_runtime_uid}` at the release pose: - - {first_slot}: null - - {second_slot}: {second_place_spec} +Generate one deterministic nominal graph with exactly {edge_count} nominal edges. +Use only the atomic action class JSON specs shown below. Do not add recovery, +monitor, search, alignment, or extra lift edges. {release_rule} -6. 
Return `{second_arm}` to its initial pose: - - {first_slot}: null - - {second_slot}: {second_initial_spec} +{numbered_edges} Final state: `{first.moved_runtime_uid}` must be {_relative_relation_phrase(first.relation)} `{first.reference_runtime_uid}`, and @@ -470,6 +526,106 @@ def _make_dual_relative_task_prompt( """ +def _dual_relative_release_edge_blocks( + *, + placement: _RelativePlacementLike, + active_arm: str, + active_slot: str, + waiting_slot: str, + waiting_action: str | None, +) -> list[tuple[str, Mapping[str, str | None]]]: + waiting_value = waiting_action + if _is_pose_sensitive_placement(placement): + return [ + ( + f"Move `{placement.moved_runtime_uid}` down to the final " + "release object pose", + { + active_slot: _format_relative_pose_spec( + active_arm, + placement, + pose_kind="release", + sample_interval=45, + ), + waiting_slot: waiting_value, + }, + ), + ( + f"Release `{placement.moved_runtime_uid}` in-place without moving " + "the object pose", + { + active_slot: _format_release_only_place_spec(active_arm), + waiting_slot: waiting_value, + }, + ), + ( + f"Retreat `{active_arm}` upward after release", + { + active_slot: _format_empty_hand_retreat_spec(active_arm), + waiting_slot: waiting_value, + }, + ), + ] + + return [ + ( + f"Place `{placement.moved_runtime_uid}` at the release pose", + { + active_slot: _format_relative_place_spec( + active_arm, + placement, + sample_interval=80, + lift_height=_PLACE_LIFT_HEIGHT, + ), + waiting_slot: waiting_value, + }, + ) + ] + + +def _dual_relative_release_rule(spec: _RelativeSpecLike) -> str: + if any(_is_pose_sensitive_placement(placement) for placement in spec.placements): + return ( + "For pose-sensitive placements, `MoveHeldObject` must move the held " + "object all the way down to the final release object pose; the " + "following `Place` must be the exact relative-zero release-only spec " + "shown below, and then the empty hand retreats upward. 
For preserve " + "placements, keep the normal `Place` release-place action." + ) + return ( + "Use `Place` for each release-place step so lowering, gripper opening, " + "and upward retreat remain one atomic action." + ) + + +def _format_numbered_edge_blocks( + edge_blocks: Sequence[tuple[str, Mapping[str, str | None]]], +) -> str: + formatted_blocks = [] + for index, (title, actions) in enumerate(edge_blocks, start=1): + action_lines = "\n".join( + f" - {slot}: {action if action is not None else 'null'}" + for slot, action in actions.items() + ) + formatted_blocks.append(f"{index}. {title}:\n{action_lines}") + return "\n\n".join(formatted_blocks) + + +def _relative_release_action_patterns( + robot_name: str, + placement: _RelativePlacementLike, +) -> str: + if _is_pose_sensitive_placement(placement): + return f"""- Final release object pose: + {_format_relative_pose_spec(robot_name, placement, pose_kind="release", sample_interval=45)} +- Release-only Place: + {_format_release_only_place_spec(robot_name)} +- Empty-hand retreat: + {_format_empty_hand_retreat_spec(robot_name)}""" + return f"""- Place at the release pose: + {_format_relative_place_spec(robot_name, placement, sample_interval=80, lift_height=_PLACE_LIFT_HEIGHT)}""" + + def make_relative_basic_background( project_name: str, spec: _RelativeSpecLike, @@ -504,8 +660,9 @@ def make_relative_basic_background( {notes} The execution-stage LLM should generate graph JSON that grasps the moved object, -moves it to the configured high staging pose, places it at the release pose with -one `Place`, and returns the active arm to its initial pose. +moves it to the configured high staging pose, releases it at the final pose, and +returns the active arm to its initial pose. Pose-sensitive placements must use a +final `MoveHeldObject` object-pose move followed by release-only `Place`. 
""" @@ -540,10 +697,11 @@ def _make_dual_relative_basic_background( {notes} The execution-stage LLM should generate graph JSON that grasps both moved -objects, stages and places the first moved object with one `Place`, then -stages and places the second moved object while the first arm returns to its -initial pose. Each arm must release its moved object before returning to its -initial pose. +objects, stages and releases the first moved object, then stages and releases +the second moved object while the first arm returns to its initial pose. Each +arm must release its moved object before returning to its initial pose. +Pose-sensitive placements must use a final `MoveHeldObject` object-pose move +followed by release-only `Place`. """ @@ -559,12 +717,7 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: pose_kind="high", sample_interval=45, ) - place_spec = _format_relative_place_spec( - active_arm, - spec, - sample_interval=80, - lift_height=_PLACE_LIFT_HEIGHT, - ) + release_actions = _relative_release_action_patterns(active_arm, spec) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Relative Placement Use only the native atomic action class JSON specs shown below. 
The active arm @@ -576,8 +729,7 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: {_format_pick_up_spec(active_arm, spec.moved_runtime_uid)} - {_relative_pose_step_label(spec, "High staging")}: {high_spec} -- Place at the release pose: - {place_spec} +{release_actions} - Return to initial qpos: {_format_initial_qpos_spec(active_arm, sample_interval=30)} """ @@ -599,18 +751,8 @@ def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: pose_kind="high", sample_interval=45, ) - first_place_spec = _format_relative_place_spec( - first_arm, - first, - sample_interval=80, - lift_height=_PLACE_LIFT_HEIGHT, - ) - second_place_spec = _format_relative_place_spec( - second_arm, - second, - sample_interval=80, - lift_height=_PLACE_LIFT_HEIGHT, - ) + first_release_actions = _relative_release_action_patterns(first_arm, first) + second_release_actions = _relative_release_action_patterns(second_arm, second) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Dual-Arm Relative Placement Use only the native atomic action class JSON specs shown below. 
@@ -624,12 +766,10 @@ def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: {_format_pick_up_spec(second_arm, second.moved_runtime_uid)} - First high staging: {first_high_spec} -- First place action: - {first_place_spec} +{first_release_actions} - Second high staging: {second_high_spec} -- Second place action: - {second_place_spec} +{second_release_actions} - Keep a holding arm closed: {_format_gripper_spec("", "close", sample_interval=10)} - Return to initial qpos: @@ -929,6 +1069,7 @@ def _format_pose_object_spec( *, sample_interval: int, orientation_goal: str = "preserve", + orientation_axis: str = "none", align_to: str | None = None, ) -> str: x, y, z = offset @@ -937,6 +1078,7 @@ def _format_pose_object_spec( "obj_name": obj_name, "offset": [float(x), float(y), float(z)], "orientation_goal": orientation_goal, + "orientation_axis": orientation_axis, } if align_to is not None: target_object_pose["align_to"] = align_to @@ -994,6 +1136,7 @@ def _format_relative_pose_spec( position, sample_interval=sample_interval, orientation_goal=placement.orientation_goal, + orientation_axis=placement.orientation_axis, align_to=placement.orientation_align_to_runtime_uid, ) @@ -1004,6 +1147,7 @@ def _format_relative_pose_spec( offset, sample_interval=sample_interval, orientation_goal=placement.orientation_goal, + orientation_axis=placement.orientation_axis, align_to=placement.orientation_align_to_runtime_uid, ) @@ -1034,18 +1178,53 @@ def _format_relative_place_spec( ) +def _is_pose_sensitive_placement(placement: _RelativePlacementLike) -> bool: + return placement.orientation_goal != "preserve" + + +def _format_release_only_place_spec(robot_name: str) -> str: + return _format_place_spec( + robot_name, + { + "reference": "relative", + "offset": [0.0, 0.0, 0.0], + "frame": "world", + }, + sample_interval=_RELEASE_ONLY_PLACE_SAMPLE_INTERVAL, + lift_height=0.0, + ) + + +def _format_empty_hand_retreat_spec(robot_name: str) -> str: + return _compact_json( + { + 
"atomic_action_class": "MoveEndEffector", + "robot_name": robot_name, + "control": "arm", + "target_pose": { + "reference": "relative", + "offset": [0.0, 0.0, _PLACE_LIFT_HEIGHT], + "frame": "world", + }, + "cfg": {"sample_interval": _EMPTY_HAND_RETREAT_SAMPLE_INTERVAL}, + } + ) + + def _format_pose_absolute_spec( robot_name: str, position: Sequence[float], *, sample_interval: int, orientation_goal: str = "preserve", + orientation_axis: str = "none", align_to: str | None = None, ) -> str: target_object_pose = { "reference": "absolute", "position": [float(value) for value in position], "orientation_goal": orientation_goal, + "orientation_axis": orientation_axis, } if align_to is not None: target_object_pose["align_to"] = align_to diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py index f64152ea..4aee046d 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py @@ -130,6 +130,7 @@ def _with_self_relative_absolute_targets( release_position=primary.release_position, high_position=primary.high_position, orientation_goal=primary.orientation_goal, + orientation_axis=primary.orientation_axis, orientation_align_to_runtime_uid=primary.orientation_align_to_runtime_uid, ) @@ -161,6 +162,7 @@ def _with_self_relative_absolute_target( release_position=release_position, high_position=high_position, orientation_goal=placement.orientation_goal, + orientation_axis=placement.orientation_axis, orientation_align_to_runtime_uid=placement.orientation_align_to_runtime_uid, ) @@ -257,6 +259,7 @@ def _replace_relative_spec_placements( release_position=primary.release_position, high_position=primary.high_position, orientation_goal=primary.orientation_goal, + orientation_axis=primary.orientation_axis, orientation_align_to_runtime_uid=primary.orientation_align_to_runtime_uid, 
) @@ -387,6 +390,7 @@ def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]: "active_arm": f"{spec.active_side}_arm", "release_offset": spec.release_offset, "orientation_goal": spec.orientation_goal, + "orientation_axis": spec.orientation_axis, "orientation_align_to": spec.orientation_align_to_runtime_uid, } return { @@ -399,6 +403,7 @@ def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]: "active_arm": f"{placement.active_side}_arm", "release_offset": placement.release_offset, "orientation_goal": placement.orientation_goal, + "orientation_axis": placement.orientation_axis, "orientation_align_to": placement.orientation_align_to_runtime_uid, } for placement in spec.placements diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py index 839e3bf6..079e59d7 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py @@ -236,8 +236,9 @@ def _call_relative_task_llm( ' "goal_relation": ' '"inside|on|left_of|right_of|front_of|behind|front_left_of|back_left_of|front_right_of|back_right_of",\n' ' "arm": "left|right|auto",\n' - ' "orientation_goal": "preserve|upright|horizontal",\n' - ' "orientation_reference": "none|reference_object"\n' + ' "orientation_goal": "preserve|upright|lay_flat|axis_align",\n' + ' "orientation_reference": "none|world_axes|reference_object",\n' + ' "orientation_axis": "none|x|y|long_axis|short_axis"\n' " }\n" " ],\n" ' "task_prompt_summary": "",\n' @@ -286,12 +287,20 @@ def _call_relative_task_llm( "- If the task says to stack/place one object on another non-container " "support, use goal_relation='on'.\n" "- orientation_goal captures the held object's intended pose before " - "release. Use 'horizontal' for tasks like 水平摆正, 平放, 横放, or lay " - "flat. Use 'upright' for tasks like 扶正, 竖起来, or stand upright. 
" - "Use 'preserve' when no orientation change is requested.\n" - "- orientation_reference should be 'reference_object' when the object " - "should be aligned to the pad, box, container, or target support; " - "otherwise use 'none'.\n" + "release. Use 'upright' for tasks like 扶正, 竖起来, or stand upright. " + "Use 'lay_flat' for tasks like 平放, 横放, or lay flat. Use " + "'axis_align' for tasks like 水平摆正, 摆正, or aligning an object to a " + "pad, box, container, or support axis. Use 'preserve' when no " + "orientation change is requested.\n" + "- For axis_align, set orientation_reference='reference_object' and " + "orientation_axis='long_axis' when aligning an object such as a stapler " + "or shoe to the long side of a pad, box, or container. Use " + "orientation_axis='short_axis' only when the task explicitly asks for " + "the short side. Use orientation_reference='world_axes' with " + "orientation_axis='x' or 'y' only when the task explicitly specifies a " + "world/table axis.\n" + "- For preserve, upright, and lay_flat, use orientation_reference='none' " + "and orientation_axis='none'.\n" "- Do not return numeric offsets, object poses, scales, success JSON, " "robot config, or full prompt files. 
The generator computes those " "deterministically.\n\n" @@ -388,6 +397,7 @@ def _apply_relative_task_response( release_position=primary.release_position, high_position=primary.high_position, orientation_goal=primary.orientation_goal, + orientation_axis=primary.orientation_axis, orientation_align_to_runtime_uid=primary.orientation_align_to_runtime_uid, ) @@ -493,6 +503,12 @@ def _build_relative_placement_step( orientation_reference = _normalize_orientation_reference( entry.get("orientation_reference") ) + orientation_axis = _normalize_orientation_axis(entry.get("orientation_axis")) + _validate_orientation_fields( + orientation_goal=orientation_goal, + orientation_reference=orientation_reference, + orientation_axis=orientation_axis, + ) orientation_align_to_runtime_uid = ( reference_runtime_uid if orientation_reference == "reference_object" and not reference_is_initial_pose @@ -527,6 +543,7 @@ def _build_relative_placement_step( high_offset=high_offset, reference_is_initial_pose=reference_is_initial_pose, orientation_goal=orientation_goal, + orientation_axis=orientation_axis, orientation_align_to_runtime_uid=orientation_align_to_runtime_uid, ) @@ -673,11 +690,13 @@ def _normalize_orientation_goal(value: Any) -> str: return "preserve" if text in {"upright", "vertical", "stand_upright", "扶正", "竖直", "竖起来"}: return "upright" - if text in {"horizontal", "flat", "lay_flat", "level", "水平", "平放", "横放"}: - return "horizontal" + if text in {"lay_flat", "flat", "level", "平放", "横放"}: + return "lay_flat" + if text in {"axis_align", "align_axis", "cardinal_align", "水平摆正", "摆正"}: + return "axis_align" raise ValueError( f"Unsupported orientation_goal {value!r}; expected 'preserve', " - "'upright', or 'horizontal'." + "'upright', 'lay_flat', or 'axis_align'." 
) @@ -687,6 +706,8 @@ def _normalize_orientation_reference(value: Any) -> str: text = str(value).strip().lower().replace("-", "_").replace(" ", "_") if text in {"", "none", "null", "default", "no", "false", "无"}: return "none" + if text in {"world_axes", "world_axis", "world", "table_axes", "x_y_axes"}: + return "world_axes" if text in { "reference_object", "reference", @@ -701,10 +722,63 @@ def _normalize_orientation_reference(value: Any) -> str: return "reference_object" raise ValueError( f"Unsupported orientation_reference {value!r}; expected 'none' or " - "'reference_object'." + "'world_axes' or 'reference_object'." + ) + + +def _normalize_orientation_axis(value: Any) -> str: + if value is None: + return "none" + text = str(value).strip().lower().replace("-", "_").replace(" ", "_") + if text in {"", "none", "null", "default", "no", "false", "无"}: + return "none" + if text in {"x", "world_x", "x_axis", "world_x_axis"}: + return "x" + if text in {"y", "world_y", "y_axis", "world_y_axis"}: + return "y" + if text in {"long_axis", "long", "major_axis", "length", "长轴", "长边"}: + return "long_axis" + if text in {"short_axis", "short", "minor_axis", "width", "短轴", "短边"}: + return "short_axis" + raise ValueError( + f"Unsupported orientation_axis {value!r}; expected 'none', 'x', 'y', " + "'long_axis', or 'short_axis'." ) +def _validate_orientation_fields( + *, + orientation_goal: str, + orientation_reference: str, + orientation_axis: str, +) -> None: + if orientation_goal == "axis_align": + if orientation_reference == "world_axes": + if orientation_axis not in {"x", "y"}: + raise ValueError( + "axis_align with orientation_reference='world_axes' requires " + "orientation_axis 'x' or 'y'." + ) + return + if orientation_reference == "reference_object": + if orientation_axis not in {"long_axis", "short_axis"}: + raise ValueError( + "axis_align with orientation_reference='reference_object' " + "requires orientation_axis 'long_axis' or 'short_axis'." 
+ ) + return + raise ValueError( + "axis_align requires orientation_reference 'world_axes' or " + "'reference_object'." + ) + + if orientation_reference != "none" or orientation_axis != "none": + raise ValueError( + "preserve, upright, and lay_flat require orientation_reference='none' " + "and orientation_axis='none'." + ) + + def _relative_runtime_uid_mapping( rigid_objects: list[_SceneObject], ) -> dict[str, str]: diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt index 26f2f890..b068d85f 100644 --- a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt @@ -21,17 +21,25 @@ Use only these native atomic action classes: 2. `MoveHeldObject` - Use only after the same arm has successfully executed `PickUp`. - Moves the already-held object in the air without releasing it. + - For pose-sensitive placement, move the held object all the way to the final + release object pose before calling release-only `Place`. - Required target_object_pose: - {"reference": "object", "obj_name": "", "offset": [x, y, z], "orientation_goal": "preserve|upright|horizontal"} - {"reference": "absolute", "position": [x, y, z], "orientation_goal": "preserve|upright|horizontal"} - {"reference": "relative", "offset": [dx, dy, dz], "frame": "world|eef", "orientation_goal": "preserve|upright|horizontal"} - - For horizontal insertion into a container, add `"align_to": ""`. 
+ {"reference": "object", "obj_name": "", "offset": [x, y, z], "orientation_goal": "preserve|upright|lay_flat|axis_align", "orientation_axis": "none|x|y|long_axis|short_axis"} + {"reference": "absolute", "position": [x, y, z], "orientation_goal": "preserve|upright|lay_flat|axis_align", "orientation_axis": "none|x|y|long_axis|short_axis"} + {"reference": "relative", "offset": [dx, dy, dz], "frame": "world|eef", "orientation_goal": "preserve|upright|lay_flat|axis_align", "orientation_axis": "none|x|y|long_axis|short_axis"} + - For axis alignment to a pad, box, or container, add `"align_to": ""` and use `"orientation_axis": "long_axis"` or `"short_axis"`. + - For axis alignment to world/table axes, use `"orientation_axis": "x"` or `"y"` and omit `align_to`. - Typical cfg: {"sample_interval": 45} 3. `Place` - - Prefer this for placement because one action lowers, opens the gripper, - and retreats upward. + - For ordinary placement, prefer this because one action lowers, opens the + gripper, and retreats upward. + - For pose-sensitive placement after `MoveHeldObject` has already reached the + final release object pose, use release-only Place: + {"reference": "relative", "offset": [0.0, 0.0, 0.0], "frame": "world"} + with cfg {"sample_interval": 10, "lift_height": 0.0}, then retreat the + empty end-effector with `MoveEndEffector`. 
- Required target_pose: {"reference": "object", "obj_name": "", "offset": [x, y, z]} {"reference": "absolute", "position": [x, y, z]} diff --git a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py index 8d047231..1c18a823 100644 --- a/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py +++ b/embodichain/gen_sim/action_agent_pipeline/runtime/atom_actions.py @@ -98,7 +98,8 @@ *TARGET_SPEC_FIELDS, } SUPPORTED_POSE_REFERENCES = {"object", "absolute", "relative"} -SUPPORTED_OBJECT_ORIENTATION_GOALS = {"preserve", "upright", "horizontal"} +SUPPORTED_OBJECT_ORIENTATION_GOALS = {"preserve", "upright", "lay_flat", "axis_align"} +SUPPORTED_OBJECT_ORIENTATION_AXES = {"none", "x", "y", "long_axis", "short_axis"} SUPPORTED_QPOS_SOURCES = {"initial", "gripper_state", "joint_delta"} SUPPORTED_CFG_KEYS = { "sample_interval", @@ -366,9 +367,31 @@ def _validate_target_object_pose(target_object_pose: Mapping[str, Any]) -> None: "target_object_pose orientation_goal must be one of " f"{sorted(SUPPORTED_OBJECT_ORIENTATION_GOALS)}." ) + orientation_axis = target_object_pose.get("orientation_axis", "none") + if orientation_axis not in SUPPORTED_OBJECT_ORIENTATION_AXES: + raise ValueError( + "target_object_pose orientation_axis must be one of " + f"{sorted(SUPPORTED_OBJECT_ORIENTATION_AXES)}." + ) align_to = target_object_pose.get("align_to") if align_to is not None and (not isinstance(align_to, str) or not align_to): raise ValueError("target_object_pose align_to must be a non-empty string.") + if orientation_goal == "axis_align": + if align_to is None: + if orientation_axis not in {"x", "y"}: + raise ValueError( + "axis_align without align_to requires orientation_axis 'x' or 'y'." + ) + elif orientation_axis not in {"long_axis", "short_axis"}: + raise ValueError( + "axis_align with align_to requires orientation_axis 'long_axis' " + "or 'short_axis'." 
+ ) + elif orientation_axis != "none" or align_to is not None: + raise ValueError( + "preserve, upright, and lay_flat require orientation_axis='none' " + "and no align_to." + ) def _validate_target_pose_like( @@ -376,7 +399,7 @@ def _validate_target_pose_like( target_name: str, ) -> None: reference = target_pose.get("reference") - allowed_common = {"orientation_goal", "align_to"} + allowed_common = {"orientation_goal", "orientation_axis", "align_to"} if reference not in SUPPORTED_POSE_REFERENCES: raise ValueError( f"{target_name} reference must be one of {sorted(SUPPORTED_POSE_REFERENCES)}." @@ -958,7 +981,7 @@ def _resolve_held_object_pose_target( target_pose={ key: deepcopy(value) for key, value in target_pose_spec.items() - if key not in {"orientation_goal", "align_to"} + if key not in {"orientation_goal", "orientation_axis", "align_to"} }, cfg={}, ) @@ -1006,18 +1029,25 @@ def _resolve_object_orientation( local_secondary=up_axis, world_secondary=torch.tensor([1.0, 0.0, 0.0], device=env.robot.device), ) - if orientation_goal == "horizontal": - align_direction = _horizontal_alignment_direction( - env, - target_pose_spec.get("align_to"), - env.robot.device, - ) + if orientation_goal == "lay_flat": return _rotation_from_axis_targets( local_primary=long_axis, - world_primary=align_direction, + world_primary=torch.tensor([1.0, 0.0, 0.0], device=env.robot.device), local_secondary=up_axis, world_secondary=torch.tensor([0.0, 0.0, 1.0], device=env.robot.device), ) + if orientation_goal == "axis_align": + target_direction = _axis_align_target_direction( + env, + target_pose_spec, + env.robot.device, + ) + current_direction = current_rotation @ long_axis.to( + device=env.robot.device, dtype=torch.float32 + ) + return _yaw_aligned_rotation( + current_rotation, current_direction, target_direction + ) raise ValueError(f"Unsupported orientation_goal: {orientation_goal}.") @@ -1043,26 +1073,93 @@ def _principal_local_axes(vertices: torch.Tensor) -> torch.Tensor: return axes 
-def _horizontal_alignment_direction(env, align_to: str | None, device) -> torch.Tensor: +def _axis_align_target_direction( + env, + target_pose_spec: Mapping[str, Any], + device, +) -> torch.Tensor: + orientation_axis = target_pose_spec.get("orientation_axis", "none") + align_to = target_pose_spec.get("align_to") if align_to: - target_obj = env.sim.get_rigid_object(align_to) - if target_obj is None: - raise ValueError(f"No rigid object found for align_to={align_to}.") - vertices = torch.as_tensor( - target_obj.get_vertices(env_ids=[0], scale=True)[0], - dtype=torch.float32, - device=device, + return _reference_object_axis_direction(env, align_to, orientation_axis, device) + if orientation_axis == "x": + return torch.tensor([1.0, 0.0, 0.0], dtype=torch.float32, device=device) + if orientation_axis == "y": + return torch.tensor([0.0, 1.0, 0.0], dtype=torch.float32, device=device) + raise ValueError( + "axis_align without align_to requires orientation_axis 'x' or 'y'." + ) + + +def _reference_object_axis_direction( + env, + align_to: str, + orientation_axis: str, + device, +) -> torch.Tensor: + if orientation_axis not in {"long_axis", "short_axis"}: + raise ValueError( + "Reference-object axis alignment requires orientation_axis " + "'long_axis' or 'short_axis'." 
) - extents = vertices.max(dim=0).values - vertices.min(dim=0).values - axis_index = 0 if extents[0] >= extents[1] else 1 - pose = _ensure_pose_tensor(target_obj.get_local_pose(to_matrix=True), device) - direction = pose[:3, axis_index] - direction = direction.clone() - direction[2] = 0.0 - norm = torch.linalg.norm(direction) - if float(norm) > 1e-6: - return direction / norm - return torch.tensor([1.0, 0.0, 0.0], dtype=torch.float32, device=device) + target_obj = env.sim.get_rigid_object(align_to) + if target_obj is None: + raise ValueError(f"No rigid object found for align_to={align_to}.") + vertices = torch.as_tensor( + target_obj.get_vertices(env_ids=[0], scale=True)[0], + dtype=torch.float32, + device=device, + ) + extents = vertices.max(dim=0).values - vertices.min(dim=0).values + axis_index = 0 if extents[0] >= extents[1] else 1 + if orientation_axis == "short_axis": + axis_index = 1 - axis_index + pose = _ensure_pose_tensor(target_obj.get_local_pose(to_matrix=True), device) + direction = pose[:3, axis_index].clone() + direction[2] = 0.0 + norm = torch.linalg.norm(direction) + if float(norm) < 1e-6: + raise ValueError(f"Reference object {align_to!r} has no valid XY axis.") + return direction / norm + + +def _yaw_aligned_rotation( + current_rotation: torch.Tensor, + current_direction: torch.Tensor, + target_direction: torch.Tensor, +) -> torch.Tensor: + device = current_rotation.device + current_xy = current_direction.to(device=device, dtype=torch.float32).clone() + target_xy = target_direction.to(device=device, dtype=torch.float32).clone() + current_xy[2] = 0.0 + target_xy[2] = 0.0 + current_xy = _normalize_vector(current_xy) + target_xy = _normalize_vector(target_xy) + same_delta = _signed_yaw_delta(current_xy, target_xy) + opposite_delta = _signed_yaw_delta(current_xy, -target_xy) + delta = ( + same_delta + if torch.abs(same_delta) <= torch.abs(opposite_delta) + else opposite_delta + ) + return _yaw_rotation_matrix(delta, device) @ current_rotation + + 
+def _signed_yaw_delta(source: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + cross_z = source[0] * target[1] - source[1] * target[0] + dot = source[0] * target[0] + source[1] * target[1] + return torch.atan2(cross_z, dot) + + +def _yaw_rotation_matrix(delta: torch.Tensor, device) -> torch.Tensor: + c = torch.cos(delta) + s = torch.sin(delta) + rotation = torch.eye(3, dtype=torch.float32, device=device) + rotation[0, 0] = c + rotation[0, 1] = -s + rotation[1, 0] = s + rotation[1, 1] = c + return rotation def _rotation_from_axis_targets( diff --git a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py index 34d5c046..b1335142 100644 --- a/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py +++ b/tests/gen_sim/action_agent_pipeline/test_backend_atomic_runtime.py @@ -77,21 +77,28 @@ def get_joint_ids(self, name: str): class _FakeObject: cfg = SimpleNamespace(shape=SimpleNamespace(fpath="/tmp/fake.obj")) - def __init__(self, xyz): + def __init__(self, xyz, *, yaw_degrees: float = 0.0, extents=(0.3, 0.1, 0.05)): self._pose = torch.eye(4) self._pose[:3, 3] = torch.tensor(xyz, dtype=torch.float32) + yaw = torch.deg2rad(torch.tensor(float(yaw_degrees))) + self._pose[0, 0] = torch.cos(yaw) + self._pose[0, 1] = -torch.sin(yaw) + self._pose[1, 0] = torch.sin(yaw) + self._pose[1, 1] = torch.cos(yaw) + self._extents = torch.tensor(extents, dtype=torch.float32) def get_local_pose(self, to_matrix: bool = True): return self._pose.unsqueeze(0) def get_vertices(self, env_ids=None, scale: bool = True): + x, y, z = self._extents.tolist() return [ torch.tensor( [ [0.0, 0.0, 0.0], - [0.3, 0.0, 0.0], - [0.0, 0.1, 0.0], - [0.0, 0.0, 0.05], + [x, 0.0, 0.0], + [0.0, y, 0.0], + [0.0, 0.0, z], ] ) ] @@ -105,7 +112,15 @@ def get_body_scale(self, env_ids=None): class _FakeSim: def __init__(self): - self.objects = {"apple": _FakeObject([0.4, -0.2, 0.1])} + self.objects = { + "apple": 
_FakeObject([0.4, -0.2, 0.1]), + "pad_x": _FakeObject([0.4, 0.0, 0.0], extents=(0.4, 0.1, 0.02)), + "pad_y": _FakeObject( + [0.4, 0.0, 0.0], + yaw_degrees=90.0, + extents=(0.4, 0.1, 0.02), + ), + } def get_rigid_object(self, uid: str): return self.objects.get(uid) @@ -374,6 +389,100 @@ def test_normalize_atomic_action_spec_rejects_orientation_field() -> None: ) +def test_move_held_object_defaults_orientation_axis_to_none() -> None: + normalized = normalize_atomic_action_spec( + { + "atomic_action_class": "MoveHeldObject", + "robot_name": "left_arm", + "control": "arm", + "target_object_pose": { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + }, + "cfg": {}, + } + ) + + target = normalized["target_object_pose"] + assert target.get("orientation_goal", "preserve") == "preserve" + assert target.get("orientation_axis", "none") == "none" + + +def test_move_held_object_rejects_legacy_horizontal_orientation_goal() -> None: + with pytest.raises(ValueError, match="orientation_goal"): + normalize_atomic_action_spec( + { + "atomic_action_class": "MoveHeldObject", + "robot_name": "left_arm", + "control": "arm", + "target_object_pose": { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + "orientation_goal": "horizontal", + }, + "cfg": {}, + } + ) + + +def test_move_held_object_rejects_invalid_axis_alignment_pairings() -> None: + base_spec = { + "atomic_action_class": "MoveHeldObject", + "robot_name": "left_arm", + "control": "arm", + "target_object_pose": { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + "orientation_goal": "axis_align", + }, + "cfg": {}, + } + + with pytest.raises(ValueError, match="without align_to"): + normalize_atomic_action_spec( + { + **base_spec, + "target_object_pose": { + **base_spec["target_object_pose"], + "orientation_axis": "long_axis", + }, + } + ) + with pytest.raises(ValueError, match="with align_to"): + normalize_atomic_action_spec( + { + **base_spec, + 
"target_object_pose": { + **base_spec["target_object_pose"], + "orientation_axis": "x", + "align_to": "pad_x", + }, + } + ) + + +def test_move_held_object_rejects_axis_for_non_axis_align_goals() -> None: + with pytest.raises(ValueError, match="orientation_axis='none'"): + normalize_atomic_action_spec( + { + "atomic_action_class": "MoveHeldObject", + "robot_name": "left_arm", + "control": "arm", + "target_object_pose": { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + "orientation_goal": "lay_flat", + "orientation_axis": "x", + }, + "cfg": {}, + } + ) + + def test_normalize_atomic_action_spec_rejects_pickup_pose_target() -> None: with pytest.raises(ValueError, match="PickUp requires control='arm'"): normalize_atomic_action_spec( @@ -763,6 +872,170 @@ def test_place_action_builds_place_cfg(monkeypatch) -> None: assert capture[0]["cfg"].lift_height == pytest.approx(0.06) +def _held_state_with_yaw( + env: _FakeEnv, + yaw_degrees: float, + *, + mesh_extents=(0.3, 0.1, 0.05), +) -> WorldState: + yaw = torch.deg2rad(torch.tensor(float(yaw_degrees))) + current_pose = torch.eye(4) + current_pose[0, 0] = torch.cos(yaw) + current_pose[0, 1] = -torch.sin(yaw) + current_pose[1, 0] = torch.sin(yaw) + current_pose[1, 1] = torch.cos(yaw) + semantics = atom_actions._build_object_semantics( + env, + {"obj_name": "apple", "affordance": "antipodal"}, + {"allow_grasp_annotation": True}, + ) + x, y, z = mesh_extents + semantics.geometry["mesh_vertices"] = torch.tensor( + [ + [0.0, 0.0, 0.0], + [x, 0.0, 0.0], + [0.0, y, 0.0], + [0.0, 0.0, z], + ], + dtype=torch.float32, + ) + semantics.entity = None + return WorldState( + last_qpos=env.robot.get_qpos().clone(), + held_object=HeldObjectState( + semantics=semantics, + object_to_eef=torch.eye(4).unsqueeze(0), + grasp_xpos=current_pose.unsqueeze(0), + ), + ) + + +def _resolved_held_object_direction( + env: _FakeEnv, + state: WorldState, + target_object_pose: dict, +) -> torch.Tensor: + target = 
atom_actions._resolve_held_object_pose_target( + env, + atom_actions.AtomicActionSpec( + atomic_action_class="MoveHeldObject", + robot_name="left_arm", + control="arm", + target_object_pose=target_object_pose, + cfg={}, + ), + state, + ) + direction = target[:3, 0].clone() + direction[2] = 0.0 + return direction / torch.linalg.norm(direction) + + +def test_axis_align_world_axes_preserves_roll_pitch_and_aligns_x_axis() -> None: + env = _FakeEnv() + state = _held_state_with_yaw(env, 37.0) + + direction = _resolved_held_object_direction( + env, + state, + { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + "orientation_goal": "axis_align", + "orientation_axis": "x", + }, + ) + + assert abs(float(torch.dot(direction, torch.tensor([1.0, 0.0, 0.0])))) == ( + pytest.approx(1.0) + ) + + +def test_axis_align_world_axes_aligns_y_axis() -> None: + env = _FakeEnv() + state = _held_state_with_yaw(env, 12.0) + + direction = _resolved_held_object_direction( + env, + state, + { + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + "orientation_goal": "axis_align", + "orientation_axis": "y", + }, + ) + + assert abs(float(torch.dot(direction, torch.tensor([0.0, 1.0, 0.0])))) == ( + pytest.approx(1.0) + ) + + +def test_axis_align_reference_object_long_and_short_axes() -> None: + env = _FakeEnv() + state = _held_state_with_yaw(env, 10.0) + + long_direction = _resolved_held_object_direction( + env, + state, + { + "reference": "object", + "obj_name": "pad_y", + "offset": [0.0, 0.0, 0.1], + "orientation_goal": "axis_align", + "orientation_axis": "long_axis", + "align_to": "pad_y", + }, + ) + short_direction = _resolved_held_object_direction( + env, + state, + { + "reference": "object", + "obj_name": "pad_y", + "offset": [0.0, 0.0, 0.1], + "orientation_goal": "axis_align", + "orientation_axis": "short_axis", + "align_to": "pad_y", + }, + ) + + assert abs(float(torch.dot(long_direction, torch.tensor([0.0, 1.0, 0.0])))) == ( + 
pytest.approx(1.0) + ) + assert abs(float(torch.dot(short_direction, torch.tensor([1.0, 0.0, 0.0])))) == ( + pytest.approx(1.0) + ) + + +def test_axis_align_selects_smallest_equivalent_yaw() -> None: + env = _FakeEnv() + state = _held_state_with_yaw(env, 170.0) + + target = atom_actions._resolve_held_object_pose_target( + env, + atom_actions.AtomicActionSpec( + atomic_action_class="MoveHeldObject", + robot_name="left_arm", + control="arm", + target_object_pose={ + "reference": "relative", + "offset": [0.0, 0.0, 0.1], + "frame": "world", + "orientation_goal": "axis_align", + "orientation_axis": "x", + }, + cfg={}, + ), + state, + ) + final_yaw = torch.rad2deg(torch.atan2(target[1, 0], target[0, 0])) + + assert float(abs(final_yaw)) == pytest.approx(180.0) + + def test_move_held_object_builds_cfg_and_object_pose_target(monkeypatch) -> None: env = _FakeEnv() capture = [] diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 1bbb0f33..b7ae49ac 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -147,13 +147,13 @@ def test_action_agent_config_generator_uses_parallel_handoff( '"atomic_action_class":"MoveHeldObject","robot_name":"left_arm",' '"control":"arm","target_object_pose":{"reference":"object",' '"obj_name":"wicker_basket","offset":[0.0,0.04,0.22],' - '"orientation_goal":"preserve"}' + '"orientation_goal":"preserve","orientation_axis":"none"}' ) right_high_offset_spec = ( '"atomic_action_class":"MoveHeldObject","robot_name":"right_arm",' '"control":"arm","target_object_pose":{"reference":"object",' '"obj_name":"wicker_basket","offset":[0.0,-0.04,0.22],' - '"orientation_goal":"preserve"}' + '"orientation_goal":"preserve","orientation_axis":"none"}' ) assert left_high_offset_spec in task_prompt assert right_high_offset_spec in task_prompt @@ 
-1028,7 +1028,7 @@ def fake_call_relative_task_llm(**kwargs): assert _stable_summary(paths.summary)["relation"] == "on" -def test_relative_orientation_intent_generates_horizontal_move_held_object( +def test_relative_orientation_intent_generates_axis_align_move_held_object( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -1081,8 +1081,9 @@ def fake_call_relative_task_llm(**kwargs): "reference_object": "colored_pad_1", "goal_relation": "on", "arm": "auto", - "orientation_goal": "horizontal", + "orientation_goal": "axis_align", "orientation_reference": "reference_object", + "orientation_axis": "long_axis", "task_prompt_summary": "Place the stapler horizontally on the colored pad.", } @@ -1103,15 +1104,37 @@ def fake_call_relative_task_llm(**kwargs): task_prompt = paths.task_prompt.read_text(encoding="utf-8") atom_actions = paths.atom_actions.read_text(encoding="utf-8") + summary = _stable_summary(paths.summary) + active_arm = summary["active_arm"] + release_offset_json = json.dumps( + summary["release_offset"], ensure_ascii=False, separators=(",", ":") + ) + assert ( + "Generate one deterministic nominal graph with exactly 6 nominal edges" + in task_prompt + ) for text in (task_prompt, atom_actions): assert '"atomic_action_class":"MoveHeldObject"' in text assert '"target_object_pose":{"reference":"object"' in text assert '"obj_name":"colored_pad"' in text - assert '"orientation_goal":"horizontal"' in text + assert f'"offset":{release_offset_json}' in text + assert '"orientation_goal":"axis_align"' in text + assert '"orientation_axis":"long_axis"' in text assert '"align_to":"colored_pad"' in text + assert ( + f'"atomic_action_class":"Place","robot_name":"{active_arm}",' + '"control":"arm","target_pose":{"reference":"relative",' + '"offset":[0.0,0.0,0.0],"frame":"world"},' + '"cfg":{"sample_interval":10,"lift_height":0.0}' in text + ) + assert ( + f'"atomic_action_class":"MoveEndEffector","robot_name":"{active_arm}",' + 
'"control":"arm","target_pose":{"reference":"relative",' + '"offset":[0.0,0.0,0.1],"frame":"world"}' in text + ) - summary = _stable_summary(paths.summary) - assert summary["orientation_goal"] == "horizontal" + assert summary["orientation_goal"] == "axis_align" + assert summary["orientation_axis"] == "long_axis" assert summary["orientation_align_to"] == "colored_pad" @@ -1145,10 +1168,19 @@ def fake_call_relative_task_llm(**kwargs): prewarm_coacd_cache=False, ) + task_prompt = paths.task_prompt.read_text(encoding="utf-8") atom_actions = paths.atom_actions.read_text(encoding="utf-8") - assert '"orientation_goal":"upright"' in atom_actions - assert '"align_to"' not in atom_actions + for text in (task_prompt, atom_actions): + assert '"orientation_goal":"upright"' in text + assert '"orientation_axis":"none"' in text + assert '"align_to"' not in text + assert ( + '"target_pose":{"reference":"relative","offset":[0.0,0.0,0.0],' + '"frame":"world"},"cfg":{"sample_interval":10,"lift_height":0.0}' in text + ) + assert '"atomic_action_class":"MoveEndEffector"' in text assert _stable_summary(paths.summary)["orientation_goal"] == "upright" + assert paths.summary["orientation_axis"] == "none" assert paths.summary["orientation_align_to"] is None @@ -1183,6 +1215,72 @@ def fake_call_relative_task_llm(**kwargs): ) +def test_relative_orientation_rejects_invalid_reference_axis_pairing( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "left_of", + "orientation_goal": "axis_align", + "orientation_reference": "reference_object", + "orientation_axis": "x", + "task_prompt_summary": "Invalid axis pairing.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + with 
pytest.raises(ValueError, match="reference_object"): + generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_invalid_axis_pairing_agent", + task_description="把 apple_2 水平摆正到 basket_3 左边", + prewarm_coacd_cache=False, + ) + + +def test_relative_orientation_rejects_legacy_horizontal_goal( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "1790000000_gym_project" + _write_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "apple_2", + "reference_object": "basket_3", + "goal_relation": "left_of", + "orientation_goal": "horizontal", + "orientation_reference": "reference_object", + "orientation_axis": "long_axis", + "task_prompt_summary": "Legacy orientation.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + with pytest.raises(ValueError, match="Unsupported orientation_goal"): + generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_legacy_horizontal_agent", + task_description="把 apple_2 水平摆正到 basket_3 左边", + prewarm_coacd_cache=False, + ) + + def test_task_description_allows_single_rigid_with_background_reference( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -2157,11 +2255,15 @@ def _stable_summary(summary: dict) -> dict: } if stable.get("orientation_goal") == "preserve": stable.pop("orientation_goal", None) + if stable.get("orientation_axis") == "none": + stable.pop("orientation_axis", None) if stable.get("orientation_align_to") is None: stable.pop("orientation_align_to", None) for placement in stable.get("placements", []): if placement.get("orientation_goal") == "preserve": placement.pop("orientation_goal", None) + if placement.get("orientation_axis") == "none": + placement.pop("orientation_axis", None) if placement.get("orientation_align_to") is None: placement.pop("orientation_align_to", None) return stable From 
f75c295cc11ca16ef6e5a5b14efeb5b8df467f50 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sat, 27 Jun 2026 11:39:11 +0800 Subject: [PATCH 25/33] Fix pose-sensitive relative release height --- .../generation/action_agent_config.py | 4 + .../generation/prompt_builders.py | 171 ++++++++++++------ .../generation/relative_geometry.py | 112 ++++++++++++ .../generation/relative_spec.py | 11 +- .../prompts/atom_actions.txt | 6 +- .../test_ur5_basket_config_generation.py | 104 ++++++++++- 6 files changed, 351 insertions(+), 57 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py index 63321003..c20b1273 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py @@ -98,6 +98,7 @@ _resolve_table_mesh_world_zmax, ) from embodichain.gen_sim.action_agent_pipeline.generation.relative_geometry import ( + _POSE_SENSITIVE_STAGING_Z_DELTA, _STAGING_Z_DELTA, _inside_container_axis_offsets, _inside_container_slot_axis_and_distance, @@ -106,6 +107,7 @@ _relative_release_offset, _side_relation_xy_offsets, _with_inside_container_slot_offsets, + _with_on_surface_release_offsets, _with_self_relative_absolute_targets, ) from embodichain.gen_sim.action_agent_pipeline.generation.relative_spec import ( @@ -252,6 +254,7 @@ def generate_action_agent_config_from_project( model=llm_model, release_offset_fn=_relative_release_offset, staging_z_delta=_STAGING_Z_DELTA, + pose_sensitive_staging_z_delta=_POSE_SENSITIVE_STAGING_Z_DELTA, task_llm_caller=_call_relative_task_llm, ) bundle = _build_relative_placement_bundle( @@ -701,6 +704,7 @@ def _build_relative_placement_bundle( _apply_tabletop_z_placement(gym_config, table_top_z) spec = _with_self_relative_absolute_targets(spec, gym_config) spec = 
_with_inside_container_slot_offsets(spec, gym_config) + spec = _with_on_surface_release_offsets(spec, gym_config) gym_config["env"]["extensions"] = _make_relative_extensions_config( spec, side_relation_xy_offsets=_side_relation_xy_offsets, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 23552915..7bd93667 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -40,6 +40,7 @@ _PLACE_LIFT_HEIGHT = 0.10 _RELEASE_ONLY_PLACE_SAMPLE_INTERVAL = 10 _EMPTY_HAND_RETREAT_SAMPLE_INTERVAL = 30 +_USE_PLACEMENT_ALIGN_TO = object() _RELATIVE_COORDINATE_CONVENTION = """Coordinate convention for relative placement: - `left_of` means positive world y relative to the reference object. - `right_of` means negative world y relative to the reference object. @@ -299,6 +300,21 @@ def make_relative_task_prompt( ) pose_sensitive = _is_pose_sensitive_placement(spec) if pose_sensitive: + safe_high_spec = _format_relative_pose_spec( + active_arm, + spec, + pose_kind="high", + sample_interval=45, + orientation_goal="preserve", + orientation_axis="none", + align_to=None, + ) + high_orientation_spec = _format_relative_pose_spec( + active_arm, + spec, + pose_kind="high", + sample_interval=45, + ) release_move_spec = _format_relative_pose_spec( active_arm, spec, @@ -307,28 +323,37 @@ def make_relative_task_prompt( ) place_spec = _format_release_only_place_spec(active_arm) retreat_spec = _format_empty_hand_retreat_spec(active_arm) - edge_count = 6 - release_instruction = f"""3. Move the held object down to the {release_step_label} object pose: + edge_count = 7 + release_instruction = f"""2. Move the held object up to the {high_step_label} pose without changing orientation: + - {active_slot}: {safe_high_spec} + - {inactive_slot}: null + +3. 
Adjust the held object orientation at the same safe high staging pose: + - {active_slot}: {high_orientation_spec} + - {inactive_slot}: null + +4. Move the held object down to the {release_step_label} object pose: - {active_slot}: {release_move_spec} - {inactive_slot}: null -4. Release the held object in-place without moving the object pose: +5. Release the held object in-place without moving the object pose: - {active_slot}: {place_spec} - {inactive_slot}: null -5. Retreat the now-empty end-effector upward: +6. Retreat the now-empty end-effector upward: - {active_slot}: {retreat_spec} - {inactive_slot}: null -6. Return the active arm to its initial pose: +7. Return the active arm to its initial pose: - {active_slot}: {initial_spec} - {inactive_slot}: null""" + high_instruction = release_instruction release_rule = ( - "For this pose-sensitive placement, `MoveHeldObject` must move the " - "object all the way down to the final release object pose. The " - "following `Place` must be the exact relative-zero release-only spec " - "shown below so it opens the gripper without re-planning a new " - "placement pose." + "For this pose-sensitive placement, first use `MoveHeldObject` to " + "lift the object to the safe high staging pose while preserving its " + "current orientation. Only then adjust orientation at that same high " + "pose, move down to the final release object pose, and use the exact " + "relative-zero release-only `Place` spec shown below." ) else: place_spec = _format_relative_place_spec( @@ -338,7 +363,11 @@ def make_relative_task_prompt( lift_height=_PLACE_LIFT_HEIGHT, ) edge_count = 4 - release_instruction = f"""3. Place the held object at the {release_step_label} pose: + high_instruction = f"""2. Move the held object to the {high_step_label} pose: + - {active_slot}: {high_spec} + - {inactive_slot}: null + +3. 
Place the held object at the {release_step_label} pose: - {active_slot}: {place_spec} - {inactive_slot}: null @@ -379,11 +408,7 @@ def make_relative_task_prompt( - {active_slot}: {pick_spec} - {inactive_slot}: null -2. Move the held object to the {high_step_label} pose: - - {active_slot}: {high_spec} - - {inactive_slot}: null - -{release_instruction} +{high_instruction} Final state: `{spec.moved_runtime_uid}` must be {_relative_relation_phrase(spec.relation)} `{spec.reference_runtime_uid}`. @@ -537,6 +562,35 @@ def _dual_relative_release_edge_blocks( waiting_value = waiting_action if _is_pose_sensitive_placement(placement): return [ + ( + f"Lift `{placement.moved_runtime_uid}` to the safe high staging " + "pose without changing orientation", + { + active_slot: _format_relative_pose_spec( + active_arm, + placement, + pose_kind="high", + sample_interval=45, + orientation_goal="preserve", + orientation_axis="none", + align_to=None, + ), + waiting_slot: waiting_value, + }, + ), + ( + f"Adjust `{placement.moved_runtime_uid}` orientation at the same " + "safe high staging pose", + { + active_slot: _format_relative_pose_spec( + active_arm, + placement, + pose_kind="high", + sample_interval=45, + ), + waiting_slot: waiting_value, + }, + ), ( f"Move `{placement.moved_runtime_uid}` down to the final " "release object pose", @@ -586,11 +640,13 @@ def _dual_relative_release_edge_blocks( def _dual_relative_release_rule(spec: _RelativeSpecLike) -> str: if any(_is_pose_sensitive_placement(placement) for placement in spec.placements): return ( - "For pose-sensitive placements, `MoveHeldObject` must move the held " - "object all the way down to the final release object pose; the " - "following `Place` must be the exact relative-zero release-only spec " - "shown below, and then the empty hand retreats upward. For preserve " - "placements, keep the normal `Place` release-place action." 
+ "For pose-sensitive placements, first lift the held object to the " + "safe high staging pose with orientation preserved, then adjust " + "orientation at the same high pose before moving down to the final " + "release object pose. The following `Place` must be the exact " + "relative-zero release-only spec shown below, and then the empty " + "hand retreats upward. For preserve placements, keep the normal " + "`Place` release-place action." ) return ( "Use `Place` for each release-place step so lowering, gripper opening, " @@ -626,6 +682,19 @@ def _relative_release_action_patterns( {_format_relative_place_spec(robot_name, placement, sample_interval=80, lift_height=_PLACE_LIFT_HEIGHT)}""" +def _relative_high_action_patterns( + robot_name: str, + placement: _RelativePlacementLike, +) -> str: + if _is_pose_sensitive_placement(placement): + return f"""- Safe high staging without orientation change: + {_format_relative_pose_spec(robot_name, placement, pose_kind="high", sample_interval=45, orientation_goal="preserve", orientation_axis="none", align_to=None)} +- High staging orientation adjustment: + {_format_relative_pose_spec(robot_name, placement, pose_kind="high", sample_interval=45)}""" + return f"""- {_relative_pose_step_label(placement, "High staging")}: + {_format_relative_pose_spec(robot_name, placement, pose_kind="high", sample_interval=45)}""" + + def make_relative_basic_background( project_name: str, spec: _RelativeSpecLike, @@ -662,7 +731,9 @@ def make_relative_basic_background( The execution-stage LLM should generate graph JSON that grasps the moved object, moves it to the configured high staging pose, releases it at the final pose, and returns the active arm to its initial pose. Pose-sensitive placements must use a -final `MoveHeldObject` object-pose move followed by release-only `Place`. +safe high `MoveHeldObject` lift with orientation preserved before high-pose +orientation adjustment, then a final object-pose move followed by release-only +`Place`. 
""" @@ -701,7 +772,9 @@ def _make_dual_relative_basic_background( the second moved object while the first arm returns to its initial pose. Each arm must release its moved object before returning to its initial pose. Pose-sensitive placements must use a final `MoveHeldObject` object-pose move -followed by release-only `Place`. +safe high `MoveHeldObject` lift with orientation preserved before high-pose +orientation adjustment, then a final object-pose move followed by release-only +`Place`. """ @@ -711,12 +784,7 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: active_arm = f"{spec.active_side}_arm" inactive_arm = "right_arm" if spec.active_side == "left" else "left_arm" - high_spec = _format_relative_pose_spec( - active_arm, - spec, - pose_kind="high", - sample_interval=45, - ) + high_actions = _relative_high_action_patterns(active_arm, spec) release_actions = _relative_release_action_patterns(active_arm, spec) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Relative Placement @@ -727,8 +795,7 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: Use exactly these action patterns: - Pick up `{spec.moved_runtime_uid}`: {_format_pick_up_spec(active_arm, spec.moved_runtime_uid)} -- {_relative_pose_step_label(spec, "High staging")}: - {high_spec} +{high_actions} {release_actions} - Return to initial qpos: {_format_initial_qpos_spec(active_arm, sample_interval=30)} @@ -739,18 +806,8 @@ def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: first, second = spec.placements first_arm = f"{first.active_side}_arm" second_arm = f"{second.active_side}_arm" - first_high_spec = _format_relative_pose_spec( - first_arm, - first, - pose_kind="high", - sample_interval=45, - ) - second_high_spec = _format_relative_pose_spec( - second_arm, - second, - pose_kind="high", - sample_interval=45, - ) + first_high_actions = _relative_high_action_patterns(first_arm, first) + second_high_actions = 
_relative_high_action_patterns(second_arm, second) first_release_actions = _relative_release_action_patterns(first_arm, first) second_release_actions = _relative_release_action_patterns(second_arm, second) return f"""### Atomic Action Class JSON Specs for Dual-UR5 Dual-Arm Relative Placement @@ -764,11 +821,9 @@ def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: {_format_pick_up_spec(first_arm, first.moved_runtime_uid)} - Second arm pick-up: {_format_pick_up_spec(second_arm, second.moved_runtime_uid)} -- First high staging: - {first_high_spec} +{first_high_actions} {first_release_actions} -- Second high staging: - {second_high_spec} +{second_high_actions} {second_release_actions} - Keep a holding arm closed: {_format_gripper_spec("", "close", sample_interval=10)} @@ -1120,7 +1175,17 @@ def _format_relative_pose_spec( *, pose_kind: str, sample_interval: int, + orientation_goal: str | None = None, + orientation_axis: str | None = None, + align_to: str | None | object = _USE_PLACEMENT_ALIGN_TO, ) -> str: + resolved_orientation_goal = orientation_goal or placement.orientation_goal + resolved_orientation_axis = orientation_axis or placement.orientation_axis + resolved_align_to = ( + placement.orientation_align_to_runtime_uid + if align_to is _USE_PLACEMENT_ALIGN_TO + else align_to + ) if getattr(placement, "reference_is_initial_pose", False): position = ( placement.high_position @@ -1135,9 +1200,9 @@ def _format_relative_pose_spec( robot_name, position, sample_interval=sample_interval, - orientation_goal=placement.orientation_goal, - orientation_axis=placement.orientation_axis, - align_to=placement.orientation_align_to_runtime_uid, + orientation_goal=resolved_orientation_goal, + orientation_axis=resolved_orientation_axis, + align_to=resolved_align_to, ) offset = placement.high_offset if pose_kind == "high" else placement.release_offset @@ -1146,9 +1211,9 @@ def _format_relative_pose_spec( placement.reference_runtime_uid, offset, 
sample_interval=sample_interval, - orientation_goal=placement.orientation_goal, - orientation_axis=placement.orientation_axis, - align_to=placement.orientation_align_to_runtime_uid, + orientation_goal=resolved_orientation_goal, + orientation_axis=resolved_orientation_axis, + align_to=resolved_align_to, ) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py index 4aee046d..cab92f0d 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py @@ -27,6 +27,8 @@ from embodichain.gen_sim.action_agent_pipeline.generation.mesh_bounds import ( _clean_vector3, _iter_generated_scene_object_configs, + _mesh_config_local_zmin_after_rotation, + _mesh_config_world_zmax, _mesh_config_world_xy_extents, ) from embodichain.gen_sim.action_agent_pipeline.generation.relative_spec import ( @@ -35,12 +37,15 @@ ) __all__ = [ + "_POSE_SENSITIVE_STAGING_Z_DELTA", + "_STAGING_Z_DELTA", "_inside_container_axis_offsets", "_inside_container_slot_axis_and_distance", "_make_relative_summary", "_offset_position", "_relative_release_offset", "_side_relation_xy_offsets", + "_with_on_surface_release_offsets", "_with_inside_container_slot_offsets", "_with_self_relative_absolute_targets", ] @@ -53,7 +58,9 @@ _CONTAINER_SLOT_MAX_FRACTION = 0.40 _CONTAINER_SLOT_AXIS_TIE_RATIO = 0.10 _STAGING_Z_DELTA = 0.10 +_POSE_SENSITIVE_STAGING_Z_DELTA = 0.25 _ON_RELEASE_Z_OFFSET = 0.2 +_ON_SURFACE_RELEASE_CLEARANCE = 0.003 _ROBOT_VIEW_LEFT_WORLD_Y_SIGN = 1.0 _ROBOT_VIEW_FRONT_WORLD_X_SIGN = 1.0 @@ -264,6 +271,111 @@ def _replace_relative_spec_placements( ) +def _with_on_surface_release_offsets( + spec: _RelativePlacementSpec, + gym_config: Mapping[str, Any], +) -> _RelativePlacementSpec: + placements = tuple( + _with_on_surface_release_offset(placement, gym_config) + for placement in spec.placements + ) + 
return _replace_relative_spec_placements(spec, placements) + + +def _with_on_surface_release_offset( + placement: _RelativePlacementStepSpec, + gym_config: Mapping[str, Any], +) -> _RelativePlacementStepSpec: + if placement.relation != "on" or placement.reference_is_initial_pose: + return placement + + object_configs = { + str(obj.get("uid")): obj + for obj in _iter_generated_scene_object_configs(gym_config) + if obj.get("uid") is not None + } + reference_config = object_configs.get(placement.reference_runtime_uid) + moved_config = object_configs.get(placement.moved_runtime_uid) + if reference_config is None or moved_config is None: + return placement + + reference_top_z = _mesh_config_world_zmax(reference_config) + moved_bottom_offset = _target_local_zmin_for_orientation( + moved_config, + placement.orientation_goal, + ) + if reference_top_z is None or moved_bottom_offset is None: + return placement + + reference_origin_z = _clean_vector3(reference_config.get("init_pos", [0, 0, 0]))[2] + release_offset = list(placement.release_offset) + release_offset[2] = round( + float(reference_top_z) + - float(reference_origin_z) + + _ON_SURFACE_RELEASE_CLEARANCE + - float(moved_bottom_offset), + 6, + ) + high_offset = list(release_offset) + high_offset[2] = round( + release_offset[2] + + ( + _POSE_SENSITIVE_STAGING_Z_DELTA + if placement.orientation_goal != "preserve" + else _STAGING_Z_DELTA + ), + 6, + ) + return replace( + placement, + release_offset=release_offset, + high_offset=high_offset, + ) + + +def _target_local_zmin_for_orientation( + obj_config: Mapping[str, Any], + orientation_goal: str, +) -> float | None: + if orientation_goal in {"preserve", "axis_align"}: + return _mesh_config_local_zmin_after_rotation(obj_config) + if orientation_goal == "upright": + return 0.0 + if orientation_goal == "lay_flat": + return _lay_flat_local_zmin(obj_config) + return _mesh_config_local_zmin_after_rotation(obj_config) + + +def _lay_flat_local_zmin(obj_config: Mapping[str, Any]) 
-> float | None: + shape = obj_config.get("shape", {}) + if not isinstance(shape, Mapping): + return None + mesh_path = shape.get("fpath") + if not isinstance(mesh_path, str): + return None + + from pathlib import Path + + from embodichain.gen_sim.action_agent_pipeline.generation.mesh_bounds import ( + _load_mesh_vertices, + ) + + vertices = _load_mesh_vertices(Path(mesh_path).expanduser().resolve()) + if not vertices: + return None + scale = _clean_vector3(obj_config.get("body_scale", [1.0, 1.0, 1.0])) + extents = [ + ( + max(vertex[index] for vertex in vertices) + - min(vertex[index] for vertex in vertices) + ) + * scale[index] + for index in range(3) + ] + sorted_extents = sorted(float(extent) for extent in extents) + return -0.5 * sorted_extents[1] + + def _inside_container_slot_axis_and_distance( container_config: Mapping[str, Any] | None, ) -> tuple[str, float]: diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py index 079e59d7..b5834505 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py @@ -162,6 +162,7 @@ def _build_relative_placement_spec_with_llm( model: str | None, release_offset_fn: Callable[[str], Sequence[float]], staging_z_delta: float, + pose_sensitive_staging_z_delta: float, task_llm_caller: Callable[..., Mapping[str, Any]] | None = None, ) -> _RelativePlacementSpec: background_objects = [ @@ -202,6 +203,7 @@ def _build_relative_placement_spec_with_llm( task_description=task_description, release_offset_fn=release_offset_fn, staging_z_delta=staging_z_delta, + pose_sensitive_staging_z_delta=pose_sensitive_staging_z_delta, ) @@ -337,6 +339,7 @@ def _apply_relative_task_response( task_description: str, release_offset_fn: Callable[[str], Sequence[float]], staging_z_delta: float, + pose_sensitive_staging_z_delta: float, ) -> 
_RelativePlacementSpec: by_uid = {obj.source_uid: obj for obj in scene_objects} runtime_uids = _relative_scene_runtime_uid_mapping( @@ -363,6 +366,7 @@ def _apply_relative_task_response( forced_side=forced_side, release_offset_fn=release_offset_fn, staging_z_delta=staging_z_delta, + pose_sensitive_staging_z_delta=pose_sensitive_staging_z_delta, ) for entry, forced_side in zip(placement_entries, forced_arm_sides) ) @@ -470,6 +474,7 @@ def _build_relative_placement_step( forced_side: str | None, release_offset_fn: Callable[[str], Sequence[float]], staging_z_delta: float, + pose_sensitive_staging_z_delta: float, ) -> _RelativePlacementStepSpec: moved_source_uid = _resolve_rigid_source_uid( entry.get("moved_object"), @@ -517,7 +522,11 @@ def _build_relative_placement_step( release_offset = [float(value) for value in release_offset_fn(relation)] high_offset = list(release_offset) - high_offset[2] += float(staging_z_delta) + high_offset[2] += float( + pose_sensitive_staging_z_delta + if orientation_goal != "preserve" + else staging_z_delta + ) moved_position = _vector3( by_uid[moved_source_uid].config.get("init_pos", [0, 0, 0]) ) diff --git a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt index b068d85f..5d3887b3 100644 --- a/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt +++ b/embodichain/gen_sim/action_agent_pipeline/prompts/atom_actions.txt @@ -21,8 +21,10 @@ Use only these native atomic action classes: 2. `MoveHeldObject` - Use only after the same arm has successfully executed `PickUp`. - Moves the already-held object in the air without releasing it. - - For pose-sensitive placement, move the held object all the way to the final - release object pose before calling release-only `Place`. 
+ - For pose-sensitive placement, first move the held object to a safe high + staging pose with orientation_goal="preserve"; only then adjust orientation + at the same high pose, move down to the final release object pose, and call + release-only `Place`. - Required target_object_pose: {"reference": "object", "obj_name": "", "offset": [x, y, z], "orientation_goal": "preserve|upright|lay_flat|axis_align", "orientation_axis": "none|x|y|long_axis|short_axis"} {"reference": "absolute", "position": [x, y, z], "orientation_goal": "preserve|upright|lay_flat|axis_align", "orientation_axis": "none|x|y|long_axis|short_axis"} diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index b7ae49ac..6ed8e021 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -1109,14 +1109,27 @@ def fake_call_relative_task_llm(**kwargs): release_offset_json = json.dumps( summary["release_offset"], ensure_ascii=False, separators=(",", ":") ) + high_offset = list(summary["release_offset"]) + high_offset[2] = round(float(high_offset[2]) + 0.25, 6) + high_offset_json = json.dumps( + high_offset, ensure_ascii=False, separators=(",", ":") + ) assert ( - "Generate one deterministic nominal graph with exactly 6 nominal edges" + "Generate one deterministic nominal graph with exactly 7 nominal edges" in task_prompt ) for text in (task_prompt, atom_actions): assert '"atomic_action_class":"MoveHeldObject"' in text assert '"target_object_pose":{"reference":"object"' in text assert '"obj_name":"colored_pad"' in text + assert ( + f'"offset":{high_offset_json},"orientation_goal":"preserve",' + '"orientation_axis":"none"}' in text + ) + assert ( + f'"offset":{high_offset_json},"orientation_goal":"axis_align",' + '"orientation_axis":"long_axis","align_to":"colored_pad"' in text + ) assert 
f'"offset":{release_offset_json}' in text assert '"orientation_goal":"axis_align"' in text assert '"orientation_axis":"long_axis"' in text @@ -1170,7 +1183,12 @@ def fake_call_relative_task_llm(**kwargs): task_prompt = paths.task_prompt.read_text(encoding="utf-8") atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert ( + "Generate one deterministic nominal graph with exactly 7 nominal edges" + in task_prompt + ) for text in (task_prompt, atom_actions): + assert '"orientation_goal":"preserve","orientation_axis":"none"' in text assert '"orientation_goal":"upright"' in text assert '"orientation_axis":"none"' in text assert '"align_to"' not in text @@ -1184,6 +1202,90 @@ def fake_call_relative_task_llm(**kwargs): assert paths.summary["orientation_align_to"] is None +def test_relative_on_table_release_offset_uses_tabletop_surface( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "43_Shake Bottle_gym_project" + table_vertices = [ + (-0.5, -0.4, 0.0), + (0.5, -0.4, 0.0), + (0.0, 0.4, 0.0), + (-0.5, -0.4, 0.36), + (0.5, -0.4, 0.36), + (0.0, 0.4, 0.36), + ] + bottle_vertices = [ + (-0.02, 0.0, 0.0), + (0.02, 0.0, 0.0), + (0.0, 0.02, 0.16), + ] + _write_minimal_glb(project_dir / "mesh_assets/table/table_0.glb", table_vertices) + _write_minimal_glb( + project_dir / "mesh_assets/bottle/bottle_1.glb", + bottle_vertices, + ) + gym_config = { + "id": "Image2Tabletop-43-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, -0.02], + [0.0, 0.0, 0.0], + ) + ], + "rigid_object": [ + _mesh_object( + "interact_bottle_1", + "mesh_assets/bottle/bottle_1.glb", + [0.05, 0.05, 0.36], + [90.0, 0.0, 0.0], + ) + ], + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "interact_bottle_1", + "reference_object": "table", + "goal_relation": "on", + "arm": "left", 
+ "orientation_goal": "upright", + "orientation_reference": "none", + "task_prompt_summary": "Use the left arm to stand the bottle on table.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_bottle_on_table_agent", + task_name="Demo43", + task_description="用左臂把瓶子扶正放到桌面上", + target_body_scale=0.8, + prewarm_coacd_cache=False, + ) + + summary = _stable_summary(paths.summary) + expected_release_z = 0.36 + 0.003 + assert summary["release_offset"][2] == pytest.approx(expected_release_z) + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + release_offset_json = json.dumps( + summary["release_offset"], ensure_ascii=False, separators=(",", ":") + ) + assert f'"offset":{release_offset_json},"orientation_goal":"upright"' in task_prompt + assert '"offset":[0.0,0.0,0.2]' not in task_prompt + + def test_relative_orientation_rejects_invalid_enum( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, From 6ca279b81ebb1ad4b4c24b39dac652fdf52b931b Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sat, 27 Jun 2026 12:52:01 +0800 Subject: [PATCH 26/33] Use object-pose release for on placements --- .../generation/prompt_builders.py | 101 +++++++++++++-- .../test_ur5_basket_config_generation.py | 120 ++++++++++++++++++ 2 files changed, 211 insertions(+), 10 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 7bd93667..cbbf57e9 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -299,6 +299,7 @@ def make_relative_task_prompt( sample_interval=45, ) pose_sensitive = _is_pose_sensitive_placement(spec) + object_pose_release = 
_uses_object_pose_release(spec) if pose_sensitive: safe_high_spec = _format_relative_pose_spec( active_arm, @@ -355,6 +356,40 @@ def make_relative_task_prompt( "pose, move down to the final release object pose, and use the exact " "relative-zero release-only `Place` spec shown below." ) + elif object_pose_release: + release_move_spec = _format_relative_pose_spec( + active_arm, + spec, + pose_kind="release", + sample_interval=45, + ) + place_spec = _format_release_only_place_spec(active_arm) + retreat_spec = _format_empty_hand_retreat_spec(active_arm) + edge_count = 6 + high_instruction = f"""2. Move the held object to the {high_step_label} pose: + - {active_slot}: {high_spec} + - {inactive_slot}: null + +3. Move the held object down to the {release_step_label} object pose: + - {active_slot}: {release_move_spec} + - {inactive_slot}: null + +4. Release the held object in-place without moving the object pose: + - {active_slot}: {place_spec} + - {inactive_slot}: null + +5. Retreat the now-empty end-effector upward: + - {active_slot}: {retreat_spec} + - {inactive_slot}: null + +6. Return the active arm to its initial pose: + - {active_slot}: {initial_spec} + - {inactive_slot}: null""" + release_rule = ( + "For this support-surface `on` placement, use `MoveHeldObject` for " + "the final release object pose, then use the exact relative-zero " + "release-only `Place` spec shown below." 
+ ) else: place_spec = _format_relative_place_spec( active_arm, @@ -620,6 +655,37 @@ def _dual_relative_release_edge_blocks( }, ), ] + if _uses_object_pose_release(placement): + return [ + ( + f"Move `{placement.moved_runtime_uid}` down to the final " + "release object pose", + { + active_slot: _format_relative_pose_spec( + active_arm, + placement, + pose_kind="release", + sample_interval=45, + ), + waiting_slot: waiting_value, + }, + ), + ( + f"Release `{placement.moved_runtime_uid}` in-place without moving " + "the object pose", + { + active_slot: _format_release_only_place_spec(active_arm), + waiting_slot: waiting_value, + }, + ), + ( + f"Retreat `{active_arm}` upward after release", + { + active_slot: _format_empty_hand_retreat_spec(active_arm), + waiting_slot: waiting_value, + }, + ), + ] return [ ( @@ -645,7 +711,17 @@ def _dual_relative_release_rule(spec: _RelativeSpecLike) -> str: "orientation at the same high pose before moving down to the final " "release object pose. The following `Place` must be the exact " "relative-zero release-only spec shown below, and then the empty " - "hand retreats upward. For preserve placements, keep the normal " + "hand retreats upward. Support-surface `on` placements must also " + "use final object-pose `MoveHeldObject` plus relative-zero " + "release-only `Place`, even when orientation is preserved. Other " + "preserve placements keep the normal `Place` release-place action." + ) + if any(_uses_object_pose_release(placement) for placement in spec.placements): + return ( + "For support-surface `on` placements, use `MoveHeldObject` for the " + "final release object pose. The following `Place` must be the exact " + "relative-zero release-only spec shown below, and then the empty " + "hand retreats upward. Other preserve placements keep the normal " "`Place` release-place action." 
) return ( @@ -671,7 +747,7 @@ def _relative_release_action_patterns( robot_name: str, placement: _RelativePlacementLike, ) -> str: - if _is_pose_sensitive_placement(placement): + if _uses_object_pose_release(placement): return f"""- Final release object pose: {_format_relative_pose_spec(robot_name, placement, pose_kind="release", sample_interval=45)} - Release-only Place: @@ -730,10 +806,11 @@ def make_relative_basic_background( The execution-stage LLM should generate graph JSON that grasps the moved object, moves it to the configured high staging pose, releases it at the final pose, and -returns the active arm to its initial pose. Pose-sensitive placements must use a -safe high `MoveHeldObject` lift with orientation preserved before high-pose -orientation adjustment, then a final object-pose move followed by release-only -`Place`. +returns the active arm to its initial pose. Support-surface `on` placements and +pose-sensitive placements must use a final `MoveHeldObject` object-pose move +followed by release-only `Place`. Pose-sensitive placements must additionally +use a safe high `MoveHeldObject` lift with orientation preserved before +high-pose orientation adjustment. """ @@ -771,10 +848,10 @@ def _make_dual_relative_basic_background( objects, stages and releases the first moved object, then stages and releases the second moved object while the first arm returns to its initial pose. Each arm must release its moved object before returning to its initial pose. -Pose-sensitive placements must use a final `MoveHeldObject` object-pose move -safe high `MoveHeldObject` lift with orientation preserved before high-pose -orientation adjustment, then a final object-pose move followed by release-only -`Place`. +Support-surface `on` placements and pose-sensitive placements must use a final +`MoveHeldObject` object-pose move followed by release-only `Place`. 
+Pose-sensitive placements must additionally use a safe high `MoveHeldObject` +lift with orientation preserved before high-pose orientation adjustment. """ @@ -1247,6 +1324,10 @@ def _is_pose_sensitive_placement(placement: _RelativePlacementLike) -> bool: return placement.orientation_goal != "preserve" +def _uses_object_pose_release(placement: _RelativePlacementLike) -> bool: + return _is_pose_sensitive_placement(placement) or placement.relation == "on" + + def _format_release_only_place_spec(robot_name: str) -> str: return _format_place_spec( robot_name, diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 6ed8e021..d1bb0119 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -1286,6 +1286,126 @@ def fake_call_relative_task_llm(**kwargs): assert '"offset":[0.0,0.0,0.2]' not in task_prompt +def test_relative_on_preserve_uses_object_pose_release( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "2_Beat Block Hammer_gym_project" + cube_vertices = [ + (-0.03, -0.03, 0.0), + (0.03, -0.03, 0.0), + (0.03, 0.03, 0.0), + (-0.03, -0.03, 0.06), + (0.03, -0.03, 0.06), + (0.03, 0.03, 0.06), + ] + hammer_vertices = [ + (-0.08, -0.01, -0.01), + (0.08, -0.01, -0.01), + (0.08, 0.01, -0.01), + (-0.08, -0.01, 0.02), + (0.08, -0.01, 0.02), + (0.08, 0.01, 0.02), + ] + _write_minimal_glb(project_dir / "mesh_assets/table/table_0.glb", cube_vertices) + _write_minimal_glb(project_dir / "mesh_assets/cube/cube_1.glb", cube_vertices) + _write_minimal_glb( + project_dir / "mesh_assets/hammer/hammer_1.glb", + hammer_vertices, + ) + gym_config = { + "id": "Image2Tabletop-2-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.30], + [0.0, 0.0, 0.0], + ), + _mesh_object( + "cube_1", 
+ "mesh_assets/cube/cube_1.glb", + [0.0, 0.05, 0.40], + [0.0, 0.0, 0.0], + ), + ], + "rigid_object": [ + _mesh_object( + "hammer_1", + "mesh_assets/hammer/hammer_1.glb", + [0.1, -0.05, 0.40], + [0.0, 0.0, 0.0], + ) + ], + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + def fake_call_relative_task_llm(**kwargs): + return { + "moved_object": "hammer_1", + "reference_object": "cube_1", + "goal_relation": "on", + "arm": "auto", + "task_prompt_summary": "Place the hammer on the cube.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_hammer_on_cube_agent", + task_name="Demo02", + task_description="桌上有一把锤子和一个方块,用机械臂抓住锤子放到方块上", + target_body_scale=0.8, + prewarm_coacd_cache=False, + ) + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + summary = _stable_summary(paths.summary) + active_arm = summary["active_arm"] + moved_object = summary["moved_object"] + reference_object = summary["reference_object"] + release_offset_json = json.dumps( + summary["release_offset"], ensure_ascii=False, separators=(",", ":") + ) + + assert ( + "Generate one deterministic nominal graph with exactly 6 nominal edges" + in task_prompt + ) + for text in (task_prompt, atom_actions): + assert "Place at the release pose" not in text + assert ( + f'"atomic_action_class":"MoveHeldObject","robot_name":"{active_arm}",' + '"control":"arm","target_object_pose":{"reference":"object",' + f'"obj_name":"{reference_object}","offset":{release_offset_json},' + '"orientation_goal":"preserve","orientation_axis":"none"}' in text + ) + assert ( + f'"atomic_action_class":"Place","robot_name":"{active_arm}",' + '"control":"arm","target_pose":{"reference":"relative",' + 
'"offset":[0.0,0.0,0.0],"frame":"world"},' + '"cfg":{"sample_interval":10,"lift_height":0.0}' in text + ) + assert ( + f'"atomic_action_class":"MoveEndEffector","robot_name":"{active_arm}",' + '"control":"arm","target_pose":{"reference":"relative",' + '"offset":[0.0,0.0,0.1],"frame":"world"}' in text + ) + + assert summary["relation"] == "on" + assert moved_object == "hammer" + assert reference_object == "cube" + + def test_relative_orientation_rejects_invalid_enum( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, From ed4ad8baaca401bad324ce496c34b0b67c53c235 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sat, 27 Jun 2026 14:59:23 +0800 Subject: [PATCH 27/33] improve arrangement_spec --- .../generation/action_agent_config.py | 8 + .../generation/arrangement_spec.py | 217 +++++++++++++- .../generation/config_types.py | 5 + .../generation/prompt_builders.py | 87 ++++-- .../generation/success_specs.py | 3 +- .../test_ur5_basket_config_generation.py | 266 +++++++++++++++++- 6 files changed, 553 insertions(+), 33 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py index c20b1273..f9ec5ea2 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py @@ -567,6 +567,12 @@ def _make_arrangement_summary(spec: _ArrangementLineSpec) -> dict[str, Any]: "anchor": spec.anchor, "order_by": spec.order_by, "order_direction": spec.order_direction, + "line_origin_xy": [ + float(spec.line_origin_xy[0]), + float(spec.line_origin_xy[1]), + ], + "spacing": float(spec.spacing), + "layout_clearance": float(spec.layout_clearance), "placements": [ { "object": step.runtime_uid, @@ -574,6 +580,8 @@ def _make_arrangement_summary(spec: _ArrangementLineSpec) -> dict[str, Any]: "slot_index": step.slot_index, 
"active_arm": f"{step.active_side}_arm", "target_xy": [float(step.target_xy[0]), float(step.target_xy[1])], + "orientation_goal": step.orientation_goal, + "orientation_axis": step.orientation_axis, } for step in spec.steps ], diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py index cd38d7fb..fc00ccdf 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py @@ -69,11 +69,13 @@ "排成", "一行", ) -_DEFAULT_RELEASE_Z = 0.12 +_DEFAULT_RELEASE_Z = 0.01 _DEFAULT_STAGING_Z_DELTA = 0.10 -_SLOT_MARGIN = 0.01 +_SLOT_MARGIN = 0.025 _MIN_SLOT_SPACING = 0.07 -_MAX_SLOT_SPACING = 0.12 +_LAYOUT_CLEARANCE = 0.025 +_ROW_SEARCH_STEP = 0.025 +_ROW_SEARCH_RADIUS = 0.25 _SUPPORTED_ORDER_BY = {"size", "color", "explicit"} _SUPPORTED_ORDER_DIRECTIONS = {"ascending", "descending", "given"} _SUPPORTED_AXES = {"left_to_right"} @@ -269,12 +271,17 @@ def _apply_arrangement_task_response( [rigid_by_uid[uid] for uid in object_source_uids], scene_dir=scene_dir, ) - slots = _arrangement_line_slot_positions( + slots, line_origin_xy = _arrangement_collision_aware_line_slots( anchor_xy=anchor_xy, + table_obj=table_obj, + objects=[rigid_by_uid[uid] for uid in object_source_uids], count=len(ordered_source_uids), spacing=spacing, line_axis=axis, + scene_dir=scene_dir, + clearance=_LAYOUT_CLEARANCE, ) + orientation_axis = _arrangement_orientation_axis(axis) steps = [] for slot_index, (source_uid, target_xy) in enumerate( @@ -305,6 +312,8 @@ def _apply_arrangement_task_response( high_position=high_position, size_score=_arrangement_object_size_score(obj, scene_dir=scene_dir), color=_object_color(source_uid, object_attributes), + orientation_goal="axis_align", + orientation_axis=orientation_axis, ) ) @@ -323,6 +332,9 @@ def _apply_arrangement_task_response( axis=axis, anchor=anchor, 
steps=tuple(steps), + line_origin_xy=line_origin_xy, + spacing=spacing, + layout_clearance=_LAYOUT_CLEARANCE, ) @@ -353,6 +365,202 @@ def _arrangement_line_slot_positions( return slots +def _arrangement_collision_aware_line_slots( + *, + anchor_xy: Sequence[float], + table_obj: _SceneObject, + objects: Sequence[_SceneObject], + count: int, + spacing: float, + line_axis: str, + scene_dir: Path, + clearance: float, +) -> tuple[list[list[float]], list[float]]: + axis = _normalize_axis(line_axis) + if axis != "left_to_right": + raise ValueError(f"Unsupported arrangement line axis: {line_axis!r}.") + if count != len(objects): + raise ValueError("Arrangement slot count must match object count.") + + table_bounds = _source_object_xy_bounds(table_obj, scene_dir=scene_dir) + if table_bounds is None: + raise ValueError("Arrangement requires table mesh XY bounds for safe layout.") + table_min, table_max = table_bounds + object_footprints = [ + _arrangement_object_footprint(obj, scene_dir=scene_dir) for obj in objects + ] + max_half_extent = max(footprint.half_extent for footprint in object_footprints) + init_bounds = [footprint.xy_bounds for footprint in object_footprints] + + for x_offset in _row_search_offsets(_ROW_SEARCH_RADIUS, _ROW_SEARCH_STEP): + origin = [ + round(float(anchor_xy[0]) + x_offset, 6), + round(float(anchor_xy[1]), 6), + ] + slots = _arrangement_line_slot_positions( + anchor_xy=origin, + count=count, + spacing=spacing, + line_axis=axis, + ) + slot_bounds = [ + _slot_xy_bounds(slot, max_half_extent=max_half_extent) for slot in slots + ] + if not _slot_bounds_within_table( + slot_bounds, + table_min=table_min, + table_max=table_max, + clearance=clearance, + ): + continue + if any( + _xy_bounds_overlap(slot_bound, init_bound, clearance=clearance) + for slot_bound in slot_bounds + for init_bound in init_bounds + ): + continue + return slots, origin + + raise ValueError( + "Unable to generate a collision-free one-line arrangement near the table " + "center. 
The selected objects may be too many, too large, or already " + "occupying all candidate row positions; use a larger table or add parking " + "slot planning." + ) + + +def _row_search_offsets(radius: float, step: float) -> list[float]: + offsets = [0.0] + steps = int(float(radius) / float(step)) + for index in range(1, steps + 1): + value = round(float(index) * float(step), 6) + offsets.extend([value, -value]) + return offsets + + +class _ArrangementFootprint: + def __init__( + self, + *, + xy_bounds: tuple[list[float], list[float]], + half_extent: float, + ) -> None: + self.xy_bounds = xy_bounds + self.half_extent = half_extent + + +def _arrangement_object_footprint( + obj: _SceneObject, + *, + scene_dir: Path, +) -> _ArrangementFootprint: + bounds = _source_object_xy_bounds(obj, scene_dir=scene_dir) + if bounds is None: + position = _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])) + half_extent = _MIN_SLOT_SPACING / 2.0 + bounds = ( + [position[0] - half_extent, position[1] - half_extent], + [position[0] + half_extent, position[1] + half_extent], + ) + mins, maxs = bounds + half_extent = max( + (float(maxs[0]) - float(mins[0])) / 2.0, + (float(maxs[1]) - float(mins[1])) / 2.0, + _MIN_SLOT_SPACING / 2.0, + ) + return _ArrangementFootprint(xy_bounds=bounds, half_extent=half_extent) + + +def _source_object_xy_bounds( + obj: _SceneObject, + *, + scene_dir: Path, +) -> tuple[list[float], list[float]] | None: + config = _resolved_mesh_config(obj, scene_dir=scene_dir) + return _mesh_config_world_xy_bounds(config) + + +def _mesh_config_world_xy_bounds( + obj_config: Mapping[str, Any], +) -> tuple[list[float], list[float]] | None: + shape = obj_config.get("shape", {}) + if not isinstance(shape, Mapping): + return None + mesh_path = shape.get("fpath") + if not isinstance(mesh_path, str): + return None + from embodichain.gen_sim.action_agent_pipeline.generation.mesh_bounds import ( + _load_mesh_vertices, + _mesh_config_transform_matrix, + _transform_point, + ) + 
+ vertices = _load_mesh_vertices(Path(mesh_path).expanduser().resolve()) + if not vertices: + return None + matrix = _mesh_config_transform_matrix(obj_config) + transformed_vertices = [_transform_point(matrix, vertex) for vertex in vertices] + x_values = [vertex[0] for vertex in transformed_vertices] + y_values = [vertex[1] for vertex in transformed_vertices] + return ( + [min(x_values), min(y_values)], + [max(x_values), max(y_values)], + ) + + +def _slot_xy_bounds( + slot: Sequence[float], + *, + max_half_extent: float, +) -> tuple[list[float], list[float]]: + return ( + [float(slot[0]) - max_half_extent, float(slot[1]) - max_half_extent], + [float(slot[0]) + max_half_extent, float(slot[1]) + max_half_extent], + ) + + +def _slot_bounds_within_table( + slot_bounds: Sequence[tuple[list[float], list[float]]], + *, + table_min: Sequence[float], + table_max: Sequence[float], + clearance: float, +) -> bool: + for mins, maxs in slot_bounds: + if mins[0] < float(table_min[0]) + clearance: + return False + if maxs[0] > float(table_max[0]) - clearance: + return False + if mins[1] < float(table_min[1]) + clearance: + return False + if maxs[1] > float(table_max[1]) - clearance: + return False + return True + + +def _xy_bounds_overlap( + first: tuple[list[float], list[float]], + second: tuple[list[float], list[float]], + *, + clearance: float, +) -> bool: + first_min, first_max = first + second_min, second_max = second + return not ( + first_max[0] + clearance <= second_min[0] + or second_max[0] + clearance <= first_min[0] + or first_max[1] + clearance <= second_min[1] + or second_max[1] + clearance <= first_min[1] + ) + + +def _arrangement_orientation_axis(line_axis: str) -> str: + axis = _normalize_axis(line_axis) + if axis == "left_to_right": + return "y" + raise ValueError(f"Unsupported arrangement line axis: {line_axis!r}.") + + def _with_arrangement_generated_z_targets( spec: _ArrangementLineSpec, gym_config: Mapping[str, Any], @@ -626,7 +834,6 @@ def 
_arrangement_spacing( for obj in objects ) spacing = max(max_extent + _SLOT_MARGIN, _MIN_SLOT_SPACING) - spacing = min(spacing, _MAX_SLOT_SPACING) return round(float(spacing), 6) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py index a3db87d5..238b39f4 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py @@ -140,6 +140,8 @@ class _ArrangementLineStepSpec: high_position: list[float] size_score: float | None = None color: str | None = None + orientation_goal: str = "axis_align" + orientation_axis: str = "y" @dataclass(frozen=True) @@ -153,3 +155,6 @@ class _ArrangementLineSpec: axis: str anchor: str steps: tuple[_ArrangementLineStepSpec, ...] + line_origin_xy: list[float] + spacing: float + layout_clearance: float diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index cbbf57e9..7fef00b6 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -101,6 +101,8 @@ class _ArrangementStepLike(Protocol): high_position: Sequence[float] size_score: float | None color: str | None + orientation_goal: str + orientation_axis: str class _ArrangementSpecLike(Protocol): @@ -111,6 +113,9 @@ class _ArrangementSpecLike(Protocol): order_direction: str axis: str anchor: str + line_origin_xy: Sequence[float] + spacing: float + layout_clearance: float steps: Sequence[_ArrangementStepLike] @@ -144,7 +149,7 @@ def make_arrangement_task_prompt( project_name: str, spec: _ArrangementSpecLike, ) -> str: - edge_count = len(spec.steps) * 4 + edge_count = len(spec.steps) * 6 step_blocks = "\n\n".join( _arrangement_step_prompt_block(index, step) for index, step in 
enumerate(spec.steps, start=1) @@ -165,20 +170,27 @@ def make_arrangement_task_prompt( - Layout axis: `{spec.axis}`. Slot 0 is the robot-view leftmost slot, and later slots move monotonically toward robot-view right. - Anchor: `{spec.anchor}` in the exported {project_name} environment. +- Collision-aware line origin xy: `{list(spec.line_origin_xy)}`. +- Slot spacing: `{float(spec.spacing):.6g}` with clearance `{float(spec.layout_clearance):.6g}`. - Ordering rule: `{spec.order_by}` with direction `{spec.order_direction}`. - Final order: {final_order}. Generate one deterministic nominal graph with exactly {edge_count} nominal edges. Use only the atomic action class JSON specs shown below. Do not add recovery, -monitor, search, alignment, or extra lift edges. Use `Place` for each -release-place step so lowering, gripper opening, and upward retreat remain one -atomic action. The arm not listed for a step must remain null. +monitor, search, alignment, or extra lift edges. The absolute target object +poses are collision-aware slots computed by the config-stage generator; do not +rewrite them. First move each held object to the high staging pose with +orientation preserved, then use `MoveHeldObject` at the same high pose to align +its principal axis to the configured world axis, then move down to the final +release object pose. Use the exact relative-zero release-only `Place` spec +shown below. The arm not listed for a step must remain null. {step_blocks} Final state: all listed objects must rest near their assigned absolute XY slots -and remain upright. Use the exact absolute target_pose JSON specs shown above; -do not rewrite slot placement as object-referenced poses. +with their principal axes aligned to the configured arrangement axis. Use the +exact absolute target_object_pose JSON specs shown above; do not rewrite slot +placement as object-referenced poses. 
""" @@ -186,20 +198,49 @@ def _arrangement_step_prompt_block(index: int, step: _ArrangementStepLike) -> st active_arm = f"{step.active_side}_arm" active_slot = f"{step.active_side}_arm_action" inactive_slot = f"{'right' if step.active_side == 'left' else 'left'}_arm_action" - base_edge = (index - 1) * 4 + base_edge = (index - 1) * 6 + high_preserve_spec = _format_pose_absolute_spec( + active_arm, + step.high_position, + sample_interval=45, + orientation_goal="preserve", + orientation_axis="none", + ) + high_align_spec = _format_pose_absolute_spec( + active_arm, + step.high_position, + sample_interval=45, + orientation_goal=step.orientation_goal, + orientation_axis=step.orientation_axis, + ) + release_move_spec = _format_pose_absolute_spec( + active_arm, + step.release_position, + sample_interval=45, + orientation_goal=step.orientation_goal, + orientation_axis=step.orientation_axis, + ) return f"""{base_edge + 1}. Pick up `{step.runtime_uid}` for slot {step.slot_index}: - {active_slot}: {_format_pick_up_spec(active_arm, step.runtime_uid)} - {inactive_slot}: null -{base_edge + 2}. Move `{step.runtime_uid}` to the high staging pose above slot {step.slot_index}: - - {active_slot}: {_format_pose_absolute_spec(active_arm, step.high_position, sample_interval=45)} +{base_edge + 2}. Move `{step.runtime_uid}` to the high staging pose above slot {step.slot_index} without changing orientation: + - {active_slot}: {high_preserve_spec} - {inactive_slot}: null -{base_edge + 3}. Place `{step.runtime_uid}` at slot {step.slot_index}: - - {active_slot}: {_format_place_absolute_spec(active_arm, step.release_position, sample_interval=80, lift_height=_PLACE_LIFT_HEIGHT)} +{base_edge + 3}. Align `{step.runtime_uid}` at the high staging pose to the configured arrangement axis: + - {active_slot}: {high_align_spec} + - {inactive_slot}: null + +{base_edge + 4}. 
Move `{step.runtime_uid}` down to the final release object pose at slot {step.slot_index}: + - {active_slot}: {release_move_spec} - {inactive_slot}: null -{base_edge + 4}. Return `{active_arm}` to its initial pose: +{base_edge + 5}. Release `{step.runtime_uid}` in-place without moving the object pose: + - {active_slot}: {_format_release_only_place_spec(active_arm)} + - {inactive_slot}: null + +{base_edge + 6}. Return `{active_arm}` to its initial pose: - {active_slot}: {_format_initial_qpos_spec(active_arm, sample_interval=30)} - {inactive_slot}: null""" @@ -230,6 +271,11 @@ def make_arrangement_basic_background( Config-stage LLM notes: {notes} + +The execution-stage LLM should preserve each object's initial orientation while +lifting to the high staging pose, align the held object to the configured +arrangement world axis at that safe height, move down to the final object pose, +release in place, and then return the arm to its initial pose. """ @@ -252,8 +298,9 @@ def make_arrangement_atom_actions_prompt(spec: _ArrangementSpecLike) -> str: return f"""### Atomic Action Class JSON Specs for Dual-UR5 Line Arrangement Use only the native atomic action class JSON specs shown below. Each object is -moved to an absolute slot pose computed by the -config-stage generator. Keep the non-active arm null for each listed object. +moved to an absolute collision-aware slot pose computed by the config-stage +generator. Align at the high pose before moving down to the final object pose. +Keep the non-active arm null for each listed object. 
{blocks} """ @@ -264,10 +311,14 @@ def _arrangement_atom_action_block(step: _ArrangementStepLike) -> str: return f"""Object `{step.runtime_uid}` to slot {step.slot_index}: - Pick up: {_format_pick_up_spec(active_arm, step.runtime_uid)} -- High staging: - {_format_pose_absolute_spec(active_arm, step.high_position, sample_interval=45)} -- Place: - {_format_place_absolute_spec(active_arm, step.release_position, sample_interval=80, lift_height=_PLACE_LIFT_HEIGHT)} +- High staging without orientation change: + {_format_pose_absolute_spec(active_arm, step.high_position, sample_interval=45, orientation_goal="preserve", orientation_axis="none")} +- High staging axis alignment: + {_format_pose_absolute_spec(active_arm, step.high_position, sample_interval=45, orientation_goal=step.orientation_goal, orientation_axis=step.orientation_axis)} +- Final release object pose: + {_format_pose_absolute_spec(active_arm, step.release_position, sample_interval=45, orientation_goal=step.orientation_goal, orientation_axis=step.orientation_axis)} +- Release-only Place: + {_format_release_only_place_spec(active_arm)} - Return: {_format_initial_qpos_spec(active_arm, sample_interval=30)}""" diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py index a8a6a365..c3905ef3 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py @@ -122,6 +122,7 @@ def _make_arrangement_extensions_config(spec: _ArrangementLineSpec) -> dict[str, def _make_arrangement_success_spec(spec: _ArrangementLineSpec) -> dict[str, Any]: terms: list[dict[str, Any]] = [] + xy_tolerance = min(0.03, float(spec.spacing) * 0.35) for step in spec.steps: terms.extend( [ @@ -129,7 +130,7 @@ def _make_arrangement_success_spec(spec: _ArrangementLineSpec) -> dict[str, Any] "type": "object_xy_near", "object": step.runtime_uid, "target_xy": 
[float(step.target_xy[0]), float(step.target_xy[1])], - "tolerance": 0.05, + "tolerance": xy_tolerance, }, { "type": "object_not_fallen", diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index d1bb0119..e2ce2d9e 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -1720,6 +1720,16 @@ def fake_call_relative_task_llm(**kwargs): def test_arrangement_response_orders_explicit_color_sequence(tmp_path: Path) -> None: + _write_minimal_glb( + tmp_path / "mesh_assets/table/table_0.glb", + [(-0.60, -0.40, 0.0), (0.60, -0.40, 0.0), (0.0, 0.40, 0.0)], + ) + for uid in ("cube_red", "cube_blue", "cube_green"): + _write_minimal_glb( + tmp_path / f"mesh_assets/cube/{uid}.glb", + [(-0.02, -0.02, 0.0), (0.02, -0.02, 0.0), (0.0, 0.02, 0.04)], + ) + scene_objects = [ action_agent_config_generation._SceneObject( source_uid="table", @@ -1793,6 +1803,12 @@ def test_arrangement_response_orders_explicit_color_sequence(tmp_path: Path) -> assert [step.target_xy[1] for step in spec.steps] == sorted( step.target_xy[1] for step in spec.steps ) + assert [step.orientation_goal for step in spec.steps] == [ + "axis_align", + "axis_align", + "axis_align", + ] + assert [step.orientation_axis for step in spec.steps] == ["y", "y", "y"] def test_arrangement_line_slot_positions_are_centered_left_to_right() -> None: @@ -1863,30 +1879,53 @@ def fake_call_arrangement_task_llm(**kwargs): "anchor": "table_center", "order_by": "size", "order_direction": "descending", + "line_origin_xy": paths.summary["line_origin_xy"], + "spacing": paths.summary["spacing"], + "layout_clearance": paths.summary["layout_clearance"], "placements": [ { "object": "cube_2", "source_uid": "cube_2", "slot_index": 0, "active_arm": "right_arm", - "target_xy": [0.0, -0.07], + "target_xy": 
paths.summary["placements"][0]["target_xy"], + "orientation_goal": "axis_align", + "orientation_axis": "y", }, { "object": "cube_1", "source_uid": "cube_1", "slot_index": 1, "active_arm": "left_arm", - "target_xy": [0.0, 0.0], + "target_xy": paths.summary["placements"][1]["target_xy"], + "orientation_goal": "axis_align", + "orientation_axis": "y", }, { "object": "cube_3", "source_uid": "cube_3", "slot_index": 2, "active_arm": "left_arm", - "target_xy": [0.0, 0.07], + "target_xy": paths.summary["placements"][2]["target_xy"], + "orientation_goal": "axis_align", + "orientation_axis": "y", }, ], } + target_x_values = [ + placement["target_xy"][0] for placement in paths.summary["placements"] + ] + target_y_values = [ + placement["target_xy"][1] for placement in paths.summary["placements"] + ] + assert len({round(value, 6) for value in target_x_values}) == 1 + assert target_y_values == sorted(target_y_values) + assert paths.summary["spacing"] >= 0.07 + assert paths.summary["layout_clearance"] == pytest.approx(0.025) + _assert_arrangement_slots_avoid_initial_objects( + paths.summary, + gym_config, + ) success = gym_config["env"]["extensions"]["agent_success"] assert success["op"] == "all" @@ -1895,24 +1934,129 @@ def fake_call_arrangement_task_llm(**kwargs): for term in success["terms"] if term["type"] == "object_xy_near" } - assert xy_targets == { - ("cube_2", (0.0, -0.07)), - ("cube_1", (0.0, 0.0)), - ("cube_3", (0.0, 0.07)), + expected_xy_targets = { + (placement["object"], tuple(placement["target_xy"])) + for placement in paths.summary["placements"] } + assert xy_targets == expected_xy_targets + expected_xy_tolerance = min(0.03, paths.summary["spacing"] * 0.35) + xy_tolerances = { + term["tolerance"] + for term in success["terms"] + if term["type"] == "object_xy_near" + } + assert len(xy_tolerances) == 1 + assert next(iter(xy_tolerances)) == pytest.approx(expected_xy_tolerance) task_prompt = paths.task_prompt.read_text(encoding="utf-8") atom_actions = 
paths.atom_actions.read_text(encoding="utf-8") - assert "Generate one deterministic nominal graph with exactly 12 nominal edges" in ( + assert "Generate one deterministic nominal graph with exactly 18 nominal edges" in ( task_prompt ) assert task_prompt.count('"atomic_action_class":"PickUp"') == 3 assert task_prompt.count('"atomic_action_class":"Place"') == 3 - assert task_prompt.count('"reference":"absolute"') >= 6 + assert task_prompt.count('"reference":"absolute"') >= 9 + assert task_prompt.count('"orientation_goal":"axis_align"') == 6 + assert task_prompt.count('"orientation_axis":"y"') == 6 + assert task_prompt.count('"orientation_goal":"preserve"') == 3 + assert task_prompt.count('"target_pose":{"reference":"relative"') == 3 + assert "Collision-aware line origin xy" in task_prompt assert atom_actions.count('"atomic_action_class":"PickUp"') == 3 + assert atom_actions.count('"orientation_goal":"axis_align"') == 6 + assert atom_actions.count('"orientation_axis":"y"') == 6 assert atom_actions.count('"atomic_action_class":"Place"') == 3 +def test_arrangement_collision_aware_layout_scales_to_six_objects( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "six_blocks_gym_project" + _write_arrangement_project_with_count(project_dir, count=6, cube_size=0.035) + + def fake_call_arrangement_task_llm(**kwargs): + return { + "objects": [f"cube_{index}" for index in range(1, 7)], + "order_by": "explicit", + "order_direction": "given", + "anchor": "table_center", + "line_axis": "left_to_right", + "task_prompt_summary": "Arrange six cubes left to right.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_arrangement_task_llm", + fake_call_arrangement_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_six_arrangement_agent", + task_description="把六个方块从左到右排成一行", + prewarm_coacd_cache=False, + ) + + gym_config = 
json.loads(paths.gym_config.read_text(encoding="utf-8")) + summary = paths.summary + assert len(summary["placements"]) == 6 + assert summary["spacing"] >= 0.07 + assert summary["layout_clearance"] == pytest.approx(0.025) + assert all( + placement["orientation_goal"] == "axis_align" + and placement["orientation_axis"] == "y" + for placement in summary["placements"] + ) + x_values = [placement["target_xy"][0] for placement in summary["placements"]] + y_values = [placement["target_xy"][1] for placement in summary["placements"]] + assert len({round(value, 6) for value in x_values}) == 1 + assert y_values == sorted(y_values) + _assert_arrangement_slots_avoid_initial_objects(summary, gym_config) + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + assert "Generate one deterministic nominal graph with exactly 36 nominal edges" in ( + task_prompt + ) + + +def test_arrangement_layout_fails_when_row_cannot_fit( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "crowded_blocks_gym_project" + _write_arrangement_project_with_count( + project_dir, + count=6, + cube_size=0.12, + table_half_x=0.18, + table_half_y=0.22, + ) + + def fake_call_arrangement_task_llm(**kwargs): + return { + "objects": [f"cube_{index}" for index in range(1, 7)], + "order_by": "explicit", + "order_direction": "given", + "anchor": "table_center", + "line_axis": "left_to_right", + "task_prompt_summary": "Arrange six oversized cubes left to right.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_arrangement_task_llm", + fake_call_arrangement_task_llm, + ) + + with pytest.raises(ValueError, match="collision-free one-line arrangement"): + generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_crowded_arrangement_agent", + task_description="把六个大方块从左到右排成一行", + prewarm_coacd_cache=False, + ) + + def test_dual_inside_same_container_uses_container_long_axis_slots( tmp_path: Path, monkeypatch: 
pytest.MonkeyPatch, @@ -2440,6 +2584,61 @@ def _write_arrangement_project(project_dir: Path) -> None: ) +def _write_arrangement_project_with_count( + project_dir: Path, + *, + count: int, + cube_size: float, + table_half_x: float = 0.60, + table_half_y: float = 0.40, +) -> None: + _write_minimal_glb( + project_dir / "mesh_assets/table/table_0.glb", + [ + (-table_half_x, -table_half_y, 0.0), + (table_half_x, -table_half_y, 0.0), + (0.0, table_half_y, 0.0), + ], + ) + rigid_objects = [] + for index in range(count): + uid = f"cube_{index + 1}" + _write_minimal_glb( + project_dir / f"mesh_assets/cube/{uid}/{uid}.glb", + [ + (-cube_size / 2.0, -cube_size / 2.0, 0.0), + (cube_size / 2.0, -cube_size / 2.0, 0.0), + (0.0, cube_size / 2.0, cube_size), + ], + ) + y = (index - (count - 1) / 2.0) * (cube_size + 0.01) + rigid_objects.append( + _mesh_object( + uid, + f"mesh_assets/cube/{uid}/{uid}.glb", + [0.0, round(float(y), 6), 0.76], + [0.0, 0.0, 0.0], + ) + ) + + gym_config = { + "id": "Image2Tabletop-arrangement-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 0.0], + ) + ], + "rigid_object": rigid_objects, + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + def _mesh_object( uid: str, fpath: str, @@ -2491,6 +2690,55 @@ def _stable_summary(summary: dict) -> dict: return stable +def _assert_arrangement_slots_avoid_initial_objects( + summary: dict, + gym_config: dict, +) -> None: + clearance = float(summary["layout_clearance"]) + spacing = float(summary["spacing"]) + half_extent = max(0.035, (spacing - clearance) / 2.0) + initial_bounds = [ + _xy_bounds_around(obj["init_pos"][:2], half_extent) + for obj in gym_config["rigid_object"] + ] + for placement in summary["placements"]: + slot_bounds = _xy_bounds_around(placement["target_xy"], half_extent) + assert all( + not _xy_bounds_overlap_for_test( + slot_bounds, + init_bound, + 
clearance=clearance, + ) + for init_bound in initial_bounds + ) + + +def _xy_bounds_around( + xy: list[float], + half_extent: float, +) -> tuple[list[float], list[float]]: + return ( + [float(xy[0]) - half_extent, float(xy[1]) - half_extent], + [float(xy[0]) + half_extent, float(xy[1]) + half_extent], + ) + + +def _xy_bounds_overlap_for_test( + first: tuple[list[float], list[float]], + second: tuple[list[float], list[float]], + *, + clearance: float, +) -> bool: + first_min, first_max = first + second_min, second_max = second + return not ( + first_max[0] + clearance <= second_min[0] + or second_max[0] + clearance <= first_min[0] + or first_max[1] + clearance <= second_min[1] + or second_max[1] + clearance <= first_min[1] + ) + + def _obj_vertices(path: Path) -> list[tuple[float, float, float]]: vertices = [] for line in path.read_text(encoding="utf-8").splitlines(): From 4733e16692f005ac2d682bfb928c72db7867b5f2 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sat, 27 Jun 2026 15:21:35 +0800 Subject: [PATCH 28/33] fix Camera high --- .../action_agent_pipeline/generation/arrangement_spec.py | 2 +- .../generation/templates/default_sensors.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py index fc00ccdf..37d13030 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/arrangement_spec.py @@ -69,7 +69,7 @@ "排成", "一行", ) -_DEFAULT_RELEASE_Z = 0.01 +_DEFAULT_RELEASE_Z = 0.04 _DEFAULT_STAGING_Z_DELTA = 0.10 _SLOT_MARGIN = 0.025 _MIN_SLOT_SPACING = 0.07 diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_sensors.json b/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_sensors.json index 3da01498..09ae7b51 100644 
--- a/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_sensors.json +++ b/embodichain/gen_sim/action_agent_pipeline/generation/templates/default_sensors.json @@ -7,7 +7,7 @@ "intrinsics": [420, 420, 480, 270], "extrinsics": { "pos": [0.4, 0.0, 2.2], - "eye": [0.6, 0.0, 3.3], + "eye": [0.6, 0.0, 1.8], "target": [0.0, 0.0, 0.75], "up": [1.0, 0.0, 0.0] } From f803dbca514f99d3e52cde25a941ebe78282039d Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sat, 27 Jun 2026 18:26:41 +0800 Subject: [PATCH 29/33] Object Manipulation update --- .../env_adapters/tableware/success.py | 56 ++ .../generation/action_agent_config.py | 198 +++++- .../generation/config_blocks.py | 7 + .../generation/config_types.py | 40 ++ .../generation/object_manipulation_spec.py | 35 ++ .../generation/prompt_builders.py | 435 +++++++++++++ .../generation/relative_geometry.py | 15 +- .../generation/relative_spec.py | 254 +++++++- .../generation/stacking_spec.py | 578 ++++++++++++++++++ .../generation/success_specs.py | 127 +++- .../test_ur5_basket_config_generation.py | 485 ++++++++++++++- 11 files changed, 2183 insertions(+), 47 deletions(-) create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/object_manipulation_spec.py create mode 100644 embodichain/gen_sim/action_agent_pipeline/generation/stacking_spec.py diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py index 2826ebc4..5abc4e14 100644 --- a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py +++ b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py @@ -77,6 +77,8 @@ def _evaluate_spec( return _object_axis_near(env, spec) if term_type in {"object_lifted", "object_height_above_initial"}: return _object_lifted(env, spec) + if term_type in {"object_held_by_gripper", "object_gripper_near"}: + return 
_object_held_by_gripper(env, spec) raise ValueError(f"Unsupported success term type: {term_type!r}.") @@ -233,6 +235,60 @@ def _object_lifted(env, spec: Mapping[str, Any]) -> torch.Tensor: return position[:, 2] >= initial_height + float(spec.get("min_height", 0.1)) +def _object_held_by_gripper(env, spec: Mapping[str, Any]) -> torch.Tensor: + object_position = _position(env, _object_name(spec)) + arm_name = str(spec.get("arm", spec.get("robot_name", ""))) + eef_pose = _arm_eef_pose(env, arm_name).to( + dtype=object_position.dtype, + device=object_position.device, + ) + if eef_pose.ndim == 2: + eef_pose = eef_pose.unsqueeze(0) + if eef_pose.shape[0] == 1 and object_position.shape[0] > 1: + eef_pose = eef_pose.expand(object_position.shape[0], -1, -1) + eef_position = eef_pose[:, :3, 3] + near = torch.linalg.norm(object_position - eef_position, dim=-1) <= float( + spec.get("max_distance", 0.12) + ) + return near & _gripper_is_closed(env, arm_name, object_position.device) + + +def _arm_eef_pose(env, arm_name: str) -> torch.Tensor: + if hasattr(env, "get_current_xpos_agent"): + left_pose, right_pose = env.get_current_xpos_agent() + return torch.as_tensor( + right_pose if "right" in arm_name else left_pose, + dtype=torch.float32, + device=env.device, + ) + raise ValueError("object_held_by_gripper requires current eef pose access.") + + +def _gripper_is_closed(env, arm_name: str, device: torch.device) -> torch.Tensor: + if not hasattr(env, "get_current_gripper_state_agent"): + return _constant(env, True) + left_state, right_state = env.get_current_gripper_state_agent() + state = right_state if "right" in arm_name else left_state + state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device) + if state_tensor.numel() == 0: + return _constant(env, True) + state_tensor = ( + state_tensor.reshape(1, -1) if state_tensor.ndim == 1 else state_tensor + ) + if state_tensor.shape[0] == 1 and env.num_envs > 1: + state_tensor = state_tensor.expand(env.num_envs, -1) + 
close_state = getattr(env, "close_state", None) + if close_state is None: + return torch.mean(state_tensor, dim=-1) > 0.0 + close_tensor = torch.as_tensor(close_state, dtype=torch.float32, device=device) + close_tensor = ( + close_tensor.reshape(1, -1) if close_tensor.ndim == 1 else close_tensor + ) + if close_tensor.shape[0] == 1 and state_tensor.shape[0] > 1: + close_tensor = close_tensor.expand(state_tensor.shape[0], -1) + return torch.linalg.norm(state_tensor - close_tensor, dim=-1) < 1e-3 + + def _axis_index(axis: str) -> int: axes = {"x": 0, "y": 1, "z": 2} if axis not in axes: diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py index f9ec5ea2..15769aae 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/action_agent_config.py @@ -33,6 +33,7 @@ _RelativePlacementSpec, _ResolvedTargetReplacement, _SceneObject, + _StackingSpec, ) from embodichain.gen_sim.action_agent_pipeline.generation.scene_objects import ( _collect_scene_objects, @@ -55,6 +56,9 @@ make_relative_atom_actions_prompt, make_relative_basic_background, make_relative_task_prompt, + make_stacking_atom_actions_prompt, + make_stacking_basic_background, + make_stacking_task_prompt, ) from embodichain.gen_sim.action_agent_pipeline.generation.arrangement_spec import ( _build_arrangement_line_spec_with_llm, @@ -62,6 +66,13 @@ _is_arrangement_task_description, _with_arrangement_generated_z_targets, ) +from embodichain.gen_sim.action_agent_pipeline.generation.stacking_spec import ( + _build_stacking_spec_with_llm, + _call_stacking_task_llm, + _is_stacking_task_description, + _make_stacking_summary, + _with_stacking_generated_targets, +) from embodichain.gen_sim.action_agent_pipeline.generation.action_agent_templates import ( make_dual_ur5_robot_config as _make_dual_ur5_robot_config, make_light_config 
as _make_light_config, @@ -111,12 +122,14 @@ _with_self_relative_absolute_targets, ) from embodichain.gen_sim.action_agent_pipeline.generation.relative_spec import ( - _build_relative_placement_spec_with_llm, - _call_relative_task_llm, _normalize_relative_relation, _relative_relation_phrase, _relative_scene_runtime_uid_mapping, ) +from embodichain.gen_sim.action_agent_pipeline.generation.object_manipulation_spec import ( + _build_object_manipulation_spec_with_llm, + _call_object_manipulation_task_llm, +) from embodichain.gen_sim.action_agent_pipeline.generation.replacement_generation import ( _apply_replacement_names, _normalize_target_replacements, @@ -131,13 +144,16 @@ _make_arrangement_extensions_config, _make_extensions_config, _make_relative_extensions_config, + _make_stacking_extensions_config, _object_in_container_success, _validate_arrangement_bundle, _validate_bundle, _validate_relative_bundle, - _validate_success_uids, + _validate_stacking_bundle, ) +_call_relative_task_llm = _call_object_manipulation_task_llm + __all__ = [ "GeneratedActionAgentConfigPaths", "TargetReplacementSpec", @@ -219,6 +235,34 @@ def generate_action_agent_config_from_project( "target_replacements are only supported by the default basket " "template. Do not combine them with task_description." 
) + if _is_stacking_task_description(task_description): + spec = _build_stacking_spec_with_llm( + scene_objects=scene_objects, + project_name=project_name, + scene_dir=scene_dir, + task_description=task_description, + model=llm_model, + task_llm_caller=_call_stacking_task_llm, + ) + bundle = _build_stacking_bundle( + scene_dir=scene_dir, + source_config=source_config, + spec=spec, + project_name=project_name, + task_name=task_name, + max_episodes=max_episodes, + max_episode_steps=max_episode_steps, + mesh_normalizer=mesh_normalizer, + ) + _validate_stacking_bundle(bundle, spec) + _attach_mesh_normalization_summary(bundle, mesh_normalizer) + if prewarm_coacd_cache: + _attach_coacd_cache_summary(bundle) + return _write_config_bundle( + output_dir=output_dir_path, + bundle=bundle, + overwrite=overwrite, + ) if _is_arrangement_task_description(task_description): spec = _build_arrangement_line_spec_with_llm( scene_objects=scene_objects, @@ -247,7 +291,7 @@ def generate_action_agent_config_from_project( bundle=bundle, overwrite=overwrite, ) - spec = _build_relative_placement_spec_with_llm( + spec = _build_object_manipulation_spec_with_llm( scene_objects=scene_objects, project_name=project_name, task_description=task_description, @@ -588,6 +632,141 @@ def _make_arrangement_summary(spec: _ArrangementLineSpec) -> dict[str, Any]: } +def _build_stacking_bundle( + *, + scene_dir: Path, + source_config: Mapping[str, Any], + spec: _StackingSpec, + project_name: str, + task_name: str, + max_episodes: int, + max_episode_steps: int, + mesh_normalizer: MeshFrameNormalizer, +) -> dict[str, Any]: + scene_objects = _collect_scene_objects(source_config) + background_objects = [ + obj for obj in scene_objects if obj.source_role == "background" + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + by_uid = {obj.source_uid: obj for obj in scene_objects} + runtime_uids = _relative_scene_runtime_uid_mapping( + scene_objects, + 
table_source_uid=spec.table_source_uid, + ) + moved_source_uids = {step.source_uid for step in spec.steps} + for step in spec.steps: + runtime_uids[step.source_uid] = step.runtime_uid + + dynamic_rigid_objects = [ + obj for obj in rigid_objects if obj.source_uid in moved_source_uids + ] + static_scene_objects = [ + obj for obj in rigid_objects if obj.source_uid not in moved_source_uids + ] + table_config = _make_background_config( + scene_dir, + by_uid[spec.table_source_uid], + mesh_normalizer, + ) + table_top_z = _mesh_config_world_zmax(table_config) + robot_init_z = _dual_ur5_init_z_from_table_top(table_top_z) + + gym_config = { + "id": "AtomicActionsAgent-v3", + "max_episodes": int(max_episodes), + "max_episode_steps": int(max_episode_steps), + "env": { + "extensions": {}, + "events": _make_arrangement_events_config( + [step.runtime_uid for step in spec.steps], + sensor_config_factory=_make_sensor_config, + ), + "observations": _make_observations_config(), + "dataset": {}, + }, + "robot": _make_dual_ur5_robot_config(robot_init_z=robot_init_z), + "sensor": _make_sensor_config(), + "light": _make_light_config(), + "background": [ + table_config, + *[ + _make_relative_background_object_config( + scene_dir, + obj, + runtime_uids[obj.source_uid], + max_convex_hull_num=1, + mesh_normalizer=mesh_normalizer, + ) + for obj in static_scene_objects + ], + *[ + _make_extra_background_config( + scene_dir, + obj, + mesh_normalizer, + runtime_uid=runtime_uids[obj.source_uid], + ) + for obj in background_objects + if obj.source_uid != spec.table_source_uid + ], + ], + "rigid_object": [ + _make_relative_rigid_object_config( + scene_dir=scene_dir, + obj=obj, + runtime_uid=runtime_uids[obj.source_uid], + body_scale=_source_body_scale(obj), + max_convex_hull_num=16, + mesh_normalizer=mesh_normalizer, + ) + for obj in dynamic_rigid_objects + ], + } + _apply_tabletop_z_placement(gym_config, table_top_z) + spec = _with_stacking_generated_targets(spec, gym_config) + 
gym_config["env"]["extensions"] = _make_stacking_extensions_config(spec) + gym_config["env"]["dataset"] = _make_stacking_dataset_config(project_name, spec) + return { + "gym_config": gym_config, + "agent_config": make_agent_config(), + "task_prompt": make_stacking_task_prompt(task_name, project_name, spec), + "basic_background": make_stacking_basic_background(project_name, spec), + "atom_actions": make_stacking_atom_actions_prompt(spec), + "summary": _make_stacking_summary(spec), + } + + +def _make_stacking_dataset_config( + project_name: str, + spec: _StackingSpec, +) -> dict[str, Any]: + ordered = ", ".join(step.runtime_uid for step in spec.steps) + return { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "DualUR5", + "control_freq": 25, + }, + "instruction": { + "lang": ( + "Move the selected objects to the table center and stack " + f"them bottom-to-top as: {ordered}." + ), + }, + "extra": { + "scene_type": project_name, + "task_description": spec.task_description, + "data_type": "sim", + }, + "use_videos": True, + }, + } + } + + def _attach_coacd_cache_summary(bundle: dict[str, Any]) -> None: from embodichain.gen_sim.action_agent_pipeline.generation.coacd_cache import ( prewarm_coacd_cache_for_gym_config, @@ -631,7 +810,9 @@ def _build_relative_placement_bundle( ) moved_source_uids = {placement.moved_source_uid for placement in spec.placements} reference_runtime_uids = { - placement.reference_runtime_uid for placement in spec.placements + placement.reference_runtime_uid + for placement in spec.placements + if placement.intent == "place_relative" } registered_runtime_uids = sorted( {runtime_uids[obj.source_uid] for obj in rigid_objects} | reference_runtime_uids @@ -710,9 +891,10 @@ def _build_relative_placement_bundle( ], } _apply_tabletop_z_placement(gym_config, table_top_z) - spec = _with_self_relative_absolute_targets(spec, gym_config) - spec = _with_inside_container_slot_offsets(spec, gym_config) 
- spec = _with_on_surface_release_offsets(spec, gym_config) + if spec.intent == "place_relative": + spec = _with_self_relative_absolute_targets(spec, gym_config) + spec = _with_inside_container_slot_offsets(spec, gym_config) + spec = _with_on_surface_release_offsets(spec, gym_config) gym_config["env"]["extensions"] = _make_relative_extensions_config( spec, side_relation_xy_offsets=_side_relation_xy_offsets, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py index 97d4ec48..f5cf9b13 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/config_blocks.py @@ -386,6 +386,13 @@ def _relative_dataset_instruction( *, relation_phrase: Callable[[str], str], ) -> str: + if spec.intent == "hold_hover": + return " ".join( + f"Use the {placement.active_side} UR5 to pick up " + f"{placement.moved_runtime_uid} and keep it hovering in a closed " + "gripper." 
+ for placement in spec.placements + ) if len(spec.placements) == 1: placement = spec.placements[0] return ( diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py index 238b39f4..ee9a30b9 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/config_types.py @@ -23,6 +23,10 @@ __all__ = [ "_ArrangementLineSpec", "_ArrangementLineStepSpec", + "_ObjectManipulationSpec", + "_ObjectManipulationStepSpec", + "_StackingSpec", + "_StackingStepSpec", "GeneratedActionAgentConfigPaths", "TargetReplacementSpec", "_BasketTaskRoles", @@ -89,6 +93,7 @@ class _ResolvedTargetReplacement: @dataclass(frozen=True) class _RelativePlacementStepSpec: + intent: str moved_source_uid: str reference_source_uid: str moved_runtime_uid: str @@ -103,10 +108,12 @@ class _RelativePlacementStepSpec: orientation_goal: str = "preserve" orientation_axis: str = "none" orientation_align_to_runtime_uid: str | None = None + hover_height: float = 0.10 @dataclass(frozen=True) class _RelativePlacementSpec: + intent: str table_source_uid: str moved_source_uid: str reference_source_uid: str @@ -127,6 +134,11 @@ class _RelativePlacementSpec: orientation_goal: str = "preserve" orientation_axis: str = "none" orientation_align_to_runtime_uid: str | None = None + hover_height: float = 0.10 + + +_ObjectManipulationStepSpec = _RelativePlacementStepSpec +_ObjectManipulationSpec = _RelativePlacementSpec @dataclass(frozen=True) @@ -158,3 +170,31 @@ class _ArrangementLineSpec: line_origin_xy: list[float] spacing: float layout_clearance: float + + +@dataclass(frozen=True) +class _StackingStepSpec: + source_uid: str + runtime_uid: str + layer_index: int + active_side: str + target_position: list[float] + high_position: list[float] + support_runtime_uid: str | None = None + size_score: float | None = None + color: str | None = None + 
orientation_goal: str = "preserve" + orientation_axis: str = "none" + + +@dataclass(frozen=True) +class _StackingSpec: + table_source_uid: str + task_description: str + task_prompt_summary: str + basic_background_notes: str + stack_mode: str + order_by: str + anchor: str + anchor_xy: list[float] + steps: tuple[_StackingStepSpec, ...] diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/object_manipulation_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/object_manipulation_spec.py new file mode 100644 index 00000000..78f52e35 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/object_manipulation_spec.py @@ -0,0 +1,35 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from embodichain.gen_sim.action_agent_pipeline.generation.relative_spec import ( + _SIDE_RELATIONS, + _build_object_manipulation_spec_with_llm, + _call_object_manipulation_task_llm, + _normalize_relative_relation, + _relative_relation_phrase, + _relative_scene_runtime_uid_mapping, +) + +__all__ = [ + "_SIDE_RELATIONS", + "_build_object_manipulation_spec_with_llm", + "_call_object_manipulation_task_llm", + "_normalize_relative_relation", + "_relative_relation_phrase", + "_relative_scene_runtime_uid_mapping", +] diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py index 7fef00b6..b85d9c09 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/prompt_builders.py @@ -33,6 +33,9 @@ "make_relative_atom_actions_prompt", "make_relative_basic_background", "make_relative_task_prompt", + "make_stacking_atom_actions_prompt", + "make_stacking_basic_background", + "make_stacking_task_prompt", ] _BASKET_LEFT_RELEASE_OFFSET_Y = 0.04 @@ -67,6 +70,7 @@ class _BasketRolesLike(Protocol): class _RelativePlacementLike(Protocol): + intent: str active_side: str moved_runtime_uid: str moved_source_uid: str @@ -81,6 +85,7 @@ class _RelativePlacementLike(Protocol): orientation_goal: str orientation_axis: str orientation_align_to_runtime_uid: str | None + hover_height: float class _RelativeSpecLike(_RelativePlacementLike, Protocol): @@ -119,6 +124,31 @@ class _ArrangementSpecLike(Protocol): steps: Sequence[_ArrangementStepLike] +class _StackingStepLike(Protocol): + source_uid: str + runtime_uid: str + layer_index: int + active_side: str + target_position: Sequence[float] + high_position: Sequence[float] + support_runtime_uid: str | None + size_score: float | None + color: str | None 
+ orientation_goal: str + orientation_axis: str + + +class _StackingSpecLike(Protocol): + task_description: str + task_prompt_summary: str + basic_background_notes: str + stack_mode: str + order_by: str + anchor: str + anchor_xy: Sequence[float] + steps: Sequence[_StackingStepLike] + + def make_agent_config() -> dict[str, Any]: return { "TaskAgent": { @@ -323,11 +353,250 @@ def _arrangement_atom_action_block(step: _ArrangementStepLike) -> str: {_format_initial_qpos_spec(active_arm, sample_interval=30)}""" +def make_stacking_task_prompt( + task_name: str, + project_name: str, + spec: _StackingSpecLike, +) -> str: + edge_count = sum(_stacking_step_edge_count(step) for step in spec.steps) + edge_index = 1 + step_blocks_list = [] + for step in spec.steps: + step_blocks_list.append(_stacking_step_prompt_block(edge_index, step)) + edge_index += _stacking_step_edge_count(step) + step_blocks = "\n\n".join(step_blocks_list) + stack_order = ", ".join( + f"`{step.runtime_uid}` layer {step.layer_index}" for step in spec.steps + ) + return f"""Task: +{task_name}: {spec.task_prompt_summary} + +This config was generated from a stacking task description by the config-stage +LLM. The execution-stage LLM must now generate the graph JSON from this prompt. + +Original simple task description: +{spec.task_description} + +Stacking plan: +- Stack mode: `{spec.stack_mode}`. +- Anchor: `{spec.anchor}` at xy `{list(spec.anchor_xy)}` in the exported {project_name} environment. +- Ordering rule: `{spec.order_by}`. +- Bottom-to-top order: {stack_order}. + +Generate one deterministic nominal graph with exactly {edge_count} nominal edges. +Use only the atomic action class JSON specs shown below. Do not add recovery, +monitor, search, alignment, or extra lift edges. Execute one object at a time; +do not pick up two objects simultaneously. Move each held object to the high +staging object pose. If a step has `orientation_goal:"preserve"`, do not add a +separate high-orientation/alignment edge. 
Only steps with an explicit +non-preserve orientation goal may align at the same high pose before moving down +to the final object pose. Release with the exact relative-zero `Place` spec, +retreat the empty hand upward, then return that arm to its initial pose. + +{step_blocks} + +Final state: the objects must be stacked at the configured table-center anchor. +For `on_top`, each upper layer rests on the previous layer. For `nested`, each +upper bowl is nested into the previous bowl. Use the exact absolute +target_object_pose JSON specs shown above; do not rewrite them. +""" + + +def _stacking_step_edge_count(step: _StackingStepLike) -> int: + return 6 if step.orientation_goal == "preserve" else 7 + + +def _stacking_step_prompt_block(start_edge: int, step: _StackingStepLike) -> str: + active_arm = f"{step.active_side}_arm" + active_slot = f"{step.active_side}_arm_action" + inactive_slot = f"{'right' if step.active_side == 'left' else 'left'}_arm_action" + high_preserve_spec = _format_pose_absolute_spec( + active_arm, + step.high_position, + sample_interval=45, + orientation_goal="preserve", + orientation_axis="none", + ) + if step.orientation_goal == "preserve": + high_oriented_spec = high_preserve_spec + else: + high_oriented_spec = _format_pose_absolute_spec( + active_arm, + step.high_position, + sample_interval=45, + orientation_goal=step.orientation_goal, + orientation_axis=step.orientation_axis, + ) + release_move_spec = _format_pose_absolute_spec( + active_arm, + step.target_position, + sample_interval=45, + orientation_goal=step.orientation_goal, + orientation_axis=step.orientation_axis, + ) + if step.orientation_goal == "preserve": + return f"""{start_edge}. Pick up `{step.runtime_uid}` for stack layer {step.layer_index}: + - {active_slot}: {_format_pick_up_spec(active_arm, step.runtime_uid)} + - {inactive_slot}: null + +{start_edge + 1}. 
Move `{step.runtime_uid}` to the high staging pose without changing orientation: + - {active_slot}: {high_preserve_spec} + - {inactive_slot}: null + +{start_edge + 2}. Move `{step.runtime_uid}` down to the final stack object pose without changing orientation: + - {active_slot}: {release_move_spec} + - {inactive_slot}: null + +{start_edge + 3}. Release `{step.runtime_uid}` in-place without moving the object pose: + - {active_slot}: {_format_release_only_place_spec(active_arm)} + - {inactive_slot}: null + +{start_edge + 4}. Retreat `{active_arm}` upward after release: + - {active_slot}: {_format_empty_hand_retreat_spec(active_arm)} + - {inactive_slot}: null + +{start_edge + 5}. Return `{active_arm}` to its initial pose: + - {active_slot}: {_format_initial_qpos_spec(active_arm, sample_interval=30)} + - {inactive_slot}: null""" + return f"""{start_edge}. Pick up `{step.runtime_uid}` for stack layer {step.layer_index}: + - {active_slot}: {_format_pick_up_spec(active_arm, step.runtime_uid)} + - {inactive_slot}: null + +{start_edge + 1}. Move `{step.runtime_uid}` to the high staging pose without changing orientation: + - {active_slot}: {high_preserve_spec} + - {inactive_slot}: null + +{start_edge + 2}. Align `{step.runtime_uid}` at the high staging pose if the spec requires it: + - {active_slot}: {high_oriented_spec} + - {inactive_slot}: null + +{start_edge + 3}. Move `{step.runtime_uid}` down to the final stack object pose: + - {active_slot}: {release_move_spec} + - {inactive_slot}: null + +{start_edge + 4}. Release `{step.runtime_uid}` in-place without moving the object pose: + - {active_slot}: {_format_release_only_place_spec(active_arm)} + - {inactive_slot}: null + +{start_edge + 5}. Retreat `{active_arm}` upward after release: + - {active_slot}: {_format_empty_hand_retreat_spec(active_arm)} + - {inactive_slot}: null + +{start_edge + 6}. 
Return `{active_arm}` to its initial pose: + - {active_slot}: {_format_initial_qpos_spec(active_arm, sample_interval=30)} + - {inactive_slot}: null""" + + +def make_stacking_basic_background( + project_name: str, + spec: _StackingSpecLike, +) -> str: + notes = spec.basic_background_notes or ( + "No extra scene notes were provided by the config-stage LLM." + ) + object_lines = "\n".join( + _stacking_object_background_line(step) for step in spec.steps + ) + return f"""The scene comes from the exported {project_name} mesh environment. + +This configuration directory is for a Dual-UR5 stacking task generated from a +simple natural-language task description. + +The robot is a dual-UR5 composite robot with DH_PGI_140_80 parallel grippers: +- left_arm is the semantic robot-view left slot, mapped to the physical + right_arm control part. +- right_arm is the semantic robot-view right slot, mapped to the physical + left_arm control part. + +Stack mode: `{spec.stack_mode}` at table-center xy `{list(spec.anchor_xy)}`. + +Interactive task objects and stack layers: +{object_lines} + +Config-stage LLM notes: +{notes} + +The execution-stage LLM should manipulate one object at a time, release it in +place, retreat upward with an empty gripper, and then return the active arm to +its initial pose before starting the next stack layer. +""" + + +def _stacking_object_background_line(step: _StackingStepLike) -> str: + attrs = [] + if step.color: + attrs.append(f"color={step.color}") + if step.size_score is not None: + attrs.append(f"size_score={float(step.size_score):.6g}") + attr_text = f" ({', '.join(attrs)})" if attrs else "" + support = step.support_runtime_uid or "table" + return ( + f"- {step.runtime_uid}: source `{step.source_uid}`{attr_text}, " + f"layer {step.layer_index}, support `{support}`, " + f"target_position={list(step.target_position)}, " + f"handled by {step.active_side}_arm." 
+ ) + + +def make_stacking_atom_actions_prompt(spec: _StackingSpecLike) -> str: + blocks = "\n\n".join(_stacking_atom_action_block(step) for step in spec.steps) + return f"""### Atomic Action Class JSON Specs for Dual-UR5 Stacking + +Use only the native atomic action class JSON specs shown below. Each object is +moved to an absolute table-center stack pose computed by the config-stage +generator. Keep the non-active arm null for each listed object. + +{blocks} +""" + + +def _stacking_atom_action_block(step: _StackingStepLike) -> str: + active_arm = f"{step.active_side}_arm" + high_oriented_spec = _format_pose_absolute_spec( + active_arm, + step.high_position, + sample_interval=45, + orientation_goal=step.orientation_goal, + orientation_axis=step.orientation_axis, + ) + if step.orientation_goal == "preserve": + return f"""Object `{step.runtime_uid}` to stack layer {step.layer_index}: +- Pick up: + {_format_pick_up_spec(active_arm, step.runtime_uid)} +- High staging without orientation change: + {_format_pose_absolute_spec(active_arm, step.high_position, sample_interval=45, orientation_goal="preserve", orientation_axis="none")} +- Final stack object pose without orientation change: + {_format_pose_absolute_spec(active_arm, step.target_position, sample_interval=45, orientation_goal="preserve", orientation_axis="none")} +- Release-only Place: + {_format_release_only_place_spec(active_arm)} +- Empty-hand retreat: + {_format_empty_hand_retreat_spec(active_arm)} +- Return: + {_format_initial_qpos_spec(active_arm, sample_interval=30)}""" + return f"""Object `{step.runtime_uid}` to stack layer {step.layer_index}: +- Pick up: + {_format_pick_up_spec(active_arm, step.runtime_uid)} +- High staging without orientation change: + {_format_pose_absolute_spec(active_arm, step.high_position, sample_interval=45, orientation_goal="preserve", orientation_axis="none")} +- High staging orientation: + {high_oriented_spec} +- Final stack object pose: + 
{_format_pose_absolute_spec(active_arm, step.target_position, sample_interval=45, orientation_goal=step.orientation_goal, orientation_axis=step.orientation_axis)} +- Release-only Place: + {_format_release_only_place_spec(active_arm)} +- Empty-hand retreat: + {_format_empty_hand_retreat_spec(active_arm)} +- Return: + {_format_initial_qpos_spec(active_arm, sample_interval=30)}""" + + def make_relative_task_prompt( task_name: str, project_name: str, spec: _RelativeSpecLike, ) -> str: + if spec.intent == "hold_hover": + return _make_hold_hover_task_prompt(task_name, project_name, spec) if len(spec.placements) > 1: return _make_dual_relative_task_prompt(task_name, project_name, spec) @@ -507,6 +776,8 @@ def _make_dual_relative_task_prompt( project_name: str, spec: _RelativeSpecLike, ) -> str: + if spec.intent == "hold_hover": + return _make_hold_hover_task_prompt(task_name, project_name, spec) first, second = spec.placements first_arm = f"{first.active_side}_arm" second_arm = f"{second.active_side}_arm" @@ -637,6 +908,80 @@ def _make_dual_relative_task_prompt( """ +def _make_hold_hover_task_prompt( + task_name: str, + project_name: str, + spec: _RelativeSpecLike, +) -> str: + pick_actions = { + f"{placement.active_side}_arm_action": _format_pick_up_spec( + f"{placement.active_side}_arm", + placement.moved_runtime_uid, + ) + for placement in spec.placements + } + hover_actions = { + f"{placement.active_side}_arm_action": _format_hover_move_spec( + f"{placement.active_side}_arm", + placement, + ) + for placement in spec.placements + } + close_actions = { + f"{placement.active_side}_arm_action": _format_gripper_spec( + f"{placement.active_side}_arm", + "close", + sample_interval=10, + post_hold_steps=20, + ) + for placement in spec.placements + } + for side in ("left", "right"): + pick_actions.setdefault(f"{side}_arm_action", None) + hover_actions.setdefault(f"{side}_arm_action", None) + close_actions.setdefault(f"{side}_arm_action", None) + + numbered_edges = 
_format_numbered_edge_blocks( + [ + ("Pick up the selected object(s)", pick_actions), + ( + "Move the held object(s) to the hover pose without releasing", + hover_actions, + ), + ("Keep the gripper(s) closed and finish while holding", close_actions), + ] + ) + objects = ", ".join( + f"`{placement.moved_runtime_uid}` with {placement.active_side}_arm" + for placement in spec.placements + ) + return f"""Task: +{task_name}: {spec.task_prompt_summary} + +This config was generated from an object-manipulation task description by the +config-stage LLM. The execution-stage LLM must now generate the graph JSON from +this prompt. + +Original simple task description: +{spec.task_description} + +Object and arm mapping: +- Hold-hover manipulation(s): {objects}. +- Do not release any held object. +- Do not return a holding arm to its initial qpos. + +Generate one deterministic nominal graph with exactly 3 nominal edges. +Use only the atomic action class JSON specs shown below. Do not add recovery, +monitor, search, release, placement, or return-to-initial edges. The final state +must keep every selected object hovering in a closed gripper. + +{numbered_edges} + +Final state: every selected object must remain lifted and held by its assigned +UR5 arm in the exported {project_name} environment config. 
+""" + + def _dual_relative_release_edge_blocks( *, placement: _RelativePlacementLike, @@ -826,6 +1171,8 @@ def make_relative_basic_background( project_name: str, spec: _RelativeSpecLike, ) -> str: + if spec.intent == "hold_hover": + return _make_hold_hover_basic_background(project_name, spec) if len(spec.placements) > 1: return _make_dual_relative_basic_background(project_name, spec) @@ -869,6 +1216,8 @@ def _make_dual_relative_basic_background( project_name: str, spec: _RelativeSpecLike, ) -> str: + if spec.intent == "hold_hover": + return _make_hold_hover_basic_background(project_name, spec) notes = spec.basic_background_notes or ( "No extra scene notes were provided by the config-stage LLM." ) @@ -906,7 +1255,45 @@ def _make_dual_relative_basic_background( """ +def _make_hold_hover_basic_background( + project_name: str, + spec: _RelativeSpecLike, +) -> str: + notes = spec.basic_background_notes or ( + "No extra scene notes were provided by the config-stage LLM." + ) + object_lines = "\n".join( + f"- {placement.moved_runtime_uid}: source `{placement.moved_source_uid}`, " + f"handled by {placement.active_side}_arm, hover_height={placement.hover_height}." + for placement in spec.placements + ) + return f"""The scene comes from the exported {project_name} mesh environment. + +This configuration directory is for a Dual-UR5 object-manipulation hold-hover +task generated from a simple natural-language task description. + +The robot is a dual-UR5 composite robot with DH_PGI_140_80 parallel grippers: +- left_arm is the semantic robot-view left slot, mapped to the physical + right_arm control part. +- right_arm is the semantic robot-view right slot, mapped to the physical + left_arm control part. + +Hold-hover task objects: +{object_lines} + +Config-stage LLM notes: +{notes} + +The execution-stage LLM should pick up the selected object(s), move them to the +configured hover pose if needed, and keep the gripper(s) closed. 
It must not use +`Place` or return a holding arm to its initial qpos because the final state is +the object still hovering in the gripper. +""" + + def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: + if spec.intent == "hold_hover": + return _make_hold_hover_atom_actions_prompt(spec) if len(spec.placements) > 1: return _make_dual_relative_atom_actions_prompt(spec) @@ -931,6 +1318,8 @@ def make_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: + if spec.intent == "hold_hover": + return _make_hold_hover_atom_actions_prompt(spec) first, second = spec.placements first_arm = f"{first.active_side}_arm" second_arm = f"{second.active_side}_arm" @@ -960,6 +1349,31 @@ def _make_dual_relative_atom_actions_prompt(spec: _RelativeSpecLike) -> str: """ +def _make_hold_hover_atom_actions_prompt(spec: _RelativeSpecLike) -> str: + blocks = "\n\n".join( + _hold_hover_atom_action_block(placement) for placement in spec.placements + ) + return f"""### Atomic Action Class JSON Specs for Dual-UR5 Object Manipulation + +Use only the native atomic action class JSON specs shown below. The final state +must keep the listed object(s) held in closed grippers. Do not use `Place` and +do not return a holding arm to its initial qpos. 
+ +{blocks} +""" + + +def _hold_hover_atom_action_block(placement: _RelativePlacementLike) -> str: + active_arm = f"{placement.active_side}_arm" + return f"""Object `{placement.moved_runtime_uid}` hold-hover: +- Pick up: + {_format_pick_up_spec(active_arm, placement.moved_runtime_uid)} +- Hover move: + {_format_hover_move_spec(active_arm, placement)} +- Keep gripper closed: + {_format_gripper_spec(active_arm, "close", sample_interval=10, post_hold_steps=20)}""" + + def make_basket_task_prompt( task_name: str, project_name: str, @@ -1345,6 +1759,27 @@ def _format_relative_pose_spec( ) +def _format_hover_move_spec( + robot_name: str, + placement: _RelativePlacementLike, +) -> str: + return _compact_json( + { + "atomic_action_class": "MoveHeldObject", + "robot_name": robot_name, + "control": "arm", + "target_object_pose": { + "reference": "relative", + "offset": [0.0, 0.0, float(placement.hover_height)], + "frame": "world", + "orientation_goal": "preserve", + "orientation_axis": "none", + }, + "cfg": {"sample_interval": 45}, + } + ) + + def _format_relative_place_spec( robot_name: str, placement: _RelativePlacementLike, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py index cab92f0d..ba501382 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_geometry.py @@ -119,6 +119,7 @@ def _with_self_relative_absolute_targets( ) primary = placements[0] return _RelativePlacementSpec( + intent=primary.intent, table_source_uid=spec.table_source_uid, moved_source_uid=primary.moved_source_uid, reference_source_uid=primary.reference_source_uid, @@ -139,6 +140,7 @@ def _with_self_relative_absolute_targets( orientation_goal=primary.orientation_goal, orientation_axis=primary.orientation_axis, orientation_align_to_runtime_uid=primary.orientation_align_to_runtime_uid, + 
hover_height=primary.hover_height, ) @@ -157,6 +159,7 @@ def _with_self_relative_absolute_target( release_position = _offset_position(initial_position, placement.release_offset) high_position = _offset_position(initial_position, placement.high_offset) return _RelativePlacementStepSpec( + intent=placement.intent, moved_source_uid=placement.moved_source_uid, reference_source_uid=placement.reference_source_uid, moved_runtime_uid=placement.moved_runtime_uid, @@ -171,6 +174,7 @@ def _with_self_relative_absolute_target( orientation_goal=placement.orientation_goal, orientation_axis=placement.orientation_axis, orientation_align_to_runtime_uid=placement.orientation_align_to_runtime_uid, + hover_height=placement.hover_height, ) @@ -268,6 +272,7 @@ def _replace_relative_spec_placements( orientation_goal=primary.orientation_goal, orientation_axis=primary.orientation_axis, orientation_align_to_runtime_uid=primary.orientation_align_to_runtime_uid, + hover_height=primary.hover_height, ) @@ -495,25 +500,29 @@ def _offset_position( def _make_relative_summary(spec: _RelativePlacementSpec) -> dict[str, Any]: if len(spec.placements) == 1: return { - "mode": "relative_placement", + "mode": "object_manipulation", + "intent": spec.intent, "moved_object": spec.moved_runtime_uid, "reference_object": spec.reference_runtime_uid, "relation": spec.relation, "active_arm": f"{spec.active_side}_arm", "release_offset": spec.release_offset, + "hover_height": spec.hover_height, "orientation_goal": spec.orientation_goal, "orientation_axis": spec.orientation_axis, "orientation_align_to": spec.orientation_align_to_runtime_uid, } return { - "mode": "dual_arm_relative_placement", - "placements": [ + "mode": "dual_arm_object_manipulation", + "manipulations": [ { + "intent": placement.intent, "moved_object": placement.moved_runtime_uid, "reference_object": placement.reference_runtime_uid, "relation": placement.relation, "active_arm": f"{placement.active_side}_arm", "release_offset": 
placement.release_offset, + "hover_height": placement.hover_height, "orientation_goal": placement.orientation_goal, "orientation_axis": placement.orientation_axis, "orientation_align_to": placement.orientation_align_to_runtime_uid, diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py index b5834505..e83aad9b 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/relative_spec.py @@ -42,7 +42,9 @@ __all__ = [ "_SIDE_RELATIONS", + "_build_object_manipulation_spec_with_llm", "_build_relative_placement_spec_with_llm", + "_call_object_manipulation_task_llm", "_normalize_relative_relation", "_relative_relation_phrase", "_relative_scene_runtime_uid_mapping", @@ -62,6 +64,8 @@ } _SIDE_RELATIONS = _RELATIVE_RELATIONS - {"inside", "on"} +_SUPPORTED_MANIPULATION_INTENTS = {"place_relative", "hold_hover"} +_DEFAULT_HOVER_HEIGHT = 0.10 _SELF_REFERENCE_VALUES = { "self", @@ -207,6 +211,29 @@ def _build_relative_placement_spec_with_llm( ) +def _build_object_manipulation_spec_with_llm( + *, + scene_objects: list[_SceneObject], + project_name: str, + task_description: str, + model: str | None, + release_offset_fn: Callable[[str], Sequence[float]], + staging_z_delta: float, + pose_sensitive_staging_z_delta: float, + task_llm_caller: Callable[..., Mapping[str, Any]] | None = None, +) -> _RelativePlacementSpec: + return _build_relative_placement_spec_with_llm( + scene_objects=scene_objects, + project_name=project_name, + task_description=task_description, + model=model, + release_offset_fn=release_offset_fn, + staging_z_delta=staging_z_delta, + pose_sensitive_staging_z_delta=pose_sensitive_staging_z_delta, + task_llm_caller=task_llm_caller or _call_object_manipulation_task_llm, + ) + + def _call_relative_task_llm( *, project_name: str, @@ -330,6 +357,104 @@ def _call_relative_task_llm( return 
extract_json_object(content) +def _call_object_manipulation_task_llm( + *, + project_name: str, + task_description: str, + scene_summary: list[dict[str, Any]], + model: str | None, +) -> dict[str, Any]: + from langchain_core.messages import HumanMessage, SystemMessage + + from embodichain.gen_sim.action_agent_pipeline.utils.llm_json import ( + extract_json_object, + ) + from embodichain.gen_sim.action_agent_pipeline.utils.mllm import ( + create_chat_openai, + ) + + prompt = ( + "Parse a simple Dual-UR5 tabletop object-manipulation task and produce " + "one constrained config-level JSON spec. The generator computes offsets, " + "robot config, success JSON, and action prompts deterministically.\n\n" + "Return exactly one JSON object with this schema:\n" + "{\n" + ' "manipulations": [\n' + " {\n" + ' "intent": "place_relative|hold_hover",\n' + ' "moved_object": "",\n' + ' "arm": "left|right|auto",\n' + ' "reference_object": "",\n' + ' "goal_relation": "inside|on|left_of|right_of|front_of|behind|front_left_of|back_left_of|front_right_of|back_right_of",\n' + ' "hover_height": 0.10,\n' + ' "orientation_goal": "preserve|upright|lay_flat|axis_align",\n' + ' "orientation_reference": "none|world_axes|reference_object",\n' + ' "orientation_axis": "none|x|y|long_axis|short_axis"\n' + " }\n" + " ],\n" + ' "task_prompt_summary": "",\n' + ' "basic_background_notes": "",\n' + ' "action_sketch": [""]\n' + "}\n\n" + "Rules:\n" + "- Use only source_uid values from the scene objects listed below.\n" + "- Use intent='hold_hover' when the task asks to pick up, lift, hold, " + "or suspend an object in the air without placing or releasing it. 
For " + "hold_hover, omit reference_object and goal_relation, use " + "orientation_goal='preserve', orientation_reference='none', " + "orientation_axis='none', and hover_height=0.10 unless the user gives a " + "specific height.\n" + "- Use intent='place_relative' for tasks that ask to place, put, stack " + "onto, move beside, move into, or release an object at a spatial target. " + "For place_relative, include reference_object and goal_relation.\n" + "- Return exactly two manipulations for a dual-arm task. Treat the task " + "as dual-arm when it explicitly says 双臂, 两臂, both arms, two arms, " + "一只机械臂...另一只机械臂, or separate work for left and right arms.\n" + "- Do not mix hold_hover and place_relative in one response; v1 only " + "supports homogeneous manipulation intents.\n" + "- For dual-arm tasks, use two different moved_object values and one " + "left arm plus one right arm. Use arm='auto' only when the user did not " + "specify which arm handles that manipulation.\n" + "- For place_relative, reference_object may be a rigid_object or a " + "background object such as a pad, tray, basket, or container. For " + "single-object directional tasks from the object's initial position, " + "set reference_object to the moved object or 'self'.\n" + "- If the task says to release an object above a basket/container so it " + "falls into it, use goal_relation='inside'. If it says to place on a " + "non-container support, use goal_relation='on'.\n" + "- orientation_goal captures the held object's intended pose before " + "release. 
Use 'upright' for 扶正/竖起来, 'lay_flat' for 平放/横放, " + "'axis_align' for 水平摆正/摆正/alignment, and 'preserve' otherwise.\n" + "- For axis_align, use orientation_reference='reference_object' with " + "orientation_axis='long_axis' for aligning to a pad/box/container long " + "side, or orientation_reference='world_axes' with orientation_axis='x' " + "or 'y' only when a world/table axis is explicit.\n" + "- Do not return numeric offsets, object poses, robot config, success " + "JSON, or full prompt files.\n\n" + f"Project: {project_name}\n" + f"Task description:\n{task_description}\n" + f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}" + ) + llm = create_chat_openai( + temperature=0.0, + model=model, + usage_stage="config_generation.object_manipulation_task", + ) + response = llm.invoke( + [ + SystemMessage( + content=( + "You produce strict JSON specs for simulation config " + "generation. Do not include markdown." + ) + ), + HumanMessage(content=prompt), + ] + ) + content = getattr(response, "content", response) + return extract_json_object(content) + + def _apply_relative_task_response( *, response: Mapping[str, Any], @@ -349,7 +474,7 @@ def _apply_relative_task_response( placement_entries = _relative_placement_entries(response) if len(placement_entries) > 2: - raise ValueError("Relative placement supports at most two arm placements.") + raise ValueError("Object manipulation supports at most two arm actions.") forced_arm_sides = _relative_forced_arm_sides( placement_entries, @@ -383,6 +508,7 @@ def _apply_relative_task_response( primary = placements[0] return _RelativePlacementSpec( + intent=primary.intent, table_source_uid=table_source_uid, moved_source_uid=primary.moved_source_uid, reference_source_uid=primary.reference_source_uid, @@ -403,11 +529,12 @@ def _apply_relative_task_response( orientation_goal=primary.orientation_goal, orientation_axis=primary.orientation_axis, 
orientation_align_to_runtime_uid=primary.orientation_align_to_runtime_uid, + hover_height=primary.hover_height, ) def _relative_placement_entries(response: Mapping[str, Any]) -> list[Mapping[str, Any]]: - placements = response.get("placements") + placements = response.get("manipulations", response.get("placements")) if placements is None: return [response] if not isinstance(placements, list) or not placements: @@ -476,27 +603,33 @@ def _build_relative_placement_step( staging_z_delta: float, pose_sensitive_staging_z_delta: float, ) -> _RelativePlacementStepSpec: + intent = _normalize_manipulation_intent(entry.get("intent")) moved_source_uid = _resolve_rigid_source_uid( entry.get("moved_object"), rigid_objects, field_name="moved_object", ) - relation = _normalize_relative_relation(entry.get("goal_relation")) - reference_source_uid = _resolve_relative_reference_source_uid( - entry.get("reference_object"), - moved_source_uid=moved_source_uid, - scene_objects=scene_objects, - ) - reference_is_initial_pose = moved_source_uid == reference_source_uid - if reference_is_initial_pose and relation not in _SIDE_RELATIONS: - raise ValueError( - "Initial-position self-relative placement only supports directional " - "relations, not inside/on." + if intent == "hold_hover": + relation = "on" + reference_source_uid = moved_source_uid + reference_is_initial_pose = True + else: + relation = _normalize_relative_relation(entry.get("goal_relation")) + reference_source_uid = _resolve_relative_reference_source_uid( + entry.get("reference_object"), + moved_source_uid=moved_source_uid, + scene_objects=scene_objects, ) + reference_is_initial_pose = moved_source_uid == reference_source_uid + if reference_is_initial_pose and relation not in _SIDE_RELATIONS: + raise ValueError( + "Initial-position self-relative placement only supports directional " + "relations, not inside/on." 
+ ) - reference_obj = by_uid[reference_source_uid] - if relation == "on" and _is_container_like(reference_obj): - relation = "inside" + reference_obj = by_uid[reference_source_uid] + if relation == "on" and _is_container_like(reference_obj): + relation = "inside" moved_runtime_uid = runtime_uids[moved_source_uid] reference_runtime_uid = runtime_uids[reference_source_uid] @@ -509,6 +642,12 @@ def _build_relative_placement_step( entry.get("orientation_reference") ) orientation_axis = _normalize_orientation_axis(entry.get("orientation_axis")) + if intent == "hold_hover" and ( + orientation_goal != "preserve" + or orientation_reference != "none" + or orientation_axis != "none" + ): + raise ValueError("hold_hover requires preserve orientation fields.") _validate_orientation_fields( orientation_goal=orientation_goal, orientation_reference=orientation_reference, @@ -520,13 +659,19 @@ def _build_relative_placement_step( else None ) - release_offset = [float(value) for value in release_offset_fn(relation)] + if intent == "hold_hover": + hover_height = _normalize_hover_height(entry.get("hover_height")) + release_offset = [0.0, 0.0, hover_height] + else: + hover_height = _DEFAULT_HOVER_HEIGHT + release_offset = [float(value) for value in release_offset_fn(relation)] high_offset = list(release_offset) - high_offset[2] += float( - pose_sensitive_staging_z_delta - if orientation_goal != "preserve" - else staging_z_delta - ) + if intent == "place_relative": + high_offset[2] += float( + pose_sensitive_staging_z_delta + if orientation_goal != "preserve" + else staging_z_delta + ) moved_position = _vector3( by_uid[moved_source_uid].config.get("init_pos", [0, 0, 0]) ) @@ -542,6 +687,7 @@ def _build_relative_placement_step( ) return _RelativePlacementStepSpec( + intent=intent, moved_source_uid=moved_source_uid, reference_source_uid=reference_source_uid, moved_runtime_uid=moved_runtime_uid, @@ -554,6 +700,7 @@ def _build_relative_placement_step( orientation_goal=orientation_goal, 
orientation_axis=orientation_axis, orientation_align_to_runtime_uid=orientation_align_to_runtime_uid, + hover_height=hover_height, ) @@ -561,18 +708,59 @@ def _validate_relative_placements( placements: tuple[_RelativePlacementStepSpec, ...], ) -> None: if not placements: - raise ValueError("Relative placement requires at least one placement.") + raise ValueError("Object manipulation requires at least one manipulation.") moved_source_uids = [placement.moved_source_uid for placement in placements] if len(moved_source_uids) != len(set(moved_source_uids)): - raise ValueError("Relative placements must use distinct moved_object values.") + raise ValueError("Object manipulations must use distinct moved_object values.") + intents = {placement.intent for placement in placements} + if len(intents) > 1: + raise ValueError("Mixed manipulation intents are not supported in v1.") if len(placements) == 2: active_sides = {placement.active_side for placement in placements} if active_sides != {"left", "right"}: raise ValueError( - "Dual-arm relative placement requires one left arm and one right arm." + "Dual-arm object manipulation requires one left arm and one right arm." ) +def _normalize_manipulation_intent(value: Any) -> str: + if value is None: + return "place_relative" + text = str(value).strip().lower().replace("-", "_").replace(" ", "_") + aliases = { + "relative": "place_relative", + "relative_placement": "place_relative", + "place": "place_relative", + "put": "place_relative", + "hold": "hold_hover", + "hover": "hold_hover", + "pick_hold": "hold_hover", + "pick_and_hold": "hold_hover", + "lift": "hold_hover", + "悬空": "hold_hover", + "拿起悬空": "hold_hover", + } + text = aliases.get(text, text) + if text not in _SUPPORTED_MANIPULATION_INTENTS: + raise ValueError( + f"Unsupported manipulation intent {value!r}; expected one of " + f"{sorted(_SUPPORTED_MANIPULATION_INTENTS)}." 
+ ) + return text + + +def _normalize_hover_height(value: Any) -> float: + if value is None: + return _DEFAULT_HOVER_HEIGHT + try: + height = float(value) + except (TypeError, ValueError) as exc: + raise ValueError(f"Invalid hover_height {value!r}.") from exc + if height <= 0.0 or height > 0.5: + raise ValueError("hover_height must be in (0.0, 0.5].") + return height + + def _resolve_rigid_source_uid( value: Any, rigid_objects: list[_SceneObject], @@ -864,11 +1052,16 @@ def _default_relative_plan_summary( ) -> str: if len(placements) == 1: placement = placements[0] + if placement.intent == "hold_hover": + return f"Pick up `{placement.moved_runtime_uid}` and keep it hovering." return _default_relative_task_summary( placement.moved_runtime_uid, placement.reference_runtime_uid, placement.relation, ) + if all(placement.intent == "hold_hover" for placement in placements): + held = ", ".join(placement.moved_runtime_uid for placement in placements) + return f"Use both UR5 arms to pick up and hold hovering objects: {held}." placement_text = "; ".join( f"use the {placement.active_side} UR5 to move " f"`{placement.moved_runtime_uid}` " @@ -876,7 +1069,7 @@ def _default_relative_plan_summary( f"`{placement.reference_runtime_uid}`" for placement in placements ) - return f"Use both UR5 arms for a dual-arm relative placement: {placement_text}." + return f"Use both UR5 arms for object manipulation: {placement_text}." 
def _default_relative_action_sketch( @@ -884,6 +1077,12 @@ def _default_relative_action_sketch( ) -> list[str]: if len(placements) == 1: placement = placements[0] + if placement.intent == "hold_hover": + return [ + f"grasp {placement.moved_runtime_uid}", + "lift and keep the object hovering without release", + "keep the gripper closed", + ] return [ f"grasp {placement.moved_runtime_uid}", ( @@ -893,6 +1092,9 @@ def _default_relative_action_sketch( "place at the release pose with Place", ] sketch = ["grasp both moved objects with their assigned arms"] + if all(placement.intent == "hold_hover" for placement in placements): + sketch.append("keep both objects hovering with closed grippers") + return sketch for placement in placements: sketch.extend( [ diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/stacking_spec.py b/embodichain/gen_sim/action_agent_pipeline/generation/stacking_spec.py new file mode 100644 index 00000000..fc780026 --- /dev/null +++ b/embodichain/gen_sim/action_agent_pipeline/generation/stacking_spec.py @@ -0,0 +1,578 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +from collections.abc import Callable, Mapping, Sequence +from dataclasses import replace +from pathlib import Path +from typing import Any +import json + +from embodichain.gen_sim.action_agent_pipeline.generation.config_types import ( + _SceneObject, + _StackingSpec, + _StackingStepSpec, +) +from embodichain.gen_sim.action_agent_pipeline.generation.mesh_bounds import ( + _clean_vector3, + _iter_generated_scene_object_configs, + _mesh_config_local_zmin_after_rotation, + _mesh_config_world_z_bounds, +) +from embodichain.gen_sim.action_agent_pipeline.generation.naming import ( + _base_name, + _normalize_runtime_uid, + _string_list, +) +from embodichain.gen_sim.action_agent_pipeline.generation.scene_objects import ( + _arm_side_for_position, + _pick_table, +) + +__all__ = [ + "_build_stacking_spec_with_llm", + "_call_stacking_task_llm", + "_is_stacking_task_description", + "_make_stacking_summary", + "_with_stacking_generated_targets", +] + +_STACKING_KEYWORDS = ( + "stack", + "stacking", + "pile", + "叠", + "叠放", + "堆叠", + "摞", +) +_SUPPORTED_STACK_MODES = {"on_top", "nested"} +_SUPPORTED_ORDER_BY = {"explicit", "size"} +_STACKING_ANCHOR = "table_center" +_STAGING_Z_DELTA = 0.10 +_STACK_CLEARANCE = 0.003 + + +def _is_stacking_task_description(task_description: str) -> bool: + text = task_description.strip().lower() + return any(keyword in text for keyword in _STACKING_KEYWORDS) + + +def _build_stacking_spec_with_llm( + *, + scene_objects: list[_SceneObject], + project_name: str, + scene_dir: Path, + task_description: str, + model: str | None, + task_llm_caller: Callable[..., Mapping[str, Any]] | None = None, +) -> _StackingSpec: + background_objects = [ + obj for obj in scene_objects if obj.source_role == "background" + ] + rigid_objects = [obj for obj in scene_objects if obj.source_role == "rigid_object"] + if not background_objects: + raise 
ValueError("Stacking generation requires a background table.") + if len(rigid_objects) < 2: + raise ValueError("Stacking generation requires at least two movable objects.") + + table = _pick_table(background_objects) + scene_summary = _make_stacking_scene_summary(scene_objects, scene_dir=scene_dir) + if task_llm_caller is None: + task_llm_caller = _call_stacking_task_llm + response = task_llm_caller( + project_name=project_name, + task_description=task_description, + scene_summary=scene_summary, + model=model, + ) + return _apply_stacking_task_response( + response=response, + table_source_uid=table.source_uid, + scene_objects=scene_objects, + rigid_objects=rigid_objects, + scene_dir=scene_dir, + task_description=task_description, + ) + + +def _call_stacking_task_llm( + *, + project_name: str, + task_description: str, + scene_summary: list[dict[str, Any]], + model: str | None, +) -> dict[str, Any]: + from langchain_core.messages import HumanMessage, SystemMessage + + from embodichain.gen_sim.action_agent_pipeline.utils.llm_json import ( + extract_json_object, + ) + from embodichain.gen_sim.action_agent_pipeline.utils.mllm import ( + create_chat_openai, + ) + + prompt = ( + "Parse a tabletop object stacking task and produce one strict " + "config-level JSON spec. 
The generator computes all center positions, " + "heights, robot config, and action graphs deterministically.\n\n" + "Return exactly one JSON object with this schema:\n" + "{\n" + ' "objects": ["", "..."],\n' + ' "stack_mode": "on_top|nested",\n' + ' "bottom_to_top": ["", "..."],\n' + ' "order_by": "explicit|size",\n' + ' "object_attributes": {"": {"color": "red"}},\n' + ' "anchor": "table_center",\n' + ' "task_prompt_summary": "",\n' + ' "basic_background_notes": ""\n' + "}\n\n" + "Rules:\n" + "- Use only source_uid values from rigid_object scene items.\n" + "- Include every object that must be stacked.\n" + "- Use stack_mode='on_top' for blocks, cubes, books, and solid objects " + "that should be vertically stacked.\n" + "- Use stack_mode='nested' for bowls or cup-like containers that should " + "be nested into each other.\n" + "- For explicit statements like blue on green and green on red, return " + "bottom_to_top=[red, green, blue] and order_by='explicit'.\n" + "- If no order is specified for nested bowls, return order_by='size' " + "and leave bottom_to_top empty; the generator sorts large-to-small.\n" + "- Use anchor='table_center'. Do not return target positions, robot " + "config, success JSON, or action graphs.\n\n" + f"Project: {project_name}\n" + f"Task description:\n{task_description}\n" + f"Scene objects:\n{json.dumps(scene_summary, ensure_ascii=False, indent=2)}" + ) + llm = create_chat_openai( + temperature=0.0, + model=model, + usage_stage="config_generation.stacking_task", + ) + response = llm.invoke( + [ + SystemMessage( + content=( + "You produce strict JSON specs for simulation config " + "generation. Do not include markdown." 
+ ) + ), + HumanMessage(content=prompt), + ] + ) + content = getattr(response, "content", response) + return extract_json_object(content) + + +def _make_stacking_scene_summary( + scene_objects: Sequence[_SceneObject], + *, + scene_dir: Path, +) -> list[dict[str, Any]]: + return [ + { + "source_uid": obj.source_uid, + "role": obj.source_role, + "object_type": _base_name(obj), + "mesh": obj.config.get("shape", {}).get("fpath"), + "init_pos": obj.config.get("init_pos"), + "body_scale": obj.config.get("body_scale"), + "color_hint": _color_hint_for_object(obj), + "size_score": _stacking_object_size_score(obj, scene_dir=scene_dir), + } + for obj in scene_objects + ] + + +def _apply_stacking_task_response( + *, + response: Mapping[str, Any], + table_source_uid: str, + scene_objects: list[_SceneObject], + rigid_objects: list[_SceneObject], + scene_dir: Path, + task_description: str, +) -> _StackingSpec: + by_uid = {obj.source_uid: obj for obj in scene_objects} + table_obj = by_uid[table_source_uid] + rigid_by_uid = {obj.source_uid: obj for obj in rigid_objects} + runtime_uids = _stacking_runtime_uid_mapping(rigid_objects) + + object_source_uids = _resolve_stacking_object_uids( + response.get("objects"), rigid_by_uid + ) + stack_mode = _normalize_stack_mode(response.get("stack_mode")) + order_by = _normalize_order_by(response.get("order_by")) + anchor = _normalize_anchor(response.get("anchor")) + object_attributes = _object_attributes(response.get("object_attributes")) + + explicit_order = _string_list(response.get("bottom_to_top")) + if explicit_order: + ordered_source_uids = [ + _resolve_rigid_uid(uid, rigid_by_uid, field_name="bottom_to_top") + for uid in explicit_order + ] + if set(ordered_source_uids) != set(object_source_uids): + raise ValueError( + "Stacking bottom_to_top must contain exactly the stacking objects." 
+ ) + order_by = "explicit" + elif order_by == "size": + ordered_source_uids = sorted( + object_source_uids, + key=lambda uid: ( + _stacking_object_size_score(rigid_by_uid[uid], scene_dir=scene_dir) + or 0.0 + ), + reverse=True, + ) + else: + ordered_source_uids = object_source_uids + + anchor_xy = _table_anchor_xy(table_obj, anchor) + steps = [] + for layer_index, source_uid in enumerate(ordered_source_uids): + obj = rigid_by_uid[source_uid] + orientation_goal = "axis_align" if stack_mode == "on_top" else "preserve" + orientation_axis = "x" if stack_mode == "on_top" else "none" + steps.append( + _StackingStepSpec( + source_uid=source_uid, + runtime_uid=runtime_uids[source_uid], + layer_index=layer_index, + active_side=_arm_side_for_position( + _clean_vector3(obj.config.get("init_pos", [0.0, 0.0, 0.0])) + ), + target_position=[float(anchor_xy[0]), float(anchor_xy[1]), 0.0], + high_position=[float(anchor_xy[0]), float(anchor_xy[1]), 0.0], + support_runtime_uid=( + runtime_uids[ordered_source_uids[layer_index - 1]] + if layer_index > 0 + else None + ), + size_score=_stacking_object_size_score(obj, scene_dir=scene_dir), + color=_object_color(source_uid, object_attributes), + orientation_goal=orientation_goal, + orientation_axis=orientation_axis, + ) + ) + + summary = str(response.get("task_prompt_summary", "")).strip() + if not summary: + summary = "Move the selected objects to the table center and stack them." 
+ notes = str(response.get("basic_background_notes", "")).strip() + + return _StackingSpec( + table_source_uid=table_source_uid, + task_description=task_description, + task_prompt_summary=summary, + basic_background_notes=notes, + stack_mode=stack_mode, + order_by=order_by, + anchor=anchor, + anchor_xy=anchor_xy, + steps=tuple(steps), + ) + + +def _with_stacking_generated_targets( + spec: _StackingSpec, + gym_config: Mapping[str, Any], +) -> _StackingSpec: + object_configs = { + str(obj.get("uid")): obj + for obj in _iter_generated_scene_object_configs(gym_config) + if obj.get("uid") is not None + } + z_by_runtime_uid: dict[str, float] = {} + steps = [] + for step in spec.steps: + moved_config = object_configs.get(step.runtime_uid) + if moved_config is None: + steps.append(step) + continue + moved_bottom_offset = _mesh_config_local_zmin_after_rotation(moved_config) + if moved_bottom_offset is None: + steps.append(step) + continue + + if step.layer_index == 0: + target_z = _clean_vector3(moved_config.get("init_pos", [0.0, 0.0, 0.0]))[2] + else: + support_uid = step.support_runtime_uid + support_z = z_by_runtime_uid.get(str(support_uid)) + support_config = object_configs.get(str(support_uid)) + if support_z is None or support_config is None: + steps.append(step) + continue + support_top_offset = _mesh_config_local_zmax_after_rotation(support_config) + if support_top_offset is None: + steps.append(step) + continue + target_z = ( + support_z + + support_top_offset + + _STACK_CLEARANCE + - float(moved_bottom_offset) + ) + + target_position = [ + float(spec.anchor_xy[0]), + float(spec.anchor_xy[1]), + round(float(target_z), 6), + ] + high_position = list(target_position) + high_position[2] = round(high_position[2] + _STAGING_Z_DELTA, 6) + z_by_runtime_uid[step.runtime_uid] = target_position[2] + steps.append( + replace( + step, + target_position=target_position, + high_position=high_position, + ) + ) + return replace(spec, steps=tuple(steps)) + + +def 
_make_stacking_summary(spec: _StackingSpec) -> dict[str, Any]: + return { + "mode": "stacking", + "stack_mode": spec.stack_mode, + "anchor": spec.anchor, + "anchor_xy": [float(spec.anchor_xy[0]), float(spec.anchor_xy[1])], + "order_by": spec.order_by, + "bottom_to_top": [step.runtime_uid for step in spec.steps], + "placements": [ + { + "object": step.runtime_uid, + "source_uid": step.source_uid, + "layer_index": step.layer_index, + "active_arm": f"{step.active_side}_arm", + "support": step.support_runtime_uid, + "target_position": [float(value) for value in step.target_position], + "orientation_goal": step.orientation_goal, + "orientation_axis": step.orientation_axis, + } + for step in spec.steps + ], + } + + +def _mesh_config_local_zmax_after_rotation( + obj_config: Mapping[str, Any], +) -> float | None: + z_bounds = _mesh_config_world_z_bounds({**obj_config, "init_pos": [0.0, 0.0, 0.0]}) + if z_bounds is None: + return None + return z_bounds[1] + + +def _resolve_stacking_object_uids( + value: Any, + rigid_by_uid: Mapping[str, _SceneObject], +) -> list[str]: + values = _string_list(value) + if not values: + raise ValueError("Stacking response requires non-empty objects.") + resolved = [ + _resolve_rigid_uid(raw_value, rigid_by_uid, field_name="objects") + for raw_value in values + ] + if len(resolved) < 2: + raise ValueError("Stacking requires at least two distinct objects.") + if len(resolved) != len(set(resolved)): + raise ValueError("Stacking objects must be distinct.") + return resolved + + +def _resolve_rigid_uid( + value: str, + rigid_by_uid: Mapping[str, _SceneObject], + *, + field_name: str, +) -> str: + if value in rigid_by_uid: + return value + normalized = _normalize_runtime_uid(value) + matches = [ + source_uid + for source_uid, obj in rigid_by_uid.items() + if _normalize_runtime_uid(source_uid) == normalized + or _base_name(obj) == normalized + ] + if len(matches) == 1: + return matches[0] + if not matches: + raise ValueError(f"LLM returned unknown 
stacking {field_name}: {value!r}.") + raise ValueError( + f"LLM returned ambiguous stacking {field_name}: {value!r}; " + f"candidates: {matches}." + ) + + +def _normalize_stack_mode(value: Any) -> str: + text = str(value or "on_top").strip().lower().replace("-", "_").replace(" ", "_") + aliases = { + "on": "on_top", + "vertical": "on_top", + "nested_bowls": "nested", + "inside": "nested", + } + text = aliases.get(text, text) + if text not in _SUPPORTED_STACK_MODES: + raise ValueError( + f"Unsupported stack_mode {value!r}; expected one of " + f"{sorted(_SUPPORTED_STACK_MODES)}." + ) + return text + + +def _normalize_order_by(value: Any) -> str: + text = str(value or "explicit").strip().lower().replace("-", "_").replace(" ", "_") + aliases = { + "given": "explicit", + "bottom_to_top": "explicit", + "large_to_small": "size", + "big_to_small": "size", + } + text = aliases.get(text, text) + if text not in _SUPPORTED_ORDER_BY: + raise ValueError( + f"Unsupported stacking order_by {value!r}; expected one of " + f"{sorted(_SUPPORTED_ORDER_BY)}." 
+ ) + return text + + +def _normalize_anchor(value: Any) -> str: + text = str(value or _STACKING_ANCHOR).strip().lower().replace("-", "_") + aliases = { + "center": _STACKING_ANCHOR, + "table_centre": _STACKING_ANCHOR, + "桌子中央": _STACKING_ANCHOR, + "桌面中央": _STACKING_ANCHOR, + } + text = aliases.get(text, text) + if text != _STACKING_ANCHOR: + raise ValueError("Stacking only supports anchor='table_center'.") + return text + + +def _object_attributes(value: Any) -> dict[str, dict[str, str]]: + if not isinstance(value, Mapping): + return {} + attributes: dict[str, dict[str, str]] = {} + for source_uid, raw_attrs in value.items(): + if not isinstance(raw_attrs, Mapping): + continue + attributes[str(source_uid)] = { + str(key): str(attr_value).strip().lower() + for key, attr_value in raw_attrs.items() + if str(attr_value).strip() + } + return attributes + + +def _object_color( + source_uid: str, + object_attributes: Mapping[str, Mapping[str, str]], +) -> str | None: + attrs = object_attributes.get(source_uid, {}) + color = attrs.get("color") + return color.strip().lower() if isinstance(color, str) and color.strip() else None + + +def _color_hint_for_object(obj: _SceneObject) -> str | None: + text = (f"{obj.source_uid} {obj.config.get('shape', {}).get('fpath', '')}").lower() + color_aliases = { + "red": ("red", "红"), + "green": ("green", "绿"), + "blue": ("blue", "蓝"), + "yellow": ("yellow", "黄"), + "orange": ("orange", "橙"), + "purple": ("purple", "紫"), + "black": ("black", "黑"), + "white": ("white", "白"), + } + for canonical, aliases in color_aliases.items(): + if any(alias in text for alias in aliases): + return canonical + return None + + +def _stacking_runtime_uid_mapping( + rigid_objects: Sequence[_SceneObject], +) -> dict[str, str]: + candidates = {obj.source_uid: _base_name(obj) for obj in rigid_objects} + counts: dict[str, int] = {} + for runtime_uid in candidates.values(): + counts[runtime_uid] = counts.get(runtime_uid, 0) + 1 + return { + source_uid: ( + 
runtime_uid + if counts[runtime_uid] == 1 + else _normalize_runtime_uid(source_uid) + ) + for source_uid, runtime_uid in candidates.items() + } + + +def _table_anchor_xy(table_obj: _SceneObject, anchor: str) -> list[float]: + _normalize_anchor(anchor) + init_pos = _clean_vector3(table_obj.config.get("init_pos", [0.0, 0.0, 0.0])) + return [round(init_pos[0], 6), round(init_pos[1], 6)] + + +def _stacking_object_size_score( + obj: _SceneObject, + *, + scene_dir: Path, +) -> float | None: + config = _resolved_mesh_config(obj, scene_dir=scene_dir) + bounds = _mesh_config_world_z_bounds(config) + if bounds is None: + return None + xy_extents = _mesh_config_world_xy_extents(config) + if xy_extents is None: + return None + return round(float(max(*xy_extents, bounds[1] - bounds[0])), 6) + + +def _mesh_config_world_xy_extents( + obj_config: Mapping[str, Any], +) -> tuple[float, float] | None: + from embodichain.gen_sim.action_agent_pipeline.generation.mesh_bounds import ( + _mesh_config_world_xy_extents as _shared_mesh_config_world_xy_extents, + ) + + return _shared_mesh_config_world_xy_extents(obj_config) + + +def _resolved_mesh_config( + obj: _SceneObject, + *, + scene_dir: Path, +) -> dict[str, Any]: + config = dict(obj.config) + shape = dict(config.get("shape", {}) or {}) + fpath = shape.get("fpath") + if isinstance(fpath, str): + raw_path = Path(fpath) + if not raw_path.is_absolute(): + shape["fpath"] = (scene_dir / raw_path).resolve().as_posix() + config["shape"] = shape + return config diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py index c3905ef3..a30a91fd 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py +++ b/embodichain/gen_sim/action_agent_pipeline/generation/success_specs.py @@ -24,16 +24,19 @@ _BasketTaskRoles, _RelativePlacementSpec, _RelativePlacementStepSpec, + _StackingSpec, ) __all__ = [ 
"_make_arrangement_extensions_config", "_make_extensions_config", "_make_relative_extensions_config", + "_make_stacking_extensions_config", "_object_in_container_success", "_validate_arrangement_bundle", "_validate_bundle", "_validate_relative_bundle", + "_validate_stacking_bundle", "_validate_success_uids", ] @@ -120,6 +123,60 @@ def _make_arrangement_extensions_config(spec: _ArrangementLineSpec) -> dict[str, } +def _make_stacking_extensions_config(spec: _StackingSpec) -> dict[str, Any]: + return { + **_make_dual_ur5_arm_slot_config(), + "gripper_open_state": [0.0], + "gripper_close_state": [0.04], + "ignore_terminations_during_agent": True, + "viewer_camera_uid": "cam_high", + "agent_success": _make_stacking_success_spec(spec), + } + + +def _make_stacking_success_spec(spec: _StackingSpec) -> dict[str, Any]: + terms: list[dict[str, Any]] = [] + for step in spec.steps: + if step.layer_index == 0: + terms.append( + { + "type": "object_xy_near", + "object": step.runtime_uid, + "target_xy": [ + float(spec.anchor_xy[0]), + float(spec.anchor_xy[1]), + ], + "tolerance": 0.03, + } + ) + elif spec.stack_mode == "nested": + terms.append( + _object_in_container_success( + step.runtime_uid, + str(step.support_runtime_uid), + ) + ) + else: + terms.append( + { + "type": "object_on_object", + "object": step.runtime_uid, + "support": step.support_runtime_uid, + "xy_radius": 0.06, + "min_z_offset": 0.02, + "max_z_offset": 0.35, + } + ) + terms.append( + { + "type": "object_not_fallen", + "object": step.runtime_uid, + "max_tilt": 0.9, + } + ) + return {"op": "all", "terms": terms} + + def _make_arrangement_success_spec(spec: _ArrangementLineSpec) -> dict[str, Any]: terms: list[dict[str, Any]] = [] xy_tolerance = min(0.03, float(spec.spacing) * 0.35) @@ -152,6 +209,15 @@ def _make_relative_success_spec( spec.placements[0], side_relation_xy_offsets=side_relation_xy_offsets, ) + if all(placement.intent == "hold_hover" for placement in spec.placements): + terms: list[dict[str, Any]] = 
[] + for placement in spec.placements: + placement_success = _make_relative_placement_success_spec( + placement, + side_relation_xy_offsets=side_relation_xy_offsets, + ) + terms.extend(placement_success["terms"]) + return {"op": "all", "terms": terms} return { "op": "all", "terms": [ @@ -169,6 +235,23 @@ def _make_relative_placement_success_spec( *, side_relation_xy_offsets: Callable[[str], tuple[float, float]], ) -> dict[str, Any]: + if placement.intent == "hold_hover": + return { + "op": "all", + "terms": [ + { + "type": "object_lifted", + "object": placement.moved_runtime_uid, + "min_height": 0.08, + }, + { + "type": "object_held_by_gripper", + "object": placement.moved_runtime_uid, + "arm": f"{placement.active_side}_arm", + "max_distance": 0.12, + }, + ], + } if placement.relation == "inside": return _object_in_container_success( placement.moved_runtime_uid, @@ -310,7 +393,9 @@ def _validate_relative_bundle( f"Generated relative config missing moved rigid object(s): {missing_moved}" ) reference_required = { - placement.reference_runtime_uid for placement in spec.placements + placement.reference_runtime_uid + for placement in spec.placements + if placement.intent == "place_relative" } missing_reference = reference_required - scene_uids if missing_reference: @@ -368,6 +453,42 @@ def _validate_arrangement_bundle( ) +def _validate_stacking_bundle( + bundle: Mapping[str, Any], + spec: _StackingSpec, +) -> None: + gym_config = bundle["gym_config"] + if gym_config.get("id") != "AtomicActionsAgent-v3": + raise ValueError("Generated gym config must use AtomicActionsAgent-v3.") + if gym_config.get("robot", {}).get("uid") != "DualUR5": + raise ValueError("Generated stacking config must use DualUR5.") + + rigid_uid_list = [obj["uid"] for obj in gym_config.get("rigid_object", [])] + if len(rigid_uid_list) != len(set(rigid_uid_list)): + raise ValueError(f"Duplicate rigid object runtime uid(s): {rigid_uid_list}") + rigid_uids = set(rigid_uid_list) + background_uids = 
{obj["uid"] for obj in gym_config.get("background", [])} + scene_uids = rigid_uids | background_uids + required = {step.runtime_uid for step in spec.steps} + missing = required - rigid_uids + if missing: + raise ValueError( + f"Generated stacking config missing moved rigid object(s): {missing}" + ) + + _validate_success_uids( + gym_config["env"]["extensions"]["agent_success"], + rigid_uids=rigid_uids, + scene_uids=scene_uids, + ) + registry = gym_config["env"]["events"]["register_info_to_env"]["params"]["registry"] + registered = {entry["entity_cfg"]["uid"] for entry in registry} + if not required.issubset(registered): + raise ValueError( + f"Stacking config registry missing: {sorted(required - registered)}" + ) + + def _validate_success_uids( success: Mapping[str, Any], *, @@ -395,6 +516,10 @@ def _validate_success_uids( required_keys = ("object",) elif success_type in {"object_not_fallen", "not_fallen"}: required_keys = ("object",) + elif success_type in {"object_lifted", "object_height_above_initial"}: + required_keys = ("object",) + elif success_type in {"object_held_by_gripper", "object_gripper_near"}: + required_keys = ("object",) else: raise ValueError(f"Unsupported generated success term: {success_type!r}.") diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index e2ce2d9e..f039b71e 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -48,6 +48,9 @@ _apply_arrangement_task_response, _arrangement_line_slot_positions, ) +from embodichain.gen_sim.action_agent_pipeline.generation.stacking_spec import ( + _is_stacking_task_description, +) from embodichain.gen_sim.action_agent_pipeline.env_adapters.tableware.success import ( evaluate_configured_success, ) @@ -577,7 +580,7 @@ def fake_call_relative_task_llm(**kwargs): assert "Generate exactly 
10 nominal edges" not in task_prompt assert _stable_summary(paths.summary) == { - "mode": "relative_placement", + "mode": "object_manipulation", "moved_object": "apple_2", "reference_object": "wicker_basket", "relation": "left_of", @@ -647,7 +650,7 @@ def fake_call_relative_task_llm(**kwargs): assert '"offset":[0.16,0.0,0.22]' in atom_actions assert _stable_summary(paths.summary) == { - "mode": "relative_placement", + "mode": "object_manipulation", "moved_object": "apple_1", "reference_object": "apple_2", "relation": "front_of", @@ -738,7 +741,7 @@ def fake_call_relative_task_llm(**kwargs): assert f'"position":[{expected_x},{expected_y},' in task_prompt assert _stable_summary(paths.summary) == { - "mode": "relative_placement", + "mode": "object_manipulation", "moved_object": "chip_bag", "reference_object": "chip_bag", "relation": "front_left_of", @@ -1670,8 +1673,8 @@ def fake_call_relative_task_llm(**kwargs): assert "grasp_pose_object" not in attr_names assert _stable_summary(paths.summary) == { - "mode": "dual_arm_relative_placement", - "placements": [ + "mode": "dual_arm_object_manipulation", + "manipulations": [ { "moved_object": "apple_2", "reference_object": "wicker_basket", @@ -1719,6 +1722,87 @@ def fake_call_relative_task_llm(**kwargs): assert '"obj_name":"apple_1"' in atom_actions +def test_task_description_generates_dual_hold_hover_config( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "19_Pick Dual Bottles_gym_project" + _write_dual_bottles_project(project_dir) + + def fake_call_relative_task_llm(**kwargs): + assert kwargs["task_description"] == ( + "用一只机械臂拿起一个瓶子悬空,并用另一只机械臂拿起另一个瓶子也悬空。" + ) + return { + "manipulations": [ + { + "intent": "hold_hover", + "moved_object": "interact_soda_bottle_1", + "arm": "left", + "hover_height": 0.10, + }, + { + "intent": "hold_hover", + "moved_object": "interact_soda_bottle_2", + "arm": "right", + "hover_height": 0.10, + }, + ], + "task_prompt_summary": "Pick up both bottles 
and keep them hovering.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_relative_task_llm", + fake_call_relative_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_dual_hold_hover_agent", + task_name="Demo19", + task_description=( + "用一只机械臂拿起一个瓶子悬空,并用另一只机械臂拿起另一个瓶子也悬空。" + ), + target_body_scale=0.8, + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + summary = paths.summary + assert summary["mode"] == "dual_arm_object_manipulation" + assert [item["intent"] for item in summary["manipulations"]] == [ + "hold_hover", + "hold_hover", + ] + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert "exactly 3 nominal edges" in task_prompt + assert task_prompt.count('"atomic_action_class":"PickUp"') == 2 + assert task_prompt.count('"atomic_action_class":"MoveHeldObject"') == 2 + assert ( + task_prompt.count('"target_qpos":{"source":"gripper_state","state":"close"}') + == 2 + ) + assert '"atomic_action_class":"Place"' not in task_prompt + assert '"source":"initial"' not in task_prompt + assert '"atomic_action_class":"Place"' not in atom_actions + assert '"source":"initial"' not in atom_actions + + success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] + lifted = [term for term in success_terms if term["type"] == "object_lifted"] + held = [term for term in success_terms if term["type"] == "object_held_by_gripper"] + assert {term["object"] for term in lifted} == { + "interact_soda_bottle_1", + "interact_soda_bottle_2", + } + assert {(term["object"], term["arm"]) for term in held} == { + ("interact_soda_bottle_1", "left_arm"), + ("interact_soda_bottle_2", "right_arm"), + } + + def test_arrangement_response_orders_explicit_color_sequence(tmp_path: Path) -> None: _write_minimal_glb( tmp_path / "mesh_assets/table/table_0.glb", @@ -2057,6 +2141,207 
@@ def fake_call_arrangement_task_llm(**kwargs): ) +def test_task_description_generates_three_block_stacking_config( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "44_Stack Blocks Three_gym_project" + _write_stacking_blocks_project(project_dir, count=3) + + def fake_call_stacking_task_llm(**kwargs): + return { + "objects": ["red_cube_1", "green_cube_2", "blue_cube_3"], + "stack_mode": "on_top", + "bottom_to_top": ["red_cube_1", "green_cube_2", "blue_cube_3"], + "order_by": "explicit", + "anchor": "table_center", + "object_attributes": { + "red_cube_1": {"color": "red"}, + "green_cube_2": {"color": "green"}, + "blue_cube_3": {"color": "blue"}, + }, + "task_prompt_summary": ( + "Move the cubes to the table center and stack blue on green " + "and green on red." + ), + "basic_background_notes": "Table with red, green, and blue cubes.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_stacking_task_llm", + fake_call_stacking_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_three_block_stacking_agent", + task_name="Demo44", + task_description=( + "桌上有红、绿、蓝三个方块,将它们移动到桌子中央,并把蓝色方块叠到绿色方块上、绿色方块叠到红色方块上。" + ), + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + summary = paths.summary + assert summary["mode"] == "stacking" + assert summary["stack_mode"] == "on_top" + assert summary["bottom_to_top"] == ["red_cube", "green_cube", "blue_cube"] + assert [placement["support"] for placement in summary["placements"]] == [ + None, + "red_cube", + "green_cube", + ] + assert all( + placement["orientation_goal"] == "axis_align" + and placement["orientation_axis"] == "x" + for placement in summary["placements"] + ) + target_xy = [ + placement["target_position"][:2] for placement in summary["placements"] + ] + assert target_xy == [summary["anchor_xy"]] * 3 + + success = 
gym_config["env"]["extensions"]["agent_success"] + object_on_terms = [ + term for term in success["terms"] if term["type"] == "object_on_object" + ] + assert {(term["object"], term["support"]) for term in object_on_terms} == { + ("green_cube", "red_cube"), + ("blue_cube", "green_cube"), + } + assert any( + term["type"] == "object_xy_near" and term["object"] == "red_cube" + for term in success["terms"] + ) + + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert "Generate one deterministic nominal graph with exactly 21 nominal edges" in ( + task_prompt + ) + assert "Pick up both" not in task_prompt + assert task_prompt.count('"atomic_action_class":"PickUp"') == 3 + assert task_prompt.count('"atomic_action_class":"MoveEndEffector"') == 3 + assert task_prompt.count('"atomic_action_class":"Place"') == 3 + assert atom_actions.count('"orientation_goal":"axis_align"') == 6 + + +def test_task_description_generates_two_block_stacking_config( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "45_Stack Blocks Two_gym_project" + _write_stacking_blocks_project(project_dir, count=2) + + def fake_call_stacking_task_llm(**kwargs): + return { + "objects": ["red_cube_1", "green_cube_2"], + "stack_mode": "on_top", + "bottom_to_top": ["red_cube_1", "green_cube_2"], + "order_by": "explicit", + "anchor": "table_center", + "task_prompt_summary": "Stack green on red at the table center.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_stacking_task_llm", + fake_call_stacking_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_two_block_stacking_agent", + task_name="Demo45", + task_description="桌上有红、绿两个方块,将它们移动到桌子中央,并把绿色方块叠到红色方块上。", + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + summary = paths.summary + assert summary["mode"] == 
"stacking" + assert summary["bottom_to_top"] == ["red_cube", "green_cube"] + success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] + assert any( + term.get("type") == "object_on_object" + and term.get("object") == "green_cube" + and term.get("support") == "red_cube" + for term in success_terms + ) + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + assert "exactly 14 nominal edges" in task_prompt + assert "Pick up both" not in task_prompt + + +def test_task_description_generates_nested_bowl_stacking_by_size( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "46_Stack Bowls Three_gym_project" + _write_stacking_bowls_project(project_dir, count=3) + + def fake_call_stacking_task_llm(**kwargs): + return { + "objects": ["interact_bowl_1", "interact_bowl_2", "interact_bowl_3"], + "stack_mode": "nested", + "bottom_to_top": [], + "order_by": "size", + "anchor": "table_center", + "task_prompt_summary": "Nest the three bowls at the table center.", + } + + monkeypatch.setattr( + action_agent_config_generation, + "_call_stacking_task_llm", + fake_call_stacking_task_llm, + ) + + paths = generate_action_agent_config_from_project( + project_dir, + tmp_path / "generated_three_bowl_stacking_agent", + task_name="Demo46", + task_description="将三个碗相互叠放。", + prewarm_coacd_cache=False, + ) + + gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) + summary = paths.summary + assert summary["mode"] == "stacking" + assert summary["stack_mode"] == "nested" + assert summary["order_by"] == "size" + assert summary["bottom_to_top"] == [ + "interact_bowl_1", + "interact_bowl_2", + "interact_bowl_3", + ] + assert all( + placement["orientation_goal"] == "preserve" + and placement["orientation_axis"] == "none" + for placement in summary["placements"] + ) + success_terms = gym_config["env"]["extensions"]["agent_success"]["terms"] + in_container_terms = [ + term for term in success_terms if term["type"] == 
"object_in_container" + ] + assert {(term["object"], term["container"]) for term in in_container_terms} == { + ("interact_bowl_2", "interact_bowl_1"), + ("interact_bowl_3", "interact_bowl_2"), + } + task_prompt = paths.task_prompt.read_text(encoding="utf-8") + atom_actions = paths.atom_actions.read_text(encoding="utf-8") + assert "Stack mode: `nested`" in task_prompt + assert "exactly 18 nominal edges" in task_prompt + assert "High staging orientation" not in atom_actions + assert "Align `interact_bowl" not in task_prompt + + +def test_stacking_keyword_routes_before_arrangement() -> None: + assert _is_stacking_task_description("将红、绿、蓝三个方块叠成一列") + assert _is_stacking_task_description("Stack the bowls at the table center") + + def test_dual_inside_same_container_uses_container_long_axis_slots( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -2103,8 +2388,8 @@ def fake_call_relative_task_llm(**kwargs): ) assert _stable_summary(paths.summary) == { - "mode": "dual_arm_relative_placement", - "placements": [ + "mode": "dual_arm_object_manipulation", + "manipulations": [ { "moved_object": "apple_2", "reference_object": "wicker_basket", @@ -2237,7 +2522,9 @@ def fake_call_relative_task_llm(**kwargs): prewarm_coacd_cache=False, ) - active_arms = [placement["active_arm"] for placement in paths.summary["placements"]] + active_arms = [ + placement["active_arm"] for placement in paths.summary["manipulations"] + ] assert active_arms == ["right_arm", "left_arm"] gym_config = json.loads(paths.gym_config.read_text(encoding="utf-8")) @@ -2433,6 +2720,29 @@ def test_object_on_object_success_predicate() -> None: assert bool(success.item()) is True +def test_object_held_by_gripper_success_predicate() -> None: + env = _FakeEnv( + { + "bottle": [0.0, 0.18, 0.24], + } + ) + env.left_eef_pose = _pose_at([0.0, 0.18, 0.25]) + env.left_gripper_state = torch.tensor([0.04], dtype=torch.float32) + env.close_state = torch.tensor([0.04], dtype=torch.float32) + + success = 
evaluate_configured_success( + env, + { + "type": "object_held_by_gripper", + "object": "bottle", + "arm": "left_arm", + "max_distance": 0.12, + }, + ) + + assert bool(success.item()) is True + + def _write_project(project_dir: Path) -> None: for rel_path in ( "mesh_assets/table/table_0.glb", @@ -2639,6 +2949,138 @@ def _write_arrangement_project_with_count( ) +def _write_stacking_blocks_project(project_dir: Path, *, count: int) -> None: + _write_minimal_glb( + project_dir / "mesh_assets/table/table_0.glb", + [(-0.60, -0.40, 0.0), (0.60, -0.40, 0.0), (0.0, 0.40, 0.0)], + ) + cube_specs = [ + ("red_cube_1", "cube_1", "red", [-0.08, -0.10, 0.76]), + ("green_cube_2", "cube_2", "green", [0.05, -0.04, 0.76]), + ("blue_cube_3", "cube_3", "blue", [-0.04, 0.12, 0.76]), + ][:count] + rigid_objects = [] + for index, (uid, mesh_uid, _color, init_pos) in enumerate(cube_specs, start=1): + size = 0.035 + index * 0.005 + _write_minimal_glb( + project_dir / f"mesh_assets/cube/{mesh_uid}/{mesh_uid}.glb", + [ + (-size / 2.0, -size / 2.0, 0.0), + (size / 2.0, -size / 2.0, 0.0), + (0.0, size / 2.0, size), + ], + ) + rigid_objects.append( + _mesh_object( + uid, + f"mesh_assets/cube/{mesh_uid}/{mesh_uid}.glb", + init_pos, + [0.0, 0.0, 0.0], + ) + ) + + gym_config = { + "id": "Image2Tabletop-stacking-blocks-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 0.0], + ) + ], + "rigid_object": rigid_objects, + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + +def _write_stacking_bowls_project(project_dir: Path, *, count: int) -> None: + _write_minimal_glb( + project_dir / "mesh_assets/table/table_0.glb", + [(-0.60, -0.40, 0.0), (0.60, -0.40, 0.0), (0.0, 0.40, 0.0)], + ) + rigid_objects = [] + for index in range(1, count + 1): + uid = f"interact_bowl_{index}" + radius = 0.025 + (count - index + 1) * 0.008 + height = 0.025 + (count - index + 1) * 0.006 + 
_write_minimal_glb( + project_dir / f"mesh_assets/bowl/bowl_{index}/bowl_{index}.glb", + [ + (-radius, -radius, 0.0), + (radius, -radius, 0.0), + (0.0, radius, height), + ], + ) + rigid_objects.append( + _mesh_object( + uid, + f"mesh_assets/bowl/bowl_{index}/bowl_{index}.glb", + [round(0.08 * index, 6), round(-0.04 * index, 6), 0.76], + [0.0, 0.0, 0.0], + ) + ) + + gym_config = { + "id": "Image2Tabletop-stacking-bowls-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 0.0], + ) + ], + "rigid_object": rigid_objects, + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + +def _write_dual_bottles_project(project_dir: Path) -> None: + for rel_path in ( + "mesh_assets/table/table_0.glb", + "mesh_assets/soda_bottle/soda_bottle_1/soda_bottle_1.glb", + "mesh_assets/soda_bottle/soda_bottle_2/soda_bottle_2.glb", + ): + _write_minimal_glb(project_dir / rel_path, _default_mesh_vertices()) + + gym_config = { + "id": "Image2Tabletop-dual-bottles-v0", + "background": [ + _mesh_object( + "table", + "mesh_assets/table/table_0.glb", + [0.0, 0.0, 0.36], + [0.0, 0.0, 0.0], + ) + ], + "rigid_object": [ + _mesh_object( + "interact_soda_bottle_1", + "mesh_assets/soda_bottle/soda_bottle_1/soda_bottle_1.glb", + [0.0, 0.18, 0.76], + [0.0, 0.0, -90.0], + ), + _mesh_object( + "interact_soda_bottle_2", + "mesh_assets/soda_bottle/soda_bottle_2/soda_bottle_2.glb", + [0.0, -0.18, 0.76], + [0.0, 0.0, -90.0], + ), + ], + } + (project_dir / "gym_config.json").write_text( + json.dumps(gym_config, indent=2), + encoding="utf-8", + ) + + def _mesh_object( uid: str, fpath: str, @@ -2680,13 +3122,21 @@ def _stable_summary(summary: dict) -> dict: stable.pop("orientation_axis", None) if stable.get("orientation_align_to") is None: stable.pop("orientation_align_to", None) - for placement in stable.get("placements", []): + for placement in [*stable.get("placements", []), 
*stable.get("manipulations", [])]: + if placement.get("intent") == "place_relative": + placement.pop("intent", None) + if placement.get("hover_height") == 0.1: + placement.pop("hover_height", None) if placement.get("orientation_goal") == "preserve": placement.pop("orientation_goal", None) if placement.get("orientation_axis") == "none": placement.pop("orientation_axis", None) if placement.get("orientation_align_to") is None: placement.pop("orientation_align_to", None) + if stable.get("intent") == "place_relative": + stable.pop("intent", None) + if stable.get("hover_height") == 0.1: + stable.pop("hover_height", None) return stable @@ -2954,6 +3404,17 @@ class _FakeEnv: def __init__(self, positions: dict[str, list[float]]) -> None: self.sim = _FakeSim(positions) + self.left_eef_pose = _pose_at([0.0, 0.0, 0.0]) + self.right_eef_pose = _pose_at([0.0, 0.0, 0.0]) + self.left_gripper_state = torch.tensor([0.0], dtype=torch.float32) + self.right_gripper_state = torch.tensor([0.0], dtype=torch.float32) + self.close_state = torch.tensor([0.04], dtype=torch.float32) + + def get_current_xpos_agent(self): + return self.left_eef_pose, self.right_eef_pose + + def get_current_gripper_state_agent(self): + return self.left_gripper_state, self.right_gripper_state class _FakeSim: @@ -2974,3 +3435,9 @@ def get_local_pose(self, to_matrix: bool = True) -> torch.Tensor: pose = torch.eye(4, dtype=torch.float32).unsqueeze(0) pose[:, :3, 3] = self._position.reshape(1, 3) return pose + + +def _pose_at(position: list[float]) -> torch.Tensor: + pose = torch.eye(4, dtype=torch.float32).unsqueeze(0) + pose[:, :3, 3] = torch.tensor(position, dtype=torch.float32).reshape(1, 3) + return pose From 21b45a81af8b018cd2ba38638696293d94261162 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 28 Jun 2026 02:05:01 +0800 Subject: [PATCH 30/33] fix Object Manipulation bug --- .../env_adapters/tableware/success.py | 32 ++++++++++------ 
.../test_ur5_basket_config_generation.py | 37 +++++++++++++++++++ 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py index 5abc4e14..359a0a86 100644 --- a/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py +++ b/embodichain/gen_sim/action_agent_pipeline/env_adapters/tableware/success.py @@ -238,7 +238,10 @@ def _object_lifted(env, spec: Mapping[str, Any]) -> torch.Tensor: def _object_held_by_gripper(env, spec: Mapping[str, Any]) -> torch.Tensor: object_position = _position(env, _object_name(spec)) arm_name = str(spec.get("arm", spec.get("robot_name", ""))) - eef_pose = _arm_eef_pose(env, arm_name).to( + eef_pose = _arm_eef_pose(env, arm_name) + if eef_pose is None: + return _constant(env, False) + eef_pose = eef_pose.to( dtype=object_position.dtype, device=object_position.device, ) @@ -253,22 +256,29 @@ def _object_held_by_gripper(env, spec: Mapping[str, Any]) -> torch.Tensor: return near & _gripper_is_closed(env, arm_name, object_position.device) -def _arm_eef_pose(env, arm_name: str) -> torch.Tensor: - if hasattr(env, "get_current_xpos_agent"): +def _arm_eef_pose(env, arm_name: str) -> torch.Tensor | None: + if not hasattr(env, "get_current_xpos_agent"): + return None + try: left_pose, right_pose = env.get_current_xpos_agent() - return torch.as_tensor( - right_pose if "right" in arm_name else left_pose, - dtype=torch.float32, - device=env.device, - ) - raise ValueError("object_held_by_gripper requires current eef pose access.") + except AttributeError: + return None + pose = right_pose if "right" in arm_name else left_pose + if pose is None: + return None + return torch.as_tensor(pose, dtype=torch.float32, device=env.device) def _gripper_is_closed(env, arm_name: str, device: torch.device) -> torch.Tensor: if not hasattr(env, "get_current_gripper_state_agent"): - return 
_constant(env, True) - left_state, right_state = env.get_current_gripper_state_agent() + return _constant(env, False) + try: + left_state, right_state = env.get_current_gripper_state_agent() + except AttributeError: + return _constant(env, False) state = right_state if "right" in arm_name else left_state + if state is None: + return _constant(env, False) state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device) if state_tensor.numel() == 0: return _constant(env, True) diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index f039b71e..64fc9fd0 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -2743,6 +2743,26 @@ def test_object_held_by_gripper_success_predicate() -> None: assert bool(success.item()) is True +def test_object_held_by_gripper_returns_false_before_agent_state_init() -> None: + env = _FakeEnvWithoutAgentState( + { + "bottle": [0.0, 0.18, 0.24], + } + ) + + success = evaluate_configured_success( + env, + { + "type": "object_held_by_gripper", + "object": "bottle", + "arm": "left_arm", + "max_distance": 0.12, + }, + ) + + assert bool(success.item()) is False + + def _write_project(project_dir: Path) -> None: for rel_path in ( "mesh_assets/table/table_0.glb", @@ -3417,6 +3437,23 @@ def get_current_gripper_state_agent(self): return self.left_gripper_state, self.right_gripper_state +class _FakeEnvWithoutAgentState: + num_envs = 1 + device = torch.device("cpu") + + def __init__(self, positions: dict[str, list[float]]) -> None: + self.sim = _FakeSim(positions) + + def get_current_xpos_agent(self): + return self.left_arm_current_xpos, self.right_arm_current_xpos + + def get_current_gripper_state_agent(self): + return ( + self.left_arm_current_gripper_state, + self.right_arm_current_gripper_state, + ) + + class _FakeSim: def 
__init__(self, positions: dict[str, list[float]]) -> None: self._objects = { From 546703674c0cb05e25a9cbf86cc7993577bc19fc Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 28 Jun 2026 15:10:13 +0800 Subject: [PATCH 31/33] change ur solver --- .../generation/templates/dual_ur5_robot.json | 8 +- embodichain/lab/sim/solvers/ur_solver.py | 25 ++++-- .../test_ur5_basket_config_generation.py | 56 +++++++++++++ tests/sim/solvers/test_ur_solver_cfg.py | 78 +++++++++++++++++++ 4 files changed, 160 insertions(+), 7 deletions(-) create mode 100644 tests/sim/solvers/test_ur_solver_cfg.py diff --git a/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json b/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json index d3c6abc9..20f56da1 100644 --- a/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json +++ b/embodichain/gen_sim/action_agent_pipeline/generation/templates/dual_ur5_robot.json @@ -81,7 +81,9 @@ }, "solver_cfg": { "left_arm": { - "class_type": "PytorchSolver", + "class_type": "URSolver", + "ur_type": "ur5", + "urdf_path": null, "end_link_name": "left_ee_link", "root_link_name": "left_base_link", "tcp": [ @@ -92,7 +94,9 @@ ] }, "right_arm": { - "class_type": "PytorchSolver", + "class_type": "URSolver", + "ur_type": "ur5", + "urdf_path": null, "end_link_name": "right_ee_link", "root_link_name": "right_base_link", "tcp": [ diff --git a/embodichain/lab/sim/solvers/ur_solver.py b/embodichain/lab/sim/solvers/ur_solver.py index 1ff83afb..ec48235f 100644 --- a/embodichain/lab/sim/solvers/ur_solver.py +++ b/embodichain/lab/sim/solvers/ur_solver.py @@ -14,22 +14,26 @@ # limitations under the License. 
# ---------------------------------------------------------------------------- -import torch +import math +from typing import Any + import numpy as np +import torch import warp as wp -from embodichain.utils import configclass -from embodichain.lab.sim.solvers import SolverCfg, BaseSolver + from embodichain.data import get_data_path +from embodichain.lab.sim.solvers import BaseSolver, SolverCfg +from embodichain.utils import configclass, logger +from embodichain.utils.device_utils import standardize_device_string from embodichain.utils.warp.kinematics.ur_solver import ( URParam, ur_ik_kernel, ) -import math -from embodichain.utils.device_utils import standardize_device_string @configclass class URSolverCfg(SolverCfg): + class_type: str = "URSolver" ur_type: str = "ur10" end_link_name: str = "ee_link" root_link_name: str = "base_link" @@ -100,6 +104,17 @@ def __post_init__(self): else: raise ValueError(f"Unknown UR type: {self.ur_type}") + @classmethod + def from_dict(cls, init_dict: dict[str, Any]) -> "URSolverCfg": + """Initialize the UR solver configuration from a dictionary.""" + cfg = cls(ur_type=init_dict.get("ur_type", "ur10")) + for key, value in init_dict.items(): + if hasattr(cfg, key): + setattr(cfg, key, value) + else: + logger.log_warning(f"Key '{key}' not found in {cls.__name__}.") + return cfg + def init_solver( self, device: torch.device = torch.device("cpu"), **kwargs ) -> "URSolver": diff --git a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py index 64fc9fd0..b587e042 100644 --- a/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py +++ b/tests/gen_sim/action_agent_pipeline/test_ur5_basket_config_generation.py @@ -25,6 +25,8 @@ import pytest import torch +from embodichain.lab.sim.cfg import RobotCfg +from embodichain.lab.sim.solvers import URSolverCfg from embodichain.gen_sim.action_agent_pipeline.cli import ( target_replacements 
as target_replacements_cli, ) @@ -75,6 +77,46 @@ def test_action_agent_templates_load_fresh_json_copies() -> None: assert second_lights["direct"][0]["uid"] == "main_light" +def test_dual_ur5_template_uses_ur_solver_config() -> None: + robot = make_dual_ur5_robot_config(robot_init_z=0.42) + + left_solver = robot["solver_cfg"]["left_arm"] + right_solver = robot["solver_cfg"]["right_arm"] + + assert left_solver["class_type"] == "URSolver" + assert right_solver["class_type"] == "URSolver" + assert left_solver["ur_type"] == "ur5" + assert right_solver["ur_type"] == "ur5" + assert left_solver["urdf_path"] is None + assert right_solver["urdf_path"] is None + assert left_solver["root_link_name"] == "left_base_link" + assert left_solver["end_link_name"] == "left_ee_link" + assert right_solver["root_link_name"] == "right_base_link" + assert right_solver["end_link_name"] == "right_ee_link" + assert left_solver["tcp"][2][3] == pytest.approx(0.16) + assert right_solver["tcp"][2][3] == pytest.approx(0.16) + + +def test_dual_ur5_template_deserializes_to_ur5_solver_cfg() -> None: + robot_cfg = RobotCfg.from_dict(make_dual_ur5_robot_config(robot_init_z=0.42)) + + for arm_name, root_link_name, end_link_name in ( + ("left_arm", "left_base_link", "left_ee_link"), + ("right_arm", "right_base_link", "right_ee_link"), + ): + solver_cfg = robot_cfg.solver_cfg[arm_name] + assert isinstance(solver_cfg, URSolverCfg) + assert solver_cfg.class_type == "URSolver" + assert solver_cfg.ur_type == "ur5" + assert solver_cfg.urdf_path is None + assert solver_cfg.root_link_name == root_link_name + assert solver_cfg.end_link_name == end_link_name + assert solver_cfg.d1 == pytest.approx(0.089159) + assert solver_cfg.a2 == pytest.approx(-0.425) + assert solver_cfg.a3 == pytest.approx(-0.39225) + assert solver_cfg.tcp[2][3] == pytest.approx(0.16) + + def test_action_agent_config_generator_uses_parallel_handoff( tmp_path: Path, ) -> None: @@ -115,6 +157,20 @@ def 
test_action_agent_config_generator_uses_parallel_handoff( [-2.0, 0.0, expected_robot_init_z] ) assert gym_config["robot"]["init_rot"] == [0.0, 0.0, 90.0] + for solver in gym_config["robot"]["solver_cfg"].values(): + assert solver["class_type"] == "URSolver" + assert solver["ur_type"] == "ur5" + assert solver["urdf_path"] is None + + robot_cfg = RobotCfg.from_dict(gym_config["robot"]) + for solver_cfg in robot_cfg.solver_cfg.values(): + assert isinstance(solver_cfg, URSolverCfg) + assert solver_cfg.ur_type == "ur5" + assert solver_cfg.urdf_path is None + assert solver_cfg.d1 == pytest.approx(0.089159) + assert solver_cfg.a2 == pytest.approx(-0.425) + assert solver_cfg.a3 == pytest.approx(-0.39225) + extensions = gym_config["env"]["extensions"] assert extensions["agent_arm_slots"]["left"] == { "arm": "right_arm", diff --git a/tests/sim/solvers/test_ur_solver_cfg.py b/tests/sim/solvers/test_ur_solver_cfg.py new file mode 100644 index 00000000..2061509e --- /dev/null +++ b/tests/sim/solvers/test_ur_solver_cfg.py @@ -0,0 +1,78 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +from __future__ import annotations + +import numpy as np +import pytest + +from embodichain.lab.sim.solvers import URSolverCfg + +UR5_EXPECTED_DH = { + "d1": 0.089159, + "d4": 0.10915, + "d5": 0.09465, + "d6": 0.0823, + "a2": -0.425, + "a3": -0.39225, +} +DUAL_UR5_TCP = [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.16], + [0.0, 0.0, 0.0, 1.0], +] + + +def test_ur_solver_cfg_from_dict_applies_ur5_variant_defaults() -> None: + cfg = URSolverCfg.from_dict( + { + "class_type": "URSolver", + "ur_type": "ur5", + "root_link_name": "left_base_link", + "end_link_name": "left_ee_link", + "tcp": DUAL_UR5_TCP, + } + ) + + assert cfg.class_type == "URSolver" + assert cfg.ur_type == "ur5" + assert cfg.root_link_name == "left_base_link" + assert cfg.end_link_name == "left_ee_link" + assert np.asarray(cfg.tcp).tolist() == DUAL_UR5_TCP + for param_name, expected_value in UR5_EXPECTED_DH.items(): + assert getattr(cfg, param_name) == pytest.approx(expected_value) + assert cfg.urdf_path.endswith("UniversalRobots/UR5/UR5.urdf") + + +def test_ur_solver_cfg_from_dict_preserves_explicit_none_urdf_path() -> None: + cfg = URSolverCfg.from_dict( + { + "class_type": "URSolver", + "ur_type": "ur5", + "urdf_path": None, + "root_link_name": "right_base_link", + "end_link_name": "right_ee_link", + "tcp": DUAL_UR5_TCP, + } + ) + + assert cfg.ur_type == "ur5" + assert cfg.urdf_path is None + assert cfg.root_link_name == "right_base_link" + assert cfg.end_link_name == "right_ee_link" + for param_name, expected_value in UR5_EXPECTED_DH.items(): + assert getattr(cfg, param_name) == pytest.approx(expected_value) From bbc85e71edef357e0daa2eaf53f71056979be73e Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 28 Jun 2026 17:26:10 +0800 Subject: [PATCH 32/33] fix(sim): export UR solver module API --- embodichain/lab/sim/solvers/ur_solver.py | 5 
+++++ 1 file changed, 5 insertions(+) diff --git a/embodichain/lab/sim/solvers/ur_solver.py b/embodichain/lab/sim/solvers/ur_solver.py index ec48235f..04991f49 100644 --- a/embodichain/lab/sim/solvers/ur_solver.py +++ b/embodichain/lab/sim/solvers/ur_solver.py @@ -14,6 +14,8 @@ # limitations under the License. # ---------------------------------------------------------------------------- +from __future__ import annotations + import math from typing import Any @@ -267,3 +269,6 @@ def dh_matrix(theta_i, d_i, a_i, alpha_i): m[3, 3] = 1.0 return m + + +__all__ = ["URSolverCfg", "URSolver"] From f35defd7c053033bc7a80101ba5b65fc93d1c950 Mon Sep 17 00:00:00 2001 From: skywhite1024 <129768272+skywhite1024@users.noreply.github.com> Date: Sun, 28 Jun 2026 18:16:46 +0800 Subject: [PATCH 33/33] delete old action agent --- .../embodichain/embodichain.agents.rst | 39 - .../embodichain/embodichain.lab.sim.rst | 11 +- docs/source/api_reference/index.rst | 1 - embodichain/agents/__init__.py | 5 +- embodichain/agents/hierarchy/__init__.py | 19 - embodichain/agents/hierarchy/agent_base.py | 94 -- embodichain/agents/hierarchy/code_agent.py | 288 ------ embodichain/agents/hierarchy/llm.py | 72 -- embodichain/agents/hierarchy/task_agent.py | 157 --- .../agents/hierarchy/validation_agent.py | 240 ----- embodichain/agents/mllm/prompt/__init__.py | 8 - embodichain/agents/mllm/prompt/code_prompt.py | 149 --- embodichain/agents/mllm/prompt/task_prompt.py | 144 --- embodichain/agents/prompts/atom_actions.txt | 136 --- .../agents/prompts/basic_background.txt | 42 - embodichain/agents/prompts/code_example.txt | 35 - embodichain/agents/prompts/code_prompt.txt | 7 - embodichain/lab/gym/envs/tasks/__init__.py | 4 - .../envs/tasks/tableware/base_agent_env.py | 201 ---- .../tasks/tableware/pour_water/pour_water.py | 18 +- .../gym/envs/tasks/tableware/rearrangement.py | 41 +- embodichain/lab/sim/atom_actions.py | 948 ------------------ 22 files changed, 10 insertions(+), 2649 deletions(-) delete mode 
100644 docs/source/api_reference/embodichain/embodichain.agents.rst delete mode 100644 embodichain/agents/hierarchy/__init__.py delete mode 100644 embodichain/agents/hierarchy/agent_base.py delete mode 100644 embodichain/agents/hierarchy/code_agent.py delete mode 100644 embodichain/agents/hierarchy/llm.py delete mode 100644 embodichain/agents/hierarchy/task_agent.py delete mode 100644 embodichain/agents/hierarchy/validation_agent.py delete mode 100644 embodichain/agents/mllm/prompt/__init__.py delete mode 100644 embodichain/agents/mllm/prompt/code_prompt.py delete mode 100644 embodichain/agents/mllm/prompt/task_prompt.py delete mode 100644 embodichain/agents/prompts/atom_actions.txt delete mode 100644 embodichain/agents/prompts/basic_background.txt delete mode 100644 embodichain/agents/prompts/code_example.txt delete mode 100644 embodichain/agents/prompts/code_prompt.txt delete mode 100644 embodichain/lab/gym/envs/tasks/tableware/base_agent_env.py delete mode 100644 embodichain/lab/sim/atom_actions.py diff --git a/docs/source/api_reference/embodichain/embodichain.agents.rst b/docs/source/api_reference/embodichain/embodichain.agents.rst deleted file mode 100644 index ad9946e4..00000000 --- a/docs/source/api_reference/embodichain/embodichain.agents.rst +++ /dev/null @@ -1,39 +0,0 @@ -embodichain.agents -================== - -.. automodule:: embodichain.agents - - .. rubric:: Submodules - - .. autosummary:: - - hierarchy - mllm - -Hierarchy ---------- - -.. automodule:: embodichain.agents.hierarchy - :members: - :undoc-members: - :show-inheritance: - - .. autosummary:: - - agent_base - code_agent - task_agent - validation_agent - llm - -MLLM Prompts ------------- - -.. automodule:: embodichain.agents.mllm - :members: - :undoc-members: - :show-inheritance: - - .. 
autosummary:: - - prompt diff --git a/docs/source/api_reference/embodichain/embodichain.lab.sim.rst b/docs/source/api_reference/embodichain/embodichain.lab.sim.rst index 412f570d..655b3bfa 100644 --- a/docs/source/api_reference/embodichain/embodichain.lab.sim.rst +++ b/docs/source/api_reference/embodichain/embodichain.lab.sim.rst @@ -78,14 +78,6 @@ Shapes :show-inheritance: :exclude-members: __init__, copy, replace, to_dict, validate -Atomic Actions --------------- - -.. automodule:: embodichain.lab.sim.atom_actions - :members: - :undoc-members: - :show-inheritance: - Objects ------- @@ -133,6 +125,7 @@ Atomic Actions :maxdepth: 1 embodichain.lab.sim.atomic_actions + Shared Types ------------ @@ -147,4 +140,4 @@ Utility .. toctree:: :maxdepth: 1 - embodichain.lab.sim.utility \ No newline at end of file + embodichain.lab.sim.utility diff --git a/docs/source/api_reference/index.rst b/docs/source/api_reference/index.rst index 77146760..3f0e9941 100644 --- a/docs/source/api_reference/index.rst +++ b/docs/source/api_reference/index.rst @@ -22,7 +22,6 @@ The following modules are available in the core ``embodichain`` framework: .. autosummary:: :toctree: embodichain - agents data data_pipeline lab diff --git a/embodichain/agents/__init__.py b/embodichain/agents/__init__.py index 082cdcad..015c4151 100644 --- a/embodichain/agents/__init__.py +++ b/embodichain/agents/__init__.py @@ -14,7 +14,6 @@ # limitations under the License. # ---------------------------------------------------------------------------- -from . import hierarchy -from . 
import mllm +from __future__ import annotations -__all__ = ["hierarchy", "mllm"] +__all__: list[str] = [] diff --git a/embodichain/agents/hierarchy/__init__.py b/embodichain/agents/hierarchy/__init__.py deleted file mode 100644 index 02d7359c..00000000 --- a/embodichain/agents/hierarchy/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ---------------------------------------------------------------------------- - -from langchain_openai import AzureChatOpenAI -from langchain_openai import ChatOpenAI -import os diff --git a/embodichain/agents/hierarchy/agent_base.py b/embodichain/agents/hierarchy/agent_base.py deleted file mode 100644 index 8956bb6f..00000000 --- a/embodichain/agents/hierarchy/agent_base.py +++ /dev/null @@ -1,94 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ---------------------------------------------------------------------------- - -from abc import ABCMeta -import os -from pathlib import Path -from embodichain.utils.utility import load_txt -import embodichain.agents.mllm.prompt as mllm_prompt -from embodichain.data import database_2d_dir - - -def _resolve_prompt_path(file_name: str, config_dir: str = None) -> str: - # If absolute path, use directly - if os.path.isabs(file_name): - if os.path.exists(file_name): - return file_name - raise FileNotFoundError(f"Prompt file not found: {file_name}") - - # Try config directory first (for task-specific prompts) - if config_dir: - config_path = os.path.join(config_dir, file_name) - if os.path.exists(config_path): - return config_path - - # Try agents/prompts directory (for reusable prompts) - agents_prompts_dir = os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "prompts" - ) - agents_path = os.path.join(agents_prompts_dir, file_name) - if os.path.exists(agents_path): - return agents_path - - # If still not found, raise error with search paths - searched_paths = [] - if config_dir: - searched_paths.append(f" - {config_dir}/{file_name}") - searched_paths.append(f" - {agents_prompts_dir}/{file_name}") - - raise FileNotFoundError( - f"Prompt file not found: {file_name}\n" - f"Searched in:\n" + "\n".join(searched_paths) - ) - - -class AgentBase(metaclass=ABCMeta): - def __init__(self, **kwargs) -> None: - - assert ( - "prompt_kwargs" in kwargs.keys() - ), "Key prompt_kwargs must exist in config." 
- - for key, value in kwargs.items(): - setattr(self, key, value) - - # Get config directory if provided - config_dir = kwargs.get("config_dir", None) - if config_dir: - config_dir = os.path.dirname(os.path.abspath(config_dir)) - - # Preload and store prompt contents inside self.prompt_kwargs - for key, val in self.prompt_kwargs.items(): - if val["type"] == "text": - file_path = _resolve_prompt_path(val["name"], config_dir) - val["content"] = load_txt(file_path) # ← store content here - else: - raise ValueError( - f"Now only support `text` type but {val['type']} is given." - ) - - def generate(self, *args, **kwargs): - pass - - def act(self, *args, **kwargs): - pass - - def get_composed_observations(self, **kwargs): - ret = {"observations": kwargs.get("env").get_obs_for_agent()} - for key, val in self.prompt_kwargs.items(): - ret[key] = val["content"] - ret.update(kwargs) - return ret diff --git a/embodichain/agents/hierarchy/code_agent.py b/embodichain/agents/hierarchy/code_agent.py deleted file mode 100644 index 1a4c84d2..00000000 --- a/embodichain/agents/hierarchy/code_agent.py +++ /dev/null @@ -1,288 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ---------------------------------------------------------------------------- - -from embodichain.agents.hierarchy.agent_base import AgentBase -from langchain_core.prompts import ChatPromptTemplate -import os -import numpy as np -from typing import Dict, Tuple -from embodichain.agents.mllm.prompt import CodePrompt -from embodichain.data import database_agent_prompt_dir -from pathlib import Path -import re -import importlib.util -from datetime import datetime - - -def format_execution_history(execution_history): - if not execution_history or len(execution_history) == 0: - return "None." - - return "\n\n".join(f"{i}. {entry}" for i, entry in enumerate(execution_history, 1)) - - -def extract_python_code_and_text(llm_response: str) -> Tuple[str, str]: - """ - Extract exactly ONE python code block from the LLM response, - and return: - - code: the content inside the python block - - text: all remaining explanation text (outside the code block) - - Raises ValueError if zero or multiple python blocks are found. - """ - - pattern = r"```python\s*(.*?)\s*```" - matches = list(re.finditer(pattern, llm_response, re.DOTALL)) - - if len(matches) == 0: - raise ValueError("No python code block found in LLM response.") - if len(matches) > 1: - raise ValueError("Multiple python code blocks found in LLM response.") - - match = matches[0] - code = match.group(1).strip() - - # Optional sanity check - if not code.startswith("#") and not code.startswith("drive("): - raise ValueError( - f"Invalid code block content. 
Expected `drive(...)` or `# TASK_COMPLETE`, got:\n{code}" - ) - - # Extract remaining text (before + after the code block) - text_before = llm_response[: match.start()].strip() - text_after = llm_response[match.end() :].strip() - - explanation_text = "\n\n".join(part for part in [text_before, text_after] if part) - - return code, explanation_text - - -def format_llm_response_md( - llm_analysis: str, # plain-text explanation (NO code) - extracted_code: str, # validated executable code - step_id: int = None, - execution_history: str = None, - obs_image_path: Path = None, - md_file_path: Path = None, -) -> str: - ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - header = f"## Step: {step_id if step_id is not None else '-'} | {ts}\n\n" - - history_block = "" - if execution_history: - history_block = ( - "### Execution History (Input to LLM)\n\n" - "```\n" - f"{execution_history}\n" - "```\n\n" - ) - - image_block = "" - if obs_image_path is not None and md_file_path is not None: - try: - rel_path = obs_image_path.relative_to(md_file_path.parent) - except ValueError: - # Fallback: just use filename - rel_path = obs_image_path.name - - image_block = ( - "### Observation Image\n\n" f"![]({Path(rel_path).as_posix()})\n\n" - ) - - body = ( - image_block + history_block + "### LLM Analysis\n\n" - f"{llm_analysis.strip()}\n\n" - "### Executed Code\n\n" - "```python\n" - f"{extracted_code.strip()}\n" - "```\n\n" - "---\n\n" - ) - - return header + body - - -class CodeAgent(AgentBase): - query_prefix = "# " - query_suffix = "." - prompt: ChatPromptTemplate - prompt_kwargs: Dict[str, Dict] - - def __init__(self, llm, **kwargs) -> None: - super().__init__(**kwargs) - if llm is None: - raise ValueError( - "LLM is None. 
Please set the following environment variables:\n" - " - AZURE_OPENAI_ENDPOINT\n" - " - AZURE_OPENAI_API_KEY\n" - "Example:\n" - " export AZURE_OPENAI_ENDPOINT='https://your-endpoint.openai.azure.com/'\n" - " export AZURE_OPENAI_API_KEY='your-api-key'" - ) - self.llm = llm - - def generate(self, **kwargs): - log_dir = kwargs.get( - "log_dir", Path(database_agent_prompt_dir) / self.task_name - ) - file_path = log_dir / "agent_generated_code.py" - - # Check if the file already exists - if not kwargs.get("regenerate", False): - if file_path.exists(): - print(f"Code file already exists at {file_path}, skipping writing.") - return file_path, kwargs, None - - # Generate code via LLM - prompt = getattr(CodePrompt, self.prompt_name)( - **kwargs, - ) - - # insert feedback if exists - if len(kwargs.get("error_messages", [])) != 0: - # just use the last one - last_code = kwargs["generated_codes"][-1] - last_error = kwargs["error_messages"][-1] - last_observation = ( - kwargs.get("observation_feedbacks")[-1] - if kwargs.get("observation_feedbacks") - else None - ) - - # Add extra human message with feedback - feedback_msg = self.build_feedback_message( - last_code, last_error, last_observation - ) - prompt.messages.append(feedback_msg) - - llm_code = self.llm.invoke(prompt) - - # Normalize content - llm_code = getattr(llm_code, "content", str(llm_code)) - - print(f"\033[92m\nCode agent output:\n{llm_code}\n\033[0m") - - # Write the code to the file if it does not exist - match = re.search(r"```python\n(.*?)\n```", llm_code, re.DOTALL) - if match: - code_to_save = match.group(1).strip() - else: - code_to_save = llm_code.strip() - - file_path.parent.mkdir(parents=True, exist_ok=True) - with open(file_path, "w") as f: - f.write(code_to_save) - print(f"Generated function code saved to {file_path}") - - return file_path, kwargs, code_to_save - - def act(self, code_file_path, **kwargs): - """Execute generated code with proper execution environment. - - Supports two modes: - 1. 
If code defines 'create_agent_action_list' function, call it - 2. If code contains module-level drive() calls, execute them directly - """ - import ast - - # Read the generated code file - with open(code_file_path, "r") as f: - code_content = f.read() - - # Build execution namespace with necessary imports - ns = { - "__builtins__": __builtins__, - "__name__": "__main__", - "__file__": str(code_file_path), - "kwargs": kwargs, # Make kwargs available for injection - } - - # Import atom action functions into namespace - try: - exec( - "from embodichain.lab.sim.atom_actions import *", - ns, - ns, - ) - except Exception as e: - raise RuntimeError( - "Failed to import embodichain.lab.sim.atom_actions" - ) from e - - # Parse code to check if it defines a function or contains module-level calls - tree = ast.parse(code_content) - - # Check if code defines create_agent_action_list function - has_function = any( - isinstance(node, ast.FunctionDef) - and node.name == "create_agent_action_list" - for node in tree.body - ) - - if has_function: - # Execute code (function will be defined in namespace) - exec(code_content, ns, ns) - - # Call the function if it exists - if "create_agent_action_list" in ns: - result = ns["create_agent_action_list"](**kwargs) - print("Function executed successfully.") - return result - else: - raise AttributeError( - "The function 'create_agent_action_list' was not found after execution." 
- ) - else: - # Code contains module-level drive() calls - # AST transformer to inject **kwargs into function calls - class InjectKwargs(ast.NodeTransformer): - def visit_Call(self, node): - self.generic_visit(node) - # Inject **kwargs if not present - has_kwargs = any( - kw.arg is None - and isinstance(kw.value, ast.Name) - and kw.value.id == "kwargs" - for kw in node.keywords - ) - if not has_kwargs: - node.keywords.append( - ast.keyword( - arg=None, value=ast.Name(id="kwargs", ctx=ast.Load()) - ) - ) - return node - - # Transform AST to inject kwargs - tree = InjectKwargs().visit(tree) - ast.fix_missing_locations(tree) - - # Compile and execute transformed code - compiled_code = compile(tree, filename=str(code_file_path), mode="exec") - exec(compiled_code, ns, ns) - - # Collect actions from drive() calls if they were executed - # drive() function stores actions in env._episode_action_list - if "env" in kwargs: - env = kwargs["env"] - if hasattr(env, "_episode_action_list") and env._episode_action_list: - print( - f"Collected {len(env._episode_action_list)} actions from module-level drive() calls." - ) - return env._episode_action_list - - print("Code executed successfully, but no actions were collected.") - return [] diff --git a/embodichain/agents/hierarchy/llm.py b/embodichain/agents/hierarchy/llm.py deleted file mode 100644 index 1fc1bbfe..00000000 --- a/embodichain/agents/hierarchy/llm.py +++ /dev/null @@ -1,72 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ---------------------------------------------------------------------------- - -import os -from langchain_openai import AzureChatOpenAI - -# ------------------------------------------------------------------------------ -# Environment configuration -# ------------------------------------------------------------------------------ - -# Clear proxy if not needed (optional, can be set via environment variables) - -os.environ["ALL_PROXY"] = "" -os.environ["all_proxy"] = "" - -# Proxy configuration (optional, uncomment if needed) -# os.environ["HTTP_PROXY"] = "http://127.0.0.1:7890" -# os.environ["HTTPS_PROXY"] = "http://127.0.0.1:7890" - -# API version (optional, defaults to "2024-10-21" if not set) -# os.environ["OPENAI_API_VERSION"] = "2024-10-21" - -# Note: AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY must be set via environment variables -# Example in bash: -# export AZURE_OPENAI_ENDPOINT="https://your-endpoint.openai.azure.com/" -# export AZURE_OPENAI_API_KEY="your-api-key" - -# ------------------------------------------------------------------------------ -# LLM factory -# ------------------------------------------------------------------------------ - - -def create_llm(*, temperature=0.0, model="gpt-4o"): - return AzureChatOpenAI( - temperature=temperature, - model=model, - azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), - api_key=os.getenv("AZURE_OPENAI_API_KEY"), - api_version=os.getenv("OPENAI_API_VERSION", "2024-10-21"), - ) - - -# ------------------------------------------------------------------------------ -# LLM instances -# 
------------------------------------------------------------------------------ - - -# Initialize LLM instances, but handle errors gracefully for documentation builds -def _create_llm_safe(*, temperature=0.0, model="gpt-4o"): - try: - return create_llm(temperature=temperature, model=model) - except Exception: - return None - - -task_llm = _create_llm_safe(temperature=0.0, model="gpt-4o") -code_llm = _create_llm_safe(temperature=0.0, model="gpt-4o") -validation_llm = _create_llm_safe(temperature=0.0, model="gpt-4o") -view_selection_llm = _create_llm_safe(temperature=0.0, model="gpt-4o") diff --git a/embodichain/agents/hierarchy/task_agent.py b/embodichain/agents/hierarchy/task_agent.py deleted file mode 100644 index 9c4f37cc..00000000 --- a/embodichain/agents/hierarchy/task_agent.py +++ /dev/null @@ -1,157 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ---------------------------------------------------------------------------- - -from typing import List, Dict, Tuple -from embodichain.agents.hierarchy.agent_base import AgentBase -from langchain_core.prompts import ChatPromptTemplate -from embodichain.data import database_2d_dir -from embodichain.utils.utility import load_txt -from embodichain.agents.mllm.prompt import TaskPrompt -from embodichain.data import database_agent_prompt_dir -from pathlib import Path -import numpy as np -import time -import re - -USEFUL_INFO = """The error may be caused by: -1. You did not follow the basic background information, especially the world coordinate system with its xyz directions. -2. You did not take into account the NOTE given in the atom actions or in the example functions. -3. You did not follow the steps of the task descriptions.\n -""" - - -def extract_plan_and_validation(text: str) -> Tuple[str, List[str], List[str]]: - def get_section(src: str, name: str, next_name) -> str: - if next_name: - pat = re.compile( - rf"\[{name}\]\s*:\s*(.*?)\s*(?=\[{next_name}\]\s*:|\Z)", - re.DOTALL | re.IGNORECASE, - ) - else: - pat = re.compile( - rf"\[{name}\]\s*:\s*(.*?)\s*\Z", - re.DOTALL | re.IGNORECASE, - ) - m = pat.search(src) - return m.group(1).strip() if m else "" - - step_re = re.compile( - r"Step\s*\d+\s*:.*?(?=Step\s*\d+\s*:|\Z)", - re.DOTALL | re.IGNORECASE, - ) - - # ---- plans ---- - plans_raw = get_section(text, "PLANS", "VALIDATION_CONDITIONS") - plan_steps = [m.group(0).rstrip() for m in step_re.finditer(plans_raw)] - plan_str = "\n".join(plan_steps) - - # normalized plan list (strip "Step k:") - plan_list = [] - for step in plan_steps: - content = re.sub(r"^Step\s*\d+\s*:\s*", "", step, flags=re.IGNORECASE).strip() - if content: - plan_list.append(content) - - # ---- validations ---- - vals_raw = get_section(text, "VALIDATION_CONDITIONS", None) - validation_list = [] - for m in step_re.finditer(vals_raw): - content = re.sub( - r"^Step\s*\d+\s*:\s*", "", 
m.group(0), flags=re.IGNORECASE - ).strip() - if content: - validation_list.append(content) - - return plan_str, plan_list, validation_list - - -class TaskAgent(AgentBase): - prompt: ChatPromptTemplate - object_list: List[str] - target: np.ndarray - prompt_name: str - prompt_kwargs: Dict[str, Dict] - - def __init__(self, llm, **kwargs) -> None: - super().__init__(**kwargs) - if llm is None: - raise ValueError( - "LLM is None. Please set the following environment variables:\n" - " - AZURE_OPENAI_ENDPOINT\n" - " - AZURE_OPENAI_API_KEY\n" - "Example:\n" - " export AZURE_OPENAI_ENDPOINT='https://your-endpoint.openai.azure.com/'\n" - " export AZURE_OPENAI_API_KEY='your-api-key'" - ) - self.llm = llm - - def generate(self, **kwargs) -> str: - log_dir = kwargs.get( - "log_dir", Path(database_agent_prompt_dir) / self.task_name - ) - file_path = log_dir / "agent_generated_plan.txt" - - # Check if the file already exists - if not kwargs.get("regenerate", False): - if file_path.exists(): - print(f"Plan file already exists at {file_path}, skipping writing.") - return load_txt(file_path) - - # Generate query via LLM - prompts_ = getattr(TaskPrompt, self.prompt_name)(**kwargs) - if isinstance(prompts_, list): - # TODO: support two-stage prompts with feedback - start_time = time.time() - response = self.llm.invoke(prompts_[0]) - query = response.content - print( - f"\033[92m\nSystem tasks output ({np.round(time.time()-start_time, 4)}s):\n{query}\n\033[0m" - ) - for prompt in prompts_[1:]: - temp = prompt["kwargs"] - temp.update({"query": query}) - start_time = time.time() - response = self.llm.invoke(prompt["prompt"].invoke(temp)) - query = response.content - print( - f"\033[92m\nSystem tasks output({np.round(time.time()-start_time, 4)}s):\n{query}\n\033[0m" - ) - else: - # insert feedback if exists - if len(kwargs.get("error_messages", [])) != 0: - # just use the last one - last_plan = kwargs["generated_plans"][-1] - last_code = kwargs["generated_codes"][-1] - last_error = 
kwargs["error_messages"][-1] - - # Add extra human message with feedback - feedback_msg = self.build_feedback_message( - last_plan, last_code, last_error - ) - prompts_.messages.append(feedback_msg) - - response = self.llm.invoke(prompts_) - print(f"\033[92m\nTask agent output:\n{response.content}\n\033[0m") - - file_path.parent.mkdir(parents=True, exist_ok=True) - with open(file_path, "w") as f: - f.write(response.content) - print(f"Generated task plan saved to {file_path}") - - return response.content - - def act(self, *args, **kwargs): - return super().act(*args, **kwargs) diff --git a/embodichain/agents/hierarchy/validation_agent.py b/embodichain/agents/hierarchy/validation_agent.py deleted file mode 100644 index f2cce047..00000000 --- a/embodichain/agents/hierarchy/validation_agent.py +++ /dev/null @@ -1,240 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ---------------------------------------------------------------------------- - -import os -from langchain_core.messages import SystemMessage, HumanMessage -from abc import ABCMeta -from embodichain.utils.utility import encode_image_from_path -import glob -from embodichain.agents.hierarchy.llm import view_selection_llm - - -def save_obs_image(obs_image, save_dir, step_id=None): - """ - Save observation image using encode_image() and return its file path. 
- """ - import base64 - from embodichain.utils.utility import encode_image - - if obs_image is None: - return None - - if isinstance(save_dir, str): - from pathlib import Path - - save_dir = Path(save_dir) - - save_dir.mkdir(parents=True, exist_ok=True) - - name = f"obs_step_{step_id}.png" if step_id is not None else "obs.png" - img_path = save_dir / name - - # Encode to base64 - base64_image = encode_image(obs_image) - - # Decode base64 → bytes - img_bytes = base64.b64decode(base64_image) - - # Write to file - with open(img_path, "wb") as f: - f.write(img_bytes) - - return img_path - - -def get_obj_position_info(env): - import json - - position_info = {} - obj_uids = env.sim.get_rigid_object_uid_list() - for obj_name in obj_uids: - target_obj = env.sim.get_rigid_object(obj_name) - target_obj_pose = target_obj.get_local_pose(to_matrix=True).squeeze(0)[:3, 3] - position_info[obj_name] = target_obj_pose.tolist() - return json.dumps(position_info, indent=4) - - -class ValidationAgent(metaclass=ABCMeta): - def __init__(self, llm, **kwargs) -> None: - super().__init__() - for key, value in kwargs.items(): - setattr(self, key, value) - if llm is None: - raise ValueError( - "LLM is None. Please set the following environment variables:\n" - " - AZURE_OPENAI_ENDPOINT\n" - " - AZURE_OPENAI_API_KEY\n" - "Example:\n" - " export AZURE_OPENAI_ENDPOINT='https://your-endpoint.openai.azure.com/'\n" - " export AZURE_OPENAI_API_KEY='your-api-key'" - ) - self.llm = llm - - def validate(self, step_names, problematic_code, error_message, image_files): - # Construct the prompt - prompt = f""" - Analyze the execution of the following robot task: - - Task name: {self.task_name} - Task description: {self.task_description} - Basic background knowledge: {self.basic_background} - - You will be given images showing each step of the execution. For the step sequence: - {', '.join(step_names)} - - Provide the following analysis: - 1. Decide whether the full task succeeded or failed. - 2. 
If the task failed, provide a precise and detailed explanation. - - Below is a potentially problematic piece of code and the corresponding execution error: - - ```python - {problematic_code} - # Execution error: - {error_message} - Explain whether (and how) this code contributed to the observed failure. - """ - - # Prepare message content for API call - user_content = [] - - # Add textual prompt - user_content.append({"type": "text", "text": prompt}) - - # Add images and step names - for img_path in image_files: - filename = os.path.basename(img_path) - first_underscore_pos = filename.find("_") - if first_underscore_pos != -1: - step_name = filename[first_underscore_pos + 1 :].rsplit(".", 1)[0] - else: - step_name = filename.rsplit(".", 1)[0] - - # Add step name - user_content.append({"type": "text", "text": f"Step: {step_name}"}) - - # Add image as base64 - base64_image = encode_image_from_path(img_path) - user_content.append( - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{base64_image}"}, - } - ) - - messages = [ - SystemMessage( - content="You are a robot task execution analysis expert. Please analyze the provided image sequence." 
- ), - HumanMessage(content=user_content), - ] - - response = self.llm.invoke(messages) - return response.content - - def select_best_view_dir( - self, img_dirs: dict, action_description: str, valid_condition: str - ): - """ - img_dirs: { - "cam_1": Path, - "cam_2": Path, - "cam_3": Path - } - """ - - # --- collect final images --- - last_images = {} - for cam_id, cam_dir in img_dirs.items(): - imgs = sorted( - glob.glob(os.path.join(cam_dir, "obs_step_*.png")), - key=lambda p: int(os.path.basename(p).split("_")[-1].split(".")[0]), - ) - if imgs: - last_images[cam_id] = imgs[-1] - - if not last_images: - raise ValueError("No images found in any camera directory.") - - # --- system prompt --- - system_prompt = ( - "You are a robot perception assistant specialized in VIEW SELECTION.\n\n" - "TASK:\n" - "- You are given ONE final observation image from EACH camera view.\n" - "- Your job is NOT to judge success or failure.\n" - "- Your job is ONLY to select the SINGLE camera view that is MOST SUITABLE\n" - " for OBJECT-LEVEL validation of the action result.\n\n" - "ACTION CONTEXT:\n" - "- The robot has just executed ONE atomic action.\n" - "- You are given the action intention and the expected object-level outcome\n" - " ONLY to help you decide which view best reveals that outcome.\n\n" - "SELECTION CRITERIA (PRIORITY ORDER):\n" - "- Prefer views with:\n" - " * the clearest visibility of the relevant object(s)\n" - " * minimal occlusion by the arm or environment\n" - " * the clearest evidence related to the expected object-level result\n" - " (e.g., contact, separation, support, stability)\n\n" - "STRICT CONSTRAINTS:\n" - "- Do NOT judge robot motion quality or execution accuracy.\n" - "- Do NOT reason about numeric values (distance, angle, offset).\n" - "- Do NOT decide whether the action succeeded or failed.\n" - "- If multiple views are acceptable, choose the clearest overall view.\n\n" - "OUTPUT FORMAT (STRICT):\n" - "Output EXACTLY ONE of the following tokens:\n" - 
"- cam_1\n" - "- cam_2\n" - "- cam_3\n" - ) - - # --- human content --- - human_content = [ - { - "type": "text", - "text": ( - "Select the best camera view for object-level validation.\n\n" - "--------------------------------------------------\n" - "ACTION DESCRIPTION (INTENT ONLY):\n" - f"{action_description}\n\n" - "EXPECTED OBJECT-LEVEL RESULT (REFERENCE ONLY):\n" - f"{valid_condition}\n" - "--------------------------------------------------" - ), - } - ] - - for cam_id, img_path in last_images.items(): - img_b64 = encode_image_from_path(img_path) - human_content.extend( - [ - {"type": "text", "text": f"View candidate: {cam_id}"}, - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{img_b64}"}, - }, - ] - ) - - messages = [ - SystemMessage(content=system_prompt), - HumanMessage(content=human_content), - ] - - response = view_selection_llm.invoke(messages).content.strip() - - if response not in img_dirs: - raise ValueError(f"Invalid camera selection from LLM: {response}") - - return response diff --git a/embodichain/agents/mllm/prompt/__init__.py b/embodichain/agents/mllm/prompt/__init__.py deleted file mode 100644 index 5d5a8d46..00000000 --- a/embodichain/agents/mllm/prompt/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# All rights reserved. -# ---------------------------------------------------------------------------- - -from .task_prompt import TaskPrompt -from .code_prompt import CodePrompt diff --git a/embodichain/agents/mllm/prompt/code_prompt.py b/embodichain/agents/mllm/prompt/code_prompt.py deleted file mode 100644 index 794e554f..00000000 --- a/embodichain/agents/mllm/prompt/code_prompt.py +++ /dev/null @@ -1,149 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ---------------------------------------------------------------------------- - -from langchain_core.messages import SystemMessage -from langchain_core.prompts import ( - ChatPromptTemplate, - HumanMessagePromptTemplate, -) -from embodichain.utils.utility import encode_image - - -class CodePrompt: - @staticmethod - def one_stage_prompt(**kwargs) -> ChatPromptTemplate: - prompt = ChatPromptTemplate.from_messages( - [ - SystemMessage( - content="You are an AI assistant that can generate python code to execute robot arms." 
- ), - HumanMessagePromptTemplate.from_template( - [ - { - "type": "text", - "text": ( - "Generate a Python code snippet that accomplishes the following task:\n" - "{query}\n\n" - "You must strictly follow the rules and available functions described below:\n" - "{code_prompt}\n\n" - "Here are some reference examples of the expected output code:\n" - "{code_example}\n\n" - ), - } - ] - ), - ] - ) - return prompt.invoke(kwargs) - - @staticmethod - def unified_prompt(observations, **kwargs): - """ - Unified Vision→Code prompt: - - Model observes the image - - Understands the scene and the task goal - - Generates final executable Python code using atomic robot APIs - """ - - # Encode the image - observation = observations["rgb"] - kwargs.update({"observation": encode_image(observation)}) - - prompt = ChatPromptTemplate.from_messages( - [ - SystemMessage( - content=( - "You are a reliable Vision-Language-Code robot assistant. " - "You observe an image, understand the scene and the task goal, " - "and generate correct Python code using ONLY the allowed atomic robot actions. " - "Your final output must be a single Python code block." 
- ) - ), - HumanMessagePromptTemplate.from_template( - [ - { - "type": "image_url", - "image_url": { - "url": "data:image/png;base64,{observation}", - }, - }, - { - "type": "text", - "text": ( - "### Task Goal\n" - "{task_prompt}\n\n" - "### Environment Background\n" - "{basic_background}\n\n" - "### Allowed Atomic Actions\n" - "{atom_actions}\n\n" - "### Code Rules\n" - "{code_prompt}\n\n" - "### Reference Code Examples\n" - "{code_example}\n\n" - "### Final Instructions\n" - "Understand the scene from the image and generate final executable Python code " - "that performs the task using ONLY the allowed atomic actions.\n\n" - "Your entire response must be EXACTLY one Python code block:\n" - "```python\n" - "# your solution code here\n" - "```\n" - ), - }, - ] - ), - ] - ) - - return prompt.invoke(kwargs) - - @staticmethod - def one_stage_prompt_according_to_task_plan(**kwargs) -> ChatPromptTemplate: - prompt = ChatPromptTemplate.from_messages( - [ - SystemMessage( - content=( - "You are a reliable robot control code generator.\n" - "Your task is to generate Python code that executes robot arm actions.\n\n" - "CRITICAL RULES:\n" - "- The TASK PLAN defines the available atomic actions, rules, and execution logic.\n" - "- You MUST strictly follow the TASK PLAN.\n" - "- The CONSTRAINTS section contains additional global constraints you must obey.\n" - "- Do NOT invent new actions, functions, parameters, or control flow.\n" - "- You MAY include Python comments (# ...) 
inside the code.\n" - "- Your ENTIRE response MUST be a single Python code block.\n" - "- The code block MUST be directly executable without modification.\n" - "- Do NOT include any text, explanation, or markdown outside the Python code block.\n" - ) - ), - HumanMessagePromptTemplate.from_template( - [ - { - "type": "text", - "text": ( - "TASK PLAN (atomic actions, rules, and intended behavior):\n" - "{task_plan}\n\n" - "GLOBAL CONSTRAINTS (must be satisfied):\n" - "{code_prompt}\n\n" - "REFERENCE CODE (style and structure only; do NOT copy logic):\n" - "{code_example}\n\n" - "Generate the corrected Python code now." - ), - } - ] - ), - ] - ) - return prompt.invoke(kwargs) diff --git a/embodichain/agents/mllm/prompt/task_prompt.py b/embodichain/agents/mllm/prompt/task_prompt.py deleted file mode 100644 index 7db5d1bb..00000000 --- a/embodichain/agents/mllm/prompt/task_prompt.py +++ /dev/null @@ -1,144 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ---------------------------------------------------------------------------- - -import torch -from langchain_core.messages import SystemMessage -from langchain_core.prompts import ( - ChatPromptTemplate, - HumanMessagePromptTemplate, -) -from embodichain.utils.utility import encode_image - - -class TaskPrompt: - @staticmethod - def one_stage_prompt(observations, **kwargs): - """ - Hybrid one-pass prompt: - Step 1: VLM analyzes the image and extracts object IDs. - Step 2: LLM generates task instructions using only those IDs. - """ - # Encode image - observation = ( - observations["rgb"].cpu().numpy() - if isinstance(observations["rgb"], torch.Tensor) - else observations["rgb"] - ) - kwargs.update({"observation": encode_image(observation)}) - - # Build hybrid prompt - prompt = ChatPromptTemplate.from_messages( - [ - SystemMessage( - content=( - "You are a precise and reliable robotic manipulation planner. " - "Given a camera observation and a task description, you must generate " - "a clear, step-by-step task plan for a robotic arm. " - "All actions must strictly use the provided atomic API functions, " - "and the plan must be executable without ambiguity." 
- ) - ), - HumanMessagePromptTemplate.from_template( - [ - { - "type": "image_url", - "image_url": { - "url": "data:image/png;base64,{observation}", - }, - }, - { - "type": "text", - "text": ( - "Here is the latest camera observation.\n" - "First, analyze the scene in the image.\n" - "Then, using the context below, produce an actionable task plan.\n\n" - "**Environment background:** \n{basic_background}\n\n" - '**Task goal:** \n"{task_prompt}"\n\n' - "**Available atomic actions:** \n{atom_actions}\n" - ), - }, - ] - ), - ] - ) - - # Return the prompt template and kwargs to be executed by the caller - return prompt.invoke(kwargs) - - @staticmethod - def two_stage_prompt(observations, **kwargs): - # for VLM generate image descriptions - prompt = ChatPromptTemplate.from_messages( - [ - SystemMessage( - content="You are a helpful assistant to operate a robotic arm with a camera to generate task plans according to descriptions." - ), - HumanMessagePromptTemplate.from_template( - [ - { - "type": "image_url", - "image_url": { - "url": "data:image/jpg;base64,{observation}", - }, - }, - { - "type": "text", - "text": "What is in the image? Return answer with their potential effects.", - }, - ] - ), - ] - ) - - observation = ( - observations["rgb"].cpu().numpy() - if isinstance(observations["rgb"], torch.Tensor) - else observations["rgb"] - ) - kwargs.update({"observation": encode_image(observation)}) - # for LLM generate task descriptions - prompt_query = ChatPromptTemplate.from_messages( - [ - SystemMessage( - content="You are a helpful assistant to operate a robotic arm with a camera to generate task plans according to descriptions." 
- ), - HumanMessagePromptTemplate.from_template( - [ - { - "type": "image_url", - "image_url": { - "url": "data:image/jpg;base64,{observation}", - }, - }, - { - "type": "text", - "text": "Here is analysis for this image: {query}.", - }, - { - "type": "text", - "text": ( - "Using the context below, produce an actionable task plan.\n\n" - "**Environment background:** \n{basic_background}\n\n" - '**Task goal:** \n"{task_prompt}"\n\n' - "**Available atomic actions:** \n{atom_actions}\n" - ), - }, - ] - ), - ] - ) - - return [prompt.invoke(kwargs), {"prompt": prompt_query, "kwargs": kwargs}] diff --git a/embodichain/agents/prompts/atom_actions.txt b/embodichain/agents/prompts/atom_actions.txt deleted file mode 100644 index 257464c5..00000000 --- a/embodichain/agents/prompts/atom_actions.txt +++ /dev/null @@ -1,136 +0,0 @@ -### Atom Functions for Robot Arm Control -Each atomic function returns a list of joint-space trajectories (list[np.ndarray]). -All functions support an optional argument: - -force_valid: bool - If True, the system will automatically correct an invalid target pose by - projecting it to the nearest valid pose. Use this option carefully: - enable it only for actions where small spatial deviations are acceptable - and will not compromise task correctness. Default is False. - -Use the following functions exactly as defined. Do not invent new APIs or parameters. - -"grasp": - def grasp(robot_name: str, obj_name: str, pre_grasp_dis: float, **kwargs) -> list[np.ndarray] - - Moves the specified arm to the target object’s affordance-based grasp pose and executes a grasp by closing the gripper. - - The function plans a two-stage trajectory: - (1) from the current pose to a pre-grasp pose offset from the object, and - (2) from the pre-grasp pose to the final grasp pose, followed by gripper closure. - - Upon completion, the gripper is closed and the target object is expected to be stably held by the gripper. 
- - Example: - grasp(robot_name='right_arm', obj_name='bottle', pre_grasp_dis=0.10) # Moves the right arm to a pre-grasp pose 10 cm from the bottle, then to the grasp pose and closes the gripper to grasp the bottle. - -"place_on_table": - def place_on_table(robot_name: str, obj_name: str, x: float, y: float, pre_place_dis: float, **kwargs) -> list[np.ndarray] - - Moves the specified robot arm with the target object to the desired [x, y] location on the table and opens the gripper to place the object. - The z-coordinate is automatically adjusted based on the table height and the object’s dimensions. - This function assumes that the robot is already holding the object and that the task is to place it on the table at the specified coordinates. - Remember that when you need to place some objects on the table at specific coordinates, use this function without using other movement atom actions. - Otherwise, **if you need to place some objects relative to some place, then use "move_relative_to_object" first to move to the desired position, then use "open_gripper" to release the object.** - - Example: - place_on_table(robot_name='right_arm', obj_name='bottle', x=0.1, y=0.5, pre_place_dis=0.08) # Moves the right arm to a pre-place position 8 cm from the table, then places the bottle at the specified [0.1, 0.5] location on the table and opens the gripper. - -"move_relative_to_object": - def move_relative_to_object(robot_name: str, obj_name: str, - x_offset=0, y_offset=0, z_offset=0, - **kwargs) -> list[np.ndarray] - Moves the end-effector to a pose defined relative to the target object: - target = object_position + [x_offset, y_offset, z_offset] - Orientation is preserved. - Example: - move_relative_to_object(robot_name='right_arm', obj_name='cup', - x_offset=0.05, y_offset=0.10, z_offset=0.10) # Moves the right arm’s end-effector to a spot located 5 cm forward, 10 cm to the left, and 10 cm above the cup, while preserving the current gripper orientation. 
- move_relative_to_object(robot_name='right_arm', obj_name='cup', - x_offset=-0.05, y_offset=-0.10, z_offset=0.10) # Moves the right arm’s end-effector to a spot located 5 cm backward, 10 cm to the right, and 10 cm above the cup, while preserving the current gripper orientation. - -"move_to_absolute_position": - def move_to_absolute_position(robot_name: str, - x=None, y=None, z=None, - **kwargs) -> list[np.ndarray] - Moves the end-effector to an absolute (x, y, z) position in world coordinates. - Any coordinate set to None remains unchanged. - Orientation is preserved. - Example: - move_to_absolute_position(robot_name='right_arm', x=0.10, y=0.10, z=None) # Moves the end-effector to the absolute world position (x=0.10 m, y=0.10 m) while leaving z unchanged, and preserves the orientation. - -"move_by_relative_offset": - def move_by_relative_offset(robot_name: str, - dx=0.0, dy=0.0, dz=0.0, mode='extrinsic', - **kwargs) -> list[np.ndarray] - Moves the end-effector by a relative translation: - new_position = current_position + [dx, dy, dz] - The offset is applied along the specified axes using the given mode, while preserving the original end-effector orientation. - Mode can be 'extrinsic' (world frame) or 'intrinsic' (end-effector frame). If you want to move along the world axes, use 'extrinsic'. If you want to move along the end-effector’s local axes, use "intrinsic". - Example: - move_by_relative_offset(robot_name='right_arm', dx=0.05, dy=-0.10, dz=0.20, mode='extrinsic') # Translates the end-effector by +5 cm in x (front), −10 cm in y (right), +20 cm in z (above) in the world coordinate, relative to its current position, with orientation preserved. - move_by_relative_offset(robot_name='right_arm', dx=0, dy=0, dz=0.1, mode='intrinsic') # Translates the end-effector by +10 cm in z (forward) in the EEF coordinate, meaning that it moves forward relative to its current facing direction, with orientation preserved. 
- -"rotate_eef" - def rotate_eef(robot_name: str, degree: float, **kwargs) -> list[np.ndarray] - Rotates the wrist roll joint (joint index 5) of the specified arm by the - given number of degrees. End-effector position is preserved. - Example: - rotate_eef(robot_name='right_arm', degree=-90) # Rotates the right arm’s wrist-roll joint by −45° (counterclockwise), while keeping the end-effector position unchanged. This is a joint-level rotation, not a full orientation override. - rotate_eef(robot_name='right_arm', degree=90) # Rotates the right arm’s wrist-roll joint by 45° (clockwise), while keeping the end-effector position unchanged. This is a joint-level rotation, not a full orientation override. - Typical use cases: - Pouring or tilting a grasped object. - Rotating the gripper around its forward axis without translating the end effector. - After rotating, you typically need to apply an opposite rotation back to return to the original pose. - Usage notes: - Rotation sign convention: negative = counterclockwise, positive = clockwise, viewed along the end-effector forward axis. - For pouring with the right arm, a common pattern is: first apply a negative rotation to start pouring, then apply a positive rotation to return. - For the left arm, the sign convention is typically reversed. - -"orient_eef": - def orient_eef(robot_name: str, - direction: str = 'front', # 'front' or 'down' - **kwargs) -> list[np.ndarray] - Reorients the end-effector to a predefined canonical orientation in the - WORLD coordinate frame, while keeping the EE’s current position fixed. - This function replaces the entire 3×3 orientation matrix of the current - end-effector pose. - Usage notes: - This function should only be used when you explicitly need to override the end-effector’s full orientation. - This differs from rotate_eef(). orient_eef performs a full orientation override of the end-effector, not a single-joint rotation. For tasks like pouring, no need to use it. 
- For general wrist rotation, prefer using rotate_eef instead. - For aligning tasks, use "front" or "down" orientations as needed. - Supported orientations: - • 'front' : Align the end-effector so its direction faces forward. - • 'down' : Align the end-effector so its direction faces downward. - Example: - orient_eef(robot_name='right_arm', direction='front') # Reorients the right arm’s end-effector so it faces forward - -"back_to_initial_pose": - def back_to_initial_pose(robot_name: str, **kwargs) -> list[np.ndarray] - Returns the specified arm to its predefined initial joint configuration - stored in the environment. - Example: - back_to_initial_pose(robot_name='right_arm') # Returns the right arm back to its predefined initial joint configuration stored in the environment, regardless of its current pose. - -"close_gripper": - def close_gripper(robot_name: str, **kwargs) -> list[np.ndarray] - Closes the arm’s gripper using a short (10-step) gripper-only trajectory. - Example: - close_gripper(robot_name='right_arm') # Closes the right gripper using a short, smooth 10-step gripper-only trajectory. - -"open_gripper": - def open_gripper(robot_name: str, **kwargs) -> list[np.ndarray] - Opens the arm’s gripper using a short (10-step) gripper-only trajectory. - Example: - open_gripper(robot_name='right_arm') # Opens the right gripper using a 10-step gripper-only trajectory. - -### Drive Function (Trajectory Synchronization) -"drive": - def drive(left_arm_action=None, right_arm_action=None, **kwargs) -> list[torch.Tensor] - Wraps one or two arm trajectories into synchronized full-robot actions. - • If only one arm action is provided, the other arm stays idle. - • If both are provided, they are temporally aligned and executed together. - • The actions are obtained from the output of the above functions. 
- Example: - drive(left_arm_action=left_actions, right_arm_action=right_actions) \ No newline at end of file diff --git a/embodichain/agents/prompts/basic_background.txt b/embodichain/agents/prompts/basic_background.txt deleted file mode 100644 index dc6d1c30..00000000 --- a/embodichain/agents/prompts/basic_background.txt +++ /dev/null @@ -1,42 +0,0 @@ -The environment uses a right-handed world coordinate system, where 1 unit equals 1 meter. -All robot poses are represented as 4×4 homogeneous transformation matrices. - -The robot base coordinate frame is the ONLY authoritative frame for all spatial reasoning, planning, and action generation. - -CAMERA AND IMAGE INTERPRETATION - -The camera is positioned in front of the robot, facing the robot arm and looking toward the robot base. -Because of this viewpoint, the rendered image is horizontally mirrored relative to the robot base frame. -This mirroring affects LEFT–RIGHT only. There is NO vertical or depth inversion. - -Mirror mapping (image → robot base frame): - -* Image left corresponds to robot right -* Image right corresponds to robot left -* Image up corresponds to robot up -* Image down corresponds to robot down - -REQUIRED REASONING PERSPECTIVE (NON-NEGOTIABLE) - -You must ignore the camera and rendered image orientation when reasoning. -All spatial reasoning must be performed as if you are physically located at the robot base, looking outward along the robot’s +x (forward) direction. - -Do NOT reason from the camera viewpoint. -Do NOT trust left/right as shown in the image. -Always remap image left/right before reasoning. 
- -ROBOT BASE COORDINATE DEFINITIONS - -All directions below are defined strictly in the robot base frame: - -* Moving forward increases x -* Moving backward decreases x -* Moving left increases y (appears as right in the image) -* Moving right decreases y (appears as left in the image) -* Moving up increases z -* Moving down decreases z - -ROBOT INITIALIZATION AND TERMINATION - -Both robot arms start in predefined initial configurations with their end-effectors open. -At task completion, both arms must be returned to their initial poses. \ No newline at end of file diff --git a/embodichain/agents/prompts/code_example.txt b/embodichain/agents/prompts/code_example.txt deleted file mode 100644 index c2952fed..00000000 --- a/embodichain/agents/prompts/code_example.txt +++ /dev/null @@ -1,35 +0,0 @@ -# Python scripts -# Use the right arm to grasp bottle, move to the target location (x=0.2, y=0.1), and then open the gripper to release the object. - -```python -# Step 1 — Reach and grasp the bottle -drive( - right_arm_action=grasp( - robot_name="right_arm", - obj_name="bottle", - ), -) - -# Step 2 — Move to target location -drive( - right_arm_action=move_to_absolute_position( - robot_name="right_arm", - x=0.2, - y=0.1, - ), -) - -# Step 3 — Open gripper to release the object -drive( - right_arm_action=open_gripper( - robot_name="right_arm", - ), -) - -# Step 4 — Return the arm to the initial pose -drive( - right_arm_action=back_to_initial_pose( - robot_name="right_arm", - ), -) -``` \ No newline at end of file diff --git a/embodichain/agents/prompts/code_prompt.txt b/embodichain/agents/prompts/code_prompt.txt deleted file mode 100644 index 3fadf1c9..00000000 --- a/embodichain/agents/prompts/code_prompt.txt +++ /dev/null @@ -1,7 +0,0 @@ -Constraints: -- Every atomic action MUST be executed via a single drive(...) call. -- Each drive(...) call must directly contain the atomic action(s); do NOT define actions separately and then pass them into drive. 
-- For single-arm execution: specify the active arm’s action and explicitly set the unused arm to None within the same drive(...) call. -- For dual-arm execution: both arms’ actions MUST be specified within the same drive(...) call. -- Use exactly one drive(...) call per step; no exceptions. -- Output MUST be executable Python code only: no explanations, no comments, no markdown, and no extra text. \ No newline at end of file diff --git a/embodichain/lab/gym/envs/tasks/__init__.py b/embodichain/lab/gym/envs/tasks/__init__.py index 56da5a8c..53cf8a1d 100644 --- a/embodichain/lab/gym/envs/tasks/__init__.py +++ b/embodichain/lab/gym/envs/tasks/__init__.py @@ -19,7 +19,6 @@ # Tableware task environments from embodichain.lab.gym.envs.tasks.tableware.pour_water.pour_water import ( PourWaterEnv, - PourWaterAgentEnv, ) from embodichain.lab.gym.envs.tasks.tableware.scoop_ice import ScoopIce from embodichain.lab.gym.envs.tasks.tableware.stack_blocks_two import StackBlocksTwoEnv @@ -40,7 +39,6 @@ ) from embodichain.lab.gym.envs.tasks.tableware.rearrangement import ( RearrangementEnv, - RearrangementAgentEnv, ) # Reinforcement learning environments @@ -52,7 +50,6 @@ __all__ = [ "PourWaterEnv", - "PourWaterAgentEnv", "ScoopIce", "StackBlocksTwoEnv", "BlocksRankingRGBEnv", @@ -61,7 +58,6 @@ "StackCupsEnv", "MatchObjectContainerEnv", "RearrangementEnv", - "RearrangementAgentEnv", "PushCubeEnv", "CartPoleEnv", "SimpleTaskEnv", diff --git a/embodichain/lab/gym/envs/tasks/tableware/base_agent_env.py b/embodichain/lab/gym/envs/tasks/tableware/base_agent_env.py deleted file mode 100644 index 8814ea56..00000000 --- a/embodichain/lab/gym/envs/tasks/tableware/base_agent_env.py +++ /dev/null @@ -1,201 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ---------------------------------------------------------------------------- - -import torch -from embodichain.utils import logger - - -class BaseAgentEnv: - def _init_agents(self, agent_config, task_name, agent_config_path=None): - from embodichain.agents.hierarchy.task_agent import TaskAgent - from embodichain.agents.hierarchy.code_agent import CodeAgent - from embodichain.agents.hierarchy.validation_agent import ValidationAgent - from embodichain.agents.hierarchy.llm import ( - task_llm, - code_llm, - validation_llm, - ) - - if agent_config.get("TaskAgent") is not None: - self.task_agent = TaskAgent( - task_llm, - **agent_config["Agent"], - **agent_config["TaskAgent"], - task_name=task_name, - config_dir=agent_config_path, - ) - self.code_agent = CodeAgent( - code_llm, - **agent_config["Agent"], - **agent_config.get("CodeAgent"), - task_name=task_name, - config_dir=agent_config_path, - ) - self.validation_agent = ValidationAgent( - validation_llm, - task_name=task_name, - task_description=self.code_agent.prompt_kwargs.get("task_prompt")[ - "content" - ], - basic_background=self.code_agent.prompt_kwargs.get("basic_background")[ - "content" - ], - atom_actions=self.code_agent.prompt_kwargs.get("atom_actions")["content"], - ) - - def get_states(self): - # TODO: only support num_env = 1 for now - # store robot states in each env.reset - self.init_qpos = self.robot.get_qpos().squeeze(0) - - self.left_arm_joints = self.robot.get_joint_ids(name="left_arm") - self.right_arm_joints = self.robot.get_joint_ids(name="right_arm") - self.left_eef_joints = 
self.robot.get_joint_ids(name="left_eef") - self.right_eef_joints = self.robot.get_joint_ids(name="right_eef") - - self.left_arm_init_qpos = self.init_qpos[self.left_arm_joints] - self.right_arm_init_qpos = self.init_qpos[self.right_arm_joints] - - self.left_arm_init_xpos = self.robot.compute_fk( - self.left_arm_init_qpos, name="left_arm", to_matrix=True - ).squeeze(0) - self.right_arm_init_xpos = self.robot.compute_fk( - self.right_arm_init_qpos, name="right_arm", to_matrix=True - ).squeeze(0) - - self.left_arm_current_qpos = self.left_arm_init_qpos - self.right_arm_current_qpos = self.right_arm_init_qpos - - self.left_arm_current_xpos = self.left_arm_init_xpos - self.right_arm_current_xpos = self.right_arm_init_xpos - - self.left_arm_base_pose = self.robot.get_control_part_base_pose( - "left_arm", to_matrix=True - ).squeeze(0) - self.right_arm_base_pose = self.robot.get_control_part_base_pose( - "right_arm", to_matrix=True - ).squeeze(0) - - self.open_state = torch.tensor([0.05]) - self.close_state = torch.tensor([0.0]) - self.left_arm_current_gripper_state = self.open_state - self.right_arm_current_gripper_state = self.open_state - - # store some useful obj information - init_obj_info = {} - obj_uids = self.sim.get_rigid_object_uid_list() - for obj_name in obj_uids: - obj = self.sim.get_rigid_object(obj_name) - obj_pose = obj.get_local_pose(to_matrix=True).squeeze(0) - obj_height = obj_pose[2, 3] # Extract the height (z-coordinate) - obj_grasp_pose = self.affordance_datas.get( - f"{obj_name}_grasp_pose_object", None - ) - init_obj_info[obj_name] = { - "pose": obj_pose, # Store the full pose (4x4 matrix) - "height": obj_height, # Store the height (z-coordinate) - "grasp_pose_obj": ( - obj_grasp_pose.squeeze(0) if obj_grasp_pose is not None else None - ), # Store the grasp pose if available - } - self.init_obj_info = init_obj_info - - # -------------------- Common getters / setters -------------------- - - def get_obs_for_agent(self): - obs = self.get_obs() - rgb 
= obs["sensor"]["cam_high"]["color"].squeeze(0) - - # Get validation camera data - camera_data = self.event_manager.get_functor("validation_cameras")(self, None) - result = {"rgb": rgb} - result.update({k: v.squeeze(0) for k, v in camera_data.items()}) - return result - - def get_current_qpos_agent(self): - return self.left_arm_current_qpos, self.right_arm_current_qpos - - def set_current_qpos_agent(self, arm_qpos, is_left): - if is_left: - self.left_arm_current_qpos = arm_qpos - else: - self.right_arm_current_qpos = arm_qpos - - def get_current_xpos_agent(self): - return self.left_arm_current_xpos, self.right_arm_current_xpos - - def set_current_xpos_agent(self, arm_xpos, is_left): - if is_left: - self.left_arm_current_xpos = arm_xpos - else: - self.right_arm_current_xpos = arm_xpos - - def get_current_gripper_state_agent(self): - return self.left_arm_current_gripper_state, self.right_arm_current_gripper_state - - def set_current_gripper_state_agent(self, arm_gripper_state, is_left): - if is_left: - self.left_arm_current_gripper_state = arm_gripper_state - else: - self.right_arm_current_gripper_state = arm_gripper_state - - # -------------------- IK / FK -------------------- - def get_arm_ik(self, target_xpos, is_left, qpos_seed=None): - control_part = "left_arm" if is_left else "right_arm" - ret, qpos = self.robot.compute_ik( - name=control_part, pose=target_xpos, joint_seed=qpos_seed - ) - return ret.all().item(), qpos.squeeze(0) - - def get_arm_fk(self, qpos, is_left): - control_part = "left_arm" if is_left else "right_arm" - xpos = self.robot.compute_fk( - name=control_part, qpos=torch.as_tensor(qpos), to_matrix=True - ) - return xpos.squeeze(0) - - # -------------------- get only code for action list -------------------- - def generate_code_for_actions(self, regenerate=False, **kwargs): - logger.log_info( - f"Generate code for creating action list for {self.code_agent.task_name}.", - color="green", - ) - - # Task planning - print(f"\033[92m\nStart task 
planning.\n\033[0m") - - task_agent_input = self.task_agent.get_composed_observations( - env=self, regenerate=regenerate, **kwargs - ) - task_plan = self.task_agent.generate(**task_agent_input) - - # Code generation - print(f"\033[94m\nStart code generation.\n\033[0m") - code_agent_input = self.code_agent.get_composed_observations( - env=self, regenerate=regenerate, **kwargs - ) - code_agent_input["task_plan"] = task_plan - - code_file_path, kwargs, code = self.code_agent.generate(**code_agent_input) - return code_file_path, kwargs, code - - # -------------------- get action list -------------------- - def create_demo_action_list(self, regenerate=False, *args, **kwargs): - code_file_path, kwargs, _ = self.generate_code_for_actions( - regenerate=regenerate - ) - action_list = self.code_agent.act(code_file_path, **kwargs) - return action_list diff --git a/embodichain/lab/gym/envs/tasks/tableware/pour_water/pour_water.py b/embodichain/lab/gym/envs/tasks/tableware/pour_water/pour_water.py index 83e356bf..0d37a070 100644 --- a/embodichain/lab/gym/envs/tasks/tableware/pour_water/pour_water.py +++ b/embodichain/lab/gym/envs/tasks/tableware/pour_water/pour_water.py @@ -14,19 +14,19 @@ # limitations under the License. 
# ---------------------------------------------------------------------------- +from __future__ import annotations + import torch -from typing import Dict, Optional from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg from embodichain.lab.gym.utils.registration import register_env from embodichain.utils import logger -from embodichain.lab.gym.envs.tasks.tableware.base_agent_env import BaseAgentEnv from embodichain.lab.gym.envs.tasks.tableware.pour_water.action_bank import ( PourWaterActionBank, ) -__all__ = ["PourWaterEnv", "PourWaterAgentEnv"] +__all__ = ["PourWaterEnv"] @register_env("PourWater-v3", max_episode_steps=600) @@ -147,15 +147,3 @@ def _is_fall(self, pose: torch.Tensor) -> torch.Tensor: # Compute angle and check if fallen angle = torch.arccos(dot_product) return angle >= torch.pi / 4 - - -@register_env("PourWaterAgent-v3", max_episode_steps=600) -class PourWaterAgentEnv(BaseAgentEnv, PourWaterEnv): - def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): - super().__init__(cfg, **kwargs) - super()._init_agents(**kwargs) - - def reset(self, seed: Optional[int] = None, options: Optional[Dict] = None): - obs, info = super().reset(seed=seed, options=options) - super().get_states() - return obs, info diff --git a/embodichain/lab/gym/envs/tasks/tableware/rearrangement.py b/embodichain/lab/gym/envs/tasks/tableware/rearrangement.py index 7f9559ca..3d526ea5 100644 --- a/embodichain/lab/gym/envs/tasks/tableware/rearrangement.py +++ b/embodichain/lab/gym/envs/tasks/tableware/rearrangement.py @@ -4,12 +4,12 @@ # All rights reserved. 
# ---------------------------------------------------------------------------- -from typing import Dict, Optional +from __future__ import annotations + from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg from embodichain.lab.gym.utils.registration import register_env -from embodichain.lab.gym.envs.tasks.tableware.base_agent_env import BaseAgentEnv -__all__ = ["RearrangementEnv", "RearrangementAgentEnv"] +__all__ = ["RearrangementEnv"] @register_env("Rearrangement-v3", max_episode_steps=600) @@ -53,38 +53,3 @@ def is_task_success(self) -> bool: or abs(fork_x - fork_place_target_x) > tolerance or abs(fork_y - fork_place_target_y) > tolerance ) - - -@register_env("RearrangementAgent-v3", max_episode_steps=600) -class RearrangementAgentEnv(BaseAgentEnv, RearrangementEnv): - def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): - super().__init__(cfg, **kwargs) - super()._init_agents(**kwargs) - - def reset(self, seed: Optional[int] = None, options: Optional[Dict] = None): - obs, info = super().reset(seed=seed, options=options) - super().get_states() - return obs, info - - def is_task_success(self): - fork = self.sim.get_rigid_object("fork") - spoon = self.sim.get_rigid_object("spoon") - plate = self.sim.get_rigid_object("plate") - - plate_pose = plate.get_local_pose(to_matrix=True) - spoon_place_target_y = plate_pose[0, 1, 3] - 0.16 - fork_place_target_y = plate_pose[0, 1, 3] + 0.16 - - spoon_pose = spoon.get_local_pose(to_matrix=True) - spoon_y = spoon_pose[0, 1, 3] - - fork_pose = fork.get_local_pose(to_matrix=True) - fork_y = fork_pose[0, 1, 3] - - tolerance = self.metadata.get("success_params", {}).get("tolerance", 0.02) - - # spoon and fork should with the y range of tolerance related to plate. 
- return ( - abs(spoon_y - spoon_place_target_y) <= tolerance - and abs(fork_y - fork_place_target_y) <= tolerance - ) diff --git a/embodichain/lab/sim/atom_actions.py b/embodichain/lab/sim/atom_actions.py deleted file mode 100644 index 2abefea9..00000000 --- a/embodichain/lab/sim/atom_actions.py +++ /dev/null @@ -1,948 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2026 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ---------------------------------------------------------------------------- - -import numpy as np -from embodichain.utils.logger import log_info, log_warning, log_error -from copy import deepcopy -from embodichain.lab.gym.utils.misc import ( - mul_linear_expand, - get_rotation_replaced_pose, -) -from embodichain.utils.math import get_offset_pose -import torch -from tqdm import tqdm -from scipy.spatial.transform import Rotation as R -from embodichain.utils.utility import encode_image - -# Import utility functions for atom actions -from embodichain.lab.sim.utility.atom_action_utils import ( - draw_axis, - get_arm_states, - find_nearest_valid_pose, - get_qpos, - plan_trajectory, - plan_gripper_trajectory, - finalize_actions, - extract_drive_calls, -) - -""" ---------------------------------------------Atom action functions---------------------------------------------------- ---------------------------------------------Atom action functions---------------------------------------------------- ---------------------------------------------Atom action functions---------------------------------------------------- -""" - - -# TODO: write a move_to_pose atom action, the use this action to form other atom actions -def grasp( - robot_name: str, - obj_name: str, - pre_grasp_dis: float = 0.05, - env=None, - force_valid=False, - **kwargs, -): - # Get target object - obj_uids = env.sim.get_rigid_object_uid_list() - if obj_name in obj_uids: - target_obj = env.sim.get_rigid_object(obj_name) - else: - log_error(f"No matched object {obj_uids}.") - target_obj_pose = target_obj.get_local_pose(to_matrix=True).squeeze(0) - - # Open the gripper if currently closed - actions = None - select_arm_current_gripper_state = ( - env.left_arm_current_gripper_state - if "left" in robot_name - else env.right_arm_current_gripper_state - ) - if select_arm_current_gripper_state <= env.open_state - 0.01: - actions = open_gripper(robot_name, env, **kwargs) - - # Retract the end-effector to avoid 
collision - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - select_arm_base_pose = ( - env.left_arm_base_pose if is_left else env.right_arm_base_pose - ) - base_to_eef_xy_dis = torch.norm( - select_arm_base_pose[:2, 3] - select_arm_current_pose[:2, 3] - ) - base_to_obj_xy_dis = torch.norm( - select_arm_base_pose[:2, 3] - target_obj_pose[:2, 3] - ) - dis_eps = kwargs.get("dis_eps", 0.05) - select_arm_init_pose = ( - env.left_arm_init_xpos if is_left else env.right_arm_init_xpos - ) - if base_to_eef_xy_dis > base_to_obj_xy_dis and not torch.allclose( - select_arm_current_pose, select_arm_init_pose, rtol=1e-5, atol=1e-8 - ): - delta = base_to_eef_xy_dis - (base_to_obj_xy_dis - dis_eps) - back_actions = move_by_relative_offset( - robot_name=robot_name, - dx=0.0, - dy=0.0, - dz=-delta, - env=env, - force_valid=force_valid, - mode="intrinsic", - sample_num=15, - **kwargs, - ) - actions = ( - np.concatenate([actions, back_actions], axis=0) - if actions is not None - else back_actions - ) - - # ---------------------------------------- Prepare ---------------------------------------- - select_qpos_traj = [] - ee_state_list_select = [] - - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - - # ---------------------------------------- Pose ---------------------------------------- - # Move the end-effector to a good place for starting grasping to avoid bad poses - select_arm_retract_pose = deepcopy( - env.left_arm_init_xpos if is_left else env.right_arm_init_xpos - ) - select_arm_retract_pose = get_offset_pose( - select_arm_retract_pose, 0.15, "z", "intrinsic" - ) - select_arm_retract_qpos = get_qpos( - env, - is_left, - select_arm, - select_arm_retract_pose, - env.left_arm_init_qpos if is_left else env.right_arm_init_qpos, - force_valid=force_valid, - 
name="retract_to_good_pose", - ) - qpos_list_back_to_retract = [select_arm_current_qpos, select_arm_retract_qpos] - sample_num = 30 - - plan_trajectory( - env, - select_arm, - qpos_list_back_to_retract, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - select_arm_current_qpos = select_arm_retract_qpos - select_arm_current_pose = select_arm_retract_pose - - # Rotate the arm base to face the object for better grasping - delta_xy = target_obj_pose[:2, 3] - select_arm_base_pose[:2, 3] - dx, dy = delta_xy[0], delta_xy[1] - aim_horizontal_angle = np.arctan2(dy, dx) - select_arm_aim_qpos = deepcopy(select_arm_current_qpos) - select_arm_aim_qpos[0] = aim_horizontal_angle - - # Get best grasp pose from affordance data - grasp_pose_object = env.init_obj_info.get(obj_name)["grasp_pose_obj"] - if ( - grasp_pose_object[0, 2] > 0.5 - ): # whether towards x direction TODO: make it robust - # Align the object pose's z-axis with the arm's aiming direction - target_obj_pose = torch.tensor( - get_rotation_replaced_pose( - np.array(target_obj_pose), - float(select_arm_aim_qpos[0]), - "z", - "intrinsic", - ) - ) - best_pickpose = target_obj_pose @ grasp_pose_object - grasp_pose = deepcopy(best_pickpose) - grasp_pose_pre1 = deepcopy(grasp_pose) - grasp_pose_pre1 = get_offset_pose(grasp_pose_pre1, -pre_grasp_dis, "z", "intrinsic") - - # Solve IK for pre-grasp and grasp poses - grasp_qpos_pre1 = get_qpos( - env, - is_left, - select_arm, - grasp_pose_pre1, - select_arm_aim_qpos, - force_valid=force_valid, - name="grasp pre1", - ) - grasp_qpos = get_qpos( - env, - is_left, - select_arm, - grasp_pose, - grasp_qpos_pre1, - force_valid=force_valid, - name="grasp", - ) - - # Update env state to final grasp pose - env.set_current_qpos_agent(grasp_qpos, is_left=is_left) - env.set_current_xpos_agent(grasp_pose, is_left=is_left) - - # ------------------------------------ Traj 0: init → aim ------------------------------------ - 
qpos_list_init_to_aim = [select_arm_current_qpos, select_arm_aim_qpos] - # base_sample_num = 10 - # base_angle = 0.08 - # sample_num = max(int(delta_angle / base_angle * base_sample_num), 2) - - sample_num = 10 - - plan_trajectory( - env, - select_arm, - qpos_list_init_to_aim, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ------------------------------------ Traj 1: aim → pre-grasp ------------------------------------ - qpos_list_aim_to_pre1 = [select_arm_aim_qpos, grasp_qpos_pre1] - sample_num = kwargs.get("sample_num", 30) - - plan_trajectory( - env, - select_arm, - qpos_list_aim_to_pre1, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ------------------------------------ Traj 2: pre-grasp → grasp ------------------------------------ - qpos_list_pre1_to_grasp = [grasp_qpos_pre1, grasp_qpos] - sample_num = kwargs.get("sample_num", 20) - - plan_trajectory( - env, - select_arm, - qpos_list_pre1_to_grasp, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ---------------------------------------- Final ---------------------------------------- - traj_actions = finalize_actions(select_qpos_traj, ee_state_list_select) - actions = ( - traj_actions - if actions is None - else np.concatenate([actions, traj_actions], axis=0) - ) - - # ------------------------------------ Close gripper ------------------------------------ - close_gripper_actions = close_gripper(robot_name, env, **kwargs) - actions = np.concatenate([actions, close_gripper_actions], axis=0) - - log_info( - f"Total generated trajectory number for grasp: {len(actions)}.", color="green" - ) - - return actions - - -def place_on_table( - robot_name: str, - obj_name: str, - x: float = None, - y: float = None, - pre_place_dis: float = 0.08, - env=None, - force_valid=False, - **kwargs, -): - - init_obj_height = 
env.init_obj_info.get(obj_name).get("height") - height = init_obj_height + kwargs.get("eps", 0.03) - - traj_actions = move_to_absolute_position( - robot_name, x=x, y=y, z=height, env=env, force_valid=force_valid, **kwargs - ) - open_actions = open_gripper(robot_name, env, **kwargs) - - actions = np.concatenate([traj_actions, open_actions], axis=0) - - log_info( - f"Total generated trajectory number for place on table: {len(actions)}.", - color="green", - ) - - return actions - - -def move_relative_to_object( - robot_name: str, - obj_name: str, - x_offset: float = 0, - y_offset: float = 0, - z_offset: float = 0, - env=None, - force_valid=False, - **kwargs, -): - - # ---------------------------------------- Prepare ---------------------------------------- - select_qpos_traj = [] - ee_state_list_select = [] - - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - - # ---------------------------------------- Pose ---------------------------------------- - # Resolve target object - obj_uids = env.sim.get_rigid_object_uid_list() - if obj_name in obj_uids: - target_obj = env.sim.get_rigid_object(obj_name) - else: - log_error("No matched object.") - - # Get object base pose (4x4 matrix) - target_obj_pose = target_obj.get_local_pose(to_matrix=True).squeeze(0) - - # Construct target pose (preserve orientation) - move_target_pose = deepcopy(select_arm_current_pose) - move_target_pose[:3, 3] = target_obj_pose[:3, 3] - move_target_pose[0, 3] += x_offset - move_target_pose[1, 3] += y_offset - move_target_pose[2, 3] += z_offset - - # Solve IK for target pose - move_target_qpos = get_qpos( - env, - is_left, - select_arm, - move_target_pose, - select_arm_current_qpos, - force_valid=force_valid, - name="move relative to object", - ) - - # Update env states - env.set_current_qpos_agent(move_target_qpos, is_left=is_left) - env.set_current_xpos_agent(move_target_pose, 
is_left=is_left) - - # ------------------------------------ Traj 1: init → target ------------------------------------ - qpos_list_init_to_target = [select_arm_current_qpos, move_target_qpos] - sample_num = kwargs.get("sample_num", 30) - - plan_trajectory( - env, - select_arm, - qpos_list_init_to_target, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ---------------------------------------- Final ---------------------------------------- - actions = finalize_actions(select_qpos_traj, ee_state_list_select) - - log_info( - f"Total generated trajectory number for move relative to object: {len(actions)}.", - color="green", - ) - - return actions - - -def move_to_absolute_position( - robot_name: str, - x: float = None, - y: float = None, - z: float = None, - env=None, - force_valid=False, - **kwargs, -): - - # ---------------------------------------- Prepare ---------------------------------------- - select_qpos_traj = [] - ee_state_list_select = [] - - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - - # ---------------------------------------- Pose ---------------------------------------- - # Start from current pose, then selectively update xyz - move_pose = deepcopy(select_arm_current_pose) - - current_xyz = move_pose[:3, 3].clone() - - target_xyz = current_xyz.clone() - if x is not None: - target_xyz[0] = x - if y is not None: - target_xyz[1] = y - if z is not None: - target_xyz[2] = z - - move_pose[:3, 3] = target_xyz - - # Try IK on target pose - move_qpos = get_qpos( - env, - is_left, - select_arm, - move_pose, - select_arm_current_qpos, - force_valid=force_valid, - name="move to absolute position", - ) - - # Update env states - env.set_current_qpos_agent(move_qpos, is_left=is_left) - env.set_current_xpos_agent(move_pose, is_left=is_left) - - # ------------------------------------ Traj: init → target 
------------------------------------ - qpos_list_init_to_move = [select_arm_current_qpos, move_qpos] - sample_num = kwargs.get("sample_num", 30) - - plan_trajectory( - env, - select_arm, - qpos_list_init_to_move, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ---------------------------------------- Final ---------------------------------------- - actions = finalize_actions(select_qpos_traj, ee_state_list_select) - - log_info( - f"Total generated trajectory number for move to absolute position: {len(actions)}.", - color="green", - ) - - return actions - - -def move_by_relative_offset( - robot_name: str, - dx: float = 0.0, - dy: float = 0.0, - dz: float = 0.0, - mode: str = "extrinsic", - env=None, - force_valid=False, - **kwargs, -): - - # ---------------------------------------- Prepare ---------------------------------------- - select_qpos_traj = [] - ee_state_list_select = [] - - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - - # ---------------------------------------- Pose ---------------------------------------- - move_pose = deepcopy(select_arm_current_pose) - - # Apply relative offsets (dx, dy, dz always floats) - move_pose = get_offset_pose(move_pose, dx, "x", mode) - move_pose = get_offset_pose(move_pose, dy, "y", mode) - move_pose = get_offset_pose(move_pose, dz, "z", mode) - - # Solve IK - move_qpos = get_qpos( - env, - is_left, - select_arm, - move_pose, - select_arm_current_qpos, - force_valid=force_valid, - name="move by relative offset", - ) - - # Update environment states - env.set_current_qpos_agent(move_qpos, is_left=is_left) - env.set_current_xpos_agent(move_pose, is_left=is_left) - - # ------------------------------------ Traj: init → target ------------------------------------ - qpos_list_init_to_move = [select_arm_current_qpos, move_qpos] - sample_num = kwargs.get("sample_num", 
20) - - plan_trajectory( - env, - select_arm, - qpos_list_init_to_move, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ---------------------------------------- Final ---------------------------------------- - actions = finalize_actions(select_qpos_traj, ee_state_list_select) - - log_info( - f"Total generated trajectory number for move by relative offset: {len(actions)}.", - color="green", - ) - - return actions - - -def back_to_initial_pose(robot_name: str, env=None, **kwargs): - - # ---------------------------------------- Prepare ---------------------------------------- - select_qpos_traj = [] - ee_state_list_select = [] - - # Get arm states - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - - # Retrieve the initial joint configuration of this arm - target_qpos = env.left_arm_init_qpos if is_left else env.right_arm_init_qpos - target_qpos = torch.as_tensor(target_qpos, dtype=select_arm_current_qpos.dtype) - - # ---------------------------------------- Pose ---------------------------------------- - # Pre-back pose: move along tool z by a small offset (use intrinsic frame) - pre_back_pose = deepcopy(select_arm_current_pose) - pre_back_pose = get_offset_pose(pre_back_pose, -0.08, "z", "intrinsic") - - # IK for pre-back - pre_back_qpos = get_qpos( - env, - is_left, - select_arm, - pre_back_pose, - select_arm_current_qpos, - force_valid=kwargs.get("force_valid", False), - name="pre back pose", - ) - - # Update env states (move to target pose) - target_pose = env.get_arm_fk(qpos=target_qpos, is_left=is_left) - env.set_current_qpos_agent(target_qpos, is_left=is_left) - env.set_current_xpos_agent(target_pose, is_left=is_left) - - # ------------------------------------ Traj: init → pre back_pose ------------------------------------ - qpos_list_init_to_preback = [select_arm_current_qpos, pre_back_qpos] - 
sample_num = 20 - - plan_trajectory( - env, - select_arm, - qpos_list_init_to_preback, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ------------------------------------ Traj: init → initial_pose ------------------------------------ - qpos_list_preback_to_target = [pre_back_qpos, target_qpos] - sample_num = kwargs.get("sample_num", 30) - - plan_trajectory( - env, - select_arm, - qpos_list_preback_to_target, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ---------------------------------------- Final ---------------------------------------- - actions = finalize_actions(select_qpos_traj, ee_state_list_select) - - log_info( - f"Total generated trajectory number for back to initial pose: {len(actions)}.", - color="green", - ) - - return actions - - -def rotate_eef(robot_name: str, degree: float = 0, env=None, **kwargs): - - # ---------------------------------------- Prepare ---------------------------------------- - select_qpos_traj = [] - ee_state_list_select = [] - - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - - # ---------------------------------------- Pose ---------------------------------------- - # Compute new joint positions - rotated_qpos = deepcopy(select_arm_current_qpos) - rotated_qpos[5] += np.deg2rad(degree) - - # Optional: limit checking (commented out by default) - # joint5_limit = env.get_joint_limits(select_arm)[5] - # if rotated_qpos[5] < joint5_limit[0] or rotated_qpos[5] > joint5_limit[1]: - # log_warning("Rotated qpos exceeds joint limits.\n") - - # Compute FK for new pose - rotated_pose = env.get_arm_fk( - qpos=rotated_qpos, - is_left=is_left, - ) - - # Update environment state - env.set_current_qpos_agent(rotated_qpos, is_left=is_left) - env.set_current_xpos_agent(rotated_pose, is_left=is_left) - - # 
------------------------------------ Traj 1: init → rotated ------------------------------------ - qpos_list_init_to_rotated = [select_arm_current_qpos, rotated_qpos] - sample_num = kwargs.get("sample_num", 20) - - plan_trajectory( - env, - select_arm, - qpos_list_init_to_rotated, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ---------------------------------------- Final ---------------------------------------- - actions = finalize_actions(select_qpos_traj, ee_state_list_select) - - log_info( - f"Total generated trajectory number for rotate eef: {len(actions)}.", - color="green", - ) - - return actions - - -def orient_eef( - robot_name: str, - direction: str = "front", # 'front' or 'down' - env=None, - force_valid=False, - **kwargs, -): - - # ---------------------------------------- Prepare ---------------------------------------- - select_qpos_traj = [] - ee_state_list_select = [] - - # Get arm state - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - - # ---------------------------------------- Pose ---------------------------------------- - # Generate replacement rotation matrix - replaced_rotation_matrix = np.eye(4) - if direction == "front": - rotation_matrix = R.from_euler("xyz", [180, -90, 0], degrees=True).as_matrix() - replaced_rotation_matrix[:3, :3] = ( - rotation_matrix @ replaced_rotation_matrix[:3, :3] - ) - elif direction == "down": - rotation_matrix = R.from_euler("x", 180, degrees=True).as_matrix() - replaced_rotation_matrix[:3, :3] = ( - rotation_matrix @ replaced_rotation_matrix[:3, :3] - ) - else: - log_error("Rotation direction must be 'front' or 'down'.") - - rotation_replaced_pose = deepcopy(select_arm_current_pose) - rot_torch = torch.as_tensor( - replaced_rotation_matrix[:3, :3], - dtype=rotation_replaced_pose.dtype, - device=rotation_replaced_pose.device, - ) - 
rotation_replaced_pose[:3, :3] = rot_torch - - # Solve IK for the new pose - replace_target_qpos = get_qpos( - env, - is_left, - select_arm, - rotation_replaced_pose, - select_arm_current_qpos, - force_valid=force_valid, - name="replaced-rotation", - ) - - # ---------------------------------------- Update env ---------------------------------------- - env.set_current_qpos_agent(replace_target_qpos, is_left=is_left) - env.set_current_xpos_agent(rotation_replaced_pose, is_left=is_left) - - # ------------------------------------ Traj: init → target ------------------------------------ - qpos_list_init_to_rotated = [select_arm_current_qpos, replace_target_qpos] - sample_num = kwargs.get("sample_num", 20) - - plan_trajectory( - env, - select_arm, - qpos_list_init_to_rotated, - sample_num, - select_arm_current_gripper_state, - select_qpos_traj, - ee_state_list_select, - ) - - # ---------------------------------------- Final ---------------------------------------- - actions = finalize_actions(select_qpos_traj, ee_state_list_select) - - log_info( - f"Total generated trajectory number for orient eef: {len(actions)}.", - color="green", - ) - - return actions - - -def close_gripper(robot_name: str, env=None, **kwargs): - - # ---------------------------------------- Prepare ---------------------------------------- - select_qpos_traj = [] - ee_state_list_select = [] - - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - - # ---------------------------------------- Traj ---------------------------------------- - sample_num = kwargs.get("sample_num", 15) - execute_open = False # False → closing motion - - plan_gripper_trajectory( - env, - is_left, - sample_num, - execute_open, - select_arm_current_qpos, - select_qpos_traj, - ee_state_list_select, - ) - - # ---------------------------------------- Final ---------------------------------------- - actions = 
finalize_actions(select_qpos_traj, ee_state_list_select) - - log_info( - f"Total generated trajectory number for close gripper: {len(actions)}.", - color="green", - ) - - return actions - - -def open_gripper(robot_name: str, env=None, **kwargs): - - # ---------------------------------------- Prepare ---------------------------------------- - select_qpos_traj = [] - ee_state_list_select = [] - - ( - is_left, - select_arm, - select_arm_current_qpos, - select_arm_current_pose, - select_arm_current_gripper_state, - ) = get_arm_states(env, robot_name) - - # ---------------------------------------- Traj ---------------------------------------- - sample_num = kwargs.get("sample_num", 15) - execute_open = True # True → opening motion - - plan_gripper_trajectory( - env, - is_left, - sample_num, - execute_open, - select_arm_current_qpos, - select_qpos_traj, - ee_state_list_select, - ) - - # ---------------------------------------- Final ---------------------------------------- - actions = finalize_actions(select_qpos_traj, ee_state_list_select) - - log_info( - f"Total generated trajectory number for open gripper: {len(actions)}.", - color="green", - ) - - return actions - - -def drive( - left_arm_action=None, - right_arm_action=None, - env=None, - **kwargs, -): - - if left_arm_action is not None and right_arm_action is not None: - len_left = len(left_arm_action) - len_right = len(right_arm_action) - - if len_left < len_right: - diff = len_right - len_left - padding = np.repeat(left_arm_action[-1:], diff, axis=0) - left_arm_action = np.concatenate([left_arm_action, padding], axis=0) - elif len_right < len_left: - diff = len_left - len_right - padding = np.repeat(right_arm_action[-1:], diff, axis=0) - right_arm_action = np.concatenate([right_arm_action, padding], axis=0) - - left_arm_index = env.left_arm_joints + env.left_eef_joints - right_arm_index = env.right_arm_joints + env.right_eef_joints - actions = np.zeros((len(right_arm_action), len(env.init_qpos))) - actions[:, 
left_arm_index] = left_arm_action - actions[:, right_arm_index] = right_arm_action - - elif left_arm_action is None and right_arm_action is not None: - left_arm_index = env.left_arm_joints + env.left_eef_joints - right_arm_index = env.right_arm_joints + env.right_eef_joints - left_arm_action = finalize_actions( - env.left_arm_current_qpos, env.left_arm_current_gripper_state - ) - left_arm_action = np.repeat( - left_arm_action[None, :], len(right_arm_action), axis=0 - ) - - actions = np.zeros( - (len(right_arm_action), len(env.robot.get_qpos().squeeze(0))), - dtype=np.float32, - ) - actions[:, left_arm_index] = left_arm_action - actions[:, right_arm_index] = right_arm_action - - elif right_arm_action is None and left_arm_action is not None: - left_arm_index = env.left_arm_joints + env.left_eef_joints - right_arm_index = env.right_arm_joints + env.right_eef_joints - right_arm_action = finalize_actions( - env.right_arm_current_qpos, env.right_arm_current_gripper_state - ) - right_arm_action = np.repeat( - right_arm_action[None, :], len(left_arm_action), axis=0 - ) - - actions = np.zeros( - (len(left_arm_action), len(env.robot.get_qpos().squeeze(0))), - dtype=np.float32, - ) - actions[:, left_arm_index] = left_arm_action - actions[:, right_arm_index] = right_arm_action - - else: - log_error("At least one arm action should be provided.") - - actions = torch.from_numpy(actions).to(dtype=torch.float32).unsqueeze(1) - actions = list(actions.unbind(dim=0)) - for i in tqdm(range(len(actions))): - action = actions[i] - env.step(action) - return actions - - -def save_observations( - step_id: int = 0, - step_name: str = None, - env=None, - **kwargs, -): - # When using feedback script - log_dir = kwargs.get("log_dir") - if log_dir: - save_dir = log_dir / "camera_images" - - # Prepare subfolder: {id}_generate_num/episode{current_check_num} - gen_id = kwargs.get("id", "unknown_id") - episode_id = kwargs.get("current_check_num", 0) - - sub_dir = save_dir / f"{gen_id}_generate_num" 
/ f"episode{episode_id}" - sub_dir.mkdir(parents=True, exist_ok=True) - - # Encode image to Base64 - base64_image = encode_image(env.get_obs_for_agent()["rgb"]) - - # Decode Base64 back to raw image bytes - import base64 - - img_bytes = base64.b64decode(base64_image) - - # Ensure step_name is not None - step_name = step_name if step_name is not None else "unnamed_step" - - # Save the decoded image - output_path = sub_dir / f"step{step_id}_{step_name}.png" - with open(output_path, "wb") as f: - f.write(img_bytes) - - # Print save info - log_info(f"[save_observations] Saved image to: {output_path}") - - # When only running the script (no feedback script) - else: - pass