From a9534610c842eb8b4fb9c69ab6aaff1222642cee Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Sun, 14 Jun 2026 12:35:56 +0300 Subject: [PATCH] fix(update): honor yaml source_root so update stops mass-deleting the index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_update passed the discovered config dir as an explicit source_root to resolve_operator_config, routing it into the branch that SKIPS the YAML source_root field. With the config living in a subdir and declaring `source_root: ../`, update then indexed that subdir (no Java) against the real index one level up, so cocoindex treated every indexed file as removed and deleted it — the "Updating index (Lance + graph)..." hang, and the ever-growing Lance `_deletions` plus the 1000s+ increment after a ctrl+C left cocoindex.db mid-reconcile. This is the same bug class that #316 fixed for the MCP server (its docstring warns that a non-None source_root skips the YAML field); run_update was the last production caller still passing a discovered dir. Pass source_root=None so the YAML source_root is honored exactly like increment/init/reprocess. run_install is unaffected (it passes the user-confirmed Java root). Adds a regression test mirroring the reported layout (config in my-project-context/, source_root: ../, real index one level up) that captures the env handed to cocoindex and asserts SOURCE_ROOT resolves to the YAML root, not the config dir. No schema, ontology, embedding, or env-var change. Existing indexes remain valid; no reindex required. 
Co-Authored-By: Claude --- java_codebase_rag/installer.py | 9 +++- tests/test_installer.py | 96 ++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 2 deletions(-) diff --git a/java_codebase_rag/installer.py b/java_codebase_rag/installer.py index 9c9ac8d..0dfcdfd 100644 --- a/java_codebase_rag/installer.py +++ b/java_codebase_rag/installer.py @@ -1250,9 +1250,14 @@ def run_update( print("Skipping index update.") return EXIT_PARTIAL if has_artifact_failures else EXIT_SUCCESS - # Resolve configuration + # Resolve configuration. Pass source_root=None so the YAML ``source_root`` + # field is honored exactly like increment/init/reprocess — passing the + # discovered config dir here routes resolve_operator_config into the + # explicit-override branch that SKIPS the YAML field, which made `update` + # point cocoindex at the config dir (no Java) against the real index and + # mass-delete it. Discovery still runs against the CLI's cwd. try: - cfg = resolve_operator_config(source_root=project_root, cli_index_dir=None) + cfg = resolve_operator_config(source_root=None, cli_index_dir=None) index_dir = cfg.index_dir except Exception as e: print(f"\nWarning: Failed to resolve configuration: {e}") diff --git a/tests/test_installer.py b/tests/test_installer.py index ed7dd05..4d45e43 100644 --- a/tests/test_installer.py +++ b/tests/test_installer.py @@ -1161,6 +1161,102 @@ def test_update_no_index_skips_increment(self, tmp_path, monkeypatch): # Should succeed (no hosts is fatal, but no index is just a warning) assert result == 0 + def test_update_honors_yaml_source_root_for_nested_config_dir( + self, tmp_path, monkeypatch + ): + """run_update must resolve source_root exactly like increment. + + Regression for the "update mass-deletes the index" bug. run_update passed + the discovered config dir as an explicit source_root, routing + resolve_operator_config into the branch that SKIPS the YAML source_root + field. 
With a config living in my-project-context/ next to + ``source_root: ../``, update then indexed my-project-context/ (no Java) + against the real index one level up — so cocoindex saw every indexed + file as removed and deleted it (the "_deletions keeps growing" symptom + after the run was ctrl+C'd mid-delete). + + After the fix, the env handed to cocoindex carries the YAML-resolved + source_root (one level above the config dir), NOT the config dir itself. + """ + import json + import shutil + from subprocess import CompletedProcess + from java_codebase_rag.installer import run_update + + # Layout mirroring the reported bug: + # tmp_path/ + # my-project-context/ <- cwd; config lives here + # .java-codebase-rag.yml <- source_root: ../ ; index_dir: ../.java-codebase-rag + # .java-codebase-rag/ <- real index, one level above the config + # code_graph.lbug <- marker so "index exists" + config_dir = tmp_path / "my-project-context" + config_dir.mkdir() + (config_dir / ".java-codebase-rag.yml").write_text( + "source_root: ../\nindex_dir: ../.java-codebase-rag\n", + encoding="utf-8", + ) + index_dir = tmp_path / ".java-codebase-rag" + index_dir.mkdir() + (index_dir / "code_graph.lbug").write_text("", encoding="utf-8") + + # A configured host so run_update reaches the index phase. + (config_dir / ".mcp.json").write_text( + json.dumps( + { + "mcpServers": { + "java-codebase-rag": { + "command": "/usr/local/bin/java-codebase-rag-mcp", + "type": "stdio", + } + } + } + ) + ) + monkeypatch.setattr(shutil, "which", lambda x: "/usr/local/bin/java-codebase-rag-mcp") + monkeypatch.setattr( + "java_codebase_rag.installer._read_package_artifact", + lambda path: "PACKAGE CONTENT", + ) + + # The CLI invokes update from the config dir, so the process cwd is the + # config dir — resolve_operator_config(source_root=None) discovers the + # config via Path.cwd(), exactly as increment/init/reprocess do. 
+ # delenv: resolve_operator_config honors JAVA_CODEBASE_RAG_SOURCE_ROOT / + # _INDEX_DIR from os.environ first, and apply_to_os_environ() writes them + # unscoped — a sibling test can leak a value that overrides discovery. + monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False) + monkeypatch.delenv("JAVA_CODEBASE_RAG_INDEX_DIR", raising=False) + monkeypatch.chdir(config_dir) + + # Capture the subprocess env run_update hands cocoindex: it carries the + # resolved JAVA_CODEBASE_RAG_SOURCE_ROOT / _INDEX_DIR. + captured: dict = {} + + def capture_coco(env, *, full_reprocess, quiet, verbose=True, lance_project_root=None): + captured["env"] = env + return CompletedProcess(["cocoindex"], 0) + + def noop_graph(**kwargs): + return CompletedProcess(["build_ast_graph", "--incremental"], 0) + + monkeypatch.setattr("java_codebase_rag.pipeline.run_cocoindex_update", capture_coco) + monkeypatch.setattr("java_codebase_rag.pipeline.run_incremental_graph", noop_graph) + + result = run_update(force=False, dry_run=False, cwd=config_dir) + + # The index phase must have run (env captured), not been skipped. + assert "env" in captured, "run_update did not reach the cocoindex update step" + env = captured["env"] + # source_root: ../ must resolve ONE level above the config dir (the real + # Java tree), NOT the config dir itself. + assert env["JAVA_CODEBASE_RAG_SOURCE_ROOT"] == str(tmp_path.resolve()) + assert env["JAVA_CODEBASE_RAG_SOURCE_ROOT"] != str(config_dir.resolve()) + # index_dir lands on the real index one level above the config dir. + assert env["JAVA_CODEBASE_RAG_INDEX_DIR"] == str(index_dir.resolve()) + # result is independent of the source_root assertion (artifact refresh + # may report partial failure unrelated to this regression); tolerate it. + assert result in (0, 1) + def test_install_then_update_cycle(self, tmp_path, monkeypatch): """install then update: artifacts refreshed, no errors""" from java_codebase_rag.installer import run_install, run_update