From 838d00bdd5cf7a3619c36e3622575488edf31741 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Mon, 15 Jun 2026 22:51:37 +0300 Subject: [PATCH 1/5] Bump version to 0.6.3 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 449b523..ea4d579 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "java-codebase-rag" -version = "0.6.2" +version = "0.6.3" description = "MCP server for semantic + structural search over Java codebases" readme = "README.md" requires-python = ">=3.11" From dcba1413116b925ec542a89805d7ea812f8cf946 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Mon, 15 Jun 2026 23:44:39 +0300 Subject: [PATCH 2/5] =?UTF-8?q?perf(path-filter):=20drop=20O(n=C2=B2)=20wi?= =?UTF-8?q?nning-row=20from=20is=5Fignored=20hot=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit is_ignored returned (bool, IgnoreLayer) and computed _winning_row (one GitIgnoreSpec rebuild per ignore-rule prefix) on every call. All 7 production callers (iter_java_source_files + java_index_flow_lancedb) discarded the layer; only diagnose-ignore needs source attribution, and diagnose_dict already computes it. is_ignored now returns a plain bool; the two test assertions that checked the layer's source migrate to diagnose_dict. On a repo with ~100 ignore rules this cuts ~5000 spec compilations per indexed file down to one. 
Co-Authored-By: Claude --- java_index_flow_lancedb.py | 12 ++++++------ path_filtering.py | 31 ++++++++++++++----------------- tests/test_path_filtering.py | 28 ++++++++++++++-------------- 3 files changed, 34 insertions(+), 37 deletions(-) diff --git a/java_index_flow_lancedb.py b/java_index_flow_lancedb.py index 10425a2..0f6edac 100644 --- a/java_index_flow_lancedb.py +++ b/java_index_flow_lancedb.py @@ -198,12 +198,12 @@ def _excluded(rel_posix: str) -> bool: continue # Java: **/*.java if fn.endswith(".java"): - if not ignore.is_ignored(full)[0]: + if not ignore.is_ignored(full): total += 1 continue # SQL: **/src/main/resources/db/migration/*.sql if fn.endswith(".sql") and "/db/migration/" in rel: - if not ignore.is_ignored(full)[0]: + if not ignore.is_ignored(full): total += 1 continue # YAML: **/src/main/resources/application*.yml / .yaml @@ -214,7 +214,7 @@ def _excluded(rel_posix: str) -> bool: # total below the actual done count. The ``rel``-based # ``"/src/main/resources/"`` gate stays (full path component). 
if fn.endswith((".yml", ".yaml")) and fn.startswith("application") and "/src/main/resources/" in rel: - if not ignore.is_ignored(full)[0]: + if not ignore.is_ignored(full): total += 1 return total @@ -313,7 +313,7 @@ async def process_java_file( ) -> None: embedder = coco.use_context(EMBEDDER) project_root = coco.use_context(PROJECT_ROOT) - if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve())[0]: + if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve()): return try: content = await file.read_text() @@ -379,7 +379,7 @@ async def process_sql_file( ) -> None: embedder = coco.use_context(EMBEDDER) project_root = coco.use_context(PROJECT_ROOT) - if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve())[0]: + if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve()): return try: content = await file.read_text() @@ -425,7 +425,7 @@ async def process_yaml_file( ) -> None: embedder = coco.use_context(EMBEDDER) project_root = coco.use_context(PROJECT_ROOT) - if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve())[0]: + if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve()): return try: content = await file.read_text() diff --git a/path_filtering.py b/path_filtering.py index 6275f17..4ff36db 100644 --- a/path_filtering.py +++ b/path_filtering.py @@ -342,24 +342,22 @@ def _mega(self, rel_project: str) -> tuple[list[str], GitIgnoreSpec, list[tuple[ ) return mega, GitIgnoreSpec.from_lines(mega), meta - def is_ignored(self, path: Path) -> tuple[bool, IgnoreLayer | None]: - """Return whether ``path`` is ignored and which layer last matched.""" + def is_ignored(self, path: Path) -> bool: + """Return whether ``path`` is ignored by any configured layer. + + Boolean-only fast path for the per-file index walk. 
It deliberately does + not compute *which* layer/source last matched: that attribution is + O(rules²) via :func:`_winning_row` (one ``GitIgnoreSpec`` rebuild per + rule prefix) and is only needed for ``diagnose-ignore``, so it lives in + :meth:`diagnose_dict` and is never paid on the hot path. + """ rel = self._rel_project(path) if rel is None: - return False, None - mega, spec, meta = self._mega(rel) + return False + mega, spec, _ = self._mega(rel) if not mega: - return False, None - ignored = spec.match_file(rel) - if not ignored: - return False, None - src, fp, ln, _pat = _winning_row(rel, mega, meta) - return True, IgnoreLayer( - root=self.project_root, - spec=spec, - source=src, - ignore_file=fp, - ) + return False + return spec.match_file(rel) def diagnose(self, path: Path) -> str: """Human-readable, multi-line explanation of the ignore decision.""" @@ -466,7 +464,6 @@ def iter_java_source_files( if not fn.endswith(".java"): continue p = Path(dirpath) / fn - ign, _ = ignore_ctx.is_ignored(p) - if ign: + if ignore_ctx.is_ignored(p): continue yield p diff --git a/tests/test_path_filtering.py b/tests/test_path_filtering.py index c33f695..78e40f3 100644 --- a/tests/test_path_filtering.py +++ b/tests/test_path_filtering.py @@ -55,10 +55,9 @@ def test_39_builtin_default_ignores_class_file(tmp_path: Path) -> None: f = root / "Foo.class" f.write_text("", encoding="utf-8") li = LayeredIgnore(root, use_gitignore=False) - ign, layer = li.is_ignored(f) - assert ign is True - assert layer is not None - assert layer.source == "builtin_default" + assert li.is_ignored(f) is True + d = li.diagnose_dict(f) + assert d["layer"] == "builtin_default" def test_40_project_root_negation_unignores(tmp_path: Path) -> None: @@ -70,7 +69,7 @@ def test_40_project_root_negation_unignores(tmp_path: Path) -> None: f = root / "Foo.class" f.write_text("", encoding="utf-8") li = LayeredIgnore(root, use_gitignore=False) - assert li.is_ignored(f)[0] is False + assert li.is_ignored(f) is False def 
test_41_nested_ignore_only_under_subtree(tmp_path: Path) -> None: @@ -84,8 +83,8 @@ def test_41_nested_ignore_only_under_subtree(tmp_path: Path) -> None: sibling.parent.mkdir(parents=True) sibling.write_text("class GeneratedBar {}\n", encoding="utf-8") li = LayeredIgnore(root, use_gitignore=False) - assert li.is_ignored(hit)[0] is True - assert li.is_ignored(sibling)[0] is False + assert li.is_ignored(hit) is True + assert li.is_ignored(sibling) is False def test_42_innermost_nested_reincludes(tmp_path: Path) -> None: @@ -100,7 +99,7 @@ def test_42_innermost_nested_reincludes(tmp_path: Path) -> None: f.parent.mkdir(parents=True, exist_ok=True) f.write_text("class GeneratedX {}\n", encoding="utf-8") li = LayeredIgnore(root, use_gitignore=False) - assert li.is_ignored(f)[0] is False + assert li.is_ignored(f) is False def test_43_gitignore_layer(tmp_path: Path) -> None: @@ -111,9 +110,10 @@ def test_43_gitignore_layer(tmp_path: Path) -> None: f.parent.mkdir(parents=True) f.write_text("class X {}\n", encoding="utf-8") li_on = LayeredIgnore(root, use_gitignore=True) - assert li_on.is_ignored(f)[0] is True - assert li_on.is_ignored(f)[1] is not None - assert li_on.is_ignored(f)[1].source == "gitignore" + assert li_on.is_ignored(f) is True + d = li_on.diagnose_dict(f) + assert d["ignored"] is True + assert d["layer"] == "gitignore" def test_44_gitignore_disabled(tmp_path: Path) -> None: @@ -124,7 +124,7 @@ def test_44_gitignore_disabled(tmp_path: Path) -> None: f.parent.mkdir(parents=True) f.write_text("class X {}\n", encoding="utf-8") li = LayeredIgnore(root, use_gitignore=False) - assert li.is_ignored(f)[0] is False + assert li.is_ignored(f) is False def test_45_diagnose_nested_cites_line(tmp_path: Path) -> None: @@ -151,7 +151,7 @@ def test_46_outside_project_not_ignored(tmp_path: Path) -> None: outside = tmp_path / "outside" / "Foo.java" outside.parent.mkdir(parents=True) outside.write_text("class Foo {}\n", encoding="utf-8") - assert li.is_ignored(outside) == (False, 
None) + assert li.is_ignored(outside) is False def test_bank_chat_java_count_no_lancedb_ignore_gitignore_off_matches_legacy( @@ -195,7 +195,7 @@ def test_out_as_java_package_dir_is_walked_when_no_build_indicator_sibling( li = LayeredIgnore(root, use_gitignore=False) files = list(iter_java_source_files(root, ignore=li)) assert f in files - ign, _ = li.is_ignored(f) + ign = li.is_ignored(f) assert ign is False From d1a15ede185507fbc375556834dcdda23521695f Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Mon, 15 Jun 2026 23:51:24 +0300 Subject: [PATCH 3/5] refactor: collapse mechanical duplication across four modules - ast_java: four byte-identical _codebase_*_inner_annotation_nodes walkers -> one _inner_annotation_nodes(node, src, target_simple). - graph_enrich: three identical _route/_client/_async _hint_lookup helpers -> one generic _hint_lookup (TypeVar) used at all three call sites. - ladybug_queries: find_callers/find_callees (~50-line near-twins) -> one _walk_calls helper; the two public methods keep their signatures and delegate, differing only in call-graph orientation. - pr_analysis: drop the dead 'notes' local in compute_risk (never appended; the real notes are assembled by analyze_pr_pipeline and merged there). meta()'s 5-level query cascade in ladybug_queries is intentionally left in place: it is reachable via direct LadybugGraph(...) construction that bypasses get()'s ontology gate, and test_feign_not_exoser relies on its fallback, so it is not cleanly dead right after the Ladybug migration. 
Co-Authored-By: Claude --- ast_java.py | 64 ++++++------------------- graph_enrich.py | 48 +++++++---------------- ladybug_queries.py | 96 ++++++++++++++++++++++------------------------ pr_analysis.py | 3 +- 4 files changed, 70 insertions(+), 141 deletions(-) diff --git a/ast_java.py b/ast_java.py index 5922641..f6bf063 100644 --- a/ast_java.py +++ b/ast_java.py @@ -1565,62 +1565,20 @@ def _parse_codebase_http_route_inner_annotation( return out -def _codebase_route_inner_annotation_nodes(container_ann: Node, src: bytes) -> list[Node]: - found: list[Node] = [] - - def visit(n: Node) -> None: - if n.type == "annotation": - name_node = n.child_by_field_name("name") - n_simple = _txt(name_node, src).rsplit(".", 1)[-1] if name_node is not None else "" - if n_simple == "CodebaseHttpRoute": - found.append(n) - for c in n.children: - visit(c) - - visit(container_ann) - return found - - -def _codebase_async_route_inner_annotation_nodes(container_ann: Node, src: bytes) -> list[Node]: - found: list[Node] = [] - - def visit(n: Node) -> None: - if n.type == "annotation": - name_node = n.child_by_field_name("name") - n_simple = _txt(name_node, src).rsplit(".", 1)[-1] if name_node is not None else "" - if n_simple == "CodebaseAsyncRoute": - found.append(n) - for c in n.children: - visit(c) - - visit(container_ann) - return found - +def _inner_annotation_nodes(container_ann: Node, src: bytes, target_simple: str) -> list[Node]: + """Collect nested ``@<target_simple>`` annotations anywhere under ``container_ann``. 
-def _codebase_http_client_inner_annotation_nodes(container_ann: Node, src: bytes) -> list[Node]: - found: list[Node] = [] - - def visit(n: Node) -> None: - if n.type == "annotation": - name_node = n.child_by_field_name("name") - n_simple = _txt(name_node, src).rsplit(".", 1)[-1] if name_node is not None else "" - if n_simple == "CodebaseHttpClient": - found.append(n) - for c in n.children: - visit(c) - - visit(container_ann) - return found - - -def _codebase_producer_inner_annotation_nodes(container_ann: Node, src: bytes) -> list[Node]: + Shared by the four brownfield container walkers — ``CodebaseHttpRoute``, + ``CodebaseAsyncRoute``, ``CodebaseHttpClient``, ``CodebaseProducer`` — which + differ only by the target annotation simple name. + """ found: list[Node] = [] def visit(n: Node) -> None: if n.type == "annotation": name_node = n.child_by_field_name("name") n_simple = _txt(name_node, src).rsplit(".", 1)[-1] if name_node is not None else "" - if n_simple == "CodebaseProducer": + if n_simple == target_simple: found.append(n) for c in n.children: visit(c) @@ -1842,7 +1800,7 @@ def _outgoing_calls_from_codebase_http_client_producer_annotations( ), ) elif simple == "CodebaseHttpClients": - for inner in _codebase_http_client_inner_annotation_nodes(ann, src): + for inner in _inner_annotation_nodes(ann, src, "CodebaseHttpClient"): out.append( _parse_codebase_http_client_annotation( inner, @@ -1869,7 +1827,7 @@ def _outgoing_calls_from_codebase_http_client_producer_annotations( ), ) elif simple == "CodebaseProducers": - for inner in _codebase_producer_inner_annotation_nodes(ann, src): + for inner in _inner_annotation_nodes(ann, src, "CodebaseProducer"): out.append( _parse_codebase_producer_annotation( inner, @@ -2343,7 +2301,7 @@ def _collect_routes( ), ) elif simple == "CodebaseHttpRoutes": - for inner in _codebase_route_inner_annotation_nodes(node, src): + for inner in _inner_annotation_nodes(node, src, "CodebaseHttpRoute"): routes.extend( 
_parse_codebase_http_route_inner_annotation( inner, @@ -2359,7 +2317,7 @@ def _collect_routes( elif simple in ("CodebaseAsyncRoute", "CodebaseAsyncRoutes"): nodes = [node] if simple == "CodebaseAsyncRoutes": - nodes = list(_codebase_async_route_inner_annotation_nodes(node, src)) + nodes = list(_inner_annotation_nodes(node, src, "CodebaseAsyncRoute")) for ann in nodes: pairs, _ = _annotation_kv_nodes(ann, src) topic_node = pairs.get("topic") diff --git a/graph_enrich.py b/graph_enrich.py index 02119b2..97e54e3 100644 --- a/graph_enrich.py +++ b/graph_enrich.py @@ -23,7 +23,7 @@ from dataclasses import dataclass, field, replace from functools import lru_cache from pathlib import Path -from typing import Any +from typing import Any, TypeVar from ast_java import ( AnnotationRef, JavaFileAst, @@ -820,7 +820,15 @@ def _route_path_atom(raw_value: str, value_kind: str | None) -> tuple[str, str, return "", "constant_ref", 0.7, False -def _route_hint_lookup(ann: AnnotationRef, hints: dict[str, RouteHint]) -> RouteHint | None: +_HINT = TypeVar("_HINT") + + +def _hint_lookup(ann: AnnotationRef, hints: dict[str, _HINT]) -> _HINT | None: + """Resolve a brownfield hint by qualified name, then simple name, then suffix. + + Shared by route / http-client / async-producer hint resolution; the three + former copies differed only in the hint value type. 
+ """ q = ann.qualified.strip() if q in hints: return hints[q] @@ -1118,7 +1126,7 @@ def resolve_routes_for_method( # ----- Step 2: Layer B — annotation route hints ----- for _is_m, ann in combined_anns: - hint = _route_hint_lookup(ann, overrides.annotation_to_route_hint) + hint = _hint_lookup(ann, overrides.annotation_to_route_hint) if hint is None: continue working.append( @@ -1172,36 +1180,6 @@ def resolve_routes_for_method( return working -def _client_hint_lookup( - ann: AnnotationRef, - hints: dict[str, HttpClientHint], -) -> HttpClientHint | None: - q = ann.qualified.strip() - if q in hints: - return hints[q] - if ann.name in hints: - return hints[ann.name] - for k, h in sorted(hints.items(), key=lambda kv: kv[0]): - if k.endswith("." + ann.name): - return h - return None - - -def _async_hint_lookup( - ann: AnnotationRef, - hints: dict[str, AsyncProducerHint], -) -> AsyncProducerHint | None: - q = ann.qualified.strip() - if q in hints: - return hints[q] - if ann.name in hints: - return hints[ann.name] - for k, h in sorted(hints.items(), key=lambda kv: kv[0]): - if k.endswith("." 
+ ann.name): - return h - return None - - def _call_from_http_hint( *, hint: HttpClientHint, @@ -1296,7 +1274,7 @@ def resolve_http_client_for_method( anchor = builtin_http[0] if builtin_http else (layer_c_src[0] if layer_c_src else None) for _is_m, ann in combined_anns: - hint = _client_hint_lookup(ann, overrides.annotation_to_http_client_hint) + hint = _hint_lookup(ann, overrides.annotation_to_http_client_hint) if hint is None: continue brownfield_calls.append( @@ -1388,7 +1366,7 @@ def resolve_async_producer_for_method( anchor = builtin_async[0] if builtin_async else (layer_c_src[0] if layer_c_src else None) for _is_m, ann in combined_anns: - hint = _async_hint_lookup(ann, overrides.annotation_to_async_producer_hint) + hint = _hint_lookup(ann, overrides.annotation_to_async_producer_hint) if hint is None: continue brownfield_calls.append( diff --git a/ladybug_queries.py b/ladybug_queries.py index 77ae32b..66204b0 100644 --- a/ladybug_queries.py +++ b/ladybug_queries.py @@ -1161,8 +1161,11 @@ def _method_ids_for_call_graph_needle(self, needle: str, *, limit: int) -> list[ ) return [str(r["id"]) for r in rows2 if r.get("id")] - def find_callers( - self, needle: str, *, + def _walk_calls( + self, + needle: str, + *, + side: str, depth: int = 1, limit: int = 100, min_confidence: float = 0.0, @@ -1170,6 +1173,17 @@ def find_callers( module: str | None = None, microservice: str | None = None, ) -> list[CallEdge]: + """BFS the CALLS graph outward from ``needle`` along one relationship end. + + ``side="callers"`` treats the needle as the callee: the frontier matches + the ``callee`` end and discovered/expanded/external-filtered nodes are the + ``caller`` (src) end. ``side="callees"`` is the mirror. The two public + methods differ only in that orientation, so the BFS body is shared here. 
+ """ + if side == "callers": + scope_alias, frontier_end, discovered = "caller", "callee", "src" + else: + scope_alias, frontier_end, discovered = "callee", "caller", "dst" frontier = self._method_ids_for_call_graph_needle(needle, limit=max(limit, 50)) if not frontier: return [] @@ -1182,8 +1196,8 @@ def find_callers( "frontier": list(frontier), "minc": float(min_confidence), } - sc = _scope_filters("caller", module=module, microservice=microservice, params=params) - wh_parts = ["callee.id IN $frontier", "c.confidence >= $minc"] + sc = _scope_filters(scope_alias, module=module, microservice=microservice, params=params) + wh_parts = [f"{frontier_end}.id IN $frontier", "c.confidence >= $minc"] wh_parts.extend(sc) wh = " AND ".join(wh_parts) q = ( @@ -1197,16 +1211,19 @@ def find_callers( next_frontier: list[str] = [] for row in self._rows(q, params): ce = _row_to_call_edge(row) - # Filter only discovered callers (src). Needle may be external - # (e.g. java.util.List#add) while still listing internal callers. - if exclude_external and _is_external_fqn(ce.src.fqn): + # The needle itself may be external (e.g. java.util.List#add); + # filter only the discovered end so internal callers/callees + # that touch it are still surfaced. 
+ disc_fqn = ce.src.fqn if discovered == "src" else ce.dst.fqn + disc_id = ce.src.id if discovered == "src" else ce.dst.id + if exclude_external and _is_external_fqn(disc_fqn): continue key = (ce.src.id, ce.dst.id, ce.call_site_line, ce.call_site_byte) if key in seen: continue seen.add(key) out.append(ce) - next_frontier.append(ce.src.id) + next_frontier.append(disc_id) if len(out) >= limit: return out frontier = list(dict.fromkeys(next_frontier)) @@ -1214,6 +1231,21 @@ def find_callers( break return out + def find_callers( + self, needle: str, *, + depth: int = 1, + limit: int = 100, + min_confidence: float = 0.0, + exclude_external: bool = True, + module: str | None = None, + microservice: str | None = None, + ) -> list[CallEdge]: + return self._walk_calls( + needle, side="callers", depth=depth, limit=limit, + min_confidence=min_confidence, exclude_external=exclude_external, + module=module, microservice=microservice, + ) + def find_callees( self, needle: str, *, depth: int = 1, @@ -1223,49 +1255,11 @@ def find_callees( module: str | None = None, microservice: str | None = None, ) -> list[CallEdge]: - frontier = self._method_ids_for_call_graph_needle(needle, limit=max(limit, 50)) - if not frontier: - return [] - caller_proj = ", ".join(f"caller.{c} AS caller_{c}" for c in _SYM_COLS) - callee_proj = ", ".join(f"callee.{c} AS callee_{c}" for c in _SYM_COLS) - out: list[CallEdge] = [] - seen: set[tuple[str, str, int, int]] = set() - for _ in range(max(1, int(depth))): - params: dict[str, Any] = { - "frontier": list(frontier), - "minc": float(min_confidence), - } - sc = _scope_filters("callee", module=module, microservice=microservice, params=params) - wh_parts = ["caller.id IN $frontier", "c.confidence >= $minc"] - wh_parts.extend(sc) - wh = " AND ".join(wh_parts) - q = ( - f"MATCH (caller:Symbol)-[c:CALLS]->(callee:Symbol) WHERE {wh} " - f"RETURN {caller_proj}, {callee_proj}, " - f"c.call_site_line AS call_site_line, c.call_site_byte AS call_site_byte, " - 
f"c.arg_count AS arg_count, c.confidence AS confidence, c.strategy AS strategy, " - f"c.source AS source, c.resolved AS resolved " - f"LIMIT {int(limit) * 8}" - ) - next_frontier: list[str] = [] - for row in self._rows(q, params): - ce = _row_to_call_edge(row) - # Filter only discovered callees (dst). Needle may be external while - # still listing non-external outbound calls when any exist. - if exclude_external and _is_external_fqn(ce.dst.fqn): - continue - key = (ce.src.id, ce.dst.id, ce.call_site_line, ce.call_site_byte) - if key in seen: - continue - seen.add(key) - out.append(ce) - next_frontier.append(ce.dst.id) - if len(out) >= limit: - return out - frontier = list(dict.fromkeys(next_frontier)) - if not frontier: - break - return out + return self._walk_calls( + needle, side="callees", depth=depth, limit=limit, + min_confidence=min_confidence, exclude_external=exclude_external, + module=module, microservice=microservice, + ) def expand_methods( self, fqns: list[str], *, depth: int = 1, diff --git a/pr_analysis.py b/pr_analysis.py index 874d24c..d33502d 100644 --- a/pr_analysis.py +++ b/pr_analysis.py @@ -384,7 +384,6 @@ def compute_risk(graph: Any, changed: list[ChangedSymbol]) -> PrRiskReport: bump (up to +1.0) after normalization so they influence rank while preserving the public scalar contract. 
""" - notes: list[str] = [] blast_by: dict[str, int] = {} blast_total = 0 routes: list[str] = [] @@ -495,7 +494,7 @@ def _normalize(x: float, ceiling: float) -> float: routes_touched=routes, risk_score=score, risk_band=band, - notes=notes, + notes=[], ) From 2794074b87324e1643de019f2beaf4f20818a8bc Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Mon, 15 Jun 2026 23:58:26 +0300 Subject: [PATCH 4/5] fix: latent footguns in mcp_v2 error branches and brownfield layer literals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mcp_v2: 15 error branches passed hints=[] to NeighborsOutput/DescribeOutput, neither of which has a hints field (their fields are advisories / hints_structured, both defaulting to []). pydantic silently dropped the kwarg; today harmless, but the moment anyone adds extra='forbid' to those models every error branch would raise ValidationError swallowed by the catch-all. The models already default to empty, so the dead kwargs are simply removed. build_ast_graph: the four brownfield layer names were spelled out four times (_client_source_layer, _producer_source_layer, brownfield_strategies, and _BROWNFIELD_LAYERS). Promote _BROWNFIELD_LAYERS to the single source of truth and define brownfield_strategies as _BROWNFIELD_LAYERS plus the two caller-side declaration strategies (codebase_client/codebase_producer). The two sets still differ deliberately: _BROWNFIELD_LAYERS gates brownfield_only authoritativeness (per edge), while brownfield_strategies counts annotation-declared callers in the *_from_brownfield_pct stats — now that relationship is explicit instead of two independent literals that looked unrelated. 
Co-Authored-By: Claude --- build_ast_graph.py | 42 +++++++++++++++++++++++------------------- mcp_v2.py | 22 +++++++--------------- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/build_ast_graph.py b/build_ast_graph.py index 6b2c377..403d790 100644 --- a/build_ast_graph.py +++ b/build_ast_graph.py @@ -2010,8 +2010,21 @@ def _producer_id( return f"p:{hashlib.sha1(key.encode()).hexdigest()[:16]}" +# The four brownfield source layers — single source of truth. Consumed by the +# client/producer source-layer classifiers, the *_from_brownfield_pct stats +# (via brownfield_strategies), and the brownfield_only authoritativeness gate in +# _is_brownfield_sourced. codebase_client/codebase_producer are caller-side +# declaration strategies, not layers — they extend brownfield_strategies only. +_BROWNFIELD_LAYERS = frozenset({ + "layer_a_meta", + "layer_b_ann", + "layer_b_fqn", + "layer_c_source", +}) + + def _client_source_layer(strategy: str) -> str: - if strategy in {"layer_a_meta", "layer_b_ann", "layer_b_fqn", "layer_c_source"}: + if strategy in _BROWNFIELD_LAYERS: return strategy # Some caller extraction paths emit client kind as strategy; treat those # as builtin-source declarations instead of warning on every row. 
@@ -2023,7 +2036,7 @@ def _client_source_layer(strategy: str) -> str: def _producer_source_layer(strategy: str) -> str: - if strategy in {"layer_a_meta", "layer_b_ann", "layer_b_fqn", "layer_c_source"}: + if strategy in _BROWNFIELD_LAYERS: return strategy if strategy in VALID_PRODUCER_KINDS: return "builtin" @@ -2458,15 +2471,14 @@ def _phantom_async_route_id(call: OutgoingCallDecl) -> str: tables.producer_stats.producers_by_kind = defaultdict(int) for row in tables.producer_rows: tables.producer_stats.producers_by_kind[row.producer_kind] += 1 - brownfield_strategies = frozenset( - ( - "layer_b_ann", - "layer_a_meta", - "layer_c_source", - "layer_b_fqn", - "codebase_client", - "codebase_producer", - ), + # brownfield_strategies = the four brownfield layers plus the two + # caller-side declaration strategies (@CodebaseHttpClient / + # @CodebaseProducer). These extend _BROWNFIELD_LAYERS deliberately: + # the *_from_brownfield_pct stats count annotation-declared callers as + # brownfield-sourced even though they are not "layers" and so do not + # gate brownfield_only authoritativeness in _is_brownfield_sourced. 
+ brownfield_strategies = _BROWNFIELD_LAYERS | frozenset( + {"codebase_client", "codebase_producer"}, ) if tables.call_edge_stats.http_calls_total: n_http = sum( @@ -2568,14 +2580,6 @@ def _match_call_edge( return "cross_service", candidates -_BROWNFIELD_LAYERS = frozenset({ - "layer_c_source", - "layer_b_ann", - "layer_b_fqn", - "layer_a_meta", -}) - - def _is_brownfield_sourced( call_strategy: str, candidates: list[RouteRow], diff --git a/mcp_v2.py b/mcp_v2.py index 5b5087b..a863dbb 100644 --- a/mcp_v2.py +++ b/mcp_v2.py @@ -1095,9 +1095,9 @@ def describe_v2( has_id = bool(id and str(id).strip()) has_fqn = bool(fqn and str(fqn).strip()) if not has_id and not has_fqn: - return DescribeOutput(success=False, message="id or fqn required", hints=[]) + return DescribeOutput(success=False, message="id or fqn required") if has_id and str(id).strip().startswith("ucs:"): - return DescribeOutput(success=False, message=_DESCRIBE_UCS_ID_MESSAGE, hints=[]) + return DescribeOutput(success=False, message=_DESCRIBE_UCS_ID_MESSAGE) hint_message: str | None = None node_id: str if has_id: @@ -1109,7 +1109,7 @@ def describe_v2( {"fqn": fqn_val}, ) if not rows: - return DescribeOutput(success=False, message=f"No Symbol found for fqn='{fqn_val}'", hints=[]) + return DescribeOutput(success=False, message=f"No Symbol found for fqn='{fqn_val}'") node_id = str(rows[0]["id"] or "") if len(rows) > 1: hint_message = ( @@ -1784,7 +1784,7 @@ def neighbors_v2( ) except ValueError as exc: _log_fail_loud("edge_filter") - return NeighborsOutput(success=False, message=str(exc), hints=[], requested_edge_types=[]) + return NeighborsOutput(success=False, message=str(exc), requested_edge_types=[]) if include_unresolved and ef is not None: return NeighborsOutput( success=False, @@ -1792,21 +1792,18 @@ def neighbors_v2( "include_unresolved=True is incompatible with edge_filter; " "UnresolvedCallSite rows have no edge attributes to filter on" ), - hints=[], requested_edge_types=requested_edge_types, ) if 
include_unresolved and requested_edge_types != ["CALLS"]: return NeighborsOutput( success=False, message="include_unresolved requires edge_types=['CALLS']", - hints=[], requested_edge_types=requested_edge_types, ) if include_unresolved and direction != "out": return NeighborsOutput( success=False, message='include_unresolved requires direction="out"', - hints=[], requested_edge_types=requested_edge_types, ) if ef and (err := _edgefilter_applicability_error(requested_edge_types, ef)): @@ -1814,17 +1811,15 @@ def neighbors_v2( return NeighborsOutput( success=False, message=err, - hints=[], requested_edge_types=requested_edge_types, ) if nf and (err := _validate_no_wildcards(nf)): _log_fail_loud("wildcard") - return NeighborsOutput(success=False, message=err, hints=[], requested_edge_types=[]) + return NeighborsOutput(success=False, message=err, requested_edge_types=[]) if composed_keys and direction != "out": return NeighborsOutput( success=False, message='Composed edge types require direction="out"', - hints=[], requested_edge_types=requested_edge_types, ) use_calls_path = flat_labels == ["CALLS"] and not composed_keys @@ -1849,7 +1844,6 @@ def neighbors_v2( return NeighborsOutput( success=False, message=axis_msg, - hints=[], requested_edge_types=requested_edge_types, ) origin_row = _load_node_record(g, origin_id, "symbol") @@ -1865,7 +1859,6 @@ def neighbors_v2( return NeighborsOutput( success=False, message=err, - hints=[], requested_edge_types=requested_edge_types, ) if use_calls_path: @@ -1891,7 +1884,6 @@ def neighbors_v2( return NeighborsOutput( success=False, message=str(exc), - hints=[], requested_edge_types=requested_edge_types, ) if ( @@ -1941,7 +1933,7 @@ def neighbors_v2( if nf and (err := _nodefilter_applicability_error(other_kind, nf)): _log_fail_loud("applicability") return NeighborsOutput( - success=False, message=err, hints=[], requested_edge_types=[] + success=False, message=err, requested_edge_types=[] ) if not _node_matches_filter(other_kind, 
other_rec, nf): continue @@ -1968,7 +1960,7 @@ def neighbors_v2( if nf and (err := _nodefilter_applicability_error(other_kind, nf)): _log_fail_loud("applicability") return NeighborsOutput( - success=False, message=err, hints=[], requested_edge_types=[] + success=False, message=err, requested_edge_types=[] ) if not _node_matches_filter(other_kind, other_rec, nf): continue From ea996a134b7a95970e83afd490a240b2490df303 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Tue, 16 Jun 2026 00:20:30 +0300 Subject: [PATCH 5/5] docs: close out the LadybugDB migration across docs, strings, and plans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KuzuDB->LadybugDB migration (commit #302) landed as code but its doc/string sweep and plan close-out were never finished, leaving operator and agent docs asserting the old store. This completes it: Docs (current-state Kuzu -> LadybugDB, code_graph.kuzu -> .lbug, --kuzu-path -> --ladybug-path, kuzu_queries.py -> ladybug_queries.py, KuzuGraph -> LadybugGraph, kuzu_path -> ladybug_path; ontology 15/16 -> 17): README, AGENTS, docs/CONFIGURATION, docs/JAVA-CODEBASE-RAG-CLI, docs/MANUAL-VERIFICATION-CHECKLIST, docs/CODEBASE_REQUIREMENTS, docs/AGENT-GUIDE, docs/PRODUCT-VISION, tests/README. Factual fixes surfaced by the markdown freshness audit: - README: DECLARES_ROUTE (nonexistent edge) -> EXPOSES; role list no longer lists PRODUCER (a node kind) and now includes COMPONENT/CONFIG/ENTITY; EMBEDDING_MODEL -> SBERT_MODEL (the real env var). - AGENT-GUIDE + SKILL: route frameworks corrected to spring_mvc/webflux (kafka/rabbitmq/jms/stream are route kinds; feign is a client kind). - PRODUCT-VISION: CALLS is shipped, not 'planned'. External citation titles (footnotes 12/17) intentionally left as 'Kuzu'. Shipped-artifact resync + plan close-out: - install_data/{skills,agents} explorer copies re-synced from source (they were behind, missing source_layer and the schema-rejection note). 
- Moved the landed PLAN/propose for LADYBUG-DB-MIGRATE and INDEX-OUTPUT-REWORK from active/ to completed/. Source docstring/help-string sweep only (cli/pr_analysis/mcp_v2/search_lancedb, conftest, test_ladybug_queries docstrings) — no behaviour change; the one clearly-stale kuzu 0.11.x version reference in mcp_v2 is genericized. Co-Authored-By: Claude --- AGENTS.md | 26 +++++++-------- README.md | 18 +++++------ docs/AGENT-GUIDE.md | 4 +-- docs/CODEBASE_REQUIREMENTS.md | 28 ++++++++-------- docs/CONFIGURATION.md | 22 ++++++------- docs/JAVA-CODEBASE-RAG-CLI.md | 22 ++++++------- docs/MANUAL-VERIFICATION-CHECKLIST.md | 6 ++-- docs/PRODUCT-VISION.md | 28 ++++++++-------- java_codebase_rag/cli.py | 20 ++++++------ .../agents/explorer-rag-enhanced.md | 6 ++-- .../skills/explore-codebase/SKILL.md | 10 +++--- mcp_v2.py | 4 +-- .../AGENT-PROMPTS-INDEX-OUTPUT-REWORK.md | 0 .../PLAN-INDEX-OUTPUT-REWORK.md | 0 .../PLAN-LADYBUG-DB-MIGRATE.md | 0 pr_analysis.py | 4 +-- .../INDEX-OUTPUT-REWORK-PROPOSE.md | 0 .../LADYBUG-DB-MIGRATE-PROPOSE.md | 0 search_lancedb.py | 4 +-- skills/explore-codebase/SKILL.md | 2 +- tests/README.md | 32 +++++++++---------- tests/conftest.py | 4 +-- 22 files changed, 120 insertions(+), 120 deletions(-) rename plans/{active => completed}/AGENT-PROMPTS-INDEX-OUTPUT-REWORK.md (100%) rename plans/{active => completed}/PLAN-INDEX-OUTPUT-REWORK.md (100%) rename plans/{active => completed}/PLAN-LADYBUG-DB-MIGRATE.md (100%) rename propose/{active => completed}/INDEX-OUTPUT-REWORK-PROPOSE.md (100%) rename propose/{active => completed}/LADYBUG-DB-MIGRATE-PROPOSE.md (100%) diff --git a/AGENTS.md b/AGENTS.md index d85ad92..7a35838 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -74,8 +74,8 @@ when needed. |------|------| | `server.py` | MCP stdio server. Every `@mcp.tool` lives here. | | `search_lancedb.py` | Vector / hybrid / graph-expanded search; ranking. | -| `build_ast_graph.py` | Tree-sitter → Kuzu graph builder (full rebuild). 
Owns `pass1`–`pass6` (`pass5` emits `HTTP_CALLS` / `ASYNC_CALLS` caller edges; `pass6_match_edges` resolves cross-service / intra-service / ambiguous / phantom / unresolved match outcomes — ontology 7). | -| `kuzu_queries.py` | Read-only Cypher helpers used by the server. Includes `meta()` decoder for the Kuzu MAP-as-STRING JSON-blob columns. | +| `build_ast_graph.py` | Tree-sitter → LadybugDB graph builder (full rebuild). Owns `pass1`–`pass6` (`pass5` emits `HTTP_CALLS` / `ASYNC_CALLS` caller edges; `pass6_match_edges` resolves cross-service / intra-service / ambiguous / phantom / unresolved match outcomes — ontology 7). | +| `ladybug_queries.py` | Read-only Cypher helpers used by the server. Includes `meta()` decoder for the LadybugDB MAP-as-STRING JSON-blob columns. | | `ast_java.py` | Tree-sitter Java parsing, role/capability inference, `_string_value_atoms` helper (shared by route/client/producer extractors), `_collect_outgoing_calls` for caller-side detection. | | `graph_enrich.py` | `module` / `microservice` resolution, `BrownfieldOverrides` (route + role + capability + http client + async producer), meta-annotation walk, `resolve_routes_for_method` / `resolve_http_client_for_method` / `resolve_async_producer_for_method`. | | `java_ontology.py` | Source of truth for `VALID_ROLES`, `VALID_CAPABILITIES`, `VALID_CLIENT_KINDS`, `VALID_HTTP_CALL_STRATEGIES`, `VALID_ASYNC_CALL_STRATEGIES`, `VALID_HTTP_CALL_MATCHES`. | @@ -90,7 +90,7 @@ when needed. ## Test layout -- `tests/conftest.py` — session-scoped Kuzu graph fixture. +- `tests/conftest.py` — session-scoped LadybugDB graph fixture. - `tests/bank-chat-system/` — deterministic Java corpus (fixture, not production model). - `tests/fixtures/call_graph_smoke/` — mini Maven tree calibrated against the call-graph resolver. - `tests/fixtures/brownfield_route_stubs/` — `@CodebaseRoute` / `@CodebaseRoutes` source stubs (PR-A3). 
@@ -188,7 +188,7 @@ template): `VALID_ASYNC_CALL_STRATEGIES`, `VALID_HTTP_CALL_MATCHES`, `VALID_ROUTE_FRAMEWORKS`, `VALID_ROUTE_KINDS`, `VALID_PRODUCER_KINDS`, `VALID_RESOLVE_REASONS`, `VALID_UNRESOLVED_CALL_REASONS`. -- Schema changes that affect the Lance index or Kuzu graph need a +- Schema changes that affect the Lance index or LadybugDB graph need a matching update to the README "Re-index required" callout. Bump `ontology_version` when enrichment semantics change (currently **17**). - Brownfield is a first-class surface: any new auto-detection (route, @@ -199,10 +199,10 @@ template): union when any brownfield layer fires on a method (single network packet → single edge). See `plans/completed/PLAN-TIER1B-COMPLETION.md` § "Caller-side composition divergence". -- Kuzu's Python binder rejects `dict` for `MAP` columns. Store all +- LadybugDB's Python binder rejects `dict` for `MAP` columns. Store all map-shaped graph_meta data (`routes_by_framework`, `routes_by_layer`, `http_calls_by_strategy`, `async_calls_by_strategy`, etc.) as `STRING` - JSON blobs and decode in `kuzu_queries.meta()`. + JSON blobs and decode in `ladybug_queries.meta()`. - `server.py` is a stdio MCP server: anything reachable from a tool handler must not write to **stdout** (that's the JSON-RPC transport). Diagnostics go to stderr. @@ -216,10 +216,10 @@ template): support. `BrownfieldOverrides` already holds route, role, capability, http client, and async producer dicts — extend it in place. -## Kuzu Cypher pitfalls +## LadybugDB Cypher pitfalls -When adding or editing Cypher run against Kuzu (for example in -`kuzu_queries.py`, `mcp_v2.py`, or any `KuzuGraph._rows` caller): +When adding or editing Cypher run against LadybugDB (for example in +`ladybug_queries.py`, `mcp_v2.py`, or any `LadybugGraph._rows` caller): - **Do not filter relationship types with** `label(e) IN $list` **or** `label(e) IN ["A","B"]` **in** `WHERE`. 
On supported versions this can @@ -252,7 +252,7 @@ When adding or editing Cypher run against Kuzu (for example in ```bash rm -rf /tmp/check && .venv/bin/python build_ast_graph.py \ --source-root tests/bank-chat-system \ - --kuzu-path /tmp/check/code_graph.kuzu --verbose + --ladybug-path /tmp/check/code_graph.lbug --verbose ``` ## Commit and PR @@ -289,7 +289,7 @@ When adding or editing Cypher run against Kuzu (for example in ## Cursor Cloud specific instructions This is a self-contained Python project — no external services -(no Postgres, Kafka, Docker) are needed. All storage (Kuzu, LanceDB, +(no Postgres, Kafka, Docker) are needed. All storage (LadybugDB, LanceDB, CocoIndex state) is embedded/file-based. ### Environment @@ -317,12 +317,12 @@ first run. They are not required for normal development. ### Hello-world verification -Build the Kuzu graph from the test fixture and inspect it: +Build the LadybugDB graph from the test fixture and inspect it: ```bash rm -rf /tmp/check && .venv/bin/python build_ast_graph.py \ --source-root tests/bank-chat-system \ - --kuzu-path /tmp/check/code_graph.kuzu --verbose + --ladybug-path /tmp/check/code_graph.lbug --verbose .venv/bin/java-codebase-rag meta \ --source-root tests/bank-chat-system --index-dir /tmp/check ``` diff --git a/README.md b/README.md index 1f2b68a..00cb46c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ A graph-native code intelligence layer for Java microservice estates, exposed to LLM agents via the **Model Context Protocol (MCP)**. -The system extracts a deterministic property graph from Java source (tree-sitter), stores it in **Kuzu** (graph) alongside a **LanceDB** vector index (chunks), and exposes a deliberately small MCP surface — **five tools**: `search`, `find`, `describe`, `neighbors`, `resolve` — that collapse onto three primitive agent operations: **locate**, **inspect**, **walk**. 
+The system extracts a deterministic property graph from Java source (tree-sitter), stores it in **LadybugDB** (graph) alongside a **LanceDB** vector index (chunks), and exposes a deliberately small MCP surface — **five tools**: `search`, `find`, `describe`, `neighbors`, `resolve` — that collapse onto three primitive agent operations: **locate**, **inspect**, **walk**. > **What this MCP is:** a **GPS for code navigation**, not a reasoning engine. > Agents use a simple loop: @@ -21,9 +21,9 @@ For the design rationale, the GPS metaphor, and the full ontology, see [`docs/pa Generic code-search tools (grep, ctags, vector-only RAG) hit a ceiling on real Java microservice estates: they find files but lose the structure that makes a Spring/JAX-RS system navigable. This project is built around five choices that target that gap. -- **Hybrid RAG + GraphRAG, not either-or.** Semantic recall (LanceDB chunk vectors) and structural navigation (Kuzu property graph) are composed in one surface. `search` finds candidate nodes by meaning; `neighbors` walks the exact edge you care about (`CALLS`, `IMPLEMENTS`, `INJECTS`, `DECLARES_ROUTE`, …). The agent picks the right primitive per step instead of being forced into pure-vector or pure-symbol search. +- **Hybrid RAG + GraphRAG, not either-or.** Semantic recall (LanceDB chunk vectors) and structural navigation (LadybugDB property graph) are composed in one surface. `search` finds candidate nodes by meaning; `neighbors` walks the exact edge you care about (`CALLS`, `IMPLEMENTS`, `INJECTS`, `EXPOSES`, …). The agent picks the right primitive per step instead of being forced into pure-vector or pure-symbol search. -- **A Java-tuned role model.** Symbols are labelled with stereotypes inferred from Spring and JAX-RS conventions — `CONTROLLER`, `SERVICE`, `REPOSITORY`, `CLIENT`, `PRODUCER`, `MAPPER`, `DTO`. 
Agents can ask "list controllers" or "who injects this repository" directly, instead of grep-ing for `@RestController` and hoping for the best. Roles drive both filtering (`find` with a `NodeFilter`) and ranking. +- **A Java-tuned role model.** Symbols are labelled with stereotypes inferred from Spring and JAX-RS conventions — `CONTROLLER`, `SERVICE`, `REPOSITORY`, `COMPONENT`, `CONFIG`, `ENTITY`, `CLIENT`, `MAPPER`, `DTO`. Agents can ask "list controllers" or "who injects this repository" directly, instead of grep-ing for `@RestController` and hoping for the best. Roles drive both filtering (`find` with a `NodeFilter`) and ranking. - **Ranking specialized for Java codebases.** The composite ranker is aware of role, microservice, and FQN structure — not a generic BM25. A search for `"chat ingress"` surfaces controllers before utility classes; a search scoped to one microservice doesn't drown in matches from the other 19. Defaults are tuned on the bank-chat fixture and exposed in `docs/CONFIGURATION.md` for per-repo overrides. @@ -71,7 +71,7 @@ All indexing lifecycle commands (`init`, `increment`, `reprocess`, `install`, `u If you prefer manual configuration, see [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md) for the full CLI reference. -> **Stability disclaimer.** This package does **not** promise backward compatibility. MCP tool contracts, env vars, Lance/Kuzu schemas, config files, and Python APIs may change without a deprecation period. Track `main` and rebuild indexes when ontology or embedding settings change. +> **Stability disclaimer.** This package does **not** promise backward compatibility. MCP tool contracts, env vars, Lance/LadybugDB schemas, config files, and Python APIs may change without a deprecation period. Track `main` and rebuild indexes when ontology or embedding settings change. 
--- @@ -84,7 +84,7 @@ This repo ships a small multi-module Spring fixture under [`tests/bank-chat-syst git clone https://github.com/HumanBean17/java-codebase-rag cd java-codebase-rag -# 2. Build the index (Lance vectors + Kuzu graph). First run downloads the +# 2. Build the index (Lance vectors + LadybugDB graph). First run downloads the # embedding model (~90 MB) and takes ~30-60s on the fixture. java-codebase-rag init --source-root tests/bank-chat-system --index-dir /tmp/bank-chat-index @@ -99,7 +99,7 @@ Smoke-test the index with two checks (`search_lancedb` ships with the package): JAVA_CODEBASE_RAG_INDEX_DIR=/tmp/bank-chat-index \ python -m search_lancedb "chat ingress controller" --table java --limit 3 -# Vector + graph expansion — proves Kuzu is wired in +# Vector + graph expansion — proves LadybugDB is wired in JAVA_CODEBASE_RAG_INDEX_DIR=/tmp/bank-chat-index \ python -m search_lancedb "chat ingress controller" --table java --limit 3 \ --graph-expand --expand-depth 2 @@ -199,8 +199,8 @@ Run `java-codebase-rag --help` to list grouped subcommands. Operator playbook wi | Setup | `install` | Interactive setup wizard: config, MCP registration, skill/agent deployment, indexing. | | Setup | `update` | Refresh shipped artifacts (skill, agent, MCP entry) + incremental Lance/graph catch-up after pip upgrade. | | Lifecycle | `init` | First-time index. Refuses if artifacts already exist. | -| Lifecycle | `increment` | CocoIndex catch-up + incremental Kuzu update. `--vectors-only` for Lance only. | -| Lifecycle | `reprocess` | Full Lance + Kuzu rebuild. `--vectors-only` / `--graph-only` for a single phase. | +| Lifecycle | `increment` | CocoIndex catch-up + incremental LadybugDB update. `--vectors-only` for Lance only. | +| Lifecycle | `reprocess` | Full Lance + LadybugDB rebuild. `--vectors-only` / `--graph-only` for a single phase. | | Lifecycle | `erase` | Delete index artifacts. Requires `--yes` or TTY confirm. 
| | Introspection | `meta`, `tables`, `diagnose-ignore`, `unresolved-calls` | Health, table listing, ignore-layer diagnostics, receiver-failure call sites. | | Analysis | `analyze-pr` | Blast-radius / risk from a unified diff. | @@ -235,7 +235,7 @@ python3 -m venv .venv The `cocoindex` package powers lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation do not invoke it directly. -The default embedding model is `sentence-transformers/all-MiniLM-L6-v2` (downloaded on first `init`). Override via the `EMBEDDING_MODEL` env var — see [`docs/CONFIGURATION.md` §1](./docs/CONFIGURATION.md#1-environment-variables). +The default embedding model is `sentence-transformers/all-MiniLM-L6-v2` (downloaded on first `init`). Override via the `SBERT_MODEL` env var — see [`docs/CONFIGURATION.md` §1](./docs/CONFIGURATION.md#1-environment-variables). --- diff --git a/docs/AGENT-GUIDE.md b/docs/AGENT-GUIDE.md index ea6fc91..c469418 100644 --- a/docs/AGENT-GUIDE.md +++ b/docs/AGENT-GUIDE.md @@ -14,7 +14,7 @@ Copy the block between `