diff --git a/README.md b/README.md index af66387..fd70224 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Once you have the snapshot, the CLI works offline: ### MCP server - give your AI assistant a schema brain -The MCP server reads the same snapshot. It exposes 15 tools over stdio or SSE: schema exploration, query validation, migration checks, linting, vacuum health. Your AI assistant understands your database while it writes SQL. +The MCP server reads the same snapshot. It exposes 14 tools over stdio or SSE: schema exploration, query validation, migration checks, linting, vacuum health. Your AI assistant understands your database while it writes SQL. No database connection needed. The assistant never sees credentials. @@ -169,7 +169,7 @@ dryrun --profile replica1 snapshot activity --from "$REPLICA1_URL" --label repli dryrun --profile replica2 snapshot activity --from "$REPLICA2_URL" --label replica2 ``` -The MCP `compare_nodes` tool then exposes per-node `idx_scan` so you can spot routing imbalances. See [docs/multi-node-stats.md](docs/multi-node-stats.md). +The MCP `describe_table` (node breakdown) and `detect kind=anomalies` tools then expose per-node `idx_scan` so you can spot routing imbalances. See [docs/multi-node-stats.md](docs/multi-node-stats.md). ### Multiple databases per project diff --git a/TUTORIAL.md b/TUTORIAL.md index b3f3508..29a0de8 100644 --- a/TUTORIAL.md +++ b/TUTORIAL.md @@ -169,7 +169,7 @@ dryrun --profile replica2 snapshot activity --from "$REPLICA2_URL" --label repli dryrun --profile replica3 snapshot activity --from "$REPLICA3_URL" --label replica3 ``` -`--label` is required and identifies the node in `compare_nodes` and `detect`. `snapshot activity` refuses to run on the primary. Activity rows attach to the most recent `schema` row by `schema_ref_hash`; pass `--allow-orphan` to capture before a schema exists. +`--label` is required and identifies the node in `describe_table` and `detect`. `snapshot activity` refuses to run on the primary. Activity rows attach to the most recent `schema` row by `schema_ref_hash`; pass `--allow-orphan` to capture before a schema exists. ### 3. Define profiles for repeatable runs @@ -205,7 +205,7 @@ Schema changes rarely; activity counters shift daily. Capture each on its own sc dryrun snapshot list ``` -Each row prints its `kind` (`schema` / `planner_stats` / `activity_stats`), `node_label` for activity rows, and the `schema_ref_hash` linking activity to schema. The MCP `compare_nodes` tool then exposes per-node `idx_scan` for any table. +Each row prints its `kind` (`schema` / `planner_stats` / `activity_stats`), `node_label` for activity rows, and the `schema_ref_hash` linking activity to schema. The MCP `describe_table` node breakdown then exposes per-node `idx_scan` for any table. --- @@ -266,7 +266,6 @@ Connect your MCP client to `http://host:3000/sse`. | `schema_diff` | No\* | Compare snapshots for schema changes | | `vacuum_health` | No | Autovacuum analysis with effective settings and recommendations | | `detect` | No | Health checks: stale stats, unused indexes, seq-scan anomalies | -| `compare_nodes` | No | Per-node breakdown for a specific table with anomaly detection | | `analyze_plan` | No | Analyze a pre-existing EXPLAIN JSON plan | | `advise` | Hybrid | Comprehensive query analysis: EXPLAIN + anti-patterns + index suggestions | | `explain_query` | **Yes** | EXPLAIN with structured plan and warnings | diff --git a/cmd/dryrun/main.go b/cmd/dryrun/main.go index c92d408..25082bf 100644 --- a/cmd/dryrun/main.go +++ b/cmd/dryrun/main.go @@ -852,7 +852,7 @@ func mcpServeCmd() *cobra.Command { effectiveSchemaFile, len(snap.Tables)) server = drmcp.NewOfflineServer(snap, lintCfg) server.SetSchemaCandidates(candidates) - // history.db carries planner/activity stats; without it offline tools (vacuum_health, compare_nodes…) see nil sizing + // history.db carries planner/activity stats; without it offline tools (vacuum_health, detect…) see nil sizing if h, err := history.OpenDefault(); err == nil { server.SetHistory(h) server.SetSnapshotKey(resolveSnapshotKey()) diff --git a/docs/multi-node-stats.md b/docs/multi-node-stats.md index 16ee3c1..ba5f5df 100644 --- a/docs/multi-node-stats.md +++ b/docs/multi-node-stats.md @@ -51,7 +51,7 @@ dryrun --profile replica3 snapshot activity \ --from "postgres://readonly@replica-3:5432/mydb" --label replica3 ``` -`--label` is required and identifies the node in `compare_nodes` and `detect`. `snapshot activity` refuses to run on the primary. Each row captures `pg_stat_user_tables`, `pg_stat_user_indexes`, and `stats_reset` for the node, then joins to the latest schema by `schema_ref_hash`. Use `--allow-orphan` when activity arrives before any schema snapshot exists; orphan rows are stored but not reattached when a matching schema lands later. +`--label` is required and identifies the node in `describe_table` and `detect`. `snapshot activity` refuses to run on the primary. Each row captures `pg_stat_user_tables`, `pg_stat_user_indexes`, and `stats_reset` for the node, then joins to the latest schema by `schema_ref_hash`. Use `--allow-orphan` when activity arrives before any schema snapshot exists; orphan rows are stored but not reattached when a matching schema lands later. Activity dumps are small (single-digit MB) and safe for cron. See [Automating collection](#automating-collection). @@ -62,7 +62,7 @@ When activity rows from multiple nodes attach to the same schema, the `MergedAct | Field | Rule | Why | |---|---|---| | `idx_scan_sum` | sum across nodes | Total indexed reads hitting the cluster | -| `idx_scan_per_node` | per-node breakdown | Powers `compare_nodes` and routing-imbalance detection | +| `idx_scan_per_node` | per-node breakdown | Powers `describe_table`'s node breakdown and routing-imbalance detection | | `seq_scan_sum` | sum across nodes | Reveals which replicas are doing seq scans | | `n_dead_tup_sum` | sum across nodes | Worst-case dead-tuple pressure for vacuum decisions | | `last_vacuum_max` | max timestamp | Autovacuum runs on the primary only; replicas always report null | @@ -75,23 +75,16 @@ When activity rows from multiple nodes attach to the same schema, the `MergedAct All multi-node analysis tools are MCP tools. They read from `~/.dryrun/history.db` via `HistoryStore::get_annotated`, which joins the latest schema with each node's most recent activity row by `schema_ref_hash`. -### compare_nodes +### describe_table (node breakdown) -Side-by-side stats for a specific table across all nodes. +`describe_table` includes a per-node activity breakdown for a table, surfacing the +counters that genuinely differ between nodes — `seq_scan`, `idx_scan`, tuple +ins/upd/del, dead tuples, and last vacuum/analyze. Sizing (`reltuples`, `relpages`, +table size) is cluster-wide, captured once from the primary's `planner_stats` row, +so it does not vary per node and is reported once rather than repeated per node. -``` -Per-node breakdown (4 node(s)): - - reltuples relpages seq_scan idx_scan table_size collected -primary 1,234,567 5,123 1,024 45,000 10 MB 2026-04-01 14:32 -replica-1 1,234,567 5,123 12 45,000 10 MB 2026-04-01 14:30 -replica-2 1,234,567 5,098 987,654 44,998 10 MB 2026-04-01 14:31 -replica-3 1,234,567 5,123 203 45,000 10 MB 2026-04-01 14:28 -``` - -Here `replica-2` has 987k sequential scans while others sit under 1,100, pointing to a routing problem or a missing index on that replica's workload. - -The output also includes per-index scan counts and flags indexes with zero scans across all nodes. +A node showing far more `seq_scan` than its peers points to a routing problem or a +missing index on that node's workload — surfaced directly by `detect kind=anomalies`. ### detect @@ -140,7 +133,7 @@ Autovacuum analysis using aggregated dead tuple counts but primary-only vacuum t ### Reporting replica with seq scans -The primary uses indexed lookups on `orders`, but a BI tool connected through `replica-2` runs `SELECT ... WHERE created_at BETWEEN ...` without a covering index. Single-node monitoring on the primary shows nothing wrong. `compare_nodes` reveals `replica-2` with millions of sequential scans. +The primary uses indexed lookups on `orders`, but a BI tool connected through `replica-2` runs `SELECT ... WHERE created_at BETWEEN ...` without a covering index. Single-node monitoring on the primary shows nothing wrong. `detect kind=anomalies` (or `describe_table`'s node breakdown) reveals `replica-2` with millions of sequential scans. Fix: add a covering index for the BI query pattern, or route analytics to a dedicated replica. @@ -150,7 +143,7 @@ Fix: add a covering index for the BI query pattern, or route analytics to a dedi ### Load balancer misconfiguration -A connection pooler is supposed to round-robin across three replicas, but `compare_nodes` shows `replica-1` handling 5x more traffic than the others. The imbalance detection flags it automatically. +A connection pooler is supposed to round-robin across three replicas, but `replica-1` handles 5x more traffic than the others. `detect kind=anomalies` flags the imbalance automatically. ## Automating collection diff --git a/internal/mcp/handlers_health.go b/internal/mcp/handlers_health.go index 75d7d6c..fded95e 100644 --- a/internal/mcp/handlers_health.go +++ b/internal/mcp/handlers_health.go @@ -10,46 +10,6 @@ import ( "github.com/boringsql/dryrun/internal/schema" ) -func (s *Server) handleCompareNodes(_ context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { - a, err := s.getAnnotated() - if err != nil { - return errResult(err.Error()), nil - } - - tableName := getArg(req, "table") - schemaName := schemaArg(req) - qual := schema.QualifiedName{Schema: schemaName, Name: tableName} - - if a.Merged == nil { - return textResult("No node statistics available. Import stats from multiple nodes first."), nil - } - - var lines []string - lines = append(lines, fmt.Sprintf("Node comparison for %s.%s:\n", schemaName, tableName)) - - sz := a.SizingFor(qual) - for _, n := range a.Merged.Nodes { - for _, ts := range n.Tables { - if ts.Table != qual { - continue - } - rt := 0.0 - tableSize := int64(0) - if sz != nil { - rt = sz.Reltuples - tableSize = sz.TableSize - } - lines = append(lines, fmt.Sprintf(" %s: %.0f rows, seq_scan=%d, idx_scan=%d, size=%d", - n.Node.Source, rt, ts.Activity.SeqScan, ts.Activity.IdxScan, tableSize)) - } - } - - if len(lines) == 1 { - return textResult(s.wrapText(fmt.Sprintf("No stats found for %s.%s across nodes.", schemaName, tableName), "")), nil - } - return textResult(s.wrapText(strings.Join(lines, "\n"), "")), nil -} - func (s *Server) handleDetect(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { kind := argOr(req, "kind", "all") @@ -69,37 +29,93 @@ func (s *Server) handleDetect(ctx context.Context, req mcp.CallToolRequest) (*mc } } +// schema/table extractors for filterByQual +func staleKey(e schema.StaleStatsEntry) (string, string) { return e.Schema, e.Table } +func unusedKey(e schema.UnusedIndexEntry) (string, string) { return e.Schema, e.Table } +func bloatKey(e schema.BloatedIndexEntry) (string, string) { return e.Schema, e.Table } +func vacuumKey(e schema.VacuumHealth) (string, string) { return e.Schema, e.Table } +func anomalyKey(m map[string]any) (string, string) { + s, _ := m["schema"].(string) + t, _ := m["table"].(string) + return s, t +} + +// caps never-analyzed and stale independently so it can provide more targetted advice +func capStaleStats(entries []schema.StaleStatsEntry, max int) (kept []schema.StaleStatsEntry, omitted int) { + var never, stale []schema.StaleStatsEntry + for _, e := range entries { + if e.LastAnalyzedDaysAgo == nil { + never = append(never, e) + } else { + stale = append(stale, e) + } + } + neverKept, neverOmitted := capItems(never, max) + staleKept, staleOmitted := capItems(stale, max) + return append(neverKept, staleKept...), neverOmitted + staleOmitted +} + +// Pre-validated re-run of one kind, uncapped, keeping any active filter. +func narrowNext(kind, schemaF, tableF string) []NextCall { + args := map[string]any{"kind": kind, "limit": 0} + if schemaF != "" { + args["schema"] = schemaF + } + if tableF != "" { + args["table"] = tableF + } + return []NextCall{{Tool: "detect", Args: args}} +} + func (s *Server) handleDetectAll(_ context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { a, err := s.getAnnotated() if err != nil { return errResult(err.Error()), nil } - staleDays := int64(7) - staleEntries := schema.DetectStaleStats(a, staleDays) - unusedEntries := schema.DetectUnusedIndexes(a) - + schemaF := schemaArg(req) + tableF := getArg(req, "table") + max := limitArg(req) threshold := getFloatArg(req, "threshold", 4.0) - bloatEntries := schema.DetectBloatedIndexes(a, threshold) - anomalies := buildAnomalies(a) + staleEntries := filterByQual(schema.DetectStaleStats(a, int64(7)), schemaF, tableF, staleKey) + unusedEntries := filterByQual(schema.DetectUnusedIndexes(a), schemaF, tableF, unusedKey) + bloatEntries := filterByQual(schema.DetectBloatedIndexes(a, threshold), schemaF, tableF, bloatKey) + anomalies := filterByQual(buildAnomalies(a), schemaF, tableF, anomalyKey) + staleKept, staleOmitted := capStaleStats(staleEntries, max) wrapper := map[string]any{ - "stale_stats": map[string]any{"entries": staleEntries, "count": len(staleEntries)}, - "unused_indexes": map[string]any{"entries": unusedEntries, "count": len(unusedEntries)}, - "anomalies": map[string]any{"entries": anomalies, "count": len(anomalies)}, - "bloated_indexes": map[string]any{"entries": bloatEntries, "count": len(bloatEntries)}, + "stale_stats": cappedBlock(staleKept, staleOmitted, len(staleEntries)), + "unused_indexes": entryBlock(unusedEntries, max), + "anomalies": entryBlock(anomalies, max), + "bloated_indexes": entryBlock(bloatEntries, max), } + hint := "" switch { case len(staleEntries) > 0 && len(unusedEntries) > 0: - hint = "Stale stats may cause bad plans — run ANALYZE. Unused indexes add write overhead — verify with compare_nodes before dropping." + hint = "Stale stats may cause bad plans; run ANALYZE. Unused indexes add write overhead; verify per-node index scans before dropping." case len(staleEntries) > 0: - hint = "Stale stats may cause bad query plans — consider running ANALYZE." + hint = "Stale stats may cause bad query plans; consider running ANALYZE." case len(unusedEntries) > 0: - hint = "Unused indexes add write overhead. Use compare_nodes to verify across all replicas before dropping." + hint = "Unused indexes add write overhead. Verify index scans across all replicas before dropping." } - s.injectMeta(wrapper, hint, nil) + + // point next at the truncated categories while trancating + var next []NextCall + for _, k := range []string{"stale_stats", "unused_indexes", "anomalies", "bloated_indexes"} { + if block, ok := wrapper[k].(map[string]any); ok && block["truncated"] == true { + next = append(next, narrowNext(k, schemaF, tableF)...) + } + } + if len(next) > 0 { + if hint != "" { + hint += " " + } + hint += fmt.Sprintf("Some categories capped at %d; _meta.next re-runs them uncapped, or narrow with schema=/table=.", max) + } + + s.injectMeta(wrapper, hint, next) return jsonResult(wrapper), nil } @@ -109,20 +125,28 @@ func (s *Server) handleDetectStaleStats(_ context.Context, req mcp.CallToolReque return errResult(err.Error()), nil } - entries := schema.DetectStaleStats(a, int64(7)) + schemaF := schemaArg(req) + tableF := getArg(req, "table") + entries := filterByQual(schema.DetectStaleStats(a, int64(7)), schemaF, tableF, staleKey) if len(entries) == 0 { return textResult("No stale statistics detected."), nil } + total := len(entries) + kept, omitted := capStaleStats(entries, limitArg(req)) var lines []string - for _, e := range entries { + for _, e := range kept { if e.LastAnalyzedDaysAgo == nil { lines = append(lines, fmt.Sprintf(" %s: %s.%s - never analyzed", e.Node, e.Schema, e.Table)) } else { lines = append(lines, fmt.Sprintf(" %s: %s.%s - last analyzed %d days ago", e.Node, e.Schema, e.Table, *e.LastAnalyzedDaysAgo)) } } - return textResult(fmt.Sprintf("Stale statistics (%d entries):\n%s", len(entries), strings.Join(lines, "\n"))), nil + body := fmt.Sprintf("Stale statistics (%d entries):\n%s", total, strings.Join(lines, "\n")) + if omitted > 0 { + body += fmt.Sprintf("\n\n%d more not shown; narrow with schema=/table= or re-run with limit=0.", omitted) + } + return textResult(body), nil } func (s *Server) handleDetectUnusedIndexes(_ context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { @@ -131,14 +155,13 @@ func (s *Server) handleDetectUnusedIndexes(_ context.Context, req mcp.CallToolRe return errResult(err.Error()), nil } - entries := schema.DetectUnusedIndexes(a) + schemaF := schemaArg(req) + tableF := getArg(req, "table") + entries := filterByQual(schema.DetectUnusedIndexes(a), schemaF, tableF, unusedKey) if len(entries) == 0 { return textResult("No unused indexes detected. All indexes have at least one scan recorded."), nil } - return jsonResult(map[string]any{ - "unused_indexes": entries, - "count": len(entries), - }), nil + return cappedKindResult(s, "unused_indexes", entries, limitArg(req), schemaF, tableF), nil } func (s *Server) handleDetectAnomalies(_ context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { @@ -151,11 +174,13 @@ func (s *Server) handleDetectAnomalies(_ context.Context, req mcp.CallToolReques return textResult("No node statistics available for anomaly detection."), nil } - anomalies := buildAnomalies(a) + schemaF := schemaArg(req) + tableF := getArg(req, "table") + anomalies := filterByQual(buildAnomalies(a), schemaF, tableF, anomalyKey) if len(anomalies) == 0 { return textResult("No anomalies detected."), nil } - return jsonResult(anomalies), nil + return cappedKindResult(s, "anomalies", anomalies, limitArg(req), schemaF, tableF), nil } func (s *Server) handleDetectBloatedIndexes(_ context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { @@ -164,32 +189,55 @@ func (s *Server) handleDetectBloatedIndexes(_ context.Context, req mcp.CallToolR return errResult(err.Error()), nil } + schemaF := schemaArg(req) + tableF := getArg(req, "table") threshold := getFloatArg(req, "threshold", 4.0) - entries := schema.DetectBloatedIndexes(a, threshold) + entries := filterByQual(schema.DetectBloatedIndexes(a, threshold), schemaF, tableF, bloatKey) if len(entries) == 0 { return textResult("No bloated indexes detected."), nil } - return jsonResult(map[string]any{ - "bloated_indexes": entries, - "count": len(entries), - }), nil + return cappedKindResult(s, "bloated_indexes", entries, limitArg(req), schemaF, tableF), nil +} + +func cappedKindResult[T any](s *Server, kind string, entries []T, max int, schemaF, tableF string) *mcp.CallToolResult { + kept, omitted := capItems(entries, max) + wrapper := map[string]any{ + kind: kept, + "count": len(entries), + } + if omitted > 0 { + wrapper["truncated"] = true + wrapper["omitted"] = omitted + hint := fmt.Sprintf("Showing first %d of %d; %d not shown. Narrow with schema=/table= or re-run with limit=0.", len(kept), len(entries), omitted) + s.injectMeta(wrapper, hint, narrowNext(kind, schemaF, tableF)) + } + return jsonResult(wrapper) } -func (s *Server) handleVacuumHealth(_ context.Context, _ mcp.CallToolRequest) (*mcp.CallToolResult, error) { +func (s *Server) handleVacuumHealth(_ context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { a, err := s.getAnnotated() if err != nil { return errResult(err.Error()), nil } - results := schema.AnalyzeVacuumHealth(a) - + schemaF := schemaArg(req) + tableF := getArg(req, "table") + results := filterByQual(schema.AnalyzeVacuumHealth(a), schemaF, tableF, vacuumKey) if len(results) == 0 { return textResult(s.wrapText("No vacuum health concerns found.", "")), nil } + + kept, omitted := capItems(results, limitArg(req)) wrapper := map[string]any{ - "vacuum_health": results, + "vacuum_health": kept, "count": len(results), } - s.injectMeta(wrapper, "", nil) + hint := "" + if omitted > 0 { + wrapper["truncated"] = true + wrapper["omitted"] = omitted + hint = fmt.Sprintf("Showing first %d of %d; %d not shown. Narrow with schema=/table= or re-run with limit=0.", len(kept), len(results), omitted) + } + s.injectMeta(wrapper, hint, nil) return jsonResult(wrapper), nil } diff --git a/internal/mcp/handlers_health_test.go b/internal/mcp/handlers_health_test.go index 4c16a10..917190b 100644 --- a/internal/mcp/handlers_health_test.go +++ b/internal/mcp/handlers_health_test.go @@ -4,21 +4,20 @@ import ( "encoding/json" "strings" "testing" + "time" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/boringsql/dryrun/internal/lint" + "github.com/boringsql/dryrun/internal/schema" ) -// Smoke tests for health-family tools: compare_nodes, detect (all kinds), -// vacuum_health. Each subtest exercises one kind or filter and asserts the -// expected JSON keys or error text appear. +// Smoke tests for health-family tools: detect (all kinds), vacuum_health. +// Each subtest exercises one kind or filter and asserts the expected JSON +// keys or error text appear. func TestHealthHandlers_OfflineSmoke(t *testing.T) { c := setupOfflineTest(t) - t.Run("compare_nodes", func(t *testing.T) { - out := callTool(t, c, "compare_nodes", map[string]any{"table": "users"}) - if out == "" { - t.Fatal("empty result") - } - }) - t.Run("detect_default_all", func(t *testing.T) { out := callTool(t, c, "detect", nil) assertContains(t, out, "stale_stats") @@ -87,6 +86,196 @@ func TestHealthHandlers_OfflineSmoke(t *testing.T) { }) } +// The whole point of §6 is that a giant result set degrades gracefully instead +// of dumping the entire database into the model's context. This pins the full +// truncation contract for a single detect category at the boundary where it +// fires: 60 findings, a cap of 50. We assert the four things a downstream agent +// relies on — (1) `count` reports the true 60 so it knows the haystack's real +// size, (2) only 50 entries actually cross the wire, (3) truncated/omitted +// honestly advertise that 10 were withheld, and (4) _meta.next hands back a +// pre-validated, uncapped re-run of exactly this category (limit:0) so the agent +// can fetch the rest mechanically rather than guessing at the args. +func TestCappedKindResult_TruncationContract(t *testing.T) { + snap := &schema.SchemaSnapshot{ + PgVersion: "PostgreSQL 18", Database: "appdb", Timestamp: time.Now().UTC(), + } + srv := NewOfflineServer(snap, lint.DefaultConfig()) + + entries := make([]int, 60) + for i := range entries { + entries[i] = i + } + + res := cappedKindResult(srv, "unused_indexes", entries, 50, "", "") + tc, ok := res.Content[0].(mcp.TextContent) + if !ok { + t.Fatalf("expected TextContent, got %T", res.Content[0]) + } + + var decoded map[string]any + if err := json.Unmarshal([]byte(tc.Text), &decoded); err != nil { + t.Fatalf("invalid JSON: %v\n%s", err, tc.Text) + } + + if decoded["count"] != float64(60) { + t.Errorf("count must be the full pre-cap total 60, got %v", decoded["count"]) + } + shown, _ := decoded["unused_indexes"].([]any) + if len(shown) != 50 { + t.Errorf("expected 50 entries on the wire, got %d", len(shown)) + } + if decoded["truncated"] != true { + t.Errorf("expected truncated=true, got %v", decoded["truncated"]) + } + if decoded["omitted"] != float64(10) { + t.Errorf("expected omitted=10, got %v", decoded["omitted"]) + } + + meta, ok := decoded["_meta"].(map[string]any) + if !ok { + t.Fatalf("expected _meta object, got %T", decoded["_meta"]) + } + next, ok := meta["next"].([]any) + if !ok || len(next) != 1 { + t.Fatalf("expected exactly one _meta.next entry, got %v", meta["next"]) + } + call, _ := next[0].(map[string]any) + if call["tool"] != "detect" { + t.Errorf("next.tool must be detect, got %v", call["tool"]) + } + args, _ := call["args"].(map[string]any) + if args["kind"] != "unused_indexes" { + t.Errorf("next.args.kind must echo the category, got %v", args["kind"]) + } + if args["limit"] != float64(0) { + t.Errorf("next.args.limit must be 0 (uncapped) so the re-run returns everything, got %v", args["limit"]) + } +} + +// The flip side of the contract: when the result set fits under the cap, none of +// the truncation machinery should appear. A clean, small response must stay +// clean — no truncated flag, no omitted count, and crucially no _meta.next +// nudging the agent toward a pointless re-run. +func TestCappedKindResult_NoTruncationWhenUnderCap(t *testing.T) { + snap := &schema.SchemaSnapshot{ + PgVersion: "PostgreSQL 18", Database: "appdb", Timestamp: time.Now().UTC(), + } + srv := NewOfflineServer(snap, lint.DefaultConfig()) + + res := cappedKindResult(srv, "bloated_indexes", []int{1, 2, 3}, 50, "", "") + tc := res.Content[0].(mcp.TextContent) + + var decoded map[string]any + if err := json.Unmarshal([]byte(tc.Text), &decoded); err != nil { + t.Fatalf("invalid JSON: %v\n%s", err, tc.Text) + } + if decoded["count"] != float64(3) { + t.Errorf("expected count=3, got %v", decoded["count"]) + } + if _, has := decoded["truncated"]; has { + t.Error("nothing hidden: truncated must be absent") + } + if _, has := decoded["omitted"]; has { + t.Error("nothing hidden: omitted must be absent") + } + // _meta is only injected on the truncation branch, so it should be absent here. + if _, has := decoded["_meta"]; has { + t.Error("no truncation: _meta (and its next) must be absent") + } +} + +// buildAnomalies must emit hottest-first (by total seq_scan) so a downstream +// cap keeps the most alarming tables. We seed two tables that both trip the +// seq-scan-only flag (>100 seq scans, zero index scans) but with very different +// volumes, inserted cold-then-hot so a passing test proves the sort flipped +// them rather than preserving insertion order. +func TestBuildAnomalies_HottestFirst(t *testing.T) { + a := &schema.AnnotatedSchema{ + Schema: &schema.SchemaSnapshot{}, + Merged: &schema.MergedActivity{Nodes: []schema.NodeActivity{{ + Node: schema.NodeIdentity{Source: "primary"}, + Tables: []schema.TableActivityEntry{ + {Table: schema.QualifiedName{Schema: "public", Name: "warm"}, Activity: schema.TableActivity{SeqScan: 200, IdxScan: 0}}, + {Table: schema.QualifiedName{Schema: "public", Name: "hot"}, Activity: schema.TableActivity{SeqScan: 9000, IdxScan: 0}}, + }, + }}}, + } + + anomalies := buildAnomalies(a) + if len(anomalies) != 2 { + t.Fatalf("expected both seq-scan-only tables flagged, got %d: %v", len(anomalies), anomalies) + } + if anomalies[0]["table"] != "hot" { + t.Errorf("expected the 9000-seq-scan table first, got %q", anomalies[0]["table"]) + } + if anomalies[1]["table"] != "warm" { + t.Errorf("expected the 200-seq-scan table second, got %q", anomalies[1]["table"]) + } +} + +// capStaleStats is the two-bucket guard that keeps a fresh bulk-load (where +// nearly every table is never-analyzed) from blowing the response bound, while +// still guaranteeing the "no stats at all" class is never starved out by a +// flood of merely-stale tables. The contract: never-analyzed and stale are +// capped INDEPENDENTLY at max, never-analyzed comes first, and omitted sums +// both buckets. We feed 4 never-analyzed and 5 stale with max=2, so each bucket +// must shed its overflow separately — proving a big never-analyzed pile can't +// crowd out the stale tables and vice versa. +func TestCapStaleStats_TwoBuckets(t *testing.T) { + mk := func(name string, daysAgo *int64) schema.StaleStatsEntry { + return schema.StaleStatsEntry{Node: "primary", Schema: "public", Table: name, LastAnalyzedDaysAgo: daysAgo} + } + d := func(n int64) *int64 { return &n } + + entries := []schema.StaleStatsEntry{ + mk("never1", nil), mk("never2", nil), mk("never3", nil), mk("never4", nil), + mk("stale1", d(40)), mk("stale2", d(30)), mk("stale3", d(20)), mk("stale4", d(15)), mk("stale5", d(10)), + } + + kept, omitted := capStaleStats(entries, 2) + + // 2 from each bucket survive; the other 2 never + 3 stale are omitted. + if len(kept) != 4 { + t.Fatalf("expected 4 kept (2 never + 2 stale), got %d: %+v", len(kept), kept) + } + if omitted != 5 { + t.Errorf("expected omitted=5 (2 never + 3 stale), got %d", omitted) + } + + // never-analyzed must occupy the front of the kept slice. + for i := 0; i < 2; i++ { + if kept[i].LastAnalyzedDaysAgo != nil { + t.Errorf("position %d should be a never-analyzed entry, got %v days", i, *kept[i].LastAnalyzedDaysAgo) + } + } + // the stale survivors must be the two MOST stale (the detector pre-sorts + // worst-first, and capStaleStats preserves that within the bucket). + if kept[2].Table != "stale1" || kept[3].Table != "stale2" { + t.Errorf("expected the two most-stale tables (stale1, stale2), got %q, %q", kept[2].Table, kept[3].Table) + } +} + +// A never-analyzed flood must not starve the stale bucket: with far more +// never-analyzed than the cap, the stale tables still get their own slots +// rather than being squeezed to zero. +func TestCapStaleStats_NeverFloodDoesNotStarveStale(t *testing.T) { + var entries []schema.StaleStatsEntry + for i := 0; i < 100; i++ { + entries = append(entries, schema.StaleStatsEntry{Schema: "public", Table: "n", LastAnalyzedDaysAgo: nil}) + } + days := int64(12) + entries = append(entries, schema.StaleStatsEntry{Schema: "public", Table: "stale", LastAnalyzedDaysAgo: &days}) + + kept, _ := capStaleStats(entries, 10) + if len(kept) != 11 { + t.Fatalf("expected 10 never + 1 stale = 11 kept, got %d", len(kept)) + } + last := kept[len(kept)-1] + if last.Table != "stale" || last.LastAnalyzedDaysAgo == nil { + t.Errorf("the lone stale table must survive the never-analyzed flood, got %+v", last) + } +} + // Pins that vacuum_health with an unknown schema returns the friendly // "No vacuum health concerns" message rather than an error or empty payload. func TestVacuumHealth_SchemaFilter(t *testing.T) { diff --git a/internal/mcp/helpers.go b/internal/mcp/helpers.go index b2f4402..5bbea02 100644 --- a/internal/mcp/helpers.go +++ b/internal/mcp/helpers.go @@ -3,6 +3,7 @@ package mcp import ( "encoding/json" "fmt" + "sort" "github.com/mark3labs/mcp-go/mcp" @@ -75,6 +76,55 @@ func (s *Server) metaJSONResult(payload any, key, hint string, next []NextCall) return mcp.NewToolResultText(string(out)) } +const defaultMaxItems = 50 + +// max <= 0 disables the cap. +func capItems[T any](items []T, max int) (kept []T, omitted int) { + if max <= 0 || len(items) <= max { + return items, 0 + } + return items[:max], len(items) - max +} + +// count is the full total even when entries is capped, so callers see more exists. +func cappedBlock[T any](kept []T, omitted, total int) map[string]any { + block := map[string]any{"entries": kept, "count": total} + if omitted > 0 { + block["truncated"] = true + block["omitted"] = omitted + } + return block +} + +func entryBlock[T any](entries []T, max int) map[string]any { + kept, omitted := capItems(entries, max) + return cappedBlock(kept, omitted, len(entries)) +} + +// Filters result entries (not the schema) since detectors draw from mixed sources. Empty filter = no-op on that axis +func filterByQual[T any](items []T, schemaFilter, tableFilter string, key func(T) (string, string)) []T { + if schemaFilter == "" && tableFilter == "" { + return items + } + out := make([]T, 0, len(items)) + for _, it := range items { + s, t := key(it) + if schemaFilter != "" && s != schemaFilter { + continue + } + if tableFilter != "" && t != tableFilter { + continue + } + out = append(out, it) + } + return out +} + +// missing -> defaultMaxItems; 0 -> uncapped. +func limitArg(req mcp.CallToolRequest) int { + return int(getFloatArg(req, "limit", defaultMaxItems)) +} + // Shallow-copy snap, retaining tables matching filters. Empty filter = no filtering on that axis. func filterSnap(snap *schema.SchemaSnapshot, schemaFilter, tableFilter string) *schema.SchemaSnapshot { if schemaFilter == "" && tableFilter == "" { @@ -115,6 +165,12 @@ func buildAnomalies(a *schema.AnnotatedSchema) []map[string]any { "total_seq_scan": sm.TotalSeqScan, "total_idx_scan": sm.TotalIdxScan, }) } + // worst cases first (by seq-scan volume) for cap to be actually helpful + sort.SliceStable(anomalies, func(i, j int) bool { + si, _ := anomalies[i]["total_seq_scan"].(int64) + sj, _ := anomalies[j]["total_seq_scan"].(int64) + return si > sj + }) return anomalies } diff --git a/internal/mcp/helpers_test.go b/internal/mcp/helpers_test.go index 72b8cc8..70ff26d 100644 --- a/internal/mcp/helpers_test.go +++ b/internal/mcp/helpers_test.go @@ -182,6 +182,143 @@ func TestToCompactTable_IndexValidityIsExceptionOnly(t *testing.T) { } } +// capItems is the primitive behind every §6 response cap. The contract has +// four corners and each one matters for a different reason, so we pin all four: +// the happy "everything fits" path (no omission, slice returned untouched), the +// "spills over" path (exactly max kept, the rest counted as omitted so the +// caller can advertise how much it hid), and the two opt-out encodings — max of +// zero and a negative max — which both mean "the user disabled the cap, give me +// everything" and must never drop a single element. +func TestCapItems(t *testing.T) { + items := []int{1, 2, 3, 4, 5} + + t.Run("under_cap_keeps_all", func(t *testing.T) { + kept, omitted := capItems(items, 10) + if len(kept) != 5 || omitted != 0 { + t.Fatalf("expected all 5 kept and 0 omitted, got kept=%d omitted=%d", len(kept), omitted) + } + }) + + t.Run("equal_to_cap_keeps_all", func(t *testing.T) { + kept, omitted := capItems(items, 5) + if len(kept) != 5 || omitted != 0 { + t.Fatalf("boundary: expected 5 kept and 0 omitted, got kept=%d omitted=%d", len(kept), omitted) + } + }) + + t.Run("over_cap_truncates_and_counts", func(t *testing.T) { + kept, omitted := capItems(items, 2) + if len(kept) != 2 || omitted != 3 { + t.Fatalf("expected 2 kept and 3 omitted, got kept=%d omitted=%d", len(kept), omitted) + } + }) + + t.Run("zero_max_disables_cap", func(t *testing.T) { + kept, omitted := capItems(items, 0) + if len(kept) != 5 || omitted != 0 { + t.Fatalf("max=0 means uncapped: expected 5 kept and 0 omitted, got kept=%d omitted=%d", len(kept), omitted) + } + }) + + t.Run("negative_max_disables_cap", func(t *testing.T) { + kept, omitted := capItems(items, -1) + if len(kept) != 5 || omitted != 0 { + t.Fatalf("negative max means uncapped: expected 5 kept and 0 omitted, got kept=%d omitted=%d", len(kept), omitted) + } + }) +} + +// entryBlock is the JSON shape the detect tool emits per category. The crucial +// invariant is that `count` is ALWAYS the full pre-cap total — a model staring +// at a capped `entries` array still needs to know the true size of the haystack +// to decide whether to narrow or page. The truncated/omitted keys are the +// inverse signal: present (and honest) only when something was actually hidden, +// absent entirely when the whole set fit so the common case stays clean. +func TestEntryBlock(t *testing.T) { + t.Run("within_cap_omits_truncation_keys", func(t *testing.T) { + block := entryBlock([]int{1, 2, 3}, 50) + if block["count"] != 3 { + t.Errorf("expected count=3 (the full total), got %v", block["count"]) + } + if _, has := block["truncated"]; has { + t.Error("nothing was hidden, so truncated must be absent") + } + if _, has := block["omitted"]; has { + t.Error("nothing was hidden, so omitted must be absent") + } + }) + + t.Run("over_cap_reports_full_count_and_omission", func(t *testing.T) { + block := entryBlock([]int{1, 2, 3, 4, 5}, 2) + if block["count"] != 5 { + t.Errorf("count must stay the full total (5) even though entries is capped, got %v", block["count"]) + } + entries, ok := block["entries"].([]int) + if !ok || len(entries) != 2 { + t.Fatalf("expected 2 capped entries, got %v", block["entries"]) + } + if block["truncated"] != true { + t.Error("expected truncated=true when entries were dropped") + } + if block["omitted"] != 3 { + t.Errorf("expected omitted=3, got %v", block["omitted"]) + } + }) +} + +// filterByQual is what finally makes detect/vacuum_health's long-declared +// schema/table parameters real rather than decorative. It mirrors filterSnap's +// AND-narrowing semantics but operates on already-computed result entries +// (because the detectors draw from different sources and can't all be filtered +// upstream). We reuse a tiny (schema, table) pair type and the same overlapping +// names trick from filterTestSnap so the cross-axis behaviour is unambiguous. +func TestFilterByQual(t *testing.T) { + type qual struct { + s string + n string + } + key := func(q qual) (string, string) { return q.s, q.n } + items := []qual{ + {"public", "users"}, + {"public", "orders"}, + {"billing", "invoices"}, + {"billing", "orders"}, + } + + t.Run("empty_filters_passthrough", func(t *testing.T) { + out := filterByQual(items, "", "", key) + if len(out) != 4 { + t.Fatalf("expected all 4 entries unfiltered, got %d", len(out)) + } + }) + + t.Run("schema_only", func(t *testing.T) { + out := filterByQual(items, "public", "", key) + if len(out) != 2 { + t.Fatalf("expected 2 public entries, got %d", len(out)) + } + for _, q := range out { + if q.s != "public" { + t.Errorf("leaked non-public entry %+v", q) + } + } + }) + + t.Run("table_only_crosses_schemas", func(t *testing.T) { + out := filterByQual(items, "", "orders", key) + if len(out) != 2 { + t.Fatalf("expected both orders entries across schemas, got %d", len(out)) + } + }) + + t.Run("schema_and_table_and_narrow", func(t *testing.T) { + out := filterByQual(items, "billing", "orders", key) + if len(out) != 1 || out[0].s != "billing" || out[0].n != "orders" { + t.Fatalf("expected the unique billing.orders entry, got %+v", out) + } + }) +} + // metaJSONResult merges the payload at the top level and injects _meta below // it; the body must remain valid JSON for downstream MCP transport. func TestMetaJSONResult_ProducesValidJSON(t *testing.T) { diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index c99b3af..93dc4d4 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -112,14 +112,6 @@ func (s *Server) Register(srv *mcpserver.MCPServer) { ), s.handleLintSchema, ) - srv.AddTool( - mcp.NewTool("compare_nodes", - mcp.WithDescription("Per-node stats: reltuples, relpages, scans, size, indexes"), - mcp.WithString("table", mcp.Required(), mcp.Description("Table name.")), - mcp.WithString("schema", mcp.Description("Schema filter.")), - ), - s.handleCompareNodes, - ) srv.AddTool( mcp.NewTool("detect", mcp.WithDescription("Health checks: stale stats, unused/bloated indexes, anomalies"), @@ -134,6 +126,10 @@ func (s *Server) Register(srv *mcpserver.MCPServer) { ), mcp.WithString("schema", mcp.Description("Schema filter.")), mcp.WithString("table", mcp.Description("Table filter.")), + mcp.WithNumber("limit", + mcp.DefaultNumber(50), + mcp.Description("Max entries per category (default 50, 0=all)."), + ), ), s.handleDetect, ) @@ -142,6 +138,10 @@ func (s *Server) Register(srv *mcpserver.MCPServer) { mcp.WithDescription("Autovacuum status, dead tuples, tuning hints"), mcp.WithString("schema", mcp.Description("Schema filter.")), mcp.WithString("table", mcp.Description("Table filter.")), + mcp.WithNumber("limit", + mcp.DefaultNumber(50), + mcp.Description("Max entries (default 50, 0=all)."), + ), ), s.handleVacuumHealth, ) diff --git a/internal/mcp/tools_registration_test.go b/internal/mcp/tools_registration_test.go index e06ce58..d5897cb 100644 --- a/internal/mcp/tools_registration_test.go +++ b/internal/mcp/tools_registration_test.go @@ -35,7 +35,7 @@ func TestToolsRegistration_EveryListedToolHasHandler(t *testing.T) { // minimal valid args for tools that require them; everything else // is fine with nil since we only care about handler resolution. switch tool.Name { - case "describe_table", "find_related", "compare_nodes": + case "describe_table", "find_related": req.Params.Arguments = map[string]any{"table": "users"} case "search_schema": req.Params.Arguments = map[string]any{"query": "users"} @@ -89,7 +89,6 @@ func TestToolsRegistration_InputSchemaShape(t *testing.T) { "find_related": {"table"}, "validate_query": {"sql"}, "check_migration": {"ddl"}, - "compare_nodes": {"table"}, "advise": {"sql"}, "analyze_plan": {"sql", "plan_json"}, } @@ -135,7 +134,6 @@ func TestToolsRegistration_OfflineToolSurface(t *testing.T) { "validate_query": true, "check_migration": true, "lint_schema": true, - "compare_nodes": true, "detect": true, "vacuum_health": true, "reload_schema": true, diff --git a/internal/schema/stale_sort_test.go b/internal/schema/stale_sort_test.go new file mode 100644 index 0000000..1ec15d9 --- /dev/null +++ b/internal/schema/stale_sort_test.go @@ -0,0 +1,53 @@ +package schema + +import ( + "testing" + "time" +) + +// DetectStaleStats must return entries worst-first so a downstream response cap +// keeps the tables that most urgently need ANALYZE rather than an arbitrary +// slice. "Worst" has a deliberate ordering: a table that has NEVER been analyzed +// (nil days-ago) outranks any table that was analyzed at some point, and among +// the ever-analyzed the one analyzed longest ago comes first. We seed three +// tables on a single primary node — one never analyzed, one 30 days stale, one +// 10 days stale — in an order that does NOT match the desired output, so a +// passing test proves the sort actually reordered them rather than coincidence. +func TestDetectStaleStats_WorstFirstOrdering(t *testing.T) { + now := time.Now().UTC() + d10 := now.Add(-10 * 24 * time.Hour) + d30 := now.Add(-30 * 24 * time.Hour) + + // Insertion order: 10d, never, 30d — intentionally NOT the expected order. + a := &AnnotatedSchema{ + Schema: &SchemaSnapshot{}, + Merged: &MergedActivity{Nodes: []NodeActivity{{ + Node: NodeIdentity{Source: "primary"}, + Tables: []TableActivityEntry{ + {Table: qual("public", "recent"), Activity: TableActivity{LastAnalyze: &d10}}, + {Table: qual("public", "never"), Activity: TableActivity{}}, + {Table: qual("public", "ancient"), Activity: TableActivity{LastAnalyze: &d30}}, + }, + }}}, + } + + entries := DetectStaleStats(a, 7) + if len(entries) != 3 { + t.Fatalf("expected all 3 tables stale past the 7-day threshold, got %d", len(entries)) + } + + // never-analyzed first, then 30-day, then 10-day. + gotOrder := []string{entries[0].Table, entries[1].Table, entries[2].Table} + wantOrder := []string{"never", "ancient", "recent"} + for i := range wantOrder { + if gotOrder[i] != wantOrder[i] { + t.Errorf("position %d: got %q, want %q (full order: %v)", i, gotOrder[i], wantOrder[i], gotOrder) + } + } + + // The never-analyzed table specifically must carry a nil days-ago so callers + // can render it as "never" rather than "0 days ago". + if entries[0].LastAnalyzedDaysAgo != nil { + t.Errorf("never-analyzed table should have nil LastAnalyzedDaysAgo, got %v", *entries[0].LastAnalyzedDaysAgo) + } +} diff --git a/internal/schema/types.go b/internal/schema/types.go index 83b7d1f..93b5842 100644 --- a/internal/schema/types.go +++ b/internal/schema/types.go @@ -1,6 +1,9 @@ package schema -import "time" +import ( + "sort" + "time" +) // DDL-only schema snapshot; sizing/activity live in AnnotatedSchema type SchemaSnapshot struct { @@ -280,6 +283,20 @@ func DetectStaleStats(a *AnnotatedSchema, staleDays int64) []StaleStatsEntry { } } } + // Worst-first so a downstream cap keeps the most stale; nil = never analyzed = most stale (might revisit later) + sort.SliceStable(entries, func(i, j int) bool { + di, dj := entries[i].LastAnalyzedDaysAgo, entries[j].LastAnalyzedDaysAgo + switch { + case di == nil && dj == nil: + return false + case di == nil: + return true + case dj == nil: + return false + default: + return *di > *dj + } + }) return entries }