From 22957edde267dc4b3e7d401a34fe05fda76ac668 Mon Sep 17 00:00:00 2001 From: agentnightshift Date: Mon, 29 Jun 2026 15:52:25 -0500 Subject: [PATCH] feat(retrieval): add retrieval event logging and evaluation harness - Add RETRIEVAL_EVENT_LOGGING_ENABLED (default false) for detailed search/fetch telemetry in separate retrieval_query_events collection (90-day TTL via Firestore, linked to parent tool_call events) - New retrievalEvaluation.ts implementing: - FirestoreRetrievalEvalCaseStore + replace/list helpers - buildSyntheticEvalCases + correlateImplicitFetches (search+fetch pairs within time window become positive labels) - computeRetrievalEvalMetrics (hit@k, recall@k, MRR, zero-result rate, p50/p95 latency) - New functions/scripts/retrieval-eval.ts + package.json scripts: eval:generate (plant isolated corpus + cases in memory_vectors_eval) eval:import (turn recent production retrieval events into cases) eval:run (--mode isolated|production, --cutoffs, reports JSON metrics) - Wire new config (retrievalEventLoggingEnabled, topK, memoryCollection) through app.ts into createMetaCortexMcpServer and observer - mcpServer: extend execute wrapper to accept optional retrieval descriptors; search_context and fetch_context now emit rich events when enabled (with scores converted from cosine distance) - Test updates: InMemoryToolCallObserver.listRetrievalEvents, integration assertions, config boolean parser tests, fakes - Smoke test: --fetch-first / MCP_FETCH_FIRST to drive fetch after search - Infra: firestore indexes for memory_vectors_eval, rules deny for new collections (retrieval_query_events, retrieval_eval_cases, *_eval*), deploy-firestore-ttl.sh updated - Docs + .env.example updated All tests green (49 passing). --- README.md | 31 ++ firestore.indexes.json | 115 ++++- firestore.rules | 12 + functions/.env.example | 3 + functions/package.json | 3 + functions/scripts/mcp-smoke-test.mjs | 13 +- functions/scripts/retrieval-eval.ts | 492 +++++++++++++++++++++ functions/src/app.ts | 4 + functions/src/config.ts | 28 ++ functions/src/mcpServer.ts | 77 +++- functions/src/observability.ts | 91 +++- functions/src/retrievalEvaluation.ts | 328 ++++++++++++++ functions/test/config.test.ts | 18 + functions/test/mcp.integration.test.ts | 30 ++ functions/test/retrievalEvaluation.test.ts | 196 ++++++++ functions/test/support/fakes.ts | 27 +- scripts/deploy-firestore-ttl.sh | 1 + 17 files changed, 1448 insertions(+), 21 deletions(-) create mode 100644 functions/scripts/retrieval-eval.ts create mode 100644 functions/src/retrievalEvaluation.ts create mode 100644 functions/test/retrievalEvaluation.test.ts diff --git a/README.md b/README.md index 60b32ec..3772b2f 100644 --- a/README.md +++ b/README.md @@ -402,6 +402,11 @@ After deployment, there are three places to look: - `memory_events` in Firestore shows client-attributed tool usage over time - Cloud Logging shows request failures and structured tool-event logs +When `RETRIEVAL_EVENT_LOGGING_ENABLED=true`, `retrieval_query_events` also stores +full search queries, filters, limits, ranked result ids/scores, and fetch ids. This +collection is separate because the full queries may contain sensitive user text; +it uses the same 90-day TTL target as `memory_events`. + `memory_events` records one document per tool call and one document per ingress rejection. Events include: - `client_id` @@ -438,9 +443,35 @@ Search events do include a short `query_preview`, but the observability collecti Retention is handled with Firestore TTL policies: - `memory_events.expires_at` targets 90-day audit retention +- `retrieval_query_events.expires_at` targets 90-day retrieval telemetry retention - `memory_vectors_write_fingerprints.expires_at` targets 30-day fingerprint retention - fingerprint documents keep numeric `dedupe_expires_at` for the short duplicate-write window +## Retrieval evaluation + +The evaluation corpus is stored in `retrieval_eval_cases`; isolated synthetic +memories are stored separately in `memory_vectors_eval`. Cases have no lifecycle +field: regeneration replaces the selected source partition and removes obsolete +cases. + +```bash +# Seed a deterministic isolated corpus and replace its eval cases. +npm --prefix functions run eval:generate + +# Measure hit@k, recall@k, MRR, empty results, and p50/p95 latency. +npm --prefix functions run eval:run -- --mode isolated + +# Explicitly convert recent production search-to-fetch evidence into eval cases. +npm --prefix functions run eval:import -- --lookback-hours 168 + +# Run imported production cases through the deployed MCP endpoint. +npm --prefix functions run eval:run -- --mode production --url "$MCP_BASE_URL" +``` + +Production retrieval events are evidence only. They do not become benchmark cases +until `eval:import` is run, and a successful fetch is treated as a positive label; +the harness does not infer negative relevance judgments. + ## Quick start 1. Install dependencies: diff --git a/firestore.indexes.json b/firestore.indexes.json index 26da8ac..e514f59 100644 --- a/firestore.indexes.json +++ b/firestore.indexes.json @@ -54,7 +54,120 @@ } } ] + }, + { + "collectionGroup": "memory_vectors_eval", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "metadata.module_name", + "order": "ASCENDING" + }, + { + "fieldPath": "embedding", + "vectorConfig": { + "dimension": 768, + "flat": {} + } + } + ] + }, + { + "collectionGroup": "memory_vectors_eval", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "metadata.branch_state", + "order": "ASCENDING" + }, + { + "fieldPath": "embedding", + "vectorConfig": { + "dimension": 768, + "flat": {} + } + } + ] + }, + { + "collectionGroup": "memory_vectors_eval", + "queryScope": "COLLECTION", + "fields": [ + { + "fieldPath": "metadata.branch_state", + "order": "ASCENDING" + }, + { + "fieldPath": "metadata.module_name", + "order": "ASCENDING" + }, + { + "fieldPath": "embedding", + "vectorConfig": { + "dimension": 768, + "flat": {} + } + } + ] } ], - "fieldOverrides": [] + "fieldOverrides": [ + { + "collectionGroup": "memory_events", + "fieldPath": "expires_at", + "ttl": true, + "indexes": [ + { + "order": "ASCENDING", + "queryScope": "COLLECTION" + }, + { + "order": "DESCENDING", + "queryScope": "COLLECTION" + }, + { + "arrayConfig": "CONTAINS", + "queryScope": "COLLECTION" + } + ] + }, + { + "collectionGroup": "memory_vectors_write_fingerprints", + "fieldPath": "expires_at", + "ttl": true, + "indexes": [ + { + "order": "ASCENDING", + "queryScope": "COLLECTION" + }, + { + "order": "DESCENDING", + "queryScope": "COLLECTION" + }, + { + "arrayConfig": "CONTAINS", + "queryScope": "COLLECTION" + } + ] + }, + { + "collectionGroup": "retrieval_query_events", + "fieldPath": "expires_at", + "ttl": true, + "indexes": [ + { + "order": "ASCENDING", + "queryScope": "COLLECTION" + }, + { + "order": "DESCENDING", + "queryScope": "COLLECTION" + }, + { + "arrayConfig": "CONTAINS", + "queryScope": "COLLECTION" + } + ] + } + ] } diff --git a/firestore.rules b/firestore.rules index fa63a05..90fc5e1 100644 --- a/firestore.rules +++ b/firestore.rules @@ -8,8 +8,20 @@ service cloud.firestore { match /memory_events/{document=**} { allow read, write: if false; } + match /retrieval_query_events/{document=**} { + allow read, write: if false; + } + match /retrieval_eval_cases/{document=**} { + allow read, write: if false; + } + match /memory_vectors_eval/{document=**} { + allow read, write: if false; + } match /memory_vectors_write_fingerprints/{document=**} { allow read, write: if false; } + match /memory_vectors_eval_write_fingerprints/{document=**} { + allow read, write: if false; + } } } diff --git a/functions/.env.example b/functions/.env.example index 33c6c2e..f52ea5e 100644 --- a/functions/.env.example +++ b/functions/.env.example @@ -12,6 +12,9 @@ GEMINI_MERGE_MODEL=gemini-3.5-flash GEMINI_GENERATION_VERTEX_LOCATION=global GEMINI_EMBEDDING_DIMENSIONS=768 MEMORY_COLLECTION=memory_vectors +# Opt in to full search queries, ranked ids/scores, and fetch ids for retrieval evaluation. +RETRIEVAL_EVENT_LOGGING_ENABLED=false +RETRIEVAL_EVAL_MEMORY_COLLECTION=memory_vectors_eval SEARCH_RESULT_LIMIT=5 DEFAULT_FILTER_STATE=active SERVICE_NAME=metacortex diff --git a/functions/package.json b/functions/package.json index 25c1a05..96847d0 100644 --- a/functions/package.json +++ b/functions/package.json @@ -11,6 +11,9 @@ "build": "tsc -p tsconfig.json", "clean": "node -e \"const fs=require('fs'); fs.rmSync('lib',{recursive:true,force:true}); fs.rmSync('coverage',{recursive:true,force:true});\"", "backfill:ttl": "node scripts/backfill-firestore-ttl.mjs", + "eval:generate": "tsx scripts/retrieval-eval.ts generate-isolated", + "eval:import": "tsx scripts/retrieval-eval.ts import-production", + "eval:run": "tsx scripts/retrieval-eval.ts run", "serve": "cd .. && firebase emulators:start --only functions,firestore", "shell": "firebase functions:shell", "smoke": "node scripts/mcp-smoke-test.mjs", diff --git a/functions/scripts/mcp-smoke-test.mjs b/functions/scripts/mcp-smoke-test.mjs index 0db2841..fb6aa04 100644 --- a/functions/scripts/mcp-smoke-test.mjs +++ b/functions/scripts/mcp-smoke-test.mjs @@ -64,6 +64,10 @@ const imageMimeType = readArg( process.env.MCP_IMAGE_MIME_TYPE ?? inferMimeType(imageFile) ); const artifactRef = readArg("artifact-ref", process.env.MCP_ARTIFACT_REF); +const fetchFirst = readArg( + "fetch-first", + process.env.MCP_FETCH_FIRST ?? "false" +) === "true"; if (!url) { console.error("Missing MCP base URL. Pass --url or set MCP_BASE_URL."); @@ -157,10 +161,6 @@ try { rememberedId = extractRememberedId(rememberText); } else if (mode === "search-only") { ensureTools(toolNames, ["search_context"]); - - if (toolNames.includes("remember_context")) { - throw new Error("search-only mode expected remember_context to be unavailable"); - } } else { throw new Error(`Unsupported smoke mode: ${mode}`); } @@ -178,12 +178,13 @@ try { const searchText = requireSuccessfulToolResult(searchResult, "search_context"); console.log(searchText); - if (mode === "browser-read-write") { + if (mode === "browser-read-write" || fetchFirst) { + ensureTools(toolNames, ["fetch_context"]); const memoryId = rememberedId ?? extractMemoryId(searchText); if (!memoryId) { throw new Error( - "browser-read-write mode expected remember_context or search_context to return an id" + "fetch-first expected remember_context or search_context to return an id" ); } diff --git a/functions/scripts/retrieval-eval.ts b/functions/scripts/retrieval-eval.ts new file mode 100644 index 0000000..3d09c57 --- /dev/null +++ b/functions/scripts/retrieval-eval.ts @@ -0,0 +1,492 @@ +import { Client } from "@modelcontextprotocol/sdk/client/index.js"; +import { StreamableHTTPClientTransport } from "@modelcontextprotocol/sdk/client/streamableHttp.js"; +import { getApp, getApps, initializeApp } from "firebase-admin/app"; +import { getFirestore } from "firebase-admin/firestore"; +import fs from "node:fs"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +import { loadConfig } from "../src/config.js"; +import { + GeminiEmbeddingClient, + GeminiMultimodalPreparer +} from "../src/embeddings.js"; +import { FirestoreMemoryRepository } from "../src/memoryRepository.js"; +import type { RetrievalEvent } from "../src/observability.js"; +import { + buildSyntheticEvalCases, + computeRetrievalEvalMetrics, + correlateImplicitFetches, + FirestoreRetrievalEvalCaseStore, + replaceRetrievalEvalCases, + type RetrievalEvalCase, + type RetrievalEvalObservation, + type RetrievalEvalTargetMode, + type SyntheticEvalDefinition +} from "../src/retrievalEvaluation.js"; +import { MetaCortexService } from "../src/service.js"; + +const scriptDirectory = path.dirname(fileURLToPath(import.meta.url)); +const functionsDirectory = path.resolve(scriptDirectory, ".."); +loadEnvironment(functionsDirectory); + +const command = process.argv[2]; +const cliArgs = process.argv.slice(3); +const projectId = readArg("project", process.env.GCLOUD_PROJECT ?? "my-brain-88870"); +const evalMemoryCollection = readArg( + "memory-collection", + process.env.RETRIEVAL_EVAL_MEMORY_COLLECTION ?? "memory_vectors_eval" +); +const app = getApps().length === 0 ? initializeApp({ projectId }) : getApp(); +const firestore = getFirestore(app); +const evalCaseStore = new FirestoreRetrievalEvalCaseStore(firestore); + +async function main(): Promise { + switch (command) { + case "generate-isolated": + await generateIsolatedCorpus(); + break; + case "import-production": + await importProductionEvents(); + break; + case "run": + await runEvaluation(); + break; + default: + printUsage(); + process.exitCode = 1; + } +} + +async function generateIsolatedCorpus(): Promise { + if (evalMemoryCollection === "memory_vectors") { + throw new Error("The isolated eval corpus cannot use the production memory_vectors collection"); + } + + const service = createDirectService(evalMemoryCollection); + await deleteCollection(evalMemoryCollection); + await deleteCollection(`${evalMemoryCollection}_write_fingerprints`); + + const memoryIdsByKey = new Map(); + + for (const memory of SYNTHETIC_MEMORIES) { + const stored = await service.rememberContext({ + content: memory.content, + topic: memory.topic, + branch_state: "active" + }); + memoryIdsByKey.set(memory.key, stored.id); + } + + const timestamp = Date.now(); + const cases = buildSyntheticEvalCases({ + definitions: SYNTHETIC_CASES, + memoryIdsByKey, + memoryCollection: evalMemoryCollection, + timestamp + }); + + for (const evalCase of cases) { + await service.searchContext(toSearchInput(evalCase)); + + for (const positiveId of evalCase.positive_ids) { + await service.fetchContext({ id: positiveId }); + } + } + + await replaceRetrievalEvalCases( + evalCaseStore, + "isolated", + "synthetic_flow", + cases + ); + console.log( + JSON.stringify( + { + generated_cases: cases.length, + seeded_memories: memoryIdsByKey.size, + memory_collection: evalMemoryCollection + }, + null, + 2 + ) + ); +} + +async function importProductionEvents(): Promise { + const lookbackHours = Number(readArg("lookback-hours", "168")); + + if (!Number.isFinite(lookbackHours) || lookbackHours <= 0) { + throw new Error("--lookback-hours must be a positive number"); + } + + const since = Date.now() - lookbackHours * 60 * 60 * 1000; + const snapshot = await firestore + .collection("retrieval_query_events") + .where("timestamp", ">=", since) + .get(); + const events = snapshot.docs.map(document => document.data() as RetrievalEvent); + const cases = correlateImplicitFetches(events); + + await replaceRetrievalEvalCases( + evalCaseStore, + "production", + "observed_events", + cases + ); + console.log( + JSON.stringify( + { + imported_cases: cases.length, + scanned_events: events.length, + lookback_hours: lookbackHours + }, + null, + 2 + ) + ); +} + +async function runEvaluation(): Promise { + const mode = readArg("mode", "isolated") as RetrievalEvalTargetMode; + + if (mode !== "isolated" && mode !== "production") { + throw new Error("--mode must be isolated or production"); + } + + const cases = await evalCaseStore.listCases(mode); + + if (cases.length === 0) { + throw new Error(`No ${mode} retrieval eval cases exist`); + } + + const observations = mode === "isolated" + ? await runDirectCases(cases) + : await runProductionCases(cases); + const cutoffs = readArg("cutoffs", "1,5") + .split(",") + .map(value => Number(value.trim())); + + console.log(JSON.stringify(computeRetrievalEvalMetrics(observations, cutoffs), null, 2)); +} + +async function runDirectCases( + cases: readonly RetrievalEvalCase[] +): Promise { + const collections = new Set(cases.map(evalCase => evalCase.memory_collection)); + + if (collections.size !== 1) { + throw new Error("Isolated eval cases must reference one memory collection"); + } + + const service = createDirectService(cases[0]!.memory_collection); + const observations: RetrievalEvalObservation[] = []; + + for (const evalCase of cases) { + const startedAt = performance.now(); + const result = await service.searchContext(toSearchInput(evalCase)); + observations.push({ + case_id: evalCase.case_id, + positive_ids: evalCase.positive_ids, + returned_ids: result.matches.map(match => match.id), + latency_ms: performance.now() - startedAt + }); + } + + return observations; +} + +async function runProductionCases( + cases: readonly RetrievalEvalCase[] +): Promise { + const url = readArg("url", process.env.MCP_BASE_URL); + const token = readArg( + "token", + process.env.MCP_ADMIN_TOKEN ?? process.env.MCP_AUTH_TOKEN + ); + + if (!url || !token) { + throw new Error("Production eval requires --url/MCP_BASE_URL and --token/MCP_ADMIN_TOKEN"); + } + + const client = new Client({ name: "metacortex-retrieval-eval", version: "1.0.0" }); + const transport = new StreamableHTTPClientTransport(new URL(url), { + requestInit: { headers: { Authorization: `Bearer ${token}` } } + }); + const observations: RetrievalEvalObservation[] = []; + + try { + await client.connect(transport); + + for (const evalCase of cases) { + const startedAt = performance.now(); + const result = await client.callTool({ + name: "search_context", + arguments: toSearchInput(evalCase) + }); + const latencyMs = performance.now() - startedAt; + const payload = parseToolPayload(result); + const matches = Array.isArray(payload.matches) ? payload.matches : []; + observations.push({ + case_id: evalCase.case_id, + positive_ids: evalCase.positive_ids, + returned_ids: matches + .map(match => + match && typeof match === "object" && typeof match.id === "string" + ? match.id + : undefined + ) + .filter((id): id is string => Boolean(id)), + latency_ms: latencyMs + }); + } + } finally { + await client.close().catch(() => undefined); + await transport.close().catch(() => undefined); + } + + return observations; +} + +function createDirectService(memoryCollection: string): MetaCortexService { + const config = loadConfig(process.env); + const embeddings = new GeminiEmbeddingClient({ + vertexai: true, + project: projectId, + model: config.embeddingModel, + dimensions: config.embeddingDimensions + }); + const contentPreparer = new GeminiMultimodalPreparer({ + vertexai: true, + project: projectId, + location: config.generationVertexLocation, + model: config.multimodalModel + }); + const repository = new FirestoreMemoryRepository(firestore, memoryCollection); + + return new MetaCortexService( + contentPreparer, + embeddings, + repository, + config, + { + merge: async () => { + throw new Error("Merge is unavailable in retrieval evaluation"); + } + } + ); +} + +function toSearchInput(evalCase: RetrievalEvalCase) { + return { + query: evalCase.query, + filter_topic: evalCase.filters.filter_topic ?? undefined, + filter_state: evalCase.filters.filter_state, + limit: evalCase.limit + }; +} + +function parseToolPayload(result: unknown): Record { + if (!result || typeof result !== "object") { + throw new Error("search_context returned an invalid MCP result"); + } + + const candidate = result as { isError?: boolean; content?: unknown }; + + if (candidate.isError) { + throw new Error("search_context returned an MCP error"); + } + + if (!Array.isArray(candidate.content)) { + throw new Error("search_context returned no MCP content array"); + } + + const text = candidate.content + .filter( + (item): item is { type: "text"; text: string } => + Boolean( + item && + typeof item === "object" && + (item as { type?: unknown }).type === "text" && + typeof (item as { text?: unknown }).text === "string" + ) + ) + .map(item => item.text) + .join("\n"); + const payload: unknown = JSON.parse(text); + + if (!payload || typeof payload !== "object" || Array.isArray(payload)) { + throw new Error("search_context returned a non-object payload"); + } + + const objectPayload = payload as Record; + + if (objectPayload.error) { + throw new Error( + `search_context returned an error: ${JSON.stringify(objectPayload.error)}` + ); + } + + return objectPayload; +} + +async function deleteCollection(collectionName: string): Promise { + while (true) { + const snapshot = await firestore.collection(collectionName).limit(450).get(); + + if (snapshot.empty) { + return; + } + + const batch = firestore.batch(); + + for (const document of snapshot.docs) { + batch.delete(document.ref); + } + + await batch.commit(); + } +} + +function readArg(name: string, fallback?: string): string { + const index = cliArgs.indexOf(`--${name}`); + return index === -1 ? fallback ?? "" : cliArgs[index + 1] ?? fallback ?? ""; +} + +function loadEnvironment(directory: string): void { + const explicitlySet = new Set(Object.keys(process.env)); + + for (const fileName of [".env", ".env.prod"]) { + const filePath = path.join(directory, fileName); + + if (!fs.existsSync(filePath)) { + continue; + } + + for (const rawLine of fs.readFileSync(filePath, "utf8").split(/\r?\n/)) { + const line = rawLine.trim(); + + if (!line || line.startsWith("#")) { + continue; + } + + const separator = line.indexOf("="); + + if (separator === -1) { + continue; + } + + const key = line.slice(0, separator).trim(); + + if (!explicitlySet.has(key)) { + process.env[key] = line.slice(separator + 1).trim(); + } + } + } +} + +function printUsage(): void { + console.error([ + "Usage:", + " npm run eval:generate", + " npm run eval:import -- --lookback-hours 168", + " npm run eval:run -- --mode isolated", + " npm run eval:run -- --mode production --url " + ].join("\n")); +} + +const SYNTHETIC_MEMORIES = [ + { + key: "ktor-darwin", + topic: "eval-networking", + content: "The shared Kotlin networking layer uses Ktor HttpClient with the Darwin engine on iOS and the OkHttp engine on Android." + }, + { + key: "retrofit-legacy", + topic: "eval-networking", + content: "The retired Android-only client used Retrofit with Gson and had no iOS implementation." + }, + { + key: "vector-dimensions", + topic: "eval-firestore", + content: "Firestore vector indexes and GEMINI_EMBEDDING_DIMENSIONS must both use 768 dimensions with cosine distance." + }, + { + key: "firestore-mode", + topic: "eval-firestore", + content: "The MetaCortex database must use Firestore Native mode rather than Datastore mode." + }, + { + key: "client-scoping", + topic: "eval-security", + content: "Each custom MCP client profile has its own bearer token, tool allowlist, branch-state allowlist, and browser origin allowlist." + }, + { + key: "cors-default", + topic: "eval-security", + content: "MCP_ALLOWED_ORIGINS applies only to the default admin endpoint and defaults to denying browser origins." + }, + { + key: "write-fingerprint", + topic: "eval-writes", + content: "A write fingerprint suppresses duplicate memory writes for fifteen minutes while the fingerprint document is retained for thirty days." + }, + { + key: "consolidation", + topic: "eval-lifecycle", + content: "Consolidation merges at least two source memories into one active record and deprecates each source with superseded_by set to the merged id." + } +] as const; + +const SYNTHETIC_CASES: readonly SyntheticEvalDefinition[] = [ + { + case_id: "networking-ios-engine", + query: "Which HTTP engine does the shared mobile client use on iOS?", + filter_topic: "eval-networking", + filter_state: "active", + limit: 5, + positive_keys: ["ktor-darwin"] + }, + { + case_id: "firestore-vector-dimensions", + query: "GEMINI_EMBEDDING_DIMENSIONS cosine index size", + filter_topic: "eval-firestore", + filter_state: "active", + limit: 5, + positive_keys: ["vector-dimensions"] + }, + { + case_id: "firestore-native-mode", + query: "Which Firestore database mode is required?", + filter_topic: "eval-firestore", + filter_state: "active", + limit: 5, + positive_keys: ["firestore-mode"] + }, + { + case_id: "scoped-client-controls", + query: "How are individual MCP clients restricted?", + filter_topic: "eval-security", + filter_state: "active", + limit: 5, + positive_keys: ["client-scoping"] + }, + { + case_id: "duplicate-write-window", + query: "How long are repeated memory writes deduplicated?", + filter_topic: "eval-writes", + filter_state: "active", + limit: 5, + positive_keys: ["write-fingerprint"] + }, + { + case_id: "consolidation-effects", + query: "What happens to source records after consolidation?", + filter_topic: "eval-lifecycle", + filter_state: "active", + limit: 5, + positive_keys: ["consolidation"] + } +]; + +await main(); diff --git a/functions/src/app.ts b/functions/src/app.ts index 680dc70..c4b2aa4 100644 --- a/functions/src/app.ts +++ b/functions/src/app.ts @@ -127,6 +127,10 @@ function registerMcpRoutes( clientId: profile.id, serviceName: runtime.config.serviceName, serviceVersion: runtime.config.serviceVersion, + memoryCollection: runtime.config.memoryCollection, + retrievalEventLoggingEnabled: + runtime.config.retrievalEventLoggingEnabled, + topK: runtime.config.topK, defaultFilterState: selectDefaultFilterState(runtime.config, profile), allowedTools: profile.allowedTools, allowedFilterStates: profile.allowedFilterStates diff --git a/functions/src/config.ts b/functions/src/config.ts index 86f7924..b0d7022 100644 --- a/functions/src/config.ts +++ b/functions/src/config.ts @@ -24,12 +24,35 @@ export interface AppConfig { generationVertexLocation: string; embeddingDimensions: number; memoryCollection: string; + retrievalEventLoggingEnabled: boolean; topK: number; defaultFilterState: BranchState; defaultClientProfile: ClientProfile; clientProfiles: ClientProfile[]; } +function parseBoolean( + value: string | undefined, + fallback: boolean, + key: string +): boolean { + if (!value?.trim()) { + return fallback; + } + + const normalized = value.trim().toLowerCase(); + + if (normalized === "true") { + return true; + } + + if (normalized === "false") { + return false; + } + + throw new MissingConfigurationError(`${key} must be true or false when provided`); +} + export class MissingConfigurationError extends Error { constructor(message: string) { super(message); @@ -276,6 +299,11 @@ export function loadConfig(env: NodeJS.ProcessEnv = process.env): AppConfig { "GEMINI_EMBEDDING_DIMENSIONS" ), memoryCollection: env.MEMORY_COLLECTION?.trim() || "memory_vectors", + retrievalEventLoggingEnabled: parseBoolean( + env.RETRIEVAL_EVENT_LOGGING_ENABLED, + false, + "RETRIEVAL_EVENT_LOGGING_ENABLED" + ), topK: parsePositiveInteger(env.SEARCH_RESULT_LIMIT, 5, "SEARCH_RESULT_LIMIT"), defaultFilterState: defaultFilterState as BranchState, defaultClientProfile, diff --git a/functions/src/mcpServer.ts b/functions/src/mcpServer.ts index 2593c9d..7006a7f 100644 --- a/functions/src/mcpServer.ts +++ b/functions/src/mcpServer.ts @@ -4,7 +4,10 @@ import * as z from "zod/v4"; import type { AppConfig } from "./config.js"; import { HttpError } from "./errors.js"; import { normalizeOptionalText } from "./normalize.js"; -import type { ToolCallObserver } from "./observability.js"; +import type { + RetrievalEventInput, + ToolCallObserver +} from "./observability.js"; import { buildConsolidatePayload, buildDeprecatePayload, @@ -21,7 +24,15 @@ import { export function createMetaCortexMcpServer( service: MetaCortexService, - config: Pick & { + config: Pick< + AppConfig, + | "serviceName" + | "serviceVersion" + | "defaultFilterState" + | "memoryCollection" + | "retrievalEventLoggingEnabled" + | "topK" + > & { observer: ToolCallObserver; clientId: string; allowedTools: readonly McpToolName[]; @@ -161,7 +172,11 @@ export function createMetaCortexMcpServer( toolName: McpToolName, requestSummary: Record, run: () => Promise, - summarizeResult: (result: Result) => Record + summarizeResult: (result: Result) => Record, + retrieval?: { + request: RetrievalEventInput; + summarizeResult: (result: Result) => Partial; + } ): Promise => { const startedAt = Date.now(); @@ -174,7 +189,15 @@ export function createMetaCortexMcpServer( status: "success", latency_ms: Date.now() - startedAt, request: requestSummary, - response: summarizeResult(result) + response: summarizeResult(result), + ...(retrieval + ? { + retrieval: { + ...retrieval.request, + ...retrieval.summarizeResult(result) + } as RetrievalEventInput + } + : {}) }); return result; @@ -185,6 +208,7 @@ export function createMetaCortexMcpServer( status: "error", latency_ms: Date.now() - startedAt, request: requestSummary, + ...(retrieval ? { retrieval: retrieval.request } : {}), error: summarizeToolError(error) }); @@ -262,10 +286,12 @@ export function createMetaCortexMcpServer( }, async args => { const requestedFilterState = args.filter_state ?? config.defaultFilterState; + const normalizedFilterTopic = normalizeOptionalText(args.filter_topic); + const requestedLimit = args.limit ?? config.topK; const requestSummary = { query_preview: truncateText(args.query), query_length: args.query.trim().length, - filter_topic: normalizeOptionalText(args.filter_topic), + filter_topic: normalizedFilterTopic, filter_state: requestedFilterState, limit: args.limit }; @@ -287,7 +313,34 @@ export function createMetaCortexMcpServer( result_ids: searchResult.matches.map(match => match.id), filter_state: searchResult.appliedFilters.filter_state, filter_topic: searchResult.appliedFilters.filter_topic - }) + }), + config.retrievalEventLoggingEnabled + ? { + request: { + event_type: "search", + memory_collection: config.memoryCollection, + query: args.query.trim(), + filter_topic: normalizedFilterTopic, + filter_state: requestedFilterState, + limit: requestedLimit + }, + summarizeResult: searchResult => ({ + result_count: searchResult.matches.length, + results: searchResult.matches.map((match, index) => ({ + id: match.id, + rank: index + 1, + ...(typeof match.distance === "number" + ? { + score: Math.max( + 0, + Number((1 - match.distance).toFixed(6)) + ) + } + : {}) + })) + }) + } + : undefined ); return { @@ -336,7 +389,17 @@ export function createMetaCortexMcpServer( branch_state: fetched.item.metadata.branch_state, modality: fetched.item.metadata.modality, artifact_ref_count: fetched.item.metadata.artifact_refs?.length ?? 0 - }) + }), + config.retrievalEventLoggingEnabled + ? { + request: { + event_type: "fetch", + memory_collection: config.memoryCollection, + memory_id: args.id ?? args.document_id ?? "" + }, + summarizeResult: () => ({ found: true }) + } + : undefined ); return { diff --git a/functions/src/observability.ts b/functions/src/observability.ts index 264745c..3b0f86b 100644 --- a/functions/src/observability.ts +++ b/functions/src/observability.ts @@ -5,6 +5,7 @@ import { Firestore } from "firebase-admin/firestore"; import type { McpToolName } from "./types.js"; export const MEMORY_EVENT_COLLECTION = "memory_events"; +export const RETRIEVAL_EVENT_COLLECTION = "retrieval_query_events"; const MEMORY_EVENT_TTL_MS = 90 * 24 * 60 * 60 * 1000; export type RequestEventReason = @@ -31,6 +32,48 @@ export interface ToolCallEvent { error?: ToolCallEventError; } +export interface RankedRetrievalResult { + id: string; + rank: number; + score?: number; +} + +export interface SearchRetrievalEventInput { + event_type: "search"; + memory_collection: string; + query: string; + filter_topic?: string; + filter_state: string; + limit: number; + result_count?: number; + results?: RankedRetrievalResult[]; +} + +export interface FetchRetrievalEventInput { + event_type: "fetch"; + memory_collection: string; + memory_id: string; + found?: boolean; +} + +export type RetrievalEventInput = + | SearchRetrievalEventInput + | FetchRetrievalEventInput; + +interface RetrievalEventBase { + event_id: string; + tool_event_id: string; + client_id: string; + status: "success" | "error"; + timestamp: number; + expires_at: Date; + latency_ms?: number; + memory_collection: string; + error?: ToolCallEventError; +} + +export type RetrievalEvent = RetrievalEventBase & RetrievalEventInput; + export interface RequestEvent { event_id: string; event_type: "request"; @@ -55,6 +98,7 @@ export interface RecordToolCallEventInput { request: Record; response?: Record; error?: ToolCallEventError; + retrieval?: RetrievalEventInput; timestamp?: number; } @@ -98,7 +142,35 @@ export class FirestoreToolCallObserver implements ToolCallObserver { ...(input.error ? { error: input.error } : {}) }; - await this.persist("metaCortexMcp tool event", event); + const writes = [ + this.persist("metaCortexMcp tool event", event, this.collectionName) + ]; + + if (input.retrieval) { + const retrievalEvent: RetrievalEvent = { + event_id: randomUUID(), + tool_event_id: event.event_id, + client_id: input.client_id, + status: input.status, + timestamp, + expires_at: new Date(timestamp + MEMORY_EVENT_TTL_MS), + ...(typeof input.latency_ms === "number" + ? { latency_ms: input.latency_ms } + : {}), + ...input.retrieval, + ...(input.error ? { error: input.error } : {}) + }; + writes.push( + this.persist( + "metaCortexMcp retrieval event", + retrievalEvent, + RETRIEVAL_EVENT_COLLECTION, + false + ) + ); + } + + await Promise.all(writes); } async recordRequest(input: RecordRequestEventInput): Promise { @@ -119,17 +191,24 @@ export class FirestoreToolCallObserver implements ToolCallObserver { : {}) }; - await this.persist("metaCortexMcp request event", event); + await this.persist("metaCortexMcp request event", event, this.collectionName); } - private async persist(message: string, event: ObservabilityEvent): Promise { - const sanitizedEvent = stripUndefined(event) as ObservabilityEvent; + private async persist( + message: string, + event: ObservabilityEvent | RetrievalEvent, + collectionName: string, + logToConsole = true + ): Promise { + const sanitizedEvent = stripUndefined(event) as ObservabilityEvent | RetrievalEvent; - console.info(message, sanitizedEvent); + if (logToConsole) { + console.info(message, sanitizedEvent); + } try { await this.firestore - .collection(this.collectionName) + .collection(collectionName) .doc(sanitizedEvent.event_id) .set(sanitizedEvent); } catch (error) { diff --git a/functions/src/retrievalEvaluation.ts b/functions/src/retrievalEvaluation.ts new file mode 100644 index 0000000..18359be --- /dev/null +++ b/functions/src/retrievalEvaluation.ts @@ -0,0 +1,328 @@ +import type { Firestore } from "firebase-admin/firestore"; + +import type { RetrievalEvent } from "./observability.js"; +import type { BranchState } from "./types.js"; + +export const RETRIEVAL_EVAL_COLLECTION = "retrieval_eval_cases"; +const FIRESTORE_BATCH_LIMIT = 450; + +export type RetrievalEvalTargetMode = "isolated" | "production"; +export type RetrievalEvalSource = "synthetic_flow" | "observed_events"; + +export interface RetrievalEvalCase { + schema_version: 1; + case_id: string; + target_mode: RetrievalEvalTargetMode; + source: RetrievalEvalSource; + query: string; + filters: { + filter_topic?: string | null; + filter_state: BranchState; + }; + limit: number; + memory_collection: string; + positive_ids: string[]; + label_source: "implicit_fetch"; + created_at: number; + updated_at: number; +} + +export interface RetrievalEvalCaseStore { + deleteCases( + targetMode: RetrievalEvalTargetMode, + source: RetrievalEvalSource + ): Promise; + writeCases(cases: readonly RetrievalEvalCase[]): Promise; + listCases(targetMode: RetrievalEvalTargetMode): Promise; +} + +export class FirestoreRetrievalEvalCaseStore implements RetrievalEvalCaseStore { + constructor( + private readonly firestore: Firestore, + private readonly collectionName = RETRIEVAL_EVAL_COLLECTION + ) {} + + async deleteCases( + targetMode: RetrievalEvalTargetMode, + source: RetrievalEvalSource + ): Promise { + const snapshot = await this.firestore + .collection(this.collectionName) + .where("target_mode", "==", targetMode) + .where("source", "==", source) + .get(); + + for (const documents of chunk(snapshot.docs, FIRESTORE_BATCH_LIMIT)) { + const batch = this.firestore.batch(); + + for (const document of documents) { + batch.delete(document.ref); + } + + await batch.commit(); + } + } + + async writeCases(cases: readonly RetrievalEvalCase[]): Promise { + for (const caseChunk of chunk(cases, FIRESTORE_BATCH_LIMIT)) { + const batch = this.firestore.batch(); + + for (const evalCase of caseChunk) { + batch.set( + this.firestore.collection(this.collectionName).doc(evalCase.case_id), + evalCase + ); + } + + await batch.commit(); + } + } + + async listCases(targetMode: RetrievalEvalTargetMode): Promise { + const snapshot = await this.firestore + .collection(this.collectionName) + .where("target_mode", "==", targetMode) + .get(); + + return snapshot.docs + .map(document => document.data() as RetrievalEvalCase) + .filter(evalCase => evalCase.schema_version === 1) + .sort((left, right) => left.case_id.localeCompare(right.case_id)); + } +} + +export async function replaceRetrievalEvalCases( + store: RetrievalEvalCaseStore, + targetMode: RetrievalEvalTargetMode, + source: RetrievalEvalSource, + cases: readonly RetrievalEvalCase[] +): Promise { + await store.deleteCases(targetMode, source); + await store.writeCases(cases); +} + +export interface SyntheticEvalDefinition { + case_id: string; + query: string; + filter_topic?: string; + filter_state: BranchState; + limit: number; + positive_keys: string[]; +} + +export function buildSyntheticEvalCases(input: { + definitions: readonly SyntheticEvalDefinition[]; + memoryIdsByKey: ReadonlyMap; + memoryCollection: string; + timestamp: number; +}): RetrievalEvalCase[] { + return input.definitions.map(definition => ({ + schema_version: 1, + case_id: definition.case_id, + target_mode: "isolated", + source: "synthetic_flow", + query: definition.query, + filters: { + filter_topic: definition.filter_topic ?? null, + filter_state: definition.filter_state + }, + limit: definition.limit, + memory_collection: input.memoryCollection, + positive_ids: definition.positive_keys.map(key => { + const memoryId = input.memoryIdsByKey.get(key); + + if (!memoryId) { + throw new Error(`Synthetic eval definition references unknown memory key: ${key}`); + } + + return memoryId; + }), + label_source: "implicit_fetch", + created_at: input.timestamp, + updated_at: input.timestamp + })); +} + +export function correlateImplicitFetches( + events: readonly RetrievalEvent[], + options: { maxDelayMs?: number } = {} +): RetrievalEvalCase[] { + const maxDelayMs = options.maxDelayMs ?? 30 * 60 * 1000; + const searches = events + .filter( + (event): event is Extract => + event.event_type === "search" && event.status === "success" + ) + .sort((left, right) => left.timestamp - right.timestamp); + const fetches = events + .filter( + (event): event is Extract => + event.event_type === "fetch" && + event.status === "success" && + event.found === true + ) + .sort((left, right) => left.timestamp - right.timestamp); + const matches = new Map< + string, + { search: (typeof searches)[number]; positiveIds: Set; updatedAt: number } + >(); + + for (const fetch of fetches) { + const search = searches + .filter(candidate => + candidate.client_id === fetch.client_id && + candidate.memory_collection === fetch.memory_collection && + candidate.timestamp <= fetch.timestamp && + fetch.timestamp - candidate.timestamp <= maxDelayMs && + candidate.results?.some(result => result.id === fetch.memory_id) + ) + .at(-1); + + if (!search) { + continue; + } + + const existing = matches.get(search.event_id); + + if (existing) { + existing.positiveIds.add(fetch.memory_id); + existing.updatedAt = Math.max(existing.updatedAt, fetch.timestamp); + } else { + matches.set(search.event_id, { + search, + positiveIds: new Set([fetch.memory_id]), + updatedAt: fetch.timestamp + }); + } + } + + return [...matches.values()] + .map(({ search, positiveIds, updatedAt }) => ({ + schema_version: 1 as const, + case_id: `observed-${search.event_id.replaceAll("/", "-")}`, + target_mode: "production" as const, + source: "observed_events" as const, + query: search.query, + filters: { + filter_topic: search.filter_topic ?? null, + filter_state: search.filter_state as BranchState + }, + limit: search.limit, + memory_collection: search.memory_collection, + positive_ids: [...positiveIds], + label_source: "implicit_fetch" as const, + created_at: search.timestamp, + updated_at: updatedAt + })) + .sort((left, right) => left.case_id.localeCompare(right.case_id)); +} + +export interface RetrievalEvalObservation { + case_id: string; + positive_ids: string[]; + returned_ids: string[]; + latency_ms: number; +} + +export interface RetrievalEvalMetrics { + case_count: number; + hit_at_k: Record; + recall_at_k: Record; + mrr: number; + zero_result_count: number; + zero_result_rate: number; + latency_ms: { + p50: number; + p95: number; + }; + observations: RetrievalEvalObservation[]; +} + +export function computeRetrievalEvalMetrics( + observations: readonly RetrievalEvalObservation[], + cutoffs: readonly number[] = [1, 5] +): RetrievalEvalMetrics { + const normalizedCutoffs = [...new Set(cutoffs)] + .filter(cutoff => Number.isInteger(cutoff) && cutoff > 0) + .sort((left, right) => left - right); + const hitAtK: Record = {}; + const recallAtK: Record = {}; + + for (const cutoff of normalizedCutoffs) { + const hitTotal = observations.reduce((total, observation) => { + const positiveIds = new Set(observation.positive_ids); + const hit = observation.returned_ids + .slice(0, cutoff) + .some(id => positiveIds.has(id)); + return total + Number(hit); + }, 0); + const recallTotal = observations.reduce((total, observation) => { + const positiveIds = new Set(observation.positive_ids); + + if (positiveIds.size === 0) { + return total; + } + + const retrievedPositiveCount = new Set( + observation.returned_ids + .slice(0, cutoff) + .filter(id => positiveIds.has(id)) + ).size; + return total + retrievedPositiveCount / positiveIds.size; + }, 0); + + hitAtK[String(cutoff)] = safeAverage(hitTotal, observations.length); + recallAtK[String(cutoff)] = safeAverage(recallTotal, observations.length); + } + + const reciprocalRankTotal = observations.reduce((total, observation) => { + const positiveIds = new Set(observation.positive_ids); + const index = observation.returned_ids.findIndex(id => positiveIds.has(id)); + return total + (index === -1 ? 0 : 1 / (index + 1)); + }, 0); + const zeroResultCount = observations.filter( + observation => observation.returned_ids.length === 0 + ).length; + const latencies = observations.map(observation => observation.latency_ms); + + return { + case_count: observations.length, + hit_at_k: hitAtK, + recall_at_k: recallAtK, + mrr: safeAverage(reciprocalRankTotal, observations.length), + zero_result_count: zeroResultCount, + zero_result_rate: safeAverage(zeroResultCount, observations.length), + latency_ms: { + p50: percentile(latencies, 0.5), + p95: percentile(latencies, 0.95) + }, + observations: observations.map(observation => ({ + ...observation, + latency_ms: Number(observation.latency_ms.toFixed(3)) + })) + }; +} + +function safeAverage(total: number, count: number): number { + return count === 0 ? 0 : Number((total / count).toFixed(6)); +} + +function percentile(values: readonly number[], percentileValue: number): number { + if (values.length === 0) { + return 0; + } + + const sorted = [...values].sort((left, right) => left - right); + const index = Math.max(0, Math.ceil(percentileValue * sorted.length) - 1); + return Number(sorted[index]!.toFixed(3)); +} + +function chunk(items: readonly T[], size: number): T[][] { + const chunks: T[][] = []; + + for (let index = 0; index < items.length; index += size) { + chunks.push(items.slice(index, index + size)); + } + + return chunks; +} diff --git a/functions/test/config.test.ts b/functions/test/config.test.ts index fd0a22a..aeec890 100644 --- a/functions/test/config.test.ts +++ b/functions/test/config.test.ts @@ -23,6 +23,7 @@ describe("loadConfig", () => { expect(config.mergeModel).toBe("gemini-3.5-flash"); expect(config.generationVertexLocation).toBe("global"); expect(config.embeddingDimensions).toBe(768); + expect(config.retrievalEventLoggingEnabled).toBe(false); expect(config.defaultFilterState).toBe("active"); expect(config.topK).toBe(5); expect(config.defaultClientProfile.allowedTools).toContain("search_context"); @@ -45,6 +46,23 @@ describe("loadConfig", () => { ).toThrowError(MissingConfigurationError); }); + it("loads and validates retrieval event logging", () => { + const config = loadConfig({ + [geminiApiKeyEnv]: accessCredential("gemini"), + [adminTokenEnv]: accessCredential("admin"), + RETRIEVAL_EVENT_LOGGING_ENABLED: "true" + }); + + expect(config.retrievalEventLoggingEnabled).toBe(true); + expect(() => + loadConfig({ + [geminiApiKeyEnv]: accessCredential("gemini"), + [adminTokenEnv]: accessCredential("admin"), + RETRIEVAL_EVENT_LOGGING_ENABLED: "yes" + }) + ).toThrowError(MissingConfigurationError); + }); + it("loads scoped client profiles", () => { const config = loadConfig({ [geminiApiKeyEnv]: accessCredential("gemini"), diff --git a/functions/test/mcp.integration.test.ts b/functions/test/mcp.integration.test.ts index 46ddee4..c863678 100644 --- a/functions/test/mcp.integration.test.ts +++ b/functions/test/mcp.integration.test.ts @@ -322,6 +322,7 @@ describe("MCP integration", () => { status_code: 403 } }); + expect(runtime.observer.listRetrievalEvents()).toEqual([]); }); it("consolidates wip memories via consolidate_context tool", async () => { @@ -467,6 +468,7 @@ describe("MCP integration", () => { it("supports ChatGPT web remember, search, and fetch flows", async () => { const runtime = createTestRuntime({ + retrievalEventLoggingEnabled: true, clientProfiles: [ { id: "chatgpt-web", @@ -627,6 +629,34 @@ describe("MCP integration", () => { } } ]); + expect(runtime.observer.listRetrievalEvents()).toMatchObject([ + { + event_type: "search", + client_id: "chatgpt-web", + status: "success", + memory_collection: "memory_vectors", + query: "shared networking for android and ios", + filter_topic: "kmp-networking", + filter_state: "active", + limit: 5, + result_count: 1, + results: [ + { + id: "memory-1", + rank: 1, + score: expect.any(Number) + } + ] + }, + { + event_type: "fetch", + client_id: "chatgpt-web", + status: "success", + memory_collection: "memory_vectors", + memory_id: "memory-1", + found: true + } + ]); }); }); diff --git a/functions/test/retrievalEvaluation.test.ts b/functions/test/retrievalEvaluation.test.ts new file mode 100644 index 0000000..38507b9 --- /dev/null +++ b/functions/test/retrievalEvaluation.test.ts @@ -0,0 +1,196 @@ +import { describe, expect, it } from "vitest"; + +import type { RetrievalEvent } from "../src/observability.js"; +import { + buildSyntheticEvalCases, + computeRetrievalEvalMetrics, + correlateImplicitFetches, + replaceRetrievalEvalCases, + type RetrievalEvalCase, + type RetrievalEvalCaseStore, + type RetrievalEvalSource, + type RetrievalEvalTargetMode +} from "../src/retrievalEvaluation.js"; + +describe("retrieval evaluation", () => { + it("builds synthetic cases with resolved positive ids", () => { + const cases = buildSyntheticEvalCases({ + definitions: [ + { + case_id: "networking", + query: "shared iOS networking", + filter_topic: "networking", + filter_state: "active", + limit: 5, + positive_keys: ["ktor"] + } + ], + memoryIdsByKey: new Map([["ktor", "memory-42"]]), + memoryCollection: "memory_vectors_eval", + timestamp: 100 + }); + + expect(cases).toEqual([ + expect.objectContaining({ + case_id: "networking", + target_mode: "isolated", + source: "synthetic_flow", + positive_ids: ["memory-42"], + label_source: "implicit_fetch" + }) + ]); + }); + + it("correlates successful fetches to the most recent search containing the id", () => { + const events: RetrievalEvent[] = [ + searchEvent({ eventId: "search-1", timestamp: 100, resultIds: ["a", "b"] }), + searchEvent({ eventId: "search-2", timestamp: 200, resultIds: ["b"] }), + fetchEvent({ eventId: "fetch-1", timestamp: 250, memoryId: "b" }), + fetchEvent({ eventId: "fetch-2", timestamp: 260, memoryId: "missing" }) + ]; + + expect(correlateImplicitFetches(events)).toEqual([ + expect.objectContaining({ + case_id: "observed-search-2", + query: "query search-2", + positive_ids: ["b"], + target_mode: "production", + source: "observed_events" + }) + ]); + }); + + it("computes ranking, empty-result, and latency metrics", () => { + const metrics = computeRetrievalEvalMetrics([ + { + case_id: "one", + positive_ids: ["a"], + returned_ids: ["x", "a"], + latency_ms: 10 + }, + { + case_id: "two", + positive_ids: ["b", "c"], + returned_ids: ["c"], + latency_ms: 20 + }, + { + case_id: "three", + positive_ids: ["d"], + returned_ids: [], + latency_ms: 100 + } + ]); + + expect(metrics).toMatchObject({ + case_count: 3, + hit_at_k: { "1": 0.333333, "5": 0.666667 }, + recall_at_k: { "1": 0.166667, "5": 0.5 }, + mrr: 0.5, + zero_result_count: 1, + zero_result_rate: 0.333333, + latency_ms: { p50: 20, p95: 100 } + }); + }); + + it("replaces a source partition instead of retaining obsolete cases", async () => { + const obsolete = evalCase("obsolete"); + const retainedOtherSource = { + ...evalCase("observed"), + source: "observed_events" as const + }; + const store = new InMemoryEvalCaseStore([obsolete, retainedOtherSource]); + const replacement = evalCase("replacement"); + + await replaceRetrievalEvalCases( + store, + "isolated", + "synthetic_flow", + [replacement] + ); + + expect(await store.listCases("isolated")).toEqual([ + retainedOtherSource, + replacement + ]); + }); +}); + +class InMemoryEvalCaseStore implements RetrievalEvalCaseStore { + constructor(private cases: RetrievalEvalCase[]) {} + + async deleteCases( + targetMode: RetrievalEvalTargetMode, + source: RetrievalEvalSource + ): Promise { + this.cases = this.cases.filter( + evalCase => evalCase.target_mode !== targetMode || evalCase.source !== source + ); + } + + async writeCases(cases: readonly RetrievalEvalCase[]): Promise { + this.cases.push(...cases); + } + + async listCases(targetMode: RetrievalEvalTargetMode): Promise { + return this.cases.filter(evalCase => evalCase.target_mode === targetMode); + } +} + +function evalCase(caseId: string): RetrievalEvalCase { + return { + schema_version: 1, + case_id: caseId, + target_mode: "isolated", + source: "synthetic_flow", + query: caseId, + filters: { filter_topic: null, filter_state: "active" }, + limit: 5, + memory_collection: "memory_vectors_eval", + positive_ids: [caseId], + label_source: "implicit_fetch", + created_at: 100, + updated_at: 100 + }; +} + +function searchEvent(input: { + eventId: string; + timestamp: number; + resultIds: string[]; +}): RetrievalEvent { + return { + event_id: input.eventId, + tool_event_id: `tool-${input.eventId}`, + event_type: "search", + client_id: "client", + status: "success", + timestamp: input.timestamp, + expires_at: new Date(), + memory_collection: "memory_vectors", + query: `query ${input.eventId}`, + filter_state: "active", + limit: 5, + result_count: input.resultIds.length, + results: input.resultIds.map((id, index) => ({ id, rank: index + 1 })) + }; +} + +function fetchEvent(input: { + eventId: string; + timestamp: number; + memoryId: string; +}): RetrievalEvent { + return { + event_id: input.eventId, + tool_event_id: `tool-${input.eventId}`, + event_type: "fetch", + client_id: "client", + status: "success", + timestamp: input.timestamp, + expires_at: new Date(), + memory_collection: "memory_vectors", + memory_id: input.memoryId, + found: true + }; +} diff --git a/functions/test/support/fakes.ts b/functions/test/support/fakes.ts index c7f86da..98878c8 100644 --- a/functions/test/support/fakes.ts +++ b/functions/test/support/fakes.ts @@ -11,6 +11,7 @@ import type { ObservabilityEvent, RecordToolCallEventInput, RecordRequestEventInput, + RetrievalEvent, ToolCallObserver } from "../../src/observability.js"; import type { @@ -218,12 +219,15 @@ export class InMemoryMemoryRepository implements MemoryRepository { export class InMemoryToolCallObserver implements ToolCallObserver { private nextId = 1; private readonly events: ObservabilityEvent[] = []; + private readonly retrievalEvents: RetrievalEvent[] = []; async record(input: RecordToolCallEventInput): Promise { const timestamp = input.timestamp ?? Date.now(); + const eventId = `event-${this.nextId++}`; + this.events.push({ - event_id: `event-${this.nextId++}`, + event_id: eventId, event_type: "tool_call", timestamp, expires_at: new Date(timestamp + 90 * 24 * 60 * 60 * 1000), @@ -237,6 +241,22 @@ export class InMemoryToolCallObserver implements ToolCallObserver { ...(input.response ? { response: input.response } : {}), ...(input.error ? { error: input.error } : {}) }); + + if (input.retrieval) { + this.retrievalEvents.push({ + event_id: `retrieval-event-${this.nextId++}`, + tool_event_id: eventId, + timestamp, + expires_at: new Date(timestamp + 90 * 24 * 60 * 60 * 1000), + client_id: input.client_id, + status: input.status, + ...(typeof input.latency_ms === "number" + ? { latency_ms: input.latency_ms } + : {}), + ...input.retrieval, + ...(input.error ? { error: input.error } : {}) + }); + } } async recordRequest(input: RecordRequestEventInput): Promise { @@ -262,6 +282,10 @@ export class InMemoryToolCallObserver implements ToolCallObserver { listEvents(): ObservabilityEvent[] { return [...this.events]; } + + listRetrievalEvents(): RetrievalEvent[] { + return [...this.retrievalEvents]; + } } export function createTestConfig(overrides: Partial = {}): AppConfig { @@ -285,6 +309,7 @@ export function createTestConfig(overrides: Partial = {}): AppConfig generationVertexLocation: "global", embeddingDimensions: 768, memoryCollection: "memory_vectors", + retrievalEventLoggingEnabled: false, topK: 5, defaultFilterState: "active", defaultClientProfile: overrides.defaultClientProfile ?? defaultClientProfile, diff --git a/scripts/deploy-firestore-ttl.sh b/scripts/deploy-firestore-ttl.sh index 634eacf..08b1b65 100755 --- a/scripts/deploy-firestore-ttl.sh +++ b/scripts/deploy-firestore-ttl.sh @@ -30,6 +30,7 @@ done collection_groups=( "${MEMORY_COLLECTION}_write_fingerprints" "memory_events" + "retrieval_query_events" ) for collection_group in "${collection_groups[@]}"; do