From e57c64fbf676f2af6c20b823d893f3f22a8e2af5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:57:17 +0000 Subject: [PATCH 1/4] Initial plan From 2f08b4305e0781b5d3c4c11b9d957233c86c4d2a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Jun 2026 16:04:37 +0000 Subject: [PATCH 2/4] Add controller error-path tests and README docs for OpenAI resilience config --- README.md | 36 +++++++++++++++++++ packages/backend/src/ai/ai.controller.spec.ts | 28 +++++++++++++-- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8efe855b..5ec48523 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,16 @@ NODE_ENV=development # External APIs (optional, for AI features) OPENAI_API_KEY=sk-your-openai-key GOOGLE_TRANSLATE_API_KEY=your-google-translate-key + +# OpenAI error handling / resilience (all optional – defaults shown) +OPENAI_TIMEOUT_MS=10000 # Max ms to wait for a single OpenAI request before aborting +OPENAI_RETRIES=3 # Max retry attempts on transient errors (429, 5xx, timeouts) +OPENAI_BACKOFF_BASE_MS=500 # Base interval (ms) for exponential backoff with full jitter +CIRCUIT_BREAKER_FAILURES=5 # Consecutive failures before the circuit breaker opens +CIRCUIT_BREAKER_WINDOW_MS=60000 # Duration (ms) the circuit stays open before allowing a probe +CIRCUIT_BREAKER_PROBE_MS=30000 # Minimum interval (ms) between probe attempts while circuit is open +OPENAI_CONCURRENCY=10 # Maximum concurrent outbound OpenAI calls +FEATURE_FLAG_RESILIENT_OPENAI=true # Set to "false" to bypass all resilience logic (useful for debugging) ``` #### Frontend (`packages/frontend/.env`) @@ -258,6 +268,32 @@ docker logs | jq . - **Sentiment Analysis** - Analyzes message tone - **AI Summaries** - Generates summaries of message threads +#### OpenAI Error Handling + +All OpenAI API calls are wrapped in a `ResilientOpenAIClient` that prevents transient failures from crashing the application: + +| Behavior | Description | +| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Retry with backoff** | Transient errors (HTTP 429, 5xx, network timeouts) are retried with exponential backoff and full jitter. Non-retriable errors (4xx other than 429) are not retried. | +| **Circuit breaker** | After `CIRCUIT_BREAKER_FAILURES` consecutive failures the circuit opens, short-circuiting further calls until the window elapses and a probe succeeds. | +| **Timeout** | Each individual request is aborted after `OPENAI_TIMEOUT_MS` milliseconds. | +| **Concurrency limiter** | At most `OPENAI_CONCURRENCY` calls are in-flight at once; excess requests are rejected immediately. | +| **Prometheus metrics** | Counters and histograms for requests, retries, failures, circuit opens, and latency are exposed for observability. | +| **Slack alert on 429** | When the rate-limit error reaches the application layer, an alert is posted to `#muzzlefeedback`. | + +Configure the resilience settings with these environment variables (all optional): + +| Variable | Default | Description | +| ------------------------------- | ------- | -------------------------------------------------------------------- | +| `OPENAI_TIMEOUT_MS` | `10000` | Max ms to wait for a single OpenAI request | +| `OPENAI_RETRIES` | `3` | Max retry attempts on transient errors | +| `OPENAI_BACKOFF_BASE_MS` | `500` | Base interval (ms) for exponential backoff with full jitter | +| `CIRCUIT_BREAKER_FAILURES` | `5` | Consecutive failures before the circuit breaker opens | +| `CIRCUIT_BREAKER_WINDOW_MS` | `60000` | Duration (ms) the circuit stays open before allowing a probe | +| `CIRCUIT_BREAKER_PROBE_MS` | `30000` | Minimum interval (ms) between probes while the circuit is open | +| `OPENAI_CONCURRENCY` | `10` | Maximum concurrent outbound OpenAI calls | +| `FEATURE_FLAG_RESILIENT_OPENAI` | `true` | Set to `false` to bypass all resilience logic (useful for debugging) | + ### Scheduled Jobs Most scheduled jobs run inside the backend Node.js process using `node-cron`. They are started automatically when the server connects to the database. diff --git a/packages/backend/src/ai/ai.controller.spec.ts b/packages/backend/src/ai/ai.controller.spec.ts index 4a55f071..726477f1 100644 --- a/packages/backend/src/ai/ai.controller.spec.ts +++ b/packages/backend/src/ai/ai.controller.spec.ts @@ -75,7 +75,7 @@ describe('aiController', () => { expect(promptWithHistory).toHaveBeenCalledWith(body); }); - it('sends ephemeral on service errors', async () => { + it('sends ephemeral on /text service errors without crashing', async () => { generateText.mockRejectedValueOnce(new Error('boom')); await request(app) @@ -84,7 +84,31 @@ describe('aiController', () => { .expect(200); await Promise.resolve(); - expect(sendEphemeral).toHaveBeenCalled(); + expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('hello'), 'U1'); + }); + + it('sends ephemeral on /image service errors without crashing', async () => { + generateImage.mockRejectedValueOnce(new Error('image boom')); + + await request(app) + .post('/image') + .send({ user_id: 'U1', team_id: 'T1', channel_id: 'C1', text: 'draw a cat' }) + .expect(200); + + await Promise.resolve(); + expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('draw a cat'), 'U1'); + }); + + it('sends ephemeral on /prompt-with-history service errors without crashing', async () => { + promptWithHistory.mockRejectedValueOnce(new Error('history boom')); + + await request(app) + .post('/prompt-with-history') + .send({ user_id: 'U1', team_id: 'T1', channel_id: 'C1', text: 'summarize' }) + .expect(200); + + await Promise.resolve(); + expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('summarize'), 'U1'); }); describe('/set-prompt', () => { From d096c8511d081a13323e6f371333c3ae194b901e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Jun 2026 16:13:09 +0000 Subject: [PATCH 3/4] Report all AI errors to #muzzlefeedback channel in controller error handlers --- packages/backend/src/ai/ai.controller.spec.ts | 30 +++++++++++++------ packages/backend/src/ai/ai.controller.ts | 12 ++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/packages/backend/src/ai/ai.controller.spec.ts b/packages/backend/src/ai/ai.controller.spec.ts index 726477f1..23d553b0 100644 --- a/packages/backend/src/ai/ai.controller.spec.ts +++ b/packages/backend/src/ai/ai.controller.spec.ts @@ -2,15 +2,23 @@ import { vi } from 'vitest'; import express from 'express'; import request from 'supertest'; -const { generateText, generateImage, promptWithHistory, sendEphemeral, setCustomPrompt, clearCustomPrompt } = - vi.hoisted(() => ({ - generateText: vi.fn().mockResolvedValue(undefined), - generateImage: vi.fn().mockResolvedValue(undefined), - promptWithHistory: vi.fn().mockResolvedValue(undefined), - sendEphemeral: vi.fn().mockResolvedValue({ ok: true }), - setCustomPrompt: vi.fn().mockResolvedValue(true), - clearCustomPrompt: vi.fn().mockResolvedValue(true), - })); +const { + generateText, + generateImage, + promptWithHistory, + sendEphemeral, + sendMessage, + setCustomPrompt, + clearCustomPrompt, +} = vi.hoisted(() => ({ + generateText: vi.fn().mockResolvedValue(undefined), + generateImage: vi.fn().mockResolvedValue(undefined), + promptWithHistory: vi.fn().mockResolvedValue(undefined), + sendEphemeral: vi.fn().mockResolvedValue({ ok: true }), + sendMessage: vi.fn().mockResolvedValue({ ok: true }), + setCustomPrompt: vi.fn().mockResolvedValue(true), + clearCustomPrompt: vi.fn().mockResolvedValue(true), +})); vi.mock('./ai.service', async () => ({ AIService: classMock(() => ({ @@ -25,6 +33,7 @@ vi.mock('./ai.service', async () => ({ vi.mock('../shared/services/web/web.service', async () => ({ WebService: classMock(() => ({ sendEphemeral, + sendMessage, })), })); @@ -85,6 +94,7 @@ describe('aiController', () => { await Promise.resolve(); expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('hello'), 'U1'); + expect(sendMessage).toHaveBeenCalledWith('#muzzlefeedback', expect.stringContaining('boom')); }); it('sends ephemeral on /image service errors without crashing', async () => { @@ -97,6 +107,7 @@ describe('aiController', () => { await Promise.resolve(); expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('draw a cat'), 'U1'); + expect(sendMessage).toHaveBeenCalledWith('#muzzlefeedback', expect.stringContaining('image boom')); }); it('sends ephemeral on /prompt-with-history service errors without crashing', async () => { @@ -109,6 +120,7 @@ describe('aiController', () => { await Promise.resolve(); expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('summarize'), 'U1'); + expect(sendMessage).toHaveBeenCalledWith('#muzzlefeedback', expect.stringContaining('history boom')); }); describe('/set-prompt', () => { diff --git a/packages/backend/src/ai/ai.controller.ts b/packages/backend/src/ai/ai.controller.ts index f13469bf..3015b3b8 100644 --- a/packages/backend/src/ai/ai.controller.ts +++ b/packages/backend/src/ai/ai.controller.ts @@ -55,6 +55,10 @@ aiController.post('/text', (req, res) => { }); const errorMessage = `\`Sorry! Your request for ${text} failed. Please try again.\``; void webService.sendEphemeral(channel_id, errorMessage, user_id); + void webService.sendMessage( + '#muzzlefeedback', + `AI /text error for <@${user_id}>: ${e instanceof Error ? e.message : String(e)}`, + ); return undefined; }); }); @@ -71,6 +75,10 @@ aiController.post('/image', (req, res) => { }); const errorMessage = `\`Sorry! Your request for ${text} failed. Please try again.\``; void webService.sendEphemeral(channel_id, errorMessage, user_id); + void webService.sendMessage( + '#muzzlefeedback', + `AI /image error for <@${user_id}>: ${e instanceof Error ? e.message : String(e)}`, + ); return undefined; }); }); @@ -87,6 +95,10 @@ aiController.post('/prompt-with-history', (req, res) => { }); const errorMessage = `\`Sorry! Your request for ${request.text} failed. Please try again.\``; void webService.sendEphemeral(request.channel_id, errorMessage, request.user_id); + void webService.sendMessage( + '#muzzlefeedback', + `AI /prompt-with-history error for <@${request.user_id}>: ${e instanceof Error ? e.message : String(e)}`, + ); return undefined; }); }); From 419356718a154f26ca5cc2e28bd8f8d75350a868 Mon Sep 17 00:00:00 2001 From: sfreeman422 Date: Tue, 23 Jun 2026 12:15:27 -0400 Subject: [PATCH 4/4] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5ec48523..70c0dc34 100644 --- a/README.md +++ b/README.md @@ -278,7 +278,7 @@ All OpenAI API calls are wrapped in a `ResilientOpenAIClient` that prevents tran | **Circuit breaker** | After `CIRCUIT_BREAKER_FAILURES` consecutive failures the circuit opens, short-circuiting further calls until the window elapses and a probe succeeds. | | **Timeout** | Each individual request is aborted after `OPENAI_TIMEOUT_MS` milliseconds. | | **Concurrency limiter** | At most `OPENAI_CONCURRENCY` calls are in-flight at once; excess requests are rejected immediately. | -| **Prometheus metrics** | Counters and histograms for requests, retries, failures, circuit opens, and latency are exposed for observability. | +| **Prometheus metrics** | Counters and histograms for requests, retries, failures, circuit opens, and latency are registered via `prom-client` (expose the registry on a `/metrics` endpoint if desired). | | **Slack alert on 429** | When the rate-limit error reaches the application layer, an alert is posted to `#muzzlefeedback`. | Configure the resilience settings with these environment variables (all optional):