dev-chat · Copilot · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/README.md b/README.md
@@ -141,6 +141,16 @@ NODE_ENV=development
 # External APIs (optional, for AI features)
 OPENAI_API_KEY=sk-your-openai-key
 GOOGLE_TRANSLATE_API_KEY=your-google-translate-key
+
+# OpenAI error handling / resilience (all optional – defaults shown)
+OPENAI_TIMEOUT_MS=10000         # Max ms to wait for a single OpenAI request before aborting
+OPENAI_RETRIES=3                # Max retry attempts on transient errors (429, 5xx, timeouts)
+OPENAI_BACKOFF_BASE_MS=500      # Base interval (ms) for exponential backoff with full jitter
+CIRCUIT_BREAKER_FAILURES=5      # Consecutive failures before the circuit breaker opens
+CIRCUIT_BREAKER_WINDOW_MS=60000 # Duration (ms) the circuit stays open before allowing a probe
+CIRCUIT_BREAKER_PROBE_MS=30000  # Minimum interval (ms) between probe attempts while circuit is open
+OPENAI_CONCURRENCY=10           # Maximum concurrent outbound OpenAI calls
+FEATURE_FLAG_RESILIENT_OPENAI=true  # Set to "false" to bypass all resilience logic (useful for debugging)
 ```
 
 #### Frontend (`packages/frontend/.env`)
@@ -258,6 +268,32 @@ docker logs <container-id> | jq .
 - **Sentiment Analysis** - Analyzes message tone
 - **AI Summaries** - Generates summaries of message threads
 
+#### OpenAI Error Handling
+
+All OpenAI API calls are wrapped in a `ResilientOpenAIClient` that prevents transient failures from crashing the application:
+
+| Behavior                | Description                                                                                                                                                         |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Retry with backoff**  | Transient errors (HTTP 429, 5xx, network timeouts) are retried with exponential backoff and full jitter. Non-retriable errors (4xx other than 429) are not retried. |
+| **Circuit breaker**     | After `CIRCUIT_BREAKER_FAILURES` consecutive failures the circuit opens, short-circuiting further calls until the window elapses and a probe succeeds.              |
+| **Timeout**             | Each individual request is aborted after `OPENAI_TIMEOUT_MS` milliseconds.                                                                                          |
+| **Concurrency limiter** | At most `OPENAI_CONCURRENCY` calls are in-flight at once; excess requests are rejected immediately.                                                                 |
+| **Prometheus metrics**  | Counters and histograms for requests, retries, failures, circuit opens, and latency are registered via `prom-client` (expose the registry on a `/metrics` endpoint if desired). |
+| **Slack error alert**   | When an AI request error (such as a 429 rate limit) reaches the controller, an alert is posted to `#muzzlefeedback`.                                               |
+
+Configure the resilience settings with these environment variables (all optional):
+
+| Variable                        | Default | Description                                                          |
+| ------------------------------- | ------- | -------------------------------------------------------------------- |
+| `OPENAI_TIMEOUT_MS`             | `10000` | Max ms to wait for a single OpenAI request                           |
+| `OPENAI_RETRIES`                | `3`     | Max retry attempts on transient errors                               |
+| `OPENAI_BACKOFF_BASE_MS`        | `500`   | Base interval (ms) for exponential backoff with full jitter          |
+| `CIRCUIT_BREAKER_FAILURES`      | `5`     | Consecutive failures before the circuit breaker opens                |
+| `CIRCUIT_BREAKER_WINDOW_MS`     | `60000` | Duration (ms) the circuit stays open before allowing a probe         |
+| `CIRCUIT_BREAKER_PROBE_MS`      | `30000` | Minimum interval (ms) between probes while the circuit is open       |
+| `OPENAI_CONCURRENCY`            | `10`    | Maximum concurrent outbound OpenAI calls                             |
+| `FEATURE_FLAG_RESILIENT_OPENAI` | `true`  | Set to `false` to bypass all resilience logic (useful for debugging) |
+
 ### Scheduled Jobs
 
 Most scheduled jobs run inside the backend Node.js process using `node-cron`. They are started automatically when the server connects to the database.

diff --git a/packages/backend/src/ai/ai.controller.spec.ts b/packages/backend/src/ai/ai.controller.spec.ts
@@ -2,15 +2,23 @@ import { vi } from 'vitest';
 import express from 'express';
 import request from 'supertest';
 
-const { generateText, generateImage, promptWithHistory, sendEphemeral, setCustomPrompt, clearCustomPrompt } =
-  vi.hoisted(() => ({
-    generateText: vi.fn().mockResolvedValue(undefined),
-    generateImage: vi.fn().mockResolvedValue(undefined),
-    promptWithHistory: vi.fn().mockResolvedValue(undefined),
-    sendEphemeral: vi.fn().mockResolvedValue({ ok: true }),
-    setCustomPrompt: vi.fn().mockResolvedValue(true),
-    clearCustomPrompt: vi.fn().mockResolvedValue(true),
-  }));
+const {
+  generateText,
+  generateImage,
+  promptWithHistory,
+  sendEphemeral,
+  sendMessage,
+  setCustomPrompt,
+  clearCustomPrompt,
+} = vi.hoisted(() => ({
+  generateText: vi.fn().mockResolvedValue(undefined),
+  generateImage: vi.fn().mockResolvedValue(undefined),
+  promptWithHistory: vi.fn().mockResolvedValue(undefined),
+  sendEphemeral: vi.fn().mockResolvedValue({ ok: true }),
+  sendMessage: vi.fn().mockResolvedValue({ ok: true }),
+  setCustomPrompt: vi.fn().mockResolvedValue(true),
+  clearCustomPrompt: vi.fn().mockResolvedValue(true),
+}));
 
 vi.mock('./ai.service', async () => ({
   AIService: classMock(() => ({
@@ -25,6 +33,7 @@ vi.mock('./ai.service', async () => ({
 vi.mock('../shared/services/web/web.service', async () => ({
   WebService: classMock(() => ({
     sendEphemeral,
+    sendMessage,
   })),
 }));
 
@@ -75,7 +84,7 @@ describe('aiController', () => {
     expect(promptWithHistory).toHaveBeenCalledWith(body);
   });
 
-  it('sends ephemeral on service errors', async () => {
+  it('sends ephemeral on /text service errors without crashing', async () => {
     generateText.mockRejectedValueOnce(new Error('boom'));
 
     await request(app)
@@ -84,7 +93,34 @@ describe('aiController', () => {
       .expect(200);
 
     await Promise.resolve();
-    expect(sendEphemeral).toHaveBeenCalled();
+    expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('hello'), 'U1');
+    expect(sendMessage).toHaveBeenCalledWith('#muzzlefeedback', expect.stringContaining('boom'));
+  });
+
+  it('sends ephemeral on /image service errors without crashing', async () => {
+    generateImage.mockRejectedValueOnce(new Error('image boom'));
+
+    await request(app)
+      .post('/image')
+      .send({ user_id: 'U1', team_id: 'T1', channel_id: 'C1', text: 'draw a cat' })
+      .expect(200);
+
+    await Promise.resolve();
+    expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('draw a cat'), 'U1');
+    expect(sendMessage).toHaveBeenCalledWith('#muzzlefeedback', expect.stringContaining('image boom'));
+  });
+
+  it('sends ephemeral on /prompt-with-history service errors without crashing', async () => {
+    promptWithHistory.mockRejectedValueOnce(new Error('history boom'));
+
+    await request(app)
+      .post('/prompt-with-history')
+      .send({ user_id: 'U1', team_id: 'T1', channel_id: 'C1', text: 'summarize' })
+      .expect(200);
+
+    await Promise.resolve();
+    expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('summarize'), 'U1');
+    expect(sendMessage).toHaveBeenCalledWith('#muzzlefeedback', expect.stringContaining('history boom'));
   });
 
   describe('/set-prompt', () => {

diff --git a/packages/backend/src/ai/ai.controller.ts b/packages/backend/src/ai/ai.controller.ts
@@ -55,6 +55,10 @@ aiController.post('/text', (req, res) => {
     });
     const errorMessage = `\`Sorry! Your request for ${text} failed. Please try again.\``;
     void webService.sendEphemeral(channel_id, errorMessage, user_id);
+    void webService.sendMessage(
+      '#muzzlefeedback',
+      `AI /text error for <@${user_id}>: ${e instanceof Error ? e.message : String(e)}`,
+    );
     return undefined;
   });
 });
@@ -71,6 +75,10 @@ aiController.post('/image', (req, res) => {
     });
     const errorMessage = `\`Sorry! Your request for ${text} failed. Please try again.\``;
     void webService.sendEphemeral(channel_id, errorMessage, user_id);
+    void webService.sendMessage(
+      '#muzzlefeedback',
+      `AI /image error for <@${user_id}>: ${e instanceof Error ? e.message : String(e)}`,
+    );
     return undefined;
   });
 });
@@ -87,6 +95,10 @@ aiController.post('/prompt-with-history', (req, res) => {
     });
     const errorMessage = `\`Sorry! Your request for ${request.text} failed. Please try again.\``;
     void webService.sendEphemeral(request.channel_id, errorMessage, request.user_id);
+    void webService.sendMessage(
+      '#muzzlefeedback',
+      `AI /prompt-with-history error for <@${request.user_id}>: ${e instanceof Error ? e.message : String(e)}`,
+    );
     return undefined;
   });
 });