Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,16 @@ NODE_ENV=development
# External APIs (optional, for AI features)
OPENAI_API_KEY=sk-your-openai-key
GOOGLE_TRANSLATE_API_KEY=your-google-translate-key

# OpenAI error handling / resilience (all optional – defaults shown)
OPENAI_TIMEOUT_MS=10000 # Max ms to wait for a single OpenAI request before aborting
OPENAI_RETRIES=3 # Max retry attempts on transient errors (429, 5xx, timeouts)
OPENAI_BACKOFF_BASE_MS=500 # Base interval (ms) for exponential backoff with full jitter
CIRCUIT_BREAKER_FAILURES=5 # Consecutive failures before the circuit breaker opens
CIRCUIT_BREAKER_WINDOW_MS=60000 # Duration (ms) the circuit stays open before allowing a probe
CIRCUIT_BREAKER_PROBE_MS=30000 # Minimum interval (ms) between probe attempts while circuit is open
OPENAI_CONCURRENCY=10 # Maximum concurrent outbound OpenAI calls
FEATURE_FLAG_RESILIENT_OPENAI=true # Set to "false" to bypass all resilience logic (useful for debugging)
```

#### Frontend (`packages/frontend/.env`)
Expand Down Expand Up @@ -258,6 +268,32 @@ docker logs <container-id> | jq .
- **Sentiment Analysis** - Analyzes message tone
- **AI Summaries** - Generates summaries of message threads

#### OpenAI Error Handling

All OpenAI API calls are wrapped in a `ResilientOpenAIClient` that prevents transient failures from crashing the application:

| Behavior | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Retry with backoff** | Transient errors (HTTP 429, 5xx, network timeouts) are retried with exponential backoff and full jitter. Non-retriable errors (4xx other than 429) are not retried. |
| **Circuit breaker** | After `CIRCUIT_BREAKER_FAILURES` consecutive failures the circuit opens, short-circuiting further calls until the window elapses and a probe succeeds. |
| **Timeout** | Each individual request is aborted after `OPENAI_TIMEOUT_MS` milliseconds. |
| **Concurrency limiter** | At most `OPENAI_CONCURRENCY` calls are in-flight at once; excess requests are rejected immediately. |
| **Prometheus metrics** | Counters and histograms for requests, retries, failures, circuit opens, and latency are registered via `prom-client` (expose the registry on a `/metrics` endpoint if desired). |
| **Slack alert on errors** | When an AI request error (including a rate-limit 429) reaches the controller layer, an alert containing the error message is posted to `#muzzlefeedback`. |

Configure the resilience settings with these environment variables (all optional):

| Variable | Default | Description |
| ------------------------------- | ------- | -------------------------------------------------------------------- |
| `OPENAI_TIMEOUT_MS` | `10000` | Max ms to wait for a single OpenAI request |
| `OPENAI_RETRIES` | `3` | Max retry attempts on transient errors |
| `OPENAI_BACKOFF_BASE_MS` | `500` | Base interval (ms) for exponential backoff with full jitter |
| `CIRCUIT_BREAKER_FAILURES` | `5` | Consecutive failures before the circuit breaker opens |
| `CIRCUIT_BREAKER_WINDOW_MS` | `60000` | Duration (ms) the circuit stays open before allowing a probe |
| `CIRCUIT_BREAKER_PROBE_MS` | `30000` | Minimum interval (ms) between probes while the circuit is open |
| `OPENAI_CONCURRENCY` | `10` | Maximum concurrent outbound OpenAI calls |
| `FEATURE_FLAG_RESILIENT_OPENAI` | `true` | Set to `false` to bypass all resilience logic (useful for debugging) |

### Scheduled Jobs

Most scheduled jobs run inside the backend Node.js process using `node-cron`. They are started automatically when the server connects to the database.
Expand Down
58 changes: 47 additions & 11 deletions packages/backend/src/ai/ai.controller.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,23 @@ import { vi } from 'vitest';
import express from 'express';
import request from 'supertest';

const { generateText, generateImage, promptWithHistory, sendEphemeral, setCustomPrompt, clearCustomPrompt } =
vi.hoisted(() => ({
generateText: vi.fn().mockResolvedValue(undefined),
generateImage: vi.fn().mockResolvedValue(undefined),
promptWithHistory: vi.fn().mockResolvedValue(undefined),
sendEphemeral: vi.fn().mockResolvedValue({ ok: true }),
setCustomPrompt: vi.fn().mockResolvedValue(true),
clearCustomPrompt: vi.fn().mockResolvedValue(true),
}));
// Mock functions shared between the vi.mock factories in this spec and the test assertions.
// They are created through vi.hoisted so that the vi.mock factories (which Vitest hoists
// above ordinary declarations) can reference them safely.
const {
generateText,
generateImage,
promptWithHistory,
sendEphemeral,
sendMessage,
setCustomPrompt,
clearCustomPrompt,
} = vi.hoisted(() => ({
// Resolve with no value by default; the error-path tests override these
// with mockRejectedValueOnce to simulate a failing service call.
generateText: vi.fn().mockResolvedValue(undefined),
generateImage: vi.fn().mockResolvedValue(undefined),
promptWithHistory: vi.fn().mockResolvedValue(undefined),
// WebService stubs: both resolve to { ok: true } so the controller's
// fire-and-forget notifications never reject during tests.
sendEphemeral: vi.fn().mockResolvedValue({ ok: true }),
sendMessage: vi.fn().mockResolvedValue({ ok: true }),
// Custom-prompt stubs resolve to true by default.
setCustomPrompt: vi.fn().mockResolvedValue(true),
clearCustomPrompt: vi.fn().mockResolvedValue(true),
}));

vi.mock('./ai.service', async () => ({
AIService: classMock(() => ({
Expand All @@ -25,6 +33,7 @@ vi.mock('./ai.service', async () => ({
// Replace the WebService module with a mock whose instances expose the hoisted
// sendEphemeral / sendMessage spies, so tests can assert on outgoing notifications.
// NOTE(review): classMock is defined outside this view; presumably it wraps the factory
// in a constructor-compatible mock class — confirm against its definition.
vi.mock('../shared/services/web/web.service', async () => ({
WebService: classMock(() => ({
sendEphemeral,
sendMessage,
})),
}));

Expand Down Expand Up @@ -75,7 +84,7 @@ describe('aiController', () => {
expect(promptWithHistory).toHaveBeenCalledWith(body);
});

it('sends ephemeral on service errors', async () => {
it('sends ephemeral on /text service errors without crashing', async () => {
generateText.mockRejectedValueOnce(new Error('boom'));

await request(app)
Expand All @@ -84,7 +93,34 @@ describe('aiController', () => {
.expect(200);

await Promise.resolve();
expect(sendEphemeral).toHaveBeenCalled();
expect(sendEphemeral).toHaveBeenCalledWith('C1', expect.stringContaining('hello'), 'U1');
expect(sendMessage).toHaveBeenCalledWith('#muzzlefeedback', expect.stringContaining('boom'));
});

it('sends ephemeral on /image service errors without crashing', async () => {
  // Make the next image generation call fail.
  const failure = new Error('image boom');
  generateImage.mockRejectedValueOnce(failure);

  // The endpoint must still answer 200 even though the service call rejects.
  const payload = { user_id: 'U1', team_id: 'T1', channel_id: 'C1', text: 'draw a cat' };
  await request(app).post('/image').send(payload).expect(200);

  // Yield once so the controller's asynchronous error handling can run before asserting.
  await Promise.resolve();

  // The requesting user is told privately which request failed...
  const ephemeralText = expect.stringContaining('draw a cat');
  expect(sendEphemeral).toHaveBeenCalledWith('C1', ephemeralText, 'U1');

  // ...and the underlying error message is reported to the feedback channel.
  const alertText = expect.stringContaining('image boom');
  expect(sendMessage).toHaveBeenCalledWith('#muzzlefeedback', alertText);
});

it('sends ephemeral on /prompt-with-history service errors without crashing', async () => {
  // Make the next prompt-with-history call fail.
  const failure = new Error('history boom');
  promptWithHistory.mockRejectedValueOnce(failure);

  // The endpoint must still answer 200 even though the service call rejects.
  const payload = { user_id: 'U1', team_id: 'T1', channel_id: 'C1', text: 'summarize' };
  await request(app).post('/prompt-with-history').send(payload).expect(200);

  // Yield once so the controller's asynchronous error handling can run before asserting.
  await Promise.resolve();

  // The requesting user is told privately which request failed...
  const ephemeralText = expect.stringContaining('summarize');
  expect(sendEphemeral).toHaveBeenCalledWith('C1', ephemeralText, 'U1');

  // ...and the underlying error message is reported to the feedback channel.
  const alertText = expect.stringContaining('history boom');
  expect(sendMessage).toHaveBeenCalledWith('#muzzlefeedback', alertText);
});

describe('/set-prompt', () => {
Expand Down
12 changes: 12 additions & 0 deletions packages/backend/src/ai/ai.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ aiController.post('/text', (req, res) => {
});
const errorMessage = `\`Sorry! Your request for ${text} failed. Please try again.\``;
void webService.sendEphemeral(channel_id, errorMessage, user_id);
void webService.sendMessage(
'#muzzlefeedback',
`AI /text error for <@${user_id}>: ${e instanceof Error ? e.message : String(e)}`,
);
Comment on lines +58 to +61
return undefined;
});
});
Expand All @@ -71,6 +75,10 @@ aiController.post('/image', (req, res) => {
});
const errorMessage = `\`Sorry! Your request for ${text} failed. Please try again.\``;
void webService.sendEphemeral(channel_id, errorMessage, user_id);
void webService.sendMessage(
'#muzzlefeedback',
`AI /image error for <@${user_id}>: ${e instanceof Error ? e.message : String(e)}`,
);
Comment on lines +78 to +81
return undefined;
});
});
Expand All @@ -87,6 +95,10 @@ aiController.post('/prompt-with-history', (req, res) => {
});
const errorMessage = `\`Sorry! Your request for ${request.text} failed. Please try again.\``;
void webService.sendEphemeral(request.channel_id, errorMessage, request.user_id);
void webService.sendMessage(
'#muzzlefeedback',
`AI /prompt-with-history error for <@${request.user_id}>: ${e instanceof Error ? e.message : String(e)}`,
);
Comment on lines +98 to +101
return undefined;
});
});
Loading