From b9275496a8fc2fbc9cf1c6c88c6cfc6c4c31e572 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 15:06:47 +0000
Subject: [PATCH 1/7] Initial plan


From da1aed93a9f4346de8fcf15af90ae86ba076682e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 15:18:39 +0000
Subject: [PATCH 2/7] feat: add resilient OpenAI client with timeout, retries,
 circuit breaker, bulkhead, and metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- New src/config/openai.ts: reads all resilience settings from env vars
- New src/lib/resilientOpenAIClient.ts: per-request AbortController timeout,
  exponential backoff + jitter retries honoring Retry-After on 429, circuit
  breaker (closed→open→half-open), concurrency bulkhead, prom-client metrics,
  structured Winston logging, FEATURE_FLAG_RESILIENT_OPENAI bypass
- Update ai.service.ts: openAi property now uses ResilientOpenAIClient wrapper
- New src/lib/resilientOpenAIClient.spec.ts: 22 tests covering all features
- New docs/resilient-openai.md: env vars, behavior, rollout plan
- Add prom-client@15.1.3 dependency
---
 docs/resilient-openai.md                      | 212 +++++++
 package-lock.json                             |  38 ++
 packages/backend/package.json                 |   1 +
 packages/backend/src/ai/ai.service.ts         |  10 +-
 packages/backend/src/config/openai.ts         |  44 ++
 .../src/lib/resilientOpenAIClient.spec.ts     | 520 ++++++++++++++++++
 .../backend/src/lib/resilientOpenAIClient.ts  | 451 +++++++++++++++
 7 files changed, 1273 insertions(+), 3 deletions(-)
 create mode 100644 docs/resilient-openai.md
 create mode 100644 packages/backend/src/config/openai.ts
 create mode 100644 packages/backend/src/lib/resilientOpenAIClient.spec.ts
 create mode 100644 packages/backend/src/lib/resilientOpenAIClient.ts

diff --git a/docs/resilient-openai.md b/docs/resilient-openai.md
new file mode 100644
index 00000000..a041c507
--- /dev/null
+++ b/docs/resilient-openai.md
@@ -0,0 +1,212 @@
+# Resilient OpenAI Client
+
+This document describes the resilient OpenAI wrapper introduced in
+`packages/backend/src/lib/resilientOpenAIClient.ts`.  The wrapper prevents
+transient or sustained OpenAI failures from cascading and taking down the
+broader Moonbeam service.
+
+---
+
+## Overview
+
+`ResilientOpenAIClient` wraps the official `openai` SDK client and adds:
+
+| Feature | Default |
+|---|---|
+| Per-request timeout | 10 s |
+| Automatic retries with exponential backoff + full jitter | 3 retries |
+| `Retry-After` header honoured on 429 responses | yes |
+| Circuit breaker | opens after 5 consecutive failures |
+| Concurrency / bulkhead limiter | 10 concurrent calls |
+| Graceful degradation via `ResilientOpenAIError` | yes |
+| Structured logging (Winston) | yes |
+| Prometheus metrics (`prom-client`) | yes |
+
+The wrapper exposes the same narrow `responses.create` surface used by
+`AIService`, so the change is a drop-in replacement.
+
+---
+
+## Feature flag
+
+Set the environment variable `FEATURE_FLAG_RESILIENT_OPENAI` to `false` to
+bypass all resilience logic and delegate directly to the underlying OpenAI SDK
+client.  This is the rollback switch.
+
+```
+FEATURE_FLAG_RESILIENT_OPENAI=false   # bypass resilience (rollback)
+FEATURE_FLAG_RESILIENT_OPENAI=true    # enable resilience (default)
+```
+
+---
+
+## Environment variables
+
+All variables are optional.  Defaults are shown.
+
+| Variable | Default | Description |
+|---|---|---|
+| `FEATURE_FLAG_RESILIENT_OPENAI` | `true` | Set to `false` to bypass the wrapper entirely. |
+| `OPENAI_TIMEOUT_MS` | `10000` | Maximum ms to wait for a single request before aborting. |
+| `OPENAI_RETRIES` | `3` | Maximum retry attempts on transient errors. |
+| `OPENAI_BACKOFF_BASE_MS` | `500` | Base interval (ms) for exponential backoff with full jitter. |
+| `CIRCUIT_BREAKER_FAILURES` | `5` | Consecutive failures needed to open the circuit. |
+| `CIRCUIT_BREAKER_WINDOW_MS` | `60000` | Duration (ms) the circuit stays open before allowing a probe. |
+| `CIRCUIT_BREAKER_PROBE_MS` | `30000` | Minimum interval (ms) between probe attempts while open. |
+| `OPENAI_CONCURRENCY` | `10` | Maximum concurrent outbound OpenAI calls per instance. |
+
+Configuration is loaded by `packages/backend/src/config/openai.ts` which reads
+these variables at instantiation time with sensible defaults.
+
+---
+
+## Retry behaviour
+
+Requests are retried when the error is classified as _retriable_:
+
+- HTTP 429 (rate limit) — also extracts `Retry-After` header and waits
+  accordingly before retrying.
+- HTTP 5xx (server errors).
+- Network / connection errors (`ECONNRESET`, `ETIMEDOUT`, socket hang-up,
+  `fetch failed`, etc.).
+- `ResilientOpenAIError` with code `TIMEOUT`.
+
+Non-retriable errors (4xx other than 429, business-logic errors) are surfaced
+immediately.
+
+Backoff is computed as:
+
+```
+sleep = random(0, backoffBaseMs * 2^attempt)   # full jitter
+```
+
+If a `Retry-After` header is present on a 429 response, that duration (in
+seconds) overrides the computed backoff.
+
+---
+
+## Circuit-breaker states
+
+```
+CLOSED ──(N consecutive failures)──► OPEN
+  ▲                                    │
+  │   (probe succeeds)       (window + probe interval elapsed)
+  └──────── HALF-OPEN ◄────────────────┘
+                │
+          (probe fails)
+                │
+              OPEN
+```
+
+- **CLOSED**: normal operation.
+- **OPEN**: calls are short-circuited immediately with
+  `ResilientOpenAIError(code: 'CIRCUIT_OPEN')`. No requests reach OpenAI.
+- **HALF-OPEN**: one probe call is allowed through. If it succeeds the circuit
+  closes; if it fails the circuit re-opens.
+
+The current state is observable via `client.getCircuitState()`.
+
+---
+
+## Concurrency limiter
+
+At most `OPENAI_CONCURRENCY` requests may be in-flight simultaneously.
+Additional requests are rejected immediately with
+`ResilientOpenAIError(code: 'CONCURRENCY_REJECTED')`.
+
+`client.getActiveRequests()` returns the current in-flight count.
+
+---
+
+## Error types
+
+```typescript
+import { ResilientOpenAIError } from '../lib/resilientOpenAIClient';
+
+try {
+  await openAi.responses.create({ ... });
+} catch (err) {
+  if (err instanceof ResilientOpenAIError) {
+    // err.code is one of:
+    //   'CIRCUIT_OPEN'        – circuit is open, request was not sent
+    //   'TIMEOUT'             – request exceeded OPENAI_TIMEOUT_MS
+    //   'CONCURRENCY_REJECTED'– too many concurrent requests
+    //   'MAX_RETRIES_EXCEEDED'– reserved for future use
+    handleDegradedMode(err.code);
+  }
+}
+```
+
+Upstream code (e.g. `AIService`) can check `err instanceof ResilientOpenAIError`
+to distinguish transient/infrastructure failures from application errors and
+return appropriate degraded UX to users.
+
+---
+
+## Metrics
+
+The client registers the following Prometheus metrics via `prom-client`.  Each
+`ResilientOpenAIClient` instance uses its own `Registry` so multiple instances
+in the same process do not collide.  Expose the registry on a `/metrics`
+endpoint if Prometheus scraping is desired.
+
+| Metric | Type | Description |
+|---|---|---|
+| `openai_requests_total` | Counter | Total requests attempted, labelled `status` (`attempted`, `success`, `circuit_open`, `concurrency_rejected`). |
+| `openai_retries_total` | Counter | Total retry attempts. |
+| `openai_failures_total` | Counter | Total failed requests after all retries, labelled `reason` (`error`, `circuit_open`, `concurrency_rejected`). |
+| `openai_circuit_open_total` | Counter | Number of times the circuit transitioned to open. |
+| `openai_latency_seconds` | Histogram | End-to-end request latency in seconds, labelled `status` (`success`, `error`). Buckets: 0.1, 0.5, 1, 2, 5, 10, 30. |
+
+---
+
+## Logging
+
+The wrapper emits structured Winston logs (child logger `ResilientOpenAIClient`)
+for:
+
+- Per-retry failures (level `info`).
+- Non-retriable errors (level `warn`).
+- Timeout aborts (level `warn`).
+- Circuit-breaker state transitions (level `info` / `warn`).
+- Concurrency limit reached (level `warn`).
+
+---
+
+## Rollout / migration plan
+
+### Phase 1 — Behind feature flag (current state)
+
+The wrapper is **enabled by default** (`FEATURE_FLAG_RESILIENT_OPENAI=true`).
+
+1. **Deploy to staging** — verify smoke tests pass; inspect logs and metrics.
+2. **Enable in a canary production host** — monitor `openai_failures_total`,
+   `openai_circuit_open_total`, and p99 latency via `openai_latency_seconds`.
+3. **Gradually roll out** across all production instances while monitoring the
+   metrics above and Sentry/Datadog error rates.
+
+### Phase 2 — Stabilisation
+
+Once the canary shows stable behaviour for 24–48 h:
+
+- Enable for all production instances.
+- Set alerting on `openai_circuit_open_total > 0` and high
+  `openai_failures_total` rates.
+
+### Rollback
+
+If issues occur at any phase, flip the feature flag:
+
+```
+FEATURE_FLAG_RESILIENT_OPENAI=false
+```
+
+Restart the service.  The wrapper delegates directly to the underlying OpenAI
+SDK; no other code changes are required.
+
+### Phase 3 — Cleanup (future)
+
+After the feature has been stable in production for a sprint:
+
+- Remove the `featureFlagResilient` branch and env-var check.
+- Remove the `FEATURE_FLAG_RESILIENT_OPENAI` documentation references.
diff --git a/package-lock.json b/package-lock.json
index 0f61511a..ddc99a2d 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1948,6 +1948,15 @@
         "node": ">= 8"
       }
     },
+    "node_modules/@opentelemetry/api": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.1.tgz",
+      "integrity": "sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==",
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=8.0.0"
+      }
+    },
     "node_modules/@paralleldrive/cuid2": {
       "version": "2.3.1",
       "resolved": "https://registry.npmjs.org/@paralleldrive/cuid2/-/cuid2-2.3.1.tgz",
@@ -3956,6 +3965,12 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/bintrees": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz",
+      "integrity": "sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==",
+      "license": "MIT"
+    },
     "node_modules/body-parser": {
       "version": "1.20.4",
       "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.4.tgz",
@@ -8082,6 +8097,19 @@
       "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==",
       "license": "MIT"
     },
+    "node_modules/prom-client": {
+      "version": "15.1.3",
+      "resolved": "https://registry.npmjs.org/prom-client/-/prom-client-15.1.3.tgz",
+      "integrity": "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api": "^1.4.0",
+        "tdigest": "^0.1.1"
+      },
+      "engines": {
+        "node": "^16 || ^18 || >=20"
+      }
+    },
     "node_modules/protobufjs": {
       "version": "7.5.4",
       "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz",
@@ -9353,6 +9381,15 @@
       "integrity": "sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==",
       "license": "MIT"
     },
+    "node_modules/tdigest": {
+      "version": "0.1.2",
+      "resolved": "https://registry.npmjs.org/tdigest/-/tdigest-0.1.2.tgz",
+      "integrity": "sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==",
+      "license": "MIT",
+      "dependencies": {
+        "bintrees": "1.0.2"
+      }
+    },
     "node_modules/text-hex": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
@@ -10734,6 +10771,7 @@
         "mysql": "^2.17.1",
         "node-cron": "^4.2.1",
         "openai": "^4.103.0",
+        "prom-client": "^15.1.3",
         "sentence-splitter": "^5.0.0",
         "sentiment": "^5.0.2",
         "sharp": "^0.34.5",
diff --git a/packages/backend/package.json b/packages/backend/package.json
index 0874a799..321c9bc3 100644
--- a/packages/backend/package.json
+++ b/packages/backend/package.json
@@ -41,6 +41,7 @@
     "mysql": "^2.17.1",
     "node-cron": "^4.2.1",
     "openai": "^4.103.0",
+    "prom-client": "^15.1.3",
     "sentence-splitter": "^5.0.0",
     "sentiment": "^5.0.2",
     "sharp": "^0.34.5",
diff --git a/packages/backend/src/ai/ai.service.ts b/packages/backend/src/ai/ai.service.ts
index 064f40af..7cfc07a7 100644
--- a/packages/backend/src/ai/ai.service.ts
+++ b/packages/backend/src/ai/ai.service.ts
@@ -32,6 +32,8 @@ import type {
   ResponseOutputText,
   ResponseOutputRefusal,
 } from 'openai/resources/responses/responses';
+import { ResilientOpenAIClient } from '../lib/resilientOpenAIClient';
+import type { OpenAIClientLike } from '../lib/resilientOpenAIClient';
 import type { Part } from '@google/genai';
 import { GoogleGenAI } from '@google/genai';
 import sharp from 'sharp';
@@ -130,9 +132,11 @@ const normalizeReleaseSha = (value?: string): string | null => {
 
 export class AIService {
   redis = new AIPersistenceService();
-  openAi = new OpenAI({
-    apiKey: process.env.OPENAI_API_KEY,
-  });
+  openAi: OpenAIClientLike = new ResilientOpenAIClient(
+    new OpenAI({
+      apiKey: process.env.OPENAI_API_KEY,
+    }),
+  );
   gemini = new GoogleGenAI({ apiKey: process.env.GOOGLE_GEMINI_API_KEY });
 
   muzzlePersistenceService = new MuzzlePersistenceService();
diff --git a/packages/backend/src/config/openai.ts b/packages/backend/src/config/openai.ts
new file mode 100644
index 00000000..8d8e8c78
--- /dev/null
+++ b/packages/backend/src/config/openai.ts
@@ -0,0 +1,44 @@
+/**
+ * Configuration for the resilient OpenAI client.
+ * All values are read from environment variables with sensible defaults.
+ */
+
+export interface OpenAIClientConfig {
+  /** Maximum ms to wait for a single OpenAI request before aborting. */
+  timeoutMs: number;
+  /** Maximum number of retry attempts on transient errors. */
+  retries: number;
+  /** Base backoff interval (ms) for exponential backoff with full jitter. */
+  backoffBaseMs: number;
+  /** Number of consecutive failures required to open the circuit breaker. */
+  circuitBreakerFailures: number;
+  /** Duration (ms) the circuit stays open before allowing a probe. */
+  circuitBreakerWindowMs: number;
+  /** Minimum interval (ms) between probe attempts while the circuit is open. */
+  circuitBreakerProbeMs: number;
+  /** Maximum number of concurrent outbound OpenAI calls. */
+  concurrency: number;
+  /**
+   * When true (default), requests are wrapped with timeouts, retries,
+   * the circuit breaker, and the concurrency limiter.
+   * Set FEATURE_FLAG_RESILIENT_OPENAI=false to bypass all resilience logic
+   * and delegate directly to the underlying OpenAI client.
+   */
+  featureFlagResilient: boolean;
+}
+
+const parseIntWithDefault = (value: string | undefined, defaultValue: number): number => {
+  const parsed = parseInt(value ?? '', 10);
+  return Number.isFinite(parsed) ? parsed : defaultValue;
+};
+
+export const getOpenAIClientConfig = (): OpenAIClientConfig => ({
+  timeoutMs: parseIntWithDefault(process.env.OPENAI_TIMEOUT_MS, 10_000),
+  retries: parseIntWithDefault(process.env.OPENAI_RETRIES, 3),
+  backoffBaseMs: parseIntWithDefault(process.env.OPENAI_BACKOFF_BASE_MS, 500),
+  circuitBreakerFailures: parseIntWithDefault(process.env.CIRCUIT_BREAKER_FAILURES, 5),
+  circuitBreakerWindowMs: parseIntWithDefault(process.env.CIRCUIT_BREAKER_WINDOW_MS, 60_000),
+  circuitBreakerProbeMs: parseIntWithDefault(process.env.CIRCUIT_BREAKER_PROBE_MS, 30_000),
+  concurrency: parseIntWithDefault(process.env.OPENAI_CONCURRENCY, 10),
+  featureFlagResilient: process.env.FEATURE_FLAG_RESILIENT_OPENAI !== 'false',
+});
diff --git a/packages/backend/src/lib/resilientOpenAIClient.spec.ts b/packages/backend/src/lib/resilientOpenAIClient.spec.ts
new file mode 100644
index 00000000..33e996e3
--- /dev/null
+++ b/packages/backend/src/lib/resilientOpenAIClient.spec.ts
@@ -0,0 +1,520 @@
+import { vi, it, describe, expect, beforeEach, afterEach } from 'vitest';
+import { ResilientOpenAIClient, ResilientOpenAIError, METRIC_NAMES } from './resilientOpenAIClient';
+import type { OpenAIClientLike } from './resilientOpenAIClient';
+import { Registry } from 'prom-client';
+import type { OpenAIClientConfig } from '../config/openai';
+
+// ---------------------------------------------------------------------------
+// Test helpers
+// ---------------------------------------------------------------------------
+
+const makeResponse = (text = 'hello') => ({
+  output: [
+    {
+      type: 'message' as const,
+      content: [{ type: 'output_text' as const, text }],
+    },
+  ],
+});
+
+const makeUnderlying = (createImpl: Mock): OpenAIClientLike => ({
+  responses: {
+    create: createImpl,
+  },
+});
+
+const noop = () => undefined;
+
+/** Build a minimal config with fast timings for tests. */
+const fastConfig = (overrides: Partial<OpenAIClientConfig> = {}): Partial<OpenAIClientConfig> => ({
+  timeoutMs: 500,
+  retries: 2,
+  backoffBaseMs: 0,
+  circuitBreakerFailures: 3,
+  circuitBreakerWindowMs: 100,
+  circuitBreakerProbeMs: 50,
+  concurrency: 10,
+  featureFlagResilient: true,
+  ...overrides,
+});
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+describe('ResilientOpenAIClient', () => {
+  let registry: Registry;
+
+  beforeEach(() => {
+    registry = new Registry();
+    vi.useFakeTimers({ shouldAdvanceTime: false });
+  });
+
+  afterEach(() => {
+    vi.useRealTimers();
+  });
+
+  // -------------------------------------------------------------------------
+  // Feature-flag bypass
+  // -------------------------------------------------------------------------
+
+  describe('feature flag', () => {
+    it('delegates directly to the underlying client when featureFlagResilient is false', async () => {
+      const createMock = vi.fn().mockResolvedValue(makeResponse());
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ featureFlagResilient: false }),
+        registry,
+      );
+
+      const params = { model: 'gpt-4o', input: 'hello' };
+      await client.responses.create(params);
+
+      expect(createMock).toHaveBeenCalledOnce();
+      expect(createMock).toHaveBeenCalledWith(params, undefined);
+    });
+
+    it('applies resilience when featureFlagResilient is true (default)', async () => {
+      const createMock = vi.fn().mockResolvedValue(makeResponse());
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ featureFlagResilient: true }),
+        registry,
+      );
+
+      await client.responses.create({ model: 'gpt-4o', input: 'hello' });
+      expect(createMock).toHaveBeenCalledOnce();
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // Retry behaviour
+  // -------------------------------------------------------------------------
+
+  describe('retry behaviour', () => {
+    it('returns successfully on first attempt without retrying', async () => {
+      const createMock = vi.fn().mockResolvedValue(makeResponse('first try'));
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
+
+      const result = await client.responses.create({ model: 'gpt-4o', input: 'hi' });
+
+      expect(createMock).toHaveBeenCalledOnce();
+      expect(result).toEqual(makeResponse('first try'));
+    });
+
+    it('retries on transient 500 error and succeeds', async () => {
+      const transientError = Object.assign(new Error('Internal Server Error'), { status: 500 });
+      const createMock = vi
+        .fn()
+        .mockRejectedValueOnce(transientError)
+        .mockResolvedValue(makeResponse('retry success'));
+
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
+
+      const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' });
+      // Advance past the (near-zero) backoff
+      await vi.runAllTimersAsync();
+      const result = await resultPromise;
+
+      expect(createMock).toHaveBeenCalledTimes(2);
+      expect(result).toEqual(makeResponse('retry success'));
+    });
+
+    it('retries on 429 rate-limit error and succeeds', async () => {
+      const rateLimitError = Object.assign(new Error('Rate limit exceeded'), { status: 429 });
+      const createMock = vi
+        .fn()
+        .mockRejectedValueOnce(rateLimitError)
+        .mockResolvedValue(makeResponse('ok after 429'));
+
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
+
+      const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' });
+      await vi.runAllTimersAsync();
+      const result = await resultPromise;
+
+      expect(createMock).toHaveBeenCalledTimes(2);
+      expect(result).toEqual(makeResponse('ok after 429'));
+    });
+
+    it('honors Retry-After header on 429 response', async () => {
+      const retryAfterSeconds = 2;
+      const rateLimitError = Object.assign(new Error('Rate limited'), {
+        status: 429,
+        headers: { 'retry-after': String(retryAfterSeconds) },
+      });
+      const createMock = vi
+        .fn()
+        .mockRejectedValueOnce(rateLimitError)
+        .mockResolvedValue(makeResponse('after retry-after'));
+
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
+
+      const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' });
+      // Advance past the Retry-After delay (2000ms)
+      await vi.advanceTimersByTimeAsync(retryAfterSeconds * 1000 + 100);
+      const result = await resultPromise;
+
+      expect(result).toEqual(makeResponse('after retry-after'));
+      expect(createMock).toHaveBeenCalledTimes(2);
+    });
+
+    it('does not retry on 400 (non-retriable) error', async () => {
+      const badRequestError = Object.assign(new Error('Bad Request'), { status: 400 });
+      const createMock = vi.fn().mockRejectedValue(badRequestError);
+
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
+
+      await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow(
+        'Bad Request',
+      );
+      // One attempt only (no retries for 4xx other than 429)
+      expect(createMock).toHaveBeenCalledOnce();
+    });
+
+    it('throws after exhausting all retries', async () => {
+      const transientError = Object.assign(new Error('Service Unavailable'), { status: 503 });
+      const createMock = vi.fn().mockRejectedValue(transientError);
+
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig({ retries: 2 }), registry);
+
+      const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' });
+      // Pre-attach a no-op handler so the promise is never "unhandled" while
+      // we wait for the fake timers to drain the retry backoff sleeps.
+      resultPromise.catch(() => undefined);
+      await vi.runAllTimersAsync();
+
+      await expect(resultPromise).rejects.toThrow('Service Unavailable');
+      // initial attempt + 2 retries = 3 calls total
+      expect(createMock).toHaveBeenCalledTimes(3);
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // Timeout behaviour
+  // -------------------------------------------------------------------------
+
+  describe('timeout behaviour', () => {
+    it('throws ResilientOpenAIError with TIMEOUT code when request exceeds timeoutMs', async () => {
+      // Simulate an underlying client that respects AbortSignal (as the real
+      // OpenAI client / fetch does): reject with an AbortError when the signal
+      // fires.
+      const createMock = vi.fn().mockImplementation(
+        (_params: unknown, options?: { signal?: AbortSignal }) =>
+          new Promise<never>((_, reject) => {
+            const signal = options?.signal;
+            if (!signal) return;
+            const onAbort = () => {
+              const err = new Error('The operation was aborted');
+              err.name = 'AbortError';
+              reject(err);
+            };
+            if (signal.aborted) {
+              onAbort();
+            } else {
+              signal.addEventListener('abort', onAbort, { once: true });
+            }
+          }),
+      );
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock as Mock),
+        fastConfig({ timeoutMs: 100, retries: 0 }),
+        registry,
+      );
+
+      const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' });
+      // Pre-attach a no-op handler so the rejected promise is never "unhandled"
+      // while we wait for the fake abort timer to fire.
+      resultPromise.catch(() => undefined);
+      // Advance past the 100 ms abort timer
+      await vi.advanceTimersByTimeAsync(200);
+
+      await expect(resultPromise).rejects.toMatchObject({
+        name: 'ResilientOpenAIError',
+        code: 'TIMEOUT',
+      });
+    });
+
+    it('does not throw when request completes within timeoutMs', async () => {
+      const createMock = vi.fn().mockImplementation(
+        () => new Promise((resolve) => setTimeout(() => resolve(makeResponse()), 50)),
+      );
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ timeoutMs: 200, retries: 0 }),
+        registry,
+      );
+
+      const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' });
+      await vi.advanceTimersByTimeAsync(100);
+
+      await expect(resultPromise).resolves.toEqual(makeResponse());
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // Circuit-breaker transitions
+  // -------------------------------------------------------------------------
+
+  describe('circuit breaker', () => {
+    it('stays closed on successes', async () => {
+      const createMock = vi.fn().mockResolvedValue(makeResponse());
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ circuitBreakerFailures: 3 }),
+        registry,
+      );
+
+      for (let i = 0; i < 5; i++) {
+        await client.responses.create({ model: 'gpt-4o', input: 'hi' });
+      }
+
+      expect(client.getCircuitState()).toBe('closed');
+    });
+
+    it('opens after N consecutive failures', async () => {
+      const badRequestError = Object.assign(new Error('boom'), { status: 503 });
+      const createMock = vi.fn().mockRejectedValue(badRequestError);
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ circuitBreakerFailures: 3, retries: 0 }),
+        registry,
+      );
+
+      // 3 failures should open the circuit
+      for (let i = 0; i < 3; i++) {
+        await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow();
+      }
+
+      expect(client.getCircuitState()).toBe('open');
+    });
+
+    it('short-circuits calls when open and throws ResilientOpenAIError with CIRCUIT_OPEN code', async () => {
+      const badRequestError = Object.assign(new Error('boom'), { status: 503 });
+      const createMock = vi.fn().mockRejectedValue(badRequestError);
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ circuitBreakerFailures: 2, retries: 0 }),
+        registry,
+      );
+
+      // Open the circuit
+      for (let i = 0; i < 2; i++) {
+        await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow();
+      }
+      expect(client.getCircuitState()).toBe('open');
+
+      const callCountBeforeShortCircuit = createMock.mock.calls.length;
+
+      // Next call should be short-circuited (no underlying call)
+      await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toMatchObject({
+        name: 'ResilientOpenAIError',
+        code: 'CIRCUIT_OPEN',
+      });
+
+      expect(createMock).toHaveBeenCalledTimes(callCountBeforeShortCircuit);
+    });
+
+    it('transitions to half-open and closes on successful probe', async () => {
+      const badRequestError = Object.assign(new Error('boom'), { status: 503 });
+      const createMock = vi
+        .fn()
+        .mockRejectedValue(badRequestError);
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ circuitBreakerFailures: 2, retries: 0, circuitBreakerWindowMs: 100, circuitBreakerProbeMs: 50 }),
+        registry,
+      );
+
+      // Open the circuit
+      for (let i = 0; i < 2; i++) {
+        await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow();
+      }
+      expect(client.getCircuitState()).toBe('open');
+
+      // Advance past the window AND probe interval so a probe is allowed
+      await vi.advanceTimersByTimeAsync(150);
+
+      // Now configure the underlying to succeed on the probe
+      createMock.mockResolvedValue(makeResponse('probe success'));
+
+      const result = await client.responses.create({ model: 'gpt-4o', input: 'probe' });
+      expect(result).toEqual(makeResponse('probe success'));
+      expect(client.getCircuitState()).toBe('closed');
+    });
+
+    it('reopens the circuit when the probe fails', async () => {
+      const badRequestError = Object.assign(new Error('boom'), { status: 503 });
+      const createMock = vi.fn().mockRejectedValue(badRequestError);
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ circuitBreakerFailures: 2, retries: 0, circuitBreakerWindowMs: 100, circuitBreakerProbeMs: 50 }),
+        registry,
+      );
+
+      // Open the circuit
+      for (let i = 0; i < 2; i++) {
+        await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow();
+      }
+
+      // Advance past window + probe interval
+      await vi.advanceTimersByTimeAsync(150);
+
+      // Probe also fails → should re-open
+      await expect(client.responses.create({ model: 'gpt-4o', input: 'probe' })).rejects.toThrow();
+      expect(client.getCircuitState()).toBe('open');
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // Concurrency / bulkhead limiter
+  // -------------------------------------------------------------------------
+
+  describe('concurrency limiter', () => {
+    it('rejects requests that exceed the concurrency limit', async () => {
+      // Create a mock that never resolves so we can accumulate inflight requests
+      const neverResolves = () => new Promise<never>(noop);
+      const createMock = vi.fn().mockImplementation(neverResolves);
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ concurrency: 2, retries: 0 }),
+        registry,
+      );
+
+      // Kick off 2 requests that occupy all slots
+      void client.responses.create({ model: 'gpt-4o', input: '1' });
+      void client.responses.create({ model: 'gpt-4o', input: '2' });
+
+      expect(client.getActiveRequests()).toBe(2);
+
+      // Third request should be rejected
+      await expect(client.responses.create({ model: 'gpt-4o', input: '3' })).rejects.toMatchObject({
+        name: 'ResilientOpenAIError',
+        code: 'CONCURRENCY_REJECTED',
+      });
+    });
+
+    it('allows new requests after a slot is freed', async () => {
+      let resolveFirst!: (v: ReturnType<typeof makeResponse>) => void;
+      const firstRequest = new Promise<ReturnType<typeof makeResponse>>((resolve) => {
+        resolveFirst = resolve;
+      });
+
+      const createMock = vi
+        .fn()
+        .mockImplementationOnce(() => firstRequest)
+        .mockResolvedValue(makeResponse('second'));
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ concurrency: 1, retries: 0 }),
+        registry,
+      );
+
+      // Start first request (will hold the slot)
+      const firstPromise = client.responses.create({ model: 'gpt-4o', input: '1' });
+
+      // Second request should be rejected while slot is held
+      await expect(client.responses.create({ model: 'gpt-4o', input: '2' })).rejects.toMatchObject({
+        code: 'CONCURRENCY_REJECTED',
+      });
+
+      // Free the slot
+      resolveFirst(makeResponse('first'));
+      await firstPromise;
+
+      // Now a new request should succeed
+      const result = await client.responses.create({ model: 'gpt-4o', input: '3' });
+      expect(result).toEqual(makeResponse('second'));
+    });
+  });
+
+  // -------------------------------------------------------------------------
+  // Metrics
+  // -------------------------------------------------------------------------
+
+  describe('metrics', () => {
+    it('increments openai_requests_total on successful request', async () => {
+      const createMock = vi.fn().mockResolvedValue(makeResponse());
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
+
+      await client.responses.create({ model: 'gpt-4o', input: 'hi' });
+
+      const metric = await registry.getSingleMetricAsString(METRIC_NAMES.requestsTotal);
+      expect(metric).toContain('openai_requests_total');
+      expect(metric).toContain('status="success"');
+    });
+
+    it('increments openai_retries_total on retry', async () => {
+      const transientError = Object.assign(new Error('oops'), { status: 503 });
+      const createMock = vi
+        .fn()
+        .mockRejectedValueOnce(transientError)
+        .mockResolvedValue(makeResponse());
+
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
+
+      const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' });
+      await vi.runAllTimersAsync();
+      await resultPromise;
+
+      const metric = await registry.getSingleMetricAsString(METRIC_NAMES.retriesTotal);
+      expect(metric).toContain('openai_retries_total 1');
+    });
+
+    it('increments openai_circuit_open_total when circuit opens', async () => {
+      const badError = Object.assign(new Error('boom'), { status: 503 });
+      const createMock = vi.fn().mockRejectedValue(badError);
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ circuitBreakerFailures: 2, retries: 0 }),
+        registry,
+      );
+
+      for (let i = 0; i < 2; i++) {
+        await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow();
+      }
+
+      const metric = await registry.getSingleMetricAsString(METRIC_NAMES.circuitOpenTotal);
+      expect(metric).toContain('openai_circuit_open_total 1');
+    });
+
+    it('records openai_latency_seconds histogram', async () => {
+      const createMock = vi.fn().mockResolvedValue(makeResponse());
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
+
+      await client.responses.create({ model: 'gpt-4o', input: 'hi' });
+
+      const metric = await registry.getSingleMetricAsString(METRIC_NAMES.latencySeconds);
+      expect(metric).toContain('openai_latency_seconds_bucket');
+    });
+
+    it('does not throw or produce unhandled rejections on failure', async () => {
+      const badError = Object.assign(new Error('fail'), { status: 503 });
+      const createMock = vi.fn().mockRejectedValue(badError);
+
+      const client = new ResilientOpenAIClient(
+        makeUnderlying(createMock),
+        fastConfig({ retries: 0 }),
+        registry,
+      );
+
+      // Should reject but not crash the process
+      await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow(
+        'fail',
+      );
+
+      // Metrics should still be recorded
+      const metric = await registry.getSingleMetricAsString(METRIC_NAMES.failuresTotal);
+      expect(metric).toContain('openai_failures_total');
+    });
+  });
+});
diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts
new file mode 100644
index 00000000..83f82fe7
--- /dev/null
+++ b/packages/backend/src/lib/resilientOpenAIClient.ts
@@ -0,0 +1,451 @@
+import type OpenAI from 'openai';
+import type {
+  ResponseCreateParamsNonStreaming,
+  Response as OpenAIResponse,
+} from 'openai/resources/responses/responses';
+import { Counter, Histogram, Registry } from 'prom-client';
+import type { OpenAIClientConfig } from '../config/openai';
+import { getOpenAIClientConfig } from '../config/openai';
+import { logger } from '../shared/logger/logger';
+import { logError } from '../shared/logger/error-logging';
+
+// ---------------------------------------------------------------------------
+// Public error type
+// ---------------------------------------------------------------------------
+
+export class ResilientOpenAIError extends Error {
+  constructor(
+    message: string,
+    public readonly code: 'CIRCUIT_OPEN' | 'TIMEOUT' | 'CONCURRENCY_REJECTED' | 'MAX_RETRIES_EXCEEDED',
+    public readonly cause?: unknown,
+  ) {
+    super(message);
+    this.name = 'ResilientOpenAIError';
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Metric names (exported so callers can reference them)
+// ---------------------------------------------------------------------------
+
+export const METRIC_NAMES = {
+  requestsTotal: 'openai_requests_total',
+  retriesTotal: 'openai_retries_total',
+  failuresTotal: 'openai_failures_total',
+  circuitOpenTotal: 'openai_circuit_open_total',
+  latencySeconds: 'openai_latency_seconds',
+} as const;
+
+// ---------------------------------------------------------------------------
+// Circuit-breaker states
+// ---------------------------------------------------------------------------
+
+type CircuitState = 'closed' | 'open' | 'half-open';
+
+// ---------------------------------------------------------------------------
+// OpenAI-like interface (narrow surface used by AIService)
+// ---------------------------------------------------------------------------
+
+export interface OpenAIResponsesAPI {
+  create(
+    params: ResponseCreateParamsNonStreaming,
+    options?: OpenAI.RequestOptions,
+  ): Promise<OpenAIResponse>;
+}
+
+export interface OpenAIClientLike {
+  responses: OpenAIResponsesAPI;
+}
+
+// ---------------------------------------------------------------------------
+// Resilient client
+// ---------------------------------------------------------------------------
+
+export class ResilientOpenAIClient implements OpenAIClientLike {
+  public readonly responses: OpenAIResponsesAPI;
+
+  private readonly config: OpenAIClientConfig;
+  private readonly clientLogger = logger.child({ module: 'ResilientOpenAIClient' });
+
+  // Circuit-breaker state
+  private circuitState: CircuitState = 'closed';
+  private consecutiveFailures = 0;
+  private lastOpenedAt: number | null = null;
+  private lastProbeAt: number | null = null;
+
+  // Bulkhead (concurrency limiter)
+  private activeRequests = 0;
+
+  // Prometheus metrics (lazily initialized against a shared or injected registry)
+  private readonly registry: Registry;
+  private readonly metricRequestsTotal: Counter;
+  private readonly metricRetriesTotal: Counter;
+  private readonly metricFailuresTotal: Counter;
+  private readonly metricCircuitOpenTotal: Counter;
+  private readonly metricLatency: Histogram;
+
+  constructor(
+    private readonly underlying: OpenAIClientLike,
+    config?: Partial<OpenAIClientConfig>,
+    registry?: Registry,
+  ) {
+    this.config = { ...getOpenAIClientConfig(), ...config };
+    this.registry = registry ?? new Registry();
+
+    this.metricRequestsTotal = new Counter({
+      name: METRIC_NAMES.requestsTotal,
+      help: 'Total number of outbound OpenAI requests attempted',
+      labelNames: ['status'],
+      registers: [this.registry],
+    });
+
+    this.metricRetriesTotal = new Counter({
+      name: METRIC_NAMES.retriesTotal,
+      help: 'Total number of OpenAI request retry attempts',
+      registers: [this.registry],
+    });
+
+    this.metricFailuresTotal = new Counter({
+      name: METRIC_NAMES.failuresTotal,
+      help: 'Total number of failed OpenAI requests (after all retries)',
+      labelNames: ['reason'],
+      registers: [this.registry],
+    });
+
+    this.metricCircuitOpenTotal = new Counter({
+      name: METRIC_NAMES.circuitOpenTotal,
+      help: 'Total number of times the circuit breaker transitioned to open state',
+      registers: [this.registry],
+    });
+
+    this.metricLatency = new Histogram({
+      name: METRIC_NAMES.latencySeconds,
+      help: 'OpenAI request latency in seconds',
+      labelNames: ['status'],
+      buckets: [0.1, 0.5, 1, 2, 5, 10, 30],
+      registers: [this.registry],
+    });
+
+    this.responses = {
+      create: (params, options) => this.resilientCreate(params, options),
+    };
+  }
+
+  // ---------------------------------------------------------------------------
+  // Public helpers
+  // ---------------------------------------------------------------------------
+
+  /** Returns the current circuit-breaker state for observability / testing. */
+  public getCircuitState(): CircuitState {
+    return this.circuitState;
+  }
+
+  /** Returns the number of currently active (in-flight) requests. */
+  public getActiveRequests(): number {
+    return this.activeRequests;
+  }
+
+  /** Returns the Prometheus registry so callers can expose /metrics if desired. */
+  public getRegistry(): Registry {
+    return this.registry;
+  }
+
+  // ---------------------------------------------------------------------------
+  // Core resilient dispatch
+  // ---------------------------------------------------------------------------
+
+  private async resilientCreate(
+    params: ResponseCreateParamsNonStreaming,
+    options?: OpenAI.RequestOptions,
+  ): Promise<OpenAIResponse> {
+    // Feature-flag bypass: delegate directly to underlying client
+    if (!this.config.featureFlagResilient) {
+      return this.underlying.responses.create(params, options);
+    }
+
+    // --- Circuit-breaker check ---
+    if (!this.isCallAllowed()) {
+      this.metricFailuresTotal.inc({ reason: 'circuit_open' });
+      this.metricRequestsTotal.inc({ status: 'circuit_open' });
+      const msg = 'OpenAI circuit breaker is open; request short-circuited';
+      this.clientLogger.warn(msg, { circuitState: this.circuitState });
+      throw new ResilientOpenAIError(msg, 'CIRCUIT_OPEN');
+    }
+
+    // --- Bulkhead / concurrency limiter ---
+    if (this.activeRequests >= this.config.concurrency) {
+      this.metricFailuresTotal.inc({ reason: 'concurrency_rejected' });
+      this.metricRequestsTotal.inc({ status: 'concurrency_rejected' });
+      const msg = `OpenAI concurrency limit (${this.config.concurrency}) reached; request rejected`;
+      this.clientLogger.warn(msg, { activeRequests: this.activeRequests });
+      throw new ResilientOpenAIError(msg, 'CONCURRENCY_REJECTED');
+    }
+
+    this.activeRequests++;
+    const endTimer = this.metricLatency.startTimer();
+    this.metricRequestsTotal.inc({ status: 'attempted' });
+
+    try {
+      const result = await this.attemptWithRetry(params, options);
+      this.onSuccess();
+      this.metricRequestsTotal.inc({ status: 'success' });
+      endTimer({ status: 'success' });
+      return result;
+    } catch (error) {
+      this.onFailure(error);
+      this.metricFailuresTotal.inc({ reason: 'error' });
+      endTimer({ status: 'error' });
+      throw error;
+    } finally {
+      this.activeRequests--;
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Retry loop with exponential backoff + jitter
+  // ---------------------------------------------------------------------------
+
+  private async attemptWithRetry(
+    params: ResponseCreateParamsNonStreaming,
+    options?: OpenAI.RequestOptions,
+  ): Promise<OpenAIResponse> {
+    let lastError: unknown;
+
+    for (let attempt = 0; attempt <= this.config.retries; attempt++) {
+      if (attempt > 0) {
+        this.metricRetriesTotal.inc();
+        const backoffMs = this.computeBackoffMs(attempt, lastError);
+        this.clientLogger.info('Retrying OpenAI request', { attempt, backoffMs });
+        await this.sleep(backoffMs);
+      }
+
+      try {
+        return await this.attemptWithTimeout(params, options);
+      } catch (error) {
+        lastError = error;
+
+        // Don't retry circuit-open or concurrency errors (they won't be
+        // thrown from here, but guard defensively)
+        if (error instanceof ResilientOpenAIError) {
+          throw error;
+        }
+
+        const isLast = attempt === this.config.retries;
+        if (isLast) {
+          break;
+        }
+
+        if (!this.isRetriable(error)) {
+          this.clientLogger.warn('Non-retriable OpenAI error; giving up', {
+            attempt,
+            error: error instanceof Error ? error.message : String(error),
+          });
+          break;
+        }
+
+        logError(this.clientLogger, 'OpenAI request failed; will retry', error, { attempt });
+      }
+    }
+
+    throw lastError;
+  }
+
+  // ---------------------------------------------------------------------------
+  // Single attempt with AbortController timeout
+  // ---------------------------------------------------------------------------
+
+  private async attemptWithTimeout(
+    params: ResponseCreateParamsNonStreaming,
+    options?: OpenAI.RequestOptions,
+  ): Promise<OpenAIResponse> {
+    const controller = new AbortController();
+    const timer = setTimeout(() => {
+      controller.abort();
+    }, this.config.timeoutMs);
+
+    try {
+      return await this.underlying.responses.create(params, {
+        ...options,
+        signal: controller.signal,
+      });
+    } catch (error) {
+      if (controller.signal.aborted) {
+        const msg = `OpenAI request timed out after ${this.config.timeoutMs}ms`;
+        this.clientLogger.warn(msg, { timeoutMs: this.config.timeoutMs });
+        throw new ResilientOpenAIError(msg, 'TIMEOUT', error);
+      }
+      throw error;
+    } finally {
+      clearTimeout(timer);
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Circuit-breaker helpers
+  // ---------------------------------------------------------------------------
+
+  private isCallAllowed(): boolean {
+    if (this.circuitState === 'closed') {
+      return true;
+    }
+
+    const now = Date.now();
+
+    if (this.circuitState === 'open') {
+      const openedAt = this.lastOpenedAt ?? 0;
+      const windowElapsed = now - openedAt >= this.config.circuitBreakerWindowMs;
+      const probeIntervalElapsed = now - (this.lastProbeAt ?? 0) >= this.config.circuitBreakerProbeMs;
+
+      if (windowElapsed && probeIntervalElapsed) {
+        this.circuitState = 'half-open';
+        this.lastProbeAt = now;
+        this.clientLogger.info('Circuit breaker transitioning to half-open (probing)', {
+          openedAt,
+          circuitWindowMs: this.config.circuitBreakerWindowMs,
+        });
+        return true;
+      }
+      return false;
+    }
+
+    // half-open: allow exactly one probe at a time
+    return true;
+  }
+
+  private onSuccess(): void {
+    if (this.circuitState !== 'closed') {
+      this.clientLogger.info('Circuit breaker closing after successful probe', {
+        previousState: this.circuitState,
+      });
+      this.circuitState = 'closed';
+    }
+    this.consecutiveFailures = 0;
+  }
+
+  private onFailure(error: unknown): void {
+    this.consecutiveFailures++;
+
+    if (this.circuitState === 'half-open') {
+      this.clientLogger.warn('Circuit breaker probe failed; re-opening circuit', {
+        consecutiveFailures: this.consecutiveFailures,
+      });
+      this.openCircuit();
+      return;
+    }
+
+    if (
+      this.circuitState === 'closed' &&
+      this.consecutiveFailures >= this.config.circuitBreakerFailures
+    ) {
+      logError(this.clientLogger, 'Circuit breaker opening after consecutive failures', error, {
+        consecutiveFailures: this.consecutiveFailures,
+        threshold: this.config.circuitBreakerFailures,
+      });
+      this.openCircuit();
+    }
+  }
+
+  private openCircuit(): void {
+    this.circuitState = 'open';
+    this.lastOpenedAt = Date.now();
+    this.metricCircuitOpenTotal.inc();
+    this.clientLogger.warn('Circuit breaker is now open', {
+      consecutiveFailures: this.consecutiveFailures,
+      windowMs: this.config.circuitBreakerWindowMs,
+    });
+  }
+
+  // ---------------------------------------------------------------------------
+  // Retry classification
+  // ---------------------------------------------------------------------------
+
+  private isRetriable(error: unknown): boolean {
+    if (!(error instanceof Error)) {
+      return false;
+    }
+
+    // Timeout errors are retriable
+    if (error instanceof ResilientOpenAIError && error.code === 'TIMEOUT') {
+      return true;
+    }
+
+    // Network / connection errors
+    if (
+      error.message.includes('ECONNRESET') ||
+      error.message.includes('ECONNREFUSED') ||
+      error.message.includes('ETIMEDOUT') ||
+      error.message.includes('socket hang up') ||
+      error.message.includes('fetch failed') ||
+      error.message.includes('network')
+    ) {
+      return true;
+    }
+
+    // OpenAI HTTP status codes: 429 (rate-limit) and 5xx are retriable
+    const statusCode = this.getStatusCode(error);
+    if (statusCode === 429 || (statusCode !== undefined && statusCode >= 500)) {
+      return true;
+    }
+
+    return false;
+  }
+
+  private getStatusCode(error: unknown): number | undefined {
+    if (typeof error === 'object' && error !== null) {
+      const status = Reflect.get(error as object, 'status');
+      return typeof status === 'number' ? status : undefined;
+    }
+    return undefined;
+  }
+
+  // ---------------------------------------------------------------------------
+  // Backoff with full jitter; respects Retry-After header on 429
+  // ---------------------------------------------------------------------------
+
+  private computeBackoffMs(attempt: number, error: unknown): number {
+    // Honour Retry-After header on 429 responses
+    const retryAfterMs = this.extractRetryAfterMs(error);
+    if (retryAfterMs !== null) {
+      return retryAfterMs;
+    }
+
+    // Exponential backoff with full jitter: random(0, base * 2^attempt)
+    const cap = this.config.backoffBaseMs * Math.pow(2, attempt);
+    return Math.random() * cap;
+  }
+
+  private extractRetryAfterMs(error: unknown): number | null {
+    if (typeof error !== 'object' || error === null) {
+      return null;
+    }
+
+    const headers = Reflect.get(error as object, 'headers');
+    if (typeof headers !== 'object' || headers === null) {
+      return null;
+    }
+
+    const retryAfterHeader =
+      (Reflect.get(headers as object, 'retry-after') as string | undefined) ??
+      (Reflect.get(headers as object, 'Retry-After') as string | undefined);
+
+    if (!retryAfterHeader) {
+      return null;
+    }
+
+    const seconds = parseFloat(retryAfterHeader);
+    if (Number.isFinite(seconds)) {
+      return seconds * 1000;
+    }
+
+    // Could be an HTTP-date; fall back to default backoff
+    return null;
+  }
+
+  // ---------------------------------------------------------------------------
+  // Utilities
+  // ---------------------------------------------------------------------------
+
+  private sleep(ms: number): Promise<void> {
+    return new Promise((resolve) => setTimeout(resolve, Math.max(0, ms)));
+  }
+}

From 894b1c4fb37f00bd9a35f3acfe2edc0462921981 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 15:21:06 +0000
Subject: [PATCH 3/7] refactor: address code review feedback on resilient
 OpenAI client

- Clarify ResilientOpenAIError re-throw comment in retry loop
- Fix spelling: Honour -> Honor
- Add comment explaining Math.max(0, ms) defensive guard in sleep()
- Remove unused noop helper; inline it at the call site
- Fix doc table spelling: honoured -> honored
---
 docs/resilient-openai.md                               | 2 +-
 packages/backend/src/lib/resilientOpenAIClient.spec.ts | 4 +---
 packages/backend/src/lib/resilientOpenAIClient.ts      | 8 +++++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/resilient-openai.md b/docs/resilient-openai.md
index a041c507..6568f8cc 100644
--- a/docs/resilient-openai.md
+++ b/docs/resilient-openai.md
@@ -15,7 +15,7 @@ broader Moonbeam service.
 |---|---|
 | Per-request timeout | 10 s |
 | Automatic retries with exponential backoff + full jitter | 3 retries |
-| `Retry-After` header honoured on 429 responses | yes |
+| `Retry-After` header honored on 429 responses | yes |
 | Circuit breaker | opens after 5 consecutive failures |
 | Concurrency / bulkhead limiter | 10 concurrent calls |
 | Graceful degradation via `ResilientOpenAIError` | yes |
diff --git a/packages/backend/src/lib/resilientOpenAIClient.spec.ts b/packages/backend/src/lib/resilientOpenAIClient.spec.ts
index 33e996e3..ea7bc39b 100644
--- a/packages/backend/src/lib/resilientOpenAIClient.spec.ts
+++ b/packages/backend/src/lib/resilientOpenAIClient.spec.ts
@@ -23,8 +23,6 @@ const makeUnderlying = (createImpl: Mock): OpenAIClientLike => ({
   },
 });
 
-const noop = () => undefined;
-
 /** Build a minimal config with fast timings for tests. */
 const fastConfig = (overrides: Partial<OpenAIClientConfig> = {}): Partial<OpenAIClientConfig> => ({
   timeoutMs: 500,
@@ -379,7 +377,7 @@ describe('ResilientOpenAIClient', () => {
   describe('concurrency limiter', () => {
     it('rejects requests that exceed the concurrency limit', async () => {
       // Create a mock that never resolves so we can accumulate inflight requests
-      const neverResolves = () => new Promise<never>(noop);
+      const neverResolves = () => new Promise<never>(() => undefined);
       const createMock = vi.fn().mockImplementation(neverResolves);
 
       const client = new ResilientOpenAIClient(
diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts
index 83f82fe7..d7715ed8 100644
--- a/packages/backend/src/lib/resilientOpenAIClient.ts
+++ b/packages/backend/src/lib/resilientOpenAIClient.ts
@@ -224,8 +224,8 @@ export class ResilientOpenAIClient implements OpenAIClientLike {
       } catch (error) {
         lastError = error;
 
-        // Don't retry circuit-open or concurrency errors (they won't be
-        // thrown from here, but guard defensively)
+        // TIMEOUT is wrapped as a ResilientOpenAIError by attemptWithTimeout;
+        // re-throw it (and any other ResilientOpenAIError) without retrying.
         if (error instanceof ResilientOpenAIError) {
           throw error;
         }
@@ -403,7 +403,7 @@ export class ResilientOpenAIClient implements OpenAIClientLike {
   // ---------------------------------------------------------------------------
 
   private computeBackoffMs(attempt: number, error: unknown): number {
-    // Honour Retry-After header on 429 responses
+    // Honor Retry-After header on 429 responses
     const retryAfterMs = this.extractRetryAfterMs(error);
     if (retryAfterMs !== null) {
       return retryAfterMs;
@@ -446,6 +446,8 @@ export class ResilientOpenAIClient implements OpenAIClientLike {
   // ---------------------------------------------------------------------------
 
   private sleep(ms: number): Promise<void> {
+    // Math.max guards against negative values that could arise from floating-
+    // point jitter computation; setTimeout(fn, 0) is the desired minimum.
     return new Promise((resolve) => setTimeout(resolve, Math.max(0, ms)));
   }
 }

From 8e43bdb78d4c9c2248ea6e692b5c6ccdee44dc65 Mon Sep 17 00:00:00 2001
From: sfreeman422 <sfreeman422@protonmail.com>
Date: Tue, 23 Jun 2026 11:29:49 -0400
Subject: [PATCH 4/7] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 packages/backend/src/lib/resilientOpenAIClient.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts
index d7715ed8..2bca98e9 100644
--- a/packages/backend/src/lib/resilientOpenAIClient.ts
+++ b/packages/backend/src/lib/resilientOpenAIClient.ts
@@ -224,9 +224,9 @@ export class ResilientOpenAIClient implements OpenAIClientLike {
       } catch (error) {
         lastError = error;
 
-        // TIMEOUT is wrapped as a ResilientOpenAIError by attemptWithTimeout;
-        // re-throw it (and any other ResilientOpenAIError) without retrying.
-        if (error instanceof ResilientOpenAIError) {
+        // Only short-circuit retries for non-retriable wrapper errors.
+        // TIMEOUT is classified as retriable by isRetriable().
+        if (error instanceof ResilientOpenAIError && error.code !== 'TIMEOUT') {
           throw error;
         }
 

From 750753e0dafbbc7063326af90f1a2b5694664fa7 Mon Sep 17 00:00:00 2001
From: sfreeman422 <sfreeman422@protonmail.com>
Date: Tue, 23 Jun 2026 11:30:12 -0400
Subject: [PATCH 5/7] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 packages/backend/src/lib/resilientOpenAIClient.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts
index 2bca98e9..42399ab3 100644
--- a/packages/backend/src/lib/resilientOpenAIClient.ts
+++ b/packages/backend/src/lib/resilientOpenAIClient.ts
@@ -309,7 +309,7 @@ export class ResilientOpenAIClient implements OpenAIClientLike {
     }
 
     // half-open: allow exactly one probe at a time
-    return true;
+    return this.activeRequests === 0;
   }
 
   private onSuccess(): void {

From ebbdb86665f027db05745ef1f42b3e93f57217a0 Mon Sep 17 00:00:00 2001
From: sfreeman422 <sfreeman422@protonmail.com>
Date: Tue, 23 Jun 2026 11:30:36 -0400
Subject: [PATCH 6/7] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 packages/backend/src/config/openai.ts | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/packages/backend/src/config/openai.ts b/packages/backend/src/config/openai.ts
index 8d8e8c78..6616e09b 100644
--- a/packages/backend/src/config/openai.ts
+++ b/packages/backend/src/config/openai.ts
@@ -33,12 +33,12 @@ const parseIntWithDefault = (value: string | undefined, defaultValue: number): n
 };
 
 export const getOpenAIClientConfig = (): OpenAIClientConfig => ({
-  timeoutMs: parseIntWithDefault(process.env.OPENAI_TIMEOUT_MS, 10_000),
-  retries: parseIntWithDefault(process.env.OPENAI_RETRIES, 3),
-  backoffBaseMs: parseIntWithDefault(process.env.OPENAI_BACKOFF_BASE_MS, 500),
-  circuitBreakerFailures: parseIntWithDefault(process.env.CIRCUIT_BREAKER_FAILURES, 5),
-  circuitBreakerWindowMs: parseIntWithDefault(process.env.CIRCUIT_BREAKER_WINDOW_MS, 60_000),
-  circuitBreakerProbeMs: parseIntWithDefault(process.env.CIRCUIT_BREAKER_PROBE_MS, 30_000),
-  concurrency: parseIntWithDefault(process.env.OPENAI_CONCURRENCY, 10),
+  timeoutMs: Math.max(1, parseIntWithDefault(process.env.OPENAI_TIMEOUT_MS, 10_000)),
+  retries: Math.max(0, parseIntWithDefault(process.env.OPENAI_RETRIES, 3)),
+  backoffBaseMs: Math.max(0, parseIntWithDefault(process.env.OPENAI_BACKOFF_BASE_MS, 500)),
+  circuitBreakerFailures: Math.max(1, parseIntWithDefault(process.env.CIRCUIT_BREAKER_FAILURES, 5)),
+  circuitBreakerWindowMs: Math.max(1, parseIntWithDefault(process.env.CIRCUIT_BREAKER_WINDOW_MS, 60_000)),
+  circuitBreakerProbeMs: Math.max(1, parseIntWithDefault(process.env.CIRCUIT_BREAKER_PROBE_MS, 30_000)),
+  concurrency: Math.max(1, parseIntWithDefault(process.env.OPENAI_CONCURRENCY, 10)),
   featureFlagResilient: process.env.FEATURE_FLAG_RESILIENT_OPENAI !== 'false',
 });

From 6f0c6ea94681721704d9ed9f63e9ad5e12366f0f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Jun 2026 15:38:01 +0000
Subject: [PATCH 7/7] fix: resolve CI lint and format failures

---
 docs/resilient-openai.md                      | 66 +++++++++----------
 .../src/lib/resilientOpenAIClient.spec.ts     | 51 +++++---------
 .../backend/src/lib/resilientOpenAIClient.ts  | 29 ++++----
 3 files changed, 64 insertions(+), 82 deletions(-)

diff --git a/docs/resilient-openai.md b/docs/resilient-openai.md
index 6568f8cc..c0dc5b86 100644
--- a/docs/resilient-openai.md
+++ b/docs/resilient-openai.md
@@ -1,7 +1,7 @@
 # Resilient OpenAI Client
 
 This document describes the resilient OpenAI wrapper introduced in
-`packages/backend/src/lib/resilientOpenAIClient.ts`.  The wrapper prevents
+`packages/backend/src/lib/resilientOpenAIClient.ts`. The wrapper prevents
 transient or sustained OpenAI failures from cascading and taking down the
 broader Moonbeam service.
 
@@ -11,16 +11,16 @@ broader Moonbeam service.
 
 `ResilientOpenAIClient` wraps the official `openai` SDK client and adds:
 
-| Feature | Default |
-|---|---|
-| Per-request timeout | 10 s |
-| Automatic retries with exponential backoff + full jitter | 3 retries |
-| `Retry-After` header honored on 429 responses | yes |
-| Circuit breaker | opens after 5 consecutive failures |
-| Concurrency / bulkhead limiter | 10 concurrent calls |
-| Graceful degradation via `ResilientOpenAIError` | yes |
-| Structured logging (Winston) | yes |
-| Prometheus metrics (`prom-client`) | yes |
+| Feature                                                  | Default                            |
+| -------------------------------------------------------- | ---------------------------------- |
+| Per-request timeout                                      | 10 s                               |
+| Automatic retries with exponential backoff + full jitter | 3 retries                          |
+| `Retry-After` header honored on 429 responses            | yes                                |
+| Circuit breaker                                          | opens after 5 consecutive failures |
+| Concurrency / bulkhead limiter                           | 10 concurrent calls                |
+| Graceful degradation via `ResilientOpenAIError`          | yes                                |
+| Structured logging (Winston)                             | yes                                |
+| Prometheus metrics (`prom-client`)                       | yes                                |
 
 The wrapper exposes the same narrow `responses.create` surface used by
 `AIService`, so the change is a drop-in replacement.
@@ -31,7 +31,7 @@ The wrapper exposes the same narrow `responses.create` surface used by
 
 Set the environment variable `FEATURE_FLAG_RESILIENT_OPENAI` to `false` to
 bypass all resilience logic and delegate directly to the underlying OpenAI SDK
-client.  This is the rollback switch.
+client. This is the rollback switch.
 
 ```
 FEATURE_FLAG_RESILIENT_OPENAI=false   # bypass resilience (rollback)
@@ -42,18 +42,18 @@ FEATURE_FLAG_RESILIENT_OPENAI=true    # enable resilience (default)
 
 ## Environment variables
 
-All variables are optional.  Defaults are shown.
+All variables are optional. Defaults are shown.
 
-| Variable | Default | Description |
-|---|---|---|
-| `FEATURE_FLAG_RESILIENT_OPENAI` | `true` | Set to `false` to bypass the wrapper entirely. |
-| `OPENAI_TIMEOUT_MS` | `10000` | Maximum ms to wait for a single request before aborting. |
-| `OPENAI_RETRIES` | `3` | Maximum retry attempts on transient errors. |
-| `OPENAI_BACKOFF_BASE_MS` | `500` | Base interval (ms) for exponential backoff with full jitter. |
-| `CIRCUIT_BREAKER_FAILURES` | `5` | Consecutive failures needed to open the circuit. |
-| `CIRCUIT_BREAKER_WINDOW_MS` | `60000` | Duration (ms) the circuit stays open before allowing a probe. |
-| `CIRCUIT_BREAKER_PROBE_MS` | `30000` | Minimum interval (ms) between probe attempts while open. |
-| `OPENAI_CONCURRENCY` | `10` | Maximum concurrent outbound OpenAI calls per instance. |
+| Variable                        | Default | Description                                                   |
+| ------------------------------- | ------- | ------------------------------------------------------------- |
+| `FEATURE_FLAG_RESILIENT_OPENAI` | `true`  | Set to `false` to bypass the wrapper entirely.                |
+| `OPENAI_TIMEOUT_MS`             | `10000` | Maximum ms to wait for a single request before aborting.      |
+| `OPENAI_RETRIES`                | `3`     | Maximum retry attempts on transient errors.                   |
+| `OPENAI_BACKOFF_BASE_MS`        | `500`   | Base interval (ms) for exponential backoff with full jitter.  |
+| `CIRCUIT_BREAKER_FAILURES`      | `5`     | Consecutive failures needed to open the circuit.              |
+| `CIRCUIT_BREAKER_WINDOW_MS`     | `60000` | Duration (ms) the circuit stays open before allowing a probe. |
+| `CIRCUIT_BREAKER_PROBE_MS`      | `30000` | Minimum interval (ms) between probe attempts while open.      |
+| `OPENAI_CONCURRENCY`            | `10`    | Maximum concurrent outbound OpenAI calls per instance.        |
 
 Configuration is loaded by `packages/backend/src/config/openai.ts` which reads
 these variables at instantiation time with sensible defaults.
@@ -145,18 +145,18 @@ return appropriate degraded UX to users.
 
 ## Metrics
 
-The client registers the following Prometheus metrics via `prom-client`.  Each
+The client registers the following Prometheus metrics via `prom-client`. Each
 `ResilientOpenAIClient` instance uses its own `Registry` so multiple instances
-in the same process do not collide.  Expose the registry on a `/metrics`
+in the same process do not collide. Expose the registry on a `/metrics`
 endpoint if Prometheus scraping is desired.
 
-| Metric | Type | Description |
-|---|---|---|
-| `openai_requests_total` | Counter | Total requests attempted, labelled `status` (`attempted`, `success`, `circuit_open`, `concurrency_rejected`). |
-| `openai_retries_total` | Counter | Total retry attempts. |
-| `openai_failures_total` | Counter | Total failed requests after all retries, labelled `reason` (`error`, `circuit_open`, `concurrency_rejected`). |
-| `openai_circuit_open_total` | Counter | Number of times the circuit transitioned to open. |
-| `openai_latency_seconds` | Histogram | End-to-end request latency in seconds, labelled `status` (`success`, `error`). Buckets: 0.1, 0.5, 1, 2, 5, 10, 30. |
+| Metric                      | Type      | Description                                                                                                        |
+| --------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------ |
+| `openai_requests_total`     | Counter   | Total requests attempted, labelled `status` (`attempted`, `success`, `circuit_open`, `concurrency_rejected`).      |
+| `openai_retries_total`      | Counter   | Total retry attempts.                                                                                              |
+| `openai_failures_total`     | Counter   | Total failed requests after all retries, labelled `reason` (`error`, `circuit_open`, `concurrency_rejected`).      |
+| `openai_circuit_open_total` | Counter   | Number of times the circuit transitioned to open.                                                                  |
+| `openai_latency_seconds`    | Histogram | End-to-end request latency in seconds, labelled `status` (`success`, `error`). Buckets: 0.1, 0.5, 1, 2, 5, 10, 30. |
 
 ---
 
@@ -201,7 +201,7 @@ If issues occur at any phase, flip the feature flag:
 FEATURE_FLAG_RESILIENT_OPENAI=false
 ```
 
-Restart the service.  The wrapper delegates directly to the underlying OpenAI
+Restart the service. The wrapper delegates directly to the underlying OpenAI
 SDK; no other code changes are required.
 
 ### Phase 3 — Cleanup (future)
diff --git a/packages/backend/src/lib/resilientOpenAIClient.spec.ts b/packages/backend/src/lib/resilientOpenAIClient.spec.ts
index ea7bc39b..de1acf2a 100644
--- a/packages/backend/src/lib/resilientOpenAIClient.spec.ts
+++ b/packages/backend/src/lib/resilientOpenAIClient.spec.ts
@@ -102,10 +102,7 @@ describe('ResilientOpenAIClient', () => {
 
     it('retries on transient 500 error and succeeds', async () => {
       const transientError = Object.assign(new Error('Internal Server Error'), { status: 500 });
-      const createMock = vi
-        .fn()
-        .mockRejectedValueOnce(transientError)
-        .mockResolvedValue(makeResponse('retry success'));
+      const createMock = vi.fn().mockRejectedValueOnce(transientError).mockResolvedValue(makeResponse('retry success'));
 
       const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
 
@@ -120,10 +117,7 @@ describe('ResilientOpenAIClient', () => {
 
     it('retries on 429 rate-limit error and succeeds', async () => {
       const rateLimitError = Object.assign(new Error('Rate limit exceeded'), { status: 429 });
-      const createMock = vi
-        .fn()
-        .mockRejectedValueOnce(rateLimitError)
-        .mockResolvedValue(makeResponse('ok after 429'));
+      const createMock = vi.fn().mockRejectedValueOnce(rateLimitError).mockResolvedValue(makeResponse('ok after 429'));
 
       const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
 
@@ -163,9 +157,7 @@ describe('ResilientOpenAIClient', () => {
 
       const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
 
-      await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow(
-        'Bad Request',
-      );
+      await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow('Bad Request');
       // One attempt only (no retries for 4xx other than 429)
       expect(createMock).toHaveBeenCalledOnce();
     });
@@ -228,16 +220,16 @@ describe('ResilientOpenAIClient', () => {
       // Advance past the 100 ms abort timer
       await vi.advanceTimersByTimeAsync(200);
 
+      await expect(resultPromise).rejects.toBeInstanceOf(ResilientOpenAIError);
       await expect(resultPromise).rejects.toMatchObject({
-        name: 'ResilientOpenAIError',
         code: 'TIMEOUT',
       });
     });
 
     it('does not throw when request completes within timeoutMs', async () => {
-      const createMock = vi.fn().mockImplementation(
-        () => new Promise((resolve) => setTimeout(() => resolve(makeResponse()), 50)),
-      );
+      const createMock = vi
+        .fn()
+        .mockImplementation(() => new Promise((resolve) => setTimeout(() => resolve(makeResponse()), 50)));
 
       const client = new ResilientOpenAIClient(
         makeUnderlying(createMock),
@@ -309,8 +301,9 @@ describe('ResilientOpenAIClient', () => {
       const callCountBeforeShortCircuit = createMock.mock.calls.length;
 
       // Next call should be short-circuited (no underlying call)
-      await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toMatchObject({
-        name: 'ResilientOpenAIError',
+      const shortCircuited = client.responses.create({ model: 'gpt-4o', input: 'hi' });
+      await expect(shortCircuited).rejects.toBeInstanceOf(ResilientOpenAIError);
+      await expect(shortCircuited).rejects.toMatchObject({
         code: 'CIRCUIT_OPEN',
       });
 
@@ -319,9 +312,7 @@ describe('ResilientOpenAIClient', () => {
 
     it('transitions to half-open and closes on successful probe', async () => {
       const badRequestError = Object.assign(new Error('boom'), { status: 503 });
-      const createMock = vi
-        .fn()
-        .mockRejectedValue(badRequestError);
+      const createMock = vi.fn().mockRejectedValue(badRequestError);
 
       const client = new ResilientOpenAIClient(
         makeUnderlying(createMock),
@@ -393,8 +384,9 @@ describe('ResilientOpenAIClient', () => {
       expect(client.getActiveRequests()).toBe(2);
 
       // Third request should be rejected
-      await expect(client.responses.create({ model: 'gpt-4o', input: '3' })).rejects.toMatchObject({
-        name: 'ResilientOpenAIError',
+      const rejected = client.responses.create({ model: 'gpt-4o', input: '3' });
+      await expect(rejected).rejects.toBeInstanceOf(ResilientOpenAIError);
+      await expect(rejected).rejects.toMatchObject({
         code: 'CONCURRENCY_REJECTED',
       });
     });
@@ -452,10 +444,7 @@ describe('ResilientOpenAIClient', () => {
 
     it('increments openai_retries_total on retry', async () => {
       const transientError = Object.assign(new Error('oops'), { status: 503 });
-      const createMock = vi
-        .fn()
-        .mockRejectedValueOnce(transientError)
-        .mockResolvedValue(makeResponse());
+      const createMock = vi.fn().mockRejectedValueOnce(transientError).mockResolvedValue(makeResponse());
 
       const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry);
 
@@ -499,16 +488,10 @@ describe('ResilientOpenAIClient', () => {
       const badError = Object.assign(new Error('fail'), { status: 503 });
       const createMock = vi.fn().mockRejectedValue(badError);
 
-      const client = new ResilientOpenAIClient(
-        makeUnderlying(createMock),
-        fastConfig({ retries: 0 }),
-        registry,
-      );
+      const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig({ retries: 0 }), registry);
 
       // Should reject but not crash the process
-      await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow(
-        'fail',
-      );
+      await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow('fail');
 
       // Metrics should still be recorded
       const metric = await registry.getSingleMetricAsString(METRIC_NAMES.failuresTotal);
diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts
index 42399ab3..60422262 100644
--- a/packages/backend/src/lib/resilientOpenAIClient.ts
+++ b/packages/backend/src/lib/resilientOpenAIClient.ts
@@ -28,13 +28,19 @@ export class ResilientOpenAIError extends Error {
 // Metric names (exported so callers can reference them)
 // ---------------------------------------------------------------------------
 
-export const METRIC_NAMES = {
+export const METRIC_NAMES: {
+  readonly requestsTotal: 'openai_requests_total';
+  readonly retriesTotal: 'openai_retries_total';
+  readonly failuresTotal: 'openai_failures_total';
+  readonly circuitOpenTotal: 'openai_circuit_open_total';
+  readonly latencySeconds: 'openai_latency_seconds';
+} = {
   requestsTotal: 'openai_requests_total',
   retriesTotal: 'openai_retries_total',
   failuresTotal: 'openai_failures_total',
   circuitOpenTotal: 'openai_circuit_open_total',
   latencySeconds: 'openai_latency_seconds',
-} as const;
+};
 
 // ---------------------------------------------------------------------------
 // Circuit-breaker states
@@ -47,10 +53,7 @@ type CircuitState = 'closed' | 'open' | 'half-open';
 // ---------------------------------------------------------------------------
 
 export interface OpenAIResponsesAPI {
-  create(
-    params: ResponseCreateParamsNonStreaming,
-    options?: OpenAI.RequestOptions,
-  ): Promise<OpenAIResponse>;
+  create(params: ResponseCreateParamsNonStreaming, options?: OpenAI.RequestOptions): Promise<OpenAIResponse>;
 }
 
 export interface OpenAIClientLike {
@@ -333,10 +336,7 @@ export class ResilientOpenAIClient implements OpenAIClientLike {
       return;
     }
 
-    if (
-      this.circuitState === 'closed' &&
-      this.consecutiveFailures >= this.config.circuitBreakerFailures
-    ) {
+    if (this.circuitState === 'closed' && this.consecutiveFailures >= this.config.circuitBreakerFailures) {
       logError(this.clientLogger, 'Circuit breaker opening after consecutive failures', error, {
         consecutiveFailures: this.consecutiveFailures,
         threshold: this.config.circuitBreakerFailures,
@@ -392,7 +392,7 @@ export class ResilientOpenAIClient implements OpenAIClientLike {
 
   private getStatusCode(error: unknown): number | undefined {
     if (typeof error === 'object' && error !== null) {
-      const status = Reflect.get(error as object, 'status');
+      const status = Reflect.get(error, 'status');
       return typeof status === 'number' ? status : undefined;
     }
     return undefined;
@@ -419,14 +419,13 @@ export class ResilientOpenAIClient implements OpenAIClientLike {
       return null;
     }
 
-    const headers = Reflect.get(error as object, 'headers');
+    const headers = Reflect.get(error, 'headers');
     if (typeof headers !== 'object' || headers === null) {
       return null;
     }
 
-    const retryAfterHeader =
-      (Reflect.get(headers as object, 'retry-after') as string | undefined) ??
-      (Reflect.get(headers as object, 'Retry-After') as string | undefined);
+    const retryAfterRaw = Reflect.get(headers, 'retry-after') ?? Reflect.get(headers, 'Retry-After');
+    const retryAfterHeader = typeof retryAfterRaw === 'string' ? retryAfterRaw : undefined;
 
     if (!retryAfterHeader) {
       return null;