From b9275496a8fc2fbc9cf1c6c88c6cfc6c4c31e572 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:06:47 +0000 Subject: [PATCH 1/7] Initial plan From da1aed93a9f4346de8fcf15af90ae86ba076682e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:18:39 +0000 Subject: [PATCH 2/7] feat: add resilient OpenAI client with timeout, retries, circuit breaker, bulkhead, and metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New src/config/openai.ts: reads all resilience settings from env vars - New src/lib/resilientOpenAIClient.ts: per-request AbortController timeout, exponential backoff + jitter retries honoring Retry-After on 429, circuit breaker (closed→open→half-open), concurrency bulkhead, prom-client metrics, structured Winston logging, FEATURE_FLAG_RESILIENT_OPENAI bypass - Update ai.service.ts: openAi property now uses ResilientOpenAIClient wrapper - New src/lib/resilientOpenAIClient.spec.ts: 22 tests covering all features - New docs/resilient-openai.md: env vars, behavior, rollout plan - Add prom-client@15.1.3 dependency --- docs/resilient-openai.md | 212 +++++++ package-lock.json | 38 ++ packages/backend/package.json | 1 + packages/backend/src/ai/ai.service.ts | 10 +- packages/backend/src/config/openai.ts | 44 ++ .../src/lib/resilientOpenAIClient.spec.ts | 520 ++++++++++++++++++ .../backend/src/lib/resilientOpenAIClient.ts | 451 +++++++++++++++ 7 files changed, 1273 insertions(+), 3 deletions(-) create mode 100644 docs/resilient-openai.md create mode 100644 packages/backend/src/config/openai.ts create mode 100644 packages/backend/src/lib/resilientOpenAIClient.spec.ts create mode 100644 packages/backend/src/lib/resilientOpenAIClient.ts diff --git a/docs/resilient-openai.md b/docs/resilient-openai.md new file mode 100644 index 00000000..a041c507 --- /dev/null +++ b/docs/resilient-openai.md @@ -0,0 +1,212 @@ +# Resilient OpenAI Client + +This document describes the resilient OpenAI wrapper introduced in +`packages/backend/src/lib/resilientOpenAIClient.ts`. The wrapper prevents +transient or sustained OpenAI failures from cascading and taking down the +broader Moonbeam service. + +--- + +## Overview + +`ResilientOpenAIClient` wraps the official `openai` SDK client and adds: + +| Feature | Default | +|---|---| +| Per-request timeout | 10 s | +| Automatic retries with exponential backoff + full jitter | 3 retries | +| `Retry-After` header honoured on 429 responses | yes | +| Circuit breaker | opens after 5 consecutive failures | +| Concurrency / bulkhead limiter | 10 concurrent calls | +| Graceful degradation via `ResilientOpenAIError` | yes | +| Structured logging (Winston) | yes | +| Prometheus metrics (`prom-client`) | yes | + +The wrapper exposes the same narrow `responses.create` surface used by +`AIService`, so the change is a drop-in replacement. + +--- + +## Feature flag + +Set the environment variable `FEATURE_FLAG_RESILIENT_OPENAI` to `false` to +bypass all resilience logic and delegate directly to the underlying OpenAI SDK +client. This is the rollback switch. + +``` +FEATURE_FLAG_RESILIENT_OPENAI=false # bypass resilience (rollback) +FEATURE_FLAG_RESILIENT_OPENAI=true # enable resilience (default) +``` + +--- + +## Environment variables + +All variables are optional. Defaults are shown. + +| Variable | Default | Description | +|---|---|---| +| `FEATURE_FLAG_RESILIENT_OPENAI` | `true` | Set to `false` to bypass the wrapper entirely. | +| `OPENAI_TIMEOUT_MS` | `10000` | Maximum ms to wait for a single request before aborting. | +| `OPENAI_RETRIES` | `3` | Maximum retry attempts on transient errors. | +| `OPENAI_BACKOFF_BASE_MS` | `500` | Base interval (ms) for exponential backoff with full jitter. | +| `CIRCUIT_BREAKER_FAILURES` | `5` | Consecutive failures needed to open the circuit. | +| `CIRCUIT_BREAKER_WINDOW_MS` | `60000` | Duration (ms) the circuit stays open before allowing a probe. | +| `CIRCUIT_BREAKER_PROBE_MS` | `30000` | Minimum interval (ms) between probe attempts while open. | +| `OPENAI_CONCURRENCY` | `10` | Maximum concurrent outbound OpenAI calls per instance. | + +Configuration is loaded by `packages/backend/src/config/openai.ts` which reads +these variables at instantiation time with sensible defaults. + +--- + +## Retry behaviour + +Requests are retried when the error is classified as _retriable_: + +- HTTP 429 (rate limit) — also extracts `Retry-After` header and waits + accordingly before retrying. +- HTTP 5xx (server errors). +- Network / connection errors (`ECONNRESET`, `ETIMEDOUT`, socket hang-up, + `fetch failed`, etc.). +- `ResilientOpenAIError` with code `TIMEOUT`. + +Non-retriable errors (4xx other than 429, business-logic errors) are surfaced +immediately. + +Backoff is computed as: + +``` +sleep = random(0, backoffBaseMs * 2^attempt) # full jitter +``` + +If a `Retry-After` header is present on a 429 response, that duration (in +seconds) overrides the computed backoff. + +--- + +## Circuit-breaker states + +``` +CLOSED ──(N consecutive failures)──► OPEN + ▲ │ + │ (probe succeeds) (window + probe interval elapsed) + └──────── HALF-OPEN ◄────────────────┘ + │ + (probe fails) + │ + OPEN +``` + +- **CLOSED**: normal operation. +- **OPEN**: calls are short-circuited immediately with + `ResilientOpenAIError(code: 'CIRCUIT_OPEN')`. No requests reach OpenAI. +- **HALF-OPEN**: one probe call is allowed through. If it succeeds the circuit + closes; if it fails the circuit re-opens. + +The current state is observable via `client.getCircuitState()`. + +--- + +## Concurrency limiter + +At most `OPENAI_CONCURRENCY` requests may be in-flight simultaneously. +Additional requests are rejected immediately with +`ResilientOpenAIError(code: 'CONCURRENCY_REJECTED')`. + +`client.getActiveRequests()` returns the current in-flight count. + +--- + +## Error types + +```typescript +import { ResilientOpenAIError } from '../lib/resilientOpenAIClient'; + +try { + await openAi.responses.create({ ... }); +} catch (err) { + if (err instanceof ResilientOpenAIError) { + // err.code is one of: + // 'CIRCUIT_OPEN' – circuit is open, request was not sent + // 'TIMEOUT' – request exceeded OPENAI_TIMEOUT_MS + // 'CONCURRENCY_REJECTED'– too many concurrent requests + // 'MAX_RETRIES_EXCEEDED'– reserved for future use + handleDegradedMode(err.code); + } +} +``` + +Upstream code (e.g. `AIService`) can check `err instanceof ResilientOpenAIError` +to distinguish transient/infrastructure failures from application errors and +return appropriate degraded UX to users. + +--- + +## Metrics + +The client registers the following Prometheus metrics via `prom-client`. Each +`ResilientOpenAIClient` instance uses its own `Registry` so multiple instances +in the same process do not collide. Expose the registry on a `/metrics` +endpoint if Prometheus scraping is desired. + +| Metric | Type | Description | +|---|---|---| +| `openai_requests_total` | Counter | Total requests attempted, labelled `status` (`attempted`, `success`, `circuit_open`, `concurrency_rejected`). | +| `openai_retries_total` | Counter | Total retry attempts. | +| `openai_failures_total` | Counter | Total failed requests after all retries, labelled `reason` (`error`, `circuit_open`, `concurrency_rejected`). | +| `openai_circuit_open_total` | Counter | Number of times the circuit transitioned to open. | +| `openai_latency_seconds` | Histogram | End-to-end request latency in seconds, labelled `status` (`success`, `error`). Buckets: 0.1, 0.5, 1, 2, 5, 10, 30. | + +--- + +## Logging + +The wrapper emits structured Winston logs (child logger `ResilientOpenAIClient`) +for: + +- Per-retry failures (level `info`). +- Non-retriable errors (level `warn`). +- Timeout aborts (level `warn`). +- Circuit-breaker state transitions (level `info` / `warn`). +- Concurrency limit reached (level `warn`). + +--- + +## Rollout / migration plan + +### Phase 1 — Behind feature flag (current state) + +The wrapper is **enabled by default** (`FEATURE_FLAG_RESILIENT_OPENAI=true`). + +1. **Deploy to staging** — verify smoke tests pass; inspect logs and metrics. +2. **Enable in a canary production host** — monitor `openai_failures_total`, + `openai_circuit_open_total`, and p99 latency via `openai_latency_seconds`. +3. **Gradually roll out** across all production instances while monitoring the + metrics above and Sentry/Datadog error rates. + +### Phase 2 — Stabilisation + +Once the canary shows stable behaviour for 24–48 h: + +- Enable for all production instances. +- Set alerting on `openai_circuit_open_total > 0` and high + `openai_failures_total` rates. + +### Rollback + +If issues occur at any phase, flip the feature flag: + +``` +FEATURE_FLAG_RESILIENT_OPENAI=false +``` + +Restart the service. The wrapper delegates directly to the underlying OpenAI +SDK; no other code changes are required. + +### Phase 3 — Cleanup (future) + +After the feature has been stable in production for a sprint: + +- Remove the `featureFlagResilient` branch and env-var check. +- Remove the `FEATURE_FLAG_RESILIENT_OPENAI` documentation references. diff --git a/package-lock.json b/package-lock.json index 0f61511a..ddc99a2d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1948,6 +1948,15 @@ "node": ">= 8" } }, + "node_modules/@opentelemetry/api": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.1.tgz", + "integrity": "sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==", + "license": "Apache-2.0", + "engines": { + "node": ">=8.0.0" + } + }, "node_modules/@paralleldrive/cuid2": { "version": "2.3.1", "resolved": "https://registry.npmjs.org/@paralleldrive/cuid2/-/cuid2-2.3.1.tgz", @@ -3956,6 +3965,12 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/bintrees": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz", + "integrity": "sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==", + "license": "MIT" + }, "node_modules/body-parser": { "version": "1.20.4", "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.4.tgz", @@ -8082,6 +8097,19 @@ "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", "license": "MIT" }, + "node_modules/prom-client": { + "version": "15.1.3", + "resolved": "https://registry.npmjs.org/prom-client/-/prom-client-15.1.3.tgz", + "integrity": "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.4.0", + "tdigest": "^0.1.1" + }, + "engines": { + "node": "^16 || ^18 || >=20" + } + }, "node_modules/protobufjs": { "version": "7.5.4", "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz", @@ -9353,6 +9381,15 @@ "integrity": "sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==", "license": "MIT" }, + "node_modules/tdigest": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/tdigest/-/tdigest-0.1.2.tgz", + "integrity": "sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==", + "license": "MIT", + "dependencies": { + "bintrees": "1.0.2" + } + }, "node_modules/text-hex": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz", @@ -10734,6 +10771,7 @@ "mysql": "^2.17.1", "node-cron": "^4.2.1", "openai": "^4.103.0", + "prom-client": "^15.1.3", "sentence-splitter": "^5.0.0", "sentiment": "^5.0.2", "sharp": "^0.34.5", diff --git a/packages/backend/package.json b/packages/backend/package.json index 0874a799..321c9bc3 100644 --- a/packages/backend/package.json +++ b/packages/backend/package.json @@ -41,6 +41,7 @@ "mysql": "^2.17.1", "node-cron": "^4.2.1", "openai": "^4.103.0", + "prom-client": "^15.1.3", "sentence-splitter": "^5.0.0", "sentiment": "^5.0.2", "sharp": "^0.34.5", diff --git a/packages/backend/src/ai/ai.service.ts b/packages/backend/src/ai/ai.service.ts index 064f40af..7cfc07a7 100644 --- a/packages/backend/src/ai/ai.service.ts +++ b/packages/backend/src/ai/ai.service.ts @@ -32,6 +32,8 @@ import type { ResponseOutputText, ResponseOutputRefusal, } from 'openai/resources/responses/responses'; +import { ResilientOpenAIClient } from '../lib/resilientOpenAIClient'; +import type { OpenAIClientLike } from '../lib/resilientOpenAIClient'; import type { Part } from '@google/genai'; import { GoogleGenAI } from '@google/genai'; import sharp from 'sharp'; @@ -130,9 +132,11 @@ const normalizeReleaseSha = (value?: string): string | null => { export class AIService { redis = new AIPersistenceService(); - openAi = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY, - }); + openAi: OpenAIClientLike = new ResilientOpenAIClient( + new OpenAI({ + apiKey: process.env.OPENAI_API_KEY, + }), + ); gemini = new GoogleGenAI({ apiKey: process.env.GOOGLE_GEMINI_API_KEY }); muzzlePersistenceService = new MuzzlePersistenceService(); diff --git a/packages/backend/src/config/openai.ts b/packages/backend/src/config/openai.ts new file mode 100644 index 00000000..8d8e8c78 --- /dev/null +++ b/packages/backend/src/config/openai.ts @@ -0,0 +1,44 @@ +/** + * Configuration for the resilient OpenAI client. + * All values are read from environment variables with sensible defaults. + */ + +export interface OpenAIClientConfig { + /** Maximum ms to wait for a single OpenAI request before aborting. */ + timeoutMs: number; + /** Maximum number of retry attempts on transient errors. */ + retries: number; + /** Base backoff interval (ms) for exponential backoff with full jitter. */ + backoffBaseMs: number; + /** Number of consecutive failures required to open the circuit breaker. */ + circuitBreakerFailures: number; + /** Duration (ms) the circuit stays open before allowing a probe. */ + circuitBreakerWindowMs: number; + /** Minimum interval (ms) between probe attempts while the circuit is open. */ + circuitBreakerProbeMs: number; + /** Maximum number of concurrent outbound OpenAI calls. */ + concurrency: number; + /** + * When true (default), requests are wrapped with timeouts, retries, + * the circuit breaker, and the concurrency limiter. + * Set FEATURE_FLAG_RESILIENT_OPENAI=false to bypass all resilience logic + * and delegate directly to the underlying OpenAI client. + */ + featureFlagResilient: boolean; +} + +const parseIntWithDefault = (value: string | undefined, defaultValue: number): number => { + const parsed = parseInt(value ?? '', 10); + return Number.isFinite(parsed) ? parsed : defaultValue; +}; + +export const getOpenAIClientConfig = (): OpenAIClientConfig => ({ + timeoutMs: parseIntWithDefault(process.env.OPENAI_TIMEOUT_MS, 10_000), + retries: parseIntWithDefault(process.env.OPENAI_RETRIES, 3), + backoffBaseMs: parseIntWithDefault(process.env.OPENAI_BACKOFF_BASE_MS, 500), + circuitBreakerFailures: parseIntWithDefault(process.env.CIRCUIT_BREAKER_FAILURES, 5), + circuitBreakerWindowMs: parseIntWithDefault(process.env.CIRCUIT_BREAKER_WINDOW_MS, 60_000), + circuitBreakerProbeMs: parseIntWithDefault(process.env.CIRCUIT_BREAKER_PROBE_MS, 30_000), + concurrency: parseIntWithDefault(process.env.OPENAI_CONCURRENCY, 10), + featureFlagResilient: process.env.FEATURE_FLAG_RESILIENT_OPENAI !== 'false', +}); diff --git a/packages/backend/src/lib/resilientOpenAIClient.spec.ts b/packages/backend/src/lib/resilientOpenAIClient.spec.ts new file mode 100644 index 00000000..33e996e3 --- /dev/null +++ b/packages/backend/src/lib/resilientOpenAIClient.spec.ts @@ -0,0 +1,520 @@ +import { vi, it, describe, expect, beforeEach, afterEach } from 'vitest'; +import { ResilientOpenAIClient, ResilientOpenAIError, METRIC_NAMES } from './resilientOpenAIClient'; +import type { OpenAIClientLike } from './resilientOpenAIClient'; +import { Registry } from 'prom-client'; +import type { OpenAIClientConfig } from '../config/openai'; + +// --------------------------------------------------------------------------- +// Test helpers +// --------------------------------------------------------------------------- + +const makeResponse = (text = 'hello') => ({ + output: [ + { + type: 'message' as const, + content: [{ type: 'output_text' as const, text }], + }, + ], +}); + +const makeUnderlying = (createImpl: Mock): OpenAIClientLike => ({ + responses: { + create: createImpl, + }, +}); + +const noop = () => undefined; + +/** Build a minimal config with fast timings for tests. */ +const fastConfig = (overrides: Partial = {}): Partial => ({ + timeoutMs: 500, + retries: 2, + backoffBaseMs: 0, + circuitBreakerFailures: 3, + circuitBreakerWindowMs: 100, + circuitBreakerProbeMs: 50, + concurrency: 10, + featureFlagResilient: true, + ...overrides, +}); + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('ResilientOpenAIClient', () => { + let registry: Registry; + + beforeEach(() => { + registry = new Registry(); + vi.useFakeTimers({ shouldAdvanceTime: false }); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + // ------------------------------------------------------------------------- + // Feature-flag bypass + // ------------------------------------------------------------------------- + + describe('feature flag', () => { + it('delegates directly to the underlying client when featureFlagResilient is false', async () => { + const createMock = vi.fn().mockResolvedValue(makeResponse()); + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ featureFlagResilient: false }), + registry, + ); + + const params = { model: 'gpt-4o', input: 'hello' }; + await client.responses.create(params); + + expect(createMock).toHaveBeenCalledOnce(); + expect(createMock).toHaveBeenCalledWith(params, undefined); + }); + + it('applies resilience when featureFlagResilient is true (default)', async () => { + const createMock = vi.fn().mockResolvedValue(makeResponse()); + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ featureFlagResilient: true }), + registry, + ); + + await client.responses.create({ model: 'gpt-4o', input: 'hello' }); + expect(createMock).toHaveBeenCalledOnce(); + }); + }); + + // ------------------------------------------------------------------------- + // Retry behaviour + // ------------------------------------------------------------------------- + + describe('retry behaviour', () => { + it('returns successfully on first attempt without retrying', async () => { + const createMock = vi.fn().mockResolvedValue(makeResponse('first try')); + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); + + const result = await client.responses.create({ model: 'gpt-4o', input: 'hi' }); + + expect(createMock).toHaveBeenCalledOnce(); + expect(result).toEqual(makeResponse('first try')); + }); + + it('retries on transient 500 error and succeeds', async () => { + const transientError = Object.assign(new Error('Internal Server Error'), { status: 500 }); + const createMock = vi + .fn() + .mockRejectedValueOnce(transientError) + .mockResolvedValue(makeResponse('retry success')); + + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); + + const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' }); + // Advance past the (near-zero) backoff + await vi.runAllTimersAsync(); + const result = await resultPromise; + + expect(createMock).toHaveBeenCalledTimes(2); + expect(result).toEqual(makeResponse('retry success')); + }); + + it('retries on 429 rate-limit error and succeeds', async () => { + const rateLimitError = Object.assign(new Error('Rate limit exceeded'), { status: 429 }); + const createMock = vi + .fn() + .mockRejectedValueOnce(rateLimitError) + .mockResolvedValue(makeResponse('ok after 429')); + + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); + + const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' }); + await vi.runAllTimersAsync(); + const result = await resultPromise; + + expect(createMock).toHaveBeenCalledTimes(2); + expect(result).toEqual(makeResponse('ok after 429')); + }); + + it('honors Retry-After header on 429 response', async () => { + const retryAfterSeconds = 2; + const rateLimitError = Object.assign(new Error('Rate limited'), { + status: 429, + headers: { 'retry-after': String(retryAfterSeconds) }, + }); + const createMock = vi + .fn() + .mockRejectedValueOnce(rateLimitError) + .mockResolvedValue(makeResponse('after retry-after')); + + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); + + const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' }); + // Advance past the Retry-After delay (2000ms) + await vi.advanceTimersByTimeAsync(retryAfterSeconds * 1000 + 100); + const result = await resultPromise; + + expect(result).toEqual(makeResponse('after retry-after')); + expect(createMock).toHaveBeenCalledTimes(2); + }); + + it('does not retry on 400 (non-retriable) error', async () => { + const badRequestError = Object.assign(new Error('Bad Request'), { status: 400 }); + const createMock = vi.fn().mockRejectedValue(badRequestError); + + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); + + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow( + 'Bad Request', + ); + // One attempt only (no retries for 4xx other than 429) + expect(createMock).toHaveBeenCalledOnce(); + }); + + it('throws after exhausting all retries', async () => { + const transientError = Object.assign(new Error('Service Unavailable'), { status: 503 }); + const createMock = vi.fn().mockRejectedValue(transientError); + + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig({ retries: 2 }), registry); + + const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' }); + // Pre-attach a no-op handler so the promise is never "unhandled" while + // we wait for the fake timers to drain the retry backoff sleeps. + resultPromise.catch(() => undefined); + await vi.runAllTimersAsync(); + + await expect(resultPromise).rejects.toThrow('Service Unavailable'); + // initial attempt + 2 retries = 3 calls total + expect(createMock).toHaveBeenCalledTimes(3); + }); + }); + + // ------------------------------------------------------------------------- + // Timeout behaviour + // ------------------------------------------------------------------------- + + describe('timeout behaviour', () => { + it('throws ResilientOpenAIError with TIMEOUT code when request exceeds timeoutMs', async () => { + // Simulate an underlying client that respects AbortSignal (as the real + // OpenAI client / fetch does): reject with an AbortError when the signal + // fires. + const createMock = vi.fn().mockImplementation( + (_params: unknown, options?: { signal?: AbortSignal }) => + new Promise((_, reject) => { + const signal = options?.signal; + if (!signal) return; + const onAbort = () => { + const err = new Error('The operation was aborted'); + err.name = 'AbortError'; + reject(err); + }; + if (signal.aborted) { + onAbort(); + } else { + signal.addEventListener('abort', onAbort, { once: true }); + } + }), + ); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock as Mock), + fastConfig({ timeoutMs: 100, retries: 0 }), + registry, + ); + + const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' }); + // Pre-attach a no-op handler so the rejected promise is never "unhandled" + // while we wait for the fake abort timer to fire. + resultPromise.catch(() => undefined); + // Advance past the 100 ms abort timer + await vi.advanceTimersByTimeAsync(200); + + await expect(resultPromise).rejects.toMatchObject({ + name: 'ResilientOpenAIError', + code: 'TIMEOUT', + }); + }); + + it('does not throw when request completes within timeoutMs', async () => { + const createMock = vi.fn().mockImplementation( + () => new Promise((resolve) => setTimeout(() => resolve(makeResponse()), 50)), + ); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ timeoutMs: 200, retries: 0 }), + registry, + ); + + const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' }); + await vi.advanceTimersByTimeAsync(100); + + await expect(resultPromise).resolves.toEqual(makeResponse()); + }); + }); + + // ------------------------------------------------------------------------- + // Circuit-breaker transitions + // ------------------------------------------------------------------------- + + describe('circuit breaker', () => { + it('stays closed on successes', async () => { + const createMock = vi.fn().mockResolvedValue(makeResponse()); + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ circuitBreakerFailures: 3 }), + registry, + ); + + for (let i = 0; i < 5; i++) { + await client.responses.create({ model: 'gpt-4o', input: 'hi' }); + } + + expect(client.getCircuitState()).toBe('closed'); + }); + + it('opens after N consecutive failures', async () => { + const badRequestError = Object.assign(new Error('boom'), { status: 503 }); + const createMock = vi.fn().mockRejectedValue(badRequestError); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ circuitBreakerFailures: 3, retries: 0 }), + registry, + ); + + // 3 failures should open the circuit + for (let i = 0; i < 3; i++) { + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow(); + } + + expect(client.getCircuitState()).toBe('open'); + }); + + it('short-circuits calls when open and throws ResilientOpenAIError with CIRCUIT_OPEN code', async () => { + const badRequestError = Object.assign(new Error('boom'), { status: 503 }); + const createMock = vi.fn().mockRejectedValue(badRequestError); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ circuitBreakerFailures: 2, retries: 0 }), + registry, + ); + + // Open the circuit + for (let i = 0; i < 2; i++) { + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow(); + } + expect(client.getCircuitState()).toBe('open'); + + const callCountBeforeShortCircuit = createMock.mock.calls.length; + + // Next call should be short-circuited (no underlying call) + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toMatchObject({ + name: 'ResilientOpenAIError', + code: 'CIRCUIT_OPEN', + }); + + expect(createMock).toHaveBeenCalledTimes(callCountBeforeShortCircuit); + }); + + it('transitions to half-open and closes on successful probe', async () => { + const badRequestError = Object.assign(new Error('boom'), { status: 503 }); + const createMock = vi + .fn() + .mockRejectedValue(badRequestError); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ circuitBreakerFailures: 2, retries: 0, circuitBreakerWindowMs: 100, circuitBreakerProbeMs: 50 }), + registry, + ); + + // Open the circuit + for (let i = 0; i < 2; i++) { + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow(); + } + expect(client.getCircuitState()).toBe('open'); + + // Advance past the window AND probe interval so a probe is allowed + await vi.advanceTimersByTimeAsync(150); + + // Now configure the underlying to succeed on the probe + createMock.mockResolvedValue(makeResponse('probe success')); + + const result = await client.responses.create({ model: 'gpt-4o', input: 'probe' }); + expect(result).toEqual(makeResponse('probe success')); + expect(client.getCircuitState()).toBe('closed'); + }); + + it('reopens the circuit when the probe fails', async () => { + const badRequestError = Object.assign(new Error('boom'), { status: 503 }); + const createMock = vi.fn().mockRejectedValue(badRequestError); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ circuitBreakerFailures: 2, retries: 0, circuitBreakerWindowMs: 100, circuitBreakerProbeMs: 50 }), + registry, + ); + + // Open the circuit + for (let i = 0; i < 2; i++) { + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow(); + } + + // Advance past window + probe interval + await vi.advanceTimersByTimeAsync(150); + + // Probe also fails → should re-open + await expect(client.responses.create({ model: 'gpt-4o', input: 'probe' })).rejects.toThrow(); + expect(client.getCircuitState()).toBe('open'); + }); + }); + + // ------------------------------------------------------------------------- + // Concurrency / bulkhead limiter + // ------------------------------------------------------------------------- + + describe('concurrency limiter', () => { + it('rejects requests that exceed the concurrency limit', async () => { + // Create a mock that never resolves so we can accumulate inflight requests + const neverResolves = () => new Promise(noop); + const createMock = vi.fn().mockImplementation(neverResolves); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ concurrency: 2, retries: 0 }), + registry, + ); + + // Kick off 2 requests that occupy all slots + void client.responses.create({ model: 'gpt-4o', input: '1' }); + void client.responses.create({ model: 'gpt-4o', input: '2' }); + + expect(client.getActiveRequests()).toBe(2); + + // Third request should be rejected + await expect(client.responses.create({ model: 'gpt-4o', input: '3' })).rejects.toMatchObject({ + name: 'ResilientOpenAIError', + code: 'CONCURRENCY_REJECTED', + }); + }); + + it('allows new requests after a slot is freed', async () => { + let resolveFirst!: (v: ReturnType) => void; + const firstRequest = new Promise>((resolve) => { + resolveFirst = resolve; + }); + + const createMock = vi + .fn() + .mockImplementationOnce(() => firstRequest) + .mockResolvedValue(makeResponse('second')); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ concurrency: 1, retries: 0 }), + registry, + ); + + // Start first request (will hold the slot) + const firstPromise = client.responses.create({ model: 'gpt-4o', input: '1' }); + + // Second request should be rejected while slot is held + await expect(client.responses.create({ model: 'gpt-4o', input: '2' })).rejects.toMatchObject({ + code: 'CONCURRENCY_REJECTED', + }); + + // Free the slot + resolveFirst(makeResponse('first')); + await firstPromise; + + // Now a new request should succeed + const result = await client.responses.create({ model: 'gpt-4o', input: '3' }); + expect(result).toEqual(makeResponse('second')); + }); + }); + + // ------------------------------------------------------------------------- + // Metrics + // ------------------------------------------------------------------------- + + describe('metrics', () => { + it('increments openai_requests_total on successful request', async () => { + const createMock = vi.fn().mockResolvedValue(makeResponse()); + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); + + await client.responses.create({ model: 'gpt-4o', input: 'hi' }); + + const metric = await registry.getSingleMetricAsString(METRIC_NAMES.requestsTotal); + expect(metric).toContain('openai_requests_total'); + expect(metric).toContain('status="success"'); + }); + + it('increments openai_retries_total on retry', async () => { + const transientError = Object.assign(new Error('oops'), { status: 503 }); + const createMock = vi + .fn() + .mockRejectedValueOnce(transientError) + .mockResolvedValue(makeResponse()); + + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); + + const resultPromise = client.responses.create({ model: 'gpt-4o', input: 'hi' }); + await vi.runAllTimersAsync(); + await resultPromise; + + const metric = await registry.getSingleMetricAsString(METRIC_NAMES.retriesTotal); + expect(metric).toContain('openai_retries_total 1'); + }); + + it('increments openai_circuit_open_total when circuit opens', async () => { + const badError = Object.assign(new Error('boom'), { status: 503 }); + const createMock = vi.fn().mockRejectedValue(badError); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ circuitBreakerFailures: 2, retries: 0 }), + registry, + ); + + for (let i = 0; i < 2; i++) { + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow(); + } + + const metric = await registry.getSingleMetricAsString(METRIC_NAMES.circuitOpenTotal); + expect(metric).toContain('openai_circuit_open_total 1'); + }); + + it('records openai_latency_seconds histogram', async () => { + const createMock = vi.fn().mockResolvedValue(makeResponse()); + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); + + await client.responses.create({ model: 'gpt-4o', input: 'hi' }); + + const metric = await registry.getSingleMetricAsString(METRIC_NAMES.latencySeconds); + expect(metric).toContain('openai_latency_seconds_bucket'); + }); + + it('does not throw or produce unhandled rejections on failure', async () => { + const badError = Object.assign(new Error('fail'), { status: 503 }); + const createMock = vi.fn().mockRejectedValue(badError); + + const client = new ResilientOpenAIClient( + makeUnderlying(createMock), + fastConfig({ retries: 0 }), + registry, + ); + + // Should reject but not crash the process + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow( + 'fail', + ); + + // Metrics should still be recorded + const metric = await registry.getSingleMetricAsString(METRIC_NAMES.failuresTotal); + expect(metric).toContain('openai_failures_total'); + }); + }); +}); diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts new file mode 100644 index 00000000..83f82fe7 --- /dev/null +++ b/packages/backend/src/lib/resilientOpenAIClient.ts @@ -0,0 +1,451 @@ +import type OpenAI from 'openai'; +import type { + ResponseCreateParamsNonStreaming, + Response as OpenAIResponse, +} from 'openai/resources/responses/responses'; +import { Counter, Histogram, Registry } from 'prom-client'; +import type { OpenAIClientConfig } from '../config/openai'; +import { getOpenAIClientConfig } from '../config/openai'; +import { logger } from '../shared/logger/logger'; +import { logError } from '../shared/logger/error-logging'; + +// --------------------------------------------------------------------------- +// Public error type +// --------------------------------------------------------------------------- + +export class ResilientOpenAIError extends Error { + constructor( + message: string, + public readonly code: 'CIRCUIT_OPEN' | 'TIMEOUT' | 'CONCURRENCY_REJECTED' | 'MAX_RETRIES_EXCEEDED', + public readonly cause?: unknown, + ) { + super(message); + this.name = 'ResilientOpenAIError'; + } +} + +// --------------------------------------------------------------------------- +// Metric names (exported so callers can reference them) +// --------------------------------------------------------------------------- + +export const METRIC_NAMES = { + requestsTotal: 'openai_requests_total', + retriesTotal: 'openai_retries_total', + failuresTotal: 'openai_failures_total', + circuitOpenTotal: 'openai_circuit_open_total', + latencySeconds: 'openai_latency_seconds', +} as const; + +// --------------------------------------------------------------------------- +// Circuit-breaker states +// --------------------------------------------------------------------------- + +type CircuitState = 'closed' | 'open' | 'half-open'; + +// --------------------------------------------------------------------------- +// OpenAI-like interface (narrow surface used by AIService) +// --------------------------------------------------------------------------- + +export interface OpenAIResponsesAPI { + create( + params: ResponseCreateParamsNonStreaming, + options?: OpenAI.RequestOptions, + ): Promise; +} + +export interface OpenAIClientLike { + responses: OpenAIResponsesAPI; +} + +// --------------------------------------------------------------------------- +// Resilient client +// --------------------------------------------------------------------------- + +export class ResilientOpenAIClient implements OpenAIClientLike { + public readonly responses: OpenAIResponsesAPI; + + private readonly config: OpenAIClientConfig; + private readonly clientLogger = logger.child({ module: 'ResilientOpenAIClient' }); + + // Circuit-breaker state + private circuitState: CircuitState = 'closed'; + private consecutiveFailures = 0; + private lastOpenedAt: number | null = null; + private lastProbeAt: number | null = null; + + // Bulkhead (concurrency limiter) + private activeRequests = 0; + + // Prometheus metrics (lazily initialized against a shared or injected registry) + private readonly registry: Registry; + private readonly metricRequestsTotal: Counter; + private readonly metricRetriesTotal: Counter; + private readonly metricFailuresTotal: Counter; + private readonly metricCircuitOpenTotal: Counter; + private readonly metricLatency: Histogram; + + constructor( + private readonly underlying: OpenAIClientLike, + config?: Partial, + registry?: Registry, + ) { + this.config = { ...getOpenAIClientConfig(), ...config }; + this.registry = registry ?? new Registry(); + + this.metricRequestsTotal = new Counter({ + name: METRIC_NAMES.requestsTotal, + help: 'Total number of outbound OpenAI requests attempted', + labelNames: ['status'], + registers: [this.registry], + }); + + this.metricRetriesTotal = new Counter({ + name: METRIC_NAMES.retriesTotal, + help: 'Total number of OpenAI request retry attempts', + registers: [this.registry], + }); + + this.metricFailuresTotal = new Counter({ + name: METRIC_NAMES.failuresTotal, + help: 'Total number of failed OpenAI requests (after all retries)', + labelNames: ['reason'], + registers: [this.registry], + }); + + this.metricCircuitOpenTotal = new Counter({ + name: METRIC_NAMES.circuitOpenTotal, + help: 'Total number of times the circuit breaker transitioned to open state', + registers: [this.registry], + }); + + this.metricLatency = new Histogram({ + name: METRIC_NAMES.latencySeconds, + help: 'OpenAI request latency in seconds', + labelNames: ['status'], + buckets: [0.1, 0.5, 1, 2, 5, 10, 30], + registers: [this.registry], + }); + + this.responses = { + create: (params, options) => this.resilientCreate(params, options), + }; + } + + // --------------------------------------------------------------------------- + // Public helpers + // --------------------------------------------------------------------------- + + /** Returns the current circuit-breaker state for observability / testing. */ + public getCircuitState(): CircuitState { + return this.circuitState; + } + + /** Returns the number of currently active (in-flight) requests. */ + public getActiveRequests(): number { + return this.activeRequests; + } + + /** Returns the Prometheus registry so callers can expose /metrics if desired. */ + public getRegistry(): Registry { + return this.registry; + } + + // --------------------------------------------------------------------------- + // Core resilient dispatch + // --------------------------------------------------------------------------- + + private async resilientCreate( + params: ResponseCreateParamsNonStreaming, + options?: OpenAI.RequestOptions, + ): Promise { + // Feature-flag bypass: delegate directly to underlying client + if (!this.config.featureFlagResilient) { + return this.underlying.responses.create(params, options); + } + + // --- Circuit-breaker check --- + if (!this.isCallAllowed()) { + this.metricFailuresTotal.inc({ reason: 'circuit_open' }); + this.metricRequestsTotal.inc({ status: 'circuit_open' }); + const msg = 'OpenAI circuit breaker is open; request short-circuited'; + this.clientLogger.warn(msg, { circuitState: this.circuitState }); + throw new ResilientOpenAIError(msg, 'CIRCUIT_OPEN'); + } + + // --- Bulkhead / concurrency limiter --- + if (this.activeRequests >= this.config.concurrency) { + this.metricFailuresTotal.inc({ reason: 'concurrency_rejected' }); + this.metricRequestsTotal.inc({ status: 'concurrency_rejected' }); + const msg = `OpenAI concurrency limit (${this.config.concurrency}) reached; request rejected`; + this.clientLogger.warn(msg, { activeRequests: this.activeRequests }); + throw new ResilientOpenAIError(msg, 'CONCURRENCY_REJECTED'); + } + + this.activeRequests++; + const endTimer = this.metricLatency.startTimer(); + this.metricRequestsTotal.inc({ status: 'attempted' }); + + try { + const result = await this.attemptWithRetry(params, options); + this.onSuccess(); + this.metricRequestsTotal.inc({ status: 'success' }); + endTimer({ status: 'success' }); + return result; + } catch (error) { + this.onFailure(error); + this.metricFailuresTotal.inc({ reason: 'error' }); + endTimer({ status: 'error' }); + throw error; + } finally { + this.activeRequests--; + } + } + + // --------------------------------------------------------------------------- + // Retry loop with exponential backoff + jitter + // --------------------------------------------------------------------------- + + private async attemptWithRetry( + params: ResponseCreateParamsNonStreaming, + options?: OpenAI.RequestOptions, + ): Promise { + let lastError: unknown; + + for (let attempt = 0; attempt <= this.config.retries; attempt++) { + if (attempt > 0) { + this.metricRetriesTotal.inc(); + const backoffMs = this.computeBackoffMs(attempt, lastError); + this.clientLogger.info('Retrying OpenAI request', { attempt, backoffMs }); + await this.sleep(backoffMs); + } + + try { + return await this.attemptWithTimeout(params, options); + } catch (error) { + lastError = error; + + // Don't retry circuit-open or concurrency errors (they won't be + // thrown from here, but guard defensively) + if (error instanceof ResilientOpenAIError) { + throw error; + } + + const isLast = attempt === this.config.retries; + if (isLast) { + break; + } + + if (!this.isRetriable(error)) { + this.clientLogger.warn('Non-retriable OpenAI error; giving up', { + attempt, + error: error instanceof Error ? error.message : String(error), + }); + break; + } + + logError(this.clientLogger, 'OpenAI request failed; will retry', error, { attempt }); + } + } + + throw lastError; + } + + // --------------------------------------------------------------------------- + // Single attempt with AbortController timeout + // --------------------------------------------------------------------------- + + private async attemptWithTimeout( + params: ResponseCreateParamsNonStreaming, + options?: OpenAI.RequestOptions, + ): Promise { + const controller = new AbortController(); + const timer = setTimeout(() => { + controller.abort(); + }, this.config.timeoutMs); + + try { + return await this.underlying.responses.create(params, { + ...options, + signal: controller.signal, + }); + } catch (error) { + if (controller.signal.aborted) { + const msg = `OpenAI request timed out after ${this.config.timeoutMs}ms`; + this.clientLogger.warn(msg, { timeoutMs: this.config.timeoutMs }); + throw new ResilientOpenAIError(msg, 'TIMEOUT', error); + } + throw error; + } finally { + clearTimeout(timer); + } + } + + // --------------------------------------------------------------------------- + // Circuit-breaker helpers + // --------------------------------------------------------------------------- + + private isCallAllowed(): boolean { + if (this.circuitState === 'closed') { + return true; + } + + const now = Date.now(); + + if (this.circuitState === 'open') { + const openedAt = this.lastOpenedAt ?? 0; + const windowElapsed = now - openedAt >= this.config.circuitBreakerWindowMs; + const probeIntervalElapsed = now - (this.lastProbeAt ?? 0) >= this.config.circuitBreakerProbeMs; + + if (windowElapsed && probeIntervalElapsed) { + this.circuitState = 'half-open'; + this.lastProbeAt = now; + this.clientLogger.info('Circuit breaker transitioning to half-open (probing)', { + openedAt, + circuitWindowMs: this.config.circuitBreakerWindowMs, + }); + return true; + } + return false; + } + + // half-open: allow exactly one probe at a time + return true; + } + + private onSuccess(): void { + if (this.circuitState !== 'closed') { + this.clientLogger.info('Circuit breaker closing after successful probe', { + previousState: this.circuitState, + }); + this.circuitState = 'closed'; + } + this.consecutiveFailures = 0; + } + + private onFailure(error: unknown): void { + this.consecutiveFailures++; + + if (this.circuitState === 'half-open') { + this.clientLogger.warn('Circuit breaker probe failed; re-opening circuit', { + consecutiveFailures: this.consecutiveFailures, + }); + this.openCircuit(); + return; + } + + if ( + this.circuitState === 'closed' && + this.consecutiveFailures >= this.config.circuitBreakerFailures + ) { + logError(this.clientLogger, 'Circuit breaker opening after consecutive failures', error, { + consecutiveFailures: this.consecutiveFailures, + threshold: this.config.circuitBreakerFailures, + }); + this.openCircuit(); + } + } + + private openCircuit(): void { + this.circuitState = 'open'; + this.lastOpenedAt = Date.now(); + this.metricCircuitOpenTotal.inc(); + this.clientLogger.warn('Circuit breaker is now open', { + consecutiveFailures: this.consecutiveFailures, + windowMs: this.config.circuitBreakerWindowMs, + }); + } + + // --------------------------------------------------------------------------- + // Retry classification + // --------------------------------------------------------------------------- + + private isRetriable(error: unknown): boolean { + if (!(error instanceof Error)) { + return false; + } + + // Timeout errors are retriable + if (error instanceof ResilientOpenAIError && error.code === 'TIMEOUT') { + return true; + } + + // Network / connection errors + if ( + error.message.includes('ECONNRESET') || + error.message.includes('ECONNREFUSED') || + error.message.includes('ETIMEDOUT') || + error.message.includes('socket hang up') || + error.message.includes('fetch failed') || + error.message.includes('network') + ) { + return true; + } + + // OpenAI HTTP status codes: 429 (rate-limit) and 5xx are retriable + const statusCode = this.getStatusCode(error); + if (statusCode === 429 || (statusCode !== undefined && statusCode >= 500)) { + return true; + } + + return false; + } + + private getStatusCode(error: unknown): number | undefined { + if (typeof error === 'object' && error !== null) { + const status = Reflect.get(error as object, 'status'); + return typeof status === 'number' ? status : undefined; + } + return undefined; + } + + // --------------------------------------------------------------------------- + // Backoff with full jitter; respects Retry-After header on 429 + // --------------------------------------------------------------------------- + + private computeBackoffMs(attempt: number, error: unknown): number { + // Honour Retry-After header on 429 responses + const retryAfterMs = this.extractRetryAfterMs(error); + if (retryAfterMs !== null) { + return retryAfterMs; + } + + // Exponential backoff with full jitter: random(0, base * 2^attempt) + const cap = this.config.backoffBaseMs * Math.pow(2, attempt); + return Math.random() * cap; + } + + private extractRetryAfterMs(error: unknown): number | null { + if (typeof error !== 'object' || error === null) { + return null; + } + + const headers = Reflect.get(error as object, 'headers'); + if (typeof headers !== 'object' || headers === null) { + return null; + } + + const retryAfterHeader = + (Reflect.get(headers as object, 'retry-after') as string | undefined) ?? + (Reflect.get(headers as object, 'Retry-After') as string | undefined); + + if (!retryAfterHeader) { + return null; + } + + const seconds = parseFloat(retryAfterHeader); + if (Number.isFinite(seconds)) { + return seconds * 1000; + } + + // Could be an HTTP-date; fall back to default backoff + return null; + } + + // --------------------------------------------------------------------------- + // Utilities + // --------------------------------------------------------------------------- + + private sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, Math.max(0, ms))); + } +} From 894b1c4fb37f00bd9a35f3acfe2edc0462921981 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:21:06 +0000 Subject: [PATCH 3/7] refactor: address code review feedback on resilient OpenAI client - Clarify ResilientOpenAIError re-throw comment in retry loop - Fix spelling: Honour -> Honor - Add comment explaining Math.max(0, ms) defensive guard in sleep() - Remove unused noop helper; inline it at the call site - Fix doc table spelling: honoured -> honored --- docs/resilient-openai.md | 2 +- packages/backend/src/lib/resilientOpenAIClient.spec.ts | 4 +--- packages/backend/src/lib/resilientOpenAIClient.ts | 8 +++++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/resilient-openai.md b/docs/resilient-openai.md index a041c507..6568f8cc 100644 --- a/docs/resilient-openai.md +++ b/docs/resilient-openai.md @@ -15,7 +15,7 @@ broader Moonbeam service. |---|---| | Per-request timeout | 10 s | | Automatic retries with exponential backoff + full jitter | 3 retries | -| `Retry-After` header honoured on 429 responses | yes | +| `Retry-After` header honored on 429 responses | yes | | Circuit breaker | opens after 5 consecutive failures | | Concurrency / bulkhead limiter | 10 concurrent calls | | Graceful degradation via `ResilientOpenAIError` | yes | diff --git a/packages/backend/src/lib/resilientOpenAIClient.spec.ts b/packages/backend/src/lib/resilientOpenAIClient.spec.ts index 33e996e3..ea7bc39b 100644 --- a/packages/backend/src/lib/resilientOpenAIClient.spec.ts +++ b/packages/backend/src/lib/resilientOpenAIClient.spec.ts @@ -23,8 +23,6 @@ const makeUnderlying = (createImpl: Mock): OpenAIClientLike => ({ }, }); -const noop = () => undefined; - /** Build a minimal config with fast timings for tests. */ const fastConfig = (overrides: Partial = {}): Partial => ({ timeoutMs: 500, @@ -379,7 +377,7 @@ describe('ResilientOpenAIClient', () => { describe('concurrency limiter', () => { it('rejects requests that exceed the concurrency limit', async () => { // Create a mock that never resolves so we can accumulate inflight requests - const neverResolves = () => new Promise(noop); + const neverResolves = () => new Promise(() => undefined); const createMock = vi.fn().mockImplementation(neverResolves); const client = new ResilientOpenAIClient( diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts index 83f82fe7..d7715ed8 100644 --- a/packages/backend/src/lib/resilientOpenAIClient.ts +++ b/packages/backend/src/lib/resilientOpenAIClient.ts @@ -224,8 +224,8 @@ export class ResilientOpenAIClient implements OpenAIClientLike { } catch (error) { lastError = error; - // Don't retry circuit-open or concurrency errors (they won't be - // thrown from here, but guard defensively) + // TIMEOUT is wrapped as a ResilientOpenAIError by attemptWithTimeout; + // re-throw it (and any other ResilientOpenAIError) without retrying. if (error instanceof ResilientOpenAIError) { throw error; } @@ -403,7 +403,7 @@ export class ResilientOpenAIClient implements OpenAIClientLike { // --------------------------------------------------------------------------- private computeBackoffMs(attempt: number, error: unknown): number { - // Honour Retry-After header on 429 responses + // Honor Retry-After header on 429 responses const retryAfterMs = this.extractRetryAfterMs(error); if (retryAfterMs !== null) { return retryAfterMs; @@ -446,6 +446,8 @@ export class ResilientOpenAIClient implements OpenAIClientLike { // --------------------------------------------------------------------------- private sleep(ms: number): Promise { + // Math.max guards against negative values that could arise from floating- + // point jitter computation; setTimeout(fn, 0) is the desired minimum. return new Promise((resolve) => setTimeout(resolve, Math.max(0, ms))); } } From 8e43bdb78d4c9c2248ea6e692b5c6ccdee44dc65 Mon Sep 17 00:00:00 2001 From: sfreeman422 Date: Tue, 23 Jun 2026 11:29:49 -0400 Subject: [PATCH 4/7] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- packages/backend/src/lib/resilientOpenAIClient.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts index d7715ed8..2bca98e9 100644 --- a/packages/backend/src/lib/resilientOpenAIClient.ts +++ b/packages/backend/src/lib/resilientOpenAIClient.ts @@ -224,9 +224,9 @@ export class ResilientOpenAIClient implements OpenAIClientLike { } catch (error) { lastError = error; - // TIMEOUT is wrapped as a ResilientOpenAIError by attemptWithTimeout; - // re-throw it (and any other ResilientOpenAIError) without retrying. - if (error instanceof ResilientOpenAIError) { + // Only short-circuit retries for non-retriable wrapper errors. + // TIMEOUT is classified as retriable by isRetriable(). + if (error instanceof ResilientOpenAIError && error.code !== 'TIMEOUT') { throw error; } From 750753e0dafbbc7063326af90f1a2b5694664fa7 Mon Sep 17 00:00:00 2001 From: sfreeman422 Date: Tue, 23 Jun 2026 11:30:12 -0400 Subject: [PATCH 5/7] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- packages/backend/src/lib/resilientOpenAIClient.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts index 2bca98e9..42399ab3 100644 --- a/packages/backend/src/lib/resilientOpenAIClient.ts +++ b/packages/backend/src/lib/resilientOpenAIClient.ts @@ -309,7 +309,7 @@ export class ResilientOpenAIClient implements OpenAIClientLike { } // half-open: allow exactly one probe at a time - return true; + return this.activeRequests === 0; } private onSuccess(): void { From ebbdb86665f027db05745ef1f42b3e93f57217a0 Mon Sep 17 00:00:00 2001 From: sfreeman422 Date: Tue, 23 Jun 2026 11:30:36 -0400 Subject: [PATCH 6/7] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- packages/backend/src/config/openai.ts | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/packages/backend/src/config/openai.ts b/packages/backend/src/config/openai.ts index 8d8e8c78..6616e09b 100644 --- a/packages/backend/src/config/openai.ts +++ b/packages/backend/src/config/openai.ts @@ -33,12 +33,12 @@ const parseIntWithDefault = (value: string | undefined, defaultValue: number): n }; export const getOpenAIClientConfig = (): OpenAIClientConfig => ({ - timeoutMs: parseIntWithDefault(process.env.OPENAI_TIMEOUT_MS, 10_000), - retries: parseIntWithDefault(process.env.OPENAI_RETRIES, 3), - backoffBaseMs: parseIntWithDefault(process.env.OPENAI_BACKOFF_BASE_MS, 500), - circuitBreakerFailures: parseIntWithDefault(process.env.CIRCUIT_BREAKER_FAILURES, 5), - circuitBreakerWindowMs: parseIntWithDefault(process.env.CIRCUIT_BREAKER_WINDOW_MS, 60_000), - circuitBreakerProbeMs: parseIntWithDefault(process.env.CIRCUIT_BREAKER_PROBE_MS, 30_000), - concurrency: parseIntWithDefault(process.env.OPENAI_CONCURRENCY, 10), + timeoutMs: Math.max(1, parseIntWithDefault(process.env.OPENAI_TIMEOUT_MS, 10_000)), + retries: Math.max(0, parseIntWithDefault(process.env.OPENAI_RETRIES, 3)), + backoffBaseMs: Math.max(0, parseIntWithDefault(process.env.OPENAI_BACKOFF_BASE_MS, 500)), + circuitBreakerFailures: Math.max(1, parseIntWithDefault(process.env.CIRCUIT_BREAKER_FAILURES, 5)), + circuitBreakerWindowMs: Math.max(1, parseIntWithDefault(process.env.CIRCUIT_BREAKER_WINDOW_MS, 60_000)), + circuitBreakerProbeMs: Math.max(1, parseIntWithDefault(process.env.CIRCUIT_BREAKER_PROBE_MS, 30_000)), + concurrency: Math.max(1, parseIntWithDefault(process.env.OPENAI_CONCURRENCY, 10)), featureFlagResilient: process.env.FEATURE_FLAG_RESILIENT_OPENAI !== 'false', }); From 6f0c6ea94681721704d9ed9f63e9ad5e12366f0f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:38:01 +0000 Subject: [PATCH 7/7] fix: resolve CI lint and format failures --- docs/resilient-openai.md | 66 +++++++++---------- .../src/lib/resilientOpenAIClient.spec.ts | 51 +++++--------- .../backend/src/lib/resilientOpenAIClient.ts | 29 ++++---- 3 files changed, 64 insertions(+), 82 deletions(-) diff --git a/docs/resilient-openai.md b/docs/resilient-openai.md index 6568f8cc..c0dc5b86 100644 --- a/docs/resilient-openai.md +++ b/docs/resilient-openai.md @@ -1,7 +1,7 @@ # Resilient OpenAI Client This document describes the resilient OpenAI wrapper introduced in -`packages/backend/src/lib/resilientOpenAIClient.ts`. The wrapper prevents +`packages/backend/src/lib/resilientOpenAIClient.ts`. The wrapper prevents transient or sustained OpenAI failures from cascading and taking down the broader Moonbeam service. @@ -11,16 +11,16 @@ broader Moonbeam service. `ResilientOpenAIClient` wraps the official `openai` SDK client and adds: -| Feature | Default | -|---|---| -| Per-request timeout | 10 s | -| Automatic retries with exponential backoff + full jitter | 3 retries | -| `Retry-After` header honored on 429 responses | yes | -| Circuit breaker | opens after 5 consecutive failures | -| Concurrency / bulkhead limiter | 10 concurrent calls | -| Graceful degradation via `ResilientOpenAIError` | yes | -| Structured logging (Winston) | yes | -| Prometheus metrics (`prom-client`) | yes | +| Feature | Default | +| -------------------------------------------------------- | ---------------------------------- | +| Per-request timeout | 10 s | +| Automatic retries with exponential backoff + full jitter | 3 retries | +| `Retry-After` header honored on 429 responses | yes | +| Circuit breaker | opens after 5 consecutive failures | +| Concurrency / bulkhead limiter | 10 concurrent calls | +| Graceful degradation via `ResilientOpenAIError` | yes | +| Structured logging (Winston) | yes | +| Prometheus metrics (`prom-client`) | yes | The wrapper exposes the same narrow `responses.create` surface used by `AIService`, so the change is a drop-in replacement. @@ -31,7 +31,7 @@ The wrapper exposes the same narrow `responses.create` surface used by Set the environment variable `FEATURE_FLAG_RESILIENT_OPENAI` to `false` to bypass all resilience logic and delegate directly to the underlying OpenAI SDK -client. This is the rollback switch. +client. This is the rollback switch. ``` FEATURE_FLAG_RESILIENT_OPENAI=false # bypass resilience (rollback) @@ -42,18 +42,18 @@ FEATURE_FLAG_RESILIENT_OPENAI=true # enable resilience (default) ## Environment variables -All variables are optional. Defaults are shown. +All variables are optional. Defaults are shown. -| Variable | Default | Description | -|---|---|---| -| `FEATURE_FLAG_RESILIENT_OPENAI` | `true` | Set to `false` to bypass the wrapper entirely. | -| `OPENAI_TIMEOUT_MS` | `10000` | Maximum ms to wait for a single request before aborting. | -| `OPENAI_RETRIES` | `3` | Maximum retry attempts on transient errors. | -| `OPENAI_BACKOFF_BASE_MS` | `500` | Base interval (ms) for exponential backoff with full jitter. | -| `CIRCUIT_BREAKER_FAILURES` | `5` | Consecutive failures needed to open the circuit. | -| `CIRCUIT_BREAKER_WINDOW_MS` | `60000` | Duration (ms) the circuit stays open before allowing a probe. | -| `CIRCUIT_BREAKER_PROBE_MS` | `30000` | Minimum interval (ms) between probe attempts while open. | -| `OPENAI_CONCURRENCY` | `10` | Maximum concurrent outbound OpenAI calls per instance. | +| Variable | Default | Description | +| ------------------------------- | ------- | ------------------------------------------------------------- | +| `FEATURE_FLAG_RESILIENT_OPENAI` | `true` | Set to `false` to bypass the wrapper entirely. | +| `OPENAI_TIMEOUT_MS` | `10000` | Maximum ms to wait for a single request before aborting. | +| `OPENAI_RETRIES` | `3` | Maximum retry attempts on transient errors. | +| `OPENAI_BACKOFF_BASE_MS` | `500` | Base interval (ms) for exponential backoff with full jitter. | +| `CIRCUIT_BREAKER_FAILURES` | `5` | Consecutive failures needed to open the circuit. | +| `CIRCUIT_BREAKER_WINDOW_MS` | `60000` | Duration (ms) the circuit stays open before allowing a probe. | +| `CIRCUIT_BREAKER_PROBE_MS` | `30000` | Minimum interval (ms) between probe attempts while open. | +| `OPENAI_CONCURRENCY` | `10` | Maximum concurrent outbound OpenAI calls per instance. | Configuration is loaded by `packages/backend/src/config/openai.ts` which reads these variables at instantiation time with sensible defaults. @@ -145,18 +145,18 @@ return appropriate degraded UX to users. ## Metrics -The client registers the following Prometheus metrics via `prom-client`. Each +The client registers the following Prometheus metrics via `prom-client`. Each `ResilientOpenAIClient` instance uses its own `Registry` so multiple instances -in the same process do not collide. Expose the registry on a `/metrics` +in the same process do not collide. Expose the registry on a `/metrics` endpoint if Prometheus scraping is desired. -| Metric | Type | Description | -|---|---|---| -| `openai_requests_total` | Counter | Total requests attempted, labelled `status` (`attempted`, `success`, `circuit_open`, `concurrency_rejected`). | -| `openai_retries_total` | Counter | Total retry attempts. | -| `openai_failures_total` | Counter | Total failed requests after all retries, labelled `reason` (`error`, `circuit_open`, `concurrency_rejected`). | -| `openai_circuit_open_total` | Counter | Number of times the circuit transitioned to open. | -| `openai_latency_seconds` | Histogram | End-to-end request latency in seconds, labelled `status` (`success`, `error`). Buckets: 0.1, 0.5, 1, 2, 5, 10, 30. | +| Metric | Type | Description | +| --------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------ | +| `openai_requests_total` | Counter | Total requests attempted, labelled `status` (`attempted`, `success`, `circuit_open`, `concurrency_rejected`). | +| `openai_retries_total` | Counter | Total retry attempts. | +| `openai_failures_total` | Counter | Total failed requests after all retries, labelled `reason` (`error`, `circuit_open`, `concurrency_rejected`). | +| `openai_circuit_open_total` | Counter | Number of times the circuit transitioned to open. | +| `openai_latency_seconds` | Histogram | End-to-end request latency in seconds, labelled `status` (`success`, `error`). Buckets: 0.1, 0.5, 1, 2, 5, 10, 30. | --- @@ -201,7 +201,7 @@ If issues occur at any phase, flip the feature flag: FEATURE_FLAG_RESILIENT_OPENAI=false ``` -Restart the service. The wrapper delegates directly to the underlying OpenAI +Restart the service. The wrapper delegates directly to the underlying OpenAI SDK; no other code changes are required. ### Phase 3 — Cleanup (future) diff --git a/packages/backend/src/lib/resilientOpenAIClient.spec.ts b/packages/backend/src/lib/resilientOpenAIClient.spec.ts index ea7bc39b..de1acf2a 100644 --- a/packages/backend/src/lib/resilientOpenAIClient.spec.ts +++ b/packages/backend/src/lib/resilientOpenAIClient.spec.ts @@ -102,10 +102,7 @@ describe('ResilientOpenAIClient', () => { it('retries on transient 500 error and succeeds', async () => { const transientError = Object.assign(new Error('Internal Server Error'), { status: 500 }); - const createMock = vi - .fn() - .mockRejectedValueOnce(transientError) - .mockResolvedValue(makeResponse('retry success')); + const createMock = vi.fn().mockRejectedValueOnce(transientError).mockResolvedValue(makeResponse('retry success')); const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); @@ -120,10 +117,7 @@ describe('ResilientOpenAIClient', () => { it('retries on 429 rate-limit error and succeeds', async () => { const rateLimitError = Object.assign(new Error('Rate limit exceeded'), { status: 429 }); - const createMock = vi - .fn() - .mockRejectedValueOnce(rateLimitError) - .mockResolvedValue(makeResponse('ok after 429')); + const createMock = vi.fn().mockRejectedValueOnce(rateLimitError).mockResolvedValue(makeResponse('ok after 429')); const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); @@ -163,9 +157,7 @@ describe('ResilientOpenAIClient', () => { const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); - await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow( - 'Bad Request', - ); + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow('Bad Request'); // One attempt only (no retries for 4xx other than 429) expect(createMock).toHaveBeenCalledOnce(); }); @@ -228,16 +220,16 @@ describe('ResilientOpenAIClient', () => { // Advance past the 100 ms abort timer await vi.advanceTimersByTimeAsync(200); + await expect(resultPromise).rejects.toBeInstanceOf(ResilientOpenAIError); await expect(resultPromise).rejects.toMatchObject({ - name: 'ResilientOpenAIError', code: 'TIMEOUT', }); }); it('does not throw when request completes within timeoutMs', async () => { - const createMock = vi.fn().mockImplementation( - () => new Promise((resolve) => setTimeout(() => resolve(makeResponse()), 50)), - ); + const createMock = vi + .fn() + .mockImplementation(() => new Promise((resolve) => setTimeout(() => resolve(makeResponse()), 50))); const client = new ResilientOpenAIClient( makeUnderlying(createMock), @@ -309,8 +301,9 @@ describe('ResilientOpenAIClient', () => { const callCountBeforeShortCircuit = createMock.mock.calls.length; // Next call should be short-circuited (no underlying call) - await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toMatchObject({ - name: 'ResilientOpenAIError', + const shortCircuited = client.responses.create({ model: 'gpt-4o', input: 'hi' }); + await expect(shortCircuited).rejects.toBeInstanceOf(ResilientOpenAIError); + await expect(shortCircuited).rejects.toMatchObject({ code: 'CIRCUIT_OPEN', }); @@ -319,9 +312,7 @@ describe('ResilientOpenAIClient', () => { it('transitions to half-open and closes on successful probe', async () => { const badRequestError = Object.assign(new Error('boom'), { status: 503 }); - const createMock = vi - .fn() - .mockRejectedValue(badRequestError); + const createMock = vi.fn().mockRejectedValue(badRequestError); const client = new ResilientOpenAIClient( makeUnderlying(createMock), @@ -393,8 +384,9 @@ describe('ResilientOpenAIClient', () => { expect(client.getActiveRequests()).toBe(2); // Third request should be rejected - await expect(client.responses.create({ model: 'gpt-4o', input: '3' })).rejects.toMatchObject({ - name: 'ResilientOpenAIError', + const rejected = client.responses.create({ model: 'gpt-4o', input: '3' }); + await expect(rejected).rejects.toBeInstanceOf(ResilientOpenAIError); + await expect(rejected).rejects.toMatchObject({ code: 'CONCURRENCY_REJECTED', }); }); @@ -452,10 +444,7 @@ describe('ResilientOpenAIClient', () => { it('increments openai_retries_total on retry', async () => { const transientError = Object.assign(new Error('oops'), { status: 503 }); - const createMock = vi - .fn() - .mockRejectedValueOnce(transientError) - .mockResolvedValue(makeResponse()); + const createMock = vi.fn().mockRejectedValueOnce(transientError).mockResolvedValue(makeResponse()); const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig(), registry); @@ -499,16 +488,10 @@ describe('ResilientOpenAIClient', () => { const badError = Object.assign(new Error('fail'), { status: 503 }); const createMock = vi.fn().mockRejectedValue(badError); - const client = new ResilientOpenAIClient( - makeUnderlying(createMock), - fastConfig({ retries: 0 }), - registry, - ); + const client = new ResilientOpenAIClient(makeUnderlying(createMock), fastConfig({ retries: 0 }), registry); // Should reject but not crash the process - await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow( - 'fail', - ); + await expect(client.responses.create({ model: 'gpt-4o', input: 'hi' })).rejects.toThrow('fail'); // Metrics should still be recorded const metric = await registry.getSingleMetricAsString(METRIC_NAMES.failuresTotal); diff --git a/packages/backend/src/lib/resilientOpenAIClient.ts b/packages/backend/src/lib/resilientOpenAIClient.ts index 42399ab3..60422262 100644 --- a/packages/backend/src/lib/resilientOpenAIClient.ts +++ b/packages/backend/src/lib/resilientOpenAIClient.ts @@ -28,13 +28,19 @@ export class ResilientOpenAIError extends Error { // Metric names (exported so callers can reference them) // --------------------------------------------------------------------------- -export const METRIC_NAMES = { +export const METRIC_NAMES: { + readonly requestsTotal: 'openai_requests_total'; + readonly retriesTotal: 'openai_retries_total'; + readonly failuresTotal: 'openai_failures_total'; + readonly circuitOpenTotal: 'openai_circuit_open_total'; + readonly latencySeconds: 'openai_latency_seconds'; +} = { requestsTotal: 'openai_requests_total', retriesTotal: 'openai_retries_total', failuresTotal: 'openai_failures_total', circuitOpenTotal: 'openai_circuit_open_total', latencySeconds: 'openai_latency_seconds', -} as const; +}; // --------------------------------------------------------------------------- // Circuit-breaker states @@ -47,10 +53,7 @@ type CircuitState = 'closed' | 'open' | 'half-open'; // --------------------------------------------------------------------------- export interface OpenAIResponsesAPI { - create( - params: ResponseCreateParamsNonStreaming, - options?: OpenAI.RequestOptions, - ): Promise; + create(params: ResponseCreateParamsNonStreaming, options?: OpenAI.RequestOptions): Promise; } export interface OpenAIClientLike { @@ -333,10 +336,7 @@ export class ResilientOpenAIClient implements OpenAIClientLike { return; } - if ( - this.circuitState === 'closed' && - this.consecutiveFailures >= this.config.circuitBreakerFailures - ) { + if (this.circuitState === 'closed' && this.consecutiveFailures >= this.config.circuitBreakerFailures) { logError(this.clientLogger, 'Circuit breaker opening after consecutive failures', error, { consecutiveFailures: this.consecutiveFailures, threshold: this.config.circuitBreakerFailures, @@ -392,7 +392,7 @@ export class ResilientOpenAIClient implements OpenAIClientLike { private getStatusCode(error: unknown): number | undefined { if (typeof error === 'object' && error !== null) { - const status = Reflect.get(error as object, 'status'); + const status = Reflect.get(error, 'status'); return typeof status === 'number' ? status : undefined; } return undefined; @@ -419,14 +419,13 @@ export class ResilientOpenAIClient implements OpenAIClientLike { return null; } - const headers = Reflect.get(error as object, 'headers'); + const headers = Reflect.get(error, 'headers'); if (typeof headers !== 'object' || headers === null) { return null; } - const retryAfterHeader = - (Reflect.get(headers as object, 'retry-after') as string | undefined) ?? - (Reflect.get(headers as object, 'Retry-After') as string | undefined); + const retryAfterRaw = Reflect.get(headers, 'retry-after') ?? Reflect.get(headers, 'Retry-After'); + const retryAfterHeader = typeof retryAfterRaw === 'string' ? retryAfterRaw : undefined; if (!retryAfterHeader) { return null;