From 2e94f05a67b70c3f8563b5d652003fb6c3bc5507 Mon Sep 17 00:00:00 2001
From: Clifford Tawiah <ctawiah@launchdarkly.com>
Date: Wed, 10 Jun 2026 22:41:50 -0400
Subject: [PATCH 1/2] feat: add manual judge evaluation (Judge, Evaluator,
 createJudge) (AIC-2665)

Implements the AIEVALS manual-only evaluation path:

- Runner SPI and RunnerResult for caller-supplied model invocation
- Judge: sampling decided before invocation, well-known input format,
  score/reasoning parsing with range validation, invocation tracked via
  trackMetricsOf (does not emit trackJudgeResult; caller's responsibility)
- Evaluator: per-judge fault isolation and per-judge timeout, order-preserving
  results, noop() returns an empty list; sampling-rate normalization on Judge
- LDAIClient.createJudge: fires only $ld:ai:usage:create-judge, resolves the
  judge config via the internal evaluate path, returns null when disabled or
  when no runner is supplied

Automatic judge auto-attachment and provider runners are deferred past v1.0.
README documents the manual-only flow and the auto-attach descope.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 lib/sdk/server-ai/README.md                   |  40 ++-
 .../launchdarkly/sdk/server/ai/Evaluator.java | 125 +++++++++
 .../com/launchdarkly/sdk/server/ai/Judge.java | 243 ++++++++++++++++++
 .../sdk/server/ai/LDAIClient.java             |  25 ++
 .../sdk/server/ai/LDAIClientImpl.java         |  32 +++
 .../launchdarkly/sdk/server/ai/Runner.java    |  26 ++
 .../sdk/server/ai/RunnerResult.java           | 105 ++++++++
 .../sdk/server/ai/EvaluatorTest.java          | 121 +++++++++
 .../launchdarkly/sdk/server/ai/JudgeTest.java | 170 ++++++++++++
 .../sdk/server/ai/LDAIClientImplTest.java     |  56 ++++
 10 files changed, 941 insertions(+), 2 deletions(-)
 create mode 100644 lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
 create mode 100644 lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java
 create mode 100644 lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java
 create mode 100644 lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java
 create mode 100644 lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java
 create mode 100644 lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java

diff --git a/lib/sdk/server-ai/README.md b/lib/sdk/server-ai/README.md
index b2d8d4f2..971bb43c 100644
--- a/lib/sdk/server-ai/README.md
+++ b/lib/sdk/server-ai/README.md
@@ -43,8 +43,44 @@ The companion `agentConfig`/`agentConfigs` and `judgeConfig` methods retrieve ag
 configs respectively. Within a prompt message or agent instruction, the evaluation context is
 available as `{{ldctx}}` (for example `{{ldctx.key}}`).
 
-Metric tracking and manual judge evaluation will be added as the SDK is built out (see epic
-AIC-2629).
+## Tracking AI runs
+
+Every retrieved config exposes a tracker via `config.createTracker()`. Use it to record duration,
+time-to-first-token, success/error, token usage, tool calls, and feedback for an AI run. Trackers
+are thread-safe, and at-most-once metrics (duration, time-to-first-token, outcome, feedback, tokens)
+emit a single event even under concurrent calls. A run can be correlated across processes with
+`tracker.getResumptionToken()` and rebuilt later via `aiClient.createTracker(token, context)`.
+
+## Evaluating responses with judges (manual)
+
+A judge is an AI Config with `mode: judge` that scores another config's output against an evaluation
+metric.
+
+In `v1.0`, evaluation is **manual only**. The SDK parses `judgeConfiguration` and exposes it on
+configs, but it does **not** automatically invoke judges on completion or agent calls. Sample-rate
+driven auto-attachment is deferred past `v1.0`. Because no provider-specific runners ship yet, you
+supply your own `Runner` that calls your model and returns structured `{score, reasoning}` output.
+
+```java
+Runner runner = input -> {
+    // Call your model with `input`, then return its score/reasoning as structured output.
+    // metrics carries success/tokens/duration for the invocation.
+    return RunnerResult.builder(Metrics.builder(true).build())
+        .parsed(LDValue.buildObject().put("score", 0.9).put("reasoning", "grounded").build())
+        .build();
+};
+
+Judge judge = aiClient.createJudge("my-judge-key", context, null, variables, runner, 1.0);
+if (judge != null) {
+    JudgeResult result = judge.evaluate(originalInput, modelOutput);
+    // Recording the result is the caller's responsibility:
+    completionTracker.trackJudgeResult(result);
+}
+```
+
+`Evaluator` runs several judges over the same input/output with per-judge fault isolation and a
+per-judge timeout, returning one `JudgeResult` per judge in order. `Evaluator.noop()` returns an
+empty result list.
 
 ## Internal API convention
 
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
new file mode 100644
index 00000000..4df1d6e1
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
@@ -0,0 +1,125 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Runs a fixed set of {@link Judge}s against one input/output pair and collects their results.
+ * <p>
+ * Each judge runs with <strong>fault isolation</strong>: a judge that throws or times out yields a
+ * failed {@link JudgeResult} for that judge while every other judge's result is preserved, in the
+ * original order. Judges run concurrently and each is bounded by a per-judge timeout so a single
+ * hung judge cannot stall the whole evaluation.
+ * <p>
+ * The evaluator does not record results; recording the returned {@link JudgeResult}s (for example
+ * via a tracker) is the caller's responsibility. Instances are immutable and thread-safe.
+ */
+public final class Evaluator {
+  /**
+   * Default per-judge timeout used when one is not supplied.
+   */
+  public static final Duration DEFAULT_PER_JUDGE_TIMEOUT = Duration.ofSeconds(30);
+
+  private final List<Judge> judges;
+  private final Duration perJudgeTimeout;
+  private final LDLogger logger;
+
+  /**
+   * Creates an evaluator using the {@link #DEFAULT_PER_JUDGE_TIMEOUT default per-judge timeout}.
+   *
+   * @param judges the judges to run; must not be {@code null}
+   * @param logger the logger; must not be {@code null}
+   */
+  public Evaluator(List<Judge> judges, LDLogger logger) {
+    this(judges, DEFAULT_PER_JUDGE_TIMEOUT, Objects.requireNonNull(logger, "logger"));
+  }
+
+  /**
+   * Creates an evaluator with an explicit per-judge timeout.
+   *
+   * @param judges the judges to run; must not be {@code null}
+   * @param perJudgeTimeout the maximum time to wait for each judge; must not be {@code null}
+   * @param logger the logger; must not be {@code null}
+   */
+  public Evaluator(List<Judge> judges, Duration perJudgeTimeout, LDLogger logger) {
+    this.judges = Collections.unmodifiableList(new ArrayList<>(Objects.requireNonNull(judges, "judges")));
+    this.perJudgeTimeout = Objects.requireNonNull(perJudgeTimeout, "perJudgeTimeout");
+    this.logger = Objects.requireNonNull(logger, "logger");
+  }
+
+  /**
+   * Returns an evaluator with no judges. Its {@link #evaluate} returns an empty list and logs
+   * nothing.
+   *
+   * @return a no-op evaluator
+   */
+  public static Evaluator noop() {
+    return new Evaluator(
+        Collections.emptyList(), DEFAULT_PER_JUDGE_TIMEOUT, LDLogger.withAdapter(Logs.none(), ""));
+  }
+
+  /**
+   * Runs every judge against the given input and output.
+   *
+   * @param input the input that was provided to the AI being evaluated
+   * @param output the AI-generated response to score
+   * @return one {@link JudgeResult} per judge, in the judges' order; empty when there are no judges
+   */
+  public List<JudgeResult> evaluate(String input, String output) {
+    if (judges.isEmpty()) {
+      return Collections.emptyList();
+    }
+
+    ExecutorService pool = Executors.newFixedThreadPool(judges.size());
+    try {
+      List<Future<JudgeResult>> futures = new ArrayList<>(judges.size());
+      for (Judge judge : judges) {
+        futures.add(pool.submit(() -> judge.evaluate(input, output)));
+      }
+
+      List<JudgeResult> results = new ArrayList<>(judges.size());
+      for (int i = 0; i < judges.size(); i++) {
+        results.add(awaitResult(judges.get(i), futures.get(i)));
+      }
+      return results;
+    } finally {
+      pool.shutdownNow();
+    }
+  }
+
+  private JudgeResult awaitResult(Judge judge, Future<JudgeResult> future) {
+    String key = judge.getAIConfig().getKey();
+    try {
+      return future.get(perJudgeTimeout.toMillis(), TimeUnit.MILLISECONDS);
+    } catch (TimeoutException e) {
+      future.cancel(true);
+      logger.warn("Judge {} timed out after {} ms", key, perJudgeTimeout.toMillis());
+      return failed(key, "Judge evaluation timed out");
+    } catch (ExecutionException e) {
+      Throwable cause = e.getCause() != null ? e.getCause() : e;
+      logger.error("Judge {} failed: {}", key, cause.toString());
+      return failed(key, cause.getMessage() != null ? cause.getMessage() : "Unknown error");
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+      future.cancel(true);
+      return failed(key, "Judge evaluation interrupted");
+    }
+  }
+
+  private static JudgeResult failed(String key, String message) {
+    return JudgeResult.builder(true, false).judgeConfigKey(key).errorMessage(message).build();
+  }
+}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java
new file mode 100644
index 00000000..33ba7c4d
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java
@@ -0,0 +1,243 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.LDValueType;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Evaluates the output of another AI Config using a judge AI Config and a caller-supplied
+ * {@link Runner}.
+ * <p>
+ * A judge is an AI Config with {@code mode: judge} that scores a model response. Obtain one from
+ * {@link LDAIClient#createJudge}, then call {@link #evaluate(String, String)} with the original
+ * input and the response to score. Evaluation is synchronous.
+ * <p>
+ * The judge records invocation metrics (duration, success, tokens) on its own tracker but does
+ * <strong>not</strong> emit the score via {@code trackJudgeResult}; recording the returned
+ * {@link JudgeResult} is the caller's responsibility.
+ * <p>
+ * Instances are immutable and safe to share across threads as long as the supplied {@link Runner}
+ * is too.
+ */
+public final class Judge {
+  private final AIJudgeConfig config;
+  private final Runner runner;
+  private final double sampleRate;
+  private final LDLogger logger;
+
+  /**
+   * Creates a judge.
+   *
+   * @param config the judge AI Config; must not be {@code null}
+   * @param runner the runner used to invoke the judge model; must not be {@code null}
+   * @param sampleRate the default sampling rate in {@code [0.0, 1.0]}; non-finite, negative, or
+   *     greater-than-one values are normalized
+   * @param logger the logger; must not be {@code null}
+   */
+  public Judge(AIJudgeConfig config, Runner runner, double sampleRate, LDLogger logger) {
+    this.config = Objects.requireNonNull(config, "config");
+    this.runner = Objects.requireNonNull(runner, "runner");
+    this.sampleRate = normalizeSampleRate(sampleRate);
+    this.logger = Objects.requireNonNull(logger, "logger");
+  }
+
+  /**
+   * Normalizes a sampling rate into {@code [0.0, 1.0]}. Non-finite rates fall back to {@code 1.0}
+   * (the default "always sample"); negative rates clamp to {@code 0.0}; rates above one clamp to
+   * {@code 1.0}.
+   *
+   * @param rate the requested rate
+   * @return the normalized rate
+   */
+  public static double normalizeSampleRate(double rate) {
+    // NaN and both infinities carry no usable rate: fall back to "always sample".
+    if (!Double.isFinite(rate)) {
+      return 1.0;
+    }
+    // Finite rates are clamped: negatives floor at 0.0 ...
+    if (rate < 0.0) {
+      return 0.0;
+    }
+    // ... and anything above one is capped at 1.0; in-range rates pass through unchanged.
+    return rate > 1.0 ? 1.0 : rate;
+  }
+
+  /**
+   * Returns the default sampling rate baked in at construction.
+   *
+   * @return the sampling rate
+   */
+  public double getSampleRate() {
+    return sampleRate;
+  }
+
+  /**
+   * Returns the judge AI Config.
+   *
+   * @return the config
+   */
+  public AIJudgeConfig getAIConfig() {
+    return config;
+  }
+
+  /**
+   * Returns the runner this judge invokes.
+   *
+   * @return the runner
+   */
+  public Runner getRunner() {
+    return runner;
+  }
+
+  /**
+   * Evaluates a response using the judge's default sampling rate.
+   *
+   * @param input the input that was provided to the AI being evaluated
+   * @param output the AI-generated response to score
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluate(String input, String output) {
+    return evaluate(input, output, sampleRate);
+  }
+
+  /**
+   * Evaluates a response, deciding sampling before invoking the model.
+   *
+   * @param input the input that was provided to the AI being evaluated
+   * @param output the AI-generated response to score
+   * @param samplingRate the sampling rate to use for this call; an explicit {@code 0} suppresses the
+   *     evaluation
+   * @return the evaluation result; never {@code null}. The result is unsuccessful when the judge
+   *     config has no evaluation metric key, when sampling skips the evaluation, when the response
+   *     cannot be parsed, or when the runner throws.
+   */
+  public JudgeResult evaluate(String input, String output, double samplingRate) {
+    double effectiveRate = normalizeSampleRate(samplingRate);
+    String key = config.getKey();
+    LDAIConfigTracker tracker = config.createTracker();
+    try {
+      String metricKey = evaluationMetricKey();
+      if (metricKey == null) {
+        logger.warn("Judge configuration is missing required evaluation metric key: {}", key);
+        return JudgeResult.builder(true, false)
+            .judgeConfigKey(key)
+            .errorMessage("Judge configuration is missing required evaluation metric key")
+            .build();
+      }
+      // Math.random() is in [0.0, 1.0), so ">=" makes rate 0 always skip and rate 1 always run.
+      if (Math.random() >= effectiveRate) {
+        logger.debug("Judge evaluation skipped due to sampling rate: {}", effectiveRate);
+        return JudgeResult.builder(false, false).judgeConfigKey(key).build();
+      }
+
+      String evaluationInput = buildEvaluationInput(input, output);
+      RunnerResult response = tracker.trackMetricsOf(RunnerResult::getMetrics,
+          () -> runner.run(evaluationInput));
+
+      ParsedEvaluation parsed = parseEvaluationResponse(response.getParsed());
+      if (parsed == null) {
+        logger.warn("Could not parse judge evaluation response for: {}", key);
+        return JudgeResult.builder(true, false).judgeConfigKey(key).build();
+      }
+
+      Metrics metrics = response.getMetrics();
+      boolean success = metrics != null && metrics.isSuccess();
+      return JudgeResult.builder(true, success)
+          .judgeConfigKey(key)
+          .metricKey(metricKey)
+          .score(parsed.score)
+          .reasoning(parsed.reasoning)
+          .build();
+    } catch (Exception e) {
+      logger.error("Judge evaluation failed for {}: {}", key, e.toString());
+      String message = e.getMessage() != null ? e.getMessage() : "Unknown error";
+      return JudgeResult.builder(true, false).judgeConfigKey(key).errorMessage(message).build();
+    }
+  }
+
+  /**
+   * Evaluates a response from a conversation history and a runner result, using the judge's default
+   * sampling rate.
+   *
+   * @param messages the conversation history; may be empty or {@code null}
+   * @param response the runner result whose content is scored
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response) {
+    return evaluateMessages(messages, response, sampleRate);
+  }
+
+  /**
+   * Evaluates a response from a conversation history and a runner result.
+   * <p>
+   * Each message is rendered as {@code <role>: <content>} and the messages are joined with newlines
+   * to form the input; the response's content is the output.
+   *
+   * @param messages the conversation history; may be empty or {@code null}
+   * @param response the runner result whose content is scored
+   * @param samplingRate the sampling rate to use for this call
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response, double samplingRate) {
+    // Render the history as newline-separated "<role>: <content>" lines; null means no history.
+    StringBuilder history = new StringBuilder();
+    if (messages != null) {
+      for (Message entry : messages) {
+        // Every rendered entry is non-empty, so a non-empty buffer means a separator is due.
+        if (history.length() > 0) {
+          history.append('\n');
+        }
+        history.append(entry.getRole().getWireValue()).append(": ").append(entry.getContent());
+      }
+    }
+    String scoredOutput = response != null ? response.getContent() : null;
+    return evaluate(history.toString(), scoredOutput, samplingRate);
+  }
+
+  private String evaluationMetricKey() {
+    String key = config.getEvaluationMetricKey();
+    if (key != null && !key.trim().isEmpty()) {
+      return key.trim();
+    }
+    return null;
+  }
+
+  private static String buildEvaluationInput(String input, String output) {
+    return "MESSAGE HISTORY:\n" + input + "\n\nRESPONSE TO EVALUATE:\n" + output;
+  }
+
+  private static ParsedEvaluation parseEvaluationResponse(LDValue parsed) {
+    if (parsed == null || parsed.getType() != LDValueType.OBJECT) {
+      return null;
+    }
+    LDValue score = parsed.get("score");
+    if (!score.isNumber()) {
+      return null;
+    }
+    double value = score.doubleValue();
+    if (Double.isNaN(value) || value < 0.0 || value > 1.0) { // NaN slips past range comparisons
+      return null;
+    }
+    LDValue reasoning = parsed.get("reasoning");
+    if (!reasoning.isString()) {
+      return null;
+    }
+    return new ParsedEvaluation(value, reasoning.stringValue());
+  }
+
+  private static final class ParsedEvaluation {
+    private final double score;
+    private final String reasoning;
+
+    ParsedEvaluation(double score, String reasoning) {
+      this.score = score;
+      this.reasoning = reasoning;
+    }
+  }
+}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java
index 523e1d68..ad11114d 100644
--- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java
@@ -96,4 +96,29 @@ AIJudgeConfig judgeConfig(
    * @throws IllegalArgumentException if the token is malformed
    */
   LDAIConfigTracker createTracker(String resumptionToken, LDContext context);
+
+  /**
+   * Retrieves a judge AI Config and builds a {@link Judge} for manual evaluation.
+   * <p>
+   * This fires only the {@code $ld:ai:usage:create-judge} usage event. In v1.0 the SDK does not
+   * auto-attach judges to completion or agent calls; evaluation is manual, driven by the returned
+   * judge. Because the SDK ships no provider runners yet, the caller supplies the {@link Runner}.
+   *
+   * @param key the judge AI Config key
+   * @param context the context to evaluate the configuration in
+   * @param defaultValue the default used when the flag is absent or cannot be evaluated; when
+   *     {@code null}, a disabled default is used
+   * @param variables variables interpolated into the judge prompt; may be {@code null}
+   * @param runner the runner the judge invokes; when {@code null}, no judge is created
+   * @param sampleRate the default sampling rate for the judge in {@code [0.0, 1.0]}
+   * @return a {@link Judge}, or {@code null} if the configuration is disabled or no runner was
+   *     supplied
+   */
+  Judge createJudge(
+      String key,
+      LDContext context,
+      AIJudgeConfigDefault defaultValue,
+      Map<String, Object> variables,
+      Runner runner,
+      double sampleRate);
 }
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java
index 80ef29b9..4b097115 100644
--- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java
@@ -47,6 +47,7 @@ public final class LDAIClientImpl implements LDAIClient {
   private static final String TRACK_USAGE_AGENT_CONFIG = "$ld:ai:usage:agent-config";
   private static final String TRACK_USAGE_AGENT_CONFIGS = "$ld:ai:usage:agent-configs";
   private static final String TRACK_USAGE_JUDGE_CONFIG = "$ld:ai:usage:judge-config";
+  private static final String TRACK_USAGE_CREATE_JUDGE = "$ld:ai:usage:create-judge";
 
   private static final LDContext INIT_TRACK_CONTEXT = LDContext
       .builder("ld-internal-tracking")
@@ -145,6 +146,37 @@ public LDAIConfigTracker createTracker(String resumptionToken, LDContext context
     return LDAIConfigTrackerImpl.fromResumptionToken(resumptionToken, client, context, logger);
   }
 
+  @Override
+  public Judge createJudge(
+      String key,
+      LDContext context,
+      AIJudgeConfigDefault defaultValue,
+      Map<String, Object> variables,
+      Runner runner,
+      double sampleRate) {
+    // Manual-only path: fire the create-judge usage event, then resolve the config through the
+    // internal evaluate (which does not fire $ld:ai:usage:judge-config).
+    client.trackMetric(TRACK_USAGE_CREATE_JUDGE, context, LDValue.of(key), 1);
+    try {
+      AIJudgeConfigDefault effectiveDefault =
+          defaultValue != null ? defaultValue : AIJudgeConfigDefault.disabled();
+      AIJudgeConfig judgeConfig =
+          (AIJudgeConfig) evaluate(key, context, effectiveDefault, Mode.JUDGE, variables);
+      if (!judgeConfig.isEnabled()) {
+        logger.info("Judge configuration is disabled: {}", key);
+        return null;
+      }
+      if (runner == null) {
+        logger.warn("No runner supplied for judge: {}", key);
+        return null;
+      }
+      return new Judge(judgeConfig, runner, sampleRate, logger);
+    } catch (RuntimeException e) {
+      logger.error("Failed to initialize judge {}: {}", key, e.toString());
+      return null;
+    }
+  }
+
   private AIAgentConfig evaluateAgent(
       String key, LDContext context, AIAgentConfigDefault defaultValue, Map<String, Object> variables) {
     AIAgentConfigDefault effectiveDefault =
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java
new file mode 100644
index 00000000..6542c33f
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java
@@ -0,0 +1,26 @@
+package com.launchdarkly.sdk.server.ai;
+
+/**
+ * Invokes an AI model with a string input and returns its result.
+ * <p>
+ * In v1.0 the AI SDK does not ship provider-specific runners; an application supplies its own
+ * {@code Runner} (for example wrapping an OpenAI or Bedrock call) when creating a {@link Judge} via
+ * {@link LDAIClient#createJudge}. Built-in provider runners are planned for a later release.
+ * <p>
+ * For structured-output use cases such as judge evaluation, the runner is expected to make the
+ * model's parsed JSON available via {@link RunnerResult#getParsed()}.
+ * <p>
+ * Implementations should be safe to invoke from multiple threads if the same runner is shared across
+ * concurrently-evaluating judges.
+ */
+public interface Runner {
+  /**
+   * Invokes the model with the given input.
+   *
+   * @param input the input string to send to the model
+   * @return the model result; must not be {@code null}
+   * @throws Exception if the invocation fails; the caller (a {@link Judge}) records the failure and
+   *     surfaces it as a failed evaluation rather than propagating it
+   */
+  RunnerResult run(String input) throws Exception;
+}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java
new file mode 100644
index 00000000..9994da11
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java
@@ -0,0 +1,105 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+
+/**
+ * The result of a {@link Runner} invocation.
+ * <p>
+ * Carries the model's text {@link #getContent() content}, the {@link #getMetrics() metrics} the SDK
+ * uses to track the run, and any {@link #getParsed() parsed} structured output. For judge
+ * evaluation the parsed value is expected to be a JSON object with {@code score} (a number in
+ * {@code [0.0, 1.0]}) and {@code reasoning} (a string). Instances are immutable.
+ */
+public final class RunnerResult {
+  private final String content;
+  private final Metrics metrics;
+  private final LDValue parsed;
+
+  private RunnerResult(Builder b) {
+    this.content = b.content;
+    this.metrics = b.metrics;
+    this.parsed = b.parsed == null ? LDValue.ofNull() : b.parsed;
+  }
+
+  /**
+   * Returns the model's text response.
+   *
+   * @return the content, or {@code null} if none was produced
+   */
+  public String getContent() {
+    return content;
+  }
+
+  /**
+   * Returns the metrics for this invocation.
+   *
+   * @return the metrics, or {@code null} if none were provided
+   */
+  public Metrics getMetrics() {
+    return metrics;
+  }
+
+  /**
+   * Returns the parsed structured output.
+   *
+   * @return the parsed value; never {@code null}, but {@link LDValue#ofNull()} when there was none
+   */
+  public LDValue getParsed() {
+    return parsed;
+  }
+
+  /**
+   * Creates a builder.
+   *
+   * @param metrics the metrics for the invocation
+   * @return a new {@link Builder}
+   */
+  public static Builder builder(Metrics metrics) {
+    return new Builder(metrics);
+  }
+
+  /**
+   * Builder for {@link RunnerResult}.
+   */
+  public static final class Builder {
+    private final Metrics metrics;
+    private String content;
+    private LDValue parsed;
+
+    private Builder(Metrics metrics) {
+      this.metrics = metrics;
+    }
+
+    /**
+     * Sets the model's text response.
+     *
+     * @param v the content
+     * @return this builder
+     */
+    public Builder content(String v) {
+      this.content = v;
+      return this;
+    }
+
+    /**
+     * Sets the parsed structured output.
+     *
+     * @param v the parsed value
+     * @return this builder
+     */
+    public Builder parsed(LDValue v) {
+      this.parsed = v;
+      return this;
+    }
+
+    /**
+     * Builds the immutable {@link RunnerResult}.
+     *
+     * @return a new {@link RunnerResult}
+     */
+    public RunnerResult build() {
+      return new RunnerResult(this);
+    }
+  }
+}
diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java
new file mode 100644
index 00000000..2a4c8997
--- /dev/null
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java
@@ -0,0 +1,121 @@
+package com.launchdarkly.sdk.server.ai;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.empty;
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
+import static org.mockito.Mockito.mock;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.LDContext;
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+import com.launchdarkly.sdk.server.ai.internal.LDAIConfigTrackerImpl;
+import com.launchdarkly.sdk.server.interfaces.LDClientInterface;
+
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.List;
+import java.util.function.Supplier;
+
+import org.junit.Before;
+import org.junit.Test;
+
+@SuppressWarnings("javadoc")
+public class EvaluatorTest {
+  private LDClientInterface client;
+  private LDLogger logger;
+  private final LDContext context = LDContext.create("user-key");
+
+  @Before
+  public void setUp() {
+    client = mock(LDClientInterface.class);
+    logger = LDLogger.withAdapter(Logs.capture(), "test");
+  }
+
+  private Judge judge(String key, Runner runner) {
+    Supplier<LDAIConfigTracker> trackerFactory = () -> new LDAIConfigTrackerImpl(
+        client, "run-" + key, key, "v1", 1, "gpt-4", "openai", context, null, logger);
+    AIJudgeConfig config = new AIJudgeConfig(key, true, null, null, null, "relevance", trackerFactory);
+    return new Judge(config, runner, 1.0, logger);
+  }
+
+  private static Runner scoring(double score) {
+    return input -> RunnerResult.builder(Metrics.builder(true).build())
+        .parsed(LDValue.buildObject().put("score", score).put("reasoning", "r").build())
+        .build();
+  }
+
+  @Test
+  public void noopReturnsEmptyListAndLogsNothing() {
+    List<JudgeResult> results = Evaluator.noop().evaluate("q", "a");
+    assertThat(results, is(empty()));
+  }
+
+  @Test
+  public void runsEveryJudgePreservingOrder() {
+    Evaluator evaluator = new Evaluator(
+        Arrays.asList(judge("first", scoring(0.1)), judge("second", scoring(0.2))), logger);
+    List<JudgeResult> results = evaluator.evaluate("q", "a");
+    assertThat(results, hasSize(2));
+    assertThat(results.get(0).getJudgeConfigKey(), is("first"));
+    assertThat(results.get(0).getScore(), is(0.1));
+    assertThat(results.get(1).getJudgeConfigKey(), is("second"));
+    assertThat(results.get(1).getScore(), is(0.2));
+  }
+
+  @Test
+  public void faultyJudgeIsolatedAndOthersPreserved() {
+    Runner failing = input -> {
+      throw new RuntimeException("boom");
+    };
+    Evaluator evaluator = new Evaluator(
+        Arrays.asList(judge("ok", scoring(0.9)), judge("bad", failing)), logger);
+    List<JudgeResult> results = evaluator.evaluate("q", "a");
+    assertThat(results, hasSize(2));
+    assertThat(results.get(0).isSuccess(), is(true));
+    assertThat(results.get(0).getScore(), is(0.9));
+    assertThat(results.get(1).isSuccess(), is(false));
+    assertThat(results.get(1).getErrorMessage(), is("boom"));
+  }
+
+  @Test
+  public void hungJudgeTimesOutWithoutStallingChain() {
+    Runner slow = input -> {
+      Thread.sleep(5000);
+      return RunnerResult.builder(Metrics.builder(true).build()).build();
+    };
+    Evaluator evaluator = new Evaluator(
+        Arrays.asList(judge("fast", scoring(0.7)), judge("slow", slow)),
+        Duration.ofMillis(150),
+        logger);
+    List<JudgeResult> results = evaluator.evaluate("q", "a");
+    assertThat(results, hasSize(2));
+    assertThat(results.get(0).isSuccess(), is(true));
+    assertThat(results.get(1).isSuccess(), is(false));
+    assertThat(results.get(1).getErrorMessage(), containsString("timed out"));
+    assertThat(results.get(1).getJudgeConfigKey(), is("slow"));
+  }
+
+  @Test
+  public void resultsAreInJudgeOrderEvenWhenCompletionOrderDiffers() {
+    Runner slowOk = input -> {
+      Thread.sleep(300);
+      return RunnerResult.builder(Metrics.builder(true).build())
+          .parsed(LDValue.buildObject().put("score", 0.5).put("reasoning", "r").build())
+          .build();
+    };
+    Evaluator evaluator = new Evaluator(
+        Arrays.asList(judge("slow", slowOk), judge("fast", scoring(0.6))),
+        Duration.ofSeconds(5),
+        logger);
+    List<JudgeResult> results = evaluator.evaluate("q", "a");
+    assertThat(
+        Arrays.asList(results.get(0).getJudgeConfigKey(), results.get(1).getJudgeConfigKey()),
+        contains("slow", "fast"));
+  }
+}
diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java
new file mode 100644
index 00000000..7fae78b3
--- /dev/null
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java
@@ -0,0 +1,170 @@
+package com.launchdarkly.sdk.server.ai;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.nullValue;
+import static org.mockito.Mockito.mock;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.LDContext;
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+import com.launchdarkly.sdk.server.ai.internal.LDAIConfigTrackerImpl;
+import com.launchdarkly.sdk.server.interfaces.LDClientInterface;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Supplier;
+
+import org.junit.Before;
+import org.junit.Test;
+
+@SuppressWarnings("javadoc")
+public class JudgeTest {
+  private LDClientInterface client;
+  private LDLogger logger;
+  private final LDContext context = LDContext.create("user-key");
+
+  @Before // Rebuilds the mock client and the logger before each test.
+  public void setUp() {
+    client = mock(LDClientInterface.class);
+    logger = LDLogger.withAdapter(Logs.capture(), "test");
+  }
+
+  private AIJudgeConfig judgeConfig(String metricKey, boolean enabled) { // fixture keyed "judge-key"
+    Supplier<LDAIConfigTracker> trackerFactory = () -> new LDAIConfigTrackerImpl( // built on each get()
+        client, "run-1", "judge-key", "v1", 1, "gpt-4", "openai", context, null, logger);
+    return new AIJudgeConfig("judge-key", enabled, null, null, null, metricKey, trackerFactory);
+  }
+
+  private static Runner runnerReturning(double score, String reasoning) { // stub with a fixed payload
+    return input -> RunnerResult.builder(Metrics.builder(true).build())
+        .content("evaluated")
+        .parsed(LDValue.buildObject().put("score", score).put("reasoning", reasoning).build())
+        .build();
+  }
+
+  @Test // Happy path: the result carries the runner's score and reasoning plus the config's keys.
+  public void evaluateScoresResponseAndReportsMetricKey() {
+    Judge judge = new Judge(judgeConfig("relevance", true), runnerReturning(0.8, "well grounded"), 1.0, logger);
+    JudgeResult result = judge.evaluate("the question", "the answer");
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(true));
+    assertThat(result.getScore(), is(0.8));
+    assertThat(result.getReasoning(), is("well grounded"));
+    assertThat(result.getMetricKey(), is("relevance"));
+    assertThat(result.getJudgeConfigKey(), is("judge-key"));
+  }
+
+  @Test // The runner input is the two evaluate() arguments laid out under fixed section headers.
+  public void evaluateBuildsWellKnownInputFormat() {
+    AtomicReference<String> captured = new AtomicReference<>();
+    Runner capturing = input -> {
+      captured.set(input); // record exactly what the judge passed to the runner
+      return RunnerResult.builder(Metrics.builder(true).build())
+          .parsed(LDValue.buildObject().put("score", 0.5).put("reasoning", "ok").build())
+          .build();
+    };
+    Judge judge = new Judge(judgeConfig("relevance", true), capturing, 1.0, logger);
+    judge.evaluate("what is 2+2?", "4");
+    assertThat(captured.get(), is("MESSAGE HISTORY:\nwhat is 2+2?\n\nRESPONSE TO EVALUATE:\n4"));
+  }
+
+  @Test // With 0.0 as the rate argument the runner is never called and the result is unsampled.
+  public void zeroSamplingRateSkipsInvocation() {
+    AtomicReference<Boolean> invoked = new AtomicReference<>(false);
+    Runner runner = input -> {
+      invoked.set(true); // flipped only if the judge actually calls the runner
+      return RunnerResult.builder(Metrics.builder(true).build()).build();
+    };
+    Judge judge = new Judge(judgeConfig("relevance", true), runner, 0.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSampled(), is(false));
+    assertThat(result.isSuccess(), is(false));
+    assertThat(invoked.get(), is(false));
+  }
+
+  @Test // A whitespace-only metric key gives a failed result that mentions the evaluation metric key.
+  public void missingEvaluationMetricKeyYieldsFailure() {
+    Judge judge = new Judge(judgeConfig("   ", true), runnerReturning(0.8, "x"), 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getErrorMessage(), containsString("evaluation metric key"));
+  }
+
+  @Test // A score of 1.5 is rejected: the result is unsuccessful and carries no score.
+  public void outOfRangeScoreFailsToParse() {
+    Judge judge = new Judge(judgeConfig("relevance", true), runnerReturning(1.5, "too high"), 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getScore(), is(nullValue()));
+  }
+
+  @Test // A parsed payload with a score but no "reasoning" field is rejected and carries no score.
+  public void missingReasoningFailsToParse() {
+    Runner runner = input -> RunnerResult.builder(Metrics.builder(true).build())
+        .parsed(LDValue.buildObject().put("score", 0.5).build()) // "reasoning" deliberately omitted
+        .build();
+    Judge judge = new Judge(judgeConfig("relevance", true), runner, 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getScore(), is(nullValue()));
+  }
+
+  @Test // An exception thrown by the runner becomes a failed result carrying the exception's message.
+  public void runnerFailureYieldsFailedResult() {
+    Runner runner = input -> {
+      throw new RuntimeException("model exploded");
+    };
+    Judge judge = new Judge(judgeConfig("relevance", true), runner, 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    assertThat(result.isSampled(), is(true)); // sampled, since the runner was reached
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getErrorMessage(), is("model exploded"));
+  }
+
+  @Test // Metrics built with false make the result unsuccessful, yet the parsed score is kept.
+  public void runnerReportingFailureMetricsYieldsUnsuccessfulResult() {
+    Runner runner = input -> RunnerResult.builder(Metrics.builder(false).build())
+        .parsed(LDValue.buildObject().put("score", 0.3).put("reasoning", "weak").build())
+        .build();
+    Judge judge = new Judge(judgeConfig("relevance", true), runner, 1.0, logger);
+    JudgeResult result = judge.evaluate("q", "a");
+    // Parsed successfully, but the runner's own metrics say the call did not succeed.
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getScore(), is(0.3));
+  }
+
+  @Test // evaluateMessages renders each message as "role: content" and uses the response's content.
+  public void evaluateMessagesRendersRolePrefixedHistory() {
+    AtomicReference<String> captured = new AtomicReference<>();
+    Runner capturing = input -> {
+      captured.set(input); // record exactly what the judge passed to the runner
+      return RunnerResult.builder(Metrics.builder(true).build())
+          .parsed(LDValue.buildObject().put("score", 0.9).put("reasoning", "great").build())
+          .build();
+    };
+    Judge judge = new Judge(judgeConfig("relevance", true), capturing, 1.0, logger);
+    RunnerResult response = RunnerResult.builder(Metrics.builder(true).build()).content("the answer").build();
+    judge.evaluateMessages(
+        Arrays.asList(new Message(Message.Role.SYSTEM, "be helpful"), new Message(Message.Role.USER, "hi")),
+        response);
+    assertThat(captured.get(),
+        is("MESSAGE HISTORY:\nsystem: be helpful\nuser: hi\n\nRESPONSE TO EVALUATE:\nthe answer"));
+  }
+
+  @Test // Pins normalizeSampleRate for below-range, above-range, NaN, infinite, and in-range inputs.
+  public void normalizeSampleRateClampsAndDefaults() {
+    assertThat(Judge.normalizeSampleRate(-0.5), is(0.0)); // negative -> 0.0
+    assertThat(Judge.normalizeSampleRate(2.0), is(1.0)); // above 1 -> 1.0
+    assertThat(Judge.normalizeSampleRate(Double.NaN), is(1.0)); // NaN -> 1.0
+    assertThat(Judge.normalizeSampleRate(Double.POSITIVE_INFINITY), is(1.0));
+    assertThat(Judge.normalizeSampleRate(0.42), is(0.42)); // in-range value is returned unchanged
+  }
+}
diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java
index 7ba4798e..0ab4f63d 100644
--- a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java
@@ -9,9 +9,11 @@
 import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.nullValue;
 import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyDouble;
 import static org.mockito.ArgumentMatchers.anyString;
 import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
@@ -24,6 +26,7 @@
 import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Mode;
 import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
 import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Model;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
 import com.launchdarkly.sdk.server.interfaces.LDClientInterface;
 
 import java.util.ArrayList;
@@ -320,4 +323,57 @@ public void eachCreateTrackerCallStartsANewRun() {
     assertThat(runA, is(notNullValue()));
     assertThat(runA.equals(runB), is(false));
   }
+
+  // ---- createJudge ----------------------------------------------------------
+
+  private static final String JUDGE_JSON = // enabled, judge-mode payload with one evaluation metric key
+      "{\"_ldMeta\":{\"enabled\":true,\"mode\":\"judge\"},\"evaluationMetricKeys\":[\"relevance\"]}";
+
+  private static Runner stubRunner() { // runner stub that always returns score 0.5 with reasoning "r"
+    return input -> RunnerResult.builder(Metrics.builder(true).build())
+        .parsed(LDValue.buildObject().put("score", 0.5).put("reasoning", "r").build())
+        .build();
+  }
+
+  @Test // createJudge tracks the create-judge usage metric and never the judge-config one.
+  public void createJudgeFiresOnlyCreateJudgeUsageEvent() {
+    when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(LDValue.parse(JUDGE_JSON));
+
+    ai.createJudge("judge-key", context, null, null, stubRunner(), 1.0);
+
+    verify(client).trackMetric(eq("$ld:ai:usage:create-judge"), eq(context), eq(LDValue.of("judge-key")), eq(1.0));
+    verify(client, never()).trackMetric(eq("$ld:ai:usage:judge-config"), any(), any(), anyDouble());
+  }
+
+  @Test // An enabled judge config yields a Judge exposing the config key, metric key, and given runner.
+  public void createJudgeReturnsJudgeForEnabledConfig() {
+    when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(LDValue.parse(JUDGE_JSON));
+
+    Runner runner = stubRunner();
+    Judge judge = ai.createJudge("judge-key", context, null, null, runner, 1.0);
+
+    assertThat(judge, is(notNullValue()));
+    assertThat(judge.getAIConfig().getKey(), is("judge-key"));
+    assertThat(judge.getAIConfig().getEvaluationMetricKey(), is("relevance")); // from evaluationMetricKeys
+    assertThat(judge.getRunner(), is(runner));
+  }
+
+  @Test // A payload whose _ldMeta.enabled is false makes createJudge return null.
+  public void createJudgeReturnsNullWhenDisabled() {
+    String disabled = "{\"_ldMeta\":{\"enabled\":false,\"mode\":\"judge\"}}";
+    when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(LDValue.parse(disabled));
+
+    Judge judge = ai.createJudge("judge-key", context, null, null, stubRunner(), 1.0);
+    assertThat(judge, is(nullValue()));
+  }
+
+  @Test // A null runner makes createJudge return null, but the usage metric is still tracked.
+  public void createJudgeReturnsNullWhenNoRunner() {
+    when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(LDValue.parse(JUDGE_JSON));
+
+    Judge judge = ai.createJudge("judge-key", context, null, null, null, 1.0);
+    assertThat(judge, is(nullValue()));
+    // The usage event still fires before the runner check.
+    verify(client).trackMetric(eq("$ld:ai:usage:create-judge"), eq(context), eq(LDValue.of("judge-key")), eq(1.0));
+  }
 }

From f6d4a4ca09f578ba78852233399d6d494c81e6f3 Mon Sep 17 00:00:00 2001
From: Clifford Tawiah <ctawiah@launchdarkly.com>
Date: Wed, 10 Jun 2026 22:44:23 -0400
Subject: [PATCH 2/2] refactor: make Evaluator package-private (not public API
 in v1.0)

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 lib/sdk/server-ai/README.md                                   | 4 ----
 .../main/java/com/launchdarkly/sdk/server/ai/Evaluator.java   | 4 +++-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/lib/sdk/server-ai/README.md b/lib/sdk/server-ai/README.md
index 971bb43c..d6afbb44 100644
--- a/lib/sdk/server-ai/README.md
+++ b/lib/sdk/server-ai/README.md
@@ -78,10 +78,6 @@ if (judge != null) {
 }
 ```
 
-`Evaluator` runs several judges over the same input/output with per-judge fault isolation and a
-per-judge timeout, returning one `JudgeResult` per judge in order. `Evaluator.noop()` returns an
-empty result list.
-
 ## Internal API convention
 
 Public, supported types live directly under `com.launchdarkly.sdk.server.ai` (and its documented
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
index 4df1d6e1..8e53f024 100644
--- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
@@ -26,8 +26,10 @@
  * <p>
  * The evaluator does not record results; recording the returned {@link JudgeResult}s (for example
  * via a tracker) is the caller's responsibility. Instances are immutable and thread-safe.
+ * <p>
+ * This type is not part of the public API in v1.0 and may change without notice.
  */
-public final class Evaluator {
+final class Evaluator {
   /**
    * Default per-judge timeout used when one is not supplied.
    */