From 2e94f05a67b70c3f8563b5d652003fb6c3bc5507 Mon Sep 17 00:00:00 2001 From: Clifford Tawiah Date: Wed, 10 Jun 2026 22:41:50 -0400 Subject: [PATCH 1/2] feat: add manual judge evaluation (Judge, Evaluator, createJudge) (AIC-2665) Implements the AIEVALS manual-only evaluation path: - Runner SPI and RunnerResult for caller-supplied model invocation - Judge: sampling decided before invocation, well-known input format, score/reasoning parsing with range validation, invocation tracked via trackMetricsOf (does not emit trackJudgeResult; caller's responsibility) - Evaluator: per-judge fault isolation and per-judge timeout, order-preserving results, noop() returns an empty list; sampling-rate normalization on Judge - LDAIClient.createJudge: fires only $ld:ai:usage:create-judge, resolves the judge config via the internal evaluate path, returns null when disabled or when no runner is supplied Automatic judge auto-attachment and provider runners are deferred past v1.0. README documents the manual-only flow and the auto-attach descope. 
Co-authored-by: Cursor --- lib/sdk/server-ai/README.md | 40 ++- .../launchdarkly/sdk/server/ai/Evaluator.java | 125 +++++++++ .../com/launchdarkly/sdk/server/ai/Judge.java | 243 ++++++++++++++++++ .../sdk/server/ai/LDAIClient.java | 25 ++ .../sdk/server/ai/LDAIClientImpl.java | 32 +++ .../launchdarkly/sdk/server/ai/Runner.java | 26 ++ .../sdk/server/ai/RunnerResult.java | 105 ++++++++ .../sdk/server/ai/EvaluatorTest.java | 121 +++++++++ .../launchdarkly/sdk/server/ai/JudgeTest.java | 170 ++++++++++++ .../sdk/server/ai/LDAIClientImplTest.java | 56 ++++ 10 files changed, 941 insertions(+), 2 deletions(-) create mode 100644 lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java create mode 100644 lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java create mode 100644 lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java create mode 100644 lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java create mode 100644 lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java create mode 100644 lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java diff --git a/lib/sdk/server-ai/README.md b/lib/sdk/server-ai/README.md index b2d8d4f2..971bb43c 100644 --- a/lib/sdk/server-ai/README.md +++ b/lib/sdk/server-ai/README.md @@ -43,8 +43,44 @@ The companion `agentConfig`/`agentConfigs` and `judgeConfig` methods retrieve ag configs respectively. Within a prompt message or agent instruction, the evaluation context is available as `{{ldctx}}` (for example `{{ldctx.key}}`). -Metric tracking and manual judge evaluation will be added as the SDK is built out (see epic -AIC-2629). +## Tracking AI runs + +Every retrieved config exposes a tracker via `config.createTracker()`. Use it to record duration, +time-to-first-token, success/error, token usage, tool calls, and feedback for an AI run. 
Trackers +are thread-safe, and at-most-once metrics (duration, time-to-first-token, outcome, feedback, tokens) +emit a single event even under concurrent calls. A run can be correlated across processes with +`tracker.getResumptionToken()` and rebuilt later via `aiClient.createTracker(token, context)`. + +## Evaluating responses with judges (manual) + +A judge is an AI Config with `mode: judge` that scores another config's output against an evaluation +metric. + +In `v1.0`, evaluation is **manual only**. The SDK parses `judgeConfiguration` and exposes it on +configs, but it does **not** automatically invoke judges on completion or agent calls. Sample-rate +driven auto-attachment is deferred past `v1.0`. Because no provider-specific runners ship yet, you +supply your own `Runner` that calls your model and returns structured `{score, reasoning}` output. + +```java +Runner runner = input -> { + // Call your model with `input`, then return its score/reasoning as structured output. + // metrics carries success/tokens/duration for the invocation. + return RunnerResult.builder(Metrics.builder(true).build()) + .parsed(LDValue.buildObject().put("score", 0.9).put("reasoning", "grounded").build()) + .build(); +}; + +Judge judge = aiClient.createJudge("my-judge-key", context, null, variables, runner, 1.0); +if (judge != null) { + JudgeResult result = judge.evaluate(originalInput, modelOutput); + // Recording the result is the caller's responsibility: + completionTracker.trackJudgeResult(result); +} +``` + +`Evaluator` runs several judges over the same input/output with per-judge fault isolation and a +per-judge timeout, returning one `JudgeResult` per judge in order. `Evaluator.noop()` returns an +empty result list. 
## Internal API convention
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
new file mode 100644
index 00000000..4df1d6e1
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java
@@ -0,0 +1,147 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Runs a fixed set of {@link Judge}s against one input/output pair and collects their results.
+ * <p>
+ * Each judge runs with fault isolation: a judge that throws or times out yields a
+ * failed {@link JudgeResult} for that judge while every other judge's result is preserved, in the
+ * original order. Judges run concurrently and each is bounded by a per-judge timeout so a single
+ * hung judge cannot stall the whole evaluation.
+ * <p>
+ * The evaluator does not record results; recording the returned {@link JudgeResult}s (for example
+ * via a tracker) is the caller's responsibility. Instances are immutable and thread-safe.
+ */
+public final class Evaluator {
+  /**
+   * Default per-judge timeout used when one is not supplied.
+   */
+  public static final Duration DEFAULT_PER_JUDGE_TIMEOUT = Duration.ofSeconds(30);
+
+  private final List<Judge> judges;
+  private final Duration perJudgeTimeout;
+  private final LDLogger logger;
+
+  /**
+   * Creates an evaluator using the {@link #DEFAULT_PER_JUDGE_TIMEOUT default per-judge timeout}.
+   *
+   * @param judges the judges to run; must not be {@code null}
+   * @param logger the logger; must not be {@code null}
+   */
+  public Evaluator(List<Judge> judges, LDLogger logger) {
+    this(judges, DEFAULT_PER_JUDGE_TIMEOUT, Objects.requireNonNull(logger, "logger"));
+  }
+
+  /**
+   * Creates an evaluator with an explicit per-judge timeout.
+   *
+   * @param judges the judges to run; must not be {@code null}
+   * @param perJudgeTimeout the maximum time to wait for each judge; must not be {@code null}
+   * @param logger the logger; must not be {@code null}
+   */
+  public Evaluator(List<Judge> judges, Duration perJudgeTimeout, LDLogger logger) {
+    this.judges = Collections.unmodifiableList(new ArrayList<>(Objects.requireNonNull(judges, "judges")));
+    this.perJudgeTimeout = Objects.requireNonNull(perJudgeTimeout, "perJudgeTimeout");
+    this.logger = Objects.requireNonNull(logger, "logger");
+  }
+
+  /**
+   * Returns an evaluator with no judges. Its {@link #evaluate} returns an empty list and logs
+   * nothing.
+   *
+   * @return a no-op evaluator
+   */
+  public static Evaluator noop() {
+    return new Evaluator(
+        Collections.emptyList(), DEFAULT_PER_JUDGE_TIMEOUT, LDLogger.withAdapter(Logs.none(), ""));
+  }
+
+  /**
+   * Runs every judge against the given input and output.
+   * <p>
+   * All judges are submitted together and their timeouts are measured from that moment, so the
+   * whole call is bounded by roughly one per-judge timeout no matter how many judges are
+   * configured.
+   *
+   * @param input the input that was provided to the AI being evaluated
+   * @param output the AI-generated response to score
+   * @return one {@link JudgeResult} per judge, in the judges' order; empty when there are no judges
+   */
+  public List<JudgeResult> evaluate(String input, String output) {
+    if (judges.isEmpty()) {
+      return Collections.emptyList();
+    }
+
+    ExecutorService pool = Executors.newFixedThreadPool(judges.size());
+    try {
+      List<Future<JudgeResult>> futures = new ArrayList<>(judges.size());
+      for (Judge judge : judges) {
+        futures.add(pool.submit(() -> judge.evaluate(input, output)));
+      }
+      // Every judge is already running, so time spent waiting on an earlier judge must count
+      // against the later ones; otherwise judge i could run for up to (i + 1) timeouts.
+      long startNanos = System.nanoTime();
+
+      List<JudgeResult> results = new ArrayList<>(judges.size());
+      for (int i = 0; i < judges.size(); i++) {
+        results.add(awaitResult(judges.get(i), futures.get(i), startNanos));
+      }
+      return results;
+    } finally {
+      pool.shutdownNow();
+    }
+  }
+
+  /**
+   * Waits for one judge's result for whatever remains of the timeout and converts any failure
+   * into a failed {@link JudgeResult}, so that the other judges are unaffected.
+   *
+   * @param judge the judge being awaited; used for its config key
+   * @param future the judge's pending result
+   * @param startNanos the {@link System#nanoTime()} reading taken when the judges were submitted
+   * @return the judge's result, or a failed result on timeout, error, or interruption
+   */
+  private JudgeResult awaitResult(Judge judge, Future<JudgeResult> future, long startNanos) {
+    String key = judge.getAIConfig().getKey();
+    long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos);
+    // With nothing left to wait, get() still returns a result that has already completed.
+    long remainingMillis = Math.max(0L, perJudgeTimeout.toMillis() - elapsedMillis);
+    try {
+      return future.get(remainingMillis, TimeUnit.MILLISECONDS);
+    } catch (TimeoutException e) {
+      future.cancel(true);
+      logger.warn("Judge {} timed out after {} ms", key, perJudgeTimeout.toMillis());
+      return failed(key, "Judge evaluation timed out");
+    } catch (ExecutionException e) {
+      Throwable cause = e.getCause() != null ? e.getCause() : e;
+      logger.error("Judge {} failed: {}", key, cause.toString());
+      return failed(key, cause.getMessage() != null ? cause.getMessage() : "Unknown error");
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+      future.cancel(true);
+      return failed(key, "Judge evaluation interrupted");
+    }
+  }
+
+  /**
+   * Builds a sampled but unsuccessful result carrying the judge key and an error message.
+   */
+  private static JudgeResult failed(String key, String message) {
+    return JudgeResult.builder(true, false).judgeConfigKey(key).errorMessage(message).build();
+  }
+}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java
new file mode 100644
index 00000000..33ba7c4d
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java
@@ -0,0 +1,243 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.LDValueType;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Evaluates the output of another AI Config using a judge AI Config and a caller-supplied
+ * {@link Runner}.
+ * <p>

+ * A judge is an AI Config with {@code mode: judge} that scores a model response. Obtain one from
+ * {@link LDAIClient#createJudge}, then call {@link #evaluate(String, String)} with the original
+ * input and the response to score. Evaluation is synchronous.
+ * <p>
+ * The judge records invocation metrics (duration, success, tokens) on its own tracker but does
+ * not emit the score via {@code trackJudgeResult}; recording the returned
+ * {@link JudgeResult} is the caller's responsibility.
+ * <p>
+ * Instances are immutable and safe to share across threads as long as the supplied {@link Runner}
+ * is too.
+ */
+public final class Judge {
+  private final AIJudgeConfig config;
+  private final Runner runner;
+  private final double sampleRate;
+  private final LDLogger logger;
+
+  /**
+   * Creates a judge.
+   *
+   * @param config the judge AI Config; must not be {@code null}
+   * @param runner the runner used to invoke the judge model; must not be {@code null}
+   * @param sampleRate the default sampling rate in {@code [0.0, 1.0]}; non-finite, negative, or
+   *     greater-than-one values are normalized
+   * @param logger the logger; must not be {@code null}
+   */
+  public Judge(AIJudgeConfig config, Runner runner, double sampleRate, LDLogger logger) {
+    this.config = Objects.requireNonNull(config, "config");
+    this.runner = Objects.requireNonNull(runner, "runner");
+    this.sampleRate = normalizeSampleRate(sampleRate);
+    this.logger = Objects.requireNonNull(logger, "logger");
+  }
+
+  /**
+   * Normalizes a sampling rate into {@code [0.0, 1.0]}. Non-finite rates fall back to {@code 1.0}
+   * (the default "always sample"); negative rates clamp to {@code 0.0}; rates above one clamp to
+   * {@code 1.0}.
+   *
+   * @param rate the requested rate
+   * @return the normalized rate
+   */
+  public static double normalizeSampleRate(double rate) {
+    if (Double.isNaN(rate) || Double.isInfinite(rate)) {
+      return 1.0;
+    }
+    if (rate < 0.0) {
+      return 0.0;
+    }
+    if (rate > 1.0) {
+      return 1.0;
+    }
+    return rate;
+  }
+
+  /**
+   * Returns the default sampling rate baked in at construction.
+   *
+   * @return the sampling rate
+   */
+  public double getSampleRate() {
+    return sampleRate;
+  }
+
+  /**
+   * Returns the judge AI Config.
+   *
+   * @return the config
+   */
+  public AIJudgeConfig getAIConfig() {
+    return config;
+  }
+
+  /**
+   * Returns the runner this judge invokes.
+   *
+   * @return the runner
+   */
+  public Runner getRunner() {
+    return runner;
+  }
+
+  /**
+   * Evaluates a response using the judge's default sampling rate.
+   *
+   * @param input the input that was provided to the AI being evaluated
+   * @param output the AI-generated response to score
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluate(String input, String output) {
+    return evaluate(input, output, sampleRate);
+  }
+
+  /**
+   * Evaluates a response, deciding sampling before invoking the model.
+   *
+   * @param input the input that was provided to the AI being evaluated
+   * @param output the AI-generated response to score
+   * @param samplingRate the sampling rate to use for this call; an explicit {@code 0} suppresses the
+   *     evaluation
+   * @return the evaluation result; never {@code null}. The result is unsuccessful when the judge
+   *     config has no evaluation metric key, when sampling skips it (the only case reported as not
+   *     sampled), when the response cannot be parsed, or when the runner throws.
+   */
+  public JudgeResult evaluate(String input, String output, double samplingRate) {
+    double effectiveRate = normalizeSampleRate(samplingRate);
+    String key = config.getKey();
+    try {
+      String metricKey = evaluationMetricKey();
+      if (metricKey == null) {
+        logger.warn("Judge configuration is missing required evaluation metric key: {}", key);
+        return JudgeResult.builder(true, false)
+            .judgeConfigKey(key)
+            .errorMessage("Judge configuration is missing required evaluation metric key")
+            .build();
+      }
+      // Math.random() is in [0.0, 1.0), so ">=" always skips at rate 0 and never skips at rate 1.
+      if (Math.random() >= effectiveRate) {
+        logger.debug("Judge evaluation skipped due to sampling rate: {}", effectiveRate);
+        return JudgeResult.builder(false, false).judgeConfigKey(key).build();
+      }
+      // The tracker is created only for evaluations that actually run, and inside the try block.
+      String evaluationInput = buildEvaluationInput(input, output);
+      LDAIConfigTracker tracker = config.createTracker();
+      RunnerResult response = tracker.trackMetricsOf(RunnerResult::getMetrics,
+          () -> runner.run(evaluationInput));
+
+      ParsedEvaluation parsed = parseEvaluationResponse(response.getParsed());
+      if (parsed == null) {
+        logger.warn("Could not parse judge evaluation response for: {}", key);
+        return JudgeResult.builder(true, false).judgeConfigKey(key).build();
+      }
+
+      Metrics metrics = response.getMetrics();
+      boolean success = metrics != null && metrics.isSuccess();
+      return JudgeResult.builder(true, success)
+          .judgeConfigKey(key)
+          .metricKey(metricKey)
+          .score(parsed.score)
+          .reasoning(parsed.reasoning)
+          .build();
+    } catch (Exception e) {
+      logger.error("Judge evaluation failed for {}: {}", key, e.toString());
+      String message = e.getMessage() != null ? e.getMessage() : "Unknown error";
+      return JudgeResult.builder(true, false).judgeConfigKey(key).errorMessage(message).build();
+    }
+  }
+
+  /**
+   * Evaluates a response from a conversation history and a runner result, using the judge's default
+   * sampling rate.
+   *
+   * @param messages the conversation history; may be empty or {@code null}
+   * @param response the runner result whose content is scored
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response) {
+    return evaluateMessages(messages, response, sampleRate);
+  }
+
+  /**
+   * Evaluates a response from a conversation history and a runner result.
+   * <p>
+   * Each message is rendered as {@code <role>: <content>} and the messages are joined with newlines
+   * to form the input; the response's content is the output.
+   *
+   * @param messages the conversation history; may be empty or {@code null}
+   * @param response the runner result whose content is scored
+   * @param samplingRate the sampling rate to use for this call
+   * @return the evaluation result; never {@code null}
+   */
+  public JudgeResult evaluateMessages(List<Message> messages, RunnerResult response, double samplingRate) {
+    StringBuilder input = new StringBuilder();
+    if (messages != null) {
+      boolean first = true;
+      for (Message message : messages) {
+        if (!first) {
+          input.append('\n');
+        }
+        input.append(message.getRole().getWireValue()).append(": ").append(message.getContent());
+        first = false;
+      }
+    }
+    String output = response == null ? null : response.getContent();
+    return evaluate(input.toString(), output, samplingRate);
+  }
+
+  private String evaluationMetricKey() {
+    String key = config.getEvaluationMetricKey();
+    if (key != null && !key.trim().isEmpty()) {
+      return key.trim();
+    }
+    return null;
+  }
+
+  private static String buildEvaluationInput(String input, String output) {
+    return "MESSAGE HISTORY:\n" + input + "\n\nRESPONSE TO EVALUATE:\n" + output;
+  }
+
+  private static ParsedEvaluation parseEvaluationResponse(LDValue parsed) {
+    if (parsed == null || parsed.getType() != LDValueType.OBJECT) {
+      return null;
+    }
+    LDValue score = parsed.get("score");
+    if (!score.isNumber()) {
+      return null;
+    }
+    double value = score.doubleValue();
+    if (!(value >= 0.0 && value <= 1.0)) { // the negated form also rejects NaN
+      return null;
+    }
+    LDValue reasoning = parsed.get("reasoning");
+    if (!reasoning.isString()) {
+      return null;
+    }
+    return new ParsedEvaluation(value, reasoning.stringValue());
+  }
+
+  private static final class ParsedEvaluation {
+    private final double score;
+    private final String reasoning;
+
+    ParsedEvaluation(double score, String reasoning) {
+      this.score = score;
+      this.reasoning = reasoning;
+    }
+  }
+}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java
index 523e1d68..ad11114d 100644
--- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClient.java
@@ -96,4 +96,29 @@ AIJudgeConfig judgeConfig(
    * @throws IllegalArgumentException if the token is malformed
    */
   LDAIConfigTracker createTracker(String resumptionToken, LDContext context);
+
+  /**
+   * Retrieves a judge AI Config and builds a {@link Judge} for manual evaluation.
+   * <p>

+   * This fires only the {@code $ld:ai:usage:create-judge} usage event. In v1.0 the SDK does not
+   * auto-attach judges to completion or agent calls; evaluation is manual, driven by the returned
+   * judge. Because the SDK ships no provider runners yet, the caller supplies the {@link Runner}.
+   *
+   * @param key the judge AI Config key
+   * @param context the context to evaluate the configuration in
+   * @param defaultValue the default used when the flag is absent or cannot be evaluated; when
+   *     {@code null}, a disabled default is used
+   * @param variables variables interpolated into the judge prompt; may be {@code null}
+   * @param runner the runner the judge invokes; when {@code null}, no judge is created
+   * @param sampleRate the default sampling rate for the judge in {@code [0.0, 1.0]}
+   * @return a {@link Judge}, or {@code null} if the configuration is disabled, no runner was
+   *     supplied, or the judge could not be initialized
+   */
+  Judge createJudge(
+      String key,
+      LDContext context,
+      AIJudgeConfigDefault defaultValue,
+      Map variables,
+      Runner runner,
+      double sampleRate);
 }
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java
index 80ef29b9..4b097115 100644
--- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java
@@ -47,6 +47,7 @@ public final class LDAIClientImpl implements LDAIClient {
   private static final String TRACK_USAGE_AGENT_CONFIG = "$ld:ai:usage:agent-config";
   private static final String TRACK_USAGE_AGENT_CONFIGS = "$ld:ai:usage:agent-configs";
   private static final String TRACK_USAGE_JUDGE_CONFIG = "$ld:ai:usage:judge-config";
+  private static final String TRACK_USAGE_CREATE_JUDGE = "$ld:ai:usage:create-judge";
 
   private static final LDContext INIT_TRACK_CONTEXT = LDContext
       .builder("ld-internal-tracking")
@@ -145,6 +146,41 @@ public LDAIConfigTracker createTracker(String resumptionToken, LDContext context
     return LDAIConfigTrackerImpl.fromResumptionToken(resumptionToken, client, context, logger);
   }
 
+  @Override
+  public Judge createJudge(
+      String key,
+      LDContext context,
+      AIJudgeConfigDefault defaultValue,
+      Map variables,
+      Runner runner,
+      double sampleRate) {
+    // The manual path emits exactly one usage event, create-judge. The config is then resolved
+    // through the internal evaluate so that $ld:ai:usage:judge-config is not fired as well.
+    client.trackMetric(TRACK_USAGE_CREATE_JUDGE, context, LDValue.of(key), 1);
+    try {
+      AIJudgeConfigDefault fallback = defaultValue;
+      if (fallback == null) {
+        fallback = AIJudgeConfigDefault.disabled();
+      }
+      AIJudgeConfig resolved =
+          (AIJudgeConfig) evaluate(key, context, fallback, Mode.JUDGE, variables);
+      boolean disabled = !resolved.isEnabled();
+      if (disabled || runner == null) {
+        // A disabled config takes precedence over a missing runner when choosing what to log.
+        if (disabled) {
+          logger.info("Judge configuration is disabled: {}", key);
+        } else {
+          logger.warn("No runner supplied for judge: {}", key);
+        }
+        return null;
+      }
+      return new Judge(resolved, runner, sampleRate, logger);
+    } catch (RuntimeException failure) {
+      logger.error("Failed to initialize judge {}: {}", key, failure.toString());
+      return null;
+    }
+  }
+
   private AIAgentConfig evaluateAgent(
       String key, LDContext context, AIAgentConfigDefault defaultValue, Map variables) {
     AIAgentConfigDefault effectiveDefault =
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java
new file mode 100644
index 00000000..6542c33f
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java
@@ -0,0 +1,26 @@
+package com.launchdarkly.sdk.server.ai;
+
+/**
+ * Invokes an AI model with a string input and returns its result.
+ * <p>

+ * In v1.0 the AI SDK does not ship provider-specific runners; an application supplies its own
+ * {@code Runner} (for example wrapping an OpenAI or Bedrock call) when creating a {@link Judge} via
+ * {@link LDAIClient#createJudge}. Built-in provider runners are planned for a later release.
+ * <p>
+ * For structured-output use cases such as judge evaluation, the runner is expected to make the
+ * model's parsed JSON available via {@link RunnerResult#getParsed()}.
+ * <p>
+ * Implementations should be safe to invoke from multiple threads if the same runner is shared across
+ * concurrently-evaluating judges.
+ */
+public interface Runner {
+  /**
+   * Invokes the model with the given input.
+   *
+   * @param input the input string to send to the model
+   * @return the model result; must not be {@code null}
+   * @throws Exception if the invocation fails; the caller (a {@link Judge}) records the failure and
+   *     surfaces it as a failed evaluation rather than propagating it
+   */
+  RunnerResult run(String input) throws Exception;
+}
diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java
new file mode 100644
index 00000000..9994da11
--- /dev/null
+++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java
@@ -0,0 +1,105 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+
+/**
+ * The result of a {@link Runner} invocation.
+ * <p>

+ * Carries the model's text {@link #getContent() content}, the {@link #getMetrics() metrics} the SDK
+ * uses to track the run, and any {@link #getParsed() parsed} structured output. For judge
+ * evaluation the parsed value is expected to be a JSON object with {@code score} (a number in
+ * {@code [0.0, 1.0]}) and {@code reasoning} (a string). Instances are immutable.
+ */
+public final class RunnerResult {
+  private final String content;
+  private final Metrics metrics;
+  private final LDValue parsed;
+
+  private RunnerResult(Builder b) {
+    this.content = b.content;
+    this.metrics = b.metrics;
+    this.parsed = b.parsed == null ? LDValue.ofNull() : b.parsed;
+  }
+
+  /**
+   * Returns the model's text response.
+   *
+   * @return the content, or {@code null} if none was produced
+   */
+  public String getContent() {
+    return content;
+  }
+
+  /**
+   * Returns the metrics for this invocation.
+   *
+   * @return the metrics, or {@code null} if none were provided
+   */
+  public Metrics getMetrics() {
+    return metrics;
+  }
+
+  /**
+   * Returns the parsed structured output.
+   *
+   * @return the parsed value; never {@code null}, but {@link LDValue#ofNull()} when there was none
+   */
+  public LDValue getParsed() {
+    return parsed;
+  }
+
+  /**
+   * Creates a builder.
+   *
+   * @param metrics the metrics for the invocation; stored as given, so it may be {@code null}
+   * @return a new {@link Builder}
+   */
+  public static Builder builder(Metrics metrics) {
+    return new Builder(metrics);
+  }
+
+  /**
+   * Builder for {@link RunnerResult}.
+   */
+  public static final class Builder {
+    private final Metrics metrics;
+    private String content;
+    private LDValue parsed;
+
+    private Builder(Metrics metrics) {
+      this.metrics = metrics;
+    }
+
+    /**
+     * Sets the model's text response.
+     *
+     * @param v the content
+     * @return this builder
+     */
+    public Builder content(String v) {
+      this.content = v;
+      return this;
+    }
+
+    /**
+     * Sets the parsed structured output.
+     *
+     * @param v the parsed value; {@code null} is stored as {@link LDValue#ofNull()} when built
+     * @return this builder
+     */
+    public Builder parsed(LDValue v) {
+      this.parsed = v;
+      return this;
+    }
+
+    /**
+     * Builds the immutable {@link RunnerResult}.
+     *
+     * @return a new {@link RunnerResult}
+     */
+    public RunnerResult build() {
+      return new RunnerResult(this);
+    }
+  }
+}
diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java
new file mode 100644
index 00000000..2a4c8997
--- /dev/null
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java
@@ -0,0 +1,121 @@
+package com.launchdarkly.sdk.server.ai;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.empty;
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
+import static org.mockito.Mockito.mock;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.LDContext;
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics;
+import com.launchdarkly.sdk.server.ai.internal.LDAIConfigTrackerImpl;
+import com.launchdarkly.sdk.server.interfaces.LDClientInterface;
+
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.List;
+import java.util.function.Supplier;
+
+import org.junit.Before;
+import org.junit.Test;
+
+@SuppressWarnings("javadoc")
+public class EvaluatorTest {
+  private LDClientInterface client;
+  private LDLogger logger;
+  private final LDContext context = LDContext.create("user-key");
+
+  @Before
+  public void setUp() {
+    client = mock(LDClientInterface.class);
+    logger = LDLogger.withAdapter(Logs.capture(), "test");
+  }
+
+  private 
Judge judge(String key, Runner runner) {
+    Supplier<LDAIConfigTracker> trackerFactory = () -> new LDAIConfigTrackerImpl(
+        client, "run-" + key, key, "v1", 1, "gpt-4", "openai", context, null, logger);
+    AIJudgeConfig config = new AIJudgeConfig(key, true, null, null, null, "relevance", trackerFactory);
+    return new Judge(config, runner, 1.0, logger);
+  }
+
+  private static Runner scoring(double score) {
+    return input -> RunnerResult.builder(Metrics.builder(true).build())
+        .parsed(LDValue.buildObject().put("score", score).put("reasoning", "r").build())
+        .build();
+  }
+
+  @Test
+  public void noopReturnsEmptyListAndLogsNothing() {
+    List<JudgeResult> results = Evaluator.noop().evaluate("q", "a");
+    assertThat(results, is(empty()));
+  }
+
+  @Test
+  public void runsEveryJudgePreservingOrder() {
+    Evaluator evaluator = new Evaluator(
+        Arrays.asList(judge("first", scoring(0.1)), judge("second", scoring(0.2))), logger);
+    List<JudgeResult> results = evaluator.evaluate("q", "a");
+    assertThat(results, hasSize(2));
+    assertThat(results.get(0).getJudgeConfigKey(), is("first"));
+    assertThat(results.get(0).getScore(), is(0.1));
+    assertThat(results.get(1).getJudgeConfigKey(), is("second"));
+    assertThat(results.get(1).getScore(), is(0.2));
+  }
+
+  @Test
+  public void faultyJudgeIsolatedAndOthersPreserved() {
+    Runner failing = input -> {
+      throw new RuntimeException("boom");
+    };
+    Evaluator evaluator = new Evaluator(
+        Arrays.asList(judge("ok", scoring(0.9)), judge("bad", failing)), logger);
+    List<JudgeResult> results = evaluator.evaluate("q", "a");
+    assertThat(results, hasSize(2));
+    assertThat(results.get(0).isSuccess(), is(true));
+    assertThat(results.get(0).getScore(), is(0.9));
+    assertThat(results.get(1).isSuccess(), is(false));
+    assertThat(results.get(1).getErrorMessage(), is("boom"));
+  }
+
+  @Test
+  public void hungJudgeTimesOutWithoutStallingChain() {
+    Runner slow = input -> {
+      Thread.sleep(5000);
+      return RunnerResult.builder(Metrics.builder(true).build()).build();
+    };
+    Evaluator evaluator = new Evaluator(
+        Arrays.asList(judge("fast", scoring(0.7)), judge("slow", slow)),
+        Duration.ofMillis(150),
+        logger);
+    List<JudgeResult> results = evaluator.evaluate("q", "a");
+    assertThat(results, hasSize(2));
+    assertThat(results.get(0).isSuccess(), is(true));
+    assertThat(results.get(1).isSuccess(), is(false));
+    assertThat(results.get(1).getErrorMessage(), containsString("timed out"));
+    assertThat(results.get(1).getJudgeConfigKey(), is("slow"));
+  }
+
+  @Test
+  public void resultsAreInJudgeOrderEvenWhenCompletionOrderDiffers() {
+    Runner slowOk = input -> {
+      Thread.sleep(300);
+      return RunnerResult.builder(Metrics.builder(true).build())
+          .parsed(LDValue.buildObject().put("score", 0.5).put("reasoning", "r").build())
+          .build();
+    };
+    Evaluator evaluator = new Evaluator(
+        Arrays.asList(judge("slow", slowOk), judge("fast", scoring(0.6))),
+        Duration.ofSeconds(5),
+        logger);
+    List<JudgeResult> results = evaluator.evaluate("q", "a");
+    assertThat(
+        Arrays.asList(results.get(0).getJudgeConfigKey(), results.get(1).getJudgeConfigKey()),
+        contains("slow", "fast"));
+  }
+}
diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java
new file mode 100644
index 00000000..7fae78b3
--- /dev/null
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java
@@ -0,0 +1,170 @@
+package com.launchdarkly.sdk.server.ai;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.nullValue;
+import static org.mockito.Mockito.mock;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.LDContext;
+import com.launchdarkly.sdk.LDValue;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+import 
com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics; +import com.launchdarkly.sdk.server.ai.internal.LDAIConfigTrackerImpl; +import com.launchdarkly.sdk.server.interfaces.LDClientInterface; + +import java.util.Arrays; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Supplier; + +import org.junit.Before; +import org.junit.Test; + +@SuppressWarnings("javadoc") +public class JudgeTest { + private LDClientInterface client; + private LDLogger logger; + private final LDContext context = LDContext.create("user-key"); + + @Before + public void setUp() { + client = mock(LDClientInterface.class); + logger = LDLogger.withAdapter(Logs.capture(), "test"); + } + + private AIJudgeConfig judgeConfig(String metricKey, boolean enabled) { + Supplier trackerFactory = () -> new LDAIConfigTrackerImpl( + client, "run-1", "judge-key", "v1", 1, "gpt-4", "openai", context, null, logger); + return new AIJudgeConfig("judge-key", enabled, null, null, null, metricKey, trackerFactory); + } + + private static Runner runnerReturning(double score, String reasoning) { + return input -> RunnerResult.builder(Metrics.builder(true).build()) + .content("evaluated") + .parsed(LDValue.buildObject().put("score", score).put("reasoning", reasoning).build()) + .build(); + } + + @Test + public void evaluateScoresResponseAndReportsMetricKey() { + Judge judge = new Judge(judgeConfig("relevance", true), runnerReturning(0.8, "well grounded"), 1.0, logger); + JudgeResult result = judge.evaluate("the question", "the answer"); + assertThat(result.isSampled(), is(true)); + assertThat(result.isSuccess(), is(true)); + assertThat(result.getScore(), is(0.8)); + assertThat(result.getReasoning(), is("well grounded")); + assertThat(result.getMetricKey(), is("relevance")); + assertThat(result.getJudgeConfigKey(), is("judge-key")); + } + + @Test + public void evaluateBuildsWellKnownInputFormat() { + AtomicReference captured = new AtomicReference<>(); + Runner capturing = input -> { 
+ captured.set(input); + return RunnerResult.builder(Metrics.builder(true).build()) + .parsed(LDValue.buildObject().put("score", 0.5).put("reasoning", "ok").build()) + .build(); + }; + Judge judge = new Judge(judgeConfig("relevance", true), capturing, 1.0, logger); + judge.evaluate("what is 2+2?", "4"); + assertThat(captured.get(), is("MESSAGE HISTORY:\nwhat is 2+2?\n\nRESPONSE TO EVALUATE:\n4")); + } + + @Test + public void zeroSamplingRateSkipsInvocation() { + AtomicReference invoked = new AtomicReference<>(false); + Runner runner = input -> { + invoked.set(true); + return RunnerResult.builder(Metrics.builder(true).build()).build(); + }; + Judge judge = new Judge(judgeConfig("relevance", true), runner, 0.0, logger); + JudgeResult result = judge.evaluate("q", "a"); + assertThat(result.isSampled(), is(false)); + assertThat(result.isSuccess(), is(false)); + assertThat(invoked.get(), is(false)); + } + + @Test + public void missingEvaluationMetricKeyYieldsFailure() { + Judge judge = new Judge(judgeConfig(" ", true), runnerReturning(0.8, "x"), 1.0, logger); + JudgeResult result = judge.evaluate("q", "a"); + assertThat(result.isSampled(), is(true)); + assertThat(result.isSuccess(), is(false)); + assertThat(result.getErrorMessage(), containsString("evaluation metric key")); + } + + @Test + public void outOfRangeScoreFailsToParse() { + Judge judge = new Judge(judgeConfig("relevance", true), runnerReturning(1.5, "too high"), 1.0, logger); + JudgeResult result = judge.evaluate("q", "a"); + assertThat(result.isSampled(), is(true)); + assertThat(result.isSuccess(), is(false)); + assertThat(result.getScore(), is(nullValue())); + } + + @Test + public void missingReasoningFailsToParse() { + Runner runner = input -> RunnerResult.builder(Metrics.builder(true).build()) + .parsed(LDValue.buildObject().put("score", 0.5).build()) + .build(); + Judge judge = new Judge(judgeConfig("relevance", true), runner, 1.0, logger); + JudgeResult result = judge.evaluate("q", "a"); + 
assertThat(result.isSuccess(), is(false)); + assertThat(result.getScore(), is(nullValue())); + } + + @Test + public void runnerFailureYieldsFailedResult() { + Runner runner = input -> { + throw new RuntimeException("model exploded"); + }; + Judge judge = new Judge(judgeConfig("relevance", true), runner, 1.0, logger); + JudgeResult result = judge.evaluate("q", "a"); + assertThat(result.isSampled(), is(true)); + assertThat(result.isSuccess(), is(false)); + assertThat(result.getErrorMessage(), is("model exploded")); + } + + @Test + public void runnerReportingFailureMetricsYieldsUnsuccessfulResult() { + Runner runner = input -> RunnerResult.builder(Metrics.builder(false).build()) + .parsed(LDValue.buildObject().put("score", 0.3).put("reasoning", "weak").build()) + .build(); + Judge judge = new Judge(judgeConfig("relevance", true), runner, 1.0, logger); + JudgeResult result = judge.evaluate("q", "a"); + // Parsed successfully, but the runner's own metrics say the call did not succeed. + assertThat(result.isSuccess(), is(false)); + assertThat(result.getScore(), is(0.3)); + } + + @Test + public void evaluateMessagesRendersRolePrefixedHistory() { + AtomicReference captured = new AtomicReference<>(); + Runner capturing = input -> { + captured.set(input); + return RunnerResult.builder(Metrics.builder(true).build()) + .parsed(LDValue.buildObject().put("score", 0.9).put("reasoning", "great").build()) + .build(); + }; + Judge judge = new Judge(judgeConfig("relevance", true), capturing, 1.0, logger); + RunnerResult response = RunnerResult.builder(Metrics.builder(true).build()).content("the answer").build(); + judge.evaluateMessages( + Arrays.asList(new Message(Message.Role.SYSTEM, "be helpful"), new Message(Message.Role.USER, "hi")), + response); + assertThat(captured.get(), + is("MESSAGE HISTORY:\nsystem: be helpful\nuser: hi\n\nRESPONSE TO EVALUATE:\nthe answer")); + } + + @Test + public void normalizeSampleRateClampsAndDefaults() { + 
assertThat(Judge.normalizeSampleRate(-0.5), is(0.0)); + assertThat(Judge.normalizeSampleRate(2.0), is(1.0)); + assertThat(Judge.normalizeSampleRate(Double.NaN), is(1.0)); + assertThat(Judge.normalizeSampleRate(Double.POSITIVE_INFINITY), is(1.0)); + assertThat(Judge.normalizeSampleRate(0.42), is(0.42)); + } +} diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java index 7ba4798e..0ab4f63d 100644 --- a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java +++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/LDAIClientImplTest.java @@ -9,9 +9,11 @@ import static org.hamcrest.Matchers.notNullValue; import static org.hamcrest.Matchers.nullValue; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyDouble; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -24,6 +26,7 @@ import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Mode; import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message; import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Model; +import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.Metrics; import com.launchdarkly.sdk.server.interfaces.LDClientInterface; import java.util.ArrayList; @@ -320,4 +323,57 @@ public void eachCreateTrackerCallStartsANewRun() { assertThat(runA, is(notNullValue())); assertThat(runA.equals(runB), is(false)); } + + // ---- createJudge ---------------------------------------------------------- + + private static final String JUDGE_JSON = + "{\"_ldMeta\":{\"enabled\":true,\"mode\":\"judge\"},\"evaluationMetricKeys\":[\"relevance\"]}"; + + private static 
Runner stubRunner() { + return input -> RunnerResult.builder(Metrics.builder(true).build()) + .parsed(LDValue.buildObject().put("score", 0.5).put("reasoning", "r").build()) + .build(); + } + + @Test + public void createJudgeFiresOnlyCreateJudgeUsageEvent() { + when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(LDValue.parse(JUDGE_JSON)); + + ai.createJudge("judge-key", context, null, null, stubRunner(), 1.0); + + verify(client).trackMetric(eq("$ld:ai:usage:create-judge"), eq(context), eq(LDValue.of("judge-key")), eq(1.0)); + verify(client, never()).trackMetric(eq("$ld:ai:usage:judge-config"), any(), any(), anyDouble()); + } + + @Test + public void createJudgeReturnsJudgeForEnabledConfig() { + when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(LDValue.parse(JUDGE_JSON)); + + Runner runner = stubRunner(); + Judge judge = ai.createJudge("judge-key", context, null, null, runner, 1.0); + + assertThat(judge, is(notNullValue())); + assertThat(judge.getAIConfig().getKey(), is("judge-key")); + assertThat(judge.getAIConfig().getEvaluationMetricKey(), is("relevance")); + assertThat(judge.getRunner(), is(runner)); + } + + @Test + public void createJudgeReturnsNullWhenDisabled() { + String disabled = "{\"_ldMeta\":{\"enabled\":false,\"mode\":\"judge\"}}"; + when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(LDValue.parse(disabled)); + + Judge judge = ai.createJudge("judge-key", context, null, null, stubRunner(), 1.0); + assertThat(judge, is(nullValue())); + } + + @Test + public void createJudgeReturnsNullWhenNoRunner() { + when(client.jsonValueVariation(anyString(), any(), any())).thenReturn(LDValue.parse(JUDGE_JSON)); + + Judge judge = ai.createJudge("judge-key", context, null, null, null, 1.0); + assertThat(judge, is(nullValue())); + // The usage event still fires before the runner check. 
+ verify(client).trackMetric(eq("$ld:ai:usage:create-judge"), eq(context), eq(LDValue.of("judge-key")), eq(1.0)); + } } From f6d4a4ca09f578ba78852233399d6d494c81e6f3 Mon Sep 17 00:00:00 2001 From: Clifford Tawiah Date: Wed, 10 Jun 2026 22:44:23 -0400 Subject: [PATCH 2/2] refactor: make Evaluator package-private (not public API in v1.0) Co-authored-by: Cursor --- lib/sdk/server-ai/README.md | 4 ---- .../main/java/com/launchdarkly/sdk/server/ai/Evaluator.java | 4 +++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/sdk/server-ai/README.md b/lib/sdk/server-ai/README.md index 971bb43c..d6afbb44 100644 --- a/lib/sdk/server-ai/README.md +++ b/lib/sdk/server-ai/README.md @@ -78,10 +78,6 @@ if (judge != null) { } ``` -`Evaluator` runs several judges over the same input/output with per-judge fault isolation and a -per-judge timeout, returning one `JudgeResult` per judge in order. `Evaluator.noop()` returns an -empty result list. - ## Internal API convention Public, supported types live directly under `com.launchdarkly.sdk.server.ai` (and its documented diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java index 4df1d6e1..8e53f024 100644 --- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java @@ -26,8 +26,10 @@ *

<p>
* The evaluator does not record results; recording the returned {@link JudgeResult}s (for example * via a tracker) is the caller's responsibility. Instances are immutable and thread-safe. + *
<p>
+ * This type is not part of the public API in v1.0 and may change without notice. */ -public final class Evaluator { +final class Evaluator { /** * Default per-judge timeout used when one is not supplied. */