From 841f094dd17be2c36a44b09877c203a23224b65d Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Fri, 26 Jun 2026 13:17:24 -0600 Subject: [PATCH 1/3] eval listener + otel eval listener --- .../main/java/dev/braintrust/eval/Eval.java | 377 +++++++---------- .../dev/braintrust/eval/EvalListener.java | 43 ++ .../dev/braintrust/eval/OtelEvalListener.java | 381 ++++++++++++++++++ .../eval/DatasetBrainstoreImplTest.java | 41 ++ 4 files changed, 619 insertions(+), 223 deletions(-) create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java index a9c3d06b..4483f062 100644 --- a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java @@ -1,21 +1,16 @@ package dev.braintrust.eval; -import static dev.braintrust.json.BraintrustJsonMapper.toJson; - import dev.braintrust.BraintrustUtils; import dev.braintrust.api.BraintrustApiClient; import dev.braintrust.api.BraintrustOpenApiClient; import dev.braintrust.config.BraintrustConfig; +import dev.braintrust.eval.EvalListener.CaseListener; +import dev.braintrust.eval.EvalListener.RunListener; import dev.braintrust.openapi.api.ExperimentsApi; import dev.braintrust.openapi.model.CreateExperiment; import dev.braintrust.openapi.model.Project; import dev.braintrust.trace.BrainstoreTrace; -import dev.braintrust.trace.BraintrustContext; import dev.braintrust.trace.BraintrustTracing; -import io.opentelemetry.api.common.AttributeKey; -import io.opentelemetry.api.trace.Span; -import io.opentelemetry.api.trace.SpanKind; -import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.api.trace.Tracer; import java.util.*; import java.util.function.Function; @@ -31,14 +26,11 @@ */ @Slf4j public final class Eval { - private static final AttributeKey PARENT = - AttributeKey.stringKey(BraintrustTracing.PARENT_KEY); private final @Nonnull String experimentName; private final @Nonnull BraintrustConfig config; private final @Nonnull BraintrustOpenApiClient client; private final @Nonnull Project project; private final @Nonnull BraintrustOpenApiClient.OrgInfo orgInfo; - private final @Nonnull Tracer tracer; private final @Nonnull Dataset dataset; private final @Nonnull Task task; private final @Nonnull List> scorers; @@ -47,6 +39,18 @@ public final class Eval { private final @Nonnull Map metadata; private final @Nonnull Parameters parameters; + /** + * All listeners attached to this eval, including the built-in {@link OtelEvalListener} (always + * first) which manages the OTel spans. + */ + private final @Nonnull List listeners; + + /** + * Typed reference to the built-in OTel listener (also present in {@link #listeners}). Kept so + * we can pull span-derived info — e.g. the per-case {@link BrainstoreTrace} — back out of it. + */ + private final @Nonnull OtelEvalListener otelListener; + private Eval(Builder builder) { this.experimentName = builder.experimentName; this.config = Objects.requireNonNull(builder.config); @@ -55,7 +59,6 @@ private Eval(Builder builder) { client.fetchOrCreateProject( builder.projectId, config.defaultProjectName().orElse(null)); this.orgInfo = client.fetchOrgInfo(project.getOrgId().toString()); - this.tracer = Objects.requireNonNull(builder.tracer); this.dataset = builder.dataset; this.task = Objects.requireNonNull(builder.task); this.scorers = List.copyOf(builder.scorers); @@ -63,6 +66,12 @@ private Eval(Builder builder) { this.tags = List.copyOf(builder.tags); this.metadata = Map.copyOf(builder.metadata); this.parameters = builder.buildParameters(); + this.otelListener = new OtelEvalListener(Objects.requireNonNull(builder.tracer), client); + // built-in OTel listener runs first, then any user-supplied listeners + var allListeners = new ArrayList(); + allListeners.add(otelListener); + allListeners.addAll(builder.listeners); + this.listeners = List.copyOf(allListeners); } /** Runs the evaluation and returns results. */ @@ -88,8 +97,25 @@ public EvalResult run() { datasetVersion.ifPresent(createExperiment::datasetVersion); var experiment = new ExperimentsApi(client).postExperiment(createExperiment); + var experimentId = experiment.getId().toString(); + + // Create one RunListener per attached listener, tracking the built-in OTel run + // listener by identity so we can later pull the per-case BrainstoreTrace from it. + var runListeners = new ArrayList(listeners.size()); + OtelEvalListener.OtelRunListener otelRunListener = null; + for (var listener : listeners) { + var runListener = listener.createRunListener(experimentId); + if (listener == otelListener) { + otelRunListener = (OtelEvalListener.OtelRunListener) runListener; + } + runListeners.add(runListener); + } + final var otelRun = otelRunListener; - cursor.forEach(datasetCase -> evalOne(experiment.getId().toString(), datasetCase)); + runListeners.forEach(runListener -> runListener.onStart(experimentId)); + cursor.forEach( + datasetCase -> evalOne(experimentId, datasetCase, runListeners, otelRun)); + runListeners.forEach(RunListener::onEnd); } var experimentUrl = @@ -102,136 +128,79 @@ public EvalResult run() { return new EvalResult(experimentUrl); } - private void evalOne(String experimentId, DatasetCase datasetCase) { - var rootSpan = - tracer.spanBuilder("eval") // TODO: allow names for eval cases - .setNoParent() // each eval case is its own trace - .setSpanKind(SpanKind.CLIENT) - .setAttribute(PARENT, "experiment_id:" + experimentId) - .setAttribute("braintrust.span_attributes", toJson(Map.of("type", "eval"))) - .setAttribute( - "braintrust.input_json", - toJson(Map.of("input", datasetCase.input()))) - .setAttribute("braintrust.expected", toJson(datasetCase.expected())) - .startSpan(); - if (datasetCase.origin().isPresent()) { - rootSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get())); - } - if (!datasetCase.tags().isEmpty()) { - rootSpan.setAttribute( - AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags()); - } - if (!datasetCase.metadata().isEmpty()) { - rootSpan.setAttribute( - AttributeKey.stringKey("braintrust.metadata"), toJson(datasetCase.metadata())); - } - try (var rootScope = BraintrustContext.ofExperiment(experimentId, rootSpan).makeCurrent()) { + private void evalOne( + String experimentId, + DatasetCase datasetCase, + List runListeners, + @Nullable OtelEvalListener.OtelRunListener otelRunListener) { + // Create one CaseListener per RunListener, tracking the OTel one by identity so we can + // pull the BrainstoreTrace from it later. + var caseListeners = new ArrayList(runListeners.size()); + OtelEvalListener.OtelCaseListener otelCase = null; + for (var runListener : runListeners) { + var caseListener = runListener.createCaseListener(datasetCase); + if (runListener == otelRunListener) { + otelCase = (OtelEvalListener.OtelCaseListener) caseListener; + } + caseListeners.add(caseListener); + } + + caseListeners.forEach(CaseListener::onStart); + try { + // run task + caseListeners.forEach(cl -> cl.onTaskStart(experimentId, datasetCase)); final TaskResult taskResult; - final String taskSpanId; - { // run task - var taskSpan = - tracer.spanBuilder("task") - .setAttribute(PARENT, "experiment_id:" + experimentId) - .setAttribute( - "braintrust.span_attributes", - toJson(Map.of("type", "task"))) - .startSpan(); - taskSpanId = taskSpan.getSpanContext().getSpanId(); - try (var unused = - BraintrustContext.ofExperiment(experimentId, taskSpan).makeCurrent()) { - taskResult = task.apply(datasetCase, parameters); - rootSpan.setAttribute( - "braintrust.output_json", - toJson(Map.of("output", taskResult.result()))); - } catch (Exception e) { - taskSpan.setStatus(StatusCode.ERROR, e.getMessage()); - taskSpan.recordException(e); - taskSpan.end(); - rootSpan.setStatus(StatusCode.ERROR, e.getMessage()); - rootSpan.setAttribute( - "braintrust.output_json", - toJson(Collections.singletonMap("output", null))); - log.debug("Task threw exception for input: " + datasetCase.input(), e); - // run scoreForTaskException on each scorer - for (var scorer : scorers) { - runScoreForTaskException(experimentId, rootSpan, scorer, e, datasetCase); - } - return; + try { + taskResult = task.apply(datasetCase, parameters); + } catch (Exception e) { + caseListeners.forEach(cl -> cl.onTaskError(experimentId, datasetCase, e)); + log.debug("Task threw exception for input: " + datasetCase.input(), e); + // run scoreForTaskException on each scorer; classifiers are skipped + for (var scorer : scorers) { + runScoreForTaskException(caseListeners, scorer, e, datasetCase); } - taskSpan.end(); + return; } + caseListeners.forEach(cl -> cl.onTaskEnd(experimentId, taskResult)); - // Create a single BrainstoreTrace for this eval case, shared across all scorers. - // It fetches spans lazily on first access (only if a TracedScorer actually calls it). - // We wait specifically for the task span to appear, which guarantees its children - // (LLM spans, tool spans) have also been indexed — since children end before parents. - var rootTraceId = rootSpan.getSpanContext().getTraceId(); - var trace = - BrainstoreTrace.forExperiment( - client, experimentId, rootTraceId, List.of(taskSpanId)); + // A single BrainstoreTrace for this eval case, shared across all scorers/classifiers. + // It fetches spans lazily on first access (only if a traced scorer/classifier calls + // it). Owned by the OTel listener since it is derived from span ids. + BrainstoreTrace trace = otelCase != null ? otelCase.brainstoreTrace() : null; - // run scorers - one span per scorer + // run scorers for (var scorer : scorers) { - runScorer(experimentId, rootSpan, scorer, taskResult, trace); + runScorer(caseListeners, scorer, taskResult, trace); } - // run classifiers - one span per classifier. Classifier exceptions are non-fatal: - // they are recorded on the classifier span and surfaced in the root span's metadata - // under `classifier_errors`, but do not abort the eval or affect other classifiers/ - // scorers. Classifiers only run when the task succeeded (no scoreForTaskException - // analogue). - if (!classifiers.isEmpty()) { - Map>> caseClassifications = new LinkedHashMap<>(); - Map classifierErrors = new LinkedHashMap<>(); - for (int i = 0; i < classifiers.size(); i++) { - var classifier = classifiers.get(i); - var classifierName = classifier.getName(); - if (classifierName == null || classifierName.isBlank()) { - classifierName = "classifier_" + i; - } - runClassifier( - experimentId, - classifier, - classifierName, - taskResult, - trace, - caseClassifications, - classifierErrors); - } - if (!caseClassifications.isEmpty()) { - rootSpan.setAttribute( - "braintrust.classifications", toJson(caseClassifications)); - } - if (!classifierErrors.isEmpty()) { - Map mergedMetadata = - new LinkedHashMap<>(datasetCase.metadata()); - mergedMetadata.put("classifier_errors", classifierErrors); - rootSpan.setAttribute( - AttributeKey.stringKey("braintrust.metadata"), toJson(mergedMetadata)); - } + // run classifiers. Classifier exceptions are non-fatal: they are recorded on the + // classifier span and surfaced in the root span's metadata under `classifier_errors`, + // but do not abort the eval or affect other classifiers/scorers. Classifiers only run + // when the task succeeded (no scoreForTaskException analogue). + for (var classifier : classifiers) { + runClassifier(caseListeners, classifier, taskResult, trace); } } finally { - rootSpan.end(); + caseListeners.forEach(CaseListener::onEnd); } } /** * Runs a scorer against a successful task result. If the scorer is a {@link TracedScorer}, it * receives the {@link BrainstoreTrace} for the eval case. If the scorer throws, falls back to - * {@link Scorer#scoreForScorerException}. + * {@link Scorer#scoreForScorerException}. The {@code onScoreEnd} event is always dispatched (so + * the OTel listener can end its span) even when score validation aborts the eval. */ private void runScorer( - String experimentId, - Span rootSpan, + List caseListeners, Scorer scorer, TaskResult taskResult, - BrainstoreTrace trace) { - var scoreSpan = - tracer.spanBuilder("score") - .setAttribute(PARENT, "experiment_id:" + experimentId) - .startSpan(); - try (var unused = BraintrustContext.ofExperiment(experimentId, scoreSpan).makeCurrent()) { - List scores; + @Nullable BrainstoreTrace trace) { + caseListeners.forEach(cl -> cl.onScoreStart(scorer)); + List scores = List.of(); + Exception scoreException = null; + RuntimeException pending = null; + try { try { if (scorer instanceof TracedScorer tracedScorer) { scores = tracedScorer.score(taskResult, trace); @@ -239,142 +208,97 @@ private void runScorer( scores = scorer.score(taskResult); } } catch (Exception e) { - scoreSpan.setStatus(StatusCode.ERROR, e.getMessage()); - scoreSpan.recordException(e); + scoreException = e; log.debug("Scorer '{}' threw exception", scorer.getName(), e); // fall back to scoreForScorerException — if this throws, eval aborts scores = scorer.scoreForScorerException(e, taskResult); } - recordScores(scoreSpan, rootSpan, scorer, scores); + validateScores(scorer, scores); + } catch (RuntimeException re) { + // validation (or a throwing fallback) aborts the eval; record nothing for this score + pending = re; + scores = List.of(); } finally { - scoreSpan.end(); + final var finalScores = scores; + final var finalException = scoreException; + caseListeners.forEach(cl -> cl.onScoreEnd(scorer, finalScores, finalException)); + } + if (pending != null) { + throw pending; } } /** - * Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback throws, the - * eval aborts. + * Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback (or score + * validation) throws, the eval aborts — but the {@code onScoreEnd} event is still dispatched. */ private void runScoreForTaskException( - String experimentId, - Span rootSpan, + List caseListeners, Scorer scorer, Exception taskException, DatasetCase datasetCase) { - var scoreSpan = - tracer.spanBuilder("score") - .setAttribute(PARENT, "experiment_id:" + experimentId) - .startSpan(); - try (var unused = BraintrustContext.ofExperiment(experimentId, scoreSpan).makeCurrent()) { - // if this throws, it propagates and the eval aborts - var scores = scorer.scoreForTaskException(taskException, datasetCase); - recordScores(scoreSpan, rootSpan, scorer, scores); + caseListeners.forEach(cl -> cl.onScoreStart(scorer)); + List scores = List.of(); + RuntimeException pending = null; + try { + scores = scorer.scoreForTaskException(taskException, datasetCase); + validateScores(scorer, scores); + } catch (RuntimeException re) { + pending = re; + scores = List.of(); } finally { - scoreSpan.end(); + final var finalScores = scores; + caseListeners.forEach(cl -> cl.onScoreEnd(scorer, finalScores, null)); + } + if (pending != null) { + throw pending; } } /** - * Runs a classifier inside its own span. Exceptions are recorded on the classifier span and - * surfaced via {@code classifierErrors}; they do not propagate. + * Runs a classifier. Exceptions are non-fatal: they are surfaced to listeners via the {@code + * classifierException} argument of {@code onClassifierEnd} and do not propagate. */ private void runClassifier( - String experimentId, + List caseListeners, Classifier classifier, - String resolvedName, TaskResult taskResult, - BrainstoreTrace trace, - Map>> caseClassifications, - Map classifierErrors) { - var classifierSpan = - tracer.spanBuilder(resolvedName) - .setAttribute(PARENT, "experiment_id:" + experimentId) - .startSpan(); - try (var unused = - BraintrustContext.ofExperiment(experimentId, classifierSpan).makeCurrent()) { - Map spanAttrs = new LinkedHashMap<>(); - spanAttrs.put("type", "classifier"); - spanAttrs.put("name", resolvedName); - spanAttrs.put("purpose", "scorer"); - classifierSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs)); - - List classifications; - try { - if (classifier instanceof TracedClassifier tracedClassifier) { - classifications = tracedClassifier.classify(taskResult, trace); - } else { - classifications = classifier.classify(taskResult); - } - if (classifications == null) { - classifications = List.of(); - } - } catch (Exception e) { - classifierSpan.setStatus(StatusCode.ERROR, e.getMessage()); - classifierSpan.recordException(e); - log.debug("Classifier '{}' threw exception", resolvedName, e); - classifierErrors.put( - resolvedName, e.getMessage() == null ? e.toString() : e.getMessage()); - return; + @Nullable BrainstoreTrace trace) { + caseListeners.forEach(cl -> cl.onClassifierStart(classifier)); + List classifications = List.of(); + Exception classifierException = null; + try { + if (classifier instanceof TracedClassifier tracedClassifier) { + classifications = tracedClassifier.classify(taskResult, trace); + } else { + classifications = classifier.classify(taskResult); } - - // Group results by resolved item name (item.name, falling back to the classifier - // name when blank). Same map is logged to the classifier span and merged into the - // per-case aggregate logged on the root span. - Map>> outputByName = new LinkedHashMap<>(); - for (var item : classifications) { - var itemName = item.name(); - if (itemName == null || itemName.isBlank()) { - itemName = resolvedName; - } - var itemMap = toClassificationItem(item); - outputByName.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap); - caseClassifications.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap); + if (classifications == null) { + classifications = List.of(); } - classifierSpan.setAttribute("braintrust.output_json", toJson(outputByName)); - } finally { - classifierSpan.end(); - } + } catch (Exception e) { + classifierException = e; + classifications = List.of(); + log.debug("Classifier '{}' threw exception", classifier.getName(), e); + } + final var finalClassifications = classifications; + final var finalException = classifierException; + caseListeners.forEach( + cl -> cl.onClassifierEnd(classifier, finalClassifications, finalException)); } - /** - * Converts a {@link Classification} to the wire-format {@code ClassificationItem}: drops {@code - * name}, includes {@code label} and {@code metadata} only when present. - */ - private static Map toClassificationItem(Classification c) { - Map m = new LinkedHashMap<>(); - m.put("id", c.id()); - if (c.label() != null) { - m.put("label", c.label()); - } - if (c.metadata() != null) { - m.put("metadata", c.metadata()); - } - return m; - } - - /** Validates and records scores on the score span and root span. */ - private void recordScores( - Span scoreSpan, Span rootSpan, Scorer scorer, List scores) { - if (scores == null || scores.isEmpty()) { + /** Validates that every score value is between 0 and 1 inclusive. Throws (aborting) if not. */ + private void validateScores(Scorer scorer, @Nullable List scores) { + if (scores == null) { return; } - final Map scorerScores = new LinkedHashMap<>(); for (var score : scores) { if (score.value() < 0.0 || score.value() > 1.0) { throw new RuntimeException( "score must be between 0 and 1: %s : %s" .formatted(scorer.getName(), score)); } - scorerScores.put(score.name(), score.value()); - } - Map spanAttrs = new LinkedHashMap<>(); - spanAttrs.put("type", "score"); - spanAttrs.put("name", scorer.getName()); - spanAttrs.put("purpose", "scorer"); - scoreSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs)); - var scoresJson = toJson(scorerScores); - scoreSpan.setAttribute("braintrust.output_json", scoresJson); - scoreSpan.setAttribute("braintrust.scores", scoresJson); + } } /** Creates a new eval builder. */ @@ -397,6 +321,7 @@ public static final class Builder { private @Nonnull Map parameterValues = Map.of(); private @Nonnull List tags = List.of(); private @Nonnull Map metadata = Map.of(); + private @Nonnull List listeners = new ArrayList<>(); public Eval build() { if (config == null) { @@ -515,6 +440,12 @@ public Builder tags(String... tags) { return this; } + /** Adds a listener which will be notified of eval lifecycle events. */ + public Builder addListener(@Nonnull EvalListener listener) { + this.listeners.add(Objects.requireNonNull(listener)); + return this; + } + /** Sets metadata for the experiment. */ public Builder metadata(Map metadata) { this.metadata = Map.copyOf(metadata); diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java new file mode 100644 index 00000000..401f3167 --- /dev/null +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java @@ -0,0 +1,43 @@ +package dev.braintrust.eval; + +import java.util.List; +import javax.annotation.Nullable; + +/** a listener which can be attached to an eval and hook specific events */ +public interface EvalListener { + RunListener createRunListener(String experimentId); + + /** a listener which receives events over the lifecycle of a single eval run */ + public interface RunListener { + void onStart(String experimentId); + + CaseListener createCaseListener(DatasetCase datasetCase); + + void onEnd(); + } + + /** a listener which receives events over the lifecycle of a single case of an eval run */ + public interface CaseListener { + void onStart(); + + void onTaskStart(String experimentId, DatasetCase datasetCase); + + void onTaskEnd(String experimentId, TaskResult taskResult); + + void onTaskError(String experimentId, DatasetCase datasetCase, Exception error); + + void onScoreStart(Scorer scorer); + + void onScoreEnd( + Scorer scorer, List scores, @Nullable Exception scoreException); + + void onClassifierStart(Classifier classifier); + + void onClassifierEnd( + Classifier classifier, + List classifications, + @Nullable Exception classifierException); + + void onEnd(); + } +} diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java new file mode 100644 index 00000000..e1e9083e --- /dev/null +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java @@ -0,0 +1,381 @@ +package dev.braintrust.eval; + +import static dev.braintrust.json.BraintrustJsonMapper.toJson; + +import dev.braintrust.api.BraintrustOpenApiClient; +import dev.braintrust.trace.BrainstoreTrace; +import dev.braintrust.trace.BraintrustContext; +import dev.braintrust.trace.BraintrustTracing; +import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanKind; +import io.opentelemetry.api.trace.StatusCode; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.context.Scope; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.extern.slf4j.Slf4j; + +/** + * Built-in {@link EvalListener} that manages all OpenTelemetry spans for an eval (root {@code eval} + * span, {@code task} span, {@code score} spans, and {@code classifier} spans), pushing/popping the + * braintrust OTel context across the start/end events so user code nests correctly. + * + *

This listener pushes spans onto the current OTel context on start events and pops them on the + * matching end events. That makes it inherently thread-affine and dependent on strictly nested + * (LIFO) start/end ordering — which holds because evals run sequentially per case on one thread. + */ +@Slf4j +final class OtelEvalListener implements EvalListener { + private static final AttributeKey PARENT = + AttributeKey.stringKey(BraintrustTracing.PARENT_KEY); + + private final @Nonnull Tracer tracer; + private final @Nonnull BraintrustOpenApiClient client; + + OtelEvalListener(@Nonnull Tracer tracer, @Nonnull BraintrustOpenApiClient client) { + this.tracer = tracer; + this.client = client; + } + + @Override + public OtelRunListener createRunListener(String experimentId) { + return new OtelRunListener(experimentId); + } + + /** Run-scoped listener. There is currently no run-level span; it only spawns case listeners. */ + final class OtelRunListener implements RunListener { + private final @Nonnull String experimentId; + + private OtelRunListener(@Nonnull String experimentId) { + this.experimentId = experimentId; + } + + @Override + public void onStart(String experimentId) {} + + @Override + public OtelCaseListener createCaseListener(DatasetCase datasetCase) { + return new OtelCaseListener(experimentId, datasetCase); + } + + @Override + public void onEnd() {} + } + + /** Case-scoped listener owning the root/task/score/classifier spans and their scopes. */ + final class OtelCaseListener implements CaseListener { + private final @Nonnull String experimentId; + private final @Nonnull DatasetCase datasetCase; + + private @Nullable Span rootSpan; + private @Nullable Scope rootScope; + private @Nullable String rootTraceId; + private @Nullable String taskSpanId; + + private @Nullable Span taskSpan; + private @Nullable Scope taskScope; + + private @Nullable Span scoreSpan; + private @Nullable Scope scoreScope; + + private int classifierIndex = 0; + private @Nullable Span classifierSpan; + private @Nullable Scope classifierScope; + private @Nullable String classifierName; + + // Accumulated classifier results, written onto the root span at case end. + private final Map>> caseClassifications = + new LinkedHashMap<>(); + private final Map classifierErrors = new LinkedHashMap<>(); + + private OtelCaseListener( + @Nonnull String experimentId, @Nonnull DatasetCase datasetCase) { + this.experimentId = experimentId; + this.datasetCase = datasetCase; + } + + @Override + public void onStart() { + var span = + tracer.spanBuilder("eval") // TODO: allow names for eval cases + .setNoParent() // each eval case is its own trace + .setSpanKind(SpanKind.CLIENT) + .setAttribute(PARENT, "experiment_id:" + experimentId) + .setAttribute( + "braintrust.span_attributes", toJson(Map.of("type", "eval"))) + .setAttribute( + "braintrust.input_json", + toJson(Map.of("input", datasetCase.input()))) + .setAttribute("braintrust.expected", toJson(datasetCase.expected())) + .startSpan(); + if (datasetCase.origin().isPresent()) { + span.setAttribute("braintrust.origin", toJson(datasetCase.origin().get())); + } + if (!datasetCase.tags().isEmpty()) { + span.setAttribute( + AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags()); + } + if (!datasetCase.metadata().isEmpty()) { + span.setAttribute( + AttributeKey.stringKey("braintrust.metadata"), + toJson(datasetCase.metadata())); + } + this.rootSpan = span; + this.rootTraceId = span.getSpanContext().getTraceId(); + this.rootScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent(); + } + + @Override + public void onTaskStart(String experimentId, DatasetCase datasetCase) { + var span = + tracer.spanBuilder("task") + .setAttribute(PARENT, "experiment_id:" + this.experimentId) + .setAttribute( + "braintrust.span_attributes", toJson(Map.of("type", "task"))) + .startSpan(); + this.taskSpan = span; + this.taskSpanId = span.getSpanContext().getSpanId(); + this.taskScope = BraintrustContext.ofExperiment(this.experimentId, span).makeCurrent(); + } + + @Override + public void onTaskEnd(String experimentId, TaskResult taskResult) { + requireRoot() + .setAttribute( + "braintrust.output_json", + toJson(Map.of("output", taskResult.result()))); + closeTaskScope(); + requireTask().end(); + } + + @Override + public void onTaskError( + String experimentId, DatasetCase datasetCase, Exception error) { + var task = requireTask(); + task.setStatus(StatusCode.ERROR, error.getMessage()); + task.recordException(error); + closeTaskScope(); + task.end(); + + var root = requireRoot(); + root.setStatus(StatusCode.ERROR, error.getMessage()); + var nullOutput = new LinkedHashMap(); + nullOutput.put("output", null); + root.setAttribute("braintrust.output_json", toJson(nullOutput)); + } + + @Override + public void onScoreStart(Scorer scorer) { + var span = + tracer.spanBuilder("score") + .setAttribute(PARENT, "experiment_id:" + experimentId) + .startSpan(); + this.scoreSpan = span; + this.scoreScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent(); + } + + @Override + public void onScoreEnd( + Scorer scorer, List scores, @Nullable Exception scoreException) { + var span = requireScore(); + try { + if (scoreException != null) { + span.setStatus(StatusCode.ERROR, scoreException.getMessage()); + span.recordException(scoreException); + } + recordScores(span, requireRoot(), scorer, scores); + } finally { + closeScoreScope(); + span.end(); + } + } + + @Override + public void onClassifierStart(Classifier classifier) { + var resolvedName = classifier.getName(); + if (resolvedName == null || resolvedName.isBlank()) { + resolvedName = "classifier_" + classifierIndex; + } + classifierIndex++; + this.classifierName = resolvedName; + + var span = + tracer.spanBuilder(resolvedName) + .setAttribute(PARENT, "experiment_id:" + experimentId) + .startSpan(); + Map spanAttrs = new LinkedHashMap<>(); + spanAttrs.put("type", "classifier"); + spanAttrs.put("name", resolvedName); + spanAttrs.put("purpose", "scorer"); + span.setAttribute("braintrust.span_attributes", toJson(spanAttrs)); + + this.classifierSpan = span; + this.classifierScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent(); + } + + @Override + public void onClassifierEnd( + Classifier classifier, + List classifications, + @Nullable Exception classifierException) { + var span = requireClassifier(); + var resolvedName = classifierName; + try { + if (classifierException != null) { + span.setStatus(StatusCode.ERROR, classifierException.getMessage()); + span.recordException(classifierException); + classifierErrors.put( + resolvedName, + classifierException.getMessage() == null + ? classifierException.toString() + : classifierException.getMessage()); + return; + } + + // Group results by resolved item name (item.name, falling back to the classifier + // name when blank). Same map is logged to the classifier span and merged into the + // per-case aggregate logged on the root span. + Map>> outputByName = new LinkedHashMap<>(); + for (var item : classifications) { + var itemName = item.name(); + if (itemName == null || itemName.isBlank()) { + itemName = resolvedName; + } + var itemMap = toClassificationItem(item); + outputByName.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap); + caseClassifications + .computeIfAbsent(itemName, k -> new ArrayList<>()) + .add(itemMap); + } + span.setAttribute("braintrust.output_json", toJson(outputByName)); + } finally { + closeClassifierScope(); + span.end(); + } + } + + @Override + public void onEnd() { + var root = requireRoot(); + try { + if (!caseClassifications.isEmpty()) { + root.setAttribute("braintrust.classifications", toJson(caseClassifications)); + } + if (!classifierErrors.isEmpty()) { + Map mergedMetadata = + new LinkedHashMap<>(datasetCase.metadata()); + mergedMetadata.put("classifier_errors", classifierErrors); + root.setAttribute( + AttributeKey.stringKey("braintrust.metadata"), toJson(mergedMetadata)); + } + } finally { + closeRootScope(); + root.end(); + } + } + + /** + * Builds the {@link BrainstoreTrace} for this case from the root trace id and the task span + * id. Must be called after {@link #onTaskEnd}. + */ + BrainstoreTrace brainstoreTrace() { + return BrainstoreTrace.forExperiment( + client, + experimentId, + requireNonNullState(rootTraceId, "rootTraceId"), + List.of(requireNonNullState(taskSpanId, "taskSpanId"))); + } + + private Span requireRoot() { + return requireNonNullState(rootSpan, "rootSpan"); + } + + private Span requireTask() { + return requireNonNullState(taskSpan, "taskSpan"); + } + + private Span requireScore() { + return requireNonNullState(scoreSpan, "scoreSpan"); + } + + private Span requireClassifier() { + return requireNonNullState(classifierSpan, "classifierSpan"); + } + + private void closeRootScope() { + if (rootScope != null) { + rootScope.close(); + rootScope = null; + } + } + + private void closeTaskScope() { + if (taskScope != null) { + taskScope.close(); + taskScope = null; + } + } + + private void closeScoreScope() { + if (scoreScope != null) { + scoreScope.close(); + scoreScope = null; + } + } + + private void closeClassifierScope() { + if (classifierScope != null) { + classifierScope.close(); + classifierScope = null; + } + } + } + + /** Records scores onto the score span and root span. Validation is the caller's job. */ + private static void recordScores( + Span scoreSpan, Span rootSpan, Scorer scorer, List scores) { + if (scores == null || scores.isEmpty()) { + return; + } + final Map scorerScores = new LinkedHashMap<>(); + for (var score : scores) { + scorerScores.put(score.name(), score.value()); + } + Map spanAttrs = new LinkedHashMap<>(); + spanAttrs.put("type", "score"); + spanAttrs.put("name", scorer.getName()); + spanAttrs.put("purpose", "scorer"); + scoreSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs)); + var scoresJson = toJson(scorerScores); + scoreSpan.setAttribute("braintrust.output_json", scoresJson); + scoreSpan.setAttribute("braintrust.scores", scoresJson); + } + + /** + * Converts a {@link Classification} to the wire-format {@code ClassificationItem}: drops {@code + * name}, includes {@code label} and {@code metadata} only when present. + */ + private static Map toClassificationItem(Classification c) { + Map m = new LinkedHashMap<>(); + m.put("id", c.id()); + if (c.label() != null) { + m.put("label", c.label()); + } + if (c.metadata() != null) { + m.put("metadata", c.metadata()); + } + return m; + } + + private static T requireNonNullState(@Nullable T value, String name) { + if (value == null) { + throw new IllegalStateException("OtelEvalListener: " + name + " accessed out of order"); + } + return value; + } +} diff --git a/braintrust-sdk/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java b/braintrust-sdk/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java index 40227f08..17d712a1 100644 --- a/braintrust-sdk/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java +++ b/braintrust-sdk/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java @@ -265,6 +265,47 @@ void testMetadataPopulatedFromDatasetRow() { assertEquals("user123", metadata.get("userId")); } + @Test + void testTagsPopulatedFromDatasetRow() { + wireMock.stubFor( + post(urlEqualTo("/v1/dataset/" + datasetId + "/fetch")) + .willReturn( + aResponse() + .withStatus(200) + .withHeader("Content-Type", "application/json") + .withBody( + """ + { + "events": [ + { + "object_type": "dataset", + "dataset_id": "%s", + "id": "meta-row-1", + "_xact_id": "1", + "created": "2024-01-01T00:00:00Z", + "input": "test input", + "expected": "test output", + "tags": ["unit-test"] + } + ], + "cursor": null + } + """ + .formatted(datasetId)))); + + DatasetBrainstoreImpl dataset = + new DatasetBrainstoreImpl<>(apiClient, datasetId, "test-version"); + + List> cases = new ArrayList<>(); + dataset.forEach(cases::add); + + assertEquals(1, cases.size()); + List tags = cases.get(0).tags(); + assertFalse(tags.isEmpty(), "tags should not be empty"); + assertEquals(1, tags.size()); + assertEquals("unit-test", tags.get(0)); + } + @Test void testFetchFromBraintrustNotFound() { String projectName = "test-project"; From e3730195d6eb55486643da9d34a58dd194535ffb Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Fri, 26 Jun 2026 16:30:56 -0600 Subject: [PATCH 2/3] wip --- braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java index 4483f062..a20f7861 100644 --- a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java @@ -308,7 +308,7 @@ public static Builder builder() { /** Builder for creating evaluations with fluent API. */ public static final class Builder { - public @Nonnull Dataset dataset; + private @Nonnull Dataset dataset; private @Nonnull String experimentName = "unnamed-java-eval"; private @Nullable BraintrustConfig config; private @Nullable BraintrustOpenApiClient apiClient; From 1ba64efe554ba4f44b38d9d1ac191f0254493fa0 Mon Sep 17 00:00:00 2001 From: Andrew Kent Date: Fri, 26 Jun 2026 17:06:41 -0600 Subject: [PATCH 3/3] wip 50/50 otel --- .../dev/braintrust/devserver/Devserver.java | 461 ++++-------------- .../devserver/PlaygroundSpanDecorator.java | 151 ++++++ .../main/java/dev/braintrust/eval/Eval.java | 290 ++++++----- .../dev/braintrust/eval/EvalListener.java | 76 ++- .../java/dev/braintrust/eval/EvalRunInfo.java | 24 + .../braintrust/eval/EvalSpanDecorator.java | 211 ++++++++ .../braintrust/eval/EvalTargetProvider.java | 33 ++ .../eval/ExperimentTargetProvider.java | 46 ++ .../dev/braintrust/eval/OtelEvalListener.java | 381 --------------- .../braintrust/devserver/DevserverTest.java | 8 +- 10 files changed, 812 insertions(+), 869 deletions(-) create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/devserver/PlaygroundSpanDecorator.java create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/EvalRunInfo.java create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/EvalSpanDecorator.java create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/EvalTargetProvider.java create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/ExperimentTargetProvider.java delete mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java diff --git a/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java b/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java index f83024af..d7f2906a 100644 --- a/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java +++ b/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java @@ -13,14 +13,7 @@ import dev.braintrust.api.BraintrustOpenApiClient; import dev.braintrust.config.BraintrustConfig; import dev.braintrust.eval.*; -import dev.braintrust.trace.BraintrustContext; -import dev.braintrust.trace.BraintrustTracing; -import io.opentelemetry.api.common.AttributeKey; import io.opentelemetry.api.trace.Span; -import io.opentelemetry.api.trace.SpanKind; -import io.opentelemetry.api.trace.StatusCode; -import io.opentelemetry.api.trace.Tracer; -import io.opentelemetry.context.Context; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -69,9 +62,6 @@ public class Devserver { private static final String EXPOSED_HEADERS = "x-bt-cursor, x-bt-found-existing-experiment, x-bt-span-id, x-bt-span-export"; - private static final AttributeKey PARENT = - AttributeKey.stringKey(BraintrustTracing.PARENT_KEY); - private final List corsOriginWhitelist; private final BraintrustConfig config; @@ -345,12 +335,12 @@ private void handleEval(HttpExchange exchange) throws IOException { } @SuppressWarnings({"unchecked", "rawtypes"}) - private void handleStreamingEval( + private void handleStreamingEval( HttpExchange exchange, RemoteEval eval, EvalRequest request, RequestContext context, - List> remoteScorers) + List> remoteScorers) throws Exception { // Set SSE headers exchange.getResponseHeaders().set("Content-Type", "text/event-stream"); @@ -377,174 +367,44 @@ private void handleStreamingEval( BraintrustUtils.createProjectURI( braintrust.config().appUrl(), orgName, projectName) .toASCIIString(); - final var experimentUrl = projectUrl + "/experiments/" + experimentName; - - var tracer = BraintrustTracing.getTracer(); - // Merge parameters: evaluator defaults + request overrides - final Parameters mergedParameters = - new Parameters( - eval.getParameters(), - null == request.getParameters() - ? Map.of() - : request.getParameters()); + // Combine local scorers from RemoteEval with remote scorers from the request + List> allScorers = new ArrayList<>(eval.getScorers()); + allScorers.addAll(remoteScorers); - // Execute task and scorers for each case - final Map> scoresByName = new ConcurrentHashMap<>(); final var parentInfo = extractParentInfo(request); - final var braintrustParent = parentInfo.braintrustParent(); - final var braintrustGeneration = parentInfo.generation(); - - // NOTE: this code is serial but written in a thread-safe manner to support - // concurrent dataset fetching and eval execution - extractDataset(request, apiClient) - .forEach( - rawDataset -> { - final DatasetCase datasetCase = - (DatasetCase) rawDataset; - var evalSpan = - tracer.spanBuilder("eval") - .setNoParent() - .setSpanKind(SpanKind.CLIENT) - .setAttribute( - PARENT, - braintrustParent.toParentValue()) - .startSpan(); - Context evalContext = Context.current().with(evalSpan); - evalContext = - BraintrustContext.setParentInBaggage( - evalContext, - braintrustParent.type(), - braintrustParent.id()); - // Make the eval context (with span and baggage) current - try (var rootScope = evalContext.makeCurrent()) { - final TaskResult taskResult; - { // run task - var taskSpan = tracer.spanBuilder("task").startSpan(); - try (var unused = - Context.current() - .with(taskSpan) - .makeCurrent()) { - var task = eval.getTask(); - try { - taskResult = - task.apply( - datasetCase, mergedParameters); - } catch (Exception e) { - taskSpan.setStatus( - StatusCode.ERROR, e.getMessage()); - taskSpan.recordException(e); - taskSpan.end(); - evalSpan.setStatus( - StatusCode.ERROR, e.getMessage()); - evalSpan.setAttribute( - "braintrust.output_json", - toJson( - Collections.singletonMap( - "output", null))); - log.debug( - "Task threw exception for input: " - + datasetCase.input(), - e); - // Set eval span attributes so Braintrust can - // resolve the trace - setEvalSpanAttributesForError( - evalSpan, - braintrustParent, - braintrustGeneration, - datasetCase); - // Send progress event even on error so the - // Playground can link to the trace - sendProgressEvent( - os, - evalSpan.getSpanContext().getSpanId(), - datasetCase.origin(), - eval.getName(), - null); - // run scoreForTaskException on each scorer - List> allScorersForError = - new ArrayList<>(eval.getScorers()); - allScorersForError.addAll(remoteScorers); - for (var scorer : allScorersForError) { - runScoreForTaskException( - tracer, - evalSpan, - braintrustParent, - braintrustGeneration, - scorer, - e, - datasetCase, - scoresByName); - } - return; - } - // Send progress event for task completion - sendProgressEvent( - os, - evalSpan.getSpanContext().getSpanId(), - datasetCase.origin(), - eval.getName(), - taskResult.result()); - setTaskSpanAttributes( - taskSpan, - braintrustParent, - braintrustGeneration, - datasetCase, - taskResult); - } finally { - taskSpan.end(); - } - // setting eval span attributes here because we need the - // task output - setEvalSpanAttributes( - evalSpan, - braintrustParent, - braintrustGeneration, - datasetCase, - taskResult); - } - // run scorers - one score span per scorer - // Combine local scorers from RemoteEval with remote scorers - // from request - List> allScorers = - new ArrayList<>(eval.getScorers()); - allScorers.addAll(remoteScorers); - for (var scorer : allScorers) { - runScorer( - tracer, - evalSpan, - braintrustParent, - braintrustGeneration, - scorer, - taskResult, - scoresByName); - } - } catch (IOException e) { - throw new RuntimeException( - "Failed to send progress event", e); - } finally { - evalSpan.end(); - } - }); - - // Aggregate scores - Map scoreSummaries = new LinkedHashMap<>(); - for (Map.Entry> entry : scoresByName.entrySet()) { - String scoreName = entry.getKey(); - List values = entry.getValue(); - - double avgScore = - values.stream().mapToDouble(Double::doubleValue).average().orElse(0.0); - - scoreSummaries.put( - scoreName, - EvalResponse.ScoreSummary.builder() - .name(scoreName) - .score(avgScore) - .improvements(0) - .regressions(0) - .build()); - } + // The playground targets a playground_id parent (no experiment is created) and + // weaves the request's generation into span attributes. + EvalTargetProvider playgroundTarget = + ctx -> + new EvalRunInfo( + parentInfo.braintrustParent(), + parentInfo.generation(), + null, + null, + false); + + var sseListener = new SseEvalListener(os, eval.getName()); + + Eval.builder() + .name(experimentName) + .config(braintrust.config()) + .apiClient(apiClient) + .projectId(projectId) + .dataset((Dataset) extractDataset(request, apiClient)) + .task((Task) eval.getTask()) + .scorers(allScorers.toArray(new Scorer[0])) + .parameters(eval.getParameters()) + .parameterValues( + request.getParameters() == null + ? Map.of() + : request.getParameters()) + .evalTargetProvider(playgroundTarget) + .clearListeners() + .addListener(new PlaygroundSpanDecorator()) + .addListener(sseListener) + .build() + .run(); sendSummaryEvent( os, @@ -552,8 +412,8 @@ private void handleStreamingEval( projectId, experimentName, projectUrl, - experimentUrl, - scoreSummaries); + null, + sseListener.scoreSummaries()); sendDoneEvent(os); } catch (Exception e) { // Send error event via SSE @@ -577,194 +437,87 @@ private void handleStreamingEval( } } - private void setEvalSpanAttributes( - Span evalSpan, - BraintrustUtils.Parent braintrustParent, - String braintrustGeneration, - DatasetCase datasetCase, - TaskResult taskResult) { - var spanAttrs = new LinkedHashMap<>(); - spanAttrs.put("type", "eval"); - spanAttrs.put("name", "eval"); - if (braintrustGeneration != null) { - spanAttrs.put("generation", braintrustGeneration); - } - evalSpan.setAttribute(PARENT, braintrustParent.toParentValue()) - .setAttribute("braintrust.span_attributes", toJson(spanAttrs)) - .setAttribute("braintrust.input_json", toJson(Map.of("input", datasetCase.input()))) - .setAttribute("braintrust.expected_json", toJson(datasetCase.expected())); - - if (datasetCase.origin().isPresent()) { - evalSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get())); - } - if (!datasetCase.tags().isEmpty()) { - evalSpan.setAttribute( - AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags()); - } - if (!datasetCase.metadata().isEmpty()) { - evalSpan.setAttribute("braintrust.metadata", toJson(datasetCase.metadata())); - } - evalSpan.setAttribute( - "braintrust.output_json", toJson(Map.of("output", taskResult.result()))); - } - /** - * Sets eval span attributes when the task threw an exception. Similar to {@link - * #setEvalSpanAttributes} but does not require a TaskResult. + * An {@link EvalListener} that streams playground SSE {@code progress} events (one per case, + * including on task error) and accumulates per-scorer averages for the {@code summary} event. + * Span decoration is handled separately by {@link PlaygroundSpanDecorator}. */ - private void setEvalSpanAttributesForError( - Span evalSpan, - BraintrustUtils.Parent braintrustParent, - String braintrustGeneration, - DatasetCase datasetCase) { - var spanAttrs = new LinkedHashMap<>(); - spanAttrs.put("type", "eval"); - spanAttrs.put("name", "eval"); - if (braintrustGeneration != null) { - spanAttrs.put("generation", braintrustGeneration); - } - evalSpan.setAttribute(PARENT, braintrustParent.toParentValue()) - .setAttribute("braintrust.span_attributes", toJson(spanAttrs)) - .setAttribute("braintrust.input_json", toJson(Map.of("input", datasetCase.input()))) - .setAttribute("braintrust.expected_json", toJson(datasetCase.expected())); - - if (datasetCase.origin().isPresent()) { - evalSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get())); - } - if (!datasetCase.tags().isEmpty()) { - evalSpan.setAttribute( - AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags()); + private final class SseEvalListener implements EvalListener { + private final OutputStream os; + private final String evalName; + private final Map> scoresByName = new ConcurrentHashMap<>(); + + SseEvalListener(OutputStream os, String evalName) { + this.os = os; + this.evalName = evalName; } - if (!datasetCase.metadata().isEmpty()) { - evalSpan.setAttribute("braintrust.metadata", toJson(datasetCase.metadata())); - } - } - private void setTaskSpanAttributes( - Span taskSpan, - BraintrustUtils.Parent braintrustParent, - String braintrustGeneration, - DatasetCase datasetCase, - TaskResult taskResult) { - Map taskSpanAttrs = new LinkedHashMap<>(); - taskSpanAttrs.put("type", "task"); - taskSpanAttrs.put("name", "task"); - if (braintrustGeneration != null) { - taskSpanAttrs.put("generation", braintrustGeneration); + Map scoreSummaries() { + Map scoreSummaries = new LinkedHashMap<>(); + for (var entry : scoresByName.entrySet()) { + double avgScore = + entry.getValue().stream() + .mapToDouble(Double::doubleValue) + .average() + .orElse(0.0); + scoreSummaries.put( + entry.getKey(), + EvalResponse.ScoreSummary.builder() + .name(entry.getKey()) + .score(avgScore) + .improvements(0) + .regressions(0) + .build()); + } + return scoreSummaries; } - taskSpan.setAttribute(PARENT, braintrustParent.toParentValue()) - .setAttribute("braintrust.span_attributes", toJson(taskSpanAttrs)) - .setAttribute("braintrust.input_json", toJson(Map.of("input", datasetCase.input()))) - .setAttribute( - "braintrust.output_json", toJson(Map.of("output", taskResult.result()))); - } - - private void setScoreSpanAttributes( - Span scoreSpan, - BraintrustUtils.Parent braintrustParent, - String braintrustGeneration, - String scorerName, - Map scorerScores) { - Map scoreSpanAttrs = new LinkedHashMap<>(); - scoreSpanAttrs.put("type", "score"); - scoreSpanAttrs.put("name", scorerName); - scoreSpanAttrs.put("purpose", "scorer"); - if (braintrustGeneration != null) { - scoreSpanAttrs.put("generation", braintrustGeneration); + @Override + public RunListener createRunListener(EvalRunInfo info) { + return datasetCase -> new SseCaseListener(); } - var scoresJson = toJson(scorerScores); - scoreSpan - .setAttribute(PARENT, braintrustParent.toParentValue()) - .setAttribute("braintrust.span_attributes", toJson(scoreSpanAttrs)) - .setAttribute("braintrust.output_json", scoresJson) - .setAttribute("braintrust.scores", scoresJson); - } + private final class SseCaseListener implements CaseListener { + @Override + public void onTaskSuccess(Span rootSpan, Span taskSpan, TaskResult taskResult) { + sendProgress(rootSpan, taskResult.datasetCase(), taskResult.result()); + } - /** - * Runs a scorer against a successful task result. If the scorer throws, falls back to {@link - * Scorer#scoreForScorerException}. - */ - private void runScorer( - Tracer tracer, - Span evalSpan, - BraintrustUtils.Parent braintrustParent, - String braintrustGeneration, - Scorer scorer, - TaskResult taskResult, - Map> scoresByName) { - var scoreSpan = tracer.spanBuilder("score").startSpan(); - try (var unused = Context.current().with(scoreSpan).makeCurrent()) { - List scores; - try { - scores = scorer.score(taskResult); - } catch (Exception e) { - scoreSpan.setStatus(StatusCode.ERROR, e.getMessage()); - scoreSpan.recordException(e); - log.debug("Scorer '{}' threw exception", scorer.getName(), e); - // fall back to scoreForScorerException — if this throws, eval aborts - scores = scorer.scoreForScorerException(e, taskResult); + @Override + public void onTaskError( + Span rootSpan, Span taskSpan, DatasetCase datasetCase, Exception error) { + // Send progress even on error so the Playground can link to the trace. + sendProgress(rootSpan, datasetCase, null); } - recordScores( - scoreSpan, - braintrustParent, - braintrustGeneration, - scorer, - scores, - scoresByName); - } finally { - scoreSpan.end(); - } - } - /** - * Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback throws, the - * eval aborts. - */ - private void runScoreForTaskException( - Tracer tracer, - Span evalSpan, - BraintrustUtils.Parent braintrustParent, - String braintrustGeneration, - Scorer scorer, - Exception taskException, - DatasetCase datasetCase, - Map> scoresByName) { - var scoreSpan = tracer.spanBuilder("score").startSpan(); - try (var unused = Context.current().with(scoreSpan).makeCurrent()) { - // if this throws, it propagates and the eval aborts - var scores = scorer.scoreForTaskException(taskException, datasetCase); - recordScores( - scoreSpan, - braintrustParent, - braintrustGeneration, - scorer, - scores, - scoresByName); - } finally { - scoreSpan.end(); - } - } + @Override + public void onScoreResult( + Span scoreSpan, + Span rootSpan, + Scorer scorer, + List scores, + @Nullable Exception scoreException) { + for (var score : scores) { + scoresByName + .computeIfAbsent(score.name(), k -> new ArrayList<>()) + .add(score.value()); + } + } - /** Records scores on the score span and accumulates them into scoresByName. */ - private void recordScores( - Span scoreSpan, - BraintrustUtils.Parent braintrustParent, - String braintrustGeneration, - Scorer scorer, - List scores, - Map> scoresByName) { - if (scores == null || scores.isEmpty()) { - return; - } - Map scorerScores = new LinkedHashMap<>(); - for (Score score : scores) { - scoresByName.computeIfAbsent(score.name(), k -> new ArrayList<>()).add(score.value()); - scorerScores.put(score.name(), score.value()); + private void sendProgress( + Span rootSpan, DatasetCase datasetCase, @Nullable Object output) { + try { + sendProgressEvent( + os, + rootSpan.getSpanContext().getSpanId(), + datasetCase.origin(), + evalName, + output); + } catch (IOException e) { + throw new RuntimeException("Failed to send progress event", e); + } + } } - setScoreSpanAttributes( - scoreSpan, braintrustParent, braintrustGeneration, scorer.getName(), scorerScores); } private void sendSSEEvent(OutputStream os, String eventType, String data) throws IOException { diff --git a/braintrust-sdk/src/main/java/dev/braintrust/devserver/PlaygroundSpanDecorator.java b/braintrust-sdk/src/main/java/dev/braintrust/devserver/PlaygroundSpanDecorator.java new file mode 100644 index 00000000..5b30dd46 --- /dev/null +++ b/braintrust-sdk/src/main/java/dev/braintrust/devserver/PlaygroundSpanDecorator.java @@ -0,0 +1,151 @@ +package dev.braintrust.devserver; + +import static dev.braintrust.json.BraintrustJsonMapper.toJson; + +import dev.braintrust.eval.Classifier; +import dev.braintrust.eval.DatasetCase; +import dev.braintrust.eval.EvalListener; +import dev.braintrust.eval.EvalRunInfo; +import dev.braintrust.eval.Score; +import dev.braintrust.eval.Scorer; +import dev.braintrust.eval.TaskResult; +import dev.braintrust.trace.BraintrustTracing; +import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import javax.annotation.Nullable; + +/** + * Playground variant of the span decorator. Mirrors {@link dev.braintrust.eval.EvalSpanDecorator} + * but emits the playground attribute shape: a {@code playground_id} parent, a {@code generation} + * woven into each {@code span_attributes}, a {@code name} on the eval/task span attributes, {@code + * braintrust.expected_json} (rather than {@code braintrust.expected}), and input/output on the task + * span. + * + *

Standalone (does not extend {@code EvalSpanDecorator}) so the two attribute shapes can evolve + * independently. + */ +final class PlaygroundSpanDecorator implements EvalListener { + private static final AttributeKey PARENT = + AttributeKey.stringKey(BraintrustTracing.PARENT_KEY); + + @Override + public RunListener createRunListener(EvalRunInfo info) { + return new RunListener() { + @Override + public CaseListener createCaseListener(DatasetCase datasetCase) { + return new Decorator(info); + } + }; + } + + private static final class Decorator implements CaseListener { + private final EvalRunInfo info; + + Decorator(EvalRunInfo info) { + this.info = info; + } + + private String parentValue() { + return info.parent().toParentValue(); + } + + private Map spanAttrs(String type, String name) { + var m = new LinkedHashMap(); + m.put("type", type); + m.put("name", name); + if (info.generation() != null) { + m.put("generation", info.generation()); + } + return m; + } + + @Override + public void onRootSpan(Span rootSpan, DatasetCase datasetCase) { + rootSpan.setAttribute(PARENT, parentValue()); + rootSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs("eval", "eval"))); + rootSpan.setAttribute( + "braintrust.input_json", toJson(Map.of("input", datasetCase.input()))); + rootSpan.setAttribute("braintrust.expected_json", toJson(datasetCase.expected())); + if (datasetCase.origin().isPresent()) { + rootSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get())); + } + if (!datasetCase.tags().isEmpty()) { + rootSpan.setAttribute( + AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags()); + } + if (!datasetCase.metadata().isEmpty()) { + rootSpan.setAttribute("braintrust.metadata", toJson(datasetCase.metadata())); + } + } + + @Override + public void onTaskSpan(Span taskSpan, DatasetCase datasetCase) { + taskSpan.setAttribute(PARENT, parentValue()); + taskSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs("task", "task"))); + taskSpan.setAttribute( + "braintrust.input_json", toJson(Map.of("input", datasetCase.input()))); + } + + @Override + public void onTaskSuccess(Span rootSpan, Span taskSpan, TaskResult taskResult) { + var output = toJson(Map.of("output", taskResult.result())); + taskSpan.setAttribute("braintrust.output_json", output); + rootSpan.setAttribute("braintrust.output_json", output); + } + + @Override + public void onTaskError( + Span rootSpan, Span taskSpan, DatasetCase datasetCase, Exception error) { + taskSpan.setStatus(StatusCode.ERROR, error.getMessage()); + taskSpan.recordException(error); + rootSpan.setStatus(StatusCode.ERROR, error.getMessage()); + rootSpan.setAttribute( + "braintrust.output_json", toJson(Collections.singletonMap("output", null))); + } + + @Override + public void onScoreSpan(Span scoreSpan, Scorer scorer) { + scoreSpan.setAttribute(PARENT, parentValue()); + } + + @Override + public void onScoreResult( + Span scoreSpan, + Span rootSpan, + Scorer scorer, + List scores, + @Nullable Exception scoreException) { + if (scoreException != null) { + scoreSpan.setStatus(StatusCode.ERROR, scoreException.getMessage()); + scoreSpan.recordException(scoreException); + } + if (scores == null || scores.isEmpty()) { + return; + } + var scorerScores = new LinkedHashMap(); + for (var score : scores) { + scorerScores.put(score.name(), score.value()); + } + var attrs = spanAttrs("score", scorer.getName()); + attrs.put("purpose", "scorer"); + scoreSpan.setAttribute("braintrust.span_attributes", toJson(attrs)); + var scoresJson = toJson(scorerScores); + scoreSpan.setAttribute("braintrust.output_json", scoresJson); + scoreSpan.setAttribute("braintrust.scores", scoresJson); + } + + @Override + public void onClassifierSpan( + Span classifierSpan, Classifier classifier, String resolvedName) { + classifierSpan.setAttribute(PARENT, parentValue()); + var attrs = spanAttrs("classifier", resolvedName); + attrs.put("purpose", "scorer"); + classifierSpan.setAttribute("braintrust.span_attributes", toJson(attrs)); + } + } +} diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java index a20f7861..5d1f6e97 100644 --- a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java @@ -6,12 +6,15 @@ import dev.braintrust.config.BraintrustConfig; import dev.braintrust.eval.EvalListener.CaseListener; import dev.braintrust.eval.EvalListener.RunListener; -import dev.braintrust.openapi.api.ExperimentsApi; -import dev.braintrust.openapi.model.CreateExperiment; import dev.braintrust.openapi.model.Project; import dev.braintrust.trace.BrainstoreTrace; +import dev.braintrust.trace.BraintrustContext; import dev.braintrust.trace.BraintrustTracing; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanKind; import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.context.Context; +import io.opentelemetry.context.Scope; import java.util.*; import java.util.function.Function; import javax.annotation.Nonnull; @@ -31,6 +34,7 @@ public final class Eval { private final @Nonnull BraintrustOpenApiClient client; private final @Nonnull Project project; private final @Nonnull BraintrustOpenApiClient.OrgInfo orgInfo; + private final @Nonnull Tracer tracer; private final @Nonnull Dataset dataset; private final @Nonnull Task task; private final @Nonnull List> scorers; @@ -38,19 +42,14 @@ public final class Eval { private final @Nonnull List tags; private final @Nonnull Map metadata; private final @Nonnull Parameters parameters; + private final @Nonnull EvalTargetProvider targetProvider; /** - * All listeners attached to this eval, including the built-in {@link OtelEvalListener} (always - * first) which manages the OTel spans. + * All listeners attached to this eval. {@link Eval} owns the spans; listeners decorate/observe + * them. By default this includes the built-in {@link EvalSpanDecorator}. */ private final @Nonnull List listeners; - /** - * Typed reference to the built-in OTel listener (also present in {@link #listeners}). Kept so - * we can pull span-derived info — e.g. the per-case {@link BrainstoreTrace} — back out of it. - */ - private final @Nonnull OtelEvalListener otelListener; - private Eval(Builder builder) { this.experimentName = builder.experimentName; this.config = Objects.requireNonNull(builder.config); @@ -59,6 +58,7 @@ private Eval(Builder builder) { client.fetchOrCreateProject( builder.projectId, config.defaultProjectName().orElse(null)); this.orgInfo = client.fetchOrgInfo(project.getOrgId().toString()); + this.tracer = Objects.requireNonNull(builder.tracer); this.dataset = builder.dataset; this.task = Objects.requireNonNull(builder.task); this.scorers = List.copyOf(builder.scorers); @@ -66,16 +66,13 @@ private Eval(Builder builder) { this.tags = List.copyOf(builder.tags); this.metadata = Map.copyOf(builder.metadata); this.parameters = builder.buildParameters(); - this.otelListener = new OtelEvalListener(Objects.requireNonNull(builder.tracer), client); - // built-in OTel listener runs first, then any user-supplied listeners - var allListeners = new ArrayList(); - allListeners.add(otelListener); - allListeners.addAll(builder.listeners); - this.listeners = List.copyOf(allListeners); + this.targetProvider = Objects.requireNonNull(builder.targetProvider); + this.listeners = List.copyOf(builder.listeners); } /** Runs the evaluation and returns results. */ public EvalResult run() { + final EvalRunInfo runInfo; try (var cursor = dataset.openCursor()) { Optional datasetVersion = Optional.empty(); Optional datasetId = Optional.empty(); @@ -84,123 +81,145 @@ public EvalResult run() { datasetId = Optional.of(dataset.id()); } - var createExperiment = - new CreateExperiment().projectId(project.getId()).name(experimentName); - - if (!tags.isEmpty()) { - createExperiment.tags(tags); - } - if (!metadata.isEmpty()) { - createExperiment.metadata(metadata); - } - datasetId.ifPresent(id -> createExperiment.datasetId(UUID.fromString(id))); - datasetVersion.ifPresent(createExperiment::datasetVersion); + runInfo = + targetProvider.create( + new EvalTargetProvider.Context( + config, + client, + project, + orgInfo, + experimentName, + tags, + metadata, + datasetId, + datasetVersion)); - var experiment = new ExperimentsApi(client).postExperiment(createExperiment); - var experimentId = experiment.getId().toString(); - - // Create one RunListener per attached listener, tracking the built-in OTel run - // listener by identity so we can later pull the per-case BrainstoreTrace from it. var runListeners = new ArrayList(listeners.size()); - OtelEvalListener.OtelRunListener otelRunListener = null; for (var listener : listeners) { - var runListener = listener.createRunListener(experimentId); - if (listener == otelListener) { - otelRunListener = (OtelEvalListener.OtelRunListener) runListener; - } - runListeners.add(runListener); + runListeners.add(listener.createRunListener(runInfo)); } - final var otelRun = otelRunListener; - - runListeners.forEach(runListener -> runListener.onStart(experimentId)); - cursor.forEach( - datasetCase -> evalOne(experimentId, datasetCase, runListeners, otelRun)); - runListeners.forEach(RunListener::onEnd); - } - - var experimentUrl = - "%s/experiments/%s" - .formatted( - BraintrustUtils.createProjectURI( - config.appUrl(), orgInfo.name(), project.getName()) - .toASCIIString(), - experimentName); - return new EvalResult(experimentUrl); + + runListeners.forEach(RunListener::onRunStart); + cursor.forEach(datasetCase -> evalOne(runInfo, datasetCase, runListeners)); + runListeners.forEach(RunListener::onRunEnd); + } + + return new EvalResult(runInfo.experimentUrl()); + } + + /** Makes {@code span} current with the braintrust parent set in baggage for child spans. */ + private Scope makeCurrent(Span span, BraintrustUtils.Parent parent) { + var ctx = Context.current().with(span); + ctx = BraintrustContext.setParentInBaggage(ctx, parent.type(), parent.id()); + return ctx.makeCurrent(); } private void evalOne( - String experimentId, + EvalRunInfo runInfo, DatasetCase datasetCase, - List runListeners, - @Nullable OtelEvalListener.OtelRunListener otelRunListener) { - // Create one CaseListener per RunListener, tracking the OTel one by identity so we can - // pull the BrainstoreTrace from it later. + List runListeners) { var caseListeners = new ArrayList(runListeners.size()); - OtelEvalListener.OtelCaseListener otelCase = null; for (var runListener : runListeners) { - var caseListener = runListener.createCaseListener(datasetCase); - if (runListener == otelRunListener) { - otelCase = (OtelEvalListener.OtelCaseListener) caseListener; + caseListeners.add(runListener.createCaseListener(datasetCase)); + } + var parent = runInfo.parent(); + + // Eval owns the span structure: create the root span (name only), then let listeners + // decorate it. + var rootSpan = + tracer.spanBuilder("eval") // TODO: allow names for eval cases + .setNoParent() // each eval case is its own trace + .setSpanKind(SpanKind.CLIENT) + .startSpan(); + for (var cl : caseListeners) { + cl.onRootSpan(rootSpan, datasetCase); + } + try (var rootScope = makeCurrent(rootSpan, parent)) { + TaskResult taskResult = null; + Exception taskError = null; + var taskSpan = tracer.spanBuilder("task").startSpan(); + final String taskSpanId = taskSpan.getSpanContext().getSpanId(); + for (var cl : caseListeners) { + cl.onTaskSpan(taskSpan, datasetCase); } - caseListeners.add(caseListener); - } - - caseListeners.forEach(CaseListener::onStart); - try { - // run task - caseListeners.forEach(cl -> cl.onTaskStart(experimentId, datasetCase)); - final TaskResult taskResult; - try { + try (var taskScope = makeCurrent(taskSpan, parent)) { taskResult = task.apply(datasetCase, parameters); + for (var cl : caseListeners) { + cl.onTaskSuccess(rootSpan, taskSpan, taskResult); + } } catch (Exception e) { - caseListeners.forEach(cl -> cl.onTaskError(experimentId, datasetCase, e)); - log.debug("Task threw exception for input: " + datasetCase.input(), e); - // run scoreForTaskException on each scorer; classifiers are skipped + taskError = e; + for (var cl : caseListeners) { + cl.onTaskError(rootSpan, taskSpan, datasetCase, e); + } + } + taskSpan.end(); + + if (taskError != null) { + log.debug("Task threw exception for input: " + datasetCase.input(), taskError); + // run scoreForTaskException on each scorer (score spans nest under the root span, + // since the task scope is now closed); classifiers are skipped for (var scorer : scorers) { - runScoreForTaskException(caseListeners, scorer, e, datasetCase); + runScoreForTaskException( + caseListeners, rootSpan, parent, scorer, taskError, datasetCase); } return; } - caseListeners.forEach(cl -> cl.onTaskEnd(experimentId, taskResult)); // A single BrainstoreTrace for this eval case, shared across all scorers/classifiers. // It fetches spans lazily on first access (only if a traced scorer/classifier calls - // it). Owned by the OTel listener since it is derived from span ids. - BrainstoreTrace trace = otelCase != null ? otelCase.brainstoreTrace() : null; + // it). Only available when targeting an experiment. + BrainstoreTrace trace = + runInfo.tracingSupported() + ? BrainstoreTrace.forExperiment( + client, + Objects.requireNonNull(runInfo.experimentId()), + rootSpan.getSpanContext().getTraceId(), + List.of(taskSpanId)) + : null; // run scorers for (var scorer : scorers) { - runScorer(caseListeners, scorer, taskResult, trace); + runScorer(caseListeners, rootSpan, parent, scorer, taskResult, trace); } // run classifiers. Classifier exceptions are non-fatal: they are recorded on the // classifier span and surfaced in the root span's metadata under `classifier_errors`, // but do not abort the eval or affect other classifiers/scorers. Classifiers only run // when the task succeeded (no scoreForTaskException analogue). - for (var classifier : classifiers) { - runClassifier(caseListeners, classifier, taskResult, trace); + for (int i = 0; i < classifiers.size(); i++) { + runClassifier( + caseListeners, rootSpan, parent, classifiers.get(i), i, taskResult, trace); } } finally { - caseListeners.forEach(CaseListener::onEnd); + for (var cl : caseListeners) { + cl.onCaseEnd(rootSpan); + } + rootSpan.end(); } } /** * Runs a scorer against a successful task result. If the scorer is a {@link TracedScorer}, it * receives the {@link BrainstoreTrace} for the eval case. If the scorer throws, falls back to - * {@link Scorer#scoreForScorerException}. The {@code onScoreEnd} event is always dispatched (so - * the OTel listener can end its span) even when score validation aborts the eval. + * {@link Scorer#scoreForScorerException}. {@code onScoreResult} is dispatched only when scores + * are valid; on validation/fallback failure the span is still ended and the eval aborts. */ private void runScorer( List caseListeners, + Span rootSpan, + BraintrustUtils.Parent parent, Scorer scorer, TaskResult taskResult, @Nullable BrainstoreTrace trace) { - caseListeners.forEach(cl -> cl.onScoreStart(scorer)); - List scores = List.of(); - Exception scoreException = null; + var scoreSpan = tracer.spanBuilder("score").startSpan(); + for (var cl : caseListeners) { + cl.onScoreSpan(scoreSpan, scorer); + } RuntimeException pending = null; - try { + try (var unused = makeCurrent(scoreSpan, parent)) { + List scores; + Exception scoreException = null; try { if (scorer instanceof TracedScorer tracedScorer) { scores = tracedScorer.score(taskResult, trace); @@ -214,14 +233,16 @@ private void runScorer( scores = scorer.scoreForScorerException(e, taskResult); } validateScores(scorer, scores); + final var finalScores = scores; + final var finalException = scoreException; + for (var cl : caseListeners) { + cl.onScoreResult(scoreSpan, rootSpan, scorer, finalScores, finalException); + } } catch (RuntimeException re) { // validation (or a throwing fallback) aborts the eval; record nothing for this score pending = re; - scores = List.of(); } finally { - final var finalScores = scores; - final var finalException = scoreException; - caseListeners.forEach(cl -> cl.onScoreEnd(scorer, finalScores, finalException)); + scoreSpan.end(); } if (pending != null) { throw pending; @@ -230,25 +251,30 @@ private void runScorer( /** * Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback (or score - * validation) throws, the eval aborts — but the {@code onScoreEnd} event is still dispatched. + * validation) throws, the eval aborts — but the score span is still ended. */ private void runScoreForTaskException( List caseListeners, + Span rootSpan, + BraintrustUtils.Parent parent, Scorer scorer, Exception taskException, DatasetCase datasetCase) { - caseListeners.forEach(cl -> cl.onScoreStart(scorer)); - List scores = List.of(); + var scoreSpan = tracer.spanBuilder("score").startSpan(); + for (var cl : caseListeners) { + cl.onScoreSpan(scoreSpan, scorer); + } RuntimeException pending = null; - try { - scores = scorer.scoreForTaskException(taskException, datasetCase); + try (var unused = makeCurrent(scoreSpan, parent)) { + var scores = scorer.scoreForTaskException(taskException, datasetCase); validateScores(scorer, scores); + for (var cl : caseListeners) { + cl.onScoreResult(scoreSpan, rootSpan, scorer, scores, null); + } } catch (RuntimeException re) { pending = re; - scores = List.of(); } finally { - final var finalScores = scores; - caseListeners.forEach(cl -> cl.onScoreEnd(scorer, finalScores, null)); + scoreSpan.end(); } if (pending != null) { throw pending; @@ -256,18 +282,29 @@ private void runScoreForTaskException( } /** - * Runs a classifier. Exceptions are non-fatal: they are surfaced to listeners via the {@code - * classifierException} argument of {@code onClassifierEnd} and do not propagate. + * Runs a classifier inside its own span. Exceptions are non-fatal: they are surfaced to + * listeners via the {@code classifierException} argument of {@code onClassifierResult} and do + * not propagate. */ private void runClassifier( List caseListeners, + Span rootSpan, + BraintrustUtils.Parent parent, Classifier classifier, + int index, TaskResult taskResult, @Nullable BrainstoreTrace trace) { - caseListeners.forEach(cl -> cl.onClassifierStart(classifier)); + var resolvedName = classifier.getName(); + if (resolvedName == null || resolvedName.isBlank()) { + resolvedName = "classifier_" + index; + } + var classifierSpan = tracer.spanBuilder(resolvedName).startSpan(); + for (var cl : caseListeners) { + cl.onClassifierSpan(classifierSpan, classifier, resolvedName); + } List classifications = List.of(); Exception classifierException = null; - try { + try (var unused = makeCurrent(classifierSpan, parent)) { if (classifier instanceof TracedClassifier tracedClassifier) { classifications = tracedClassifier.classify(taskResult, trace); } else { @@ -279,12 +316,22 @@ private void runClassifier( } catch (Exception e) { classifierException = e; classifications = List.of(); - log.debug("Classifier '{}' threw exception", classifier.getName(), e); + log.debug("Classifier '{}' threw exception", resolvedName, e); + } finally { + final var finalClassifications = classifications; + final var finalException = classifierException; + final var finalResolvedName = resolvedName; + for (var cl : caseListeners) { + cl.onClassifierResult( + classifierSpan, + rootSpan, + classifier, + finalResolvedName, + finalClassifications, + finalException); + } + classifierSpan.end(); } - final var finalClassifications = classifications; - final var finalException = classifierException; - caseListeners.forEach( - cl -> cl.onClassifierEnd(classifier, finalClassifications, finalException)); } /** Validates that every score value is between 0 and 1 inclusive. Throws (aborting) if not. */ @@ -321,7 +368,10 @@ public static final class Builder { private @Nonnull Map parameterValues = Map.of(); private @Nonnull List tags = List.of(); private @Nonnull Map metadata = Map.of(); - private @Nonnull List listeners = new ArrayList<>(); + private @Nonnull EvalTargetProvider targetProvider = new ExperimentTargetProvider(); + // Seeded with the standard span decorator; removable via clearListeners(). + private @Nonnull List listeners = + new ArrayList<>(List.of(new EvalSpanDecorator())); public Eval build() { if (config == null) { @@ -446,6 +496,24 @@ public Builder addListener(@Nonnull EvalListener listener) { return this; } + /** + * Removes all attached listeners, including the built-in {@link EvalSpanDecorator}. Use + * this to fully control span decoration (e.g. the playground attaches its own decorator). + */ + public Builder clearListeners() { + this.listeners.clear(); + return this; + } + + /** + * Overrides how the eval target (parent / experiment) is resolved. Defaults to creating a + * Braintrust experiment ({@link ExperimentTargetProvider}). + */ + public Builder evalTargetProvider(@Nonnull EvalTargetProvider provider) { + this.targetProvider = Objects.requireNonNull(provider); + return this; + } + /** Sets metadata for the experiment. */ public Builder metadata(Map metadata) { this.metadata = Map.copyOf(metadata); diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java index 401f3167..24625db6 100644 --- a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java @@ -1,43 +1,81 @@ package dev.braintrust.eval; +import io.opentelemetry.api.trace.Span; import java.util.List; import javax.annotation.Nullable; -/** a listener which can be attached to an eval and hook specific events */ +/** + * A listener which can be attached to an eval to observe and/or decorate its lifecycle. + * + *

{@link Eval} owns the OpenTelemetry span structure — it creates the root ({@code + * eval}), {@code task}, {@code score}, and classifier spans (names only), manages the current + * context (so user/LLM child spans nest correctly), and ends the spans. Listeners receive the live + * {@link Span}s at each lifecycle point and may decorate them with attributes (e.g. the built-in + * {@link EvalSpanDecorator}) or simply observe them (e.g. read span ids for streaming progress). + * + *

All callbacks are no-ops by default so implementations only override what they need. + */ public interface EvalListener { - RunListener createRunListener(String experimentId); + /** Creates a run-scoped listener. Called once per {@link Eval#run()}. */ + RunListener createRunListener(EvalRunInfo info); - /** a listener which receives events over the lifecycle of a single eval run */ - public interface RunListener { - void onStart(String experimentId); + /** Run-scoped listener; spawns a {@link CaseListener} per eval case. */ + interface RunListener { + default void onRunStart() {} CaseListener createCaseListener(DatasetCase datasetCase); - void onEnd(); + default void onRunEnd() {} } - /** a listener which receives events over the lifecycle of a single case of an eval run */ - public interface CaseListener { - void onStart(); + /** Case-scoped listener receiving the live spans for a single eval case. */ + interface CaseListener { + /** The root {@code eval} span has been created (no attributes yet). */ + default void onRootSpan(Span rootSpan, DatasetCase datasetCase) {} - void onTaskStart(String experimentId, DatasetCase datasetCase); + /** The {@code task} span has been created (no attributes yet). */ + default void onTaskSpan(Span taskSpan, DatasetCase datasetCase) {} - void onTaskEnd(String experimentId, TaskResult taskResult); + /** The task completed successfully. */ + default void onTaskSuccess(Span rootSpan, Span taskSpan, TaskResult taskResult) {} - void onTaskError(String experimentId, DatasetCase datasetCase, Exception error); + /** + * The task threw. Scorers still run via {@code scoreForTaskException}; classifiers do not. + */ + default void onTaskError( + Span rootSpan, Span taskSpan, DatasetCase datasetCase, Exception error) {} - void onScoreStart(Scorer scorer); + /** A {@code score} span has been created (no attributes yet). */ + default void onScoreSpan(Span scoreSpan, Scorer scorer) {} - void onScoreEnd( - Scorer scorer, List scores, @Nullable Exception scoreException); + /** + * A scorer produced scores. Not called when score validation aborts the eval. {@code + * scoreException} is non-null when the scorer threw and the fallback was used. + */ + default void onScoreResult( + Span scoreSpan, + Span rootSpan, + Scorer scorer, + List scores, + @Nullable Exception scoreException) {} - void onClassifierStart(Classifier classifier); + /** A classifier span has been created (no attributes yet). */ + default void onClassifierSpan( + Span classifierSpan, Classifier classifier, String resolvedName) {} - void onClassifierEnd( + /** + * A classifier finished. {@code classifierException} is non-null when the classifier threw + * (non-fatal). + */ + default void onClassifierResult( + Span classifierSpan, + Span rootSpan, Classifier classifier, + String resolvedName, List classifications, - @Nullable Exception classifierException); + @Nullable Exception classifierException) {} - void onEnd(); + /** The case is finishing; the root span is about to be ended. */ + default void onCaseEnd(Span rootSpan) {} } } diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalRunInfo.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalRunInfo.java new file mode 100644 index 00000000..907f9ec8 --- /dev/null +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalRunInfo.java @@ -0,0 +1,24 @@ +package dev.braintrust.eval; + +import dev.braintrust.BraintrustUtils; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +/** + * Resolved target for an eval run, produced by an {@link EvalTargetProvider} and handed to every + * {@link EvalListener} via {@link EvalListener#createRunListener(EvalRunInfo)}. + * + * @param parent the braintrust parent for all spans (e.g. {@code experiment_id:…} or {@code + * playground_id:…}) + * @param generation optional generation identifier woven into span attributes (playground) + * @param experimentId the experiment id, when running against an experiment; otherwise null + * @param experimentUrl the experiment URL, when applicable; otherwise null + * @param tracingSupported whether a {@link dev.braintrust.trace.BrainstoreTrace} can be built for + * traced scorers/classifiers (true only in experiment mode) + */ +public record EvalRunInfo( + @Nonnull BraintrustUtils.Parent parent, + @Nullable String generation, + @Nullable String experimentId, + @Nullable String experimentUrl, + boolean tracingSupported) {} diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalSpanDecorator.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalSpanDecorator.java new file mode 100644 index 00000000..e7f1f0a7 --- /dev/null +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalSpanDecorator.java @@ -0,0 +1,211 @@ +package dev.braintrust.eval; + +import static dev.braintrust.json.BraintrustJsonMapper.toJson; + +import dev.braintrust.trace.BraintrustTracing; +import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import javax.annotation.Nullable; + +/** + * The standard {@link EvalListener} that decorates the spans created by {@link Eval} with the + * canonical Braintrust attributes (root {@code eval}, {@code task}, {@code score}, and classifier + * spans). Attached automatically by {@link Eval.Builder}; can be removed via {@link + * Eval.Builder#clearListeners()}. + */ +public final class EvalSpanDecorator implements EvalListener { + private static final AttributeKey PARENT = + AttributeKey.stringKey(BraintrustTracing.PARENT_KEY); + + @Override + public RunListener createRunListener(EvalRunInfo info) { + return new RunListener() { + @Override + public CaseListener createCaseListener(DatasetCase datasetCase) { + return new Decorator(info); + } + }; + } + + private static final class Decorator implements CaseListener { + private final EvalRunInfo info; + private final Map>> caseClassifications = + new LinkedHashMap<>(); + private final Map classifierErrors = new LinkedHashMap<>(); + private @Nullable DatasetCase datasetCase; + + Decorator(EvalRunInfo info) { + this.info = info; + } + + private String parentValue() { + return info.parent().toParentValue(); + } + + private Map spanAttrs(String type) { + var m = new LinkedHashMap(); + m.put("type", type); + if (info.generation() != null) { + m.put("generation", info.generation()); + } + return m; + } + + @Override + public void onRootSpan(Span rootSpan, DatasetCase datasetCase) { + this.datasetCase = datasetCase; + rootSpan.setAttribute(PARENT, parentValue()); + rootSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs("eval"))); + rootSpan.setAttribute( + "braintrust.input_json", toJson(Map.of("input", datasetCase.input()))); + rootSpan.setAttribute("braintrust.expected", toJson(datasetCase.expected())); + if (datasetCase.origin().isPresent()) { + rootSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get())); + } + if (!datasetCase.tags().isEmpty()) { + rootSpan.setAttribute( + AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags()); + } + if (!datasetCase.metadata().isEmpty()) { + rootSpan.setAttribute( + AttributeKey.stringKey("braintrust.metadata"), + toJson(datasetCase.metadata())); + } + } + + @Override + public void onTaskSpan(Span taskSpan, DatasetCase datasetCase) { + taskSpan.setAttribute(PARENT, parentValue()); + taskSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs("task"))); + } + + @Override + public void onTaskSuccess(Span rootSpan, Span taskSpan, TaskResult taskResult) { + rootSpan.setAttribute( + "braintrust.output_json", toJson(Map.of("output", taskResult.result()))); + } + + @Override + public void onTaskError( + Span rootSpan, Span taskSpan, DatasetCase datasetCase, Exception error) { + taskSpan.setStatus(StatusCode.ERROR, error.getMessage()); + taskSpan.recordException(error); + rootSpan.setStatus(StatusCode.ERROR, error.getMessage()); + rootSpan.setAttribute( + "braintrust.output_json", toJson(Collections.singletonMap("output", null))); + } + + @Override + public void onScoreSpan(Span scoreSpan, Scorer scorer) { + scoreSpan.setAttribute(PARENT, parentValue()); + } + + @Override + public void onScoreResult( + Span scoreSpan, + Span rootSpan, + Scorer scorer, + List scores, + @Nullable Exception scoreException) { + if (scoreException != null) { + scoreSpan.setStatus(StatusCode.ERROR, scoreException.getMessage()); + scoreSpan.recordException(scoreException); + } + if (scores == null || scores.isEmpty()) { + return; + } + var scorerScores = new LinkedHashMap(); + for (var score : scores) { + scorerScores.put(score.name(), score.value()); + } + var attrs = spanAttrs("score"); + attrs.put("name", scorer.getName()); + attrs.put("purpose", "scorer"); + scoreSpan.setAttribute("braintrust.span_attributes", toJson(attrs)); + var scoresJson = toJson(scorerScores); + scoreSpan.setAttribute("braintrust.output_json", scoresJson); + scoreSpan.setAttribute("braintrust.scores", scoresJson); + } + + @Override + public void onClassifierSpan( + Span classifierSpan, Classifier classifier, String resolvedName) { + classifierSpan.setAttribute(PARENT, parentValue()); + var attrs = spanAttrs("classifier"); + attrs.put("name", resolvedName); + attrs.put("purpose", "scorer"); + classifierSpan.setAttribute("braintrust.span_attributes", toJson(attrs)); + } + + @Override + public void onClassifierResult( + Span classifierSpan, + Span rootSpan, + Classifier classifier, + String resolvedName, + List classifications, + @Nullable Exception classifierException) { + if (classifierException != null) { + classifierSpan.setStatus(StatusCode.ERROR, classifierException.getMessage()); + classifierSpan.recordException(classifierException); + classifierErrors.put( + resolvedName, + classifierException.getMessage() == null + ? classifierException.toString() + : classifierException.getMessage()); + return; + } + // Group results by resolved item name (item.name, falling back to the classifier name + // when blank). Same map is logged to the classifier span and merged into the per-case + // aggregate logged on the root span. + Map>> outputByName = new LinkedHashMap<>(); + for (var item : classifications) { + var itemName = item.name(); + if (itemName == null || itemName.isBlank()) { + itemName = resolvedName; + } + var itemMap = toClassificationItem(item); + outputByName.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap); + caseClassifications.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap); + } + classifierSpan.setAttribute("braintrust.output_json", toJson(outputByName)); + } + + @Override + public void onCaseEnd(Span rootSpan) { + if (!caseClassifications.isEmpty()) { + rootSpan.setAttribute("braintrust.classifications", toJson(caseClassifications)); + } + if (!classifierErrors.isEmpty()) { + Map mergedMetadata = + new LinkedHashMap<>( + datasetCase == null ? Map.of() : datasetCase.metadata()); + mergedMetadata.put("classifier_errors", classifierErrors); + rootSpan.setAttribute( + AttributeKey.stringKey("braintrust.metadata"), toJson(mergedMetadata)); + } + } + } + + /** + * Converts a {@link Classification} to the wire-format {@code ClassificationItem}: drops {@code + * name}, includes {@code label} and {@code metadata} only when present. + */ + private static Map toClassificationItem(Classification c) { + Map m = new LinkedHashMap<>(); + m.put("id", c.id()); + if (c.label() != null) { + m.put("label", c.label()); + } + if (c.metadata() != null) { + m.put("metadata", c.metadata()); + } + return m; + } +} diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalTargetProvider.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalTargetProvider.java new file mode 100644 index 00000000..2aa0a640 --- /dev/null +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalTargetProvider.java @@ -0,0 +1,33 @@ +package dev.braintrust.eval; + +import dev.braintrust.api.BraintrustOpenApiClient; +import dev.braintrust.config.BraintrustConfig; +import dev.braintrust.openapi.model.Project; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import javax.annotation.Nonnull; + +/** + * Resolves the {@link EvalRunInfo target} for an eval run. The default implementation creates a + * Braintrust experiment (see {@link ExperimentTargetProvider}); alternative implementations (e.g. + * the devserver/playground) can supply a different parent and skip experiment creation. + */ +public interface EvalTargetProvider { + @Nonnull + EvalRunInfo create(@Nonnull Context ctx); + + /** + * Inputs available when resolving the eval target, gathered at the start of {@link Eval#run()}. + */ + record Context( + @Nonnull BraintrustConfig config, + @Nonnull BraintrustOpenApiClient client, + @Nonnull Project project, + @Nonnull BraintrustOpenApiClient.OrgInfo orgInfo, + @Nonnull String experimentName, + @Nonnull List tags, + @Nonnull Map metadata, + @Nonnull Optional datasetId, + @Nonnull Optional datasetVersion) {} +} diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/ExperimentTargetProvider.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/ExperimentTargetProvider.java new file mode 100644 index 00000000..825f0ea3 --- /dev/null +++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/ExperimentTargetProvider.java @@ -0,0 +1,46 @@ +package dev.braintrust.eval; + +import dev.braintrust.BraintrustUtils; +import dev.braintrust.openapi.api.ExperimentsApi; +import dev.braintrust.openapi.model.CreateExperiment; +import java.util.UUID; +import javax.annotation.Nonnull; + +/** + * Default {@link EvalTargetProvider}: creates a Braintrust experiment and targets spans at it via + * an {@code experiment_id:} parent. + */ +final class ExperimentTargetProvider implements EvalTargetProvider { + @Override + @Nonnull + public EvalRunInfo create(@Nonnull Context ctx) { + var createExperiment = + new CreateExperiment().projectId(ctx.project().getId()).name(ctx.experimentName()); + if (!ctx.tags().isEmpty()) { + createExperiment.tags(ctx.tags()); + } + if (!ctx.metadata().isEmpty()) { + createExperiment.metadata(ctx.metadata()); + } + ctx.datasetId().ifPresent(id -> createExperiment.datasetId(UUID.fromString(id))); + ctx.datasetVersion().ifPresent(createExperiment::datasetVersion); + + var experiment = new ExperimentsApi(ctx.client()).postExperiment(createExperiment); + var experimentId = experiment.getId().toString(); + var experimentUrl = + "%s/experiments/%s" + .formatted( + BraintrustUtils.createProjectURI( + ctx.config().appUrl(), + ctx.orgInfo().name(), + ctx.project().getName()) + .toASCIIString(), + ctx.experimentName()); + return new EvalRunInfo( + new BraintrustUtils.Parent("experiment_id", experimentId), + null, + experimentId, + experimentUrl, + true); + } +} diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java deleted file mode 100644 index e1e9083e..00000000 --- a/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java +++ /dev/null @@ -1,381 +0,0 @@ -package dev.braintrust.eval; - -import static dev.braintrust.json.BraintrustJsonMapper.toJson; - -import dev.braintrust.api.BraintrustOpenApiClient; -import dev.braintrust.trace.BrainstoreTrace; -import dev.braintrust.trace.BraintrustContext; -import dev.braintrust.trace.BraintrustTracing; -import io.opentelemetry.api.common.AttributeKey; -import io.opentelemetry.api.trace.Span; -import io.opentelemetry.api.trace.SpanKind; -import io.opentelemetry.api.trace.StatusCode; -import io.opentelemetry.api.trace.Tracer; -import io.opentelemetry.context.Scope; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import lombok.extern.slf4j.Slf4j; - -/** - * Built-in {@link EvalListener} that manages all OpenTelemetry spans for an eval (root {@code eval} - * span, {@code task} span, {@code score} spans, and {@code classifier} spans), pushing/popping the - * braintrust OTel context across the start/end events so user code nests correctly. - * - *

This listener pushes spans onto the current OTel context on start events and pops them on the - * matching end events. That makes it inherently thread-affine and dependent on strictly nested - * (LIFO) start/end ordering — which holds because evals run sequentially per case on one thread. - */ -@Slf4j -final class OtelEvalListener implements EvalListener { - private static final AttributeKey PARENT = - AttributeKey.stringKey(BraintrustTracing.PARENT_KEY); - - private final @Nonnull Tracer tracer; - private final @Nonnull BraintrustOpenApiClient client; - - OtelEvalListener(@Nonnull Tracer tracer, @Nonnull BraintrustOpenApiClient client) { - this.tracer = tracer; - this.client = client; - } - - @Override - public OtelRunListener createRunListener(String experimentId) { - return new OtelRunListener(experimentId); - } - - /** Run-scoped listener. There is currently no run-level span; it only spawns case listeners. */ - final class OtelRunListener implements RunListener { - private final @Nonnull String experimentId; - - private OtelRunListener(@Nonnull String experimentId) { - this.experimentId = experimentId; - } - - @Override - public void onStart(String experimentId) {} - - @Override - public OtelCaseListener createCaseListener(DatasetCase datasetCase) { - return new OtelCaseListener(experimentId, datasetCase); - } - - @Override - public void onEnd() {} - } - - /** Case-scoped listener owning the root/task/score/classifier spans and their scopes. */ - final class OtelCaseListener implements CaseListener { - private final @Nonnull String experimentId; - private final @Nonnull DatasetCase datasetCase; - - private @Nullable Span rootSpan; - private @Nullable Scope rootScope; - private @Nullable String rootTraceId; - private @Nullable String taskSpanId; - - private @Nullable Span taskSpan; - private @Nullable Scope taskScope; - - private @Nullable Span scoreSpan; - private @Nullable Scope scoreScope; - - private int classifierIndex = 0; - private @Nullable Span classifierSpan; - private @Nullable Scope classifierScope; - private @Nullable String classifierName; - - // Accumulated classifier results, written onto the root span at case end. - private final Map>> caseClassifications = - new LinkedHashMap<>(); - private final Map classifierErrors = new LinkedHashMap<>(); - - private OtelCaseListener( - @Nonnull String experimentId, @Nonnull DatasetCase datasetCase) { - this.experimentId = experimentId; - this.datasetCase = datasetCase; - } - - @Override - public void onStart() { - var span = - tracer.spanBuilder("eval") // TODO: allow names for eval cases - .setNoParent() // each eval case is its own trace - .setSpanKind(SpanKind.CLIENT) - .setAttribute(PARENT, "experiment_id:" + experimentId) - .setAttribute( - "braintrust.span_attributes", toJson(Map.of("type", "eval"))) - .setAttribute( - "braintrust.input_json", - toJson(Map.of("input", datasetCase.input()))) - .setAttribute("braintrust.expected", toJson(datasetCase.expected())) - .startSpan(); - if (datasetCase.origin().isPresent()) { - span.setAttribute("braintrust.origin", toJson(datasetCase.origin().get())); - } - if (!datasetCase.tags().isEmpty()) { - span.setAttribute( - AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags()); - } - if (!datasetCase.metadata().isEmpty()) { - span.setAttribute( - AttributeKey.stringKey("braintrust.metadata"), - toJson(datasetCase.metadata())); - } - this.rootSpan = span; - this.rootTraceId = span.getSpanContext().getTraceId(); - this.rootScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent(); - } - - @Override - public void onTaskStart(String experimentId, DatasetCase datasetCase) { - var span = - tracer.spanBuilder("task") - .setAttribute(PARENT, "experiment_id:" + this.experimentId) - .setAttribute( - "braintrust.span_attributes", toJson(Map.of("type", "task"))) - .startSpan(); - this.taskSpan = span; - this.taskSpanId = span.getSpanContext().getSpanId(); - this.taskScope = BraintrustContext.ofExperiment(this.experimentId, span).makeCurrent(); - } - - @Override - public void onTaskEnd(String experimentId, TaskResult taskResult) { - requireRoot() - .setAttribute( - "braintrust.output_json", - toJson(Map.of("output", taskResult.result()))); - closeTaskScope(); - requireTask().end(); - } - - @Override - public void onTaskError( - String experimentId, DatasetCase datasetCase, Exception error) { - var task = requireTask(); - task.setStatus(StatusCode.ERROR, error.getMessage()); - task.recordException(error); - closeTaskScope(); - task.end(); - - var root = requireRoot(); - root.setStatus(StatusCode.ERROR, error.getMessage()); - var nullOutput = new LinkedHashMap(); - nullOutput.put("output", null); - root.setAttribute("braintrust.output_json", toJson(nullOutput)); - } - - @Override - public void onScoreStart(Scorer scorer) { - var span = - tracer.spanBuilder("score") - .setAttribute(PARENT, "experiment_id:" + experimentId) - .startSpan(); - this.scoreSpan = span; - this.scoreScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent(); - } - - @Override - public void onScoreEnd( - Scorer scorer, List scores, @Nullable Exception scoreException) { - var span = requireScore(); - try { - if (scoreException != null) { - span.setStatus(StatusCode.ERROR, scoreException.getMessage()); - span.recordException(scoreException); - } - recordScores(span, requireRoot(), scorer, scores); - } finally { - closeScoreScope(); - span.end(); - } - } - - @Override - public void onClassifierStart(Classifier classifier) { - var resolvedName = classifier.getName(); - if (resolvedName == null || resolvedName.isBlank()) { - resolvedName = "classifier_" + classifierIndex; - } - classifierIndex++; - this.classifierName = resolvedName; - - var span = - tracer.spanBuilder(resolvedName) - .setAttribute(PARENT, "experiment_id:" + experimentId) - .startSpan(); - Map spanAttrs = new LinkedHashMap<>(); - spanAttrs.put("type", "classifier"); - spanAttrs.put("name", resolvedName); - spanAttrs.put("purpose", "scorer"); - span.setAttribute("braintrust.span_attributes", toJson(spanAttrs)); - - this.classifierSpan = span; - this.classifierScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent(); - } - - @Override - public void onClassifierEnd( - Classifier classifier, - List classifications, - @Nullable Exception classifierException) { - var span = requireClassifier(); - var resolvedName = classifierName; - try { - if (classifierException != null) { - span.setStatus(StatusCode.ERROR, classifierException.getMessage()); - span.recordException(classifierException); - classifierErrors.put( - resolvedName, - classifierException.getMessage() == null - ? classifierException.toString() - : classifierException.getMessage()); - return; - } - - // Group results by resolved item name (item.name, falling back to the classifier - // name when blank). Same map is logged to the classifier span and merged into the - // per-case aggregate logged on the root span. - Map>> outputByName = new LinkedHashMap<>(); - for (var item : classifications) { - var itemName = item.name(); - if (itemName == null || itemName.isBlank()) { - itemName = resolvedName; - } - var itemMap = toClassificationItem(item); - outputByName.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap); - caseClassifications - .computeIfAbsent(itemName, k -> new ArrayList<>()) - .add(itemMap); - } - span.setAttribute("braintrust.output_json", toJson(outputByName)); - } finally { - closeClassifierScope(); - span.end(); - } - } - - @Override - public void onEnd() { - var root = requireRoot(); - try { - if (!caseClassifications.isEmpty()) { - root.setAttribute("braintrust.classifications", toJson(caseClassifications)); - } - if (!classifierErrors.isEmpty()) { - Map mergedMetadata = - new LinkedHashMap<>(datasetCase.metadata()); - mergedMetadata.put("classifier_errors", classifierErrors); - root.setAttribute( - AttributeKey.stringKey("braintrust.metadata"), toJson(mergedMetadata)); - } - } finally { - closeRootScope(); - root.end(); - } - } - - /** - * Builds the {@link BrainstoreTrace} for this case from the root trace id and the task span - * id. Must be called after {@link #onTaskEnd}. - */ - BrainstoreTrace brainstoreTrace() { - return BrainstoreTrace.forExperiment( - client, - experimentId, - requireNonNullState(rootTraceId, "rootTraceId"), - List.of(requireNonNullState(taskSpanId, "taskSpanId"))); - } - - private Span requireRoot() { - return requireNonNullState(rootSpan, "rootSpan"); - } - - private Span requireTask() { - return requireNonNullState(taskSpan, "taskSpan"); - } - - private Span requireScore() { - return requireNonNullState(scoreSpan, "scoreSpan"); - } - - private Span requireClassifier() { - return requireNonNullState(classifierSpan, "classifierSpan"); - } - - private void closeRootScope() { - if (rootScope != null) { - rootScope.close(); - rootScope = null; - } - } - - private void closeTaskScope() { - if (taskScope != null) { - taskScope.close(); - taskScope = null; - } - } - - private void closeScoreScope() { - if (scoreScope != null) { - scoreScope.close(); - scoreScope = null; - } - } - - private void closeClassifierScope() { - if (classifierScope != null) { - classifierScope.close(); - classifierScope = null; - } - } - } - - /** Records scores onto the score span and root span. Validation is the caller's job. */ - private static void recordScores( - Span scoreSpan, Span rootSpan, Scorer scorer, List scores) { - if (scores == null || scores.isEmpty()) { - return; - } - final Map scorerScores = new LinkedHashMap<>(); - for (var score : scores) { - scorerScores.put(score.name(), score.value()); - } - Map spanAttrs = new LinkedHashMap<>(); - spanAttrs.put("type", "score"); - spanAttrs.put("name", scorer.getName()); - spanAttrs.put("purpose", "scorer"); - scoreSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs)); - var scoresJson = toJson(scorerScores); - scoreSpan.setAttribute("braintrust.output_json", scoresJson); - scoreSpan.setAttribute("braintrust.scores", scoresJson); - } - - /** - * Converts a {@link Classification} to the wire-format {@code ClassificationItem}: drops {@code - * name}, includes {@code label} and {@code metadata} only when present. - */ - private static Map toClassificationItem(Classification c) { - Map m = new LinkedHashMap<>(); - m.put("id", c.id()); - if (c.label() != null) { - m.put("label", c.label()); - } - if (c.metadata() != null) { - m.put("metadata", c.metadata()); - } - return m; - } - - private static T requireNonNullState(@Nullable T value, String name) { - if (value == null) { - throw new IllegalStateException("OtelEvalListener: " + name + " accessed out of order"); - } - return value; - } -} diff --git a/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java b/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java index e1755a03..10b545f8 100644 --- a/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java +++ b/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java @@ -793,9 +793,9 @@ void testTaskErrorHandling() throws Exception { erroredTaskSpan.getEvents().stream().anyMatch(e -> e.getName().equals("exception")), "task span should have an exception event"); - // The errored case should still have a score span (from scoreForTaskException default 0.0) - // The score span is a child of the task span (since the task scope is still active when - // runScoreForTaskException is called from the catch block) + // The errored case should still have a score span (from scoreForTaskException default 0.0). + // The score span is a child of the eval (root) span: scoreForTaskException runs after the + // task scope has closed, so its span nests under the root rather than the task span. var erroredScoreSpans = allSpans.stream() .filter(s -> s.getName().equals("score")) @@ -804,7 +804,7 @@ void testTaskErrorHandling() throws Exception { s.getParentSpanContext() .getSpanId() .equals( - erroredTaskSpan + erroredEvalSpan .getSpanContext() .getSpanId())) .toList();