From 841f094dd17be2c36a44b09877c203a23224b65d Mon Sep 17 00:00:00 2001
From: Andrew Kent <andrew@braintrustdata.com>
Date: Fri, 26 Jun 2026 13:17:24 -0600
Subject: [PATCH 1/3] eval listener + otel eval listener

---
 .../main/java/dev/braintrust/eval/Eval.java   | 377 +++++++----------
 .../dev/braintrust/eval/EvalListener.java     |  43 ++
 .../dev/braintrust/eval/OtelEvalListener.java | 381 ++++++++++++++++++
 .../eval/DatasetBrainstoreImplTest.java       |  41 ++
 4 files changed, 619 insertions(+), 223 deletions(-)
 create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java
 create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
index a9c3d06b..4483f062 100644
--- a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
@@ -1,21 +1,16 @@
 package dev.braintrust.eval;
 
-import static dev.braintrust.json.BraintrustJsonMapper.toJson;
-
 import dev.braintrust.BraintrustUtils;
 import dev.braintrust.api.BraintrustApiClient;
 import dev.braintrust.api.BraintrustOpenApiClient;
 import dev.braintrust.config.BraintrustConfig;
+import dev.braintrust.eval.EvalListener.CaseListener;
+import dev.braintrust.eval.EvalListener.RunListener;
 import dev.braintrust.openapi.api.ExperimentsApi;
 import dev.braintrust.openapi.model.CreateExperiment;
 import dev.braintrust.openapi.model.Project;
 import dev.braintrust.trace.BrainstoreTrace;
-import dev.braintrust.trace.BraintrustContext;
 import dev.braintrust.trace.BraintrustTracing;
-import io.opentelemetry.api.common.AttributeKey;
-import io.opentelemetry.api.trace.Span;
-import io.opentelemetry.api.trace.SpanKind;
-import io.opentelemetry.api.trace.StatusCode;
 import io.opentelemetry.api.trace.Tracer;
 import java.util.*;
 import java.util.function.Function;
@@ -31,14 +26,11 @@
  */
 @Slf4j
 public final class Eval<INPUT, OUTPUT> {
-    private static final AttributeKey<String> PARENT =
-            AttributeKey.stringKey(BraintrustTracing.PARENT_KEY);
     private final @Nonnull String experimentName;
     private final @Nonnull BraintrustConfig config;
     private final @Nonnull BraintrustOpenApiClient client;
     private final @Nonnull Project project;
     private final @Nonnull BraintrustOpenApiClient.OrgInfo orgInfo;
-    private final @Nonnull Tracer tracer;
     private final @Nonnull Dataset<INPUT, OUTPUT> dataset;
     private final @Nonnull Task<INPUT, OUTPUT> task;
     private final @Nonnull List<Scorer<INPUT, OUTPUT>> scorers;
@@ -47,6 +39,18 @@ public final class Eval<INPUT, OUTPUT> {
     private final @Nonnull Map<String, Object> metadata;
     private final @Nonnull Parameters parameters;
 
+    /**
+     * All listeners attached to this eval, including the built-in {@link OtelEvalListener} (always
+     * first) which manages the OTel spans.
+     */
+    private final @Nonnull List<EvalListener> listeners;
+
+    /**
+     * Typed reference to the built-in OTel listener (also present in {@link #listeners}). Kept so
+     * we can pull span-derived info — e.g. the per-case {@link BrainstoreTrace} — back out of it.
+     */
+    private final @Nonnull OtelEvalListener otelListener;
+
     private Eval(Builder<INPUT, OUTPUT> builder) {
         this.experimentName = builder.experimentName;
         this.config = Objects.requireNonNull(builder.config);
@@ -55,7 +59,6 @@ private Eval(Builder<INPUT, OUTPUT> builder) {
                 client.fetchOrCreateProject(
                         builder.projectId, config.defaultProjectName().orElse(null));
         this.orgInfo = client.fetchOrgInfo(project.getOrgId().toString());
-        this.tracer = Objects.requireNonNull(builder.tracer);
         this.dataset = builder.dataset;
         this.task = Objects.requireNonNull(builder.task);
         this.scorers = List.copyOf(builder.scorers);
@@ -63,6 +66,12 @@ private Eval(Builder<INPUT, OUTPUT> builder) {
         this.tags = List.copyOf(builder.tags);
         this.metadata = Map.copyOf(builder.metadata);
         this.parameters = builder.buildParameters();
+        this.otelListener = new OtelEvalListener(Objects.requireNonNull(builder.tracer), client);
+        // built-in OTel listener runs first, then any user-supplied listeners
+        var allListeners = new ArrayList<EvalListener>();
+        allListeners.add(otelListener);
+        allListeners.addAll(builder.listeners);
+        this.listeners = List.copyOf(allListeners);
     }
 
     /** Runs the evaluation and returns results. */
@@ -88,8 +97,25 @@ public EvalResult run() {
             datasetVersion.ifPresent(createExperiment::datasetVersion);
 
             var experiment = new ExperimentsApi(client).postExperiment(createExperiment);
+            var experimentId = experiment.getId().toString();
+
+            // Create one RunListener per attached listener, tracking the built-in OTel run
+            // listener by identity so we can later pull the per-case BrainstoreTrace from it.
+            var runListeners = new ArrayList<RunListener>(listeners.size());
+            OtelEvalListener.OtelRunListener otelRunListener = null;
+            for (var listener : listeners) {
+                var runListener = listener.createRunListener(experimentId);
+                if (listener == otelListener) {
+                    otelRunListener = (OtelEvalListener.OtelRunListener) runListener;
+                }
+                runListeners.add(runListener);
+            }
+            final var otelRun = otelRunListener;
 
-            cursor.forEach(datasetCase -> evalOne(experiment.getId().toString(), datasetCase));
+            runListeners.forEach(runListener -> runListener.onStart(experimentId));
+            cursor.forEach(
+                    datasetCase -> evalOne(experimentId, datasetCase, runListeners, otelRun));
+            runListeners.forEach(RunListener::onEnd);
         }
 
         var experimentUrl =
@@ -102,136 +128,79 @@ public EvalResult run() {
         return new EvalResult(experimentUrl);
     }
 
-    private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase) {
-        var rootSpan =
-                tracer.spanBuilder("eval") // TODO: allow names for eval cases
-                        .setNoParent() // each eval case is its own trace
-                        .setSpanKind(SpanKind.CLIENT)
-                        .setAttribute(PARENT, "experiment_id:" + experimentId)
-                        .setAttribute("braintrust.span_attributes", toJson(Map.of("type", "eval")))
-                        .setAttribute(
-                                "braintrust.input_json",
-                                toJson(Map.of("input", datasetCase.input())))
-                        .setAttribute("braintrust.expected", toJson(datasetCase.expected()))
-                        .startSpan();
-        if (datasetCase.origin().isPresent()) {
-            rootSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get()));
-        }
-        if (!datasetCase.tags().isEmpty()) {
-            rootSpan.setAttribute(
-                    AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags());
-        }
-        if (!datasetCase.metadata().isEmpty()) {
-            rootSpan.setAttribute(
-                    AttributeKey.stringKey("braintrust.metadata"), toJson(datasetCase.metadata()));
-        }
-        try (var rootScope = BraintrustContext.ofExperiment(experimentId, rootSpan).makeCurrent()) {
+    private void evalOne(
+            String experimentId,
+            DatasetCase<INPUT, OUTPUT> datasetCase,
+            List<RunListener> runListeners,
+            @Nullable OtelEvalListener.OtelRunListener otelRunListener) {
+        // Create one CaseListener per RunListener, tracking the OTel one by identity so we can
+        // pull the BrainstoreTrace from it later.
+        var caseListeners = new ArrayList<CaseListener>(runListeners.size());
+        OtelEvalListener.OtelCaseListener otelCase = null;
+        for (var runListener : runListeners) {
+            var caseListener = runListener.createCaseListener(datasetCase);
+            if (runListener == otelRunListener) {
+                otelCase = (OtelEvalListener.OtelCaseListener) caseListener;
+            }
+            caseListeners.add(caseListener);
+        }
+
+        caseListeners.forEach(CaseListener::onStart);
+        try {
+            // run task
+            caseListeners.forEach(cl -> cl.onTaskStart(experimentId, datasetCase));
             final TaskResult<INPUT, OUTPUT> taskResult;
-            final String taskSpanId;
-            { // run task
-                var taskSpan =
-                        tracer.spanBuilder("task")
-                                .setAttribute(PARENT, "experiment_id:" + experimentId)
-                                .setAttribute(
-                                        "braintrust.span_attributes",
-                                        toJson(Map.of("type", "task")))
-                                .startSpan();
-                taskSpanId = taskSpan.getSpanContext().getSpanId();
-                try (var unused =
-                        BraintrustContext.ofExperiment(experimentId, taskSpan).makeCurrent()) {
-                    taskResult = task.apply(datasetCase, parameters);
-                    rootSpan.setAttribute(
-                            "braintrust.output_json",
-                            toJson(Map.of("output", taskResult.result())));
-                } catch (Exception e) {
-                    taskSpan.setStatus(StatusCode.ERROR, e.getMessage());
-                    taskSpan.recordException(e);
-                    taskSpan.end();
-                    rootSpan.setStatus(StatusCode.ERROR, e.getMessage());
-                    rootSpan.setAttribute(
-                            "braintrust.output_json",
-                            toJson(Collections.singletonMap("output", null)));
-                    log.debug("Task threw exception for input: " + datasetCase.input(), e);
-                    // run scoreForTaskException on each scorer
-                    for (var scorer : scorers) {
-                        runScoreForTaskException(experimentId, rootSpan, scorer, e, datasetCase);
-                    }
-                    return;
+            try {
+                taskResult = task.apply(datasetCase, parameters);
+            } catch (Exception e) {
+                caseListeners.forEach(cl -> cl.onTaskError(experimentId, datasetCase, e));
+                log.debug("Task threw exception for input: " + datasetCase.input(), e);
+                // run scoreForTaskException on each scorer; classifiers are skipped
+                for (var scorer : scorers) {
+                    runScoreForTaskException(caseListeners, scorer, e, datasetCase);
                 }
-                taskSpan.end();
+                return;
             }
+            caseListeners.forEach(cl -> cl.onTaskEnd(experimentId, taskResult));
 
-            // Create a single BrainstoreTrace for this eval case, shared across all scorers.
-            // It fetches spans lazily on first access (only if a TracedScorer actually calls it).
-            // We wait specifically for the task span to appear, which guarantees its children
-            // (LLM spans, tool spans) have also been indexed — since children end before parents.
-            var rootTraceId = rootSpan.getSpanContext().getTraceId();
-            var trace =
-                    BrainstoreTrace.forExperiment(
-                            client, experimentId, rootTraceId, List.of(taskSpanId));
+            // A single BrainstoreTrace for this eval case, shared across all scorers/classifiers.
+            // It fetches spans lazily on first access (only if a traced scorer/classifier calls
+            // it). Owned by the OTel listener since it is derived from span ids.
+            BrainstoreTrace trace = otelCase != null ? otelCase.brainstoreTrace() : null;
 
-            // run scorers - one span per scorer
+            // run scorers
             for (var scorer : scorers) {
-                runScorer(experimentId, rootSpan, scorer, taskResult, trace);
+                runScorer(caseListeners, scorer, taskResult, trace);
             }
 
-            // run classifiers - one span per classifier. Classifier exceptions are non-fatal:
-            // they are recorded on the classifier span and surfaced in the root span's metadata
-            // under `classifier_errors`, but do not abort the eval or affect other classifiers/
-            // scorers. Classifiers only run when the task succeeded (no scoreForTaskException
-            // analogue).
-            if (!classifiers.isEmpty()) {
-                Map<String, List<Map<String, Object>>> caseClassifications = new LinkedHashMap<>();
-                Map<String, String> classifierErrors = new LinkedHashMap<>();
-                for (int i = 0; i < classifiers.size(); i++) {
-                    var classifier = classifiers.get(i);
-                    var classifierName = classifier.getName();
-                    if (classifierName == null || classifierName.isBlank()) {
-                        classifierName = "classifier_" + i;
-                    }
-                    runClassifier(
-                            experimentId,
-                            classifier,
-                            classifierName,
-                            taskResult,
-                            trace,
-                            caseClassifications,
-                            classifierErrors);
-                }
-                if (!caseClassifications.isEmpty()) {
-                    rootSpan.setAttribute(
-                            "braintrust.classifications", toJson(caseClassifications));
-                }
-                if (!classifierErrors.isEmpty()) {
-                    Map<String, Object> mergedMetadata =
-                            new LinkedHashMap<>(datasetCase.metadata());
-                    mergedMetadata.put("classifier_errors", classifierErrors);
-                    rootSpan.setAttribute(
-                            AttributeKey.stringKey("braintrust.metadata"), toJson(mergedMetadata));
-                }
+            // run classifiers. Classifier exceptions are non-fatal: they are recorded on the
+            // classifier span and surfaced in the root span's metadata under `classifier_errors`,
+            // but do not abort the eval or affect other classifiers/scorers. Classifiers only run
+            // when the task succeeded (no scoreForTaskException analogue).
+            for (var classifier : classifiers) {
+                runClassifier(caseListeners, classifier, taskResult, trace);
             }
         } finally {
-            rootSpan.end();
+            caseListeners.forEach(CaseListener::onEnd);
         }
     }
 
     /**
      * Runs a scorer against a successful task result. If the scorer is a {@link TracedScorer}, it
      * receives the {@link BrainstoreTrace} for the eval case. If the scorer throws, falls back to
-     * {@link Scorer#scoreForScorerException}.
+     * {@link Scorer#scoreForScorerException}. The {@code onScoreEnd} event is always dispatched (so
+     * the OTel listener can end its span) even when score validation aborts the eval.
      */
     private void runScorer(
-            String experimentId,
-            Span rootSpan,
+            List<CaseListener> caseListeners,
             Scorer<INPUT, OUTPUT> scorer,
             TaskResult<INPUT, OUTPUT> taskResult,
-            BrainstoreTrace trace) {
-        var scoreSpan =
-                tracer.spanBuilder("score")
-                        .setAttribute(PARENT, "experiment_id:" + experimentId)
-                        .startSpan();
-        try (var unused = BraintrustContext.ofExperiment(experimentId, scoreSpan).makeCurrent()) {
-            List<Score> scores;
+            @Nullable BrainstoreTrace trace) {
+        caseListeners.forEach(cl -> cl.onScoreStart(scorer));
+        List<Score> scores = List.of();
+        Exception scoreException = null;
+        RuntimeException pending = null;
+        try {
             try {
                 if (scorer instanceof TracedScorer<INPUT, OUTPUT> tracedScorer) {
                     scores = tracedScorer.score(taskResult, trace);
@@ -239,142 +208,97 @@ private void runScorer(
                     scores = scorer.score(taskResult);
                 }
             } catch (Exception e) {
-                scoreSpan.setStatus(StatusCode.ERROR, e.getMessage());
-                scoreSpan.recordException(e);
+                scoreException = e;
                 log.debug("Scorer '{}' threw exception", scorer.getName(), e);
                 // fall back to scoreForScorerException — if this throws, eval aborts
                 scores = scorer.scoreForScorerException(e, taskResult);
             }
-            recordScores(scoreSpan, rootSpan, scorer, scores);
+            validateScores(scorer, scores);
+        } catch (RuntimeException re) {
+            // validation (or a throwing fallback) aborts the eval; record nothing for this score
+            pending = re;
+            scores = List.of();
         } finally {
-            scoreSpan.end();
+            final var finalScores = scores;
+            final var finalException = scoreException;
+            caseListeners.forEach(cl -> cl.onScoreEnd(scorer, finalScores, finalException));
+        }
+        if (pending != null) {
+            throw pending;
         }
     }
 
     /**
-     * Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback throws, the
-     * eval aborts.
+     * Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback (or score
+     * validation) throws, the eval aborts — but the {@code onScoreEnd} event is still dispatched.
      */
     private void runScoreForTaskException(
-            String experimentId,
-            Span rootSpan,
+            List<CaseListener> caseListeners,
             Scorer<INPUT, OUTPUT> scorer,
             Exception taskException,
             DatasetCase<INPUT, OUTPUT> datasetCase) {
-        var scoreSpan =
-                tracer.spanBuilder("score")
-                        .setAttribute(PARENT, "experiment_id:" + experimentId)
-                        .startSpan();
-        try (var unused = BraintrustContext.ofExperiment(experimentId, scoreSpan).makeCurrent()) {
-            // if this throws, it propagates and the eval aborts
-            var scores = scorer.scoreForTaskException(taskException, datasetCase);
-            recordScores(scoreSpan, rootSpan, scorer, scores);
+        caseListeners.forEach(cl -> cl.onScoreStart(scorer));
+        List<Score> scores = List.of();
+        RuntimeException pending = null;
+        try {
+            scores = scorer.scoreForTaskException(taskException, datasetCase);
+            validateScores(scorer, scores);
+        } catch (RuntimeException re) {
+            pending = re;
+            scores = List.of();
         } finally {
-            scoreSpan.end();
+            final var finalScores = scores;
+            caseListeners.forEach(cl -> cl.onScoreEnd(scorer, finalScores, null));
+        }
+        if (pending != null) {
+            throw pending;
         }
     }
 
     /**
-     * Runs a classifier inside its own span. Exceptions are recorded on the classifier span and
-     * surfaced via {@code classifierErrors}; they do not propagate.
+     * Runs a classifier. Exceptions are non-fatal: they are surfaced to listeners via the {@code
+     * classifierException} argument of {@code onClassifierEnd} and do not propagate.
      */
     private void runClassifier(
-            String experimentId,
+            List<CaseListener> caseListeners,
             Classifier<INPUT, OUTPUT> classifier,
-            String resolvedName,
             TaskResult<INPUT, OUTPUT> taskResult,
-            BrainstoreTrace trace,
-            Map<String, List<Map<String, Object>>> caseClassifications,
-            Map<String, String> classifierErrors) {
-        var classifierSpan =
-                tracer.spanBuilder(resolvedName)
-                        .setAttribute(PARENT, "experiment_id:" + experimentId)
-                        .startSpan();
-        try (var unused =
-                BraintrustContext.ofExperiment(experimentId, classifierSpan).makeCurrent()) {
-            Map<String, Object> spanAttrs = new LinkedHashMap<>();
-            spanAttrs.put("type", "classifier");
-            spanAttrs.put("name", resolvedName);
-            spanAttrs.put("purpose", "scorer");
-            classifierSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs));
-
-            List<Classification> classifications;
-            try {
-                if (classifier instanceof TracedClassifier<INPUT, OUTPUT> tracedClassifier) {
-                    classifications = tracedClassifier.classify(taskResult, trace);
-                } else {
-                    classifications = classifier.classify(taskResult);
-                }
-                if (classifications == null) {
-                    classifications = List.of();
-                }
-            } catch (Exception e) {
-                classifierSpan.setStatus(StatusCode.ERROR, e.getMessage());
-                classifierSpan.recordException(e);
-                log.debug("Classifier '{}' threw exception", resolvedName, e);
-                classifierErrors.put(
-                        resolvedName, e.getMessage() == null ? e.toString() : e.getMessage());
-                return;
+            @Nullable BrainstoreTrace trace) {
+        caseListeners.forEach(cl -> cl.onClassifierStart(classifier));
+        List<Classification> classifications = List.of();
+        Exception classifierException = null;
+        try {
+            if (classifier instanceof TracedClassifier<INPUT, OUTPUT> tracedClassifier) {
+                classifications = tracedClassifier.classify(taskResult, trace);
+            } else {
+                classifications = classifier.classify(taskResult);
             }
-
-            // Group results by resolved item name (item.name, falling back to the classifier
-            // name when blank). Same map is logged to the classifier span and merged into the
-            // per-case aggregate logged on the root span.
-            Map<String, List<Map<String, Object>>> outputByName = new LinkedHashMap<>();
-            for (var item : classifications) {
-                var itemName = item.name();
-                if (itemName == null || itemName.isBlank()) {
-                    itemName = resolvedName;
-                }
-                var itemMap = toClassificationItem(item);
-                outputByName.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap);
-                caseClassifications.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap);
+            if (classifications == null) {
+                classifications = List.of();
             }
-            classifierSpan.setAttribute("braintrust.output_json", toJson(outputByName));
-        } finally {
-            classifierSpan.end();
-        }
+        } catch (Exception e) {
+            classifierException = e;
+            classifications = List.of();
+            log.debug("Classifier '{}' threw exception", classifier.getName(), e);
+        }
+        final var finalClassifications = classifications;
+        final var finalException = classifierException;
+        caseListeners.forEach(
+                cl -> cl.onClassifierEnd(classifier, finalClassifications, finalException));
     }
 
-    /**
-     * Converts a {@link Classification} to the wire-format {@code ClassificationItem}: drops {@code
-     * name}, includes {@code label} and {@code metadata} only when present.
-     */
-    private static Map<String, Object> toClassificationItem(Classification c) {
-        Map<String, Object> m = new LinkedHashMap<>();
-        m.put("id", c.id());
-        if (c.label() != null) {
-            m.put("label", c.label());
-        }
-        if (c.metadata() != null) {
-            m.put("metadata", c.metadata());
-        }
-        return m;
-    }
-
-    /** Validates and records scores on the score span and root span. */
-    private void recordScores(
-            Span scoreSpan, Span rootSpan, Scorer<INPUT, OUTPUT> scorer, List<Score> scores) {
-        if (scores == null || scores.isEmpty()) {
+    /** Validates that every score value is between 0 and 1 inclusive. Throws (aborting) if not. */
+    private void validateScores(Scorer<INPUT, OUTPUT> scorer, @Nullable List<Score> scores) {
+        if (scores == null) {
             return;
         }
-        final Map<String, Double> scorerScores = new LinkedHashMap<>();
         for (var score : scores) {
             if (score.value() < 0.0 || score.value() > 1.0) {
                 throw new RuntimeException(
                         "score must be between 0 and 1: %s : %s"
                                 .formatted(scorer.getName(), score));
             }
-            scorerScores.put(score.name(), score.value());
-        }
-        Map<String, Object> spanAttrs = new LinkedHashMap<>();
-        spanAttrs.put("type", "score");
-        spanAttrs.put("name", scorer.getName());
-        spanAttrs.put("purpose", "scorer");
-        scoreSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs));
-        var scoresJson = toJson(scorerScores);
-        scoreSpan.setAttribute("braintrust.output_json", scoresJson);
-        scoreSpan.setAttribute("braintrust.scores", scoresJson);
+        }
     }
 
     /** Creates a new eval builder. */
@@ -397,6 +321,7 @@ public static final class Builder<INPUT, OUTPUT> {
         private @Nonnull Map<String, Object> parameterValues = Map.of();
         private @Nonnull List<String> tags = List.of();
         private @Nonnull Map<String, Object> metadata = Map.of();
+        private @Nonnull List<EvalListener> listeners = new ArrayList<>();
 
         public Eval<INPUT, OUTPUT> build() {
             if (config == null) {
@@ -515,6 +440,12 @@ public Builder<INPUT, OUTPUT> tags(String... tags) {
             return this;
         }
 
+        /** Adds a listener which will be notified of eval lifecycle events. */
+        public Builder<INPUT, OUTPUT> addListener(@Nonnull EvalListener listener) {
+            this.listeners.add(Objects.requireNonNull(listener));
+            return this;
+        }
+
         /** Sets metadata for the experiment. */
         public Builder<INPUT, OUTPUT> metadata(Map<String, Object> metadata) {
             this.metadata = Map.copyOf(metadata);
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java
new file mode 100644
index 00000000..401f3167
--- /dev/null
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java
@@ -0,0 +1,43 @@
+package dev.braintrust.eval;
+
+import java.util.List;
+import javax.annotation.Nullable;
+
+/** A listener that can be attached to an eval to hook into its lifecycle events. */
+public interface EvalListener {
+    RunListener createRunListener(String experimentId);
+
+    /** A listener that receives events over the lifecycle of a single eval run. */
+    public interface RunListener {
+        void onStart(String experimentId);
+
+        CaseListener createCaseListener(DatasetCase<?, ?> datasetCase);
+
+        void onEnd();
+    }
+
+    /** A listener that receives events over the lifecycle of a single case of an eval run. */
+    public interface CaseListener {
+        void onStart();
+
+        void onTaskStart(String experimentId, DatasetCase<?, ?> datasetCase);
+
+        void onTaskEnd(String experimentId, TaskResult<?, ?> taskResult);
+
+        void onTaskError(String experimentId, DatasetCase<?, ?> datasetCase, Exception error);
+
+        void onScoreStart(Scorer<?, ?> scorer);
+
+        void onScoreEnd(
+                Scorer<?, ?> scorer, List<Score> scores, @Nullable Exception scoreException);
+
+        void onClassifierStart(Classifier<?, ?> classifier);
+
+        void onClassifierEnd(
+                Classifier<?, ?> classifier,
+                List<Classification> classifications,
+                @Nullable Exception classifierException);
+
+        void onEnd();
+    }
+}
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java
new file mode 100644
index 00000000..e1e9083e
--- /dev/null
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java
@@ -0,0 +1,381 @@
+package dev.braintrust.eval;
+
+import static dev.braintrust.json.BraintrustJsonMapper.toJson;
+
+import dev.braintrust.api.BraintrustOpenApiClient;
+import dev.braintrust.trace.BrainstoreTrace;
+import dev.braintrust.trace.BraintrustContext;
+import dev.braintrust.trace.BraintrustTracing;
+import io.opentelemetry.api.common.AttributeKey;
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.SpanKind;
+import io.opentelemetry.api.trace.StatusCode;
+import io.opentelemetry.api.trace.Tracer;
+import io.opentelemetry.context.Scope;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Built-in {@link EvalListener} that manages all OpenTelemetry spans for an eval (root {@code eval}
+ * span, {@code task} span, {@code score} spans, and {@code classifier} spans), pushing/popping the
+ * braintrust OTel context across the start/end events so user code nests correctly.
+ *
+ * <p>This listener pushes spans onto the current OTel context on start events and pops them on the
+ * matching end events. That makes it inherently thread-affine and dependent on strictly nested
+ * (LIFO) start/end ordering — which holds because evals run sequentially per case on one thread.
+ */
+@Slf4j
+final class OtelEvalListener implements EvalListener {
+    /** Attribute set to {@code experiment_id:<id>} on every span this listener starts. */
+    private static final AttributeKey<String> PARENT =
+            AttributeKey.stringKey(BraintrustTracing.PARENT_KEY);
+
+    private final @Nonnull Tracer tracer;
+    private final @Nonnull BraintrustOpenApiClient client;
+
+    /**
+     * @param tracer tracer used to start every span created by this listener
+     * @param client API client passed to {@link BrainstoreTrace#forExperiment} when a case's trace
+     *     handle is built
+     */
+    OtelEvalListener(@Nonnull Tracer tracer, @Nonnull BraintrustOpenApiClient client) {
+        this.tracer = tracer;
+        this.client = client;
+    }
+
+    /** Creates a run listener bound to {@code experimentId}. */
+    @Override
+    public OtelRunListener createRunListener(String experimentId) {
+        return new OtelRunListener(experimentId);
+    }
+
+    /** Run-scoped listener. There is currently no run-level span; it only spawns case listeners. */
+    final class OtelRunListener implements RunListener {
+        private final @Nonnull String experimentId;
+
+        private OtelRunListener(@Nonnull String experimentId) {
+            this.experimentId = experimentId;
+        }
+
+        /** No-op: no span is opened at run level. */
+        @Override
+        public void onStart(String experimentId) {}
+
+        /** Creates a case listener bound to this run's experiment id and {@code datasetCase}. */
+        @Override
+        public OtelCaseListener createCaseListener(DatasetCase<?, ?> datasetCase) {
+            return new OtelCaseListener(experimentId, datasetCase);
+        }
+
+        /** No-op: no span is opened at run level. */
+        @Override
+        public void onEnd() {}
+    }
+
+    /** Case-scoped listener owning the root/task/score/classifier spans and their scopes. */
+    final class OtelCaseListener implements CaseListener {
+        private final @Nonnull String experimentId;
+        private final @Nonnull DatasetCase<?, ?> datasetCase;
+
+        // Root "eval" span state; the ids are captured for brainstoreTrace().
+        private @Nullable Span rootSpan;
+        private @Nullable Scope rootScope;
+        private @Nullable String rootTraceId;
+        private @Nullable String taskSpanId;
+
+        private @Nullable Span taskSpan;
+        private @Nullable Scope taskScope;
+
+        // Only the most recently started score/classifier span is tracked.
+        private @Nullable Span scoreSpan;
+        private @Nullable Scope scoreScope;
+
+        // Counts classifier starts; used to synthesize a name for unnamed classifiers.
+        private int classifierIndex = 0;
+        private @Nullable Span classifierSpan;
+        private @Nullable Scope classifierScope;
+        private @Nullable String classifierName;
+
+        // Accumulated classifier results, written onto the root span at case end.
+        private final Map<String, List<Map<String, Object>>> caseClassifications =
+                new LinkedHashMap<>();
+        private final Map<String, String> classifierErrors = new LinkedHashMap<>();
+
+        private OtelCaseListener(
+                @Nonnull String experimentId, @Nonnull DatasetCase<?, ?> datasetCase) {
+            this.experimentId = experimentId;
+            this.datasetCase = datasetCase;
+        }
+
+        /**
+         * Starts the root {@code eval} span as its own trace, records the case's input, expected
+         * value, and (when present) origin, tags and metadata on it, and makes it current.
+         */
+        @Override
+        public void onStart() {
+            var span =
+                    tracer.spanBuilder("eval") // TODO: allow names for eval cases
+                            .setNoParent() // each eval case is its own trace
+                            .setSpanKind(SpanKind.CLIENT)
+                            .setAttribute(PARENT, "experiment_id:" + experimentId)
+                            .setAttribute(
+                                    "braintrust.span_attributes", toJson(Map.of("type", "eval")))
+                            .setAttribute(
+                                    "braintrust.input_json",
+                                    // null-tolerant: Map.of would throw on a null input
+                                    toJsonEntry("input", datasetCase.input()))
+                            .setAttribute("braintrust.expected", toJson(datasetCase.expected()))
+                            .startSpan();
+            datasetCase
+                    .origin()
+                    .ifPresent(origin -> span.setAttribute("braintrust.origin", toJson(origin)));
+            if (!datasetCase.tags().isEmpty()) {
+                span.setAttribute(
+                        AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags());
+            }
+            if (!datasetCase.metadata().isEmpty()) {
+                span.setAttribute(
+                        AttributeKey.stringKey("braintrust.metadata"),
+                        toJson(datasetCase.metadata()));
+            }
+            this.rootSpan = span;
+            this.rootTraceId = span.getSpanContext().getTraceId();
+            this.rootScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent();
+        }
+
+        /**
+         * Starts the {@code task} span and makes it current. The arguments are not used; the
+         * experiment id captured at construction is used instead.
+         */
+        @Override
+        public void onTaskStart(String experimentId, DatasetCase<?, ?> datasetCase) {
+            var span =
+                    tracer.spanBuilder("task")
+                            .setAttribute(PARENT, "experiment_id:" + this.experimentId)
+                            .setAttribute(
+                                    "braintrust.span_attributes", toJson(Map.of("type", "task")))
+                            .startSpan();
+            this.taskSpan = span;
+            this.taskSpanId = span.getSpanContext().getSpanId();
+            this.taskScope = BraintrustContext.ofExperiment(this.experimentId, span).makeCurrent();
+        }
+
+        /**
+         * Records the task output on the root span, then pops the task scope and ends the task
+         * span.
+         *
+         * <p>A {@code null} task result is recorded as {@code {"output": null}} (the same shape
+         * {@link #onTaskError} writes) instead of failing inside {@code Map.of}.
+         *
+         * @throws IllegalStateException if the task or root span has not been started
+         */
+        @Override
+        public void onTaskEnd(String experimentId, TaskResult<?, ?> taskResult) {
+            var task = requireTask();
+            try {
+                requireRoot()
+                        .setAttribute(
+                                "braintrust.output_json",
+                                toJsonEntry("output", taskResult.result()));
+            } finally {
+                // Always pop the task context and end the span, even if recording the output
+                // fails; otherwise the task scope would stay current on this thread.
+                closeTaskScope();
+                task.end();
+            }
+        }
+
+        /**
+         * Marks the task span as failed and ends it, then marks the root span as failed with a
+         * {@code null} output.
+         *
+         * @throws IllegalStateException if the task or root span has not been started
+         */
+        @Override
+        public void onTaskError(
+                String experimentId, DatasetCase<?, ?> datasetCase, Exception error) {
+            var task = requireTask();
+            task.setStatus(StatusCode.ERROR, error.getMessage());
+            task.recordException(error);
+            closeTaskScope();
+            task.end();
+
+            var root = requireRoot();
+            root.setStatus(StatusCode.ERROR, error.getMessage());
+            root.setAttribute("braintrust.output_json", toJsonEntry("output", null));
+        }
+
+        /** Starts a {@code score} span and makes it current. */
+        @Override
+        public void onScoreStart(Scorer<?, ?> scorer) {
+            var span =
+                    tracer.spanBuilder("score")
+                            .setAttribute(PARENT, "experiment_id:" + experimentId)
+                            .startSpan();
+            this.scoreSpan = span;
+            this.scoreScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent();
+        }
+
+        /**
+         * Records the scorer outcome on the current score span (error status when {@code
+         * scoreException} is non-null, plus any scores), then pops the scope and ends the span.
+         *
+         * @throws IllegalStateException if no score span or root span has been started
+         */
+        @Override
+        public void onScoreEnd(
+                Scorer<?, ?> scorer, List<Score> scores, @Nullable Exception scoreException) {
+            var span = requireScore();
+            try {
+                if (scoreException != null) {
+                    span.setStatus(StatusCode.ERROR, scoreException.getMessage());
+                    span.recordException(scoreException);
+                }
+                recordScores(span, requireRoot(), scorer, scores);
+            } finally {
+                closeScoreScope();
+                span.end();
+            }
+        }
+
+        /**
+         * Starts a classifier span named after the classifier and makes it current. A null or
+         * blank classifier name is replaced by {@code classifier_<index>}, where the index counts
+         * every classifier started for this case.
+         */
+        @Override
+        public void onClassifierStart(Classifier<?, ?> classifier) {
+            var resolvedName = classifier.getName();
+            if (resolvedName == null || resolvedName.isBlank()) {
+                resolvedName = "classifier_" + classifierIndex;
+            }
+            classifierIndex++;
+            this.classifierName = resolvedName;
+
+            var span =
+                    tracer.spanBuilder(resolvedName)
+                            .setAttribute(PARENT, "experiment_id:" + experimentId)
+                            .startSpan();
+            Map<String, Object> spanAttrs = new LinkedHashMap<>();
+            spanAttrs.put("type", "classifier");
+            spanAttrs.put("name", resolvedName);
+            spanAttrs.put("purpose", "scorer");
+            span.setAttribute("braintrust.span_attributes", toJson(spanAttrs));
+
+            this.classifierSpan = span;
+            this.classifierScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent();
+        }
+
+        /**
+         * Records the classifier outcome, then pops the scope and ends the classifier span.
+         *
+         * <p>On failure the span is marked as an error and the message is remembered for the
+         * root span's metadata; no output is written. On success the classifications are grouped
+         * by name, written to the classifier span, and accumulated for the root span.
+         *
+         * @throws IllegalStateException if no classifier span has been started
+         */
+        @Override
+        public void onClassifierEnd(
+                Classifier<?, ?> classifier,
+                List<Classification> classifications,
+                @Nullable Exception classifierException) {
+            var span = requireClassifier();
+            var resolvedName = classifierName;
+            try {
+                if (classifierException != null) {
+                    span.setStatus(StatusCode.ERROR, classifierException.getMessage());
+                    span.recordException(classifierException);
+                    classifierErrors.put(
+                            resolvedName,
+                            classifierException.getMessage() == null
+                                    ? classifierException.toString()
+                                    : classifierException.getMessage());
+                    return;
+                }
+
+                // Group results by resolved item name (item.name, falling back to the classifier
+                // name when blank). Same map is logged to the classifier span and merged into the
+                // per-case aggregate logged on the root span.
+                Map<String, List<Map<String, Object>>> outputByName = new LinkedHashMap<>();
+                for (var item : classifications) {
+                    var itemName = item.name();
+                    if (itemName == null || itemName.isBlank()) {
+                        itemName = resolvedName;
+                    }
+                    var itemMap = toClassificationItem(item);
+                    outputByName.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap);
+                    caseClassifications
+                            .computeIfAbsent(itemName, k -> new ArrayList<>())
+                            .add(itemMap);
+                }
+                span.setAttribute("braintrust.output_json", toJson(outputByName));
+            } finally {
+                closeClassifierScope();
+                span.end();
+            }
+        }
+
+        /**
+         * Writes the accumulated classifications and classifier errors onto the root span, then
+         * pops the root scope and ends the root span.
+         *
+         * @throws IllegalStateException if the root span has not been started
+         */
+        @Override
+        public void onEnd() {
+            var root = requireRoot();
+            try {
+                if (!caseClassifications.isEmpty()) {
+                    root.setAttribute("braintrust.classifications", toJson(caseClassifications));
+                }
+                if (!classifierErrors.isEmpty()) {
+                    // Re-emit the case metadata with the errors merged in; this replaces the
+                    // metadata attribute written in onStart.
+                    Map<String, Object> mergedMetadata =
+                            new LinkedHashMap<>(datasetCase.metadata());
+                    mergedMetadata.put("classifier_errors", classifierErrors);
+                    root.setAttribute(
+                            AttributeKey.stringKey("braintrust.metadata"), toJson(mergedMetadata));
+                }
+            } finally {
+                closeRootScope();
+                root.end();
+            }
+        }
+
+        /**
+         * Builds the {@link BrainstoreTrace} for this case from the root trace id and the task span
+         * id. Both ids are captured when the spans start, so {@link #onStart} and {@link
+         * #onTaskStart} must have run; the intended call site is after {@link #onTaskEnd}.
+         *
+         * @throws IllegalStateException if the root trace id or task span id is not yet known
+         */
+        BrainstoreTrace brainstoreTrace() {
+            return BrainstoreTrace.forExperiment(
+                    client,
+                    experimentId,
+                    requireNonNullState(rootTraceId, "rootTraceId"),
+                    List.of(requireNonNullState(taskSpanId, "taskSpanId")));
+        }
+
+        private Span requireRoot() {
+            return requireNonNullState(rootSpan, "rootSpan");
+        }
+
+        private Span requireTask() {
+            return requireNonNullState(taskSpan, "taskSpan");
+        }
+
+        private Span requireScore() {
+            return requireNonNullState(scoreSpan, "scoreSpan");
+        }
+
+        private Span requireClassifier() {
+            return requireNonNullState(classifierSpan, "classifierSpan");
+        }
+
+        // The close helpers are idempotent: each scope is closed at most once and then cleared.
+
+        private void closeRootScope() {
+            if (rootScope != null) {
+                rootScope.close();
+                rootScope = null;
+            }
+        }
+
+        private void closeTaskScope() {
+            if (taskScope != null) {
+                taskScope.close();
+                taskScope = null;
+            }
+        }
+
+        private void closeScoreScope() {
+            if (scoreScope != null) {
+                scoreScope.close();
+                scoreScope = null;
+            }
+        }
+
+        private void closeClassifierScope() {
+            if (classifierScope != null) {
+                classifierScope.close();
+                classifierScope = null;
+            }
+        }
+    }
+
+    /**
+     * Records scores onto the score span: the span attributes ({@code type}, scorer name, {@code
+     * purpose}) plus the name-to-value score map as both output and scores. Does nothing when
+     * {@code scores} is null or empty. Validation is the caller's job.
+     *
+     * <p>NOTE(review): {@code rootSpan} is accepted but nothing is written to it here — confirm
+     * whether scores were meant to be mirrored onto the root span.
+     */
+    private static void recordScores(
+            Span scoreSpan, Span rootSpan, Scorer<?, ?> scorer, List<Score> scores) {
+        if (scores == null || scores.isEmpty()) {
+            return;
+        }
+        final Map<String, Double> scorerScores = new LinkedHashMap<>();
+        for (var score : scores) {
+            scorerScores.put(score.name(), score.value());
+        }
+        Map<String, Object> spanAttrs = new LinkedHashMap<>();
+        spanAttrs.put("type", "score");
+        spanAttrs.put("name", scorer.getName());
+        spanAttrs.put("purpose", "scorer");
+        scoreSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs));
+        var scoresJson = toJson(scorerScores);
+        scoreSpan.setAttribute("braintrust.output_json", scoresJson);
+        scoreSpan.setAttribute("braintrust.scores", scoresJson);
+    }
+
+    /**
+     * Converts a {@link Classification} to the wire-format {@code ClassificationItem}: drops {@code
+     * name}, includes {@code label} and {@code metadata} only when present.
+     */
+    private static Map<String, Object> toClassificationItem(Classification c) {
+        Map<String, Object> m = new LinkedHashMap<>();
+        m.put("id", c.id());
+        if (c.label() != null) {
+            m.put("label", c.label());
+        }
+        if (c.metadata() != null) {
+            m.put("metadata", c.metadata());
+        }
+        return m;
+    }
+
+    /**
+     * Serializes the single-entry object {@code {key: value}} to JSON. Unlike {@code Map.of}, a
+     * {@code null} value is allowed and is passed through to the serializer.
+     */
+    private static String toJsonEntry(String key, @Nullable Object value) {
+        Map<String, Object> entry = new LinkedHashMap<>();
+        entry.put(key, value);
+        return toJson(entry);
+    }
+
+    /**
+     * Returns {@code value}, or throws if it is {@code null} because the corresponding start event
+     * has not happened yet.
+     *
+     * @throws IllegalStateException if {@code value} is {@code null}
+     */
+    private static <T> T requireNonNullState(@Nullable T value, String name) {
+        if (value == null) {
+            throw new IllegalStateException("OtelEvalListener: " + name + " accessed out of order");
+        }
+        return value;
+    }
+}
diff --git a/braintrust-sdk/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java b/braintrust-sdk/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java
index 40227f08..17d712a1 100644
--- a/braintrust-sdk/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java
+++ b/braintrust-sdk/src/test/java/dev/braintrust/eval/DatasetBrainstoreImplTest.java
@@ -265,6 +265,47 @@ void testMetadataPopulatedFromDatasetRow() {
         assertEquals("user123", metadata.get("userId"));
     }
 
+    /** Checks that the {@code tags} array of a fetched dataset row shows up on the dataset case. */
+    @Test
+    void testTagsPopulatedFromDatasetRow() {
+        // Fetch response with a single row that carries exactly one tag.
+        String fetchResponseBody =
+                """
+                {
+                  "events": [
+                    {
+                      "object_type": "dataset",
+                      "dataset_id": "%s",
+                      "id": "meta-row-1",
+                      "_xact_id": "1",
+                      "created": "2024-01-01T00:00:00Z",
+                      "input": "test input",
+                      "expected": "test output",
+                      "tags": ["unit-test"]
+                    }
+                  ],
+                  "cursor": null
+                }
+                """
+                        .formatted(datasetId);
+        var fetchRequest = post(urlEqualTo("/v1/dataset/" + datasetId + "/fetch"));
+        var okJsonResponse =
+                aResponse()
+                        .withStatus(200)
+                        .withHeader("Content-Type", "application/json")
+                        .withBody(fetchResponseBody);
+        wireMock.stubFor(fetchRequest.willReturn(okJsonResponse));
+
+        DatasetBrainstoreImpl<String, String> taggedDataset =
+                new DatasetBrainstoreImpl<>(apiClient, datasetId, "test-version");
+        List<DatasetCase<String, String>> fetchedCases = new ArrayList<>();
+        taggedDataset.forEach(fetchedCases::add);
+
+        // Exactly one case comes back, and it carries exactly the one tag from the row.
+        assertEquals(1, fetchedCases.size());
+        List<String> rowTags = fetchedCases.get(0).tags();
+        assertFalse(rowTags.isEmpty(), "tags should not be empty");
+        assertEquals(1, rowTags.size());
+        assertEquals("unit-test", rowTags.get(0));
+    }
+
     @Test
     void testFetchFromBraintrustNotFound() {
         String projectName = "test-project";

From e3730195d6eb55486643da9d34a58dd194535ffb Mon Sep 17 00:00:00 2001
From: Andrew Kent <andrew@braintrustdata.com>
Date: Fri, 26 Jun 2026 16:30:56 -0600
Subject: [PATCH 2/3] wip

---
 braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
index 4483f062..a20f7861 100644
--- a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
@@ -308,7 +308,7 @@ public static <INPUT, OUTPUT> Builder<INPUT, OUTPUT> builder() {
 
     /** Builder for creating evaluations with fluent API. */
     public static final class Builder<INPUT, OUTPUT> {
-        public @Nonnull Dataset<INPUT, OUTPUT> dataset;
+        private @Nonnull Dataset<INPUT, OUTPUT> dataset;
         private @Nonnull String experimentName = "unnamed-java-eval";
         private @Nullable BraintrustConfig config;
         private @Nullable BraintrustOpenApiClient apiClient;

From 1ba64efe554ba4f44b38d9d1ac191f0254493fa0 Mon Sep 17 00:00:00 2001
From: Andrew Kent <andrew@braintrustdata.com>
Date: Fri, 26 Jun 2026 17:06:41 -0600
Subject: [PATCH 3/3] wip 50/50 otel

---
 .../dev/braintrust/devserver/Devserver.java   | 461 ++++--------------
 .../devserver/PlaygroundSpanDecorator.java    | 151 ++++++
 .../main/java/dev/braintrust/eval/Eval.java   | 290 ++++++-----
 .../dev/braintrust/eval/EvalListener.java     |  76 ++-
 .../java/dev/braintrust/eval/EvalRunInfo.java |  24 +
 .../braintrust/eval/EvalSpanDecorator.java    | 211 ++++++++
 .../braintrust/eval/EvalTargetProvider.java   |  33 ++
 .../eval/ExperimentTargetProvider.java        |  46 ++
 .../dev/braintrust/eval/OtelEvalListener.java | 381 ---------------
 .../braintrust/devserver/DevserverTest.java   |   8 +-
 10 files changed, 812 insertions(+), 869 deletions(-)
 create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/devserver/PlaygroundSpanDecorator.java
 create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/EvalRunInfo.java
 create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/EvalSpanDecorator.java
 create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/EvalTargetProvider.java
 create mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/ExperimentTargetProvider.java
 delete mode 100644 braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java

diff --git a/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java b/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java
index f83024af..d7f2906a 100644
--- a/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java
+++ b/braintrust-sdk/src/main/java/dev/braintrust/devserver/Devserver.java
@@ -13,14 +13,7 @@
 import dev.braintrust.api.BraintrustOpenApiClient;
 import dev.braintrust.config.BraintrustConfig;
 import dev.braintrust.eval.*;
-import dev.braintrust.trace.BraintrustContext;
-import dev.braintrust.trace.BraintrustTracing;
-import io.opentelemetry.api.common.AttributeKey;
 import io.opentelemetry.api.trace.Span;
-import io.opentelemetry.api.trace.SpanKind;
-import io.opentelemetry.api.trace.StatusCode;
-import io.opentelemetry.api.trace.Tracer;
-import io.opentelemetry.context.Context;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
@@ -69,9 +62,6 @@ public class Devserver {
     private static final String EXPOSED_HEADERS =
             "x-bt-cursor, x-bt-found-existing-experiment, x-bt-span-id, x-bt-span-export";
 
-    private static final AttributeKey<String> PARENT =
-            AttributeKey.stringKey(BraintrustTracing.PARENT_KEY);
-
     private final List<String> corsOriginWhitelist;
     private final BraintrustConfig config;
 
@@ -345,12 +335,12 @@ private void handleEval(HttpExchange exchange) throws IOException {
     }
 
     @SuppressWarnings({"unchecked", "rawtypes"})
-    private <I, O> void handleStreamingEval(
+    private void handleStreamingEval(
             HttpExchange exchange,
             RemoteEval eval,
             EvalRequest request,
             RequestContext context,
-            List<Scorer<I, O>> remoteScorers)
+            List<Scorer<Object, Object>> remoteScorers)
             throws Exception {
         // Set SSE headers
         exchange.getResponseHeaders().set("Content-Type", "text/event-stream");
@@ -377,174 +367,44 @@ private <I, O> void handleStreamingEval(
                         BraintrustUtils.createProjectURI(
                                         braintrust.config().appUrl(), orgName, projectName)
                                 .toASCIIString();
-                final var experimentUrl = projectUrl + "/experiments/" + experimentName;
-
-                var tracer = BraintrustTracing.getTracer();
 
-                // Merge parameters: evaluator defaults + request overrides
-                final Parameters mergedParameters =
-                        new Parameters(
-                                eval.getParameters(),
-                                null == request.getParameters()
-                                        ? Map.of()
-                                        : request.getParameters());
+                // Combine local scorers from RemoteEval with remote scorers from the request
+                List<Scorer<Object, Object>> allScorers = new ArrayList<>(eval.getScorers());
+                allScorers.addAll(remoteScorers);
 
-                // Execute task and scorers for each case
-                final Map<String, List<Double>> scoresByName = new ConcurrentHashMap<>();
                 final var parentInfo = extractParentInfo(request);
-                final var braintrustParent = parentInfo.braintrustParent();
-                final var braintrustGeneration = parentInfo.generation();
-
-                // NOTE: this code is serial but written in a thread-safe manner to support
-                // concurrent dataset fetching and eval execution
-                extractDataset(request, apiClient)
-                        .forEach(
-                                rawDataset -> {
-                                    final DatasetCase<I, O> datasetCase =
-                                            (DatasetCase<I, O>) rawDataset;
-                                    var evalSpan =
-                                            tracer.spanBuilder("eval")
-                                                    .setNoParent()
-                                                    .setSpanKind(SpanKind.CLIENT)
-                                                    .setAttribute(
-                                                            PARENT,
-                                                            braintrustParent.toParentValue())
-                                                    .startSpan();
-                                    Context evalContext = Context.current().with(evalSpan);
-                                    evalContext =
-                                            BraintrustContext.setParentInBaggage(
-                                                    evalContext,
-                                                    braintrustParent.type(),
-                                                    braintrustParent.id());
-                                    // Make the eval context (with span and baggage) current
-                                    try (var rootScope = evalContext.makeCurrent()) {
-                                        final TaskResult<I, O> taskResult;
-                                        { // run task
-                                            var taskSpan = tracer.spanBuilder("task").startSpan();
-                                            try (var unused =
-                                                    Context.current()
-                                                            .with(taskSpan)
-                                                            .makeCurrent()) {
-                                                var task = eval.getTask();
-                                                try {
-                                                    taskResult =
-                                                            task.apply(
-                                                                    datasetCase, mergedParameters);
-                                                } catch (Exception e) {
-                                                    taskSpan.setStatus(
-                                                            StatusCode.ERROR, e.getMessage());
-                                                    taskSpan.recordException(e);
-                                                    taskSpan.end();
-                                                    evalSpan.setStatus(
-                                                            StatusCode.ERROR, e.getMessage());
-                                                    evalSpan.setAttribute(
-                                                            "braintrust.output_json",
-                                                            toJson(
-                                                                    Collections.singletonMap(
-                                                                            "output", null)));
-                                                    log.debug(
-                                                            "Task threw exception for input: "
-                                                                    + datasetCase.input(),
-                                                            e);
-                                                    // Set eval span attributes so Braintrust can
-                                                    // resolve the trace
-                                                    setEvalSpanAttributesForError(
-                                                            evalSpan,
-                                                            braintrustParent,
-                                                            braintrustGeneration,
-                                                            datasetCase);
-                                                    // Send progress event even on error so the
-                                                    // Playground can link to the trace
-                                                    sendProgressEvent(
-                                                            os,
-                                                            evalSpan.getSpanContext().getSpanId(),
-                                                            datasetCase.origin(),
-                                                            eval.getName(),
-                                                            null);
-                                                    // run scoreForTaskException on each scorer
-                                                    List<Scorer<I, O>> allScorersForError =
-                                                            new ArrayList<>(eval.getScorers());
-                                                    allScorersForError.addAll(remoteScorers);
-                                                    for (var scorer : allScorersForError) {
-                                                        runScoreForTaskException(
-                                                                tracer,
-                                                                evalSpan,
-                                                                braintrustParent,
-                                                                braintrustGeneration,
-                                                                scorer,
-                                                                e,
-                                                                datasetCase,
-                                                                scoresByName);
-                                                    }
-                                                    return;
-                                                }
-                                                // Send progress event for task completion
-                                                sendProgressEvent(
-                                                        os,
-                                                        evalSpan.getSpanContext().getSpanId(),
-                                                        datasetCase.origin(),
-                                                        eval.getName(),
-                                                        taskResult.result());
-                                                setTaskSpanAttributes(
-                                                        taskSpan,
-                                                        braintrustParent,
-                                                        braintrustGeneration,
-                                                        datasetCase,
-                                                        taskResult);
-                                            } finally {
-                                                taskSpan.end();
-                                            }
-                                            // setting eval span attributes here because we need the
-                                            // task output
-                                            setEvalSpanAttributes(
-                                                    evalSpan,
-                                                    braintrustParent,
-                                                    braintrustGeneration,
-                                                    datasetCase,
-                                                    taskResult);
-                                        }
-                                        // run scorers - one score span per scorer
-                                        // Combine local scorers from RemoteEval with remote scorers
-                                        // from request
-                                        List<Scorer<I, O>> allScorers =
-                                                new ArrayList<>(eval.getScorers());
-                                        allScorers.addAll(remoteScorers);
-                                        for (var scorer : allScorers) {
-                                            runScorer(
-                                                    tracer,
-                                                    evalSpan,
-                                                    braintrustParent,
-                                                    braintrustGeneration,
-                                                    scorer,
-                                                    taskResult,
-                                                    scoresByName);
-                                        }
-                                    } catch (IOException e) {
-                                        throw new RuntimeException(
-                                                "Failed to send progress event", e);
-                                    } finally {
-                                        evalSpan.end();
-                                    }
-                                });
-
-                // Aggregate scores
-                Map<String, EvalResponse.ScoreSummary> scoreSummaries = new LinkedHashMap<>();
-                for (Map.Entry<String, List<Double>> entry : scoresByName.entrySet()) {
-                    String scoreName = entry.getKey();
-                    List<Double> values = entry.getValue();
-
-                    double avgScore =
-                            values.stream().mapToDouble(Double::doubleValue).average().orElse(0.0);
-
-                    scoreSummaries.put(
-                            scoreName,
-                            EvalResponse.ScoreSummary.builder()
-                                    .name(scoreName)
-                                    .score(avgScore)
-                                    .improvements(0)
-                                    .regressions(0)
-                                    .build());
-                }
+                // The playground targets a playground_id parent (no experiment is created) and
+                // weaves the request's generation into span attributes.
+                EvalTargetProvider playgroundTarget =
+                        ctx ->
+                                new EvalRunInfo(
+                                        parentInfo.braintrustParent(),
+                                        parentInfo.generation(),
+                                        null,
+                                        null,
+                                        false);
+
+                var sseListener = new SseEvalListener(os, eval.getName());
+
+                Eval.<Object, Object>builder()
+                        .name(experimentName)
+                        .config(braintrust.config())
+                        .apiClient(apiClient)
+                        .projectId(projectId)
+                        .dataset((Dataset<Object, Object>) extractDataset(request, apiClient))
+                        .task((Task<Object, Object>) eval.getTask())
+                        .scorers(allScorers.toArray(new Scorer[0]))
+                        .parameters(eval.getParameters())
+                        .parameterValues(
+                                request.getParameters() == null
+                                        ? Map.of()
+                                        : request.getParameters())
+                        .evalTargetProvider(playgroundTarget)
+                        .clearListeners()
+                        .addListener(new PlaygroundSpanDecorator())
+                        .addListener(sseListener)
+                        .build()
+                        .run();
 
                 sendSummaryEvent(
                         os,
@@ -552,8 +412,8 @@ private <I, O> void handleStreamingEval(
                         projectId,
                         experimentName,
                         projectUrl,
-                        experimentUrl,
-                        scoreSummaries);
+                        null,
+                        sseListener.scoreSummaries());
                 sendDoneEvent(os);
             } catch (Exception e) {
                 // Send error event via SSE
@@ -577,194 +437,113 @@ private <I, O> void handleStreamingEval(
         }
     }
 
-    private void setEvalSpanAttributes(
-            Span evalSpan,
-            BraintrustUtils.Parent braintrustParent,
-            String braintrustGeneration,
-            DatasetCase<?, ?> datasetCase,
-            TaskResult<?, ?> taskResult) {
-        var spanAttrs = new LinkedHashMap<>();
-        spanAttrs.put("type", "eval");
-        spanAttrs.put("name", "eval");
-        if (braintrustGeneration != null) {
-            spanAttrs.put("generation", braintrustGeneration);
-        }
-        evalSpan.setAttribute(PARENT, braintrustParent.toParentValue())
-                .setAttribute("braintrust.span_attributes", toJson(spanAttrs))
-                .setAttribute("braintrust.input_json", toJson(Map.of("input", datasetCase.input())))
-                .setAttribute("braintrust.expected_json", toJson(datasetCase.expected()));
-
-        if (datasetCase.origin().isPresent()) {
-            evalSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get()));
-        }
-        if (!datasetCase.tags().isEmpty()) {
-            evalSpan.setAttribute(
-                    AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags());
-        }
-        if (!datasetCase.metadata().isEmpty()) {
-            evalSpan.setAttribute("braintrust.metadata", toJson(datasetCase.metadata()));
-        }
-        evalSpan.setAttribute(
-                "braintrust.output_json", toJson(Map.of("output", taskResult.result())));
-    }
-
     /**
-     * Sets eval span attributes when the task threw an exception. Similar to {@link
-     * #setEvalSpanAttributes} but does not require a TaskResult.
+     * An {@link EvalListener} that streams playground SSE {@code progress} events (one per case,
+     * including on task error) and accumulates per-scorer averages for the {@code summary} event.
+     * Span decoration is handled separately by {@link PlaygroundSpanDecorator}.
      */
-    private void setEvalSpanAttributesForError(
-            Span evalSpan,
-            BraintrustUtils.Parent braintrustParent,
-            String braintrustGeneration,
-            DatasetCase<?, ?> datasetCase) {
-        var spanAttrs = new LinkedHashMap<>();
-        spanAttrs.put("type", "eval");
-        spanAttrs.put("name", "eval");
-        if (braintrustGeneration != null) {
-            spanAttrs.put("generation", braintrustGeneration);
-        }
-        evalSpan.setAttribute(PARENT, braintrustParent.toParentValue())
-                .setAttribute("braintrust.span_attributes", toJson(spanAttrs))
-                .setAttribute("braintrust.input_json", toJson(Map.of("input", datasetCase.input())))
-                .setAttribute("braintrust.expected_json", toJson(datasetCase.expected()));
-
-        if (datasetCase.origin().isPresent()) {
-            evalSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get()));
-        }
-        if (!datasetCase.tags().isEmpty()) {
-            evalSpan.setAttribute(
-                    AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags());
+    private final class SseEvalListener implements EvalListener {
+        private final OutputStream os;
+        private final String evalName;
+        // Score values grouped by score name. Appends go through compute() in onScoreResult so a
+        // per-name list is only ever mutated inside the map's atomic per-key update.
+        private final Map<String, List<Double>> scoresByName = new ConcurrentHashMap<>();
+
+        SseEvalListener(OutputStream os, String evalName) {
+            this.os = os;
+            this.evalName = evalName;
         }
-        if (!datasetCase.metadata().isEmpty()) {
-            evalSpan.setAttribute("braintrust.metadata", toJson(datasetCase.metadata()));
-        }
-    }
 
-    private void setTaskSpanAttributes(
-            Span taskSpan,
-            BraintrustUtils.Parent braintrustParent,
-            String braintrustGeneration,
-            DatasetCase<?, ?> datasetCase,
-            TaskResult<?, ?> taskResult) {
-        Map<String, Object> taskSpanAttrs = new LinkedHashMap<>();
-        taskSpanAttrs.put("type", "task");
-        taskSpanAttrs.put("name", "task");
-        if (braintrustGeneration != null) {
-            taskSpanAttrs.put("generation", braintrustGeneration);
+        /**
+         * Returns one {@link EvalResponse.ScoreSummary} per score name: the mean of the collected
+         * values ({@code 0.0} if there are none) with zero improvements and regressions.
+         */
+        Map<String, EvalResponse.ScoreSummary> scoreSummaries() {
+            Map<String, EvalResponse.ScoreSummary> scoreSummaries = new LinkedHashMap<>();
+            for (var entry : scoresByName.entrySet()) {
+                double avgScore =
+                        entry.getValue().stream()
+                                .mapToDouble(Double::doubleValue)
+                                .average()
+                                .orElse(0.0);
+                scoreSummaries.put(
+                        entry.getKey(),
+                        EvalResponse.ScoreSummary.builder()
+                                .name(entry.getKey())
+                                .score(avgScore)
+                                .improvements(0)
+                                .regressions(0)
+                                .build());
+            }
+            return scoreSummaries;
         }
 
-        taskSpan.setAttribute(PARENT, braintrustParent.toParentValue())
-                .setAttribute("braintrust.span_attributes", toJson(taskSpanAttrs))
-                .setAttribute("braintrust.input_json", toJson(Map.of("input", datasetCase.input())))
-                .setAttribute(
-                        "braintrust.output_json", toJson(Map.of("output", taskResult.result())));
-    }
-
-    private void setScoreSpanAttributes(
-            Span scoreSpan,
-            BraintrustUtils.Parent braintrustParent,
-            String braintrustGeneration,
-            String scorerName,
-            Map<String, Double> scorerScores) {
-        Map<String, Object> scoreSpanAttrs = new LinkedHashMap<>();
-        scoreSpanAttrs.put("type", "score");
-        scoreSpanAttrs.put("name", scorerName);
-        scoreSpanAttrs.put("purpose", "scorer");
-        if (braintrustGeneration != null) {
-            scoreSpanAttrs.put("generation", braintrustGeneration);
+        /** Hands every dataset case a fresh {@link SseCaseListener}; {@code info} is unused. */
+        @Override
+        public RunListener createRunListener(EvalRunInfo info) {
+            return datasetCase -> new SseCaseListener();
         }
 
-        var scoresJson = toJson(scorerScores);
-        scoreSpan
-                .setAttribute(PARENT, braintrustParent.toParentValue())
-                .setAttribute("braintrust.span_attributes", toJson(scoreSpanAttrs))
-                .setAttribute("braintrust.output_json", scoresJson)
-                .setAttribute("braintrust.scores", scoresJson);
-    }
+        /** Per-case callbacks: sends the progress event and collects the case's scores. */
+        private final class SseCaseListener implements CaseListener {
+            /** Reports the task result as this case's progress event. */
+            @Override
+            public void onTaskSuccess(Span rootSpan, Span taskSpan, TaskResult<?, ?> taskResult) {
+                sendProgress(rootSpan, taskResult.datasetCase(), taskResult.result());
+            }
 
-    /**
-     * Runs a scorer against a successful task result. If the scorer throws, falls back to {@link
-     * Scorer#scoreForScorerException}.
-     */
-    private <I, O> void runScorer(
-            Tracer tracer,
-            Span evalSpan,
-            BraintrustUtils.Parent braintrustParent,
-            String braintrustGeneration,
-            Scorer<I, O> scorer,
-            TaskResult<I, O> taskResult,
-            Map<String, List<Double>> scoresByName) {
-        var scoreSpan = tracer.spanBuilder("score").startSpan();
-        try (var unused = Context.current().with(scoreSpan).makeCurrent()) {
-            List<Score> scores;
-            try {
-                scores = scorer.score(taskResult);
-            } catch (Exception e) {
-                scoreSpan.setStatus(StatusCode.ERROR, e.getMessage());
-                scoreSpan.recordException(e);
-                log.debug("Scorer '{}' threw exception", scorer.getName(), e);
-                // fall back to scoreForScorerException — if this throws, eval aborts
-                scores = scorer.scoreForScorerException(e, taskResult);
+            @Override
+            public void onTaskError(
+                    Span rootSpan, Span taskSpan, DatasetCase<?, ?> datasetCase, Exception error) {
+                // Send progress even on error so the Playground can link to the trace.
+                sendProgress(rootSpan, datasetCase, null);
             }
-            recordScores(
-                    scoreSpan,
-                    braintrustParent,
-                    braintrustGeneration,
-                    scorer,
-                    scores,
-                    scoresByName);
-        } finally {
-            scoreSpan.end();
-        }
-    }
 
-    /**
-     * Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback throws, the
-     * eval aborts.
-     */
-    private <I, O> void runScoreForTaskException(
-            Tracer tracer,
-            Span evalSpan,
-            BraintrustUtils.Parent braintrustParent,
-            String braintrustGeneration,
-            Scorer<I, O> scorer,
-            Exception taskException,
-            DatasetCase<I, O> datasetCase,
-            Map<String, List<Double>> scoresByName) {
-        var scoreSpan = tracer.spanBuilder("score").startSpan();
-        try (var unused = Context.current().with(scoreSpan).makeCurrent()) {
-            // if this throws, it propagates and the eval aborts
-            var scores = scorer.scoreForTaskException(taskException, datasetCase);
-            recordScores(
-                    scoreSpan,
-                    braintrustParent,
-                    braintrustGeneration,
-                    scorer,
-                    scores,
-                    scoresByName);
-        } finally {
-            scoreSpan.end();
-        }
-    }
+            /**
+             * Files each reported score under its name for the summary averages. A {@code null}
+             * score list is ignored, as in {@link PlaygroundSpanDecorator}.
+             */
+            @Override
+            public void onScoreResult(
+                    Span scoreSpan,
+                    Span rootSpan,
+                    Scorer<?, ?> scorer,
+                    List<Score> scores,
+                    @Nullable Exception scoreException) {
+                if (scores == null) {
+                    return;
+                }
+                for (var score : scores) {
+                    // Append inside compute(): ConcurrentHashMap runs the remapping function
+                    // atomically per key, so the plain ArrayList is never mutated concurrently.
+                    scoresByName.compute(
+                            score.name(),
+                            (name, values) -> {
+                                List<Double> list = values == null ? new ArrayList<>() : values;
+                                list.add(score.value());
+                                return list;
+                            });
+                }
+            }
 
-    /** Records scores on the score span and accumulates them into scoresByName. */
-    private void recordScores(
-            Span scoreSpan,
-            BraintrustUtils.Parent braintrustParent,
-            String braintrustGeneration,
-            Scorer<?, ?> scorer,
-            List<Score> scores,
-            Map<String, List<Double>> scoresByName) {
-        if (scores == null || scores.isEmpty()) {
-            return;
-        }
-        Map<String, Double> scorerScores = new LinkedHashMap<>();
-        for (Score score : scores) {
-            scoresByName.computeIfAbsent(score.name(), k -> new ArrayList<>()).add(score.value());
-            scorerScores.put(score.name(), score.value());
+            /**
+             * Sends one progress event for the case, identified by the root span's id. An {@link
+             * IOException} raised while sending is rethrown as a {@link RuntimeException}.
+             */
+            private void sendProgress(
+                    Span rootSpan, DatasetCase<?, ?> datasetCase, @Nullable Object output) {
+                try {
+                    sendProgressEvent(
+                            os,
+                            rootSpan.getSpanContext().getSpanId(),
+                            datasetCase.origin(),
+                            evalName,
+                            output);
+                } catch (IOException e) {
+                    throw new RuntimeException("Failed to send progress event", e);
+                }
+            }
         }
-        setScoreSpanAttributes(
-                scoreSpan, braintrustParent, braintrustGeneration, scorer.getName(), scorerScores);
     }
 
     private void sendSSEEvent(OutputStream os, String eventType, String data) throws IOException {
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/devserver/PlaygroundSpanDecorator.java b/braintrust-sdk/src/main/java/dev/braintrust/devserver/PlaygroundSpanDecorator.java
new file mode 100644
index 00000000..5b30dd46
--- /dev/null
+++ b/braintrust-sdk/src/main/java/dev/braintrust/devserver/PlaygroundSpanDecorator.java
@@ -0,0 +1,181 @@
+package dev.braintrust.devserver;
+
+import static dev.braintrust.json.BraintrustJsonMapper.toJson;
+
+import dev.braintrust.eval.Classifier;
+import dev.braintrust.eval.DatasetCase;
+import dev.braintrust.eval.EvalListener;
+import dev.braintrust.eval.EvalRunInfo;
+import dev.braintrust.eval.Score;
+import dev.braintrust.eval.Scorer;
+import dev.braintrust.eval.TaskResult;
+import dev.braintrust.trace.BraintrustTracing;
+import io.opentelemetry.api.common.AttributeKey;
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.StatusCode;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import javax.annotation.Nullable;
+
+/**
+ * Playground variant of the span decorator. Mirrors {@link dev.braintrust.eval.EvalSpanDecorator}
+ * but emits the playground attribute shape: a {@code playground_id} parent, a {@code generation}
+ * woven into each {@code span_attributes}, a {@code name} on the eval/task span attributes, {@code
+ * braintrust.expected_json} (rather than {@code braintrust.expected}), and input/output on the task
+ * span.
+ *
+ * <p>Standalone (does not extend {@code EvalSpanDecorator}) so the two attribute shapes can evolve
+ * independently.
+ *
+ * <p>Single-entry input/output envelopes are built with {@link Collections#singletonMap}, not
+ * {@link Map#of}, so a {@code null} case input or task output is handed to {@code toJson} instead
+ * of throwing {@link NullPointerException}.
+ */
+final class PlaygroundSpanDecorator implements EvalListener {
+    private static final AttributeKey<String> PARENT =
+            AttributeKey.stringKey(BraintrustTracing.PARENT_KEY);
+
+    /** Creates a run listener that gives every dataset case its own {@link Decorator}. */
+    @Override
+    public RunListener createRunListener(EvalRunInfo info) {
+        return datasetCase -> new Decorator(info);
+    }
+
+    /**
+     * Wraps {@code value} under {@code key} in a single-entry map. Unlike {@link Map#of}, which
+     * throws {@link NullPointerException} for a {@code null} value, this accepts one.
+     */
+    private static Map<String, Object> envelope(String key, @Nullable Object value) {
+        return Collections.singletonMap(key, value);
+    }
+
+    /** Per-case listener that writes the playground attributes onto the spans it is handed. */
+    private static final class Decorator implements CaseListener {
+        private final EvalRunInfo info;
+
+        Decorator(EvalRunInfo info) {
+            this.info = info;
+        }
+
+        /** Returns the parent attribute value derived from the run's parent. */
+        private String parentValue() {
+            return info.parent().toParentValue();
+        }
+
+        /**
+         * Builds the {@code braintrust.span_attributes} payload: {@code type}, {@code name} and,
+         * when the run has one, {@code generation}. The map is mutable so callers can add keys.
+         */
+        private Map<String, Object> spanAttrs(String type, String name) {
+            var m = new LinkedHashMap<String, Object>();
+            m.put("type", type);
+            m.put("name", name);
+            if (info.generation() != null) {
+                m.put("generation", info.generation());
+            }
+            return m;
+        }
+
+        /**
+         * Stamps the root (eval) span with the parent, span attributes, input and expected value,
+         * plus origin, tags and metadata when the case carries them.
+         */
+        @Override
+        public void onRootSpan(Span rootSpan, DatasetCase<?, ?> datasetCase) {
+            rootSpan.setAttribute(PARENT, parentValue());
+            rootSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs("eval", "eval")));
+            rootSpan.setAttribute(
+                    "braintrust.input_json", toJson(envelope("input", datasetCase.input())));
+            rootSpan.setAttribute("braintrust.expected_json", toJson(datasetCase.expected()));
+            if (datasetCase.origin().isPresent()) {
+                rootSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get()));
+            }
+            if (!datasetCase.tags().isEmpty()) {
+                rootSpan.setAttribute(
+                        AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags());
+            }
+            if (!datasetCase.metadata().isEmpty()) {
+                rootSpan.setAttribute("braintrust.metadata", toJson(datasetCase.metadata()));
+            }
+        }
+
+        /** Stamps the task span with the parent, span attributes and the case input. */
+        @Override
+        public void onTaskSpan(Span taskSpan, DatasetCase<?, ?> datasetCase) {
+            taskSpan.setAttribute(PARENT, parentValue());
+            taskSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs("task", "task")));
+            taskSpan.setAttribute(
+                    "braintrust.input_json", toJson(envelope("input", datasetCase.input())));
+        }
+
+        /** Records the task output on both the task span and the root span. */
+        @Override
+        public void onTaskSuccess(Span rootSpan, Span taskSpan, TaskResult<?, ?> taskResult) {
+            var output = toJson(envelope("output", taskResult.result()));
+            taskSpan.setAttribute("braintrust.output_json", output);
+            rootSpan.setAttribute("braintrust.output_json", output);
+        }
+
+        /**
+         * Marks both spans as errored, records the exception on the task span, and gives the root
+         * span an output envelope whose value is {@code null}.
+         */
+        @Override
+        public void onTaskError(
+                Span rootSpan, Span taskSpan, DatasetCase<?, ?> datasetCase, Exception error) {
+            taskSpan.setStatus(StatusCode.ERROR, error.getMessage());
+            taskSpan.recordException(error);
+            rootSpan.setStatus(StatusCode.ERROR, error.getMessage());
+            rootSpan.setAttribute("braintrust.output_json", toJson(envelope("output", null)));
+        }
+
+        /** Sets the parent attribute on the score span. */
+        @Override
+        public void onScoreSpan(Span scoreSpan, Scorer<?, ?> scorer) {
+            scoreSpan.setAttribute(PARENT, parentValue());
+        }
+
+        /**
+         * Records a scorer failure (if any) on the score span and, when scores were produced,
+         * writes them as both {@code braintrust.output_json} and {@code braintrust.scores}. With a
+         * {@code null} or empty score list, {@code braintrust.span_attributes} is not written.
+         */
+        @Override
+        public void onScoreResult(
+                Span scoreSpan,
+                Span rootSpan,
+                Scorer<?, ?> scorer,
+                List<Score> scores,
+                @Nullable Exception scoreException) {
+            if (scoreException != null) {
+                scoreSpan.setStatus(StatusCode.ERROR, scoreException.getMessage());
+                scoreSpan.recordException(scoreException);
+            }
+            if (scores == null || scores.isEmpty()) {
+                return;
+            }
+            var scorerScores = new LinkedHashMap<String, Double>();
+            for (var score : scores) {
+                scorerScores.put(score.name(), score.value());
+            }
+            var attrs = spanAttrs("score", scorer.getName());
+            attrs.put("purpose", "scorer");
+            scoreSpan.setAttribute("braintrust.span_attributes", toJson(attrs));
+            var scoresJson = toJson(scorerScores);
+            scoreSpan.setAttribute("braintrust.output_json", scoresJson);
+            scoreSpan.setAttribute("braintrust.scores", scoresJson);
+        }
+
+        /** Stamps the classifier span with the parent and its span attributes. */
+        @Override
+        public void onClassifierSpan(
+                Span classifierSpan, Classifier<?, ?> classifier, String resolvedName) {
+            classifierSpan.setAttribute(PARENT, parentValue());
+            var attrs = spanAttrs("classifier", resolvedName);
+            attrs.put("purpose", "scorer");
+            classifierSpan.setAttribute("braintrust.span_attributes", toJson(attrs));
+        }
+    }
+}
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
index a20f7861..5d1f6e97 100644
--- a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
@@ -6,12 +6,15 @@
 import dev.braintrust.config.BraintrustConfig;
 import dev.braintrust.eval.EvalListener.CaseListener;
 import dev.braintrust.eval.EvalListener.RunListener;
-import dev.braintrust.openapi.api.ExperimentsApi;
-import dev.braintrust.openapi.model.CreateExperiment;
 import dev.braintrust.openapi.model.Project;
 import dev.braintrust.trace.BrainstoreTrace;
+import dev.braintrust.trace.BraintrustContext;
 import dev.braintrust.trace.BraintrustTracing;
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.SpanKind;
 import io.opentelemetry.api.trace.Tracer;
+import io.opentelemetry.context.Context;
+import io.opentelemetry.context.Scope;
 import java.util.*;
 import java.util.function.Function;
 import javax.annotation.Nonnull;
@@ -31,6 +34,7 @@ public final class Eval<INPUT, OUTPUT> {
     private final @Nonnull BraintrustOpenApiClient client;
     private final @Nonnull Project project;
     private final @Nonnull BraintrustOpenApiClient.OrgInfo orgInfo;
+    private final @Nonnull Tracer tracer;
     private final @Nonnull Dataset<INPUT, OUTPUT> dataset;
     private final @Nonnull Task<INPUT, OUTPUT> task;
     private final @Nonnull List<Scorer<INPUT, OUTPUT>> scorers;
@@ -38,19 +42,14 @@ public final class Eval<INPUT, OUTPUT> {
     private final @Nonnull List<String> tags;
     private final @Nonnull Map<String, Object> metadata;
     private final @Nonnull Parameters parameters;
+    private final @Nonnull EvalTargetProvider targetProvider;
 
     /**
-     * All listeners attached to this eval, including the built-in {@link OtelEvalListener} (always
-     * first) which manages the OTel spans.
+     * All listeners attached to this eval. {@link Eval} owns the spans; listeners decorate/observe
+     * them. By default this includes the built-in {@link EvalSpanDecorator}.
      */
     private final @Nonnull List<EvalListener> listeners;
 
-    /**
-     * Typed reference to the built-in OTel listener (also present in {@link #listeners}). Kept so
-     * we can pull span-derived info — e.g. the per-case {@link BrainstoreTrace} — back out of it.
-     */
-    private final @Nonnull OtelEvalListener otelListener;
-
     private Eval(Builder<INPUT, OUTPUT> builder) {
         this.experimentName = builder.experimentName;
         this.config = Objects.requireNonNull(builder.config);
@@ -59,6 +58,7 @@ private Eval(Builder<INPUT, OUTPUT> builder) {
                 client.fetchOrCreateProject(
                         builder.projectId, config.defaultProjectName().orElse(null));
         this.orgInfo = client.fetchOrgInfo(project.getOrgId().toString());
+        this.tracer = Objects.requireNonNull(builder.tracer);
         this.dataset = builder.dataset;
         this.task = Objects.requireNonNull(builder.task);
         this.scorers = List.copyOf(builder.scorers);
@@ -66,16 +66,13 @@ private Eval(Builder<INPUT, OUTPUT> builder) {
         this.tags = List.copyOf(builder.tags);
         this.metadata = Map.copyOf(builder.metadata);
         this.parameters = builder.buildParameters();
-        this.otelListener = new OtelEvalListener(Objects.requireNonNull(builder.tracer), client);
-        // built-in OTel listener runs first, then any user-supplied listeners
-        var allListeners = new ArrayList<EvalListener>();
-        allListeners.add(otelListener);
-        allListeners.addAll(builder.listeners);
-        this.listeners = List.copyOf(allListeners);
+        this.targetProvider = Objects.requireNonNull(builder.targetProvider);
+        this.listeners = List.copyOf(builder.listeners);
     }
 
     /** Runs the evaluation and returns results. */
     public EvalResult run() {
+        final EvalRunInfo runInfo;
         try (var cursor = dataset.openCursor()) {
             Optional<String> datasetVersion = Optional.empty();
             Optional<String> datasetId = Optional.empty();
@@ -84,123 +81,145 @@ public EvalResult run() {
                 datasetId = Optional.of(dataset.id());
             }
 
-            var createExperiment =
-                    new CreateExperiment().projectId(project.getId()).name(experimentName);
-
-            if (!tags.isEmpty()) {
-                createExperiment.tags(tags);
-            }
-            if (!metadata.isEmpty()) {
-                createExperiment.metadata(metadata);
-            }
-            datasetId.ifPresent(id -> createExperiment.datasetId(UUID.fromString(id)));
-            datasetVersion.ifPresent(createExperiment::datasetVersion);
+            runInfo =
+                    targetProvider.create(
+                            new EvalTargetProvider.Context(
+                                    config,
+                                    client,
+                                    project,
+                                    orgInfo,
+                                    experimentName,
+                                    tags,
+                                    metadata,
+                                    datasetId,
+                                    datasetVersion));
 
-            var experiment = new ExperimentsApi(client).postExperiment(createExperiment);
-            var experimentId = experiment.getId().toString();
-
-            // Create one RunListener per attached listener, tracking the built-in OTel run
-            // listener by identity so we can later pull the per-case BrainstoreTrace from it.
             var runListeners = new ArrayList<RunListener>(listeners.size());
-            OtelEvalListener.OtelRunListener otelRunListener = null;
             for (var listener : listeners) {
-                var runListener = listener.createRunListener(experimentId);
-                if (listener == otelListener) {
-                    otelRunListener = (OtelEvalListener.OtelRunListener) runListener;
-                }
-                runListeners.add(runListener);
+                runListeners.add(listener.createRunListener(runInfo));
             }
-            final var otelRun = otelRunListener;
-
-            runListeners.forEach(runListener -> runListener.onStart(experimentId));
-            cursor.forEach(
-                    datasetCase -> evalOne(experimentId, datasetCase, runListeners, otelRun));
-            runListeners.forEach(RunListener::onEnd);
-        }
-
-        var experimentUrl =
-                "%s/experiments/%s"
-                        .formatted(
-                                BraintrustUtils.createProjectURI(
-                                                config.appUrl(), orgInfo.name(), project.getName())
-                                        .toASCIIString(),
-                                experimentName);
-        return new EvalResult(experimentUrl);
+
+            runListeners.forEach(RunListener::onRunStart);
+            cursor.forEach(datasetCase -> evalOne(runInfo, datasetCase, runListeners));
+            runListeners.forEach(RunListener::onRunEnd);
+        }
+
+        return new EvalResult(runInfo.experimentUrl());
+    }
+
+    /** Activates {@code span} and records the braintrust parent in baggage for child spans. */
+    private Scope makeCurrent(Span span, BraintrustUtils.Parent parent) {
+        var spanContext = Context.current().with(span);
+        return BraintrustContext.setParentInBaggage(spanContext, parent.type(), parent.id())
+                .makeCurrent();
     }
 
     private void evalOne(
-            String experimentId,
+            EvalRunInfo runInfo,
             DatasetCase<INPUT, OUTPUT> datasetCase,
-            List<RunListener> runListeners,
-            @Nullable OtelEvalListener.OtelRunListener otelRunListener) {
-        // Create one CaseListener per RunListener, tracking the OTel one by identity so we can
-        // pull the BrainstoreTrace from it later.
+            List<RunListener> runListeners) {
         var caseListeners = new ArrayList<CaseListener>(runListeners.size());
-        OtelEvalListener.OtelCaseListener otelCase = null;
         for (var runListener : runListeners) {
-            var caseListener = runListener.createCaseListener(datasetCase);
-            if (runListener == otelRunListener) {
-                otelCase = (OtelEvalListener.OtelCaseListener) caseListener;
+            caseListeners.add(runListener.createCaseListener(datasetCase));
+        }
+        var parent = runInfo.parent();
+
+        // Eval owns the span structure: create the root span (name only), then let listeners
+        // decorate it before it is made current.
+        var rootSpan =
+                tracer.spanBuilder("eval") // TODO: allow names for eval cases
+                        .setNoParent() // each eval case is its own trace
+                        .setSpanKind(SpanKind.CLIENT)
+                        .startSpan();
+        for (var cl : caseListeners) {
+            cl.onRootSpan(rootSpan, datasetCase);
+        }
+        try (var rootScope = makeCurrent(rootSpan, parent)) {
+            TaskResult<INPUT, OUTPUT> taskResult = null;
+            Exception taskError = null;
+            var taskSpan = tracer.spanBuilder("task").startSpan();
+            final String taskSpanId = taskSpan.getSpanContext().getSpanId();
+            for (var cl : caseListeners) {
+                cl.onTaskSpan(taskSpan, datasetCase);
             }
-            caseListeners.add(caseListener);
-        }
-
-        caseListeners.forEach(CaseListener::onStart);
-        try {
-            // run task
-            caseListeners.forEach(cl -> cl.onTaskStart(experimentId, datasetCase));
-            final TaskResult<INPUT, OUTPUT> taskResult;
-            try {
+            try (var taskScope = makeCurrent(taskSpan, parent)) {
                 taskResult = task.apply(datasetCase, parameters);
+                for (var cl : caseListeners) {
+                    cl.onTaskSuccess(rootSpan, taskSpan, taskResult);
+                }
             } catch (Exception e) {
-                caseListeners.forEach(cl -> cl.onTaskError(experimentId, datasetCase, e));
-                log.debug("Task threw exception for input: " + datasetCase.input(), e);
-                // run scoreForTaskException on each scorer; classifiers are skipped
+                taskError = e;
+                for (var cl : caseListeners) {
+                    cl.onTaskError(rootSpan, taskSpan, datasetCase, e);
+                }
+            } finally {
+                taskSpan.end(); // end even when a listener throws, so the span is never leaked
+            }
+            if (taskError != null) {
+                log.debug("Task threw exception for input: {}", datasetCase.input(), taskError);
+                // run scoreForTaskException on each scorer (score spans nest under the root span,
+                // since the task scope is now closed); classifiers are skipped
                 for (var scorer : scorers) {
-                    runScoreForTaskException(caseListeners, scorer, e, datasetCase);
+                    runScoreForTaskException(
+                            caseListeners, rootSpan, parent, scorer, taskError, datasetCase);
                 }
                 return;
             }
-            caseListeners.forEach(cl -> cl.onTaskEnd(experimentId, taskResult));
 
             // A single BrainstoreTrace for this eval case, shared across all scorers/classifiers.
             // It fetches spans lazily on first access (only if a traced scorer/classifier calls
-            // it). Owned by the OTel listener since it is derived from span ids.
-            BrainstoreTrace trace = otelCase != null ? otelCase.brainstoreTrace() : null;
+            // it). Only available when targeting an experiment.
+            BrainstoreTrace trace =
+                    runInfo.tracingSupported()
+                            ? BrainstoreTrace.forExperiment(
+                                    client,
+                                    Objects.requireNonNull(runInfo.experimentId()),
+                                    rootSpan.getSpanContext().getTraceId(),
+                                    List.of(taskSpanId))
+                            : null;
 
             // run scorers
             for (var scorer : scorers) {
-                runScorer(caseListeners, scorer, taskResult, trace);
+                runScorer(caseListeners, rootSpan, parent, scorer, taskResult, trace);
             }
 
             // run classifiers. Classifier exceptions are non-fatal: they are recorded on the
             // classifier span and surfaced in the root span's metadata under `classifier_errors`,
             // but do not abort the eval or affect other classifiers/scorers. Classifiers only run
             // when the task succeeded (no scoreForTaskException analogue).
-            for (var classifier : classifiers) {
-                runClassifier(caseListeners, classifier, taskResult, trace);
+            for (int i = 0; i < classifiers.size(); i++) {
+                runClassifier(
+                        caseListeners, rootSpan, parent, classifiers.get(i), i, taskResult, trace);
             }
         } finally {
-            caseListeners.forEach(CaseListener::onEnd);
+            for (var cl : caseListeners) {
+                cl.onCaseEnd(rootSpan);
+            }
+            rootSpan.end();
         }
     }
 
     /**
      * Runs a scorer against a successful task result. If the scorer is a {@link TracedScorer}, it
      * receives the {@link BrainstoreTrace} for the eval case. If the scorer throws, falls back to
-     * {@link Scorer#scoreForScorerException}. The {@code onScoreEnd} event is always dispatched (so
-     * the OTel listener can end its span) even when score validation aborts the eval.
+     * {@link Scorer#scoreForScorerException}. {@code onScoreResult} is dispatched only when scores
+     * are valid; on validation/fallback failure the span is still ended and the eval aborts.
      */
     private void runScorer(
             List<CaseListener> caseListeners,
+            Span rootSpan,
+            BraintrustUtils.Parent parent,
             Scorer<INPUT, OUTPUT> scorer,
             TaskResult<INPUT, OUTPUT> taskResult,
             @Nullable BrainstoreTrace trace) {
-        caseListeners.forEach(cl -> cl.onScoreStart(scorer));
-        List<Score> scores = List.of();
-        Exception scoreException = null;
+        var scoreSpan = tracer.spanBuilder("score").startSpan();
+        for (var cl : caseListeners) {
+            cl.onScoreSpan(scoreSpan, scorer);
+        }
         RuntimeException pending = null;
-        try {
+        try (var unused = makeCurrent(scoreSpan, parent)) {
+            List<Score> scores;
+            Exception scoreException = null;
             try {
                 if (scorer instanceof TracedScorer<INPUT, OUTPUT> tracedScorer) {
                     scores = tracedScorer.score(taskResult, trace);
@@ -214,14 +233,16 @@ private void runScorer(
                 scores = scorer.scoreForScorerException(e, taskResult);
             }
             validateScores(scorer, scores);
+            final var finalScores = scores;
+            final var finalException = scoreException;
+            for (var cl : caseListeners) {
+                cl.onScoreResult(scoreSpan, rootSpan, scorer, finalScores, finalException);
+            }
         } catch (RuntimeException re) {
             // validation (or a throwing fallback) aborts the eval; record nothing for this score
             pending = re;
-            scores = List.of();
         } finally {
-            final var finalScores = scores;
-            final var finalException = scoreException;
-            caseListeners.forEach(cl -> cl.onScoreEnd(scorer, finalScores, finalException));
+            scoreSpan.end();
         }
         if (pending != null) {
             throw pending;
@@ -230,25 +251,30 @@ private void runScorer(
 
     /**
      * Runs {@link Scorer#scoreForTaskException} when the task threw. If the fallback (or score
-     * validation) throws, the eval aborts — but the {@code onScoreEnd} event is still dispatched.
+     * validation) throws, the eval aborts — but the score span is still ended.
      */
     private void runScoreForTaskException(
             List<CaseListener> caseListeners,
+            Span rootSpan,
+            BraintrustUtils.Parent parent,
             Scorer<INPUT, OUTPUT> scorer,
             Exception taskException,
             DatasetCase<INPUT, OUTPUT> datasetCase) {
-        caseListeners.forEach(cl -> cl.onScoreStart(scorer));
-        List<Score> scores = List.of();
+        var scoreSpan = tracer.spanBuilder("score").startSpan();
+        for (var cl : caseListeners) {
+            cl.onScoreSpan(scoreSpan, scorer);
+        }
         RuntimeException pending = null;
-        try {
-            scores = scorer.scoreForTaskException(taskException, datasetCase);
+        try (var unused = makeCurrent(scoreSpan, parent)) {
+            var scores = scorer.scoreForTaskException(taskException, datasetCase);
             validateScores(scorer, scores);
+            for (var cl : caseListeners) {
+                cl.onScoreResult(scoreSpan, rootSpan, scorer, scores, null);
+            }
         } catch (RuntimeException re) {
             pending = re;
-            scores = List.of();
         } finally {
-            final var finalScores = scores;
-            caseListeners.forEach(cl -> cl.onScoreEnd(scorer, finalScores, null));
+            scoreSpan.end();
         }
         if (pending != null) {
             throw pending;
@@ -256,18 +282,29 @@ private void runScoreForTaskException(
     }
 
     /**
-     * Runs a classifier. Exceptions are non-fatal: they are surfaced to listeners via the {@code
-     * classifierException} argument of {@code onClassifierEnd} and do not propagate.
+     * Runs a classifier inside its own span. Exceptions are non-fatal: they are surfaced to
+     * listeners via the {@code classifierException} argument of {@code onClassifierResult} and do
+     * not propagate.
      */
     private void runClassifier(
             List<CaseListener> caseListeners,
+            Span rootSpan,
+            BraintrustUtils.Parent parent,
             Classifier<INPUT, OUTPUT> classifier,
+            int index,
             TaskResult<INPUT, OUTPUT> taskResult,
             @Nullable BrainstoreTrace trace) {
-        caseListeners.forEach(cl -> cl.onClassifierStart(classifier));
+        var resolvedName = classifier.getName();
+        if (resolvedName == null || resolvedName.isBlank()) {
+            resolvedName = "classifier_" + index;
+        }
+        var classifierSpan = tracer.spanBuilder(resolvedName).startSpan();
+        for (var cl : caseListeners) {
+            cl.onClassifierSpan(classifierSpan, classifier, resolvedName);
+        }
         List<Classification> classifications = List.of();
         Exception classifierException = null;
-        try {
+        try (var unused = makeCurrent(classifierSpan, parent)) {
             if (classifier instanceof TracedClassifier<INPUT, OUTPUT> tracedClassifier) {
                 classifications = tracedClassifier.classify(taskResult, trace);
             } else {
@@ -279,12 +316,22 @@ private void runClassifier(
         } catch (Exception e) {
             classifierException = e;
             classifications = List.of();
-            log.debug("Classifier '{}' threw exception", classifier.getName(), e);
+            log.debug("Classifier '{}' threw exception", resolvedName, e);
+        } finally {
+            final var finalClassifications = classifications;
+            final var finalException = classifierException;
+            final var finalResolvedName = resolvedName;
+            for (var cl : caseListeners) {
+                cl.onClassifierResult(
+                        classifierSpan,
+                        rootSpan,
+                        classifier,
+                        finalResolvedName,
+                        finalClassifications,
+                        finalException);
+            }
+            classifierSpan.end();
         }
-        final var finalClassifications = classifications;
-        final var finalException = classifierException;
-        caseListeners.forEach(
-                cl -> cl.onClassifierEnd(classifier, finalClassifications, finalException));
     }
 
     /** Validates that every score value is between 0 and 1 inclusive. Throws (aborting) if not. */
@@ -321,7 +368,10 @@ public static final class Builder<INPUT, OUTPUT> {
         private @Nonnull Map<String, Object> parameterValues = Map.of();
         private @Nonnull List<String> tags = List.of();
         private @Nonnull Map<String, Object> metadata = Map.of();
-        private @Nonnull List<EvalListener> listeners = new ArrayList<>();
+        private @Nonnull EvalTargetProvider targetProvider = new ExperimentTargetProvider();
+        // Seeded with the standard span decorator; removable via clearListeners().
+        private @Nonnull List<EvalListener> listeners =
+                new ArrayList<>(List.of(new EvalSpanDecorator()));
 
         public Eval<INPUT, OUTPUT> build() {
             if (config == null) {
@@ -446,6 +496,24 @@ public Builder<INPUT, OUTPUT> addListener(@Nonnull EvalListener listener) {
             return this;
         }
 
+        /**
+         * Removes every listener attached so far, including the seeded {@link EvalSpanDecorator},
+         * so callers fully control span decoration (e.g. the playground attaches its own).
+         */
+        public Builder<INPUT, OUTPUT> clearListeners() {
+            this.listeners.clear();
+            return this;
+        }
+
+        /**
+         * Overrides how the eval target (parent / experiment) is resolved; by default an {@link
+         * ExperimentTargetProvider} creates a Braintrust experiment. Throws NPE on {@code null}.
+         */
+        public Builder<INPUT, OUTPUT> evalTargetProvider(@Nonnull EvalTargetProvider provider) {
+            this.targetProvider = Objects.requireNonNull(provider);
+            return this;
+        }
+
         /** Sets metadata for the experiment. */
         public Builder<INPUT, OUTPUT> metadata(Map<String, Object> metadata) {
             this.metadata = Map.copyOf(metadata);
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java
index 401f3167..24625db6 100644
--- a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalListener.java
@@ -1,43 +1,81 @@
 package dev.braintrust.eval;
 
+import io.opentelemetry.api.trace.Span;
 import java.util.List;
 import javax.annotation.Nullable;
 
-/** a listener which can be attached to an eval and hook specific events */
+/**
+ * A listener which can be attached to an eval to observe and/or decorate its lifecycle.
+ *
+ * <p>{@link Eval} owns the OpenTelemetry span <em>structure</em> — it creates the root ({@code
+ * eval}), {@code task}, {@code score}, and classifier spans (names only), manages the current
+ * context (so user/LLM child spans nest correctly), and ends the spans. Listeners receive the live
+ * {@link Span}s at each lifecycle point and may decorate them with attributes (e.g. the built-in
+ * {@link EvalSpanDecorator}) or simply observe them (e.g. read span ids for streaming progress).
+ *
+ * <p>All {@code on*} callbacks are no-ops by default; only the two factory methods are abstract.
+ */
 public interface EvalListener {
-    RunListener createRunListener(String experimentId);
+    /** Creates a run-scoped listener. Called once per {@link Eval#run()}. */
+    RunListener createRunListener(EvalRunInfo info);
 
-    /** a listener which receives events over the lifecycle of a single eval run */
-    public interface RunListener {
-        void onStart(String experimentId);
+    /** Run-scoped listener; spawns a {@link CaseListener} per eval case. */
+    interface RunListener {
+        default void onRunStart() {} // fired once, before the first case is evaluated
 
         CaseListener createCaseListener(DatasetCase<?, ?> datasetCase);
 
-        void onEnd();
+        default void onRunEnd() {} // fired after all cases ran; skipped if the run aborts
     }
 
-    /** a listener which receives events over the lifecycle of a single case of an eval run */
-    public interface CaseListener {
-        void onStart();
+    /** Case-scoped listener receiving the live spans for a single eval case. */
+    interface CaseListener {
+        /** The root {@code eval} span has been started but is not yet the current span. */
+        default void onRootSpan(Span rootSpan, DatasetCase<?, ?> datasetCase) {}
 
-        void onTaskStart(String experimentId, DatasetCase<?, ?> datasetCase);
+        /** The {@code task} span has been started; the task itself has not run yet. */
+        default void onTaskSpan(Span taskSpan, DatasetCase<?, ?> datasetCase) {}
 
-        void onTaskEnd(String experimentId, TaskResult<?, ?> taskResult);
+        /** The task returned normally; invoked while the task span is still current. */
+        default void onTaskSuccess(Span rootSpan, Span taskSpan, TaskResult<?, ?> taskResult) {}
 
-        void onTaskError(String experimentId, DatasetCase<?, ?> datasetCase, Exception error);
+        /**
+         * The task threw. Scorers still run via {@code scoreForTaskException}; classifiers do not.
+         */
+        default void onTaskError(
+                Span rootSpan, Span taskSpan, DatasetCase<?, ?> datasetCase, Exception error) {}
 
-        void onScoreStart(Scorer<?, ?> scorer);
+        /** A {@code score} span has been started; the scorer has not run yet. */
+        default void onScoreSpan(Span scoreSpan, Scorer<?, ?> scorer) {}
 
-        void onScoreEnd(
-                Scorer<?, ?> scorer, List<Score> scores, @Nullable Exception scoreException);
+        /**
+         * A scorer produced scores. Not called when score validation aborts the eval. {@code
+         * scoreException} is non-null when the scorer threw and the fallback was used.
+         */
+        default void onScoreResult(
+                Span scoreSpan,
+                Span rootSpan,
+                Scorer<?, ?> scorer,
+                List<Score> scores,
+                @Nullable Exception scoreException) {}
 
-        void onClassifierStart(Classifier<?, ?> classifier);
+        /** A classifier span has been started; the classifier has not run yet. */
+        default void onClassifierSpan(
+                Span classifierSpan, Classifier<?, ?> classifier, String resolvedName) {}
 
-        void onClassifierEnd(
+        /**
+         * A classifier finished. {@code classifierException} is non-null when the classifier threw
+         * (non-fatal).
+         */
+        default void onClassifierResult(
+                Span classifierSpan,
+                Span rootSpan,
                 Classifier<?, ?> classifier,
+                String resolvedName,
                 List<Classification> classifications,
-                @Nullable Exception classifierException);
+                @Nullable Exception classifierException) {}
 
-        void onEnd();
+        /** The case is finishing (also when the eval aborts); the root span is ended next. */
+        default void onCaseEnd(Span rootSpan) {}
     }
 }
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalRunInfo.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalRunInfo.java
new file mode 100644
index 00000000..907f9ec8
--- /dev/null
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalRunInfo.java
@@ -0,0 +1,24 @@
+package dev.braintrust.eval;
+
+import dev.braintrust.BraintrustUtils;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+/**
+ * Resolved target for an eval run, produced by an {@link EvalTargetProvider} and handed to every
+ * {@link EvalListener} via {@link EvalListener#createRunListener(EvalRunInfo)}.
+ *
+ * @param parent the braintrust parent for all spans (e.g. {@code experiment_id:…} or {@code
+ *     playground_id:…})
+ * @param generation optional generation identifier woven into span attributes (playground)
+ * @param experimentId experiment id, or null when not targeting one; required if tracingSupported
+ * @param experimentUrl URL surfaced via {@link EvalResult}, when applicable; otherwise null
+ * @param tracingSupported whether a {@link dev.braintrust.trace.BrainstoreTrace} can be built for
+ *     traced scorers/classifiers (true only in experiment mode)
+ */
+public record EvalRunInfo(
+        @Nonnull BraintrustUtils.Parent parent,
+        @Nullable String generation,
+        @Nullable String experimentId,
+        @Nullable String experimentUrl,
+        boolean tracingSupported) {}
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalSpanDecorator.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalSpanDecorator.java
new file mode 100644
index 00000000..e7f1f0a7
--- /dev/null
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalSpanDecorator.java
@@ -0,0 +1,211 @@
+package dev.braintrust.eval;
+
+import static dev.braintrust.json.BraintrustJsonMapper.toJson;
+
+import dev.braintrust.trace.BraintrustTracing;
+import io.opentelemetry.api.common.AttributeKey;
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.StatusCode;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import javax.annotation.Nullable;
+
+/**
+ * The standard {@link EvalListener} that decorates the spans created by {@link Eval} with the
+ * canonical Braintrust attributes (root {@code eval}, {@code task}, {@code score}, and classifier
+ * spans). Attached automatically by {@link Eval.Builder}; can be removed via {@link
+ * Eval.Builder#clearListeners()}.
+ */
+public final class EvalSpanDecorator implements EvalListener {
+    private static final AttributeKey<String> PARENT =
+            AttributeKey.stringKey(BraintrustTracing.PARENT_KEY);
+
+    @Override
+    public RunListener createRunListener(EvalRunInfo info) {
+        return new RunListener() {
+            @Override
+            public CaseListener createCaseListener(DatasetCase<?, ?> datasetCase) {
+                return new Decorator(info); // fresh per-case state for every eval case
+            }
+        };
+    }
+
+    private static final class Decorator implements CaseListener {
+        private final EvalRunInfo info;
+        private final Map<String, List<Map<String, Object>>> caseClassifications =
+                new LinkedHashMap<>();
+        private final Map<String, String> classifierErrors = new LinkedHashMap<>();
+        private @Nullable DatasetCase<?, ?> datasetCase; // set in onRootSpan, read in onCaseEnd
+
+        Decorator(EvalRunInfo info) {
+            this.info = info;
+        }
+
+        private String parentValue() {
+            return info.parent().toParentValue();
+        }
+
+        private Map<String, Object> spanAttrs(String type) {
+            var m = new LinkedHashMap<String, Object>();
+            m.put("type", type);
+            if (info.generation() != null) {
+                m.put("generation", info.generation());
+            }
+            return m;
+        }
+
+        @Override
+        public void onRootSpan(Span rootSpan, DatasetCase<?, ?> datasetCase) {
+            this.datasetCase = datasetCase;
+            rootSpan.setAttribute(PARENT, parentValue());
+            rootSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs("eval")));
+            var input = Collections.singletonMap("input", datasetCase.input()); // null-safe
+            rootSpan.setAttribute("braintrust.input_json", toJson(input));
+            rootSpan.setAttribute("braintrust.expected", toJson(datasetCase.expected()));
+            if (datasetCase.origin().isPresent()) {
+                rootSpan.setAttribute("braintrust.origin", toJson(datasetCase.origin().get()));
+            }
+            if (!datasetCase.tags().isEmpty()) {
+                rootSpan.setAttribute(
+                        AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags());
+            }
+            if (!datasetCase.metadata().isEmpty()) {
+                rootSpan.setAttribute(
+                        AttributeKey.stringKey("braintrust.metadata"),
+                        toJson(datasetCase.metadata()));
+            }
+        }
+
+        @Override
+        public void onTaskSpan(Span taskSpan, DatasetCase<?, ?> datasetCase) {
+            taskSpan.setAttribute(PARENT, parentValue());
+            taskSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs("task")));
+        }
+
+        @Override
+        public void onTaskSuccess(Span rootSpan, Span taskSpan, TaskResult<?, ?> taskResult) {
+            var output = Collections.singletonMap("output", taskResult.result()); // null-safe
+            rootSpan.setAttribute("braintrust.output_json", toJson(output));
+        }
+
+        @Override
+        public void onTaskError(
+                Span rootSpan, Span taskSpan, DatasetCase<?, ?> datasetCase, Exception error) {
+            taskSpan.setStatus(StatusCode.ERROR, error.getMessage());
+            taskSpan.recordException(error);
+            rootSpan.setStatus(StatusCode.ERROR, error.getMessage());
+            rootSpan.setAttribute(
+                    "braintrust.output_json", toJson(Collections.singletonMap("output", null)));
+        }
+
+        @Override
+        public void onScoreSpan(Span scoreSpan, Scorer<?, ?> scorer) {
+            scoreSpan.setAttribute(PARENT, parentValue());
+        }
+
+        @Override
+        public void onScoreResult(
+                Span scoreSpan,
+                Span rootSpan,
+                Scorer<?, ?> scorer,
+                List<Score> scores,
+                @Nullable Exception scoreException) {
+            if (scoreException != null) {
+                scoreSpan.setStatus(StatusCode.ERROR, scoreException.getMessage());
+                scoreSpan.recordException(scoreException);
+            }
+            if (scores == null || scores.isEmpty()) {
+                return;
+            }
+            var scorerScores = new LinkedHashMap<String, Double>();
+            for (var score : scores) {
+                scorerScores.put(score.name(), score.value());
+            }
+            var attrs = spanAttrs("score");
+            attrs.put("name", scorer.getName());
+            attrs.put("purpose", "scorer");
+            scoreSpan.setAttribute("braintrust.span_attributes", toJson(attrs));
+            var scoresJson = toJson(scorerScores);
+            scoreSpan.setAttribute("braintrust.output_json", scoresJson);
+            scoreSpan.setAttribute("braintrust.scores", scoresJson);
+        }
+
+        @Override
+        public void onClassifierSpan(
+                Span classifierSpan, Classifier<?, ?> classifier, String resolvedName) {
+            classifierSpan.setAttribute(PARENT, parentValue());
+            var attrs = spanAttrs("classifier");
+            attrs.put("name", resolvedName);
+            attrs.put("purpose", "scorer");
+            classifierSpan.setAttribute("braintrust.span_attributes", toJson(attrs));
+        }
+
+        @Override
+        public void onClassifierResult(
+                Span classifierSpan,
+                Span rootSpan,
+                Classifier<?, ?> classifier,
+                String resolvedName,
+                List<Classification> classifications,
+                @Nullable Exception classifierException) {
+            if (classifierException != null) {
+                classifierSpan.setStatus(StatusCode.ERROR, classifierException.getMessage());
+                classifierSpan.recordException(classifierException);
+                classifierErrors.put(
+                        resolvedName,
+                        classifierException.getMessage() == null
+                                ? classifierException.toString()
+                                : classifierException.getMessage());
+                return;
+            }
+            // Group results by resolved item name (item.name, falling back to the classifier name
+            // when blank). Same map is logged to the classifier span and merged into the per-case
+            // aggregate logged on the root span.
+            Map<String, List<Map<String, Object>>> outputByName = new LinkedHashMap<>();
+            for (var item : classifications) {
+                var itemName = item.name();
+                if (itemName == null || itemName.isBlank()) {
+                    itemName = resolvedName;
+                }
+                var itemMap = toClassificationItem(item);
+                outputByName.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap);
+                caseClassifications.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap);
+            }
+            classifierSpan.setAttribute("braintrust.output_json", toJson(outputByName));
+        }
+
+        @Override
+        public void onCaseEnd(Span rootSpan) {
+            if (!caseClassifications.isEmpty()) {
+                rootSpan.setAttribute("braintrust.classifications", toJson(caseClassifications));
+            }
+            if (!classifierErrors.isEmpty()) {
+                Map<String, Object> mergedMetadata =
+                        new LinkedHashMap<>(
+                                datasetCase == null ? Map.of() : datasetCase.metadata());
+                mergedMetadata.put("classifier_errors", classifierErrors);
+                rootSpan.setAttribute(
+                        AttributeKey.stringKey("braintrust.metadata"), toJson(mergedMetadata));
+            }
+        }
+    }
+
+    /**
+     * Converts a {@link Classification} to the wire-format {@code ClassificationItem}: drops {@code
+     * name}, includes {@code label} and {@code metadata} only when present.
+     */
+    private static Map<String, Object> toClassificationItem(Classification c) {
+        Map<String, Object> m = new LinkedHashMap<>();
+        m.put("id", c.id());
+        if (c.label() != null) {
+            m.put("label", c.label());
+        }
+        if (c.metadata() != null) {
+            m.put("metadata", c.metadata());
+        }
+        return m;
+    }
+}
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalTargetProvider.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalTargetProvider.java
new file mode 100644
index 00000000..2aa0a640
--- /dev/null
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/EvalTargetProvider.java
@@ -0,0 +1,33 @@
+package dev.braintrust.eval;
+
+import dev.braintrust.api.BraintrustOpenApiClient;
+import dev.braintrust.config.BraintrustConfig;
+import dev.braintrust.openapi.model.Project;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import javax.annotation.Nonnull;
+
+/**
+ * Resolves the {@link EvalRunInfo target} for an eval run. The default implementation creates a
+ * Braintrust experiment (see {@link ExperimentTargetProvider}); alternative implementations (e.g.
+ * the devserver/playground) can supply a different parent and skip experiment creation.
+ * All inputs an implementation receives are carried by the {@link Context} record.
+ */
+public interface EvalTargetProvider {
+    /** Resolves the target for one eval run from {@code ctx}; must not return {@code null}. */
+    @Nonnull
+    EvalRunInfo create(@Nonnull Context ctx);
+
+    /** Inputs for resolving the eval target, gathered at the start of {@link Eval#run()}. */
+    record Context(
+            @Nonnull BraintrustConfig config,
+            @Nonnull BraintrustOpenApiClient client,
+            @Nonnull Project project,
+            @Nonnull BraintrustOpenApiClient.OrgInfo orgInfo,
+            @Nonnull String experimentName,
+            @Nonnull List<String> tags,
+            @Nonnull Map<String, Object> metadata,
+            @Nonnull Optional<String> datasetId,
+            @Nonnull Optional<String> datasetVersion) {}
+}
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/ExperimentTargetProvider.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/ExperimentTargetProvider.java
new file mode 100644
index 00000000..825f0ea3
--- /dev/null
+++ b/braintrust-sdk/src/main/java/dev/braintrust/eval/ExperimentTargetProvider.java
@@ -0,0 +1,46 @@
+package dev.braintrust.eval;
+
+import dev.braintrust.BraintrustUtils;
+import dev.braintrust.openapi.api.ExperimentsApi;
+import dev.braintrust.openapi.model.CreateExperiment;
+import java.util.UUID;
+import javax.annotation.Nonnull;
+
+/**
+ * The {@link EvalTargetProvider} used by default. It creates a new Braintrust experiment for the
+ * run and returns an {@link EvalRunInfo} whose parent is {@code experiment_id:<id>} of that
+ * experiment, together with the experiment id and a link to the experiment in the app.
+ */
+final class ExperimentTargetProvider implements EvalTargetProvider {
+    @Override
+    @Nonnull
+    public EvalRunInfo create(@Nonnull Context ctx) {
+        var request = new CreateExperiment();
+        request.projectId(ctx.project().getId());
+        request.name(ctx.experimentName());
+        // Tags and metadata are only put on the request when there is something to send.
+        if (!ctx.tags().isEmpty()) {
+            request.tags(ctx.tags());
+        }
+        if (!ctx.metadata().isEmpty()) {
+            request.metadata(ctx.metadata());
+        }
+        // Dataset linkage is optional; the dataset id has to parse as a UUID.
+        ctx.datasetId().map(UUID::fromString).ifPresent(uuid -> request.datasetId(uuid));
+        ctx.datasetVersion().ifPresent(request::datasetVersion);
+
+        // The id of the created experiment is used for both the span parent and the run info.
+        var created = new ExperimentsApi(ctx.client()).postExperiment(request);
+        var experimentId = created.getId().toString();
+
+        // The experiment link is the project URL followed by /experiments/<requested name>.
+        // NOTE(review): the name is appended as-is, without URL encoding; confirm that names
+        // with spaces or reserved characters still give a usable link.
+        var projectUri =
+                BraintrustUtils.createProjectURI(
+                        ctx.config().appUrl(), ctx.orgInfo().name(), ctx.project().getName());
+        var experimentUrl = projectUri.toASCIIString() + "/experiments/" + ctx.experimentName();
+        var parent = new BraintrustUtils.Parent("experiment_id", experimentId);
+        return new EvalRunInfo(parent, null, experimentId, experimentUrl, true);
+    }
+}
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java
deleted file mode 100644
index e1e9083e..00000000
--- a/braintrust-sdk/src/main/java/dev/braintrust/eval/OtelEvalListener.java
+++ /dev/null
@@ -1,381 +0,0 @@
-package dev.braintrust.eval;
-
-import static dev.braintrust.json.BraintrustJsonMapper.toJson;
-
-import dev.braintrust.api.BraintrustOpenApiClient;
-import dev.braintrust.trace.BrainstoreTrace;
-import dev.braintrust.trace.BraintrustContext;
-import dev.braintrust.trace.BraintrustTracing;
-import io.opentelemetry.api.common.AttributeKey;
-import io.opentelemetry.api.trace.Span;
-import io.opentelemetry.api.trace.SpanKind;
-import io.opentelemetry.api.trace.StatusCode;
-import io.opentelemetry.api.trace.Tracer;
-import io.opentelemetry.context.Scope;
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import javax.annotation.Nonnull;
-import javax.annotation.Nullable;
-import lombok.extern.slf4j.Slf4j;
-
-/**
- * Built-in {@link EvalListener} that manages all OpenTelemetry spans for an eval (root {@code eval}
- * span, {@code task} span, {@code score} spans, and {@code classifier} spans), pushing/popping the
- * braintrust OTel context across the start/end events so user code nests correctly.
- *
- * <p>This listener pushes spans onto the current OTel context on start events and pops them on the
- * matching end events. That makes it inherently thread-affine and dependent on strictly nested
- * (LIFO) start/end ordering — which holds because evals run sequentially per case on one thread.
- */
-@Slf4j
-final class OtelEvalListener implements EvalListener {
-    private static final AttributeKey<String> PARENT =
-            AttributeKey.stringKey(BraintrustTracing.PARENT_KEY);
-
-    private final @Nonnull Tracer tracer;
-    private final @Nonnull BraintrustOpenApiClient client;
-
-    OtelEvalListener(@Nonnull Tracer tracer, @Nonnull BraintrustOpenApiClient client) {
-        this.tracer = tracer;
-        this.client = client;
-    }
-
-    @Override
-    public OtelRunListener createRunListener(String experimentId) {
-        return new OtelRunListener(experimentId);
-    }
-
-    /** Run-scoped listener. There is currently no run-level span; it only spawns case listeners. */
-    final class OtelRunListener implements RunListener {
-        private final @Nonnull String experimentId;
-
-        private OtelRunListener(@Nonnull String experimentId) {
-            this.experimentId = experimentId;
-        }
-
-        @Override
-        public void onStart(String experimentId) {}
-
-        @Override
-        public OtelCaseListener createCaseListener(DatasetCase<?, ?> datasetCase) {
-            return new OtelCaseListener(experimentId, datasetCase);
-        }
-
-        @Override
-        public void onEnd() {}
-    }
-
-    /** Case-scoped listener owning the root/task/score/classifier spans and their scopes. */
-    final class OtelCaseListener implements CaseListener {
-        private final @Nonnull String experimentId;
-        private final @Nonnull DatasetCase<?, ?> datasetCase;
-
-        private @Nullable Span rootSpan;
-        private @Nullable Scope rootScope;
-        private @Nullable String rootTraceId;
-        private @Nullable String taskSpanId;
-
-        private @Nullable Span taskSpan;
-        private @Nullable Scope taskScope;
-
-        private @Nullable Span scoreSpan;
-        private @Nullable Scope scoreScope;
-
-        private int classifierIndex = 0;
-        private @Nullable Span classifierSpan;
-        private @Nullable Scope classifierScope;
-        private @Nullable String classifierName;
-
-        // Accumulated classifier results, written onto the root span at case end.
-        private final Map<String, List<Map<String, Object>>> caseClassifications =
-                new LinkedHashMap<>();
-        private final Map<String, String> classifierErrors = new LinkedHashMap<>();
-
-        private OtelCaseListener(
-                @Nonnull String experimentId, @Nonnull DatasetCase<?, ?> datasetCase) {
-            this.experimentId = experimentId;
-            this.datasetCase = datasetCase;
-        }
-
-        @Override
-        public void onStart() {
-            var span =
-                    tracer.spanBuilder("eval") // TODO: allow names for eval cases
-                            .setNoParent() // each eval case is its own trace
-                            .setSpanKind(SpanKind.CLIENT)
-                            .setAttribute(PARENT, "experiment_id:" + experimentId)
-                            .setAttribute(
-                                    "braintrust.span_attributes", toJson(Map.of("type", "eval")))
-                            .setAttribute(
-                                    "braintrust.input_json",
-                                    toJson(Map.of("input", datasetCase.input())))
-                            .setAttribute("braintrust.expected", toJson(datasetCase.expected()))
-                            .startSpan();
-            if (datasetCase.origin().isPresent()) {
-                span.setAttribute("braintrust.origin", toJson(datasetCase.origin().get()));
-            }
-            if (!datasetCase.tags().isEmpty()) {
-                span.setAttribute(
-                        AttributeKey.stringArrayKey("braintrust.tags"), datasetCase.tags());
-            }
-            if (!datasetCase.metadata().isEmpty()) {
-                span.setAttribute(
-                        AttributeKey.stringKey("braintrust.metadata"),
-                        toJson(datasetCase.metadata()));
-            }
-            this.rootSpan = span;
-            this.rootTraceId = span.getSpanContext().getTraceId();
-            this.rootScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent();
-        }
-
-        @Override
-        public void onTaskStart(String experimentId, DatasetCase<?, ?> datasetCase) {
-            var span =
-                    tracer.spanBuilder("task")
-                            .setAttribute(PARENT, "experiment_id:" + this.experimentId)
-                            .setAttribute(
-                                    "braintrust.span_attributes", toJson(Map.of("type", "task")))
-                            .startSpan();
-            this.taskSpan = span;
-            this.taskSpanId = span.getSpanContext().getSpanId();
-            this.taskScope = BraintrustContext.ofExperiment(this.experimentId, span).makeCurrent();
-        }
-
-        @Override
-        public void onTaskEnd(String experimentId, TaskResult<?, ?> taskResult) {
-            requireRoot()
-                    .setAttribute(
-                            "braintrust.output_json",
-                            toJson(Map.of("output", taskResult.result())));
-            closeTaskScope();
-            requireTask().end();
-        }
-
-        @Override
-        public void onTaskError(
-                String experimentId, DatasetCase<?, ?> datasetCase, Exception error) {
-            var task = requireTask();
-            task.setStatus(StatusCode.ERROR, error.getMessage());
-            task.recordException(error);
-            closeTaskScope();
-            task.end();
-
-            var root = requireRoot();
-            root.setStatus(StatusCode.ERROR, error.getMessage());
-            var nullOutput = new LinkedHashMap<String, Object>();
-            nullOutput.put("output", null);
-            root.setAttribute("braintrust.output_json", toJson(nullOutput));
-        }
-
-        @Override
-        public void onScoreStart(Scorer<?, ?> scorer) {
-            var span =
-                    tracer.spanBuilder("score")
-                            .setAttribute(PARENT, "experiment_id:" + experimentId)
-                            .startSpan();
-            this.scoreSpan = span;
-            this.scoreScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent();
-        }
-
-        @Override
-        public void onScoreEnd(
-                Scorer<?, ?> scorer, List<Score> scores, @Nullable Exception scoreException) {
-            var span = requireScore();
-            try {
-                if (scoreException != null) {
-                    span.setStatus(StatusCode.ERROR, scoreException.getMessage());
-                    span.recordException(scoreException);
-                }
-                recordScores(span, requireRoot(), scorer, scores);
-            } finally {
-                closeScoreScope();
-                span.end();
-            }
-        }
-
-        @Override
-        public void onClassifierStart(Classifier<?, ?> classifier) {
-            var resolvedName = classifier.getName();
-            if (resolvedName == null || resolvedName.isBlank()) {
-                resolvedName = "classifier_" + classifierIndex;
-            }
-            classifierIndex++;
-            this.classifierName = resolvedName;
-
-            var span =
-                    tracer.spanBuilder(resolvedName)
-                            .setAttribute(PARENT, "experiment_id:" + experimentId)
-                            .startSpan();
-            Map<String, Object> spanAttrs = new LinkedHashMap<>();
-            spanAttrs.put("type", "classifier");
-            spanAttrs.put("name", resolvedName);
-            spanAttrs.put("purpose", "scorer");
-            span.setAttribute("braintrust.span_attributes", toJson(spanAttrs));
-
-            this.classifierSpan = span;
-            this.classifierScope = BraintrustContext.ofExperiment(experimentId, span).makeCurrent();
-        }
-
-        @Override
-        public void onClassifierEnd(
-                Classifier<?, ?> classifier,
-                List<Classification> classifications,
-                @Nullable Exception classifierException) {
-            var span = requireClassifier();
-            var resolvedName = classifierName;
-            try {
-                if (classifierException != null) {
-                    span.setStatus(StatusCode.ERROR, classifierException.getMessage());
-                    span.recordException(classifierException);
-                    classifierErrors.put(
-                            resolvedName,
-                            classifierException.getMessage() == null
-                                    ? classifierException.toString()
-                                    : classifierException.getMessage());
-                    return;
-                }
-
-                // Group results by resolved item name (item.name, falling back to the classifier
-                // name when blank). Same map is logged to the classifier span and merged into the
-                // per-case aggregate logged on the root span.
-                Map<String, List<Map<String, Object>>> outputByName = new LinkedHashMap<>();
-                for (var item : classifications) {
-                    var itemName = item.name();
-                    if (itemName == null || itemName.isBlank()) {
-                        itemName = resolvedName;
-                    }
-                    var itemMap = toClassificationItem(item);
-                    outputByName.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap);
-                    caseClassifications
-                            .computeIfAbsent(itemName, k -> new ArrayList<>())
-                            .add(itemMap);
-                }
-                span.setAttribute("braintrust.output_json", toJson(outputByName));
-            } finally {
-                closeClassifierScope();
-                span.end();
-            }
-        }
-
-        @Override
-        public void onEnd() {
-            var root = requireRoot();
-            try {
-                if (!caseClassifications.isEmpty()) {
-                    root.setAttribute("braintrust.classifications", toJson(caseClassifications));
-                }
-                if (!classifierErrors.isEmpty()) {
-                    Map<String, Object> mergedMetadata =
-                            new LinkedHashMap<>(datasetCase.metadata());
-                    mergedMetadata.put("classifier_errors", classifierErrors);
-                    root.setAttribute(
-                            AttributeKey.stringKey("braintrust.metadata"), toJson(mergedMetadata));
-                }
-            } finally {
-                closeRootScope();
-                root.end();
-            }
-        }
-
-        /**
-         * Builds the {@link BrainstoreTrace} for this case from the root trace id and the task span
-         * id. Must be called after {@link #onTaskEnd}.
-         */
-        BrainstoreTrace brainstoreTrace() {
-            return BrainstoreTrace.forExperiment(
-                    client,
-                    experimentId,
-                    requireNonNullState(rootTraceId, "rootTraceId"),
-                    List.of(requireNonNullState(taskSpanId, "taskSpanId")));
-        }
-
-        private Span requireRoot() {
-            return requireNonNullState(rootSpan, "rootSpan");
-        }
-
-        private Span requireTask() {
-            return requireNonNullState(taskSpan, "taskSpan");
-        }
-
-        private Span requireScore() {
-            return requireNonNullState(scoreSpan, "scoreSpan");
-        }
-
-        private Span requireClassifier() {
-            return requireNonNullState(classifierSpan, "classifierSpan");
-        }
-
-        private void closeRootScope() {
-            if (rootScope != null) {
-                rootScope.close();
-                rootScope = null;
-            }
-        }
-
-        private void closeTaskScope() {
-            if (taskScope != null) {
-                taskScope.close();
-                taskScope = null;
-            }
-        }
-
-        private void closeScoreScope() {
-            if (scoreScope != null) {
-                scoreScope.close();
-                scoreScope = null;
-            }
-        }
-
-        private void closeClassifierScope() {
-            if (classifierScope != null) {
-                classifierScope.close();
-                classifierScope = null;
-            }
-        }
-    }
-
-    /** Records scores onto the score span and root span. Validation is the caller's job. */
-    private static void recordScores(
-            Span scoreSpan, Span rootSpan, Scorer<?, ?> scorer, List<Score> scores) {
-        if (scores == null || scores.isEmpty()) {
-            return;
-        }
-        final Map<String, Double> scorerScores = new LinkedHashMap<>();
-        for (var score : scores) {
-            scorerScores.put(score.name(), score.value());
-        }
-        Map<String, Object> spanAttrs = new LinkedHashMap<>();
-        spanAttrs.put("type", "score");
-        spanAttrs.put("name", scorer.getName());
-        spanAttrs.put("purpose", "scorer");
-        scoreSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs));
-        var scoresJson = toJson(scorerScores);
-        scoreSpan.setAttribute("braintrust.output_json", scoresJson);
-        scoreSpan.setAttribute("braintrust.scores", scoresJson);
-    }
-
-    /**
-     * Converts a {@link Classification} to the wire-format {@code ClassificationItem}: drops {@code
-     * name}, includes {@code label} and {@code metadata} only when present.
-     */
-    private static Map<String, Object> toClassificationItem(Classification c) {
-        Map<String, Object> m = new LinkedHashMap<>();
-        m.put("id", c.id());
-        if (c.label() != null) {
-            m.put("label", c.label());
-        }
-        if (c.metadata() != null) {
-            m.put("metadata", c.metadata());
-        }
-        return m;
-    }
-
-    private static <T> T requireNonNullState(@Nullable T value, String name) {
-        if (value == null) {
-            throw new IllegalStateException("OtelEvalListener: " + name + " accessed out of order");
-        }
-        return value;
-    }
-}
diff --git a/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java b/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java
index e1755a03..10b545f8 100644
--- a/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java
+++ b/braintrust-sdk/src/test/java/dev/braintrust/devserver/DevserverTest.java
@@ -793,9 +793,9 @@ void testTaskErrorHandling() throws Exception {
                 erroredTaskSpan.getEvents().stream().anyMatch(e -> e.getName().equals("exception")),
                 "task span should have an exception event");
 
-        // The errored case should still have a score span (from scoreForTaskException default 0.0)
-        // The score span is a child of the task span (since the task scope is still active when
-        // runScoreForTaskException is called from the catch block)
+        // The errored case should still have a score span (from scoreForTaskException default 0.0).
+        // The score span is a child of the eval (root) span: scoreForTaskException runs after the
+        // task scope has closed, so its span nests under the root rather than the task span.
         var erroredScoreSpans =
                 allSpans.stream()
                         .filter(s -> s.getName().equals("score"))
@@ -804,7 +804,7 @@ void testTaskErrorHandling() throws Exception {
                                         s.getParentSpanContext()
                                                 .getSpanId()
                                                 .equals(
-                                                        erroredTaskSpan
+                                                        erroredEvalSpan
                                                                 .getSpanContext()
                                                                 .getSpanId()))
                         .toList();