braintrustdata · Stephen Belanger (Qard) · Jun 26, 2026
diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts
@@ -1168,6 +1168,88 @@ describe("CodexEventProcessor: llm spans", () => {
     );
   });
 
+  test("llm token metrics normalize alternate and nested usage shapes", async () => { // usage arrives under prompt_*/completion_* keys plus nested *_details objects
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "yo!" }),
+        assistantMessage("Hey."),
+        tokenCount({ // none of the flat input_tokens / output_tokens / total_tokens keys appear in this payload
+          prompt_tokens: 100,
+          completion_tokens: 14,
+          prompt_tokens_details: { cached_tokens: 25, cache_creation_tokens: 7 }, // expected below as prompt_cached_tokens / prompt_cache_creation_tokens
+          completion_tokens_details: { reasoning_tokens: 4 }, // expected below as completion_reasoning_tokens
+          cost: 0.1234, // expected below under both cost and estimated_cost
+        }),
+        taskComplete({ turn_id: "t1", last_agent_message: "Hey." }),
+        stop({ turn_id: "t1" }),
+      ],
+      { // expected span tree: session task > turn task > llm span
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                output: { role: "assistant", content: "Hey." },
+                metrics: {
+                  prompt_tokens: 100,
+                  completion_tokens: 14,
+                  tokens: 114, // not in the payload: prompt_tokens + completion_tokens
+                  prompt_cached_tokens: 25,
+                  prompt_cache_creation_tokens: 7,
+                  completion_reasoning_tokens: 4,
+                  cost: 0.1234,
+                  estimated_cost: 0.1234, // same value as the payload's cost
+                },
+                ended: true,
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
+  test("a dangling llm span records why token usage is unavailable", async () => { // the transcript below contains no tokenCount() event
+    await assertProducesTrace(
+      [
+        sessionStart(),
+        sessionMeta({ cwd: "/work" }),
+        turnContext({ model: "gpt-5.5" }),
+        taskStarted({ turn_id: "t1" }),
+        userMessage({ message: "yo!" }),
+        assistantMessage("Hey."),
+        taskComplete({ turn_id: "t1", last_agent_message: "Hey." }), // no tokenCount() sits between the assistant message and this event
+        stop({ turn_id: "t1" }),
+      ],
+      { // expected span tree: session task > turn task > llm span
+        span_attributes: { name: "codex: work", type: "task" },
+        ended: true,
+        children: [
+          {
+            span_attributes: { name: "turn: t1", type: "task" },
+            ended: true,
+            children: [
+              {
+                span_attributes: { name: "gpt-5.5", type: "llm" },
+                output: { role: "assistant", content: "Hey." },
+                metadata: { usage_unavailable_reason: "codex_transcript_missing_token_count" }, // reason the span is expected to carry; no metrics are asserted here
+                ended: true, // the span must still be ended
+              },
+            ],
+          },
+        ],
+      },
+    );
+  });
+
   // A reasoning item with a readable summary is surfaced as a `reasoning` entry
   // in the llm output (Braintrust's OpenAI Responses shape), interleaved before
   // the assistant message.

diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts
@@ -276,17 +276,53 @@ function tokenMetrics(usage: Record<string, unknown>): Record<string, number> {
   const metrics: Record<string, number> = {};
   const num = (v: unknown): number | undefined =>
     typeof v === "number" && Number.isFinite(v) ? v : undefined;
+  const at = (path: string): number | undefined => {
+    const step = (node: unknown, key: string): unknown =>
+      node !== null && typeof node === "object"
+        ? (node as Record<string, unknown>)[key]
+        : undefined;
+    // Fold the dotted path over `usage`; any non-object hop collapses to undefined.
+    return num(path.split(".").reduce<unknown>(step, usage));
+  };
   const map: Array<[string, string]> = [
     ["input_tokens", "prompt_tokens"],
+    ["prompt_tokens", "prompt_tokens"],
     ["output_tokens", "completion_tokens"],
+    ["completion_tokens", "completion_tokens"],
     ["total_tokens", "tokens"],
+    ["tokens", "tokens"],
     ["cached_input_tokens", "prompt_cached_tokens"],
+    ["prompt_cached_tokens", "prompt_cached_tokens"],
+    ["input_tokens_details.cached_tokens", "prompt_cached_tokens"],
+    ["prompt_tokens_details.cached_tokens", "prompt_cached_tokens"],
+    ["prompt_cache_creation_tokens", "prompt_cache_creation_tokens"],
+    ["input_tokens_details.cache_creation_tokens", "prompt_cache_creation_tokens"],
+    ["input_tokens_details.cache_write_tokens", "prompt_cache_creation_tokens"],
+    ["prompt_tokens_details.cache_creation_tokens", "prompt_cache_creation_tokens"],
+    ["prompt_tokens_details.cache_write_tokens", "prompt_cache_creation_tokens"],
     ["reasoning_output_tokens", "completion_reasoning_tokens"],
+    ["completion_reasoning_tokens", "completion_reasoning_tokens"],
+    ["reasoning_tokens", "completion_reasoning_tokens"],
+    ["output_tokens_details.reasoning_tokens", "completion_reasoning_tokens"],
+    ["completion_tokens_details.reasoning_tokens", "completion_reasoning_tokens"],
+    ["cost", "cost"],
+    ["cost", "estimated_cost"],
+    ["estimated_cost", "estimated_cost"],
+    ["total_cost", "estimated_cost"],
+    ["cost_usd", "estimated_cost"],
   ];
   for (const [from, to] of map) {
-    const v = num(usage[from]);
+    if (metrics[to] !== undefined) continue;
+    const v = at(from);
     if (v !== undefined) metrics[to] = v;
   }
+  if (
+    metrics.tokens === undefined &&
+    metrics.prompt_tokens !== undefined &&
+    metrics.completion_tokens !== undefined
+  ) {
+    metrics.tokens = metrics.prompt_tokens + metrics.completion_tokens;
+  }
   return metrics;
 }
 
@@ -1467,6 +1503,15 @@ export class CodexEventProcessor implements EventProcessor {
     const info = (payload.info ?? {}) as Record<string, unknown>;
     const usage = (info.last_token_usage ?? {}) as Record<string, unknown>;
     const metrics = tokenMetrics(usage);
+    const metadata =
+      Object.keys(metrics).length === 0
+        ? {
+            usage_unavailable_reason:
+              Object.keys(usage).length === 0
+                ? "codex_token_count_missing_usage"
+                : "codex_token_count_unrecognized_usage",
+          }
+        : undefined;
     const { span, turnId, output, outputPreset, lastOutputTime } = scope.openLlm;
     // End the span when the model last generated (its last output item), NOT at
     // this token_count: Codex writes the token_count after the tool result (at
@@ -1476,7 +1521,11 @@ export class CodexEventProcessor implements EventProcessor {
     // Fall back to the token_count time if we somehow have no output time.
     const endTime = lastOutputTime ?? isoToUnixSeconds(record.timestamp);
     try {
-      span.log(outputPreset ? { metrics } : { output: llmOutput(output), metrics });
+      span.log(
+        outputPreset
+          ? { metrics, ...(metadata !== undefined ? { metadata } : {}) }
+          : { output: llmOutput(output), metrics, ...(metadata !== undefined ? { metadata } : {}) },
+      );
       span.end(endTime !== undefined ? { endTime } : undefined);
       // This LLM call is a child of its turn; advance the turn's boundary so the
       // next child (LLM or tool) starts where this one ended (its last output).
@@ -1501,7 +1550,8 @@ export class CodexEventProcessor implements EventProcessor {
     const { span, turnId, output, lastOutputTime } = scope.openLlm;
     const effectiveEnd = lastOutputTime ?? endTime;
     try {
-      if (output.length > 0) span.log({ output: llmOutput(output) });
+      const metadata = { usage_unavailable_reason: "codex_transcript_missing_token_count" };
+      span.log(output.length > 0 ? { output: llmOutput(output), metadata } : { metadata });
       span.end(effectiveEnd !== undefined ? { endTime: effectiveEnd } : undefined);
       this.noteChildEnded(scope, turnId, effectiveEnd);
     } catch (err) {

diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts
@@ -228,7 +228,7 @@ export function reasoning(summary: string[] = []): TranscriptEntry {
  * token_count event: closes the current llm span. `usage` maps the Codex token
  * keys (input_tokens, output_tokens, total_tokens, ...) onto last_token_usage.
  */
-export function tokenCount(usage: Record<string, number> = {}): TranscriptEntry {
+export function tokenCount(usage: Record<string, unknown> = {}): TranscriptEntry {
   return transcript({
     type: "event_msg",
     payload: { type: "token_count", info: { last_token_usage: usage } },