diff --git a/plugins/trace-codex/src/agents/codex/event-processor.test.ts b/plugins/trace-codex/src/agents/codex/event-processor.test.ts index 7f5da2f..02c666d 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.test.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.test.ts @@ -1168,6 +1168,88 @@ describe("CodexEventProcessor: llm spans", () => { ); }); + test("llm token metrics normalize alternate and nested usage shapes", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "yo!" }), + assistantMessage("Hey."), + tokenCount({ + prompt_tokens: 100, + completion_tokens: 14, + prompt_tokens_details: { cached_tokens: 25, cache_creation_tokens: 7 }, + completion_tokens_details: { reasoning_tokens: 4 }, + cost: 0.1234, + }), + taskComplete({ turn_id: "t1", last_agent_message: "Hey." }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: { role: "assistant", content: "Hey." }, + metrics: { + prompt_tokens: 100, + completion_tokens: 14, + tokens: 114, + prompt_cached_tokens: 25, + prompt_cache_creation_tokens: 7, + completion_reasoning_tokens: 4, + cost: 0.1234, + estimated_cost: 0.1234, + }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + + test("a dangling llm span records why token usage is unavailable", async () => { + await assertProducesTrace( + [ + sessionStart(), + sessionMeta({ cwd: "/work" }), + turnContext({ model: "gpt-5.5" }), + taskStarted({ turn_id: "t1" }), + userMessage({ message: "yo!" }), + assistantMessage("Hey."), + taskComplete({ turn_id: "t1", last_agent_message: "Hey." }), + stop({ turn_id: "t1" }), + ], + { + span_attributes: { name: "codex: work", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "turn: t1", type: "task" }, + ended: true, + children: [ + { + span_attributes: { name: "gpt-5.5", type: "llm" }, + output: { role: "assistant", content: "Hey." }, + metadata: { usage_unavailable_reason: "codex_transcript_missing_token_count" }, + ended: true, + }, + ], + }, + ], + }, + ); + }); + // A reasoning item with a readable summary is surfaced as a `reasoning` entry // in the llm output (Braintrust's OpenAI Responses shape), interleaved before // the assistant message. diff --git a/plugins/trace-codex/src/agents/codex/event-processor.ts b/plugins/trace-codex/src/agents/codex/event-processor.ts index 55fbfe2..edd9f35 100644 --- a/plugins/trace-codex/src/agents/codex/event-processor.ts +++ b/plugins/trace-codex/src/agents/codex/event-processor.ts @@ -276,17 +276,53 @@ function tokenMetrics(usage: Record): Record { const metrics: Record = {}; const num = (v: unknown): number | undefined => typeof v === "number" && Number.isFinite(v) ? v : undefined; + const at = (path: string): number | undefined => { + let cur: unknown = usage; + for (const part of path.split(".")) { + if (cur === null || typeof cur !== "object") return undefined; + cur = (cur as Record)[part]; + } + return num(cur); + }; const map: Array<[string, string]> = [ ["input_tokens", "prompt_tokens"], + ["prompt_tokens", "prompt_tokens"], ["output_tokens", "completion_tokens"], + ["completion_tokens", "completion_tokens"], ["total_tokens", "tokens"], + ["tokens", "tokens"], ["cached_input_tokens", "prompt_cached_tokens"], + ["prompt_cached_tokens", "prompt_cached_tokens"], + ["input_tokens_details.cached_tokens", "prompt_cached_tokens"], + ["prompt_tokens_details.cached_tokens", "prompt_cached_tokens"], + ["prompt_cache_creation_tokens", "prompt_cache_creation_tokens"], + ["input_tokens_details.cache_creation_tokens", "prompt_cache_creation_tokens"], + ["input_tokens_details.cache_write_tokens", "prompt_cache_creation_tokens"], + ["prompt_tokens_details.cache_creation_tokens", "prompt_cache_creation_tokens"], + ["prompt_tokens_details.cache_write_tokens", "prompt_cache_creation_tokens"], ["reasoning_output_tokens", "completion_reasoning_tokens"], + ["completion_reasoning_tokens", "completion_reasoning_tokens"], + ["reasoning_tokens", "completion_reasoning_tokens"], + ["output_tokens_details.reasoning_tokens", "completion_reasoning_tokens"], + ["completion_tokens_details.reasoning_tokens", "completion_reasoning_tokens"], + ["cost", "cost"], + ["cost", "estimated_cost"], + ["estimated_cost", "estimated_cost"], + ["total_cost", "estimated_cost"], + ["cost_usd", "estimated_cost"], ]; for (const [from, to] of map) { - const v = num(usage[from]); + if (metrics[to] !== undefined) continue; + const v = at(from); if (v !== undefined) metrics[to] = v; } + if ( + metrics.tokens === undefined && + metrics.prompt_tokens !== undefined && + metrics.completion_tokens !== undefined + ) { + metrics.tokens = metrics.prompt_tokens + metrics.completion_tokens; + } return metrics; } @@ -1467,6 +1503,15 @@ export class CodexEventProcessor implements EventProcessor { const info = (payload.info ?? {}) as Record; const usage = (info.last_token_usage ?? {}) as Record; const metrics = tokenMetrics(usage); + const metadata = + Object.keys(metrics).length === 0 + ? { + usage_unavailable_reason: + Object.keys(usage).length === 0 + ? "codex_token_count_missing_usage" + : "codex_token_count_unrecognized_usage", + } + : undefined; const { span, turnId, output, outputPreset, lastOutputTime } = scope.openLlm; // End the span when the model last generated (its last output item), NOT at // this token_count: Codex writes the token_count after the tool result (at @@ -1476,7 +1521,11 @@ export class CodexEventProcessor implements EventProcessor { // Fall back to the token_count time if we somehow have no output time. const endTime = lastOutputTime ?? isoToUnixSeconds(record.timestamp); try { - span.log(outputPreset ? { metrics } : { output: llmOutput(output), metrics }); + span.log( + outputPreset + ? { metrics, ...(metadata !== undefined ? { metadata } : {}) } + : { output: llmOutput(output), metrics, ...(metadata !== undefined ? { metadata } : {}) }, + ); span.end(endTime !== undefined ? { endTime } : undefined); // This LLM call is a child of its turn; advance the turn's boundary so the // next child (LLM or tool) starts where this one ended (its last output). @@ -1501,7 +1550,8 @@ export class CodexEventProcessor implements EventProcessor { const { span, turnId, output, lastOutputTime } = scope.openLlm; const effectiveEnd = lastOutputTime ?? endTime; try { - if (output.length > 0) span.log({ output: llmOutput(output) }); + const metadata = { usage_unavailable_reason: "codex_transcript_missing_token_count" }; + span.log(output.length > 0 ? { output: llmOutput(output), metadata } : { metadata }); span.end(effectiveEnd !== undefined ? { endTime: effectiveEnd } : undefined); this.noteChildEnded(scope, turnId, effectiveEnd); } catch (err) { diff --git a/plugins/trace-codex/src/agents/codex/test-helpers.ts b/plugins/trace-codex/src/agents/codex/test-helpers.ts index e85d729..a590523 100644 --- a/plugins/trace-codex/src/agents/codex/test-helpers.ts +++ b/plugins/trace-codex/src/agents/codex/test-helpers.ts @@ -228,7 +228,7 @@ export function reasoning(summary: string[] = []): TranscriptEntry { * token_count event: closes the current llm span. `usage` maps the Codex token * keys (input_tokens, output_tokens, total_tokens, ...) onto last_token_usage. */ -export function tokenCount(usage: Record = {}): TranscriptEntry { +export function tokenCount(usage: Record = {}): TranscriptEntry { return transcript({ type: "event_msg", payload: { type: "token_count", info: { last_token_usage: usage } },