diff --git a/examples/run-mini-swe-agent/main.py b/examples/run-mini-swe-agent/main.py
index 2def486..7f14f68 100644
--- a/examples/run-mini-swe-agent/main.py
+++ b/examples/run-mini-swe-agent/main.py
@@ -66,11 +66,26 @@ async def main() -> None:
                 print(f"{type(exc).__name__}: {exc}", flush=True)
                 await print_verification(client, args.workdir)
                 return
-            print(f"mini_exit_status={result.get('exit_status', 'unknown')}", flush=True)
-            submission = str(result.get("submission", ""))
+            print(f"mini_exit_status={result.exit_status}", flush=True)
+            submission = result.submission
             if submission:
                 print("mini_submission:", flush=True)
                 print(submission.rstrip(), flush=True)
+            if result.usage:
+                print(
+                    "mini_usage: "
+                    f"input={result.usage.get('n_input_tokens', 0)} "
+                    f"output={result.usage.get('n_output_tokens', 0)} "
+                    f"cached={result.usage.get('n_cache_tokens', 0)} "
+                    f"cost=${result.usage.get('cost_usd', 0):.4f}",
+                    flush=True,
+                )
+            if result.trajectory is not None:
+                print(
+                    f"mini_trajectory_steps={len(result.trajectory.steps)} "
+                    f"schema={result.trajectory.schema_version}",
+                    flush=True,
+                )
             await print_verification(client, args.workdir)
 
 
diff --git a/plugins/agents/mini-swe-agent/pyproject.toml b/plugins/agents/mini-swe-agent/pyproject.toml
index 3632962..4324784 100644
--- a/plugins/agents/mini-swe-agent/pyproject.toml
+++ b/plugins/agents/mini-swe-agent/pyproject.toml
@@ -18,3 +18,6 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools]
 package-dir = { "agentix.agents.mini_swe_agent" = "src" }
 packages = ["agentix.agents.mini_swe_agent"]
+
+[tool.setuptools.package-data]
+"agentix.agents.mini_swe_agent" = ["*.py"]
diff --git a/plugins/agents/mini-swe-agent/src/__init__.py b/plugins/agents/mini-swe-agent/src/__init__.py
index a2d72bb..9a5284d 100644
--- a/plugins/agents/mini-swe-agent/src/__init__.py
+++ b/plugins/agents/mini-swe-agent/src/__init__.py
@@ -1,5 +1,32 @@
 """mini-swe-agent integration public exports."""
 
-from .runner import run
+from .runner import MiniSweAgentResult, run
+from .trajectory import (
+    SCHEMA_VERSION,
+    AgentInfo,
+    FinalMetrics,
+    Metrics,
+    Observation,
+    ObservationResult,
+    Step,
+    ToolCall,
+    Trajectory,
+    aggregate_usage,
+    from_mini_swe_agent,
+)
 
-__all__ = ["run"]
+__all__ = [
+    "AgentInfo",
+    "FinalMetrics",
+    "Metrics",
+    "MiniSweAgentResult",
+    "Observation",
+    "ObservationResult",
+    "SCHEMA_VERSION",
+    "Step",
+    "ToolCall",
+    "Trajectory",
+    "aggregate_usage",
+    "from_mini_swe_agent",
+    "run",
+]
diff --git a/plugins/agents/mini-swe-agent/src/runner.py b/plugins/agents/mini-swe-agent/src/runner.py
index 18aeea1..b485c2b 100644
--- a/plugins/agents/mini-swe-agent/src/runner.py
+++ b/plugins/agents/mini-swe-agent/src/runner.py
@@ -1,18 +1,91 @@
+"""mini-swe-agent runner for Agentix sandboxes.
+
+Patterns borrowed from harbor's `MiniSweAgent`:
+
+  * `run()` drives a pre-built agent inside the sandbox, captures
+    mini-swe-agent's native v2 trajectory file, and post-processes it
+    into a structured `Trajectory` plus aggregated usage metrics.
+  * The captured trajectory rides back to the host as part of the
+    return value (no shared filesystem assumptions, no extra file
+    round-trip — `client.remote(...)` pickles the value).
+  * `cost_limit` / `reasoning_effort` / `config_yaml` are not `run()`
+    parameters: they are set in-process on the `DefaultAgent` instance
+    the caller constructs, not via a CLI subprocess (we do not spawn
+    the mini-swe-agent CLI; the agent object's `run(task)` is the
+    integration point).
+
+The return shape is a `MiniSweAgentResult` dataclass; legacy callers
+that expected a plain dict still get one via `result.to_dict()`.
+"""
+
 from __future__ import annotations
 
+import json
+import uuid
+from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import Any
 
 from minisweagent import Agent
 
+from .trajectory import Trajectory, aggregate_usage, from_mini_swe_agent
+
+
+@dataclass(slots=True)
+class MiniSweAgentResult:
+    """Enriched return type from a mini-swe-agent run.
+
+    `exit_status` and `submission` carry the same values the agent's
+    result mapping did, but as attributes — this is not a dict, so
+    callers that subscripted the old return value need `to_dict()`.
+    The structured `trajectory` and `usage` fields surface the metrics
+    harbor's `populate_context_post_run` pushes into the trial context,
+    without the agent-installer machinery.
+    """
+
+    exit_status: str  # run() stores str() of the result's "exit_status"
+    submission: str  # run() stores str() of the result's "submission"
+    workdir: str
+    raw_trajectory: dict[str, Any] = field(default_factory=dict)  # {} when none was captured
+    trajectory: Trajectory | None = None  # None when absent or conversion failed
+    usage: dict[str, Any] = field(default_factory=dict)  # aggregate_usage() totals, or {}
+
+    def to_dict(self) -> dict[str, Any]:
+        d = asdict(self)  # recursive: the nested Trajectory dataclass becomes a dict too
+        if self.trajectory is not None:
+            d["trajectory"] = self.trajectory.to_dict()  # swap in the None-stripped form
+        return d
+
+
+# ── public entry point ────────────────────────────────────────────────────
+
 
 def run(
     task: str,
     *,
     workdir: str = "/testbed",
     agent: Agent,
-) -> dict[str, Any]:
-    """Run a pre-built mini-swe-agent instance in sandbox."""
+    trajectory_path: str | Path | None = None,
+    session_id: str | None = None,
+) -> MiniSweAgentResult:
+    """Run a pre-built mini-swe-agent inside the sandbox.
+
+    Mini-swe-agent's `DefaultAgent.run(task)` result is passed through
+    `dict(...)`, so it must be a mapping (or an iterable of key/value
+    pairs) carrying `exit_status` and `submission`. We:
+
+    1. Set the environment's working directory if the agent exposes
+       `env.config.cwd` (mirrors harbor's wiring).
+    2. Run the agent and collect the result.
+    3. If `trajectory_path` is provided AND the file exists after the
+       run, load mini-swe-agent's native v2 trajectory and convert to
+       a structured `Trajectory`.
+    4. Compute aggregated `usage` for callers that just need totals.
+
+    All paths are sandbox-local; the host pickle-recovers the
+    `MiniSweAgentResult` over the runtime client and can persist /
+    inspect / score it without re-running anything.
+    """
     workdir_path = Path(workdir)
     workdir_path.mkdir(parents=True, exist_ok=True)
 
@@ -21,4 +94,52 @@ def run(
     if env_config is not None and hasattr(env_config, "cwd"):
         env_config.cwd = str(workdir_path)
 
-    return dict(agent.run(task))
+    raw_result = dict(agent.run(task))
+    exit_status = str(raw_result.get("exit_status", "unknown"))
+    submission = str(raw_result.get("submission", ""))
+
+    raw_trajectory: dict[str, Any] = {}
+    if trajectory_path is not None:
+        raw_trajectory = _read_trajectory(Path(trajectory_path))
+    elif "messages" in raw_result:
+        # Some mini-swe-agent versions expose the trajectory inline on
+        # `agent.run()`'s return value (notably bench scripts that
+        # don't persist a file). Honour that too.
+        raw_trajectory = raw_result
+
+    trajectory: Trajectory | None = None
+    usage: dict[str, Any] = {}
+    if raw_trajectory:
+        usage = aggregate_usage(raw_trajectory)
+        try:
+            trajectory = from_mini_swe_agent(
+                raw_trajectory,
+                session_id=session_id or uuid.uuid4().hex,
+            )
+        except Exception:
+            # Best-effort: aggregating usage is cheaper / more
+            # tolerant than full conversion; keep `usage` even if
+            # conversion fails so the host always sees token counts.
+            trajectory = None
+
+    return MiniSweAgentResult(
+        exit_status=exit_status,
+        submission=submission,
+        workdir=str(workdir_path),
+        raw_trajectory=raw_trajectory,
+        trajectory=trajectory,
+        usage=usage,
+    )
+
+
+def _read_trajectory(path: Path) -> dict[str, Any]:
+    """Best-effort load of the JSON object at `path`; `{}` on any failure."""
+    try:
+        # is_file() first so a directory/FIFO is never opened; encoding is explicit.
+        data = json.loads(path.read_text(encoding="utf-8")) if path.is_file() else {}
+    except (OSError, ValueError):  # ValueError covers JSON *and* Unicode decode errors
+        return {}
+    return data if isinstance(data, dict) else {}
+
+
+__all__ = ["MiniSweAgentResult", "run"]
diff --git a/plugins/agents/mini-swe-agent/src/trajectory.py b/plugins/agents/mini-swe-agent/src/trajectory.py
new file mode 100644
index 0000000..95b1fde
--- /dev/null
+++ b/plugins/agents/mini-swe-agent/src/trajectory.py
@@ -0,0 +1,388 @@
+"""Structured trajectory model for mini-swe-agent runs.
+
+Mirrors the ATIF (Agent Trajectory Interchange Format) shape used by
+harbor's `MiniSweAgent.populate_context_post_run`: one
+`Trajectory` per agent run, made of ordered `Step`s, each carrying
+the message, optional tool calls, observations, and per-call
+metrics. `FinalMetrics` aggregates the totals used downstream by
+schedulers / eval harnesses / RL buffers.
+
+The actual format is decoupled from any external schema package so
+the plugin can be consumed by the in-tree examples and tests without
+pulling in harbor.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Iterable
+from dataclasses import asdict, dataclass, field
+from datetime import UTC, datetime
+from typing import Any, Literal
+
+SCHEMA_VERSION = "ATIF-v1.2"
+
+StepSource = Literal["agent", "user", "system"]
+
+
+@dataclass(slots=True)
+class ToolCall:  # one entry of an assistant message's `tool_calls` array
+    tool_call_id: str  # the call's "id", or a synthesized call_<step>_<n>
+    function_name: str
+    arguments: dict[str, Any]  # always a dict; the parser wraps anything else
+
+
+@dataclass(slots=True)
+class ObservationResult:  # text of one tool/user reply attached to an agent step
+    content: str
+
+
+@dataclass(slots=True)
+class Observation:  # every reply attached to one agent step, in arrival order
+    results: list[ObservationResult] = field(default_factory=list)
+
+
+@dataclass(slots=True)
+class Metrics:  # per-step usage; None fields are omitted by Trajectory.to_dict()
+    prompt_tokens: int | None = None
+    completion_tokens: int | None = None
+    cached_tokens: int | None = None
+    cost_usd: float | None = None  # an apportioned estimate when set by the converter
+    extra: dict[str, Any] | None = None
+
+
+@dataclass(slots=True)
+class FinalMetrics:  # run-wide totals; built by from_mini_swe_agent()
+    total_prompt_tokens: int = 0
+    total_completion_tokens: int = 0
+    total_cached_tokens: int | None = None  # converter stores None instead of 0
+    total_cost_usd: float | None = None  # converter stores None instead of 0.0
+    extra: dict[str, Any] | None = None
+
+
+@dataclass(slots=True)
+class Step:  # one system, user, or agent turn of the run
+    step_id: int  # the converter numbers steps from 1 in message order
+    timestamp: str
+    source: StepSource
+    message: str = ""
+    model_name: str | None = None
+    reasoning_content: str | None = None
+    tool_calls: list[ToolCall] | None = None
+    observation: Observation | None = None  # the converter only attaches these to agent steps
+    metrics: Metrics | None = None
+
+
+@dataclass(slots=True)
+class AgentInfo:  # identifies the agent and model that produced a trajectory
+    name: str
+    version: str | None = None
+    model_name: str | None = None
+    extra: dict[str, Any] | None = None
+
+
+@dataclass(slots=True)
+class Trajectory:
+    """Structured representation of one mini-swe-agent run."""
+
+    schema_version: str
+    session_id: str
+    agent: AgentInfo
+    steps: list[Step]
+    final_metrics: FinalMetrics
+    notes: str | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        return _strip_none(asdict(self))  # None-valued keys dropped at every depth (tool args too)
+
+    def to_json(self, *, indent: int | None = 2) -> str:
+        return json.dumps(self.to_dict(), indent=indent)  # serialises the None-stripped dict
+
+
+# ── mini-swe-agent v2 -> Trajectory ───────────────────────────────────────
+
+
+def from_mini_swe_agent(
+    trajectory: dict[str, Any],
+    *,
+    session_id: str,
+    now: datetime | None = None,
+) -> Trajectory:
+    """Convert a mini-swe-agent v2 trajectory dict into our `Trajectory`.
+
+    system -> step; assistant -> agent step with parsed `tool_calls` and
+    per-call metrics; tool -> observation on the latest agent step; user ->
+    observation when the latest step is an agent step, else a user step
+    (so the task prompt survives even without a system message). Every
+    step shares one timestamp: `now`, or the current UTC time if omitted.
+    """
+    info = trajectory.get("info") or {}
+    config = info.get("config") or {}
+    model_config = config.get("model") or {}
+    agent_config = config.get("agent") or {}
+    model_name = model_config.get("model_name") or "unknown"
+    mini_version = info.get("mini_version") or "unknown"
+    original_format = trajectory.get("trajectory_format", "unknown")
+
+    messages = trajectory.get("messages") or []
+    total_cost_usd = float((info.get("model_stats") or {}).get("instance_cost") or 0.0)
+
+    total_completion_tokens = 0  # needed up front to apportion the run cost per step
+    for message in messages:
+        usage = _usage_of(message)
+        total_completion_tokens += int(usage.get("completion_tokens") or 0)
+
+    timestamp = _isoformat(now or datetime.now(UTC))  # loop-invariant: computed once
+    steps: list[Step] = []
+    step_id = 1
+    total_prompt = 0
+    total_cached = 0
+    total_reasoning = 0
+
+    for message in messages:
+        role = message.get("role")
+        content = _normalize_content(message.get("content"))
+        usage = _usage_of(message)
+        prompt_tokens = int(usage.get("prompt_tokens") or 0)
+        completion_tokens = int(usage.get("completion_tokens") or 0)
+        prompt_details = usage.get("prompt_tokens_details") or {}
+        completion_details = usage.get("completion_tokens_details") or {}
+        cached_tokens = (
+            int(prompt_details.get("cached_tokens") or 0)
+            if isinstance(prompt_details, dict)
+            else 0
+        )
+        reasoning_tokens = (
+            int(completion_details.get("reasoning_tokens") or 0)
+            if isinstance(completion_details, dict)
+            else 0
+        )
+        total_prompt += prompt_tokens
+        total_cached += cached_tokens
+        total_reasoning += reasoning_tokens
+
+        # Role dispatch. Messages with any other role are skipped; a user message
+        # becomes a step of its own unless the latest step is an agent step.
+        if role == "system":
+            steps.append(Step(step_id=step_id, timestamp=timestamp, source="system", message=content))
+            step_id += 1
+        elif role == "user":
+            if not steps or steps[-1].source != "agent":
+                steps.append(Step(step_id=step_id, timestamp=timestamp, source="user", message=content))
+                step_id += 1
+            else:
+                _attach_observation(steps, content)
+        elif role == "tool":
+            _attach_observation(steps, content)
+        elif role == "assistant":
+            tool_calls, reasoning = _parse_tool_calls(message, content, step_id)
+            metrics = _build_step_metrics(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                cached_tokens=cached_tokens,
+                prompt_tokens_details=prompt_details if isinstance(prompt_details, dict) else {},
+                completion_tokens_details=completion_details
+                if isinstance(completion_details, dict)
+                else {},
+                total_cost_usd=total_cost_usd,
+                total_completion_tokens=total_completion_tokens,
+            )
+            steps.append(
+                Step(
+                    step_id=step_id,
+                    timestamp=timestamp,
+                    source="agent",
+                    model_name=model_name,
+                    message=content,
+                    reasoning_content=reasoning,
+                    tool_calls=tool_calls,
+                    metrics=metrics,
+                )
+            )
+            step_id += 1
+
+    final_extra: dict[str, Any] = {}
+    if total_reasoning > 0:
+        final_extra["total_reasoning_tokens"] = total_reasoning
+
+    final = FinalMetrics(
+        total_prompt_tokens=total_prompt,
+        total_completion_tokens=total_completion_tokens,
+        total_cached_tokens=total_cached if total_cached > 0 else None,
+        total_cost_usd=total_cost_usd if total_cost_usd > 0 else None,
+        extra=final_extra or None,
+    )
+
+    return Trajectory(
+        schema_version=SCHEMA_VERSION,
+        session_id=session_id,
+        agent=AgentInfo(
+            name="mini-swe-agent",
+            version=mini_version,
+            model_name=model_name,
+            extra={"original_format": original_format, "agent_config": agent_config},
+        ),
+        steps=steps,
+        final_metrics=final,
+        notes="Converted from mini-swe-agent v2 trajectory",
+    )
+
+
+def aggregate_usage(trajectory: dict[str, Any]) -> dict[str, Any]:
+    """Light-weight summary: total tokens + cost from a raw mini-swe-agent trajectory.
+
+    Cheaper than `from_mini_swe_agent` for callers that only need the
+    metrics summary (e.g. populating `AgentContext.n_*` fields).
+    """
+    info = trajectory.get("info") or {}
+    total_cost_usd = float((info.get("model_stats") or {}).get("instance_cost") or 0.0)
+    prompt = 0  # running totals across all messages
+    completion = 0
+    cached = 0
+    for message in trajectory.get("messages") or []:
+        usage = _usage_of(message)  # {} when the message carries no usage data
+        prompt += int(usage.get("prompt_tokens") or 0)
+        completion += int(usage.get("completion_tokens") or 0)
+        details = usage.get("prompt_tokens_details") or {}
+        if isinstance(details, dict):
+            cached += int(details.get("cached_tokens") or 0)
+    return {
+        "n_input_tokens": prompt,
+        "n_output_tokens": completion,
+        "n_cache_tokens": cached,
+        "cost_usd": total_cost_usd,  # read from info.model_stats, not summed per message
+    }
+
+
+# ── internals ─────────────────────────────────────────────────────────────
+
+
+def _isoformat(dt: datetime) -> str:  # UTC ISO-8601 text with a "Z" suffix
+    return dt.astimezone(UTC).isoformat().replace("+00:00", "Z")
+
+
+def _usage_of(message: dict[str, Any]) -> dict[str, Any]:
+    """Return `message["extra"]["response"]["usage"]`; `{}` if any level is not a dict."""
+    for key in ("extra", "response", "usage"):
+        message = message.get(key) if isinstance(message, dict) else {}
+    return message if isinstance(message, dict) else {}
+
+
+def _normalize_content(raw: Any) -> str:  # flatten a message's `content` to plain text
+    if raw is None:
+        return ""
+    if isinstance(raw, str):
+        return raw
+    if isinstance(raw, list):  # list-of-parts content: one output line per part
+        parts: list[str] = []
+        for part in raw:
+            if isinstance(part, dict):
+                parts.append(str(part.get("text", part)))  # no "text" key -> str() of the part
+            else:
+                parts.append(str(part))
+        return "\n".join(parts)
+    return str(raw)
+
+
+def _attach_observation(steps: list[Step], content: str) -> None:  # mutates steps[-1]
+    if not steps or steps[-1].source != "agent":
+        # No agent step to attach to (rare): the content is discarded.
+        return
+    prev = steps[-1]
+    if prev.observation is None:
+        prev.observation = Observation(results=[ObservationResult(content=content)])
+    else:
+        prev.observation.results.append(ObservationResult(content=content))
+
+
+def _parse_tool_calls(
+    message: dict[str, Any], content: str, step_id: int
+) -> tuple[list[ToolCall] | None, str | None]:  # (tool calls or None, reasoning or None)
+    raw_calls = message.get("tool_calls")
+    if not isinstance(raw_calls, list) or not raw_calls:
+        return None, content if content else None
+    parsed: list[ToolCall] = []
+    for tc in raw_calls:
+        if not isinstance(tc, dict):
+            continue  # skip malformed entries rather than failing the whole step
+        tc_id = str(tc.get("id") or f"call_{step_id}_{len(parsed) + 1}")  # fallback id
+        function = tc.get("function") or {}
+        name = str(function.get("name", "bash")) if isinstance(function, dict) else "bash"
+        raw_args = function.get("arguments", "{}") if isinstance(function, dict) else "{}"
+        if isinstance(raw_args, dict):
+            arguments = raw_args
+        elif isinstance(raw_args, str):
+            try:
+                arguments = json.loads(raw_args)
+            except (json.JSONDecodeError, TypeError):
+                arguments = {"command": raw_args}  # not JSON: keep the string as the command
+        else:
+            arguments = {"command": str(raw_args)}
+        if not isinstance(arguments, dict):
+            arguments = {"_raw": arguments}  # valid JSON but not an object (e.g. a list)
+        parsed.append(ToolCall(tool_call_id=tc_id, function_name=name, arguments=arguments))
+    reasoning = content if content else None  # "" becomes None
+    return (parsed or None), reasoning
+
+
+def _build_step_metrics(
+    *,
+    prompt_tokens: int,
+    completion_tokens: int,
+    cached_tokens: int,
+    prompt_tokens_details: dict[str, Any],
+    completion_tokens_details: dict[str, Any],
+    total_cost_usd: float,
+    total_completion_tokens: int,
+) -> Metrics | None:  # None when the step reported no tokens at all
+    if prompt_tokens == 0 and completion_tokens == 0:
+        return None
+
+    step_cost: float | None = None  # estimate: this step's share of the run cost
+    if total_cost_usd > 0 and total_completion_tokens > 0 and completion_tokens > 0:
+        step_cost = (completion_tokens / total_completion_tokens) * total_cost_usd
+
+    extra: dict[str, Any] = {}  # raw provider detail dicts, kept only when non-empty
+    if prompt_tokens_details:
+        extra["prompt_tokens_details"] = prompt_tokens_details
+    if completion_tokens_details:
+        extra["completion_tokens_details"] = completion_tokens_details
+    return Metrics(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        cached_tokens=cached_tokens if cached_tokens > 0 else None,
+        cost_usd=step_cost if step_cost and step_cost > 0 else None,
+        extra=extra or None,
+    )
+
+
+def _strip_none(obj: Any) -> Any:
+    """Recursively drop keys whose value is `None`, so the serialised
+    Trajectory is compact and stable for diff comparisons."""
+    if isinstance(obj, dict):
+        return {k: _strip_none(v) for k, v in obj.items() if v is not None}
+    if isinstance(obj, list):
+        return [_strip_none(v) for v in obj]  # None *items* of a list are kept
+    return obj  # anything that is not a dict or list passes through untouched
+
+
+def trajectory_records(trajectory: Trajectory) -> Iterable[Step]:
+    """Convenience iterator for callers that want to stream steps."""
+    return iter(trajectory.steps)  # iterates the existing list in order; nothing is copied
+
+
+__all__ = [
+    "AgentInfo",
+    "FinalMetrics",
+    "Metrics",
+    "Observation",
+    "ObservationResult",
+    "SCHEMA_VERSION",
+    "Step",
+    "StepSource",
+    "ToolCall",
+    "Trajectory",
+    "aggregate_usage",
+    "from_mini_swe_agent",
+    "trajectory_records",
+]
diff --git a/plugins/agents/mini-swe-agent/tests/test_mini_swe_agent.py b/plugins/agents/mini-swe-agent/tests/test_mini_swe_agent.py
index c845f81..a2dcf37 100644
--- a/plugins/agents/mini-swe-agent/tests/test_mini_swe_agent.py
+++ b/plugins/agents/mini-swe-agent/tests/test_mini_swe_agent.py
@@ -1,5 +1,26 @@
+"""Tests for the mini-swe-agent runner and trajectory conversion.
+
+Covers:
+
+  * `run(...)` returns a `MiniSweAgentResult` carrying `exit_status`
+    and `submission` from the agent, the resolved `workdir`, and the
+    raw trajectory dict.
+  * When a `trajectory_path` is provided, the structured
+    `Trajectory` is constructed from mini-swe-agent's v2 format,
+    including system / user / assistant / tool / tool_use steps.
+  * `aggregate_usage(...)` matches what `from_mini_swe_agent(...)`
+    produces in `final_metrics` (consistency check between the
+    cheap and full paths).
+  * Errors inside the agent surface as exceptions instead of being
+    swallowed.
+"""
+
 from __future__ import annotations
 
+import json
+from pathlib import Path
+from typing import Any
+
 import agentix.agents.mini_swe_agent as mini_swe
 import pytest
 
@@ -14,36 +35,209 @@ def __init__(self) -> None:
         self.config = DummyEnvConfig()
 
 
-def test_run_success(tmp_path):
+def _v2_trajectory() -> dict[str, Any]:
+    """Minimal mini-swe-agent v2 trajectory with system/user/assistant/tool."""
+    usage_a = {  # first assistant call: includes cached and reasoning token details
+        "prompt_tokens": 12,
+        "completion_tokens": 4,
+        "prompt_tokens_details": {"cached_tokens": 3},
+        "completion_tokens_details": {"reasoning_tokens": 1},
+    }
+    usage_b = {"prompt_tokens": 20, "completion_tokens": 6}  # totals: 32 prompt / 10 completion
+    return {
+        "trajectory_format": "mini-swe-agent.v2",
+        "info": {
+            "mini_version": "2.3.0",
+            "config": {
+                "model": {"model_name": "openai/gpt-4o-mini"},
+                "agent": {"mode": "yolo"},
+            },
+            "model_stats": {"instance_cost": 0.012},
+        },
+        "messages": [
+            {"role": "system", "content": "You are a helpful agent."},
+            {"role": "user", "content": "fix the bug"},
+            {
+                "role": "assistant",
+                "content": "thinking...",
+                "tool_calls": [
+                    {
+                        "id": "call-1",
+                        "function": {"name": "bash", "arguments": json.dumps({"command": "ls"})},
+                    }
+                ],
+                "extra": {"response": {"usage": usage_a}},
+            },
+            {"role": "tool", "content": "file_a.py\nfile_b.py"},
+            {
+                "role": "assistant",
+                "content": "done",
+                "extra": {"response": {"usage": usage_b}},
+            },
+        ],
+    }
+
+
+# ── run() ─────────────────────────────────────────────────────────────────
+
+
+def test_run_returns_structured_result(tmp_path: Path) -> None:
     class DummyAgent:
         def __init__(self) -> None:
             self.env = DummyEnv()
 
-        def run(self, task: str):
+        def run(self, _: str):
             return {"exit_status": "submitted", "submission": "diff --git ..."}
 
     agent = DummyAgent()
+    result = mini_swe.run("fix bug", workdir=str(tmp_path), agent=agent)
+
+    assert isinstance(result, mini_swe.MiniSweAgentResult)
+    assert result.exit_status == "submitted"
+    assert result.submission == "diff --git ..."
+    assert result.workdir == str(tmp_path)
+    assert agent.env.config.cwd == str(tmp_path)
+    # No trajectory_path passed and `run` returned a plain result
+    # without `messages` -> no trajectory.
+    assert result.trajectory is None
+    assert result.usage == {}
+
+
+def test_run_loads_trajectory_from_file(tmp_path: Path) -> None:
+    trajectory_path = tmp_path / "mini-swe-agent.trajectory.json"
+    trajectory_path.write_text(json.dumps(_v2_trajectory()))
+
+    class DummyAgent:
+        def __init__(self) -> None:
+            self.env = DummyEnv()
+
+        def run(self, _: str):
+            return {"exit_status": "submitted", "submission": "patch"}
+
     result = mini_swe.run(
-        "fix bug",
+        "fix",
         workdir=str(tmp_path),
-        agent=agent,
+        agent=DummyAgent(),
+        trajectory_path=trajectory_path,
+        session_id="sess-test",
     )
-    assert result["exit_status"] == "submitted"
-    assert result["submission"] == "diff --git ..."
-    assert agent.env.config.cwd == str(tmp_path)
 
+    assert result.trajectory is not None
+    traj = result.trajectory
+    assert traj.session_id == "sess-test"
+    assert traj.agent.name == "mini-swe-agent"
+    assert traj.agent.version == "2.3.0"
+    assert traj.agent.model_name == "openai/gpt-4o-mini"
+
+    sources = [s.source for s in traj.steps]
+    assert sources == ["system", "user", "agent", "agent"]
+    # First assistant carries a tool call and a tool_result observation
+    # was attached to that same step.
+    [first_agent, second_agent] = [s for s in traj.steps if s.source == "agent"]
+    assert first_agent.tool_calls is not None
+    assert first_agent.tool_calls[0].function_name == "bash"
+    assert first_agent.tool_calls[0].arguments == {"command": "ls"}
+    assert first_agent.observation is not None
+    assert first_agent.observation.results[0].content == "file_a.py\nfile_b.py"
+    # Second assistant — no tool call, message text preserved.
+    assert second_agent.tool_calls is None
+    assert second_agent.message == "done"
+
+    # Final metrics aggregate correctly.
+    assert traj.final_metrics.total_prompt_tokens == 32
+    assert traj.final_metrics.total_completion_tokens == 10
+    assert traj.final_metrics.total_cached_tokens == 3
+    assert traj.final_metrics.total_cost_usd == 0.012
+    assert (traj.final_metrics.extra or {}).get("total_reasoning_tokens") == 1
 
-def test_run_exception_propagates(tmp_path):
+    # Cheap aggregate matches the full path.
+    assert result.usage == {
+        "n_input_tokens": 32,
+        "n_output_tokens": 10,
+        "n_cache_tokens": 3,
+        "cost_usd": 0.012,
+    }
+
+
+def test_run_inline_trajectory_passthrough(tmp_path: Path) -> None:
+    """Some bench scripts return the trajectory inline. Honour that path."""
+    inline = _v2_trajectory()
+    inline["exit_status"] = "submitted"  # result keys live beside "messages" in one mapping
+    inline["submission"] = "patch"
+
+    class DummyAgent:
+        def __init__(self) -> None:
+            self.env = DummyEnv()
+
+        def run(self, _: str):
+            return inline
+
+    result = mini_swe.run("fix", workdir=str(tmp_path), agent=DummyAgent())  # no trajectory_path
+    assert result.trajectory is not None
+    assert result.usage["n_input_tokens"] == 32
+
+
+def test_run_exception_propagates(tmp_path: Path) -> None:
     class BoomAgent:
         def __init__(self) -> None:
             self.env = DummyEnv()
 
-        def run(self, task: str):
+        def run(self, _: str):
             raise RuntimeError("boom")
 
     with pytest.raises(RuntimeError, match="boom"):
-        mini_swe.run(
-            "fix bug",
-            workdir=str(tmp_path),
-            agent=BoomAgent(),
-        )
+        mini_swe.run("fix", workdir=str(tmp_path), agent=BoomAgent())
+
+
+# ── trajectory module direct tests ────────────────────────────────────────
+
+
+def test_aggregate_usage_matches_final_metrics() -> None:
+    raw = _v2_trajectory()
+    usage = mini_swe.aggregate_usage(raw)
+    traj = mini_swe.from_mini_swe_agent(raw, session_id="sid")
+    # Only the prompt/completion totals are compared; cached tokens and cost are not.
+    assert traj.final_metrics.total_prompt_tokens == usage["n_input_tokens"]
+    assert traj.final_metrics.total_completion_tokens == usage["n_output_tokens"]
+
+
+def test_trajectory_to_dict_strips_none() -> None:
+    raw = _v2_trajectory()
+    traj = mini_swe.from_mini_swe_agent(raw, session_id="sid")
+    d = traj.to_dict()
+    # No None values inside the dict-valued top-level fields (one level deep).
+    for v in d.values():
+        if isinstance(v, dict):
+            assert all(value is not None for value in v.values())
+    # SCHEMA_VERSION exposed for downstream consumers.
+    assert d["schema_version"] == mini_swe.SCHEMA_VERSION
+
+
+def test_trajectory_to_json_is_valid() -> None:
+    raw = _v2_trajectory()
+    traj = mini_swe.from_mini_swe_agent(raw, session_id="sid")
+    parsed = json.loads(traj.to_json())  # round-trip: the output must parse as JSON
+    assert parsed["agent"]["name"] == "mini-swe-agent"
+    assert parsed["steps"][0]["source"] == "system"
+
+
+def test_tool_call_with_string_arguments_falls_back_to_command() -> None:
+    raw: dict[str, Any] = {
+        "trajectory_format": "mini-swe-agent.v2",
+        "info": {"mini_version": "2.3", "model_stats": {"instance_cost": 0.0}},
+        "messages": [
+            {"role": "system", "content": "x"},
+            {"role": "user", "content": "y"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {"id": "c1", "function": {"name": "bash", "arguments": "ls -la"}}
+                ],
+                "extra": {"response": {"usage": {"prompt_tokens": 1, "completion_tokens": 1}}},
+            },
+        ],
+    }
+    traj = mini_swe.from_mini_swe_agent(raw, session_id="sid")
+    [agent_step] = [s for s in traj.steps if s.source == "agent"]
+    assert agent_step.tool_calls is not None
+    assert agent_step.tool_calls[0].arguments == {"command": "ls -la"}  # "ls -la" is not JSON