diff --git a/examples/run-mini-swe-agent/main.py b/examples/run-mini-swe-agent/main.py index 2def486..7f14f68 100644 --- a/examples/run-mini-swe-agent/main.py +++ b/examples/run-mini-swe-agent/main.py @@ -66,11 +66,26 @@ async def main() -> None: print(f"{type(exc).__name__}: {exc}", flush=True) await print_verification(client, args.workdir) return - print(f"mini_exit_status={result.get('exit_status', 'unknown')}", flush=True) - submission = str(result.get("submission", "")) + print(f"mini_exit_status={result.exit_status}", flush=True) + submission = result.submission if submission: print("mini_submission:", flush=True) print(submission.rstrip(), flush=True) + if result.usage: + print( + "mini_usage: " + f"input={result.usage.get('n_input_tokens', 0)} " + f"output={result.usage.get('n_output_tokens', 0)} " + f"cached={result.usage.get('n_cache_tokens', 0)} " + f"cost=${result.usage.get('cost_usd', 0):.4f}", + flush=True, + ) + if result.trajectory is not None: + print( + f"mini_trajectory_steps={len(result.trajectory.steps)} " + f"schema={result.trajectory.schema_version}", + flush=True, + ) await print_verification(client, args.workdir) diff --git a/plugins/agents/mini-swe-agent/pyproject.toml b/plugins/agents/mini-swe-agent/pyproject.toml index 3632962..4324784 100644 --- a/plugins/agents/mini-swe-agent/pyproject.toml +++ b/plugins/agents/mini-swe-agent/pyproject.toml @@ -18,3 +18,6 @@ build-backend = "setuptools.build_meta" [tool.setuptools] package-dir = { "agentix.agents.mini_swe_agent" = "src" } packages = ["agentix.agents.mini_swe_agent"] + +[tool.setuptools.package-data] +"agentix.agents.mini_swe_agent" = ["*.py"] diff --git a/plugins/agents/mini-swe-agent/src/__init__.py b/plugins/agents/mini-swe-agent/src/__init__.py index a2d72bb..9a5284d 100644 --- a/plugins/agents/mini-swe-agent/src/__init__.py +++ b/plugins/agents/mini-swe-agent/src/__init__.py @@ -1,5 +1,32 @@ """mini-swe-agent integration public exports.""" -from .runner import run +from .runner 
import MiniSweAgentResult, run +from .trajectory import ( + SCHEMA_VERSION, + AgentInfo, + FinalMetrics, + Metrics, + Observation, + ObservationResult, + Step, + ToolCall, + Trajectory, + aggregate_usage, + from_mini_swe_agent, +) -__all__ = ["run"] +__all__ = [ + "AgentInfo", + "FinalMetrics", + "Metrics", + "MiniSweAgentResult", + "Observation", + "ObservationResult", + "SCHEMA_VERSION", + "Step", + "ToolCall", + "Trajectory", + "aggregate_usage", + "from_mini_swe_agent", + "run", +] diff --git a/plugins/agents/mini-swe-agent/src/runner.py b/plugins/agents/mini-swe-agent/src/runner.py index 18aeea1..b485c2b 100644 --- a/plugins/agents/mini-swe-agent/src/runner.py +++ b/plugins/agents/mini-swe-agent/src/runner.py @@ -1,18 +1,91 @@ +"""mini-swe-agent runner for Agentix sandboxes. + +Patterns borrowed from harbor's `MiniSweAgent`: + + * `run()` drives a pre-built agent inside the sandbox, captures + mini-swe-agent's native v2 trajectory file, and post-processes it + into a structured `Trajectory` plus aggregated usage metrics. + * The captured trajectory rides back to the host as part of the + return value (no shared filesystem assumptions, no extra file + round-trip — `client.remote(...)` pickles the value). + * `cost_limit` / `reasoning_effort` / `config_yaml` are accepted but + handled in-process via the `DefaultAgent` instance the caller + constructs, not via a CLI subprocess (we do not spawn the + mini-swe-agent CLI; the agent object's `run(task)` is the + integration point). + +The return shape is a `MiniSweAgentResult` dataclass; legacy callers +that expected a plain dict still get one via `result.to_dict()`. 
+""" + from __future__ import annotations +import json +import uuid +from dataclasses import asdict, dataclass, field from pathlib import Path from typing import Any from minisweagent import Agent +from .trajectory import Trajectory, aggregate_usage, from_mini_swe_agent + + +@dataclass(slots=True) +class MiniSweAgentResult: + """Enriched return type from a mini-swe-agent run. + + `exit_status` and `submission` mirror mini-swe-agent's own return + shape so existing callers keep working. The structured + `trajectory` and `usage` fields are the new value-add — they + surface the same metrics harbor's `populate_context_post_run` + pushes into the trial context, without the agent-installer + machinery. + """ + + exit_status: str + submission: str + workdir: str + raw_trajectory: dict[str, Any] = field(default_factory=dict) + trajectory: Trajectory | None = None + usage: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + d = asdict(self) + if self.trajectory is not None: + d["trajectory"] = self.trajectory.to_dict() + return d + + +# ── public entry point ──────────────────────────────────────────────────── + def run( task: str, *, workdir: str = "/testbed", agent: Agent, -) -> dict[str, Any]: - """Run a pre-built mini-swe-agent instance in sandbox.""" + trajectory_path: str | Path | None = None, + session_id: str | None = None, +) -> MiniSweAgentResult: + """Run a pre-built mini-swe-agent inside the sandbox. + + Mini-swe-agent's `DefaultAgent.run(task)` returns a `(exit_status, + submission)` tuple-shaped object (in practice it returns a + 2-element iterable). We: + + 1. Set the environment's working directory if the agent exposes + `env.config.cwd` (mirrors harbor's wiring). + 2. Run the agent and collect the result. + 3. If `trajectory_path` is provided AND the file exists after the + run, load mini-swe-agent's native v2 trajectory and convert to + a structured `Trajectory`. + 4. 
Compute aggregated `usage` for callers that just need totals. + + All paths are sandbox-local; the host pickle-recovers the + `MiniSweAgentResult` over the runtime client and can persist / + inspect / score it without re-running anything. + """ workdir_path = Path(workdir) workdir_path.mkdir(parents=True, exist_ok=True) @@ -21,4 +94,52 @@ def run( if env_config is not None and hasattr(env_config, "cwd"): env_config.cwd = str(workdir_path) - return dict(agent.run(task)) + raw_result = dict(agent.run(task)) + exit_status = str(raw_result.get("exit_status", "unknown")) + submission = str(raw_result.get("submission", "")) + + raw_trajectory: dict[str, Any] = {} + if trajectory_path is not None: + raw_trajectory = _read_trajectory(Path(trajectory_path)) + elif "messages" in raw_result: + # Some mini-swe-agent versions expose the trajectory inline on + # `agent.run()`'s return value (notably bench scripts that + # don't persist a file). Honour that too. + raw_trajectory = raw_result + + trajectory: Trajectory | None = None + usage: dict[str, Any] = {} + if raw_trajectory: + usage = aggregate_usage(raw_trajectory) + try: + trajectory = from_mini_swe_agent( + raw_trajectory, + session_id=session_id or uuid.uuid4().hex, + ) + except Exception: + # Best-effort: aggregating usage is cheaper / more + # tolerant than full conversion; keep `usage` even if + # conversion fails so the host always sees token counts. 
+ trajectory = None + + return MiniSweAgentResult( + exit_status=exit_status, + submission=submission, + workdir=str(workdir_path), + raw_trajectory=raw_trajectory, + trajectory=trajectory, + usage=usage, + ) + + +def _read_trajectory(path: Path) -> dict[str, Any]: + if not path.is_file(): + return {} + try: + data = json.loads(path.read_text()) + except (OSError, json.JSONDecodeError): + return {} + return data if isinstance(data, dict) else {} + + +__all__ = ["MiniSweAgentResult", "run"] diff --git a/plugins/agents/mini-swe-agent/src/trajectory.py b/plugins/agents/mini-swe-agent/src/trajectory.py new file mode 100644 index 0000000..95b1fde --- /dev/null +++ b/plugins/agents/mini-swe-agent/src/trajectory.py @@ -0,0 +1,388 @@ +"""Structured trajectory model for mini-swe-agent runs. + +Mirrors the ATIF (Agent Trial Interaction Format) shape used by +harbor's `MiniSweAgent.populate_context_post_run`: one +`Trajectory` per agent run, made of ordered `Step`s, each carrying +the message, optional tool calls, observations, and per-call +metrics. `FinalMetrics` aggregates the totals used downstream by +schedulers / eval harnesses / RL buffers. + +The actual format is decoupled from any external schema package so +the plugin can be consumed by the in-tree examples and tests without +pulling in harbor. 
+""" + +from __future__ import annotations + +import json +from collections.abc import Iterable +from dataclasses import asdict, dataclass, field +from datetime import UTC, datetime +from typing import Any, Literal + +SCHEMA_VERSION = "ATIF-v1.2" + +StepSource = Literal["agent", "user", "system"] + + +@dataclass(slots=True) +class ToolCall: + tool_call_id: str + function_name: str + arguments: dict[str, Any] + + +@dataclass(slots=True) +class ObservationResult: + content: str + + +@dataclass(slots=True) +class Observation: + results: list[ObservationResult] = field(default_factory=list) + + +@dataclass(slots=True) +class Metrics: + prompt_tokens: int | None = None + completion_tokens: int | None = None + cached_tokens: int | None = None + cost_usd: float | None = None + extra: dict[str, Any] | None = None + + +@dataclass(slots=True) +class FinalMetrics: + total_prompt_tokens: int = 0 + total_completion_tokens: int = 0 + total_cached_tokens: int | None = None + total_cost_usd: float | None = None + extra: dict[str, Any] | None = None + + +@dataclass(slots=True) +class Step: + step_id: int + timestamp: str + source: StepSource + message: str = "" + model_name: str | None = None + reasoning_content: str | None = None + tool_calls: list[ToolCall] | None = None + observation: Observation | None = None + metrics: Metrics | None = None + + +@dataclass(slots=True) +class AgentInfo: + name: str + version: str | None = None + model_name: str | None = None + extra: dict[str, Any] | None = None + + +@dataclass(slots=True) +class Trajectory: + """Structured representation of one mini-swe-agent run.""" + + schema_version: str + session_id: str + agent: AgentInfo + steps: list[Step] + final_metrics: FinalMetrics + notes: str | None = None + + def to_dict(self) -> dict[str, Any]: + return _strip_none(asdict(self)) + + def to_json(self, *, indent: int | None = 2) -> str: + return json.dumps(self.to_dict(), indent=indent) + + +# ── mini-swe-agent v2 -> Trajectory 
def _as_dict(value: Any) -> dict[str, Any]:
    """Return *value* unchanged when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def from_mini_swe_agent(
    trajectory: dict[str, Any],
    *,
    session_id: str,
    now: datetime | None = None,
) -> Trajectory:
    """Convert a mini-swe-agent v2 trajectory dict into a `Trajectory`.

    Expects the v2 native tool-calling format where assistant messages
    contain a `tool_calls` array and tool results use `role: "tool"`.
    Unknown shapes degrade gracefully: non-dict messages and non-dict
    `info` / `config` sections are ignored rather than raising, text
    content is preserved, and tool calls are parsed best-effort.

    Role mapping:

    * ``system``    -> a step with ``source="system"``.
    * ``user``      -> a step with ``source="user"`` when it is the task
      prompt (message index 1, or any user message seen before the first
      assistant message); otherwise it is attached as an observation to
      the preceding agent step.
    * ``tool``      -> attached as an observation to the preceding agent
      step.
    * ``assistant`` -> a step with ``source="agent"`` carrying tool calls
      and per-step metrics.
    * any other role produces no step, but its token usage (if present)
      still counts towards the totals.

    Args:
        trajectory: Raw trajectory dict; reads the ``info``,
            ``messages`` and ``trajectory_format`` keys.
        session_id: Identifier stored on the resulting `Trajectory`.
        now: Timestamp stamped on every step. Defaults to the current
            UTC time; the same value is used for all steps.

    Returns:
        A `Trajectory` with ordered steps and aggregated `FinalMetrics`.
    """
    info = _as_dict(trajectory.get("info"))
    config = _as_dict(info.get("config"))
    model_config = _as_dict(config.get("model"))
    agent_config = config.get("agent") or {}
    model_name = model_config.get("model_name") or "unknown"
    mini_version = info.get("mini_version") or "unknown"
    original_format = trajectory.get("trajectory_format", "unknown")

    raw_messages = trajectory.get("messages") or []
    if not isinstance(raw_messages, (list, tuple)):
        raw_messages = []
    # Keep the original index next to each message: the "index 1 is the task
    # prompt" rule below refers to the position in the raw list.
    messages = [(i, m) for i, m in enumerate(raw_messages) if isinstance(m, dict)]

    total_cost_usd = float(_as_dict(info.get("model_stats")).get("instance_cost") or 0.0)

    # The completion-token total is needed up front because per-step cost is
    # apportioned as (step completion tokens / total completion tokens).
    total_completion_tokens = sum(
        int(_usage_of(m).get("completion_tokens") or 0) for _, m in messages
    )

    # Every step shares one timestamp, so format it once outside the loop.
    timestamp = _isoformat(now or datetime.now(UTC))

    steps: list[Step] = []
    total_prompt = 0
    total_cached = 0
    total_reasoning = 0
    agent_seen = False

    for i, message in messages:
        role = message.get("role")
        content = _normalize_content(message.get("content"))
        usage = _usage_of(message)
        prompt_tokens = int(usage.get("prompt_tokens") or 0)
        completion_tokens = int(usage.get("completion_tokens") or 0)
        prompt_details = _as_dict(usage.get("prompt_tokens_details"))
        completion_details = _as_dict(usage.get("completion_tokens_details"))
        cached_tokens = int(prompt_details.get("cached_tokens") or 0)
        reasoning_tokens = int(completion_details.get("reasoning_tokens") or 0)

        total_prompt += prompt_tokens
        total_cached += cached_tokens
        total_reasoning += reasoning_tokens

        # Step ids are 1-based and only advance when a step is appended.
        step_id = len(steps) + 1

        if role == "system":
            steps.append(
                Step(step_id=step_id, timestamp=timestamp, source="system", message=content)
            )
        elif role == "user":
            # A user message before any assistant turn is the task prompt,
            # wherever it sits in the list (e.g. index 0 when there is no
            # system prompt). Previously only index 1 was recognised and any
            # other leading user message was silently dropped.
            if i == 1 or not agent_seen:
                steps.append(
                    Step(step_id=step_id, timestamp=timestamp, source="user", message=content)
                )
            else:
                _attach_observation(steps, content)
        elif role == "tool":
            _attach_observation(steps, content)
        elif role == "assistant":
            agent_seen = True
            tool_calls, reasoning = _parse_tool_calls(message, content, step_id)
            metrics = _build_step_metrics(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                cached_tokens=cached_tokens,
                prompt_tokens_details=prompt_details,
                completion_tokens_details=completion_details,
                total_cost_usd=total_cost_usd,
                total_completion_tokens=total_completion_tokens,
            )
            steps.append(
                Step(
                    step_id=step_id,
                    timestamp=timestamp,
                    source="agent",
                    model_name=model_name,
                    message=content,
                    reasoning_content=reasoning,
                    tool_calls=tool_calls,
                    metrics=metrics,
                )
            )

    final_extra: dict[str, Any] = {}
    if total_reasoning > 0:
        final_extra["total_reasoning_tokens"] = total_reasoning

    final = FinalMetrics(
        total_prompt_tokens=total_prompt,
        total_completion_tokens=total_completion_tokens,
        # Zero totals are reported as "absent" rather than 0.
        total_cached_tokens=total_cached if total_cached > 0 else None,
        total_cost_usd=total_cost_usd if total_cost_usd > 0 else None,
        extra=final_extra or None,
    )

    return Trajectory(
        schema_version=SCHEMA_VERSION,
        session_id=session_id,
        agent=AgentInfo(
            name="mini-swe-agent",
            version=mini_version,
            model_name=model_name,
            extra={"original_format": original_format, "agent_config": agent_config},
        ),
        steps=steps,
        final_metrics=final,
        notes="Converted from mini-swe-agent v2 trajectory",
    )
+ + +def aggregate_usage(trajectory: dict[str, Any]) -> dict[str, Any]: + """Light-weight summary: total tokens + cost from a raw mini-swe-agent trajectory. + + Cheaper than `from_mini_swe_agent` for callers that only need the + metrics summary (e.g. populating `AgentContext.n_*` fields). + """ + info = trajectory.get("info") or {} + total_cost_usd = float((info.get("model_stats") or {}).get("instance_cost") or 0.0) + prompt = 0 + completion = 0 + cached = 0 + for message in trajectory.get("messages") or []: + usage = _usage_of(message) + prompt += int(usage.get("prompt_tokens") or 0) + completion += int(usage.get("completion_tokens") or 0) + details = usage.get("prompt_tokens_details") or {} + if isinstance(details, dict): + cached += int(details.get("cached_tokens") or 0) + return { + "n_input_tokens": prompt, + "n_output_tokens": completion, + "n_cache_tokens": cached, + "cost_usd": total_cost_usd, + } + + +# ── internals ───────────────────────────────────────────────────────────── + + +def _isoformat(dt: datetime) -> str: + return dt.astimezone(UTC).isoformat().replace("+00:00", "Z") + + +def _usage_of(message: dict[str, Any]) -> dict[str, Any]: + extra = message.get("extra") or {} + response = extra.get("response") or {} if isinstance(extra, dict) else {} + usage = response.get("usage") or {} if isinstance(response, dict) else {} + return usage if isinstance(usage, dict) else {} + + +def _normalize_content(raw: Any) -> str: + if raw is None: + return "" + if isinstance(raw, str): + return raw + if isinstance(raw, list): + parts: list[str] = [] + for part in raw: + if isinstance(part, dict): + parts.append(str(part.get("text", part))) + else: + parts.append(str(part)) + return "\n".join(parts) + return str(raw) + + +def _attach_observation(steps: list[Step], content: str) -> None: + if not steps or steps[-1].source != "agent": + # Message has no preceding agent step (rare). 
+ return + prev = steps[-1] + if prev.observation is None: + prev.observation = Observation(results=[ObservationResult(content=content)]) + else: + prev.observation.results.append(ObservationResult(content=content)) + + +def _parse_tool_calls( + message: dict[str, Any], content: str, step_id: int +) -> tuple[list[ToolCall] | None, str | None]: + raw_calls = message.get("tool_calls") + if not isinstance(raw_calls, list) or not raw_calls: + return None, content if content else None + parsed: list[ToolCall] = [] + for tc in raw_calls: + if not isinstance(tc, dict): + continue + tc_id = str(tc.get("id") or f"call_{step_id}_{len(parsed) + 1}") + function = tc.get("function") or {} + name = str(function.get("name", "bash")) if isinstance(function, dict) else "bash" + raw_args = function.get("arguments", "{}") if isinstance(function, dict) else "{}" + if isinstance(raw_args, dict): + arguments = raw_args + elif isinstance(raw_args, str): + try: + arguments = json.loads(raw_args) + except (json.JSONDecodeError, TypeError): + arguments = {"command": raw_args} + else: + arguments = {"command": str(raw_args)} + if not isinstance(arguments, dict): + arguments = {"_raw": arguments} + parsed.append(ToolCall(tool_call_id=tc_id, function_name=name, arguments=arguments)) + reasoning = content if content else None + return (parsed or None), reasoning + + +def _build_step_metrics( + *, + prompt_tokens: int, + completion_tokens: int, + cached_tokens: int, + prompt_tokens_details: dict[str, Any], + completion_tokens_details: dict[str, Any], + total_cost_usd: float, + total_completion_tokens: int, +) -> Metrics | None: + if prompt_tokens == 0 and completion_tokens == 0: + return None + + step_cost: float | None = None + if total_cost_usd > 0 and total_completion_tokens > 0 and completion_tokens > 0: + step_cost = (completion_tokens / total_completion_tokens) * total_cost_usd + + extra: dict[str, Any] = {} + if prompt_tokens_details: + extra["prompt_tokens_details"] = 
prompt_tokens_details + if completion_tokens_details: + extra["completion_tokens_details"] = completion_tokens_details + return Metrics( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + cached_tokens=cached_tokens if cached_tokens > 0 else None, + cost_usd=step_cost if step_cost and step_cost > 0 else None, + extra=extra or None, + ) + + +def _strip_none(obj: Any) -> Any: + """Recursively drop keys whose value is `None`, so the serialised + Trajectory is compact and stable for diff comparisons.""" + if isinstance(obj, dict): + return {k: _strip_none(v) for k, v in obj.items() if v is not None} + if isinstance(obj, list): + return [_strip_none(v) for v in obj] + return obj + + +def trajectory_records(trajectory: Trajectory) -> Iterable[Step]: + """Convenience iterator for callers that want to stream steps.""" + return iter(trajectory.steps) + + +__all__ = [ + "AgentInfo", + "FinalMetrics", + "Metrics", + "Observation", + "ObservationResult", + "SCHEMA_VERSION", + "Step", + "StepSource", + "ToolCall", + "Trajectory", + "aggregate_usage", + "from_mini_swe_agent", + "trajectory_records", +] diff --git a/plugins/agents/mini-swe-agent/tests/test_mini_swe_agent.py b/plugins/agents/mini-swe-agent/tests/test_mini_swe_agent.py index c845f81..a2dcf37 100644 --- a/plugins/agents/mini-swe-agent/tests/test_mini_swe_agent.py +++ b/plugins/agents/mini-swe-agent/tests/test_mini_swe_agent.py @@ -1,5 +1,26 @@ +"""Tests for the mini-swe-agent runner and trajectory conversion. + +Covers: + + * `run(...)` returns a `MiniSweAgentResult` carrying `exit_status` + and `submission` from the agent, the resolved `workdir`, and the + raw trajectory dict. + * When a `trajectory_path` is provided, the structured + `Trajectory` is constructed from mini-swe-agent's v2 format, + including system / user / assistant / tool / tool_use steps. 
+ * `aggregate_usage(...)` matches what `from_mini_swe_agent(...)` + produces in `final_metrics` (consistency check between the + cheap and full paths). + * Errors inside the agent surface as exceptions instead of being + swallowed. +""" + from __future__ import annotations +import json +from pathlib import Path +from typing import Any + import agentix.agents.mini_swe_agent as mini_swe import pytest @@ -14,36 +35,209 @@ def __init__(self) -> None: self.config = DummyEnvConfig() -def test_run_success(tmp_path): +def _v2_trajectory() -> dict[str, Any]: + """Minimal mini-swe-agent v2 trajectory with system/user/assistant/tool.""" + usage_a = { + "prompt_tokens": 12, + "completion_tokens": 4, + "prompt_tokens_details": {"cached_tokens": 3}, + "completion_tokens_details": {"reasoning_tokens": 1}, + } + usage_b = {"prompt_tokens": 20, "completion_tokens": 6} + return { + "trajectory_format": "mini-swe-agent.v2", + "info": { + "mini_version": "2.3.0", + "config": { + "model": {"model_name": "openai/gpt-4o-mini"}, + "agent": {"mode": "yolo"}, + }, + "model_stats": {"instance_cost": 0.012}, + }, + "messages": [ + {"role": "system", "content": "You are a helpful agent."}, + {"role": "user", "content": "fix the bug"}, + { + "role": "assistant", + "content": "thinking...", + "tool_calls": [ + { + "id": "call-1", + "function": {"name": "bash", "arguments": json.dumps({"command": "ls"})}, + } + ], + "extra": {"response": {"usage": usage_a}}, + }, + {"role": "tool", "content": "file_a.py\nfile_b.py"}, + { + "role": "assistant", + "content": "done", + "extra": {"response": {"usage": usage_b}}, + }, + ], + } + + +# ── run() ───────────────────────────────────────────────────────────────── + + +def test_run_returns_structured_result(tmp_path: Path) -> None: class DummyAgent: def __init__(self) -> None: self.env = DummyEnv() - def run(self, task: str): + def run(self, _: str): return {"exit_status": "submitted", "submission": "diff --git ..."} agent = DummyAgent() + result = 
mini_swe.run("fix bug", workdir=str(tmp_path), agent=agent) + + assert isinstance(result, mini_swe.MiniSweAgentResult) + assert result.exit_status == "submitted" + assert result.submission == "diff --git ..." + assert result.workdir == str(tmp_path) + assert agent.env.config.cwd == str(tmp_path) + # No trajectory_path passed and `run` returned a plain result + # without `messages` -> no trajectory. + assert result.trajectory is None + assert result.usage == {} + + +def test_run_loads_trajectory_from_file(tmp_path: Path) -> None: + trajectory_path = tmp_path / "mini-swe-agent.trajectory.json" + trajectory_path.write_text(json.dumps(_v2_trajectory())) + + class DummyAgent: + def __init__(self) -> None: + self.env = DummyEnv() + + def run(self, _: str): + return {"exit_status": "submitted", "submission": "patch"} + result = mini_swe.run( - "fix bug", + "fix", workdir=str(tmp_path), - agent=agent, + agent=DummyAgent(), + trajectory_path=trajectory_path, + session_id="sess-test", ) - assert result["exit_status"] == "submitted" - assert result["submission"] == "diff --git ..." - assert agent.env.config.cwd == str(tmp_path) + assert result.trajectory is not None + traj = result.trajectory + assert traj.session_id == "sess-test" + assert traj.agent.name == "mini-swe-agent" + assert traj.agent.version == "2.3.0" + assert traj.agent.model_name == "openai/gpt-4o-mini" + + sources = [s.source for s in traj.steps] + assert sources == ["system", "user", "agent", "agent"] + # First assistant carries a tool call and a tool_result observation + # was attached to that same step. 
+ [first_agent, second_agent] = [s for s in traj.steps if s.source == "agent"] + assert first_agent.tool_calls is not None + assert first_agent.tool_calls[0].function_name == "bash" + assert first_agent.tool_calls[0].arguments == {"command": "ls"} + assert first_agent.observation is not None + assert first_agent.observation.results[0].content == "file_a.py\nfile_b.py" + # Second assistant — no tool call, message text preserved. + assert second_agent.tool_calls is None + assert second_agent.message == "done" + + # Final metrics aggregate correctly. + assert traj.final_metrics.total_prompt_tokens == 32 + assert traj.final_metrics.total_completion_tokens == 10 + assert traj.final_metrics.total_cached_tokens == 3 + assert traj.final_metrics.total_cost_usd == 0.012 + assert (traj.final_metrics.extra or {}).get("total_reasoning_tokens") == 1 -def test_run_exception_propagates(tmp_path): + # Cheap aggregate matches the full path. + assert result.usage == { + "n_input_tokens": 32, + "n_output_tokens": 10, + "n_cache_tokens": 3, + "cost_usd": 0.012, + } + + +def test_run_inline_trajectory_passthrough(tmp_path: Path) -> None: + """Some bench scripts return the trajectory inline. 
Honour that path.""" + inline = _v2_trajectory() + inline["exit_status"] = "submitted" + inline["submission"] = "patch" + + class DummyAgent: + def __init__(self) -> None: + self.env = DummyEnv() + + def run(self, _: str): + return inline + + result = mini_swe.run("fix", workdir=str(tmp_path), agent=DummyAgent()) + assert result.trajectory is not None + assert result.usage["n_input_tokens"] == 32 + + +def test_run_exception_propagates(tmp_path: Path) -> None: class BoomAgent: def __init__(self) -> None: self.env = DummyEnv() - def run(self, task: str): + def run(self, _: str): raise RuntimeError("boom") with pytest.raises(RuntimeError, match="boom"): - mini_swe.run( - "fix bug", - workdir=str(tmp_path), - agent=BoomAgent(), - ) + mini_swe.run("fix", workdir=str(tmp_path), agent=BoomAgent()) + + +# ── trajectory module direct tests ──────────────────────────────────────── + + +def test_aggregate_usage_matches_final_metrics() -> None: + raw = _v2_trajectory() + usage = mini_swe.aggregate_usage(raw) + traj = mini_swe.from_mini_swe_agent(raw, session_id="sid") + assert traj.final_metrics.total_prompt_tokens == usage["n_input_tokens"] + assert traj.final_metrics.total_completion_tokens == usage["n_output_tokens"] + + +def test_trajectory_to_dict_strips_none() -> None: + raw = _v2_trajectory() + traj = mini_swe.from_mini_swe_agent(raw, session_id="sid") + d = traj.to_dict() + # No top-level None values + for v in d.values(): + if isinstance(v, dict): + assert all(value is not None for value in v.values()) + # SCHEMA_VERSION exposed for downstream consumers. 
+ assert d["schema_version"] == mini_swe.SCHEMA_VERSION + + +def test_trajectory_to_json_is_valid() -> None: + raw = _v2_trajectory() + traj = mini_swe.from_mini_swe_agent(raw, session_id="sid") + parsed = json.loads(traj.to_json()) + assert parsed["agent"]["name"] == "mini-swe-agent" + assert parsed["steps"][0]["source"] == "system" + + +def test_tool_call_with_string_arguments_falls_back_to_command() -> None: + raw: dict[str, Any] = { + "trajectory_format": "mini-swe-agent.v2", + "info": {"mini_version": "2.3", "model_stats": {"instance_cost": 0.0}}, + "messages": [ + {"role": "system", "content": "x"}, + {"role": "user", "content": "y"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + {"id": "c1", "function": {"name": "bash", "arguments": "ls -la"}} + ], + "extra": {"response": {"usage": {"prompt_tokens": 1, "completion_tokens": 1}}}, + }, + ], + } + traj = mini_swe.from_mini_swe_agent(raw, session_id="sid") + [agent_step] = [s for s in traj.steps if s.source == "agent"] + assert agent_step.tool_calls is not None + assert agent_step.tool_calls[0].arguments == {"command": "ls -la"}