From d717c4902f70453c1bcba2cf530851b30f3fa346 Mon Sep 17 00:00:00 2001
From: Austin Swinney <aswinney@harvard.edu>
Date: Tue, 9 Jun 2026 22:26:58 -0400
Subject: [PATCH 1/7] feat(benchmark): rank prompt variants on a RAGAS
 leaderboard

Add a prompt-sweep workflow: generate_prompt_sweep.py renders one benchmarking
config per prompt variant from a manifest, and ResultHandler.build_leaderboard
aggregates each variant's mean RAGAS metrics into a ranked leaderboard (with a
shared_context block and incomplete-row handling) emitted in the dump JSON.
Model, queries, retriever and judge are held fixed so only the prompt varies.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/docs/benchmarking.md                     |  75 ++++++
 scripts/benchmarking/generate_prompt_sweep.py | 165 +++++++++++++
 src/bin/service_benchmark.py                  | 158 +++++++++++++
 tests/unit/test_generate_prompt_sweep.py      | 125 ++++++++++
 tests/unit/test_prompt_sweep_dump_gating.py   |  58 +++++
 tests/unit/test_prompt_sweep_leaderboard.py   | 218 ++++++++++++++++++
 6 files changed, 799 insertions(+)
 create mode 100644 scripts/benchmarking/generate_prompt_sweep.py
 create mode 100644 tests/unit/test_generate_prompt_sweep.py
 create mode 100644 tests/unit/test_prompt_sweep_dump_gating.py
 create mode 100644 tests/unit/test_prompt_sweep_leaderboard.py

diff --git a/docs/docs/benchmarking.md b/docs/docs/benchmarking.md
index b9f9cb71..60ceccbe 100644
--- a/docs/docs/benchmarking.md
+++ b/docs/docs/benchmarking.md
@@ -130,6 +130,81 @@ To analyze results, see `scripts/benchmarking/` which contains:
 
 ---
 
+## Prompt sweep
+
+A *prompt sweep* runs N agent prompts through the RAGAS harness with everything
+but the prompt held fixed — model, queries, retriever, and the RAGAS judge all
+come from one base config — and ranks the prompts on a **leaderboard** by mean
+RAGAS metric. It generalizes a 2-way A/B to the whole prompt field, which is how
+we settle "which support prompt do we ship?" with data rather than by eyeballing
+one pair at a time (the open question Q5 / Decision 3 in
+`docs/docs/notes_response_tuning.md`).
+
+Only `services.benchmarking.agent_md_file` varies across variants. The model and
+RAGAS judge are deliberately held fixed; the leaderboard's `shared_context`
+cross-checks this and flags any drift.
+
+### 1. Write a manifest
+
+A manifest names one base config and the prompts to sweep. See
+`config/benchmarking/prompt_sweep.yaml` for a worked example:
+
+```yaml
+base_config: config/benchmarking/ragas.yaml   # supplies model/queries/judge/retriever
+out_dir: bench_out/sweep_configs              # optional (this is the default)
+primary_metric: faithfulness                  # leaderboard sort key (optional)
+prompts:
+  - config/agents/fasrc-cannon-v1-strict.md
+  - config/agents/fasrc-cannon-v2-lean.md
+  - config/agents/fasrc-cannon-v3-cited.md
+  - config/agents/fasrc-cannon-v4-linked.md
+```
+
+`primary_metric` is one of `answer_relevancy`, `faithfulness`,
+`context_precision`, `context_recall` (default `faithfulness` — grounding is the
+load-bearing property for a "never guess" support bot). All four metrics are
+reported per variant regardless; this only sets the ranking key.
+
+### 2. Generate the per-prompt configs
+
+```bash
+python scripts/benchmarking/generate_prompt_sweep.py -m config/benchmarking/prompt_sweep.yaml
+```
+
+This writes one config per prompt into the sweep directory, each identical to
+the base except `services.benchmarking.agent_md_file` (the prompt) and `.name`
+(the prompt's filename stem). It refuses to write anything if a prompt path is
+missing, so a bad manifest never leaves a partial set behind.
+
+### 3. Run the sweep
+
+```bash
+archi evaluate --config-dir bench_out/sweep_configs --hostmode
+```
+
+The harness runs each config in turn (the existing `--config-dir` path) and,
+because 2+ configs ran, emits a `leaderboard` block in the dump JSON.
+
+### Reading the leaderboard
+
+The dump JSON gains a `leaderboard` key:
+
+- `rows` — one per variant: `name`, `agent_md_file`, the four mean RAGAS
+  `metrics`, `primary_score`, `rank`, `query_count`, and `incomplete`. Rows are
+  ranked best-first by `primary_metric`; ties share a rank. A variant that
+  failed to produce a metric (missing/NaN) has `None` for it, is marked
+  `incomplete: true`, and sorts after all complete variants — it is never
+  treated as a zero.
+- `shared_context` — the model, provider, judge `evaluator_model`,
+  `queries_path`, and `corpus_snapshot_id` shared by all variants. If any of
+  these differ across the swept configs, the discrepancy is recorded in
+  `shared_context.warnings` (the sweep is no longer apples-to-apples).
+
+The pairwise `ab_comparisons` are still produced alongside the leaderboard; the
+leaderboard is computed independently from each config's aggregates.
+
+---
+
 ## Human grading via Argilla
 
 `archi evaluate --argilla` pushes benchmark results to a self-hosted [Argilla](https://argilla.io/) instance for independent human grading. This is the platform we use to answer the question "is config A better than config B for FASRC users?" with data we trust — RAGAS scores alone can't decide prompt or model choices because the judge LLM has its own biases.
diff --git a/scripts/benchmarking/generate_prompt_sweep.py b/scripts/benchmarking/generate_prompt_sweep.py
new file mode 100644
index 00000000..1d812d25
--- /dev/null
+++ b/scripts/benchmarking/generate_prompt_sweep.py
@@ -0,0 +1,165 @@
+"""Generate a RAGAS prompt-sweep: one benchmarking config per prompt variant.
+
+A prompt sweep runs N agent prompts through the existing RAGAS harness with
+everything but the prompt held fixed, so the only moving part is
+`services.benchmarking.agent_md_file`. This script takes one base config plus a
+list of prompt files (a sweep manifest) and writes one rendered config per
+prompt into a sweep directory. Each generated config is byte-for-byte identical
+to the base except for:
+
+  - services.benchmarking.agent_md_file  -> the prompt's path
+  - services.benchmarking.name           -> the prompt's filename stem
+  - services.benchmarking.primary_metric -> manifest primary_metric (if set)
+
+Holding everything else constant is what makes the resulting leaderboard an
+apples-to-apples comparison (the leaderboard's shared_context cross-check will
+flag any drift). Run the sweep with:
+
+    archi evaluate --config-dir <sweep_dir> --hostmode ...
+
+Manifest format (YAML):
+
+    base_config: config/benchmarking/ragas.yaml
+    out_dir: bench_out/sweep_configs        # optional, this is the default
+    primary_metric: faithfulness            # optional, default faithfulness
+    prompts:
+      - config/agents/fasrc-cannon-v1-strict.md
+      - config/agents/fasrc-cannon-v2-lean.md
+      - config/agents/fasrc-cannon-v3-cited.md
+      - config/agents/fasrc-cannon-v4-linked.md
+
+Usage:
+    python scripts/benchmarking/generate_prompt_sweep.py --manifest config/benchmarking/prompt_sweep.yaml
+"""
+
+from __future__ import annotations
+
+import argparse
+import copy
+import sys
+from pathlib import Path
+from typing import Any, Dict, List
+
+import yaml
+
+DEFAULT_OUT_DIR = "bench_out/sweep_configs"
+DEFAULT_PRIMARY_METRIC = "faithfulness"
+KNOWN_METRICS = {
+    "answer_relevancy",
+    "faithfulness",
+    "context_precision",
+    "context_recall",
+}
+
+
+def _load_yaml(path: Path) -> Dict[str, Any]:
+    with open(path, "r") as f:
+        data = yaml.safe_load(f)
+    if not isinstance(data, dict):
+        raise ValueError(f"Expected a YAML mapping at {path}, got {type(data).__name__}")
+    return data
+
+
+def generate_sweep_configs(manifest_path: Path) -> List[Path]:
+    """Render one benchmarking config per prompt listed in the manifest.
+
+    Validates that every prompt file exists BEFORE writing anything, so a bad
+    manifest never leaves a partial set of configs in the sweep directory.
+    Returns the list of written config paths.
+    """
+    manifest = _load_yaml(manifest_path)
+
+    base_config_raw = manifest.get("base_config")
+    if not base_config_raw:
+        raise ValueError("Manifest missing required key 'base_config'.")
+    prompts_raw = manifest.get("prompts")
+    if not isinstance(prompts_raw, list) or not prompts_raw:
+        raise ValueError("Manifest 'prompts' must be a non-empty list of prompt file paths.")
+
+    primary_metric = str(manifest.get("primary_metric", DEFAULT_PRIMARY_METRIC))
+    if primary_metric not in KNOWN_METRICS:
+        raise ValueError(
+            f"Manifest primary_metric '{primary_metric}' is not a known RAGAS metric "
+            f"{sorted(KNOWN_METRICS)}."
+        )
+
+    # Relative paths resolve against the current working directory (the repo
+    # root — the documented place to run this from, and how the prompts in the
+    # example manifest are written). Absolute paths are used as-is.
+    repo_root = Path.cwd()
+
+    def _resolve(p: str) -> Path:
+        candidate = Path(p)
+        return candidate if candidate.is_absolute() else repo_root / candidate
+
+    base_config_path = _resolve(str(base_config_raw))
+    if not base_config_path.is_file():
+        raise ValueError(f"base_config not found: '{base_config_path}'")
+
+    prompt_paths = [_resolve(str(p)) for p in prompts_raw]
+    missing = [str(p) for p in prompt_paths if not p.is_file()]
+    if missing:
+        # Atomic: refuse before writing any config.
+        raise ValueError(f"Prompt file(s) not found, aborting (no configs written): {missing}")
+
+    out_dir = _resolve(str(manifest.get("out_dir", DEFAULT_OUT_DIR)))
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    base_config = _load_yaml(base_config_path)
+    if "services" not in base_config or "benchmarking" not in base_config.get("services", {}):
+        raise ValueError(
+            f"base_config '{base_config_path}' has no services.benchmarking section to sweep."
+        )
+
+    written: List[Path] = []
+    for prompt_path in prompt_paths:
+        cfg = copy.deepcopy(base_config)
+        bench = cfg["services"]["benchmarking"]
+        stem = prompt_path.stem
+        bench["agent_md_file"] = str(prompt_path)
+        bench["name"] = stem
+        bench["primary_metric"] = primary_metric
+
+        out_path = out_dir / f"{stem}.yaml"
+        with open(out_path, "w") as f:
+            yaml.safe_dump(cfg, f, sort_keys=False)
+        written.append(out_path)
+
+    return written
+
+
+def main(argv: List[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        "--manifest", "-m", required=True,
+        help="Path to a sweep manifest YAML (base_config + prompts).",
+    )
+    args = parser.parse_args(argv)
+
+    manifest_path = Path(args.manifest)
+    if not manifest_path.is_file():
+        print(f"Manifest not found: {manifest_path}", file=sys.stderr)
+        return 2
+
+    try:
+        written = generate_sweep_configs(manifest_path)
+    except ValueError as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        return 1
+
+    out_dir = written[0].parent
+    manifest = _load_yaml(manifest_path)
+    primary_metric = str(manifest.get("primary_metric", DEFAULT_PRIMARY_METRIC))
+
+    print(f"Wrote {len(written)} sweep config(s) to {out_dir}/:")
+    for p in written:
+        print(f"  - {p.name}")
+    print()
+    print("Next: run the sweep (leaderboard ranks by "
+          f"'{primary_metric}'):")
+    print(f"  archi evaluate --config-dir {out_dir} --hostmode")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/bin/service_benchmark.py b/src/bin/service_benchmark.py
index 8ed40c39..1588cb4d 100644
--- a/src/bin/service_benchmark.py
+++ b/src/bin/service_benchmark.py
@@ -75,6 +75,7 @@ class ResultHandler:
     metadata = {}  # store the metadata about the benchmark run
     ab_comparison: Dict[str, Any] = {}  # single-pair compat (populated only in ab_mode with 2 configs)
     ab_comparisons: List[Dict[str, Any]] = []  # multi-pair: list of pair comparison dicts
+    leaderboard: Dict[str, Any] = {}  # prompt-sweep leaderboard (populated only when 2+ configs run)
     # Per-invocation identifier shared by every config in this archi-evaluate run.
     # Stamped onto Argilla records as metadata so the analysis notebook can refuse
     # to compute primary-outcome statistics across configs that were NOT run
@@ -177,6 +178,8 @@ def dump(benchmark_name: Path):
             output["ab_comparison"] = ResultHandler.ab_comparison
         if ResultHandler.ab_comparisons:
             output["ab_comparisons"] = ResultHandler.ab_comparisons
+        if ResultHandler.leaderboard:
+            output["leaderboard"] = ResultHandler.leaderboard
         with open(file_path, "w") as f:
             json.dump(output, f, indent=4)
 
@@ -312,6 +315,135 @@ def generate_pairwise_combinations(n_configs: int) -> List[Tuple[int, int]]:
         """Generate all pairwise index combinations for N configs."""
         return list(combinations(range(n_configs), 2))
 
+    # Leaderboard metric name -> the aggregate key the run loop writes onto
+    # total_results (service_benchmark.py RAGAS block). Order is display order.
+    LEADERBOARD_METRICS: List[Tuple[str, str]] = [
+        ("answer_relevancy", "aggregate_answer_relevancy"),
+        ("faithfulness", "aggregate_faithfulness"),
+        ("context_precision", "aggregate_context_precision"),
+        ("context_recall", "aggregate_context_recall"),
+    ]
+
+    @staticmethod
+    def build_leaderboard(primary_metric: str = "faithfulness") -> Dict[str, Any]:
+        """Rank swept prompt variants by mean RAGAS metric.
+
+        Reads each config's per-run aggregates from ResultHandler.results
+        (the means the RAGAS block already wrote onto total_results) and
+        builds a ranked leaderboard. Independent of the pairwise A/B plumbing:
+        it never touches pair_ab_results/ab_comparisons.
+
+        Each row: {name, agent_md_file, metrics{...}, primary_score, rank,
+        incomplete, query_count}. A metric is None (and the row `incomplete`)
+        when its aggregate key is absent or NaN — never silently zeroed.
+        Incomplete rows always sort after complete ones. Ties share a rank.
+
+        shared_context records the run context common to all variants and
+        flags any drift (a hand-edited config that breaks apples-to-apples).
+        """
+        metric_names = [name for name, _ in ResultHandler.LEADERBOARD_METRICS]
+        if primary_metric not in metric_names:
+            logger.warning(
+                "Leaderboard primary_metric '%s' is not a known RAGAS metric %s; "
+                "falling back to 'faithfulness'.",
+                primary_metric, metric_names,
+            )
+            primary_metric = "faithfulness"
+
+        def _benchmarking(record: Dict[str, Any]) -> Dict[str, Any]:
+            return (
+                record.get("configuration", {})
+                .get("services", {})
+                .get("benchmarking", {})
+            )
+
+        rows: List[Dict[str, Any]] = []
+        # Accumulate shared-context candidates to detect drift across configs.
+        ctx_fields: Dict[str, set] = {
+            "model": set(), "provider": set(),
+            "evaluator_model": set(), "queries_path": set(),
+        }
+
+        for record in ResultHandler.results:
+            bench = _benchmarking(record)
+            total = record.get("total_results", {}) or {}
+
+            agent_md_file = bench.get("agent_md_file", "") or ""
+            name = bench.get("name") or (Path(agent_md_file).stem if agent_md_file else "")
+
+            metrics: Dict[str, Optional[float]] = {}
+            incomplete = False
+            for metric_name, agg_key in ResultHandler.LEADERBOARD_METRICS:
+                value = total.get(agg_key)
+                if value is None or (isinstance(value, float) and math.isnan(value)):
+                    metrics[metric_name] = None
+                    incomplete = True
+                else:
+                    metrics[metric_name] = float(value)
+
+            if incomplete:
+                logger.warning(
+                    "Leaderboard: variant '%s' (%s) is incomplete — missing/NaN metrics: %s",
+                    name, agent_md_file,
+                    [m for m in metric_names if metrics[m] is None],
+                )
+
+            rows.append({
+                "name": name,
+                "agent_md_file": agent_md_file,
+                "metrics": metrics,
+                "primary_score": metrics[primary_metric],
+                "incomplete": incomplete,
+                "query_count": len(record.get("single_question_results") or {}),
+            })
+
+            ragas_settings = (bench.get("mode_settings", {}) or {}).get("ragas_settings", {}) or {}
+            ctx_fields["model"].add(bench.get("model"))
+            ctx_fields["provider"].add(bench.get("provider"))
+            ctx_fields["evaluator_model"].add(ragas_settings.get("evaluator_model"))
+            ctx_fields["queries_path"].add(bench.get("queries_path"))
+
+        # Complete rows first, then by descending primary score; incomplete last.
+        rows.sort(key=lambda r: (
+            1 if r["incomplete"] else 0,
+            -(r["primary_score"] if r["primary_score"] is not None else 0.0),
+        ))
+
+        # Dense ranking: equal primary scores share a rank.
+        rank = 0
+        prev_score: Any = object()
+        for row in rows:
+            score = row["primary_score"]
+            if score != prev_score:
+                rank += 1
+                prev_score = score
+            row["rank"] = rank
+
+        warnings: List[str] = []
+        shared_context: Dict[str, Any] = {
+            "corpus_snapshot_id": ResultHandler.get_corpus_snapshot_id(),
+        }
+        for field_name, values in ctx_fields.items():
+            present = {v for v in values if v is not None}
+            if len(present) <= 1:
+                shared_context[field_name] = next(iter(present), None)
+            else:
+                shared_context[field_name] = sorted(str(v) for v in present)
+                warnings.append(
+                    f"{field_name} differs across swept configs: {sorted(str(v) for v in present)}"
+                )
+        if warnings:
+            for w in warnings:
+                logger.warning("Leaderboard shared-context drift: %s", w)
+        shared_context["warnings"] = warnings
+
+        ResultHandler.leaderboard = {
+            "shared_context": shared_context,
+            "primary_metric": primary_metric,
+            "rows": rows,
+        }
+        return ResultHandler.leaderboard
+
 
 class Benchmarker:
 
@@ -885,6 +1017,32 @@ def run(self):
                     comp["aggregate"]["ties"],
                 )
 
+        # Prompt-sweep leaderboard: rank every config by mean RAGAS metric.
+        # Independent of the pairwise block above (reads per-config aggregates
+        # directly). Only meaningful with 2+ variants.
+        if len(ResultHandler.results) >= 2:
+            primary_metric = str(
+                self.config.get("services", {}).get("benchmarking", {}).get("primary_metric", "faithfulness")
+            )
+            leaderboard = ResultHandler.build_leaderboard(primary_metric)
+            logger.info("Prompt-sweep leaderboard (ranked by %s):", leaderboard["primary_metric"])
+            logger.info(
+                "  %-4s %-28s %-10s %-10s %-10s %-10s %-10s %s",
+                "rank", "name", "ans_rel", "faith", "ctx_prec", "ctx_rec", "n_q", "prompt",
+            )
+            for row in leaderboard["rows"]:
+                m = row["metrics"]
+                def _fmt(v: Optional[float]) -> str:
+                    return f"{v:.4f}" if isinstance(v, float) else "  n/a"
+                flag = "  (incomplete)" if row["incomplete"] else ""
+                logger.info(
+                    "  %-4d %-28s %-10s %-10s %-10s %-10s %-10d %s%s",
+                    row["rank"], row["name"][:28],
+                    _fmt(m["answer_relevancy"]), _fmt(m["faithfulness"]),
+                    _fmt(m["context_precision"]), _fmt(m["context_recall"]),
+                    row["query_count"], row["agent_md_file"], flag,
+                )
+
         # Push to Argilla when ARCHI_ARGILLA=1 in the benchmarks container env.
         # The CLI flag --argilla on `archi evaluate` sets this (see Task 2.5).
         argilla_enabled = os.environ.get("ARCHI_ARGILLA", "").strip().lower() in ("1", "true", "yes")
diff --git a/tests/unit/test_generate_prompt_sweep.py b/tests/unit/test_generate_prompt_sweep.py
new file mode 100644
index 00000000..7fa33c28
--- /dev/null
+++ b/tests/unit/test_generate_prompt_sweep.py
@@ -0,0 +1,125 @@
+"""Unit tests for the prompt-sweep config generator.
+
+Covers scripts/benchmarking/generate_prompt_sweep.py: one config per prompt,
+each differing from the base ONLY in services.benchmarking.agent_md_file and
+.name (+ primary_metric), name = prompt stem, and atomic failure (a missing
+prompt aborts before any config is written).
+"""
+
+import copy
+
+import pytest
+import yaml
+
+from scripts.benchmarking.generate_prompt_sweep import generate_sweep_configs
+
+BASE_CONFIG = {
+    "name": "ragas-bench",
+    "global": {"DATA_PATH": "/root/data/"},
+    "services": {
+        "benchmarking": {
+            "agent_class": "CMSCompOpsAgent",
+            "agent_md_file": "config/agents/fasrc-cannon-v1-strict.md",
+            "provider": "openai",
+            "model": "Qwen/Qwen3.5-35B-A3B-GPTQ-Int4",
+            "queries_path": "config/benchmarking/queries.json",
+            "modes": ["RAGAS"],
+        },
+    },
+}
+
+
+def _write_yaml(path, data):
+    with open(path, "w") as f:
+        yaml.safe_dump(data, f, sort_keys=False)
+
+
+def _setup(tmp_path, prompt_names, *, primary_metric=None, base=None):
+    """Create a base config, prompt files, and a manifest under tmp_path."""
+    base_path = tmp_path / "base.yaml"
+    _write_yaml(base_path, base if base is not None else BASE_CONFIG)
+
+    prompt_paths = []
+    for name in prompt_names:
+        p = tmp_path / name
+        p.write_text(f"# prompt {name}\n")
+        prompt_paths.append(p)
+
+    manifest = {
+        "base_config": str(base_path),
+        "out_dir": str(tmp_path / "sweep_configs"),
+        "prompts": [str(p) for p in prompt_paths],
+    }
+    if primary_metric is not None:
+        manifest["primary_metric"] = primary_metric
+    manifest_path = tmp_path / "manifest.yaml"
+    _write_yaml(manifest_path, manifest)
+    return manifest_path
+
+
+def test_one_config_per_prompt_only_prompt_fields_differ(tmp_path):
+    manifest = _setup(tmp_path, ["v1-strict.md", "v2-lean.md", "v3-cited.md"])
+    written = generate_sweep_configs(manifest)
+    assert len(written) == 3
+
+    for path in written:
+        with open(path) as f:
+            cfg = yaml.safe_load(f)
+        bench = cfg["services"]["benchmarking"]
+        # agent_md_file points at one of the prompts
+        assert bench["agent_md_file"].endswith(".md")
+        # Everything else equals the base, ignoring the three swept fields.
+        stripped = copy.deepcopy(cfg)
+        sb = stripped["services"]["benchmarking"]
+        for k in ("agent_md_file", "name", "primary_metric"):
+            sb.pop(k, None)
+        expected = copy.deepcopy(BASE_CONFIG)
+        expected["services"]["benchmarking"].pop("agent_md_file", None)
+        assert stripped == expected
+
+
+def test_name_is_prompt_stem(tmp_path):
+    manifest = _setup(tmp_path, ["fasrc-cannon-v2-lean.md"])
+    written = generate_sweep_configs(manifest)
+    with open(written[0]) as f:
+        cfg = yaml.safe_load(f)
+    assert cfg["services"]["benchmarking"]["name"] == "fasrc-cannon-v2-lean"
+    # output file is named for the stem too
+    assert written[0].name == "fasrc-cannon-v2-lean.yaml"
+
+
+def test_primary_metric_threaded_into_each_config(tmp_path):
+    manifest = _setup(tmp_path, ["a.md", "b.md"], primary_metric="answer_relevancy")
+    written = generate_sweep_configs(manifest)
+    for path in written:
+        with open(path) as f:
+            cfg = yaml.safe_load(f)
+        assert cfg["services"]["benchmarking"]["primary_metric"] == "answer_relevancy"
+
+
+def test_missing_prompt_aborts_atomically(tmp_path):
+    # One real prompt, one bogus path that does not exist.
+    base_path = tmp_path / "base.yaml"
+    _write_yaml(base_path, BASE_CONFIG)
+    real = tmp_path / "real.md"
+    real.write_text("# real\n")
+    out_dir = tmp_path / "sweep_configs"
+    manifest = {
+        "base_config": str(base_path),
+        "out_dir": str(out_dir),
+        "prompts": [str(real), str(tmp_path / "does-not-exist.md")],
+    }
+    manifest_path = tmp_path / "manifest.yaml"
+    _write_yaml(manifest_path, manifest)
+
+    with pytest.raises(ValueError, match="does-not-exist.md"):
+        generate_sweep_configs(manifest_path)
+
+    # Atomic: no configs written.
+    assert not out_dir.exists() or list(out_dir.glob("*.yaml")) == []
+
+
+def test_invalid_primary_metric_rejected(tmp_path):
+    manifest = _setup(tmp_path, ["a.md"], primary_metric="bogus_metric")
+    with pytest.raises(ValueError, match="bogus_metric"):
+        generate_sweep_configs(manifest)
diff --git a/tests/unit/test_prompt_sweep_dump_gating.py b/tests/unit/test_prompt_sweep_dump_gating.py
new file mode 100644
index 00000000..4d24d020
--- /dev/null
+++ b/tests/unit/test_prompt_sweep_dump_gating.py
@@ -0,0 +1,58 @@
+"""Dump-level gating test for the prompt-sweep leaderboard.
+
+ResultHandler.dump must include a `leaderboard` key only when the leaderboard
+has been populated (a 2+ config sweep), and omit it otherwise (an ordinary
+single-config run), leaving today's output otherwise unchanged.
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+import src.bin.service_benchmark as sb
+from src.bin.service_benchmark import ResultHandler
+
+
+@pytest.fixture(autouse=True)
+def _reset_and_redirect(tmp_path, monkeypatch):
+    """Reset ResultHandler class state and redirect dump output to tmp_path."""
+    saved = (ResultHandler.results, ResultHandler.metadata,
+             ResultHandler.leaderboard, ResultHandler.ab_comparison,
+             ResultHandler.ab_comparisons)
+    ResultHandler.results = []
+    ResultHandler.metadata = {}
+    ResultHandler.leaderboard = {}
+    ResultHandler.ab_comparison = {}
+    ResultHandler.ab_comparisons = []
+    monkeypatch.setattr(sb, "OUTPUT_DIR", tmp_path)
+    yield
+    (ResultHandler.results, ResultHandler.metadata,
+     ResultHandler.leaderboard, ResultHandler.ab_comparison,
+     ResultHandler.ab_comparisons) = saved
+
+
+def _dump_and_load(tmp_path):
+    ResultHandler.dump(Path("test-bench"))
+    files = list(tmp_path.glob("test-bench-*.json"))
+    assert len(files) == 1, f"expected one dump file, found {files}"
+    with open(files[0]) as f:
+        return json.load(f)
+
+
+def test_leaderboard_emitted_for_multi_config(tmp_path):
+    ResultHandler.results = [{"total_results": {}}, {"total_results": {}}]
+    ResultHandler.leaderboard = {"primary_metric": "faithfulness", "rows": [], "shared_context": {}}
+    output = _dump_and_load(tmp_path)
+    assert "leaderboard" in output
+    assert output["leaderboard"]["primary_metric"] == "faithfulness"
+
+
+def test_no_leaderboard_for_single_config(tmp_path):
+    ResultHandler.results = [{"total_results": {}}]
+    # leaderboard left empty (single-config run never builds one)
+    output = _dump_and_load(tmp_path)
+    assert "leaderboard" not in output
+    # output otherwise unchanged: the usual top-level keys are present
+    assert "benchmarking_results" in output
+    assert "metadata" in output
diff --git a/tests/unit/test_prompt_sweep_leaderboard.py b/tests/unit/test_prompt_sweep_leaderboard.py
new file mode 100644
index 00000000..a805f75f
--- /dev/null
+++ b/tests/unit/test_prompt_sweep_leaderboard.py
@@ -0,0 +1,218 @@
+"""Unit tests for the RAGAS prompt-sweep leaderboard aggregator.
+
+Covers ResultHandler.build_leaderboard: one row per config, ranking by a
+configurable primary metric, tie handling, incomplete-variant handling
+(missing/NaN metrics sort last), name fallback to the prompt stem, and the
+shared-context drift check. The aggregator reads per-config aggregates only —
+these tests never construct any pairwise A/B data.
+"""
+
+from typing import Optional
+
+import pytest
+
+from src.bin.service_benchmark import ResultHandler
+
+
+def _make_record(
+    name,
+    agent_md_file,
+    *,
+    answer_relevancy: Optional[float] = 0.8,
+    faithfulness: Optional[float] = 0.8,
+    context_precision: Optional[float] = 0.8,
+    context_recall: Optional[float] = 0.8,
+    model="Qwen/Qwen3.5-35B-A3B-GPTQ-Int4",
+    provider="openai",
+    evaluator_model="us.anthropic.claude-sonnet-4-5-20250929-v1:0",
+    queries_path="config/benchmarking/queries.json",
+    n_questions=3,
+    include_name=True,
+):
+    """Build a ResultHandler.results record shaped like handle_results writes."""
+    total_results = {}
+    for key, value in (
+        ("aggregate_answer_relevancy", answer_relevancy),
+        ("aggregate_faithfulness", faithfulness),
+        ("aggregate_context_precision", context_precision),
+        ("aggregate_context_recall", context_recall),
+    ):
+        if value is not None:  # omit the key entirely to model a missing metric
+            total_results[key] = value
+
+    benchmarking = {
+        "agent_md_file": agent_md_file,
+        "model": model,
+        "provider": provider,
+        "queries_path": queries_path,
+        "mode_settings": {"ragas_settings": {"evaluator_model": evaluator_model}},
+    }
+    if include_name:
+        benchmarking["name"] = name
+
+    return {
+        "single_question_results": {f"q{i}": {} for i in range(n_questions)},
+        "total_results": total_results,
+        "configuration_file": f"/tmp/{name}.yaml",
+        "configuration": {"services": {"benchmarking": benchmarking}},
+    }
+
+
+@pytest.fixture(autouse=True)
+def _reset_result_handler():
+    """ResultHandler holds class-level mutable state; reset around every test."""
+    saved_results = ResultHandler.results
+    saved_leaderboard = ResultHandler.leaderboard
+    ResultHandler.results = []
+    ResultHandler.leaderboard = {}
+    yield
+    ResultHandler.results = saved_results
+    ResultHandler.leaderboard = saved_leaderboard
+
+
+# -- 4.2 one row per config --------------------------------------------------
+
+def test_one_row_per_config():
+    ResultHandler.results = [
+        _make_record("v1-strict", "config/agents/fasrc-cannon-v1-strict.md", faithfulness=0.7),
+        _make_record("v2-lean", "config/agents/fasrc-cannon-v2-lean.md", faithfulness=0.9),
+        _make_record("v3-cited", "config/agents/fasrc-cannon-v3-cited.md", faithfulness=0.8),
+    ]
+    lb = ResultHandler.build_leaderboard()
+    assert len(lb["rows"]) == 3
+    by_name = {r["name"]: r for r in lb["rows"]}
+    assert set(by_name) == {"v1-strict", "v2-lean", "v3-cited"}
+    row = by_name["v2-lean"]
+    assert row["agent_md_file"] == "config/agents/fasrc-cannon-v2-lean.md"
+    assert set(row["metrics"]) == {
+        "answer_relevancy", "faithfulness", "context_precision", "context_recall",
+    }
+    assert row["metrics"]["faithfulness"] == pytest.approx(0.9)
+
+
+# -- 4.3 default + configured ranking ----------------------------------------
+
+def test_default_ranking_by_faithfulness():
+    ResultHandler.results = [
+        _make_record("low", "p/low.md", faithfulness=0.5),
+        _make_record("high", "p/high.md", faithfulness=0.95),
+        _make_record("mid", "p/mid.md", faithfulness=0.7),
+    ]
+    lb = ResultHandler.build_leaderboard()
+    assert lb["primary_metric"] == "faithfulness"
+    assert lb["rows"][0]["name"] == "high"
+    assert lb["rows"][0]["rank"] == 1
+    assert [r["name"] for r in lb["rows"]] == ["high", "mid", "low"]
+
+
+def test_configured_primary_metric_reranks():
+    ResultHandler.results = [
+        _make_record("a", "p/a.md", faithfulness=0.9, answer_relevancy=0.2),
+        _make_record("b", "p/b.md", faithfulness=0.4, answer_relevancy=0.99),
+    ]
+    lb = ResultHandler.build_leaderboard("answer_relevancy")
+    assert lb["primary_metric"] == "answer_relevancy"
+    assert lb["rows"][0]["name"] == "b"
+    # all four metric values still present regardless of primary
+    assert set(lb["rows"][0]["metrics"]) == {
+        "answer_relevancy", "faithfulness", "context_precision", "context_recall",
+    }
+
+
+def test_unknown_primary_metric_falls_back_to_faithfulness():
+    ResultHandler.results = [
+        _make_record("a", "p/a.md", faithfulness=0.6),
+        _make_record("b", "p/b.md", faithfulness=0.8),
+    ]
+    lb = ResultHandler.build_leaderboard("not_a_metric")
+    assert lb["primary_metric"] == "faithfulness"
+    assert lb["rows"][0]["name"] == "b"
+
+
+# -- 4.4 tie handling --------------------------------------------------------
+
+def test_ties_share_a_rank():
+    ResultHandler.results = [
+        _make_record("a", "p/a.md", faithfulness=0.8),
+        _make_record("b", "p/b.md", faithfulness=0.8),
+        _make_record("c", "p/c.md", faithfulness=0.5),
+    ]
+    lb = ResultHandler.build_leaderboard()
+    ranks = {r["name"]: r["rank"] for r in lb["rows"]}
+    assert ranks["a"] == ranks["b"] == 1
+    assert ranks["c"] == 2
+
+
+# -- 4.5 incomplete handling -------------------------------------------------
+
+def test_missing_metric_marks_incomplete_and_sorts_last():
+    ResultHandler.results = [
+        _make_record("complete-low", "p/low.md", faithfulness=0.3),
+        _make_record("missing", "p/missing.md", faithfulness=None),  # key omitted
+    ]
+    lb = ResultHandler.build_leaderboard()
+    by_name = {r["name"]: r for r in lb["rows"]}
+    assert by_name["missing"]["metrics"]["faithfulness"] is None
+    assert by_name["missing"]["incomplete"] is True
+    # complete row ranks ahead of incomplete even though its score (0.3) is low
+    assert lb["rows"][0]["name"] == "complete-low"
+    assert lb["rows"][-1]["name"] == "missing"
+
+
+def test_nan_metric_marks_incomplete():
+    rec = _make_record("nan", "p/nan.md")
+    rec["total_results"]["aggregate_faithfulness"] = float("nan")
+    ResultHandler.results = [
+        _make_record("ok", "p/ok.md", faithfulness=0.6),
+        rec,
+    ]
+    lb = ResultHandler.build_leaderboard()
+    by_name = {r["name"]: r for r in lb["rows"]}
+    assert by_name["nan"]["metrics"]["faithfulness"] is None
+    assert by_name["nan"]["incomplete"] is True
+    assert lb["rows"][-1]["name"] == "nan"
+
+
+# -- 4.6 name fallback -------------------------------------------------------
+
+def test_name_falls_back_to_prompt_stem():
+    ResultHandler.results = [
+        _make_record("ignored", "config/agents/fasrc-cannon-v4-linked.md",
+                     faithfulness=0.7, include_name=False),
+        _make_record("named", "config/agents/fasrc-cannon-v1-strict.md",
+                     faithfulness=0.6),
+    ]
+    lb = ResultHandler.build_leaderboard()
+    names = {r["name"] for r in lb["rows"]}
+    assert "fasrc-cannon-v4-linked" in names  # stem, not config_0
+    assert not any(n.startswith("config_") for n in names)
+
+
+# -- 4.7 shared context ------------------------------------------------------
+
+def test_shared_context_uniform_sweep():
+    ResultHandler.results = [
+        _make_record("a", "p/a.md", faithfulness=0.7),
+        _make_record("b", "p/b.md", faithfulness=0.8),
+    ]
+    lb = ResultHandler.build_leaderboard()
+    ctx = lb["shared_context"]
+    assert ctx["model"] == "Qwen/Qwen3.5-35B-A3B-GPTQ-Int4"
+    assert ctx["provider"] == "openai"
+    assert ctx["evaluator_model"] == "us.anthropic.claude-sonnet-4-5-20250929-v1:0"
+    assert ctx["queries_path"] == "config/benchmarking/queries.json"
+    assert ctx["corpus_snapshot_id"]
+    assert ctx["warnings"] == []
+
+
+def test_shared_context_flags_model_drift():
+    ResultHandler.results = [
+        _make_record("a", "p/a.md", faithfulness=0.7, model="modelA"),
+        _make_record("b", "p/b.md", faithfulness=0.8, model="modelB"),
+    ]
+    lb = ResultHandler.build_leaderboard()
+    ctx = lb["shared_context"]
+    assert ctx["warnings"], "expected a drift warning"
+    assert any("model" in w for w in ctx["warnings"])
+    # rows still emitted despite drift
+    assert len(lb["rows"]) == 2

From 1699a16624b5b2d8fd8ee09a37c2bf538647033c Mon Sep 17 00:00:00 2001
From: Austin Swinney <aswinney@harvard.edu>
Date: Tue, 9 Jun 2026 22:27:09 -0400
Subject: [PATCH 2/7] fix(cli): make archi evaluate --config-dir run
 multi-config benchmark sweeps

Four plumbing fixes so a prompt sweep actually runs end-to-end, each surfaced by
a live --config-dir run:
- config_manager: exempt services.benchmarking from the cross-config consistency
  check (it is the sweep axis); global + other services.* stay strict.
- templates_manager: render a distinct file per config in multi-config mode
  (configs sharing a top-level name no longer collide onto one file).
- config_seed: fall back to the first *.yaml when config.yaml is absent, so the
  chatbot-oriented seed step no longer aborts a benchmark deployment.
- templates_manager: stage every config's agent_md_file, not just the first, so
  all sweep variants are available in the container.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/cli/managers/config_manager.py            | 17 +++-
 src/cli/managers/templates_manager.py         | 72 ++++++++++++----
 src/cli/tools/config_seed.py                  | 23 ++++++
 ...t_config_manager_benchmarking_variation.py | 82 +++++++++++++++++++
 tests/unit/test_config_seed_resolve.py        | 40 +++++++++
 tests/unit/test_render_config_filename.py     | 48 +++++++++++
 tests/unit/test_stage_agents_multiconfig.py   | 57 +++++++++++++
 7 files changed, 321 insertions(+), 18 deletions(-)
 create mode 100644 tests/unit/test_config_manager_benchmarking_variation.py
 create mode 100644 tests/unit/test_config_seed_resolve.py
 create mode 100644 tests/unit/test_render_config_filename.py
 create mode 100644 tests/unit/test_stage_agents_multiconfig.py

diff --git a/src/cli/managers/config_manager.py b/src/cli/managers/config_manager.py
index 7ed9f940..15619b6a 100644
--- a/src/cli/managers/config_manager.py
+++ b/src/cli/managers/config_manager.py
@@ -13,6 +13,21 @@
 
 STATIC_FIELDS = ['global', 'services']
 
+
+def _comparable_static_field(static_field: str, value: Any) -> Any:
+    """Return the value used for cross-config consistency comparison.
+
+    `services.benchmarking` is the sweep axis for an ``archi evaluate
+    --config-dir`` run (the prompt under test and its name vary per config),
+    so it is exempted from the equality check. ``global`` and every other
+    ``services.*`` subsection are still compared as-is. ``archi create`` does
+    not consume ``services.benchmarking``, so this exemption cannot affect a
+    created deployment.
+    """
+    if static_field == 'services' and isinstance(value, dict):
+        return {k: v for k, v in value.items() if k != 'benchmarking'}
+    return value
+
 class ConfigurationManager:
     """Manages archi configuration loading and validation"""
     
@@ -64,7 +79,7 @@ def _append(self,config):
                     if not static_field in config.keys():
                         raise ValueError(f"The field {static_field} must be present in all configurations.")
 
-                    if previous_config[static_field] != config[static_field]:
+                    if _comparable_static_field(static_field, previous_config[static_field]) != _comparable_static_field(static_field, config[static_field]):
                         raise ValueError(f"The field {static_field} must be consistent across all configurations.")
 
             self.configs.append(config)
diff --git a/src/cli/managers/templates_manager.py b/src/cli/managers/templates_manager.py
index cf40c429..e3aa0583 100644
--- a/src/cli/managers/templates_manager.py
+++ b/src/cli/managers/templates_manager.py
@@ -17,6 +17,32 @@
 logger = get_logger(__name__)
 
 
+def _render_config_target_name(
+    single_mode: bool,
+    top_level_name: str,
+    benchmarking_name: Optional[str],
+    index: int,
+    used_names: set,
+) -> str:
+    """Pick the rendered-config filename for one config.
+
+    A single-config deployment renders to ``config.yaml`` (the path config-seed
+    and the chatbot expect). A multi-config run (e.g. a benchmarking sweep)
+    renders one distinct file per config so the benchmarker iterates every
+    variant instead of overwriting a single file. The per-variant
+    ``services.benchmarking.name`` is preferred for readability, falling back to
+    the top-level ``name``; collisions are disambiguated with the config index.
+    """
+    if single_mode:
+        return "config.yaml"
+    stem = str(benchmarking_name or top_level_name)
+    candidate = f"{stem}.yaml"
+    if candidate in used_names:
+        candidate = f"{stem}_{index}.yaml"
+    used_names.add(candidate)
+    return candidate
+
+
 # Template file constants
 BASE_CONFIG_TEMPLATE = "base-config.yaml"
 BASE_COMPOSE_TEMPLATE = "base-compose.yaml"
@@ -161,22 +187,26 @@ def _stage_agents(self, context: TemplateContext) -> None:
         services_cfg = config.get("services", {}) or {}
 
         if context.benchmarking:
-            benchmark_cfg = services_cfg.get("benchmarking", {}) or {}
-            agent_md_file = benchmark_cfg.get("agent_md_file")
-            if not agent_md_file:
-                raise ValueError("Missing required services.benchmarking.agent_md_file in config.")
-            source_path = Path(str(agent_md_file)).expanduser()
-            config_path = Path(str(config.get("_config_path", ""))).expanduser()
-            if not source_path.is_absolute() and config_path:
-                candidate = (config_path.parent / source_path).resolve()
-                if candidate.exists():
-                    source_path = candidate
-            if not source_path.exists() or not source_path.is_file():
-                raise ValueError(f"Benchmark agent file not found: {source_path}")
-            if source_path.suffix.lower() != ".md":
-                raise ValueError(f"Benchmark agent file must be a .md file: {source_path}")
+            # A multi-config sweep names a distinct agent_md_file per config; stage
+            # every one so the benchmarker can load each variant (not just the first).
             dst_dir.mkdir(parents=True, exist_ok=True)
-            shutil.copyfile(source_path, dst_dir / source_path.name)
+            for bench_config in context.config_manager.get_configs():
+                bench_services = bench_config.get("services", {}) or {}
+                benchmark_cfg = bench_services.get("benchmarking", {}) or {}
+                agent_md_file = benchmark_cfg.get("agent_md_file")
+                if not agent_md_file:
+                    raise ValueError("Missing required services.benchmarking.agent_md_file in config.")
+                source_path = Path(str(agent_md_file)).expanduser()
+                config_path = Path(str(bench_config.get("_config_path", ""))).expanduser()
+                if not source_path.is_absolute() and config_path:
+                    candidate = (config_path.parent / source_path).resolve()
+                    if candidate.exists():
+                        source_path = candidate
+                if not source_path.exists() or not source_path.is_file():
+                    raise ValueError(f"Benchmark agent file not found: {source_path}")
+                if source_path.suffix.lower() != ".md":
+                    raise ValueError(f"Benchmark agent file must be a .md file: {source_path}")
+                shutil.copyfile(source_path, dst_dir / source_path.name)
             return
 
         agents_dir = (services_cfg.get("chat_app") or {}).get("agents_dir")
@@ -329,7 +359,8 @@ def _render_config_files(self, context: TemplateContext) -> None:
 
         archi_configs = context.config_manager.get_configs()
         single_mode = len(archi_configs) == 1
-        for archi_config in archi_configs:
+        used_names: set = set()
+        for index, archi_config in enumerate(archi_configs):
             name = archi_config["name"]
             updated_config = copy.deepcopy(archi_config)
 
@@ -354,7 +385,14 @@ def _render_config_files(self, context: TemplateContext) -> None:
             config_template = self.env.get_template(BASE_CONFIG_TEMPLATE)
             config_rendered = config_template.render(verbosity=context.plan.verbosity, **updated_config)
 
-            target_name = "config.yaml" if single_mode else f"{name}.yaml"
+            benchmarking_name = None
+            if context.benchmarking:
+                benchmark_cfg = services_cfg.get("benchmarking")
+                if isinstance(benchmark_cfg, dict):
+                    benchmarking_name = benchmark_cfg.get("name")
+            target_name = _render_config_target_name(
+                single_mode, name, benchmarking_name, index, used_names
+            )
             with open(configs_path / target_name, "w") as f:
                 f.write(config_rendered)
             logger.info(f"Rendered configuration file {configs_path / target_name}")
diff --git a/src/cli/tools/config_seed.py b/src/cli/tools/config_seed.py
index 81b2e29a..8dc60e28 100644
--- a/src/cli/tools/config_seed.py
+++ b/src/cli/tools/config_seed.py
@@ -12,6 +12,7 @@
 Exits 0 on success, non-zero on failure.
 """
 
+import glob
 import os
 import sys
 import yaml
@@ -25,6 +26,27 @@ def load_config(path: str):
         return yaml.safe_load(f)
 
 
+def resolve_config_path(config_path: str) -> str:
+    """Resolve the config file to seed Postgres from.
+
+    A single-config deployment renders ``config.yaml`` and this returns it
+    unchanged. A multi-config benchmarking deployment renders per-variant files
+    (e.g. ``fasrc-cannon-v1-strict.yaml``) instead, so the hardcoded
+    ``config.yaml`` is absent — fall back to the first ``*.yaml`` in the
+    rendered-config directory rather than aborting the whole deployment.
+    Seeding from any one config is harmless: the benchmarker reads the YAML
+    files directly and never consumes the seeded static_config. If nothing is
+    found, return the original path so ``load_config`` raises a clear error.
+    """
+    if os.path.isfile(config_path):
+        return config_path
+    directory = config_path if os.path.isdir(config_path) else (os.path.dirname(config_path) or ".")
+    candidates = sorted(glob.glob(os.path.join(directory, "*.yaml")))
+    if candidates:
+        return candidates[0]
+    return config_path
+
+
 def seed(config: dict, cs: ConfigService):
     print("[config-seed] Starting seed with config keys:", list(config.keys()))
     dm = config.get("data_manager", {})
@@ -91,6 +113,7 @@ def main():
 
 
 def seed_entry(config_path: str, env: dict):
+    config_path = resolve_config_path(config_path)
     print(f"[config-seed] Loading config from {config_path}")
     config = load_config(config_path)
     factory = PostgresServiceFactory.from_env(password_override=env.get("PGPASSWORD") or env.get("PG_PASSWORD"))
diff --git a/tests/unit/test_config_manager_benchmarking_variation.py b/tests/unit/test_config_manager_benchmarking_variation.py
new file mode 100644
index 00000000..dfd1f75b
--- /dev/null
+++ b/tests/unit/test_config_manager_benchmarking_variation.py
@@ -0,0 +1,82 @@
+"""Unit tests for the multi-config loader's cross-config consistency check.
+
+A `--config-dir` benchmarking run sweeps prompt variants that differ only in
+`services.benchmarking` (the prompt under test + its name). The loader must
+accept that variation while still rejecting drift in `global` or in any other
+`services.*` subsection (e.g. `services.chat_app`, which carries the SUT).
+"""
+
+import copy
+
+import pytest
+
+from src.cli.managers.config_manager import ConfigurationManager
+
+
+def _manager():
+    """A ConfigurationManager shell without running its file-loading __init__."""
+    mgr = object.__new__(ConfigurationManager)
+    mgr.configs = []
+    return mgr
+
+
+def _base_config():
+    return {
+        "name": "ragas-bench",
+        "global": {"DATA_PATH": "/root/data/"},
+        "services": {
+            "chat_app": {"default_provider": "openai", "default_model": "qwen"},
+            "benchmarking": {
+                "agent_md_file": "config/agents/archive/fasrc-cannon-v1-strict.md",
+                "name": "fasrc-cannon-v1-strict",
+                "model": "qwen",
+            },
+        },
+    }
+
+
+def test_configs_differing_only_in_benchmarking_load():
+    mgr = _manager()
+    base = _base_config()
+    variant = copy.deepcopy(base)
+    variant["services"]["benchmarking"]["agent_md_file"] = (
+        "config/agents/archive/fasrc-cannon-v2-lean.md"
+    )
+    variant["services"]["benchmarking"]["name"] = "fasrc-cannon-v2-lean"
+
+    mgr._append(base)
+    mgr._append(variant)
+
+    assert len(mgr.configs) == 2
+
+
+def test_differing_global_still_rejected():
+    mgr = _manager()
+    base = _base_config()
+    variant = copy.deepcopy(base)
+    variant["global"]["DATA_PATH"] = "/somewhere/else/"
+
+    mgr._append(base)
+    with pytest.raises(ValueError, match="must be consistent across all configurations"):
+        mgr._append(variant)
+
+
+def test_differing_non_benchmarking_service_still_rejected():
+    mgr = _manager()
+    base = _base_config()
+    variant = copy.deepcopy(base)
+    # Drift in services.chat_app (the SUT) must still be rejected.
+    variant["services"]["chat_app"]["default_model"] = "different-model"
+
+    mgr._append(base)
+    with pytest.raises(ValueError, match="must be consistent across all configurations"):
+        mgr._append(variant)
+
+
+def test_identical_configs_still_load():
+    """Regression guard: the narrowed comparison must not change the equal case."""
+    mgr = _manager()
+    base = _base_config()
+    mgr._append(base)
+    mgr._append(copy.deepcopy(base))
+    assert len(mgr.configs) == 2
diff --git a/tests/unit/test_config_seed_resolve.py b/tests/unit/test_config_seed_resolve.py
new file mode 100644
index 00000000..b79e1644
--- /dev/null
+++ b/tests/unit/test_config_seed_resolve.py
@@ -0,0 +1,40 @@
+"""Unit tests for config-seed's config-path resolution.
+
+A multi-config benchmarking deployment renders per-variant files (e.g.
+`fasrc-cannon-v1-strict.yaml`) instead of a single `config.yaml`, so config-seed
+must fall back to the first `*.yaml` in the rendered-config directory rather than
+aborting the whole deployment with FileNotFoundError. (Seeding Postgres from any
+one config is harmless — the benchmarker reads the YAML files directly and never
+consumes the seeded static_config.)
+"""
+
+from src.cli.tools.config_seed import resolve_config_path
+
+
+def test_existing_file_returned_as_is(tmp_path):
+    cfg = tmp_path / "config.yaml"
+    cfg.write_text("name: x\n")
+    assert resolve_config_path(str(cfg)) == str(cfg)
+
+
+def test_missing_config_yaml_falls_back_to_first_yaml(tmp_path):
+    (tmp_path / "fasrc-cannon-v2-lean.yaml").write_text("name: b\n")
+    (tmp_path / "fasrc-cannon-v1-strict.yaml").write_text("name: a\n")
+    missing = tmp_path / "config.yaml"  # does not exist
+
+    resolved = resolve_config_path(str(missing))
+
+    # first in sorted order
+    assert resolved == str(tmp_path / "fasrc-cannon-v1-strict.yaml")
+
+
+def test_directory_path_resolves_to_first_yaml(tmp_path):
+    (tmp_path / "b.yaml").write_text("name: b\n")
+    (tmp_path / "a.yaml").write_text("name: a\n")
+    assert resolve_config_path(str(tmp_path)) == str(tmp_path / "a.yaml")
+
+
+def test_no_yaml_returns_original_path(tmp_path):
+    missing = tmp_path / "config.yaml"
+    # empty dir, nothing to fall back to -> return original so load raises clearly
+    assert resolve_config_path(str(missing)) == str(missing)
diff --git a/tests/unit/test_render_config_filename.py b/tests/unit/test_render_config_filename.py
new file mode 100644
index 00000000..715e2e32
--- /dev/null
+++ b/tests/unit/test_render_config_filename.py
@@ -0,0 +1,48 @@
+"""Unit tests for rendered-config filename derivation in the deployment manager.
+
+A multi-config `archi evaluate --config-dir` run must render one distinct file
+per config so the benchmarker iterates every variant. Previously all configs
+collided onto `{top_level_name}.yaml` (e.g. three `ragas-bench` sweep variants
+all overwrote `ragas-bench.yaml`), so only the last survived.
+"""
+
+from src.cli.managers.templates_manager import _render_config_target_name
+
+
+def test_single_mode_is_config_yaml():
+    assert _render_config_target_name(True, "ragas-bench", None, 0, set()) == "config.yaml"
+
+
+def test_multi_uses_benchmarking_name():
+    used = set()
+    assert (
+        _render_config_target_name(False, "ragas-bench", "fasrc-cannon-v1-strict", 0, used)
+        == "fasrc-cannon-v1-strict.yaml"
+    )
+
+
+def test_multi_distinct_files_for_distinct_variants():
+    used = set()
+    names = [
+        _render_config_target_name(False, "ragas-bench", "v1", 0, used),
+        _render_config_target_name(False, "ragas-bench", "v2", 1, used),
+        _render_config_target_name(False, "ragas-bench", "v3", 2, used),
+    ]
+    assert names == ["v1.yaml", "v2.yaml", "v3.yaml"]
+    assert len(set(names)) == 3
+
+
+def test_collision_disambiguated_by_index():
+    used = set()
+    first = _render_config_target_name(False, "ragas-bench", "dup", 0, used)
+    second = _render_config_target_name(False, "ragas-bench", "dup", 1, used)
+    assert first == "dup.yaml"
+    assert second == "dup_1.yaml"
+    assert first != second
+
+
+def test_multi_falls_back_to_top_level_name_when_no_benchmarking_name():
+    used = set()
+    assert (
+        _render_config_target_name(False, "ragas-bench", None, 0, used) == "ragas-bench.yaml"
+    )
diff --git a/tests/unit/test_stage_agents_multiconfig.py b/tests/unit/test_stage_agents_multiconfig.py
new file mode 100644
index 00000000..1adbb7e3
--- /dev/null
+++ b/tests/unit/test_stage_agents_multiconfig.py
@@ -0,0 +1,57 @@
+"""Unit test: benchmarking sweeps must stage every config's agent .md file.
+
+In a multi-config `archi evaluate --config-dir` run, each config names its own
+`services.benchmarking.agent_md_file` (the prompt variant under test). The
+deployment must copy ALL of them into data/agents/ so the benchmarker can load
+each variant; previously it staged only the first config's agent, so later
+variants crashed with FileNotFoundError inside the container.
+"""
+
+from pathlib import Path
+from types import SimpleNamespace
+
+from src.cli.managers.templates_manager import TemplateManager
+
+
+def _config(tmp_path: Path, agent_rel: str):
+    cfg = {
+        "name": "ragas-bench",
+        "services": {"benchmarking": {"agent_md_file": agent_rel}},
+        "_config_path": str(tmp_path / "sweep" / "variant.yaml"),
+    }
+    return cfg
+
+
+def test_stages_all_configs_agent_files(tmp_path, monkeypatch):
+    # three prompt variants on disk (cwd-relative, like config/agents/archive/*.md)
+    agents_src = tmp_path / "agents"
+    agents_src.mkdir()
+    names = ["v1-strict.md", "v2-lean.md", "v3-cited.md"]
+    for n in names:
+        (agents_src / n).write_text(f"# {n}\n")
+
+    # run from tmp_path so the cwd-relative agent_md_file resolves
+    monkeypatch.chdir(tmp_path)
+    rels = [f"agents/{n}" for n in names]
+    configs = [_config(tmp_path, r) for r in rels]
+
+    base_dir = tmp_path / "deploy"
+    base_dir.mkdir()
+
+    class _CM:
+        config = configs[0]
+
+        def get_configs(self):
+            return configs
+
+    context = SimpleNamespace(
+        config_manager=_CM(),
+        base_dir=base_dir,
+        benchmarking=True,
+    )
+
+    mgr = object.__new__(TemplateManager)
+    mgr._stage_agents(context)
+
+    staged = sorted(p.name for p in (base_dir / "data" / "agents").iterdir())
+    assert staged == sorted(names)

From 6ffd7c8f54bbf345df77caf9e8682d97330bc9ef Mon Sep 17 00:00:00 2001
From: Austin Swinney <aswinney@harvard.edu>
Date: Tue, 9 Jun 2026 22:27:17 -0400
Subject: [PATCH 3/7] feat(agent): enforce initial vector retrieval in
 CMSCompOpsAgent

The model can ignore an 'always search first' prompt and answer from its own
weights, leaving source_documents empty (chat UI shows 'Link unavailable').
_inject_forced_retrieval prefills a completed search_vectorstore_hybrid tool
round before the model's first turn, so retrieval always happens and its
store_docs callback populates the source links. The model may still search
again. Gated by services.chat_app.force_initial_retrieval (default on).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/archi/pipelines/agents/base_react.py      |  13 +-
 .../pipelines/agents/cms_comp_ops_agent.py    |  64 ++++++++++
 tests/unit/test_forced_retrieval.py           | 113 ++++++++++++++++++
 3 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit/test_forced_retrieval.py

diff --git a/src/archi/pipelines/agents/base_react.py b/src/archi/pipelines/agents/base_react.py
index 76ae3470..34553d6f 100644
--- a/src/archi/pipelines/agents/base_react.py
+++ b/src/archi/pipelines/agents/base_react.py
@@ -1176,7 +1176,7 @@ def _prepare_agent_inputs(self, **kwargs) -> Dict[str, Any]:
                     "Invalid context window (%s), skipping trimming.",
                     context_window,
                     )
-                    return {"messages": history_messages}
+                    return {"messages": self._inject_forced_retrieval(history_messages)}
 
                 safety_margin = int(context_window * 0.15)
                 max_prompt_tokens = context_window - safety_margin
@@ -1220,7 +1220,16 @@ def _prepare_agent_inputs(self, **kwargs) -> Dict[str, Any]:
         except Exception as e:
             logger.debug("Token trimming skipped: %s", e)
 
-        return {"messages": history_messages}
+        return {"messages": self._inject_forced_retrieval(history_messages)}
+
+    def _inject_forced_retrieval(self, messages: List[BaseMessage]) -> List[BaseMessage]:
+        """Hook for subclasses to force a retrieval before the model's first turn.
+
+        Base implementation is a no-op; agents with a vector retriever override
+        this to prefill a completed ``search_vectorstore_hybrid`` tool round so
+        retrieval happens regardless of whether the model chooses to call it.
+        """
+        return messages
 
     def _metadata_from_agent_output(self, answer_output: Dict[str, Any]) -> Dict[str, Any]:
         """Hook for subclasses to enrich metadata returned to callers."""
diff --git a/src/archi/pipelines/agents/cms_comp_ops_agent.py b/src/archi/pipelines/agents/cms_comp_ops_agent.py
index 48f4385a..a9b9fef6 100644
--- a/src/archi/pipelines/agents/cms_comp_ops_agent.py
+++ b/src/archi/pipelines/agents/cms_comp_ops_agent.py
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
 import json
+import uuid
 from typing import Any, Callable, Dict, List
 
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, ToolMessage
+
 from src.utils.logging import get_logger
 from src.utils.env import read_secret
 from src.archi.pipelines.agents.base_react import BaseReActAgent
@@ -313,3 +316,64 @@ def _update_vector_retrievers(self, vectorstore: Any) -> None:
                 store_tool_input=getattr(self, "_store_tool_input", None),
             )
         )
+
+    def _inject_forced_retrieval(self, messages: List[BaseMessage]) -> List[BaseMessage]:
+        """Force one ``search_vectorstore_hybrid`` call before the model answers.
+
+        The model can ignore an "always search first" prompt and answer from its
+        own weights, leaving ``source_documents`` empty (chat UI shows "Link
+        unavailable"). To enforce retrieval, prefill a completed tool round —
+        an ``AIMessage`` carrying the tool call plus the matching ``ToolMessage``
+        result — so the ReAct loop starts with real chunks already in context.
+        Invoking the existing retriever tool also runs its ``store_docs``
+        callback, so retrieved documents flow into ``source_documents``/links
+        exactly as a model-initiated search would. The model may still search
+        again. Gated by ``services.chat_app.force_initial_retrieval`` (default
+        on) so prompt-vs-enforcement variants can be A/B'd in the sweep.
+        """
+        if not getattr(self, "enable_vector_tools", False):
+            logger.debug("Forced retrieval skipped: vector tools disabled")
+            return messages
+        if not self._chat_app_config.get("force_initial_retrieval", True):
+            logger.debug("Forced retrieval skipped: force_initial_retrieval=false")
+            return messages
+        tools = getattr(self, "_vector_tools", None)
+        if not tools:
+            logger.debug("Forced retrieval skipped: no vector tools built")
+            return messages
+        # Only force on a fresh user turn (the latest message is the question).
+        if not messages or not isinstance(messages[-1], HumanMessage):
+            last_type = type(messages[-1]).__name__ if messages else "none"
+            logger.debug("Forced retrieval skipped: last message is %s", last_type)
+            return messages
+        query = (self._message_content(messages[-1]) or "").strip()
+        if not query:
+            logger.debug("Forced retrieval skipped: empty query")
+            return messages
+
+        try:
+            result = tools[0].invoke({"query": query})
+        except Exception:
+            # Fail open: a retrieval error must not break the chat turn.
+            logger.warning("Forced initial retrieval failed", exc_info=True)
+            return messages
+
+        logger.info("Forced initial retrieval ran: query=%r -> %d chars", query, len(str(result)))
+
+        call_id = f"forced_search_{uuid.uuid4().hex}"
+        ai = AIMessage(
+            content="",
+            tool_calls=[
+                {
+                    "name": "search_vectorstore_hybrid",
+                    "args": {"query": query},
+                    "id": call_id,
+                }
+            ],
+        )
+        tool_msg = ToolMessage(
+            content=result if isinstance(result, str) else str(result),
+            tool_call_id=call_id,
+            name="search_vectorstore_hybrid",
+        )
+        return list(messages) + [ai, tool_msg]
diff --git a/tests/unit/test_forced_retrieval.py b/tests/unit/test_forced_retrieval.py
new file mode 100644
index 00000000..2b31c370
--- /dev/null
+++ b/tests/unit/test_forced_retrieval.py
@@ -0,0 +1,113 @@
+"""Unit tests for enforced (mandatory) vector retrieval in CMSCompOpsAgent.
+
+The chat model can ignore a "always search first" system prompt and answer
+from its own weights, leaving ``source_documents`` empty (and the chat UI
+showing "Link unavailable"). To *enforce* retrieval, the agent prefills a
+completed tool round — an ``AIMessage`` carrying a ``search_vectorstore_hybrid``
+tool call plus the matching ``ToolMessage`` result — before the model's first
+turn, so the retrieval happens regardless of the model's choice.
+"""
+
+from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+
+from src.archi.pipelines.agents.cms_comp_ops_agent import CMSCompOpsAgent
+
+
+class _FakeTool:
+    """Stand-in for the real retriever StructuredTool."""
+
+    def __init__(self, result: str = "[1] GLOBUS doc snippet"):
+        self.result = result
+        self.calls = []
+
+    def invoke(self, payload):
+        self.calls.append(payload)
+        return self.result
+
+
+def _make_agent(*, enabled=True, force=True, tools=None):
+    """Build a CMSCompOpsAgent shell without running its heavy __init__."""
+    agent = object.__new__(CMSCompOpsAgent)
+    agent.enable_vector_tools = enabled
+    agent._vector_tools = tools
+    agent.config = {"services": {"chat_app": {"force_initial_retrieval": force}}}
+    return agent
+
+
+def test_injects_forced_tool_round():
+    fake = _FakeTool()
+    agent = _make_agent(tools=[fake])
+
+    out = agent._inject_forced_retrieval([HumanMessage("What is GLOBUS for?")])
+
+    # original human message + prefilled (AIMessage tool_call, ToolMessage)
+    assert len(out) == 3
+    assert isinstance(out[0], HumanMessage)
+
+    ai = out[1]
+    assert isinstance(ai, AIMessage)
+    assert len(ai.tool_calls) == 1
+    assert ai.tool_calls[0]["name"] == "search_vectorstore_hybrid"
+    assert ai.tool_calls[0]["args"] == {"query": "What is GLOBUS for?"}
+
+    tool_msg = out[2]
+    assert isinstance(tool_msg, ToolMessage)
+    assert tool_msg.content == "[1] GLOBUS doc snippet"
+    assert tool_msg.tool_call_id == ai.tool_calls[0]["id"]
+
+    # the real retriever tool was actually invoked with the user's query
+    assert fake.calls == [{"query": "What is GLOBUS for?"}]
+
+
+def test_disabled_by_flag_is_noop():
+    fake = _FakeTool()
+    agent = _make_agent(tools=[fake], force=False)
+
+    msgs = [HumanMessage("What is GLOBUS for?")]
+    out = agent._inject_forced_retrieval(msgs)
+
+    assert out == msgs
+    assert fake.calls == []
+
+
+def test_no_vector_tools_is_noop():
+    agent = _make_agent(enabled=False, tools=None)
+    msgs = [HumanMessage("What is GLOBUS for?")]
+    assert agent._inject_forced_retrieval(msgs) == msgs
+
+
+def test_only_injects_on_fresh_human_turn():
+    """If the turn does not end with a human message, do not inject."""
+    fake = _FakeTool()
+    agent = _make_agent(tools=[fake])
+
+    msgs = [HumanMessage("hi"), AIMessage("hello")]
+    out = agent._inject_forced_retrieval(msgs)
+
+    assert out == msgs
+    assert fake.calls == []
+
+
+def test_empty_query_is_noop():
+    fake = _FakeTool()
+    agent = _make_agent(tools=[fake])
+
+    msgs = [HumanMessage("   ")]
+    out = agent._inject_forced_retrieval(msgs)
+
+    assert out == msgs
+    assert fake.calls == []
+
+
+def test_retriever_failure_fails_open():
+    """A retrieval error must not break the chat turn."""
+
+    class _BoomTool:
+        def invoke(self, payload):
+            raise RuntimeError("vectorstore down")
+
+    agent = _make_agent(tools=[_BoomTool()])
+    msgs = [HumanMessage("What is GLOBUS for?")]
+    out = agent._inject_forced_retrieval(msgs)
+
+    assert out == msgs

From 2ba3f0974cd19cfa9f7cde74c6268355c71dac7d Mon Sep 17 00:00:00 2001
From: Austin Swinney <aswinney@harvard.edu>
Date: Wed, 10 Jun 2026 10:12:06 -0400
Subject: [PATCH 4/7] fix(benchmark): report per-metric scored sample size on
 the leaderboard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

aggregate_* is a pandas .mean() that skips NaN, so a RAGAS judge timeout on a
question silently shrinks that metric's sample without making the aggregate
NaN — a mean over 4 of 9 answered questions looked fully-backed. Add
scored_counts{metric: n} (non-NaN per-question count) to each leaderboard row,
keep query_count as the answered count, log a warning for under-sampled
metrics, and annotate the log table cells with @<n> when n < answered.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/bin/service_benchmark.py                | 66 +++++++++++++++++----
 tests/unit/test_prompt_sweep_leaderboard.py | 29 +++++++++
 2 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/src/bin/service_benchmark.py b/src/bin/service_benchmark.py
index 1588cb4d..3af35f3b 100644
--- a/src/bin/service_benchmark.py
+++ b/src/bin/service_benchmark.py
@@ -334,9 +334,13 @@ def build_leaderboard(primary_metric: str = "faithfulness") -> Dict[str, Any]:
         it never touches pair_ab_results/ab_comparisons.
 
         Each row: {name, agent_md_file, metrics{...}, primary_score, rank,
-        incomplete, query_count}. A metric is None (and the row `incomplete`)
-        when its aggregate key is absent or NaN — never silently zeroed.
-        Incomplete rows always sort after complete ones. Ties share a rank.
+        incomplete, query_count, scored_counts{...}}. A metric is None (and the
+        row `incomplete`) when its aggregate key is absent or NaN — never
+        silently zeroed. Incomplete rows always sort after complete ones. Ties
+        share a rank. `query_count` is the number of questions answered;
+        `scored_counts[metric]` is how many non-NaN per-question scores actually
+        backed that metric's mean (a judge timeout shrinks the sample without
+        making the aggregate NaN).
 
         shared_context records the run context common to all variants and
         flags any drift (a hand-edited config that breaks apples-to-apples).
@@ -381,12 +385,44 @@ def _benchmarking(record: Dict[str, Any]) -> Dict[str, Any]:
                 else:
                     metrics[metric_name] = float(value)
 
+            # Per-metric sample size actually behind each mean. The RAGAS block
+            # computes aggregate_* via pandas .mean(), which skips NaN, so a
+            # judge timeout on one question silently shrinks the sample for that
+            # metric without making the aggregate NaN. Count the non-NaN
+            # per-question scores so the leaderboard can show, e.g., a
+            # faithfulness mean taken over 4 of 9 answered questions instead of
+            # implying all 9 backed it. query_count is the answered count.
+            single_question_results = record.get("single_question_results") or {}
+            scored_counts: Dict[str, int] = {}
+            for metric_name, _agg_key in ResultHandler.LEADERBOARD_METRICS:
+                count = 0
+                for q in single_question_results.values():
+                    if not isinstance(q, dict):
+                        continue
+                    v = q.get(metric_name)
+                    if v is not None and not (isinstance(v, float) and math.isnan(v)):
+                        count += 1
+                scored_counts[metric_name] = count
+
             if incomplete:
                 logger.warning(
                     "Leaderboard: variant '%s' (%s) is incomplete — missing/NaN metrics: %s",
                     name, agent_md_file,
                     [m for m in metric_names if metrics[m] is None],
                 )
+            # Surface under-sampling even when the aggregate is a valid float.
+            answered = len(single_question_results)
+            undersampled = [
+                f"{m}={scored_counts[m]}/{answered}"
+                for m in metric_names
+                if metrics[m] is not None and scored_counts[m] < answered
+            ]
+            if undersampled:
+                logger.warning(
+                    "Leaderboard: variant '%s' (%s) has under-sampled metrics "
+                    "(mean over fewer than %d answered questions): %s",
+                    name, agent_md_file, answered, undersampled,
+                )
 
             rows.append({
                 "name": name,
@@ -394,7 +430,8 @@ def _benchmarking(record: Dict[str, Any]) -> Dict[str, Any]:
                 "metrics": metrics,
                 "primary_score": metrics[primary_metric],
                 "incomplete": incomplete,
-                "query_count": len(record.get("single_question_results") or {}),
+                "query_count": answered,
+                "scored_counts": scored_counts,
             })
 
             ragas_settings = (bench.get("mode_settings", {}) or {}).get("ragas_settings", {}) or {}
@@ -1032,15 +1069,24 @@ def run(self):
             )
             for row in leaderboard["rows"]:
                 m = row["metrics"]
-                def _fmt(v: Optional[float]) -> str:
-                    return f"{v:.4f}" if isinstance(v, float) else "  n/a"
+                answered = row["query_count"]
+                scored = row.get("scored_counts", {})
+                # Annotate a metric with @<n> when its mean is over fewer than
+                # the answered questions (judge timeouts), so an under-sampled
+                # score can't masquerade as fully-backed.
+                def _fmt(metric_name: str) -> str:
+                    v = m[metric_name]
+                    if not isinstance(v, float):
+                        return "    n/a"
+                    n = scored.get(metric_name, answered)
+                    return f"{v:.4f}@{n}" if n < answered else f"{v:.4f}"
                 flag = "  (incomplete)" if row["incomplete"] else ""
                 logger.info(
-                    "  %-4d %-28s %-10s %-10s %-10s %-10s %-10d %s%s",
+                    "  %-4d %-28s %-12s %-12s %-12s %-12s %-10d %s%s",
                     row["rank"], row["name"][:28],
-                    _fmt(m["answer_relevancy"]), _fmt(m["faithfulness"]),
-                    _fmt(m["context_precision"]), _fmt(m["context_recall"]),
-                    row["query_count"], row["agent_md_file"], flag,
+                    _fmt("answer_relevancy"), _fmt("faithfulness"),
+                    _fmt("context_precision"), _fmt("context_recall"),
+                    answered, row["agent_md_file"], flag,
                 )
 
         # Push to Argilla when ARCHI_ARGILLA=1 in the benchmarks container env.
diff --git a/tests/unit/test_prompt_sweep_leaderboard.py b/tests/unit/test_prompt_sweep_leaderboard.py
index a805f75f..400b4d4f 100644
--- a/tests/unit/test_prompt_sweep_leaderboard.py
+++ b/tests/unit/test_prompt_sweep_leaderboard.py
@@ -173,6 +173,35 @@ def test_nan_metric_marks_incomplete():
     assert lb["rows"][-1]["name"] == "nan"
 
 
+# -- scored_counts: per-metric sample size behind each mean ------------------
+
+def test_scored_counts_reflect_non_nan_per_question():
+    """A judge timeout shrinks a metric's sample without making the aggregate
+    NaN; scored_counts must report the real (non-NaN) per-metric count while
+    query_count stays the answered count."""
+    rec = _make_record("v", "p/v.md", faithfulness=0.6, context_precision=0.5, n_questions=3)
+    qs = list(rec["single_question_results"].values())
+    for q in qs:  # all three fully scored on three metrics
+        q["faithfulness"] = 0.6
+        q["answer_relevancy"] = 0.8
+        q["context_recall"] = 0.7
+    # context_precision scored on only the first question; other two timed out
+    qs[0]["context_precision"] = 0.5
+    qs[1]["context_precision"] = float("nan")
+    qs[2]["context_precision"] = float("nan")
+
+    ResultHandler.results = [rec]
+    row = ResultHandler.build_leaderboard()["rows"][0]
+
+    assert row["query_count"] == 3  # answered
+    assert row["scored_counts"]["faithfulness"] == 3
+    assert row["scored_counts"]["answer_relevancy"] == 3
+    assert row["scored_counts"]["context_recall"] == 3
+    assert row["scored_counts"]["context_precision"] == 1  # 2 timed out
+    # the aggregate is still a valid float, so the row is NOT marked incomplete
+    assert row["incomplete"] is False
+
+
 # -- 4.6 name fallback -------------------------------------------------------
 
 def test_name_falls_back_to_prompt_stem():

From 7f9b419b5050fafbe95e80bcb45118a992a18217 Mon Sep 17 00:00:00 2001
From: Austin Swinney <aswinney@harvard.edu>
Date: Wed, 10 Jun 2026 14:04:14 -0400
Subject: [PATCH 5/7] fix(benchmark): lazy-import datasets/ragas so unit tests
 collect without them

The leaderboard unit tests import service_benchmark, whose top-level
'from datasets import Dataset' and 'from ragas import ...' pulled heavy
benchmark-only deps absent from the lean unit-test CI env, breaking collection
(ModuleNotFoundError: datasets). Move those imports into the two methods that
use them (get_ragas_results, run) so the module imports for its pure helpers
(ResultHandler.build_leaderboard/dump) without datasets/ragas. The benchmarking
Docker image still has both at runtime.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/bin/service_benchmark.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/bin/service_benchmark.py b/src/bin/service_benchmark.py
index 3af35f3b..24395a18 100644
--- a/src/bin/service_benchmark.py
+++ b/src/bin/service_benchmark.py
@@ -14,15 +14,14 @@
 
 import pandas as pd
 import yaml
-from datasets import Dataset
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
-from ragas import RunConfig, evaluate
-from ragas.embeddings import LangchainEmbeddingsWrapper
-from ragas.llms import LangchainLLMWrapper
-from ragas.metrics import (answer_relevancy, context_precision, context_recall,
-                           faithfulness)
+# NOTE: `datasets` and `ragas` are heavy, benchmark-only deps that live in the
+# benchmarking Docker image but NOT the lean unit-test environment. They are
+# imported lazily inside the methods that use them (get_ragas_results, run)
+# so that importing this module for its pure helpers (e.g. ResultHandler.
+# build_leaderboard / dump, exercised by unit tests) does not require them.
 
 from src.archi.archi import archi
 from src.archi.pipelines.agents.agent_spec import AgentSpecError, load_agent_spec
@@ -834,7 +833,14 @@ def get_source_results(
 
     def get_ragas_results(self, data, to_add):
         """WARNING: this method modifies the to_add dictionary to add the relevant scores to the relevant questions"""
-        
+        # Lazy import: ragas (and its transitive `datasets` dep) is benchmark-only
+        # and absent from the unit-test environment. See the module-header note.
+        from ragas import RunConfig, evaluate
+        from ragas.embeddings import LangchainEmbeddingsWrapper
+        from ragas.llms import LangchainLLMWrapper
+        from ragas.metrics import (answer_relevancy, context_precision,
+                                   context_recall, faithfulness)
+
         all_metrics_dict = {
                 'answer_relevancy': answer_relevancy, 
                 'faithfulness': faithfulness, 
@@ -1011,6 +1017,7 @@ def run(self):
             if "RAGAS" in modes_being_run:
                 # TODO this is likely broken now
                 logger.info(f"Starting to collect RAGAS results")
+                from datasets import Dataset  # lazy: benchmark-only dep (see module header)
                 data = Dataset.from_list(ragas_input)
                 # were modifying final_addition here to add ragas results by question
                 ragas_results = self.get_ragas_results(data, question_wise_results)

From 60dd655d97e42680d75eb4e2e4e1534a90effb1b Mon Sep 17 00:00:00 2001
From: Austin Swinney <aswinney@harvard.edu>
Date: Wed, 10 Jun 2026 14:07:07 -0400
Subject: [PATCH 6/7] fix(cli): harden multi-config rendering against
 filename/basename collisions

Address Copilot review on PR #22:
- _render_config_target_name now loops until the name is unique instead of
  disambiguating only once, so a config can never silently overwrite an
  earlier rendered file even if {stem}_{index} is already taken.
- _stage_agents rejects two benchmark configs whose agent_md_file share a
  basename (they'd overwrite each other when staged and when referenced by
  the rendered config) with a clear error instead of silently dropping one.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/cli/managers/templates_manager.py       | 21 ++++++++++++++--
 tests/unit/test_render_config_filename.py   |  8 ++++++
 tests/unit/test_stage_agents_multiconfig.py | 28 +++++++++++++++++++++
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/src/cli/managers/templates_manager.py b/src/cli/managers/templates_manager.py
index e3aa0583..994f31d1 100644
--- a/src/cli/managers/templates_manager.py
+++ b/src/cli/managers/templates_manager.py
@@ -37,8 +37,13 @@ def _render_config_target_name(
         return "config.yaml"
     stem = str(benchmarking_name or top_level_name)
     candidate = f"{stem}.yaml"
-    if candidate in used_names:
-        candidate = f"{stem}_{index}.yaml"
+    # Disambiguate against ALL previously-used names, not just once: keep
+    # bumping the suffix until the name is unique so a config can never
+    # silently overwrite an earlier rendered file.
+    suffix = index
+    while candidate in used_names:
+        candidate = f"{stem}_{suffix}.yaml"
+        suffix += 1
     used_names.add(candidate)
     return candidate
 
@@ -190,6 +195,10 @@ def _stage_agents(self, context: TemplateContext) -> None:
             # A multi-config sweep names a distinct agent_md_file per config; stage
             # every one so the benchmarker can load each variant (not just the first).
             dst_dir.mkdir(parents=True, exist_ok=True)
+            # Agent files are staged (and later referenced by the rendered config)
+            # by basename, so two configs whose agent_md_file share a basename
+            # would silently overwrite each other. Detect and reject that.
+            staged_by_basename: Dict[str, Path] = {}
             for bench_config in context.config_manager.get_configs():
                 bench_services = bench_config.get("services", {}) or {}
                 benchmark_cfg = bench_services.get("benchmarking", {}) or {}
@@ -206,6 +215,14 @@ def _stage_agents(self, context: TemplateContext) -> None:
                     raise ValueError(f"Benchmark agent file not found: {source_path}")
                 if source_path.suffix.lower() != ".md":
                     raise ValueError(f"Benchmark agent file must be a .md file: {source_path}")
+                prior = staged_by_basename.get(source_path.name)
+                if prior is not None and prior != source_path:
+                    raise ValueError(
+                        "Two benchmark configs reference different agent files with the "
+                        f"same basename '{source_path.name}' ({prior} vs {source_path}); "
+                        "they would overwrite each other when staged. Rename one."
+                    )
+                staged_by_basename[source_path.name] = source_path
                 shutil.copyfile(source_path, dst_dir / source_path.name)
             return
 
diff --git a/tests/unit/test_render_config_filename.py b/tests/unit/test_render_config_filename.py
index 715e2e32..bf0bad2a 100644
--- a/tests/unit/test_render_config_filename.py
+++ b/tests/unit/test_render_config_filename.py
@@ -41,6 +41,14 @@ def test_collision_disambiguated_by_index():
     assert first != second
 
 
+def test_repeated_collisions_stay_unique():
+    """Disambiguation must keep bumping until unique, not collide a second time."""
+    used = set()
+    names = [_render_config_target_name(False, "rb", "dup", i, used) for i in range(4)]
+    assert len(set(names)) == 4  # all distinct, none overwrites another
+    assert names[0] == "dup.yaml"
+
+
 def test_multi_falls_back_to_top_level_name_when_no_benchmarking_name():
     used = set()
     assert (
diff --git a/tests/unit/test_stage_agents_multiconfig.py b/tests/unit/test_stage_agents_multiconfig.py
index 1adbb7e3..f18462ba 100644
--- a/tests/unit/test_stage_agents_multiconfig.py
+++ b/tests/unit/test_stage_agents_multiconfig.py
@@ -55,3 +55,31 @@ def get_configs(self):
 
     staged = sorted(p.name for p in (base_dir / "data" / "agents").iterdir())
     assert staged == sorted(names)
+
+
+def test_same_basename_different_files_is_rejected(tmp_path, monkeypatch):
+    """Two configs whose agent files share a basename would overwrite each
+    other when staged (and when referenced by the rendered config), so the
+    deployment must reject it instead of silently dropping one variant."""
+    import pytest
+
+    (tmp_path / "a").mkdir()
+    (tmp_path / "b").mkdir()
+    (tmp_path / "a" / "prompt.md").write_text("# a\n")
+    (tmp_path / "b" / "prompt.md").write_text("# b\n")
+    monkeypatch.chdir(tmp_path)
+
+    configs = [_config(tmp_path, "a/prompt.md"), _config(tmp_path, "b/prompt.md")]
+    base_dir = tmp_path / "deploy"
+    base_dir.mkdir()
+
+    class _CM:
+        config = configs[0]
+
+        def get_configs(self):
+            return configs
+
+    context = SimpleNamespace(config_manager=_CM(), base_dir=base_dir, benchmarking=True)
+    mgr = object.__new__(TemplateManager)
+    with pytest.raises(ValueError, match="same basename"):
+        mgr._stage_agents(context)

From ef60297e17552d4632fb027a21a5a06f4fc45c2d Mon Sep 17 00:00:00 2001
From: Austin Swinney <aswinney@harvard.edu>
Date: Wed, 10 Jun 2026 14:14:58 -0400
Subject: [PATCH 7/7] fix(benchmark): defer secret/DB init to script entry, not
 module import

After lazy-importing datasets/ragas, collection still failed in CI because
importing service_benchmark ran module-level read_secret() calls and opened a
Postgres connection pool (PostgresServiceFactory.from_env), which can't connect
in the DB-less unit-test env. Move that initialization into _init_runtime(),
called only from __main__, so importing the module for ResultHandler's pure
helpers needs neither live secrets nor a database.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/bin/service_benchmark.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/bin/service_benchmark.py b/src/bin/service_benchmark.py
index 24395a18..ada51532 100644
--- a/src/bin/service_benchmark.py
+++ b/src/bin/service_benchmark.py
@@ -39,13 +39,22 @@
 setup_logging()
 logger = get_logger(__name__)
 
-os.environ['OPENAI_API_KEY'] = read_secret("OPENAI_API_KEY")
-os.environ['ANTHROPIC_API_KEY'] = read_secret("ANTHROPIC_API_KEY")
-os.environ['HUGGING_FACE_HUB_TOKEN'] = read_secret("HUGGING_FACE_HUB_TOKEN")
-os.environ['HUIT_API_KEY'] = read_secret("HUIT_API_KEY")
 
-factory = PostgresServiceFactory.from_env(password_override=os.environ.get("PG_PASSWORD"))
-PostgresServiceFactory.set_instance(factory)
+def _init_runtime() -> None:
+    """Load secrets into the environment and open the Postgres connection pool.
+
+    Called only when this module is run as a script (see __main__), NOT at import
+    time — importing the module for its pure helpers (e.g. ResultHandler.
+    build_leaderboard, exercised by unit tests) must not require live secrets or
+    a reachable database.
+    """
+    os.environ['OPENAI_API_KEY'] = read_secret("OPENAI_API_KEY")
+    os.environ['ANTHROPIC_API_KEY'] = read_secret("ANTHROPIC_API_KEY")
+    os.environ['HUGGING_FACE_HUB_TOKEN'] = read_secret("HUGGING_FACE_HUB_TOKEN")
+    os.environ['HUIT_API_KEY'] = read_secret("HUIT_API_KEY")
+
+    factory = PostgresServiceFactory.from_env(password_override=os.environ.get("PG_PASSWORD"))
+    PostgresServiceFactory.set_instance(factory)
 
 
 @dataclass
@@ -1304,7 +1313,9 @@ def wait_for_ingestion_completion(self):
 
 if __name__ == "__main__":
 
-    query_file = Path("QandA.txt") 
+    _init_runtime()
+
+    query_file = Path("QandA.txt")
     configs_folder = Path('configs')
 
     with open(Path(query_file), "r") as f: