From d717c4902f70453c1bcba2cf530851b30f3fa346 Mon Sep 17 00:00:00 2001 From: Austin Swinney Date: Tue, 9 Jun 2026 22:26:58 -0400 Subject: [PATCH 1/7] feat(benchmark): rank prompt variants on a RAGAS leaderboard Add a prompt-sweep workflow: generate_prompt_sweep.py renders one benchmarking config per prompt variant from a manifest, and ResultHandler.build_leaderboard aggregates each variant's mean RAGAS metrics into a ranked leaderboard (with a shared_context block and incomplete-row handling) emitted in the dump JSON. Model, queries, retriever and judge are held fixed so only the prompt varies. Co-Authored-By: Claude Opus 4.8 --- docs/docs/benchmarking.md | 75 ++++++ scripts/benchmarking/generate_prompt_sweep.py | 165 +++++++++++++ src/bin/service_benchmark.py | 158 +++++++++++++ tests/unit/test_generate_prompt_sweep.py | 125 ++++++++++ tests/unit/test_prompt_sweep_dump_gating.py | 58 +++++ tests/unit/test_prompt_sweep_leaderboard.py | 218 ++++++++++++++++++ 6 files changed, 799 insertions(+) create mode 100644 scripts/benchmarking/generate_prompt_sweep.py create mode 100644 tests/unit/test_generate_prompt_sweep.py create mode 100644 tests/unit/test_prompt_sweep_dump_gating.py create mode 100644 tests/unit/test_prompt_sweep_leaderboard.py diff --git a/docs/docs/benchmarking.md b/docs/docs/benchmarking.md index b9f9cb71..60ceccbe 100644 --- a/docs/docs/benchmarking.md +++ b/docs/docs/benchmarking.md @@ -130,6 +130,81 @@ To analyze results, see `scripts/benchmarking/` which contains: --- +## Prompt sweep + +A *prompt sweep* runs N agent prompts through the RAGAS harness with everything +but the prompt held fixed — model, queries, retriever, and the RAGAS judge all +come from one base config — and ranks the prompts on a **leaderboard** by mean +RAGAS metric. It generalizes a 2-way A/B to the whole prompt field, which is how +we settle "which support prompt do we ship?" with data rather than by eyeballing +one pair at a time (the open question Q5 / Decision 3 in +`docs/docs/notes_response_tuning.md`). + +Only `services.benchmarking.agent_md_file` varies across variants. The model and +RAGAS judge are deliberately held fixed; the leaderboard's `shared_context` +cross-checks this and flags any drift. + +### 1. Write a manifest + +A manifest names one base config and the prompts to sweep. See +`config/benchmarking/prompt_sweep.yaml` for a worked example: + +```yaml +base_config: config/benchmarking/ragas.yaml # supplies model/queries/judge/retriever +out_dir: bench_out/sweep_configs # optional (this is the default) +primary_metric: faithfulness # leaderboard sort key (optional) +prompts: + - config/agents/fasrc-cannon-v1-strict.md + - config/agents/fasrc-cannon-v2-lean.md + - config/agents/fasrc-cannon-v3-cited.md + - config/agents/fasrc-cannon-v4-linked.md +``` + +`primary_metric` is one of `answer_relevancy`, `faithfulness`, +`context_precision`, `context_recall` (default `faithfulness` — grounding is the +load-bearing property for a "never guess" support bot). All four metrics are +reported per variant regardless; this only sets the ranking key. + +### 2. Generate the per-prompt configs + +```bash +python scripts/benchmarking/generate_prompt_sweep.py -m config/benchmarking/prompt_sweep.yaml +``` + +This writes one config per prompt into the sweep directory, each identical to +the base except `services.benchmarking.agent_md_file` (the prompt) and `.name` +(the prompt's filename stem). It refuses to write anything if a prompt path is +missing, so a bad manifest never leaves a partial set behind. + +### 3. Run the sweep + +```bash +archi evaluate --config-dir bench_out/sweep_configs --hostmode +``` + +The harness runs each config in turn (the existing `--config-dir` path) and, +because 2+ configs ran, emits a `leaderboard` block in the dump JSON. + +### Reading the leaderboard + +The dump JSON gains a `leaderboard` key: + +- `rows` — one per variant: `name`, `agent_md_file`, the four mean RAGAS + `metrics`, `primary_score`, `rank`, `query_count`, and `incomplete`. Rows are + ranked best-first by `primary_metric`; ties share a rank. A variant that + failed to produce a metric (missing/NaN) has `None` for it, is marked + `incomplete: true`, and sorts after all complete variants — it is never + treated as a zero. +- `shared_context` — the model, provider, judge `evaluator_model`, + `queries_path`, and `corpus_snapshot_id` shared by all variants. If any of + these differ across the swept configs, the discrepancy is recorded in + `shared_context.warnings` (the sweep is no longer apples-to-apples). + +The pairwise `ab_comparisons` are still produced alongside the leaderboard; the +leaderboard is computed independently from each config's aggregates. + +--- + ## Human grading via Argilla `archi evaluate --argilla` pushes benchmark results to a self-hosted [Argilla](https://argilla.io/) instance for independent human grading. This is the platform we use to answer the question "is config A better than config B for FASRC users?" with data we trust — RAGAS scores alone can't decide prompt or model choices because the judge LLM has its own biases. diff --git a/scripts/benchmarking/generate_prompt_sweep.py b/scripts/benchmarking/generate_prompt_sweep.py new file mode 100644 index 00000000..1d812d25 --- /dev/null +++ b/scripts/benchmarking/generate_prompt_sweep.py @@ -0,0 +1,165 @@ +"""Generate a RAGAS prompt-sweep: one benchmarking config per prompt variant. + +A prompt sweep runs N agent prompts through the existing RAGAS harness with +everything but the prompt held fixed, so the only moving part is +`services.benchmarking.agent_md_file`. This script takes one base config plus a +list of prompt files (a sweep manifest) and writes one rendered config per +prompt into a sweep directory. Each generated config is byte-for-byte identical +to the base except for: + + - services.benchmarking.agent_md_file -> the prompt's path + - services.benchmarking.name -> the prompt's filename stem + - services.benchmarking.primary_metric -> manifest primary_metric (if set) + +Holding everything else constant is what makes the resulting leaderboard an +apples-to-apples comparison (the leaderboard's shared_context cross-check will +flag any drift). Run the sweep with: + + archi evaluate --config-dir --hostmode ... + +Manifest format (YAML): + + base_config: config/benchmarking/ragas.yaml + out_dir: bench_out/sweep_configs # optional, this is the default + primary_metric: faithfulness # optional, default faithfulness + prompts: + - config/agents/fasrc-cannon-v1-strict.md + - config/agents/fasrc-cannon-v2-lean.md + - config/agents/fasrc-cannon-v3-cited.md + - config/agents/fasrc-cannon-v4-linked.md + +Usage: + python scripts/benchmarking/generate_prompt_sweep.py --manifest config/benchmarking/prompt_sweep.yaml +""" + +from __future__ import annotations + +import argparse +import copy +import sys +from pathlib import Path +from typing import Any, Dict, List + +import yaml + +DEFAULT_OUT_DIR = "bench_out/sweep_configs" +DEFAULT_PRIMARY_METRIC = "faithfulness" +KNOWN_METRICS = { + "answer_relevancy", + "faithfulness", + "context_precision", + "context_recall", +} + + +def _load_yaml(path: Path) -> Dict[str, Any]: + with open(path, "r") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict): + raise ValueError(f"Expected a YAML mapping at {path}, got {type(data).__name__}") + return data + + +def generate_sweep_configs(manifest_path: Path) -> List[Path]: + """Render one benchmarking config per prompt listed in the manifest. + + Validates that every prompt file exists BEFORE writing anything, so a bad + manifest never leaves a partial set of configs in the sweep directory. + Returns the list of written config paths. + """ + manifest = _load_yaml(manifest_path) + + base_config_raw = manifest.get("base_config") + if not base_config_raw: + raise ValueError("Manifest missing required key 'base_config'.") + prompts_raw = manifest.get("prompts") + if not isinstance(prompts_raw, list) or not prompts_raw: + raise ValueError("Manifest 'prompts' must be a non-empty list of prompt file paths.") + + primary_metric = str(manifest.get("primary_metric", DEFAULT_PRIMARY_METRIC)) + if primary_metric not in KNOWN_METRICS: + raise ValueError( + f"Manifest primary_metric '{primary_metric}' is not a known RAGAS metric " + f"{sorted(KNOWN_METRICS)}." + ) + + # Relative paths resolve against the current working directory (the repo + # root — the documented place to run this from, and how the prompts in the + # example manifest are written). Absolute paths are used as-is. + repo_root = Path.cwd() + + def _resolve(p: str) -> Path: + candidate = Path(p) + return candidate if candidate.is_absolute() else repo_root / candidate + + base_config_path = _resolve(str(base_config_raw)) + if not base_config_path.is_file(): + raise ValueError(f"base_config not found: '{base_config_path}'") + + prompt_paths = [_resolve(str(p)) for p in prompts_raw] + missing = [str(p) for p in prompt_paths if not p.is_file()] + if missing: + # Atomic: refuse before writing any config. + raise ValueError(f"Prompt file(s) not found, aborting (no configs written): {missing}") + + out_dir = _resolve(str(manifest.get("out_dir", DEFAULT_OUT_DIR))) + out_dir.mkdir(parents=True, exist_ok=True) + + base_config = _load_yaml(base_config_path) + if "services" not in base_config or "benchmarking" not in base_config.get("services", {}): + raise ValueError( + f"base_config '{base_config_path}' has no services.benchmarking section to sweep." + ) + + written: List[Path] = [] + for prompt_path in prompt_paths: + cfg = copy.deepcopy(base_config) + bench = cfg["services"]["benchmarking"] + stem = prompt_path.stem + bench["agent_md_file"] = str(prompt_path) + bench["name"] = stem + bench["primary_metric"] = primary_metric + + out_path = out_dir / f"{stem}.yaml" + with open(out_path, "w") as f: + yaml.safe_dump(cfg, f, sort_keys=False) + written.append(out_path) + + return written + + +def main(argv: List[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + "--manifest", "-m", required=True, + help="Path to a sweep manifest YAML (base_config + prompts).", + ) + args = parser.parse_args(argv) + + manifest_path = Path(args.manifest) + if not manifest_path.is_file(): + print(f"Manifest not found: {manifest_path}", file=sys.stderr) + return 2 + + try: + written = generate_sweep_configs(manifest_path) + except ValueError as exc: + print(f"Error: {exc}", file=sys.stderr) + return 1 + + out_dir = written[0].parent + manifest = _load_yaml(manifest_path) + primary_metric = str(manifest.get("primary_metric", DEFAULT_PRIMARY_METRIC)) + + print(f"Wrote {len(written)} sweep config(s) to {out_dir}/:") + for p in written: + print(f" - {p.name}") + print() + print("Next: run the sweep (leaderboard ranks by " + f"'{primary_metric}'):") + print(f" archi evaluate --config-dir {out_dir} --hostmode") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/bin/service_benchmark.py b/src/bin/service_benchmark.py index 8ed40c39..1588cb4d 100644 --- a/src/bin/service_benchmark.py +++ b/src/bin/service_benchmark.py @@ -75,6 +75,7 @@ class ResultHandler: metadata = {} # store the metadata about the benchmark run ab_comparison: Dict[str, Any] = {} # single-pair compat (populated only in ab_mode with 2 configs) ab_comparisons: List[Dict[str, Any]] = [] # multi-pair: list of pair comparison dicts + leaderboard: Dict[str, Any] = {} # prompt-sweep leaderboard (populated only when 2+ configs run) # Per-invocation identifier shared by every config in this archi-evaluate run. # Stamped onto Argilla records as metadata so the analysis notebook can refuse # to compute primary-outcome statistics across configs that were NOT run @@ -177,6 +178,8 @@ def dump(benchmark_name: Path): output["ab_comparison"] = ResultHandler.ab_comparison if ResultHandler.ab_comparisons: output["ab_comparisons"] = ResultHandler.ab_comparisons + if ResultHandler.leaderboard: + output["leaderboard"] = ResultHandler.leaderboard with open(file_path, "w") as f: json.dump(output, f, indent=4) @@ -312,6 +315,135 @@ def generate_pairwise_combinations(n_configs: int) -> List[Tuple[int, int]]: """Generate all pairwise index combinations for N configs.""" return list(combinations(range(n_configs), 2)) + # Leaderboard metric name -> the aggregate key the run loop writes onto + # total_results (service_benchmark.py RAGAS block). Order is display order. + LEADERBOARD_METRICS: List[Tuple[str, str]] = [ + ("answer_relevancy", "aggregate_answer_relevancy"), + ("faithfulness", "aggregate_faithfulness"), + ("context_precision", "aggregate_context_precision"), + ("context_recall", "aggregate_context_recall"), + ] + + @staticmethod + def build_leaderboard(primary_metric: str = "faithfulness") -> Dict[str, Any]: + """Rank swept prompt variants by mean RAGAS metric. + + Reads each config's per-run aggregates from ResultHandler.results + (the means the RAGAS block already wrote onto total_results) and + builds a ranked leaderboard. Independent of the pairwise A/B plumbing: + it never touches pair_ab_results/ab_comparisons. + + Each row: {name, agent_md_file, metrics{...}, primary_score, rank, + incomplete, query_count}. A metric is None (and the row `incomplete`) + when its aggregate key is absent or NaN — never silently zeroed. + Incomplete rows always sort after complete ones. Ties share a rank. + + shared_context records the run context common to all variants and + flags any drift (a hand-edited config that breaks apples-to-apples). + """ + metric_names = [name for name, _ in ResultHandler.LEADERBOARD_METRICS] + if primary_metric not in metric_names: + logger.warning( + "Leaderboard primary_metric '%s' is not a known RAGAS metric %s; " + "falling back to 'faithfulness'.", + primary_metric, metric_names, + ) + primary_metric = "faithfulness" + + def _benchmarking(record: Dict[str, Any]) -> Dict[str, Any]: + return ( + record.get("configuration", {}) + .get("services", {}) + .get("benchmarking", {}) + ) + + rows: List[Dict[str, Any]] = [] + # Accumulate shared-context candidates to detect drift across configs. + ctx_fields: Dict[str, set] = { + "model": set(), "provider": set(), + "evaluator_model": set(), "queries_path": set(), + } + + for record in ResultHandler.results: + bench = _benchmarking(record) + total = record.get("total_results", {}) or {} + + agent_md_file = bench.get("agent_md_file", "") or "" + name = bench.get("name") or (Path(agent_md_file).stem if agent_md_file else "") + + metrics: Dict[str, Optional[float]] = {} + incomplete = False + for metric_name, agg_key in ResultHandler.LEADERBOARD_METRICS: + value = total.get(agg_key) + if value is None or (isinstance(value, float) and math.isnan(value)): + metrics[metric_name] = None + incomplete = True + else: + metrics[metric_name] = float(value) + + if incomplete: + logger.warning( + "Leaderboard: variant '%s' (%s) is incomplete — missing/NaN metrics: %s", + name, agent_md_file, + [m for m in metric_names if metrics[m] is None], + ) + + rows.append({ + "name": name, + "agent_md_file": agent_md_file, + "metrics": metrics, + "primary_score": metrics[primary_metric], + "incomplete": incomplete, + "query_count": len(record.get("single_question_results") or {}), + }) + + ragas_settings = (bench.get("mode_settings", {}) or {}).get("ragas_settings", {}) or {} + ctx_fields["model"].add(bench.get("model")) + ctx_fields["provider"].add(bench.get("provider")) + ctx_fields["evaluator_model"].add(ragas_settings.get("evaluator_model")) + ctx_fields["queries_path"].add(bench.get("queries_path")) + + # Complete rows first, then by descending primary score; incomplete last. + rows.sort(key=lambda r: ( + 1 if r["incomplete"] else 0, + -(r["primary_score"] if r["primary_score"] is not None else 0.0), + )) + + # Dense ranking: equal primary scores share a rank. + rank = 0 + prev_score: Any = object() + for row in rows: + score = row["primary_score"] + if score != prev_score: + rank += 1 + prev_score = score + row["rank"] = rank + + warnings: List[str] = [] + shared_context: Dict[str, Any] = { + "corpus_snapshot_id": ResultHandler.get_corpus_snapshot_id(), + } + for field_name, values in ctx_fields.items(): + present = {v for v in values if v is not None} + if len(present) <= 1: + shared_context[field_name] = next(iter(present), None) + else: + shared_context[field_name] = sorted(str(v) for v in present) + warnings.append( + f"{field_name} differs across swept configs: {sorted(str(v) for v in present)}" + ) + if warnings: + for w in warnings: + logger.warning("Leaderboard shared-context drift: %s", w) + shared_context["warnings"] = warnings + + ResultHandler.leaderboard = { + "shared_context": shared_context, + "primary_metric": primary_metric, + "rows": rows, + } + return ResultHandler.leaderboard + class Benchmarker: @@ -885,6 +1017,32 @@ def run(self): comp["aggregate"]["ties"], ) + # Prompt-sweep leaderboard: rank every config by mean RAGAS metric. + # Independent of the pairwise block above (reads per-config aggregates + # directly). Only meaningful with 2+ variants. + if len(ResultHandler.results) >= 2: + primary_metric = str( + self.config.get("services", {}).get("benchmarking", {}).get("primary_metric", "faithfulness") + ) + leaderboard = ResultHandler.build_leaderboard(primary_metric) + logger.info("Prompt-sweep leaderboard (ranked by %s):", leaderboard["primary_metric"]) + logger.info( + " %-4s %-28s %-10s %-10s %-10s %-10s %-10s %s", + "rank", "name", "ans_rel", "faith", "ctx_prec", "ctx_rec", "n_q", "prompt", + ) + for row in leaderboard["rows"]: + m = row["metrics"] + def _fmt(v: Optional[float]) -> str: + return f"{v:.4f}" if isinstance(v, float) else " n/a" + flag = " (incomplete)" if row["incomplete"] else "" + logger.info( + " %-4d %-28s %-10s %-10s %-10s %-10s %-10d %s%s", + row["rank"], row["name"][:28], + _fmt(m["answer_relevancy"]), _fmt(m["faithfulness"]), + _fmt(m["context_precision"]), _fmt(m["context_recall"]), + row["query_count"], row["agent_md_file"], flag, + ) + # Push to Argilla when ARCHI_ARGILLA=1 in the benchmarks container env. # The CLI flag --argilla on `archi evaluate` sets this (see Task 2.5). argilla_enabled = os.environ.get("ARCHI_ARGILLA", "").strip().lower() in ("1", "true", "yes") diff --git a/tests/unit/test_generate_prompt_sweep.py b/tests/unit/test_generate_prompt_sweep.py new file mode 100644 index 00000000..7fa33c28 --- /dev/null +++ b/tests/unit/test_generate_prompt_sweep.py @@ -0,0 +1,125 @@ +"""Unit tests for the prompt-sweep config generator. + +Covers scripts/benchmarking/generate_prompt_sweep.py: one config per prompt, +each differing from the base ONLY in services.benchmarking.agent_md_file and +.name (+ primary_metric), name = prompt stem, and atomic failure (a missing +prompt aborts before any config is written). +""" + +import copy + +import pytest +import yaml + +from scripts.benchmarking.generate_prompt_sweep import generate_sweep_configs + +BASE_CONFIG = { + "name": "ragas-bench", + "global": {"DATA_PATH": "/root/data/"}, + "services": { + "benchmarking": { + "agent_class": "CMSCompOpsAgent", + "agent_md_file": "config/agents/fasrc-cannon-v1-strict.md", + "provider": "openai", + "model": "Qwen/Qwen3.5-35B-A3B-GPTQ-Int4", + "queries_path": "config/benchmarking/queries.json", + "modes": ["RAGAS"], + }, + }, +} + + +def _write_yaml(path, data): + with open(path, "w") as f: + yaml.safe_dump(data, f, sort_keys=False) + + +def _setup(tmp_path, prompt_names, *, primary_metric=None, base=None): + """Create a base config, prompt files, and a manifest under tmp_path.""" + base_path = tmp_path / "base.yaml" + _write_yaml(base_path, base if base is not None else BASE_CONFIG) + + prompt_paths = [] + for name in prompt_names: + p = tmp_path / name + p.write_text(f"# prompt {name}\n") + prompt_paths.append(p) + + manifest = { + "base_config": str(base_path), + "out_dir": str(tmp_path / "sweep_configs"), + "prompts": [str(p) for p in prompt_paths], + } + if primary_metric is not None: + manifest["primary_metric"] = primary_metric + manifest_path = tmp_path / "manifest.yaml" + _write_yaml(manifest_path, manifest) + return manifest_path + + +def test_one_config_per_prompt_only_prompt_fields_differ(tmp_path): + manifest = _setup(tmp_path, ["v1-strict.md", "v2-lean.md", "v3-cited.md"]) + written = generate_sweep_configs(manifest) + assert len(written) == 3 + + for path in written: + with open(path) as f: + cfg = yaml.safe_load(f) + bench = cfg["services"]["benchmarking"] + # agent_md_file points at one of the prompts + assert bench["agent_md_file"].endswith(".md") + # Everything else equals the base, ignoring the three swept fields. + stripped = copy.deepcopy(cfg) + sb = stripped["services"]["benchmarking"] + for k in ("agent_md_file", "name", "primary_metric"): + sb.pop(k, None) + expected = copy.deepcopy(BASE_CONFIG) + expected["services"]["benchmarking"].pop("agent_md_file", None) + assert stripped == expected + + +def test_name_is_prompt_stem(tmp_path): + manifest = _setup(tmp_path, ["fasrc-cannon-v2-lean.md"]) + written = generate_sweep_configs(manifest) + with open(written[0]) as f: + cfg = yaml.safe_load(f) + assert cfg["services"]["benchmarking"]["name"] == "fasrc-cannon-v2-lean" + # output file is named for the stem too + assert written[0].name == "fasrc-cannon-v2-lean.yaml" + + +def test_primary_metric_threaded_into_each_config(tmp_path): + manifest = _setup(tmp_path, ["a.md", "b.md"], primary_metric="answer_relevancy") + written = generate_sweep_configs(manifest) + for path in written: + with open(path) as f: + cfg = yaml.safe_load(f) + assert cfg["services"]["benchmarking"]["primary_metric"] == "answer_relevancy" + + +def test_missing_prompt_aborts_atomically(tmp_path): + # One real prompt, one bogus path that does not exist. + base_path = tmp_path / "base.yaml" + _write_yaml(base_path, BASE_CONFIG) + real = tmp_path / "real.md" + real.write_text("# real\n") + out_dir = tmp_path / "sweep_configs" + manifest = { + "base_config": str(base_path), + "out_dir": str(out_dir), + "prompts": [str(real), str(tmp_path / "does-not-exist.md")], + } + manifest_path = tmp_path / "manifest.yaml" + _write_yaml(manifest_path, manifest) + + with pytest.raises(ValueError, match="does-not-exist.md"): + generate_sweep_configs(manifest_path) + + # Atomic: no configs written. + assert not out_dir.exists() or list(out_dir.glob("*.yaml")) == [] + + +def test_invalid_primary_metric_rejected(tmp_path): + manifest = _setup(tmp_path, ["a.md"], primary_metric="bogus_metric") + with pytest.raises(ValueError, match="bogus_metric"): + generate_sweep_configs(manifest) diff --git a/tests/unit/test_prompt_sweep_dump_gating.py b/tests/unit/test_prompt_sweep_dump_gating.py new file mode 100644 index 00000000..4d24d020 --- /dev/null +++ b/tests/unit/test_prompt_sweep_dump_gating.py @@ -0,0 +1,58 @@ +"""Dump-level gating test for the prompt-sweep leaderboard. + +ResultHandler.dump must include a `leaderboard` key only when the leaderboard +has been populated (a 2+ config sweep), and omit it otherwise (an ordinary +single-config run), leaving today's output otherwise unchanged. +""" + +import json +from pathlib import Path + +import pytest + +import src.bin.service_benchmark as sb +from src.bin.service_benchmark import ResultHandler + + +@pytest.fixture(autouse=True) +def _reset_and_redirect(tmp_path, monkeypatch): + """Reset ResultHandler class state and redirect dump output to tmp_path.""" + saved = (ResultHandler.results, ResultHandler.metadata, + ResultHandler.leaderboard, ResultHandler.ab_comparison, + ResultHandler.ab_comparisons) + ResultHandler.results = [] + ResultHandler.metadata = {} + ResultHandler.leaderboard = {} + ResultHandler.ab_comparison = {} + ResultHandler.ab_comparisons = [] + monkeypatch.setattr(sb, "OUTPUT_DIR", tmp_path) + yield + (ResultHandler.results, ResultHandler.metadata, + ResultHandler.leaderboard, ResultHandler.ab_comparison, + ResultHandler.ab_comparisons) = saved + + +def _dump_and_load(tmp_path): + ResultHandler.dump(Path("test-bench")) + files = list(tmp_path.glob("test-bench-*.json")) + assert len(files) == 1, f"expected one dump file, found {files}" + with open(files[0]) as f: + return json.load(f) + + +def test_leaderboard_emitted_for_multi_config(tmp_path): + ResultHandler.results = [{"total_results": {}}, {"total_results": {}}] + ResultHandler.leaderboard = {"primary_metric": "faithfulness", "rows": [], "shared_context": {}} + output = _dump_and_load(tmp_path) + assert "leaderboard" in output + assert output["leaderboard"]["primary_metric"] == "faithfulness" + + +def test_no_leaderboard_for_single_config(tmp_path): + ResultHandler.results = [{"total_results": {}}] + # leaderboard left empty (single-config run never builds one) + output = _dump_and_load(tmp_path) + assert "leaderboard" not in output + # output otherwise unchanged: the usual top-level keys are present + assert "benchmarking_results" in output + assert "metadata" in output diff --git a/tests/unit/test_prompt_sweep_leaderboard.py b/tests/unit/test_prompt_sweep_leaderboard.py new file mode 100644 index 00000000..a805f75f --- /dev/null +++ b/tests/unit/test_prompt_sweep_leaderboard.py @@ -0,0 +1,218 @@ +"""Unit tests for the RAGAS prompt-sweep leaderboard aggregator. + +Covers ResultHandler.build_leaderboard: one row per config, ranking by a +configurable primary metric, tie handling, incomplete-variant handling +(missing/NaN metrics sort last), name fallback to the prompt stem, and the +shared-context drift check. The aggregator reads per-config aggregates only — +these tests never construct any pairwise A/B data. +""" + +from typing import Optional + +import pytest + +from src.bin.service_benchmark import ResultHandler + + +def _make_record( + name, + agent_md_file, + *, + answer_relevancy: Optional[float] = 0.8, + faithfulness: Optional[float] = 0.8, + context_precision: Optional[float] = 0.8, + context_recall: Optional[float] = 0.8, + model="Qwen/Qwen3.5-35B-A3B-GPTQ-Int4", + provider="openai", + evaluator_model="us.anthropic.claude-sonnet-4-5-20250929-v1:0", + queries_path="config/benchmarking/queries.json", + n_questions=3, + include_name=True, +): + """Build a ResultHandler.results record shaped like handle_results writes.""" + total_results = {} + for key, value in ( + ("aggregate_answer_relevancy", answer_relevancy), + ("aggregate_faithfulness", faithfulness), + ("aggregate_context_precision", context_precision), + ("aggregate_context_recall", context_recall), + ): + if value is not None: # omit the key entirely to model a missing metric + total_results[key] = value + + benchmarking = { + "agent_md_file": agent_md_file, + "model": model, + "provider": provider, + "queries_path": queries_path, + "mode_settings": {"ragas_settings": {"evaluator_model": evaluator_model}}, + } + if include_name: + benchmarking["name"] = name + + return { + "single_question_results": {f"q{i}": {} for i in range(n_questions)}, + "total_results": total_results, + "configuration_file": f"/tmp/{name}.yaml", + "configuration": {"services": {"benchmarking": benchmarking}}, + } + + +@pytest.fixture(autouse=True) +def _reset_result_handler(): + """ResultHandler holds class-level mutable state; reset around every test.""" + saved_results = ResultHandler.results + saved_leaderboard = ResultHandler.leaderboard + ResultHandler.results = [] + ResultHandler.leaderboard = {} + yield + ResultHandler.results = saved_results + ResultHandler.leaderboard = saved_leaderboard + + +# -- 4.2 one row per config -------------------------------------------------- + +def test_one_row_per_config(): + ResultHandler.results = [ + _make_record("v1-strict", "config/agents/fasrc-cannon-v1-strict.md", faithfulness=0.7), + _make_record("v2-lean", "config/agents/fasrc-cannon-v2-lean.md", faithfulness=0.9), + _make_record("v3-cited", "config/agents/fasrc-cannon-v3-cited.md", faithfulness=0.8), + ] + lb = ResultHandler.build_leaderboard() + assert len(lb["rows"]) == 3 + by_name = {r["name"]: r for r in lb["rows"]} + assert set(by_name) == {"v1-strict", "v2-lean", "v3-cited"} + row = by_name["v2-lean"] + assert row["agent_md_file"] == "config/agents/fasrc-cannon-v2-lean.md" + assert set(row["metrics"]) == { + "answer_relevancy", "faithfulness", "context_precision", "context_recall", + } + assert row["metrics"]["faithfulness"] == pytest.approx(0.9) + + +# -- 4.3 default + configured ranking ---------------------------------------- + +def test_default_ranking_by_faithfulness(): + ResultHandler.results = [ + _make_record("low", "p/low.md", faithfulness=0.5), + _make_record("high", "p/high.md", faithfulness=0.95), + _make_record("mid", "p/mid.md", faithfulness=0.7), + ] + lb = ResultHandler.build_leaderboard() + assert lb["primary_metric"] == "faithfulness" + assert lb["rows"][0]["name"] == "high" + assert lb["rows"][0]["rank"] == 1 + assert [r["name"] for r in lb["rows"]] == ["high", "mid", "low"] + + +def test_configured_primary_metric_reranks(): + ResultHandler.results = [ + _make_record("a", "p/a.md", faithfulness=0.9, answer_relevancy=0.2), + _make_record("b", "p/b.md", faithfulness=0.4, answer_relevancy=0.99), + ] + lb = ResultHandler.build_leaderboard("answer_relevancy") + assert lb["primary_metric"] == "answer_relevancy" + assert lb["rows"][0]["name"] == "b" + # all four metric values still present regardless of primary + assert set(lb["rows"][0]["metrics"]) == { + "answer_relevancy", "faithfulness", "context_precision", "context_recall", + } + + +def test_unknown_primary_metric_falls_back_to_faithfulness(): + ResultHandler.results = [ + _make_record("a", "p/a.md", faithfulness=0.6), + _make_record("b", "p/b.md", faithfulness=0.8), + ] + lb = ResultHandler.build_leaderboard("not_a_metric") + assert lb["primary_metric"] == "faithfulness" + assert lb["rows"][0]["name"] == "b" + + +# -- 4.4 tie handling -------------------------------------------------------- + +def test_ties_share_a_rank(): + ResultHandler.results = [ + _make_record("a", "p/a.md", faithfulness=0.8), + _make_record("b", "p/b.md", faithfulness=0.8), + _make_record("c", "p/c.md", faithfulness=0.5), + ] + lb = ResultHandler.build_leaderboard() + ranks = {r["name"]: r["rank"] for r in lb["rows"]} + assert ranks["a"] == ranks["b"] == 1 + assert ranks["c"] == 2 + + +# -- 4.5 incomplete handling ------------------------------------------------- + +def test_missing_metric_marks_incomplete_and_sorts_last(): + ResultHandler.results = [ + _make_record("complete-low", "p/low.md", faithfulness=0.3), + _make_record("missing", "p/missing.md", faithfulness=None), # key omitted + ] + lb = ResultHandler.build_leaderboard() + by_name = {r["name"]: r for r in lb["rows"]} + assert by_name["missing"]["metrics"]["faithfulness"] is None + assert by_name["missing"]["incomplete"] is True + # complete row ranks ahead of incomplete even though its score (0.3) is low + assert lb["rows"][0]["name"] == "complete-low" + assert lb["rows"][-1]["name"] == "missing" + + +def test_nan_metric_marks_incomplete(): + rec = _make_record("nan", "p/nan.md") + rec["total_results"]["aggregate_faithfulness"] = float("nan") + ResultHandler.results = [ + _make_record("ok", "p/ok.md", faithfulness=0.6), + rec, + ] + lb = ResultHandler.build_leaderboard() + by_name = {r["name"]: r for r in lb["rows"]} + assert by_name["nan"]["metrics"]["faithfulness"] is None + assert by_name["nan"]["incomplete"] is True + assert lb["rows"][-1]["name"] == "nan" + + +# -- 4.6 name fallback ------------------------------------------------------- + +def test_name_falls_back_to_prompt_stem(): + ResultHandler.results = [ + _make_record("ignored", "config/agents/fasrc-cannon-v4-linked.md", + faithfulness=0.7, include_name=False), + _make_record("named", "config/agents/fasrc-cannon-v1-strict.md", + faithfulness=0.6), + ] + lb = ResultHandler.build_leaderboard() + names = {r["name"] for r in lb["rows"]} + assert "fasrc-cannon-v4-linked" in names # stem, not config_0 + assert not any(n.startswith("config_") for n in names) + + +# -- 4.7 shared context ------------------------------------------------------ + +def test_shared_context_uniform_sweep(): + ResultHandler.results = [ + _make_record("a", "p/a.md", faithfulness=0.7), + _make_record("b", "p/b.md", faithfulness=0.8), + ] + lb = ResultHandler.build_leaderboard() + ctx = lb["shared_context"] + assert ctx["model"] == "Qwen/Qwen3.5-35B-A3B-GPTQ-Int4" + assert ctx["provider"] == "openai" + assert ctx["evaluator_model"] == "us.anthropic.claude-sonnet-4-5-20250929-v1:0" + assert ctx["queries_path"] == "config/benchmarking/queries.json" + assert ctx["corpus_snapshot_id"] + assert ctx["warnings"] == [] + + +def test_shared_context_flags_model_drift(): + ResultHandler.results = [ + _make_record("a", "p/a.md", faithfulness=0.7, model="modelA"), + _make_record("b", "p/b.md", faithfulness=0.8, model="modelB"), + ] + lb = ResultHandler.build_leaderboard() + ctx = lb["shared_context"] + assert ctx["warnings"], "expected a drift warning" + assert any("model" in w for w in ctx["warnings"]) + # rows still emitted despite drift + assert len(lb["rows"]) == 2 From 1699a16624b5b2d8fd8ee09a37c2bf538647033c Mon Sep 17 00:00:00 2001 From: Austin Swinney Date: Tue, 9 Jun 2026 22:27:09 -0400 Subject: [PATCH 2/7] fix(cli): make archi evaluate --config-dir run multi-config benchmark sweeps Four plumbing fixes so a prompt sweep actually runs end-to-end, each surfaced by a live --config-dir run: - config_manager: exempt services.benchmarking from the cross-config consistency check (it is the sweep axis); global + other services.* stay strict. - templates_manager: render a distinct file per config in multi-config mode (configs sharing a top-level name no longer collide onto one file). - config_seed: fall back to the first *.yaml when config.yaml is absent, so the chatbot-oriented seed step no longer aborts a benchmark deployment. - templates_manager: stage every config's agent_md_file, not just the first, so all sweep variants are available in the container. Co-Authored-By: Claude Opus 4.8 --- src/cli/managers/config_manager.py | 17 +++- src/cli/managers/templates_manager.py | 72 ++++++++++++---- src/cli/tools/config_seed.py | 23 ++++++ ...t_config_manager_benchmarking_variation.py | 82 +++++++++++++++++++ tests/unit/test_config_seed_resolve.py | 40 +++++++++ tests/unit/test_render_config_filename.py | 48 +++++++++++ tests/unit/test_stage_agents_multiconfig.py | 57 +++++++++++++ 7 files changed, 321 insertions(+), 18 deletions(-) create mode 100644 tests/unit/test_config_manager_benchmarking_variation.py create mode 100644 tests/unit/test_config_seed_resolve.py create mode 100644 tests/unit/test_render_config_filename.py create mode 100644 tests/unit/test_stage_agents_multiconfig.py diff --git a/src/cli/managers/config_manager.py b/src/cli/managers/config_manager.py index 7ed9f940..15619b6a 100644 --- a/src/cli/managers/config_manager.py +++ b/src/cli/managers/config_manager.py @@ -13,6 +13,21 @@ STATIC_FIELDS = ['global', 'services'] + +def _comparable_static_field(static_field: str, value: Any) -> Any: + """Return the value used for cross-config consistency comparison. + + `services.benchmarking` is the sweep axis for an ``archi evaluate + --config-dir`` run (the prompt under test and its name vary per config), + so it is exempted from the equality check. ``global`` and every other + ``services.*`` subsection are still compared as-is. ``archi create`` does + not consume ``services.benchmarking``, so this exemption cannot affect a + created deployment. + """ + if static_field == 'services' and isinstance(value, dict): + return {k: v for k, v in value.items() if k != 'benchmarking'} + return value + class ConfigurationManager: """Manages archi configuration loading and validation""" @@ -64,7 +79,7 @@ def _append(self,config): if not static_field in config.keys(): raise ValueError(f"The field {static_field} must be present in all configurations.") - if previous_config[static_field] != config[static_field]: + if _comparable_static_field(static_field, previous_config[static_field]) != _comparable_static_field(static_field, config[static_field]): raise ValueError(f"The field {static_field} must be consistent across all configurations.") self.configs.append(config) diff --git a/src/cli/managers/templates_manager.py b/src/cli/managers/templates_manager.py index cf40c429..e3aa0583 100644 --- a/src/cli/managers/templates_manager.py +++ b/src/cli/managers/templates_manager.py @@ -17,6 +17,32 @@ logger = get_logger(__name__) +def _render_config_target_name( + single_mode: bool, + top_level_name: str, + benchmarking_name: Optional[str], + index: int, + used_names: set, +) -> str: + """Pick the rendered-config filename for one config. + + A single-config deployment renders to ``config.yaml`` (the path config-seed + and the chatbot expect). A multi-config run (e.g. a benchmarking sweep) + renders one distinct file per config so the benchmarker iterates every + variant instead of overwriting a single file. The per-variant + ``services.benchmarking.name`` is preferred for readability, falling back to + the top-level ``name``; collisions are disambiguated with the config index. + """ + if single_mode: + return "config.yaml" + stem = str(benchmarking_name or top_level_name) + candidate = f"{stem}.yaml" + if candidate in used_names: + candidate = f"{stem}_{index}.yaml" + used_names.add(candidate) + return candidate + + # Template file constants BASE_CONFIG_TEMPLATE = "base-config.yaml" BASE_COMPOSE_TEMPLATE = "base-compose.yaml" @@ -161,22 +187,26 @@ def _stage_agents(self, context: TemplateContext) -> None: services_cfg = config.get("services", {}) or {} if context.benchmarking: - benchmark_cfg = services_cfg.get("benchmarking", {}) or {} - agent_md_file = benchmark_cfg.get("agent_md_file") - if not agent_md_file: - raise ValueError("Missing required services.benchmarking.agent_md_file in config.") - source_path = Path(str(agent_md_file)).expanduser() - config_path = Path(str(config.get("_config_path", ""))).expanduser() - if not source_path.is_absolute() and config_path: - candidate = (config_path.parent / source_path).resolve() - if candidate.exists(): - source_path = candidate - if not source_path.exists() or not source_path.is_file(): - raise ValueError(f"Benchmark agent file not found: {source_path}") - if source_path.suffix.lower() != ".md": - raise ValueError(f"Benchmark agent file must be a .md file: {source_path}") + # A multi-config sweep names a distinct agent_md_file per config; stage + # every one so the benchmarker can load each variant (not just the first). dst_dir.mkdir(parents=True, exist_ok=True) - shutil.copyfile(source_path, dst_dir / source_path.name) + for bench_config in context.config_manager.get_configs(): + bench_services = bench_config.get("services", {}) or {} + benchmark_cfg = bench_services.get("benchmarking", {}) or {} + agent_md_file = benchmark_cfg.get("agent_md_file") + if not agent_md_file: + raise ValueError("Missing required services.benchmarking.agent_md_file in config.") + source_path = Path(str(agent_md_file)).expanduser() + config_path = Path(str(bench_config.get("_config_path", ""))).expanduser() + if not source_path.is_absolute() and config_path: + candidate = (config_path.parent / source_path).resolve() + if candidate.exists(): + source_path = candidate + if not source_path.exists() or not source_path.is_file(): + raise ValueError(f"Benchmark agent file not found: {source_path}") + if source_path.suffix.lower() != ".md": + raise ValueError(f"Benchmark agent file must be a .md file: {source_path}") + shutil.copyfile(source_path, dst_dir / source_path.name) return agents_dir = (services_cfg.get("chat_app") or {}).get("agents_dir") @@ -329,7 +359,8 @@ def _render_config_files(self, context: TemplateContext) -> None: archi_configs = context.config_manager.get_configs() single_mode = len(archi_configs) == 1 - for archi_config in archi_configs: + used_names: set = set() + for index, archi_config in enumerate(archi_configs): name = archi_config["name"] updated_config = copy.deepcopy(archi_config) @@ -354,7 +385,14 @@ def _render_config_files(self, context: TemplateContext) -> None: config_template = self.env.get_template(BASE_CONFIG_TEMPLATE) config_rendered = config_template.render(verbosity=context.plan.verbosity, **updated_config) - target_name = "config.yaml" if single_mode else f"{name}.yaml" + benchmarking_name = None + if context.benchmarking: + benchmark_cfg = services_cfg.get("benchmarking") + if isinstance(benchmark_cfg, dict): + benchmarking_name = benchmark_cfg.get("name") + target_name = _render_config_target_name( + single_mode, name, benchmarking_name, index, used_names + ) with open(configs_path / target_name, "w") as f: f.write(config_rendered) logger.info(f"Rendered configuration file {configs_path / target_name}") diff --git a/src/cli/tools/config_seed.py b/src/cli/tools/config_seed.py index 81b2e29a..8dc60e28 100644 --- a/src/cli/tools/config_seed.py +++ b/src/cli/tools/config_seed.py @@ -12,6 +12,7 @@ Exits 0 on success, non-zero on failure. """ +import glob import os import sys import yaml @@ -25,6 +26,27 @@ def load_config(path: str): return yaml.safe_load(f) +def resolve_config_path(config_path: str) -> str: + """Resolve the config file to seed Postgres from. + + A single-config deployment renders ``config.yaml`` and this returns it + unchanged. A multi-config benchmarking deployment renders per-variant files + (e.g. ``fasrc-cannon-v1-strict.yaml``) instead, so the hardcoded + ``config.yaml`` is absent — fall back to the first ``*.yaml`` in the + rendered-config directory rather than aborting the whole deployment. + Seeding from any one config is harmless: the benchmarker reads the YAML + files directly and never consumes the seeded static_config. If nothing is + found, return the original path so ``load_config`` raises a clear error. + """ + if os.path.isfile(config_path): + return config_path + directory = config_path if os.path.isdir(config_path) else (os.path.dirname(config_path) or ".") + candidates = sorted(glob.glob(os.path.join(directory, "*.yaml"))) + if candidates: + return candidates[0] + return config_path + + def seed(config: dict, cs: ConfigService): print("[config-seed] Starting seed with config keys:", list(config.keys())) dm = config.get("data_manager", {}) @@ -91,6 +113,7 @@ def main(): def seed_entry(config_path: str, env: dict): + config_path = resolve_config_path(config_path) print(f"[config-seed] Loading config from {config_path}") config = load_config(config_path) factory = PostgresServiceFactory.from_env(password_override=env.get("PGPASSWORD") or env.get("PG_PASSWORD")) diff --git a/tests/unit/test_config_manager_benchmarking_variation.py b/tests/unit/test_config_manager_benchmarking_variation.py new file mode 100644 index 00000000..dfd1f75b --- /dev/null +++ b/tests/unit/test_config_manager_benchmarking_variation.py @@ -0,0 +1,82 @@ +"""Unit tests for the multi-config loader's cross-config consistency check. + +A `--config-dir` benchmarking run sweeps prompt variants that differ only in +`services.benchmarking` (the prompt under test + its name). The loader must +accept that variation while still rejecting drift in `global` or in any other +`services.*` subsection (e.g. `services.chat_app`, which carries the SUT). +""" + +import copy + +import pytest + +from src.cli.managers.config_manager import ConfigurationManager + + +def _manager(): + """A ConfigurationManager shell without running its file-loading __init__.""" + mgr = object.__new__(ConfigurationManager) + mgr.configs = [] + return mgr + + +def _base_config(): + return { + "name": "ragas-bench", + "global": {"DATA_PATH": "/root/data/"}, + "services": { + "chat_app": {"default_provider": "openai", "default_model": "qwen"}, + "benchmarking": { + "agent_md_file": "config/agents/archive/fasrc-cannon-v1-strict.md", + "name": "fasrc-cannon-v1-strict", + "model": "qwen", + }, + }, + } + + +def test_configs_differing_only_in_benchmarking_load(): + mgr = _manager() + base = _base_config() + variant = copy.deepcopy(base) + variant["services"]["benchmarking"]["agent_md_file"] = ( + "config/agents/archive/fasrc-cannon-v2-lean.md" + ) + variant["services"]["benchmarking"]["name"] = "fasrc-cannon-v2-lean" + + mgr._append(base) + mgr._append(variant) + + assert len(mgr.configs) == 2 + + +def test_differing_global_still_rejected(): + mgr = _manager() + base = _base_config() + variant = copy.deepcopy(base) + variant["global"]["DATA_PATH"] = "/somewhere/else/" + + mgr._append(base) + with pytest.raises(ValueError, match="must be consistent across all configurations"): + mgr._append(variant) + + +def test_differing_non_benchmarking_service_still_rejected(): + mgr = _manager() + base = _base_config() + variant = copy.deepcopy(base) + # Drift in services.chat_app (the SUT) must still be rejected. + variant["services"]["chat_app"]["default_model"] = "different-model" + + mgr._append(base) + with pytest.raises(ValueError, match="must be consistent across all configurations"): + mgr._append(variant) + + +def test_identical_configs_still_load(): + """Regression guard: the narrowed comparison must not change the equal case.""" + mgr = _manager() + base = _base_config() + mgr._append(base) + mgr._append(copy.deepcopy(base)) + assert len(mgr.configs) == 2 diff --git a/tests/unit/test_config_seed_resolve.py b/tests/unit/test_config_seed_resolve.py new file mode 100644 index 00000000..b79e1644 --- /dev/null +++ b/tests/unit/test_config_seed_resolve.py @@ -0,0 +1,40 @@ +"""Unit tests for config-seed's config-path resolution. + +A multi-config benchmarking deployment renders per-variant files (e.g. +`fasrc-cannon-v1-strict.yaml`) instead of a single `config.yaml`, so config-seed +must fall back to the first `*.yaml` in the rendered-config directory rather than +aborting the whole deployment with FileNotFoundError. (Seeding Postgres from any +one config is harmless — the benchmarker reads the YAML files directly and never +consumes the seeded static_config.) +""" + +from src.cli.tools.config_seed import resolve_config_path + + +def test_existing_file_returned_as_is(tmp_path): + cfg = tmp_path / "config.yaml" + cfg.write_text("name: x\n") + assert resolve_config_path(str(cfg)) == str(cfg) + + +def test_missing_config_yaml_falls_back_to_first_yaml(tmp_path): + (tmp_path / "fasrc-cannon-v2-lean.yaml").write_text("name: b\n") + (tmp_path / "fasrc-cannon-v1-strict.yaml").write_text("name: a\n") + missing = tmp_path / "config.yaml" # does not exist + + resolved = resolve_config_path(str(missing)) + + # first in sorted order + assert resolved == str(tmp_path / "fasrc-cannon-v1-strict.yaml") + + +def test_directory_path_resolves_to_first_yaml(tmp_path): + (tmp_path / "b.yaml").write_text("name: b\n") + (tmp_path / "a.yaml").write_text("name: a\n") + assert resolve_config_path(str(tmp_path)) == str(tmp_path / "a.yaml") + + +def test_no_yaml_returns_original_path(tmp_path): + missing = tmp_path / "config.yaml" + # empty dir, nothing to fall back to -> return original so load raises clearly + assert resolve_config_path(str(missing)) == str(missing) diff --git a/tests/unit/test_render_config_filename.py b/tests/unit/test_render_config_filename.py new file mode 100644 index 00000000..715e2e32 --- /dev/null +++ b/tests/unit/test_render_config_filename.py @@ -0,0 +1,48 @@ +"""Unit tests for rendered-config filename derivation in the deployment manager. + +A multi-config `archi evaluate --config-dir` run must render one distinct file +per config so the benchmarker iterates every variant. Previously all configs +collided onto `{top_level_name}.yaml` (e.g. three `ragas-bench` sweep variants +all overwrote `ragas-bench.yaml`), so only the last survived. +""" + +from src.cli.managers.templates_manager import _render_config_target_name + + +def test_single_mode_is_config_yaml(): + assert _render_config_target_name(True, "ragas-bench", None, 0, set()) == "config.yaml" + + +def test_multi_uses_benchmarking_name(): + used = set() + assert ( + _render_config_target_name(False, "ragas-bench", "fasrc-cannon-v1-strict", 0, used) + == "fasrc-cannon-v1-strict.yaml" + ) + + +def test_multi_distinct_files_for_distinct_variants(): + used = set() + names = [ + _render_config_target_name(False, "ragas-bench", "v1", 0, used), + _render_config_target_name(False, "ragas-bench", "v2", 1, used), + _render_config_target_name(False, "ragas-bench", "v3", 2, used), + ] + assert names == ["v1.yaml", "v2.yaml", "v3.yaml"] + assert len(set(names)) == 3 + + +def test_collision_disambiguated_by_index(): + used = set() + first = _render_config_target_name(False, "ragas-bench", "dup", 0, used) + second = _render_config_target_name(False, "ragas-bench", "dup", 1, used) + assert first == "dup.yaml" + assert second == "dup_1.yaml" + assert first != second + + +def test_multi_falls_back_to_top_level_name_when_no_benchmarking_name(): + used = set() + assert ( + _render_config_target_name(False, "ragas-bench", None, 0, used) == "ragas-bench.yaml" + ) diff --git a/tests/unit/test_stage_agents_multiconfig.py b/tests/unit/test_stage_agents_multiconfig.py new file mode 100644 index 00000000..1adbb7e3 --- /dev/null +++ b/tests/unit/test_stage_agents_multiconfig.py @@ -0,0 +1,57 @@ +"""Unit test: benchmarking sweeps must stage every config's agent .md file. + +In a multi-config `archi evaluate --config-dir` run, each config names its own +`services.benchmarking.agent_md_file` (the prompt variant under test). The +deployment must copy ALL of them into data/agents/ so the benchmarker can load +each variant; previously it staged only the first config's agent, so later +variants crashed with FileNotFoundError inside the container. +""" + +from pathlib import Path +from types import SimpleNamespace + +from src.cli.managers.templates_manager import TemplateManager + + +def _config(tmp_path: Path, agent_rel: str): + cfg = { + "name": "ragas-bench", + "services": {"benchmarking": {"agent_md_file": agent_rel}}, + "_config_path": str(tmp_path / "sweep" / "variant.yaml"), + } + return cfg + + +def test_stages_all_configs_agent_files(tmp_path, monkeypatch): + # three prompt variants on disk (cwd-relative, like config/agents/archive/*.md) + agents_src = tmp_path / "agents" + agents_src.mkdir() + names = ["v1-strict.md", "v2-lean.md", "v3-cited.md"] + for n in names: + (agents_src / n).write_text(f"# {n}\n") + + # run from tmp_path so the cwd-relative agent_md_file resolves + monkeypatch.chdir(tmp_path) + rels = [f"agents/{n}" for n in names] + configs = [_config(tmp_path, r) for r in rels] + + base_dir = tmp_path / "deploy" + base_dir.mkdir() + + class _CM: + config = configs[0] + + def get_configs(self): + return configs + + context = SimpleNamespace( + config_manager=_CM(), + base_dir=base_dir, + benchmarking=True, + ) + + mgr = object.__new__(TemplateManager) + mgr._stage_agents(context) + + staged = sorted(p.name for p in (base_dir / "data" / "agents").iterdir()) + assert staged == sorted(names) From 6ffd7c8f54bbf345df77caf9e8682d97330bc9ef Mon Sep 17 00:00:00 2001 From: Austin Swinney Date: Tue, 9 Jun 2026 22:27:17 -0400 Subject: [PATCH 3/7] feat(agent): enforce initial vector retrieval in CMSCompOpsAgent The model can ignore an 'always search first' prompt and answer from its own weights, leaving source_documents empty (chat UI shows 'Link unavailable'). _inject_forced_retrieval prefills a completed search_vectorstore_hybrid tool round before the model's first turn, so retrieval always happens and its store_docs callback populates the source links. The model may still search again. Gated by services.chat_app.force_initial_retrieval (default on). Co-Authored-By: Claude Opus 4.8 --- src/archi/pipelines/agents/base_react.py | 13 +- .../pipelines/agents/cms_comp_ops_agent.py | 64 ++++++++++ tests/unit/test_forced_retrieval.py | 113 ++++++++++++++++++ 3 files changed, 188 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_forced_retrieval.py diff --git a/src/archi/pipelines/agents/base_react.py b/src/archi/pipelines/agents/base_react.py index 76ae3470..34553d6f 100644 --- a/src/archi/pipelines/agents/base_react.py +++ b/src/archi/pipelines/agents/base_react.py @@ -1176,7 +1176,7 @@ def _prepare_agent_inputs(self, **kwargs) -> Dict[str, Any]: "Invalid context window (%s), skipping trimming.", context_window, ) - return {"messages": history_messages} + return {"messages": self._inject_forced_retrieval(history_messages)} safety_margin = int(context_window * 0.15) max_prompt_tokens = context_window - safety_margin @@ -1220,7 +1220,16 @@ def _prepare_agent_inputs(self, **kwargs) -> Dict[str, Any]: except Exception as e: logger.debug("Token trimming skipped: %s", e) - return {"messages": history_messages} + return {"messages": self._inject_forced_retrieval(history_messages)} + + def _inject_forced_retrieval(self, messages: List[BaseMessage]) -> List[BaseMessage]: + """Hook for subclasses to force a retrieval before the model's first turn. + + Base implementation is a no-op; agents with a vector retriever override + this to prefill a completed ``search_vectorstore_hybrid`` tool round so + retrieval happens regardless of whether the model chooses to call it. + """ + return messages def _metadata_from_agent_output(self, answer_output: Dict[str, Any]) -> Dict[str, Any]: """Hook for subclasses to enrich metadata returned to callers.""" diff --git a/src/archi/pipelines/agents/cms_comp_ops_agent.py b/src/archi/pipelines/agents/cms_comp_ops_agent.py index 48f4385a..a9b9fef6 100644 --- a/src/archi/pipelines/agents/cms_comp_ops_agent.py +++ b/src/archi/pipelines/agents/cms_comp_ops_agent.py @@ -1,8 +1,11 @@ from __future__ import annotations import json +import uuid from typing import Any, Callable, Dict, List +from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, ToolMessage + from src.utils.logging import get_logger from src.utils.env import read_secret from src.archi.pipelines.agents.base_react import BaseReActAgent @@ -313,3 +316,64 @@ def _update_vector_retrievers(self, vectorstore: Any) -> None: store_tool_input=getattr(self, "_store_tool_input", None), ) ) + + def _inject_forced_retrieval(self, messages: List[BaseMessage]) -> List[BaseMessage]: + """Force one ``search_vectorstore_hybrid`` call before the model answers. + + The model can ignore an "always search first" prompt and answer from its + own weights, leaving ``source_documents`` empty (chat UI shows "Link + unavailable"). To enforce retrieval, prefill a completed tool round — + an ``AIMessage`` carrying the tool call plus the matching ``ToolMessage`` + result — so the ReAct loop starts with real chunks already in context. + Invoking the existing retriever tool also runs its ``store_docs`` + callback, so retrieved documents flow into ``source_documents``/links + exactly as a model-initiated search would. The model may still search + again. Gated by ``services.chat_app.force_initial_retrieval`` (default + on) so prompt-vs-enforcement variants can be A/B'd in the sweep. + """ + if not getattr(self, "enable_vector_tools", False): + logger.debug("Forced retrieval skipped: vector tools disabled") + return messages + if not self._chat_app_config.get("force_initial_retrieval", True): + logger.debug("Forced retrieval skipped: force_initial_retrieval=false") + return messages + tools = getattr(self, "_vector_tools", None) + if not tools: + logger.debug("Forced retrieval skipped: no vector tools built") + return messages + # Only force on a fresh user turn (the latest message is the question). + if not messages or not isinstance(messages[-1], HumanMessage): + last_type = type(messages[-1]).__name__ if messages else "none" + logger.debug("Forced retrieval skipped: last message is %s", last_type) + return messages + query = (self._message_content(messages[-1]) or "").strip() + if not query: + logger.debug("Forced retrieval skipped: empty query") + return messages + + try: + result = tools[0].invoke({"query": query}) + except Exception: + # Fail open: a retrieval error must not break the chat turn. + logger.warning("Forced initial retrieval failed", exc_info=True) + return messages + + logger.info("Forced initial retrieval ran: query=%r -> %d chars", query, len(str(result))) + + call_id = f"forced_search_{uuid.uuid4().hex}" + ai = AIMessage( + content="", + tool_calls=[ + { + "name": "search_vectorstore_hybrid", + "args": {"query": query}, + "id": call_id, + } + ], + ) + tool_msg = ToolMessage( + content=result if isinstance(result, str) else str(result), + tool_call_id=call_id, + name="search_vectorstore_hybrid", + ) + return list(messages) + [ai, tool_msg] diff --git a/tests/unit/test_forced_retrieval.py b/tests/unit/test_forced_retrieval.py new file mode 100644 index 00000000..2b31c370 --- /dev/null +++ b/tests/unit/test_forced_retrieval.py @@ -0,0 +1,113 @@ +"""Unit tests for enforced (mandatory) vector retrieval in CMSCompOpsAgent. + +The chat model can ignore a "always search first" system prompt and answer +from its own weights, leaving ``source_documents`` empty (and the chat UI +showing "Link unavailable"). To *enforce* retrieval, the agent prefills a +completed tool round — an ``AIMessage`` carrying a ``search_vectorstore_hybrid`` +tool call plus the matching ``ToolMessage`` result — before the model's first +turn, so the retrieval happens regardless of the model's choice. +""" + +from langchain_core.messages import AIMessage, HumanMessage, ToolMessage + +from src.archi.pipelines.agents.cms_comp_ops_agent import CMSCompOpsAgent + + +class _FakeTool: + """Stand-in for the real retriever StructuredTool.""" + + def __init__(self, result: str = "[1] GLOBUS doc snippet"): + self.result = result + self.calls = [] + + def invoke(self, payload): + self.calls.append(payload) + return self.result + + +def _make_agent(*, enabled=True, force=True, tools=None): + """Build a CMSCompOpsAgent shell without running its heavy __init__.""" + agent = object.__new__(CMSCompOpsAgent) + agent.enable_vector_tools = enabled + agent._vector_tools = tools + agent.config = {"services": {"chat_app": {"force_initial_retrieval": force}}} + return agent + + +def test_injects_forced_tool_round(): + fake = _FakeTool() + agent = _make_agent(tools=[fake]) + + out = agent._inject_forced_retrieval([HumanMessage("What is GLOBUS for?")]) + + # original human message + prefilled (AIMessage tool_call, ToolMessage) + assert len(out) == 3 + assert isinstance(out[0], HumanMessage) + + ai = out[1] + assert isinstance(ai, AIMessage) + assert len(ai.tool_calls) == 1 + assert ai.tool_calls[0]["name"] == "search_vectorstore_hybrid" + assert ai.tool_calls[0]["args"] == {"query": "What is GLOBUS for?"} + + tool_msg = out[2] + assert isinstance(tool_msg, ToolMessage) + assert tool_msg.content == "[1] GLOBUS doc snippet" + assert tool_msg.tool_call_id == ai.tool_calls[0]["id"] + + # the real retriever tool was actually invoked with the user's query + assert fake.calls == [{"query": "What is GLOBUS for?"}] + + +def test_disabled_by_flag_is_noop(): + fake = _FakeTool() + agent = _make_agent(tools=[fake], force=False) + + msgs = [HumanMessage("What is GLOBUS for?")] + out = agent._inject_forced_retrieval(msgs) + + assert out == msgs + assert fake.calls == [] + + +def test_no_vector_tools_is_noop(): + agent = _make_agent(enabled=False, tools=None) + msgs = [HumanMessage("What is GLOBUS for?")] + assert agent._inject_forced_retrieval(msgs) == msgs + + +def test_only_injects_on_fresh_human_turn(): + """If the turn does not end with a human message, do not inject.""" + fake = _FakeTool() + agent = _make_agent(tools=[fake]) + + msgs = [HumanMessage("hi"), AIMessage("hello")] + out = agent._inject_forced_retrieval(msgs) + + assert out == msgs + assert fake.calls == [] + + +def test_empty_query_is_noop(): + fake = _FakeTool() + agent = _make_agent(tools=[fake]) + + msgs = [HumanMessage(" ")] + out = agent._inject_forced_retrieval(msgs) + + assert out == msgs + assert fake.calls == [] + + +def test_retriever_failure_fails_open(): + """A retrieval error must not break the chat turn.""" + + class _BoomTool: + def invoke(self, payload): + raise RuntimeError("vectorstore down") + + agent = _make_agent(tools=[_BoomTool()]) + msgs = [HumanMessage("What is GLOBUS for?")] + out = agent._inject_forced_retrieval(msgs) + + assert out == msgs From 2ba3f0974cd19cfa9f7cde74c6268355c71dac7d Mon Sep 17 00:00:00 2001 From: Austin Swinney Date: Wed, 10 Jun 2026 10:12:06 -0400 Subject: [PATCH 4/7] fix(benchmark): report per-metric scored sample size on the leaderboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit aggregate_* is a pandas .mean() that skips NaN, so a RAGAS judge timeout on a question silently shrinks that metric's sample without making the aggregate NaN — a mean over 4 of 9 answered questions looked fully-backed. Add scored_counts{metric: n} (non-NaN per-question count) to each leaderboard row, keep query_count as the answered count, log a warning for under-sampled metrics, and annotate the log table cells with @ when n < answered. Co-Authored-By: Claude Opus 4.8 --- src/bin/service_benchmark.py | 66 +++++++++++++++++---- tests/unit/test_prompt_sweep_leaderboard.py | 29 +++++++++ 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/src/bin/service_benchmark.py b/src/bin/service_benchmark.py index 1588cb4d..3af35f3b 100644 --- a/src/bin/service_benchmark.py +++ b/src/bin/service_benchmark.py @@ -334,9 +334,13 @@ def build_leaderboard(primary_metric: str = "faithfulness") -> Dict[str, Any]: it never touches pair_ab_results/ab_comparisons. Each row: {name, agent_md_file, metrics{...}, primary_score, rank, - incomplete, query_count}. A metric is None (and the row `incomplete`) - when its aggregate key is absent or NaN — never silently zeroed. - Incomplete rows always sort after complete ones. Ties share a rank. + incomplete, query_count, scored_counts{...}}. A metric is None (and the + row `incomplete`) when its aggregate key is absent or NaN — never + silently zeroed. Incomplete rows always sort after complete ones. Ties + share a rank. `query_count` is the number of questions answered; + `scored_counts[metric]` is how many non-NaN per-question scores actually + backed that metric's mean (a judge timeout shrinks the sample without + making the aggregate NaN). shared_context records the run context common to all variants and flags any drift (a hand-edited config that breaks apples-to-apples). @@ -381,12 +385,44 @@ def _benchmarking(record: Dict[str, Any]) -> Dict[str, Any]: else: metrics[metric_name] = float(value) + # Per-metric sample size actually behind each mean. The RAGAS block + # computes aggregate_* via pandas .mean(), which skips NaN, so a + # judge timeout on one question silently shrinks the sample for that + # metric without making the aggregate NaN. Count the non-NaN + # per-question scores so the leaderboard can show, e.g., a + # faithfulness mean taken over 4 of 9 answered questions instead of + # implying all 9 backed it. query_count is the answered count. + single_question_results = record.get("single_question_results") or {} + scored_counts: Dict[str, int] = {} + for metric_name, _agg_key in ResultHandler.LEADERBOARD_METRICS: + count = 0 + for q in single_question_results.values(): + if not isinstance(q, dict): + continue + v = q.get(metric_name) + if v is not None and not (isinstance(v, float) and math.isnan(v)): + count += 1 + scored_counts[metric_name] = count + if incomplete: logger.warning( "Leaderboard: variant '%s' (%s) is incomplete — missing/NaN metrics: %s", name, agent_md_file, [m for m in metric_names if metrics[m] is None], ) + # Surface under-sampling even when the aggregate is a valid float. + answered = len(single_question_results) + undersampled = [ + f"{m}={scored_counts[m]}/{answered}" + for m in metric_names + if metrics[m] is not None and scored_counts[m] < answered + ] + if undersampled: + logger.warning( + "Leaderboard: variant '%s' (%s) has under-sampled metrics " + "(mean over fewer than %d answered questions): %s", + name, agent_md_file, answered, undersampled, + ) rows.append({ "name": name, @@ -394,7 +430,8 @@ def _benchmarking(record: Dict[str, Any]) -> Dict[str, Any]: "metrics": metrics, "primary_score": metrics[primary_metric], "incomplete": incomplete, - "query_count": len(record.get("single_question_results") or {}), + "query_count": answered, + "scored_counts": scored_counts, }) ragas_settings = (bench.get("mode_settings", {}) or {}).get("ragas_settings", {}) or {} @@ -1032,15 +1069,24 @@ def run(self): ) for row in leaderboard["rows"]: m = row["metrics"] - def _fmt(v: Optional[float]) -> str: - return f"{v:.4f}" if isinstance(v, float) else " n/a" + answered = row["query_count"] + scored = row.get("scored_counts", {}) + # Annotate a metric with @ when its mean is over fewer than + # the answered questions (judge timeouts), so an under-sampled + # score can't masquerade as fully-backed. + def _fmt(metric_name: str) -> str: + v = m[metric_name] + if not isinstance(v, float): + return " n/a" + n = scored.get(metric_name, answered) + return f"{v:.4f}@{n}" if n < answered else f"{v:.4f}" flag = " (incomplete)" if row["incomplete"] else "" logger.info( - " %-4d %-28s %-10s %-10s %-10s %-10s %-10d %s%s", + " %-4d %-28s %-12s %-12s %-12s %-12s %-10d %s%s", row["rank"], row["name"][:28], - _fmt(m["answer_relevancy"]), _fmt(m["faithfulness"]), - _fmt(m["context_precision"]), _fmt(m["context_recall"]), - row["query_count"], row["agent_md_file"], flag, + _fmt("answer_relevancy"), _fmt("faithfulness"), + _fmt("context_precision"), _fmt("context_recall"), + answered, row["agent_md_file"], flag, ) # Push to Argilla when ARCHI_ARGILLA=1 in the benchmarks container env. diff --git a/tests/unit/test_prompt_sweep_leaderboard.py b/tests/unit/test_prompt_sweep_leaderboard.py index a805f75f..400b4d4f 100644 --- a/tests/unit/test_prompt_sweep_leaderboard.py +++ b/tests/unit/test_prompt_sweep_leaderboard.py @@ -173,6 +173,35 @@ def test_nan_metric_marks_incomplete(): assert lb["rows"][-1]["name"] == "nan" +# -- scored_counts: per-metric sample size behind each mean ------------------ + +def test_scored_counts_reflect_non_nan_per_question(): + """A judge timeout shrinks a metric's sample without making the aggregate + NaN; scored_counts must report the real (non-NaN) per-metric count while + query_count stays the answered count.""" + rec = _make_record("v", "p/v.md", faithfulness=0.6, context_precision=0.5, n_questions=3) + qs = list(rec["single_question_results"].values()) + for q in qs: # all three fully scored on three metrics + q["faithfulness"] = 0.6 + q["answer_relevancy"] = 0.8 + q["context_recall"] = 0.7 + # context_precision scored on only the first question; other two timed out + qs[0]["context_precision"] = 0.5 + qs[1]["context_precision"] = float("nan") + qs[2]["context_precision"] = float("nan") + + ResultHandler.results = [rec] + row = ResultHandler.build_leaderboard()["rows"][0] + + assert row["query_count"] == 3 # answered + assert row["scored_counts"]["faithfulness"] == 3 + assert row["scored_counts"]["answer_relevancy"] == 3 + assert row["scored_counts"]["context_recall"] == 3 + assert row["scored_counts"]["context_precision"] == 1 # 2 timed out + # the aggregate is still a valid float, so the row is NOT marked incomplete + assert row["incomplete"] is False + + # -- 4.6 name fallback ------------------------------------------------------- def test_name_falls_back_to_prompt_stem(): From 7f9b419b5050fafbe95e80bcb45118a992a18217 Mon Sep 17 00:00:00 2001 From: Austin Swinney Date: Wed, 10 Jun 2026 14:04:14 -0400 Subject: [PATCH 5/7] fix(benchmark): lazy-import datasets/ragas so unit tests collect without them The leaderboard unit tests import service_benchmark, whose top-level 'from datasets import Dataset' and 'from ragas import ...' pulled heavy benchmark-only deps absent from the lean unit-test CI env, breaking collection (ModuleNotFoundError: datasets). Move those imports into the two methods that use them (get_ragas_results, run) so the module imports for its pure helpers (ResultHandler.build_leaderboard/dump) without datasets/ragas. The benchmarking Docker image still has both at runtime. Co-Authored-By: Claude Opus 4.8 --- src/bin/service_benchmark.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/bin/service_benchmark.py b/src/bin/service_benchmark.py index 3af35f3b..24395a18 100644 --- a/src/bin/service_benchmark.py +++ b/src/bin/service_benchmark.py @@ -14,15 +14,14 @@ import pandas as pd import yaml -from datasets import Dataset from langchain_huggingface import HuggingFaceEmbeddings from langchain_openai import ChatOpenAI, OpenAIEmbeddings from langchain_core.messages import AIMessage, HumanMessage, ToolMessage -from ragas import RunConfig, evaluate -from ragas.embeddings import LangchainEmbeddingsWrapper -from ragas.llms import LangchainLLMWrapper -from ragas.metrics import (answer_relevancy, context_precision, context_recall, - faithfulness) +# NOTE: `datasets` and `ragas` are heavy, benchmark-only deps that live in the +# benchmarking Docker image but NOT the lean unit-test environment. They are +# imported lazily inside the methods that use them (get_ragas_results, run) +# so that importing this module for its pure helpers (e.g. ResultHandler. +# build_leaderboard / dump, exercised by unit tests) does not require them. from src.archi.archi import archi from src.archi.pipelines.agents.agent_spec import AgentSpecError, load_agent_spec @@ -834,7 +833,14 @@ def get_source_results( def get_ragas_results(self, data, to_add): """WARNING: this method modifies the to_add dictionary to add the relevant scores to the relevant questions""" - + # Lazy import: ragas (and its transitive `datasets` dep) is benchmark-only + # and absent from the unit-test environment. See the module-header note. + from ragas import RunConfig, evaluate + from ragas.embeddings import LangchainEmbeddingsWrapper + from ragas.llms import LangchainLLMWrapper + from ragas.metrics import (answer_relevancy, context_precision, + context_recall, faithfulness) + all_metrics_dict = { 'answer_relevancy': answer_relevancy, 'faithfulness': faithfulness, @@ -1011,6 +1017,7 @@ def run(self): if "RAGAS" in modes_being_run: # TODO this is likely broken now logger.info(f"Starting to collect RAGAS results") + from datasets import Dataset # lazy: benchmark-only dep (see module header) data = Dataset.from_list(ragas_input) # were modifying final_addition here to add ragas results by question ragas_results = self.get_ragas_results(data, question_wise_results) From 60dd655d97e42680d75eb4e2e4e1534a90effb1b Mon Sep 17 00:00:00 2001 From: Austin Swinney Date: Wed, 10 Jun 2026 14:07:07 -0400 Subject: [PATCH 6/7] fix(cli): harden multi-config rendering against filename/basename collisions Address Copilot review on PR #22: - _render_config_target_name now loops until the name is unique instead of disambiguating only once, so a config can never silently overwrite an earlier rendered file even if {stem}_{index} is already taken. - _stage_agents rejects two benchmark configs whose agent_md_file share a basename (they'd overwrite each other when staged and when referenced by the rendered config) with a clear error instead of silently dropping one. Co-Authored-By: Claude Opus 4.8 --- src/cli/managers/templates_manager.py | 21 ++++++++++++++-- tests/unit/test_render_config_filename.py | 8 ++++++ tests/unit/test_stage_agents_multiconfig.py | 28 +++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/src/cli/managers/templates_manager.py b/src/cli/managers/templates_manager.py index e3aa0583..994f31d1 100644 --- a/src/cli/managers/templates_manager.py +++ b/src/cli/managers/templates_manager.py @@ -37,8 +37,13 @@ def _render_config_target_name( return "config.yaml" stem = str(benchmarking_name or top_level_name) candidate = f"{stem}.yaml" - if candidate in used_names: - candidate = f"{stem}_{index}.yaml" + # Disambiguate against ALL previously-used names, not just once: keep + # bumping the suffix until the name is unique so a config can never + # silently overwrite an earlier rendered file. + suffix = index + while candidate in used_names: + candidate = f"{stem}_{suffix}.yaml" + suffix += 1 used_names.add(candidate) return candidate @@ -190,6 +195,10 @@ def _stage_agents(self, context: TemplateContext) -> None: # A multi-config sweep names a distinct agent_md_file per config; stage # every one so the benchmarker can load each variant (not just the first). dst_dir.mkdir(parents=True, exist_ok=True) + # Agent files are staged (and later referenced by the rendered config) + # by basename, so two configs whose agent_md_file share a basename + # would silently overwrite each other. Detect and reject that. + staged_by_basename: Dict[str, Path] = {} for bench_config in context.config_manager.get_configs(): bench_services = bench_config.get("services", {}) or {} benchmark_cfg = bench_services.get("benchmarking", {}) or {} @@ -206,6 +215,14 @@ def _stage_agents(self, context: TemplateContext) -> None: raise ValueError(f"Benchmark agent file not found: {source_path}") if source_path.suffix.lower() != ".md": raise ValueError(f"Benchmark agent file must be a .md file: {source_path}") + prior = staged_by_basename.get(source_path.name) + if prior is not None and prior != source_path: + raise ValueError( + "Two benchmark configs reference different agent files with the " + f"same basename '{source_path.name}' ({prior} vs {source_path}); " + "they would overwrite each other when staged. Rename one." + ) + staged_by_basename[source_path.name] = source_path shutil.copyfile(source_path, dst_dir / source_path.name) return diff --git a/tests/unit/test_render_config_filename.py b/tests/unit/test_render_config_filename.py index 715e2e32..bf0bad2a 100644 --- a/tests/unit/test_render_config_filename.py +++ b/tests/unit/test_render_config_filename.py @@ -41,6 +41,14 @@ def test_collision_disambiguated_by_index(): assert first != second +def test_repeated_collisions_stay_unique(): + """Disambiguation must keep bumping until unique, not collide a second time.""" + used = set() + names = [_render_config_target_name(False, "rb", "dup", i, used) for i in range(4)] + assert len(set(names)) == 4 # all distinct, none overwrites another + assert names[0] == "dup.yaml" + + def test_multi_falls_back_to_top_level_name_when_no_benchmarking_name(): used = set() assert ( diff --git a/tests/unit/test_stage_agents_multiconfig.py b/tests/unit/test_stage_agents_multiconfig.py index 1adbb7e3..f18462ba 100644 --- a/tests/unit/test_stage_agents_multiconfig.py +++ b/tests/unit/test_stage_agents_multiconfig.py @@ -55,3 +55,31 @@ def get_configs(self): staged = sorted(p.name for p in (base_dir / "data" / "agents").iterdir()) assert staged == sorted(names) + + +def test_same_basename_different_files_is_rejected(tmp_path, monkeypatch): + """Two configs whose agent files share a basename would overwrite each + other when staged (and when referenced by the rendered config), so the + deployment must reject it instead of silently dropping one variant.""" + import pytest + + (tmp_path / "a").mkdir() + (tmp_path / "b").mkdir() + (tmp_path / "a" / "prompt.md").write_text("# a\n") + (tmp_path / "b" / "prompt.md").write_text("# b\n") + monkeypatch.chdir(tmp_path) + + configs = [_config(tmp_path, "a/prompt.md"), _config(tmp_path, "b/prompt.md")] + base_dir = tmp_path / "deploy" + base_dir.mkdir() + + class _CM: + config = configs[0] + + def get_configs(self): + return configs + + context = SimpleNamespace(config_manager=_CM(), base_dir=base_dir, benchmarking=True) + mgr = object.__new__(TemplateManager) + with pytest.raises(ValueError, match="same basename"): + mgr._stage_agents(context) From ef60297e17552d4632fb027a21a5a06f4fc45c2d Mon Sep 17 00:00:00 2001 From: Austin Swinney Date: Wed, 10 Jun 2026 14:14:58 -0400 Subject: [PATCH 7/7] fix(benchmark): defer secret/DB init to script entry, not module import After lazy-importing datasets/ragas, collection still failed in CI because importing service_benchmark ran module-level read_secret() calls and opened a Postgres connection pool (PostgresServiceFactory.from_env), which can't connect in the DB-less unit-test env. Move that initialization into _init_runtime(), called only from __main__, so importing the module for ResultHandler's pure helpers needs neither live secrets nor a database. Co-Authored-By: Claude Opus 4.8 --- src/bin/service_benchmark.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/bin/service_benchmark.py b/src/bin/service_benchmark.py index 24395a18..ada51532 100644 --- a/src/bin/service_benchmark.py +++ b/src/bin/service_benchmark.py @@ -39,13 +39,22 @@ setup_logging() logger = get_logger(__name__) -os.environ['OPENAI_API_KEY'] = read_secret("OPENAI_API_KEY") -os.environ['ANTHROPIC_API_KEY'] = read_secret("ANTHROPIC_API_KEY") -os.environ['HUGGING_FACE_HUB_TOKEN'] = read_secret("HUGGING_FACE_HUB_TOKEN") -os.environ['HUIT_API_KEY'] = read_secret("HUIT_API_KEY") -factory = PostgresServiceFactory.from_env(password_override=os.environ.get("PG_PASSWORD")) -PostgresServiceFactory.set_instance(factory) +def _init_runtime() -> None: + """Load secrets into the environment and open the Postgres connection pool. + + Called only when this module is run as a script (see __main__), NOT at import + time — importing the module for its pure helpers (e.g. ResultHandler. + build_leaderboard, exercised by unit tests) must not require live secrets or + a reachable database. + """ + os.environ['OPENAI_API_KEY'] = read_secret("OPENAI_API_KEY") + os.environ['ANTHROPIC_API_KEY'] = read_secret("ANTHROPIC_API_KEY") + os.environ['HUGGING_FACE_HUB_TOKEN'] = read_secret("HUGGING_FACE_HUB_TOKEN") + os.environ['HUIT_API_KEY'] = read_secret("HUIT_API_KEY") + + factory = PostgresServiceFactory.from_env(password_override=os.environ.get("PG_PASSWORD")) + PostgresServiceFactory.set_instance(factory) @dataclass @@ -1304,7 +1313,9 @@ def wait_for_ingestion_completion(self): if __name__ == "__main__": - query_file = Path("QandA.txt") + _init_runtime() + + query_file = Path("QandA.txt") configs_folder = Path('configs') with open(Path(query_file), "r") as f: