Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion docs/experiments.rst
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,42 @@ kwarg to ``"overwrite"``::
save_mode="overwrite"
)

Saving Evaluation Results as CSVs
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Whenever ``save_dir`` is set, ``pt.Experiment`` also writes two CSV summary files to that directory:

- ``aggregated.csv`` — one row per system with a ``name`` column followed by one column per evaluation measure (mirrors the default ``pt.Experiment`` return value).
- ``perquery.csv`` — a long-format table with columns ``name``, ``qid``, ``measure``, and ``value``, giving per-query metric values for every system.

These files are always written regardless of the value of the ``perquery`` kwarg.

If either CSV file already exists (e.g. from a previous call that evaluated ``TF_IDF`` and ``BM25``), rows for systems that are *not* part of the current experiment are loaded from the existing file and merged into the new output. This means that results accumulate across separate experiment calls made to the same ``save_dir``, so no previously-evaluated system's data is lost when a subsequent experiment evaluates only a subset of systems::

# First run: evaluates TF_IDF and BM25; writes TF_IDF.res.gz, BM25.res.gz, aggregated.csv, perquery.csv
pt.Experiment(
[tfidf, bm25],
dataset.get_topics(),
dataset.get_qrels(),
eval_metrics=["map"],
names=["TF_IDF", "BM25"],
save_dir="./runs",
)

# Second run: evaluates PL2 only; TF_IDF and BM25 rows are preserved in the CSV files
pt.Experiment(
[pl2],
dataset.get_topics(),
dataset.get_qrels(),
eval_metrics=["map"],
names=["PL2"],
save_dir="./runs",
)
# ./runs/aggregated.csv now contains rows for TF_IDF, BM25, and PL2
# ./runs/perquery.csv now contains per-query rows for all three systems

Re-running an existing system replaces its rows rather than duplicating them, so the CSV files always contain exactly one row per unique system name in ``aggregated.csv`` (and one row per system/query/measure combination in ``perquery.csv``).

Missing Topics and/or Qrels
~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down Expand Up @@ -361,4 +397,4 @@ Consider the following example::

Normally, the BM25 retriever would be invoked twice during this experiment - once for each pipeline, resulting in a slower execution time compared to an imperative workflow (get BM25 results, evaluate, apply monoT5, evaluate). By setting ``precompute_prefix=True``, ``pt.Experiment`` will execute the ``bm25`` transformer only once on the input topics, and then reuse those results as input to monoT5.

NB: This is experimental functionality, but should initial usage be successful, it may be turned on by default in future versions of PyTerrier.
NB: This is experimental functionality, but should initial usage be successful, it may be turned on by default in future versions of PyTerrier.
29 changes: 29 additions & 0 deletions pyterrier/_evaluation/_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ def Experiment(
:param save_dir: If set to the name of a directory, the results of each transformer will be saved in TREC-formatted results file, whose
filename is based on the systems names (as specified by ``names`` kwarg). If the file exists and ``save_mode`` is set to "reuse", then the file
will be used for evaluation rather than the transformer. Default is None, such that saving and loading from files is disabled.
In addition, two CSV summary files are written to ``save_dir`` on every call: ``aggregated.csv`` (one row per system, one column per measure)
and ``perquery.csv`` (long-format table with columns ``name``, ``qid``, ``measure``, ``value``).
If either CSV already exists, rows for systems not in the current experiment are preserved, allowing results to accumulate
across multiple calls to ``pt.Experiment`` that each evaluate different subsets of systems.
:param save_mode: Defines how existing files are used when ``save_dir`` is set. If set to "reuse", then files will be preferred
over transformers for evaluation. If set to "overwrite", existing files will be replaced. If set to "warn" or "error", the presence of any
existing file will cause a warning or error, respectively. Default is "warn".
Expand Down Expand Up @@ -333,6 +337,31 @@ def Experiment(
pbar=pbar)
renderer.add_metrics(sysid, evalMeasuresDict, time)

if save_dir is not None:
    # Always write the aggregated and per-query results as CSV summary files
    # alongside the per-system .res files, regardless of the `perquery` kwarg.
    current_names_set = set(names)
    aggregated_path = os.path.join(save_dir, "aggregated.csv")
    perquery_path = os.path.join(save_dir, "perquery.csv")

    # Fresh results for the systems evaluated in THIS call, as DataFrames:
    # one row per system (aggregated) and one row per system/query/measure (per-query).
    new_agg = renderer.averages(dataframe=True, mrt_needed=mrt_needed)
    new_pq = renderer.perquery(dataframe=True)

    # Merge-preserve: keep rows from a previous run for any system NOT in this
    # experiment, so results accumulate across calls sharing the same save_dir.
    # Rows whose "name" IS in the current experiment are dropped from the old
    # file and replaced by the fresh values (no duplicates on re-run).
    if os.path.exists(aggregated_path):
        old_agg = pd.read_csv(aggregated_path)
        old_agg = old_agg[~old_agg["name"].isin(current_names_set)]
        if not old_agg.empty:
            # New rows first, then surviving old rows; index is rebuilt.
            new_agg = pd.concat([new_agg, old_agg], ignore_index=True)

    if os.path.exists(perquery_path):
        old_pq = pd.read_csv(perquery_path)
        old_pq = old_pq[~old_pq["name"].isin(current_names_set)]
        if not old_pq.empty:
            new_pq = pd.concat([new_pq, old_pq], ignore_index=True)

    # Overwrite both CSVs with the merged tables (row index not persisted).
    new_agg.to_csv(aggregated_path, index=False)
    new_pq.to_csv(perquery_path, index=False)

if not perquery:
return renderer.averages(dataframe=dataframe, highlight=highlight, mrt_needed=mrt_needed)

Expand Down
52 changes: 52 additions & 0 deletions tests/test_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,58 @@ def test_save(self):
# a successful experiment using save_dir should be faster
self.assertTrue(df2.iloc[0]["mrt"] < df1.iloc[0]["mrt"])

def test_save_csv(self):
    """aggregated.csv and perquery.csv are written to save_dir, accumulate rows
    across experiments over different systems, and replace (not duplicate) rows
    when a system is re-run."""
    index = self._vaswani_index()
    systems = [
        pt.terrier.Retriever(index, wmodel="DPH"),
        pt.terrier.Retriever(index, wmodel="BM25"),
    ]
    dataset = pt.datasets.get_dataset("vaswani")
    topics = dataset.get_topics().head(10)
    qrels = dataset.get_qrels()

    # First experiment: two systems.
    pt.Experiment(systems, topics, qrels, eval_metrics=["map"], save_dir=self.test_dir, names=["DPH", "BM25"])

    agg_path = os.path.join(self.test_dir, "aggregated.csv")
    pq_path = os.path.join(self.test_dir, "perquery.csv")

    self.assertTrue(os.path.exists(agg_path), "aggregated.csv not found")
    self.assertTrue(os.path.exists(pq_path), "perquery.csv not found")

    # Aggregated: one row per system, with a name column and one column per measure.
    agg = pd.read_csv(agg_path)
    self.assertEqual(2, len(agg), "aggregated.csv should have one row per system")
    for col in ("name", "map"):
        self.assertIn(col, agg.columns)
    self.assertEqual({"DPH", "BM25"}, set(agg["name"].tolist()))

    # Per-query: long format with the four expected columns.
    pq = pd.read_csv(pq_path)
    for col in ("name", "qid", "measure", "value"):
        self.assertIn(col, pq.columns)
    # 2 systems × 10 topics × 1 measure = 20 rows
    self.assertEqual(20, len(pq))

    # Second experiment evaluates only PL2; earlier DPH/BM25 rows must survive.
    pl2 = pt.terrier.Retriever(index, wmodel="PL2")
    pt.Experiment([pl2], topics, qrels, eval_metrics=["map"], save_dir=self.test_dir, names=["PL2"])

    agg = pd.read_csv(agg_path)
    self.assertEqual(3, len(agg), "aggregated.csv should retain rows from previous runs")
    self.assertEqual({"DPH", "BM25", "PL2"}, set(agg["name"].tolist()))

    pq = pd.read_csv(pq_path)
    # 3 systems × 10 topics × 1 measure = 30 rows
    self.assertEqual(30, len(pq))
    self.assertEqual({"DPH", "BM25", "PL2"}, set(pq["name"].tolist()))

    # Third experiment re-runs BM25: its rows are replaced, never duplicated.
    pt.Experiment([systems[1]], topics, qrels, eval_metrics=["map"], save_dir=self.test_dir, names=["BM25"], save_mode="overwrite")

    agg = pd.read_csv(agg_path)
    self.assertEqual(3, len(agg), "re-running BM25 must not create duplicate rows")
    self.assertEqual({"DPH", "BM25", "PL2"}, set(agg["name"].tolist()))

def test_empty(self):
df1 = pt.new.ranked_documents([[1]]).head(0)
t1 = pt.Transformer.from_df(df1)
Expand Down
Loading