Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion docs/experiments.rst
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,42 @@ kwarg to ``"overwrite"``::
save_mode="overwrite"
)

Saving Evaluation Results as CSVs
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Whenever ``save_dir`` is set, ``pt.Experiment`` also writes two CSV summary files to that directory:

- ``aggregated.csv`` — one row per system with a ``name`` column followed by one column per evaluation measure (mirrors the default ``pt.Experiment`` return value).
- ``perquery.csv`` — a long-format table with columns ``name``, ``qid``, ``measure``, and ``value``, giving per-query metric values for every system.

These files are always written regardless of the value of the ``perquery`` kwarg.

If either CSV file already exists (e.g. from a previous call that evaluated ``TF_IDF`` and ``BM25``), rows for systems that are *not* part of the current experiment are loaded from the existing file and merged into the new output. This means that results accumulate across separate experiment calls made to the same ``save_dir``, so no previously-evaluated system's data is lost when a subsequent experiment evaluates only a subset of systems::

# First run: evaluates TF_IDF and BM25; writes TF_IDF.res.gz, BM25.res.gz, aggregated.csv, perquery.csv
pt.Experiment(
[tfidf, bm25],
dataset.get_topics(),
dataset.get_qrels(),
eval_metrics=["map"],
names=["TF_IDF", "BM25"],
save_dir="./runs",
)

# Second run: evaluates PL2 only; TF_IDF and BM25 rows are preserved in the CSV files
pt.Experiment(
[pl2],
dataset.get_topics(),
dataset.get_qrels(),
eval_metrics=["map"],
names=["PL2"],
save_dir="./runs",
)
# ./runs/aggregated.csv now contains rows for TF_IDF, BM25, and PL2
# ./runs/perquery.csv now contains per-query rows for all three systems

Re-running an existing system replaces its rows rather than duplicating them, so the CSV files always contain exactly one row per unique system name in ``aggregated.csv`` (and one row per system/query/measure combination in ``perquery.csv``).

Missing Topics and/or Qrels
~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down Expand Up @@ -361,4 +397,4 @@ Consider the following example::

Normally, the BM25 retriever would be invoked twice during this experiment - once for each pipeline, resulting in a slower execution time compared to an imperative workflow (get BM25 results, evaluate, apply monoT5, evaluate). By setting ``precompute_prefix=True``, ``pt.Experiment`` will execute the ``bm25`` transformer only once on the input topics, and then reuse those results as input to monoT5.

NB: This is experimental functionality, but should initial usage be successful, it may be turned on by default in future versions of PyTerrier.
NB: This is experimental functionality, but should initial usage be successful, it may be turned on by default in future versions of PyTerrier.
29 changes: 29 additions & 0 deletions pyterrier/_evaluation/_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ def Experiment(
:param save_dir: If set to the name of a directory, the results of each transformer will be saved in TREC-formatted results file, whose
filename is based on the systems names (as specified by ``names`` kwarg). If the file exists and ``save_mode`` is set to "reuse", then the file
will be used for evaluation rather than the transformer. Default is None, such that saving and loading from files is disabled.
In addition, two CSV summary files are written to ``save_dir`` on every call: ``aggregated.csv`` (one row per system, one column per measure)
and ``perquery.csv`` (long-format table with columns ``name``, ``qid``, ``measure``, ``value``).
If either CSV already exists, rows for systems not in the current experiment are preserved, allowing results to accumulate
across multiple calls to ``pt.Experiment`` that each evaluate different subsets of systems.
:param save_mode: Defines how existing files are used when ``save_dir`` is set. If set to "reuse", then files will be preferred
over transformers for evaluation. If set to "overwrite", existing files will be replaced. If set to "warn" or "error", the presence of any
existing file will cause a warning or error, respectively. Default is "warn".
Expand Down Expand Up @@ -333,6 +337,31 @@ def Experiment(
pbar=pbar)
renderer.add_metrics(sysid, evalMeasuresDict, time)

if save_dir is not None:
    # Always write the aggregated and per-query results as CSV summary files
    # alongside the per-system .res files, regardless of the `perquery` kwarg.
    current_names_set = set(names)
    aggregated_path = os.path.join(save_dir, "aggregated.csv")
    perquery_path = os.path.join(save_dir, "perquery.csv")

    # Fresh results for the systems evaluated in THIS call, as DataFrames:
    # one row per system (aggregated) and one row per system/query/measure (per-query).
    new_agg = renderer.averages(dataframe=True, mrt_needed=mrt_needed)
    new_pq = renderer.perquery(dataframe=True)

    # Merge-preserve: keep rows from a previous run for any system NOT in this
    # experiment, so results accumulate across calls sharing the same save_dir.
    # Rows whose "name" IS in the current experiment are dropped from the old
    # file and replaced by the fresh values (no duplicates on re-run).
    if os.path.exists(aggregated_path):
        old_agg = pd.read_csv(aggregated_path)
        old_agg = old_agg[~old_agg["name"].isin(current_names_set)]
        if not old_agg.empty:
            # New rows first, then surviving old rows; index is rebuilt.
            new_agg = pd.concat([new_agg, old_agg], ignore_index=True)

    if os.path.exists(perquery_path):
        old_pq = pd.read_csv(perquery_path)
        old_pq = old_pq[~old_pq["name"].isin(current_names_set)]
        if not old_pq.empty:
            new_pq = pd.concat([new_pq, old_pq], ignore_index=True)

    # Overwrite both CSVs with the merged tables (row index not persisted).
    new_agg.to_csv(aggregated_path, index=False)
    new_pq.to_csv(perquery_path, index=False)

if not perquery:
return renderer.averages(dataframe=dataframe, highlight=highlight, mrt_needed=mrt_needed)

Expand Down
52 changes: 52 additions & 0 deletions tests/test_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,58 @@ def test_save(self):
# a successful experiment using save_dir should be faster
self.assertTrue(df2.iloc[0]["mrt"] < df1.iloc[0]["mrt"])

def test_save_csv(self):
    """aggregated.csv and perquery.csv are written to save_dir, accumulate rows
    across experiments over different systems, and replace (not duplicate) rows
    when a system is re-run."""
    index = self._vaswani_index()
    systems = [
        pt.terrier.Retriever(index, wmodel="DPH"),
        pt.terrier.Retriever(index, wmodel="BM25"),
    ]
    dataset = pt.datasets.get_dataset("vaswani")
    topics = dataset.get_topics().head(10)
    qrels = dataset.get_qrels()

    # First experiment: two systems.
    pt.Experiment(systems, topics, qrels, eval_metrics=["map"], save_dir=self.test_dir, names=["DPH", "BM25"])

    agg_path = os.path.join(self.test_dir, "aggregated.csv")
    pq_path = os.path.join(self.test_dir, "perquery.csv")

    self.assertTrue(os.path.exists(agg_path), "aggregated.csv not found")
    self.assertTrue(os.path.exists(pq_path), "perquery.csv not found")

    # Aggregated: one row per system, with a name column and one column per measure.
    agg = pd.read_csv(agg_path)
    self.assertEqual(2, len(agg), "aggregated.csv should have one row per system")
    for col in ("name", "map"):
        self.assertIn(col, agg.columns)
    self.assertEqual({"DPH", "BM25"}, set(agg["name"].tolist()))

    # Per-query: long format with the four expected columns.
    pq = pd.read_csv(pq_path)
    for col in ("name", "qid", "measure", "value"):
        self.assertIn(col, pq.columns)
    # 2 systems × 10 topics × 1 measure = 20 rows
    self.assertEqual(20, len(pq))

    # Second experiment evaluates only PL2; earlier DPH/BM25 rows must survive.
    pl2 = pt.terrier.Retriever(index, wmodel="PL2")
    pt.Experiment([pl2], topics, qrels, eval_metrics=["map"], save_dir=self.test_dir, names=["PL2"])

    agg = pd.read_csv(agg_path)
    self.assertEqual(3, len(agg), "aggregated.csv should retain rows from previous runs")
    self.assertEqual({"DPH", "BM25", "PL2"}, set(agg["name"].tolist()))

    pq = pd.read_csv(pq_path)
    # 3 systems × 10 topics × 1 measure = 30 rows
    self.assertEqual(30, len(pq))
    self.assertEqual({"DPH", "BM25", "PL2"}, set(pq["name"].tolist()))

    # Third experiment re-runs BM25: its rows are replaced, never duplicated.
    pt.Experiment([systems[1]], topics, qrels, eval_metrics=["map"], save_dir=self.test_dir, names=["BM25"], save_mode="overwrite")

    agg = pd.read_csv(agg_path)
    self.assertEqual(3, len(agg), "re-running BM25 must not create duplicate rows")
    self.assertEqual({"DPH", "BM25", "PL2"}, set(agg["name"].tolist()))

def test_empty(self):
df1 = pt.new.ranked_documents([[1]]).head(0)
t1 = pt.Transformer.from_df(df1)
Expand Down
Loading