4 changes: 4 additions & 0 deletions AGENTS.md
@@ -40,5 +40,9 @@ These are considered priority 0 issues for this repo, in addition to the normal

## Recent Learnings

- **`uv run` can inherit the wrong virtualenv in this repo** -> Clear `VIRTUAL_ENV` (for example `env -u VIRTUAL_ENV uv run ...`) when an unrelated environment is active -> Avoids misleading mismatch warnings and makes it clear the repo's `.venv` is the interpreter actually running the harnesses.
- **Realtime eval shared imports can resolve the wrong module under pytest** -> Add `shared/__init__.py` and ensure tests prepend `examples/evals/realtime_evals` to `sys.path` before importing `shared.*` -> Prevents collection failures caused by unrelated installed packages named `shared`.
- **Run-level grades can be overweighted by long simulations** -> Store turn-level grades on the matching turn and trace-level grades on one row per simulation instead of copying them onto every row -> Keeps `results.csv` row semantics intact and prevents summary means from favoring longer conversations.
- **Synthetic-audio scaffold requests can pick the wrong harness** -> Default unspecified synthetic-audio evals to `crawl` text-to-TTS and reserve `walk` for replay-specific audio traits like noise, telephony artifacts, or speaker characteristics -> Keeps new realtime evals on the simplest harness unless audio realism is itself under test.
- **Task-specific single-turn grading can outgrow the shared crawl schema** -> Keep the shared crawl harness for realtime execution, then add eval-local wrapper scripts that post-grade domain-specific quality and overwrite `results.csv` while preserving `results_base.csv` -> Avoids forking the harness when a use case needs richer grading than tool-call correctness.
- **Synthetic learner audio can sound like eval scaffolding** -> Write `user_text` as a realistic in-app learner request and keep evaluation rules in metadata plus the system prompt -> Produces audio inputs that match the product surface instead of teaching the model the grading rubric through the spoken prompt.
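The first learning above can be seen in a minimal shell sketch; the `/tmp/unrelated-venv` path is hypothetical, standing in for whatever unrelated environment happens to be active:

```shell
# Simulate an unrelated virtualenv being active in the current shell.
export VIRTUAL_ENV=/tmp/unrelated-venv

# `env -u VIRTUAL_ENV` strips the variable from the child's environment, so a
# command run this way (for example `env -u VIRTUAL_ENV uv run ...`) no longer
# sees the stale environment and can fall back to the repo's own .venv.
env -u VIRTUAL_ENV sh -c 'echo "VIRTUAL_ENV is ${VIRTUAL_ENV:-unset}"'
# prints: VIRTUAL_ENV is unset
```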
5 changes: 5 additions & 0 deletions authors.yaml
@@ -567,3 +567,8 @@ kathylau-oai:
name: "Kathy Lau"
website: "https://github.com/kathylau-oai"
avatar: "https://avatars.githubusercontent.com/u/247463782"

nsingaraju-oai:
name: "Nishanth Singaraju"
website: "https://github.com/nsingaraju-oai"
avatar: "https://avatars.githubusercontent.com/u/232978332"
202 changes: 192 additions & 10 deletions examples/Prompt_Caching_201.ipynb

Large diffs are not rendered by default.

214 changes: 0 additions & 214 deletions examples/codex/long_horizon_tasks.md

This file was deleted.

21 changes: 20 additions & 1 deletion examples/evals/realtime_evals/Makefile
@@ -7,13 +7,21 @@ ifeq ($(UV_BIN),)
RUFF := $(VENV_DIR)/bin/ruff
MYPY := $(VENV_DIR)/bin/mypy
PYTEST := $(VENV_DIR)/bin/pytest
STREAMLIT := $(VENV_DIR)/bin/streamlit
RUN_PYTHON := $(VENV_PYTHON)
else
RUFF := uv run --with ruff ruff
MYPY := uv run --with mypy --with pandas-stubs --with types-seaborn --with types-tqdm mypy
PYTEST := uv run --with pytest pytest
STREAMLIT := uv run --with streamlit streamlit
RUN_PYTHON := uv run python
endif

.PHONY: install format lint lint-fix typecheck test
.PHONY: install streamlit format lint lint-fix typecheck test validate-input validate-output

HARNESS ?=
DATA_PATH ?=
RUN_DIR ?=

install:
ifeq ($(UV_BIN),)
@@ -38,3 +46,14 @@ typecheck:

test:
$(PYTEST)

streamlit:
cd results_viewer && $(STREAMLIT) run app.py

validate-input:
@test -n "$(HARNESS)" || (echo "HARNESS is required, e.g. make validate-input HARNESS=run" && exit 2)
$(RUN_PYTHON) shared/scripts/validate_eval_input.py --harness $(HARNESS) $(if $(DATA_PATH),--data-path "$(DATA_PATH)",)

validate-output:
@test -n "$(HARNESS)" || (echo "HARNESS is required, e.g. make validate-output HARNESS=run" && exit 2)
$(RUN_PYTHON) shared/scripts/validate_eval_output.py --harness $(HARNESS) $(if $(RUN_DIR),--run-dir "$(RUN_DIR)",)
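The guard on the `validate-*` targets can be sketched in plain shell: `test -n` fails when the variable is empty, so the `||` branch prints the usage hint (inside make, the recipe additionally exits with status 2 to stop the build):

```shell
# Mirror of the Makefile guard, with HARNESS left empty to trigger the hint.
HARNESS=""
test -n "$HARNESS" || echo "HARNESS is required, e.g. make validate-input HARNESS=run"
# prints: HARNESS is required, e.g. make validate-input HARNESS=run
```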
23 changes: 23 additions & 0 deletions examples/evals/realtime_evals/README.md
@@ -33,6 +33,7 @@ Run a first command per harness. If uv is not installed, replace `uv run` with `
Use the root `Makefile` for common checks. Run `make install` first to create `.venv`. These targets work with or without `uv`: when `uv` is installed they run through `uv run`, and otherwise they use the matching tool binaries from the local `.venv`.

- `make install`
- `make streamlit`
- `make format`
- `make lint`
- `make lint-fix`
@@ -102,6 +103,28 @@ To render charts for an existing run after the fact:
uv run python plot_eval_results.py --run-dir run_harness/results/<run_id>
```

## Results Viewer

Use the Streamlit results viewer to browse saved runs from `crawl_harness`, `walk_harness`, and `run_harness` without opening the raw artifacts by hand.

- `Comparison View`: select a harness, choose one or more saved runs, and compare summary metrics, scores, latency, and token usage across runs.
- `Run Viewer`: inspect one saved run in detail. Crawl and walk runs show row-level audio artifacts and event logs; run-harness runs use a Simulation Viewer with transcripts, event logs, and turn audio.

Run it from this directory with either:

```bash
make streamlit
```

or:

```bash
cd results_viewer
uv run streamlit run app.py
```

Then open the local Streamlit URL, usually `http://localhost:8501`.

## Common CLI flags

All harnesses share a core set of flags so you can switch between them easily:
8 changes: 8 additions & 0 deletions examples/evals/realtime_evals/pyproject.toml
@@ -17,9 +17,17 @@ dev = [
"mypy",
"pandas-stubs",
"pytest",
"streamlit",
"types-seaborn",
"types-tqdm",
]

[tool.ruff.lint]
ignore = ["E402"]

[tool.mypy]
explicit_package_bases = true

[[tool.mypy.overrides]]
module = ["streamlit", "streamlit.*"]
ignore_missing_imports = true
3 changes: 3 additions & 0 deletions examples/evals/realtime_evals/requirements-dev.txt
@@ -1,5 +1,8 @@
# This file was autogenerated by uv via the following command:
# uv export --format requirements-txt --group dev --output-file requirements-dev.txt
altair==6.0.0 \
--hash=sha256:09ae95b53d5fe5b16987dccc785a7af8588f2dca50de1e7a156efa8a461515f8 \
--hash=sha256:614bf5ecbe2337347b590afb111929aa9c16c9527c4887d96c9bc7f6640756b4
annotated-types==0.7.0 \
--hash=sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53 \
--hash=sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89