Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ _build

dbl.opam
.ocamlformat

runner/__pycache__/
138 changes: 138 additions & 0 deletions runner/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""Common utilities for test runners."""

# Default timeout for tests (in seconds)
TIMEOUT = 5.0

# REPL prompt
PROMPT = ">"
Comment on lines +3 to +7
Copy link

Copilot AI Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The REPL prompt is hardcoded as ">". If the actual REPL uses a different prompt or includes additional characters (like spaces, colors, or other formatting), the prompt detection will fail. Consider making the prompt configurable or documenting the expected prompt format.

Suggested change
# Default timeout for tests (in seconds)
TIMEOUT = 5.0
# REPL prompt
PROMPT = ">"
import os
# Default timeout for tests (in seconds)
TIMEOUT = 5.0
# REPL prompt (configurable via REPL_PROMPT environment variable)
PROMPT = os.environ.get("REPL_PROMPT", ">")

Copilot uses AI. Check for mistakes.


def _parse_expectation_line(line):
"""Parses a single expectation line (# @...).

Supports:
# @stdout: text - expect text in stdout
# @stderr: text - expect text in stderr
# @stdout-empty - expect empty stdout
# @stderr-empty - expect empty stderr

Args:
line: Line to parse

Returns:
Tuple (kind, text) or (None, None) if not an expectation line
"""
if not line.startswith("# @"):
return None, None

rest = line[3:].strip()

# Check for empty expectations
if rest in ("stdout-empty", "stderr-empty"):
return rest, ""

if ":" not in rest:
return None, None

kind, text = rest.split(":", 1)
return kind.strip(), text.strip()

def parse_tests(path):
    """Parses REPL test file with code lines and # @ expectations.

    Format:
        code_line_1
        # @stdout: expected_output
        # @stderr: expected_error
        # @stdout-empty (expect empty stdout)
        # @stderr-empty (expect empty stderr)
        code_line_2
        ...

    Note: blank lines are ignored, so an intentionally empty input line
    cannot be expressed in this format.

    Args:
        path: Path to test file

    Returns:
        List of tests, each with 'code' and 'expect' keys
    """
    def new_case(code_line):
        # Each test starts with a code line and an empty expectation set.
        return {
            "code": code_line,
            "expect": {"stdout": [], "stderr": [], "stdout-empty": False, "stderr-empty": False},
        }

    collected = []
    pending = None

    with open(path) as handle:
        for raw in handle:
            stripped = raw.rstrip()
            if not stripped:
                continue

            if stripped.startswith("#"):
                # Annotations attach to the most recent code line; ones that
                # appear before any code line are silently dropped.
                if pending is None:
                    continue
                kind, text = _parse_expectation_line(stripped)
                if not kind:
                    continue
                if kind in ("stdout-empty", "stderr-empty"):
                    pending["expect"][kind] = True
                elif kind in pending["expect"]:
                    pending["expect"][kind].append(text)
            else:
                # A new code line flushes the previous test and opens a new one.
                if pending is not None:
                    collected.append(pending)
                pending = new_case(stripped)

    if pending is not None:
        collected.append(pending)
    return collected

def parse_expectations(file):
    """Flattens every test's expectations in *file* into one dictionary.

    All expected stdout/stderr snippets are concatenated, and the empty-stream
    flags are combined with OR.

    NOTE(review): because of the OR, a single # @stdout-empty / # @stderr-empty
    anywhere in the file marks the whole stream as expected-empty even if other
    tests expect output on it — confirm this aggregation is intended.

    Args:
        file: Path to test file

    Returns:
        Dictionary with 'stdout', 'stderr', 'stdout-empty', 'stderr-empty' keys
    """
    merged = {"stdout": [], "stderr": [], "stdout-empty": False, "stderr-empty": False}
    for case in parse_tests(file):
        expect = case["expect"]
        merged["stdout"] += expect["stdout"]
        merged["stderr"] += expect["stderr"]
        if expect["stdout-empty"]:
            merged["stdout-empty"] = True
        if expect["stderr-empty"]:
            merged["stderr-empty"] = True
    return merged

def check_expectations(expectations, stdout, stderr):
    """Checks if output matches expectations for both stdout and stderr.

    Supports:
      - Checking for presence of text in stdout/stderr
      - Checking for empty stdout/stderr

    Args:
        expectations: Dictionary with 'stdout', 'stderr', 'stdout-empty', 'stderr-empty' keys
        stdout: Actual stdout output
        stderr: Actual stderr output

    Returns:
        True if all expectations are met, False otherwise
    """
    # Empty-stream expectations: any non-whitespace output is a failure.
    if expectations["stdout-empty"] and stdout.strip():
        print(f"[FAILED] Expected empty stdout, but got:\n{stdout}")
        return False

    if expectations["stderr-empty"] and stderr.strip():
        print(f"[FAILED] Expected empty stderr, but got:\n{stderr}")
        return False

    # Presence expectations: every snippet must occur somewhere in its stream.
    for needle in expectations["stdout"]:
        if needle not in stdout:
            print(f"[FAILED stdout] '{needle}' not in:\n{stdout}")
            return False

    for needle in expectations["stderr"]:
        if needle not in stderr:
            print(f"[FAILED stderr] '{needle}' not in:\n{stderr}")
            return False

    return True

41 changes: 41 additions & 0 deletions runner/functional.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import subprocess
import time

from common import TIMEOUT, parse_expectations, check_expectations

def run_program_test(binary, flags, file, expect_exit, timeout=TIMEOUT):
    """Runs program test and checks exit code and output expectations.

    Args:
        binary: Path to executable (must be directly executable — the command
            is no longer routed through a shell)
        flags: Command line flags (split with shell-like quoting rules)
        file: Test file path
        expect_exit: Expected exit code
        timeout: Timeout in seconds (default: TIMEOUT constant)

    Returns:
        True if exit code and all output expectations match, False otherwise.
    """
    import shlex  # local import: keeps this fix self-contained

    # Build an argv list instead of an f-string + shell=True so that file
    # names or flags containing shell metacharacters (spaces, ';', '$', ...)
    # cannot be interpreted by a shell (injection / quoting bugs).
    cmd = [binary] + shlex.split(flags) + [file]
    expectations = parse_expectations(file)

    try:
        start_time = time.time()
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - start_time
        print(f"TIMEOUT: {file} (exceeded {timeout}s timeout, ran for {elapsed:.2f}s)")
        return False

    # Check exit code
    if proc.returncode != expect_exit:
        print(f"BAD EXIT: {file} got {proc.returncode} expected {expect_exit}")
        print(proc.stderr)
        return False

    # Check expectations
    return check_expectations(expectations, proc.stdout, proc.stderr)
84 changes: 84 additions & 0 deletions runner/repl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import subprocess
import time
import shlex

from common import TIMEOUT, PROMPT, parse_tests, check_expectations

def read_until_prompt(proc, timeout=TIMEOUT):
    """Reads output until prompt is found or timeout occurs.

    Reads proc.stdout one character at a time and accumulates output until it
    ends with PROMPT. The returned string still includes the trailing prompt.

    Args:
        proc: Process whose stdout is a text stream
        timeout: Maximum seconds to wait for the prompt

    Returns:
        Accumulated output, including the trailing prompt.

    Raises:
        EOFError: stdout closed before the prompt appeared (previously this
            was misreported as a TimeoutError, hiding the real cause).
        TimeoutError: prompt not seen within *timeout* seconds.
        OSError: underlying read failures now propagate instead of being
            swallowed and relabeled as timeouts.
    """
    output = ""
    start_time = time.time()

    while time.time() - start_time < timeout:
        c = proc.stdout.read(1)
        if not c:
            # Distinguish a closed stream from a slow one.
            raise EOFError(
                f"EOF while waiting for prompt (got {len(output)} chars)"
            )
        output += c
        if output.endswith(PROMPT):
            return output

    elapsed = time.time() - start_time
    raise TimeoutError(f"Timeout waiting for prompt after {timeout}s (elapsed: {elapsed:.2f}s, got {len(output)} chars)")

def maybe_read_stderr(proc, timeout=0.1):
    """Attempts to read one line of stderr, returning "" if none is ready.

    Best-effort: on platforms where select() cannot poll pipe handles
    (e.g. Windows), or on any polling error, this silently yields "".
    """
    try:
        import select
        readable, _, _ = select.select([proc.stderr], [], [], timeout)
        if readable:
            return proc.stderr.readline()
        return ""
    except Exception:
        return ""
Comment on lines +26 to +33
Copy link

Copilot AI Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 'import select' statement is inside the function and wrapped in a try-except block. If the 'select' module is not available (e.g., on Windows), this function will silently return an empty string without any indication that stderr reading is not working. Consider moving the import to the top of the file to fail fast if the module is unavailable, or add a warning log when falling back to the empty string.

Copilot uses AI. Check for mistakes.

def _run_single_test(proc, test, timeout):
    """Runs a single REPL test, returns True if passes.

    Args:
        proc: Running REPL process with pipes attached
        test: Dict with 'code' and 'expect' keys (see parse_tests)
        timeout: Seconds to wait for the REPL prompt
    """
    try:
        # Feed the code line to the REPL and wait for it to return to the
        # prompt before collecting the output.
        proc.stdin.write(test["code"] + "\n")
        proc.stdin.flush()

        stdout = read_until_prompt(proc, timeout=timeout)
        stderr = maybe_read_stderr(proc, timeout=0.1)

        return check_expectations(test["expect"], stdout, stderr)
    except TimeoutError as e:
        print(f"[TIMEOUT] Test '{test['code']}': {e}")
        return False
    except Exception as e:
        # Include the exception type so distinct failure modes (EOFError,
        # BrokenPipeError, decode errors, ...) are distinguishable in logs.
        print(f"[ERROR] Test '{test['code']}': {type(e).__name__}: {e}")
        return False


def _close_process(proc, timeout):
"""Gracefully closes REPL process."""
try:
proc.terminate()
proc.wait(timeout=timeout)
except Exception:
proc.kill()
Comment on lines +58 to +59
Copy link

Copilot AI Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another bare 'except Exception' that silently handles all errors. The exception handler on lines 58-59 during process termination catches all exceptions without logging or distinguishing between different error types. This could hide important cleanup failures.

Suggested change
except Exception:
proc.kill()
except subprocess.TimeoutExpired:
# If the process does not exit in time, force kill it.
proc.kill()
except Exception as e:
# Log unexpected termination errors, then ensure the process is killed.
print(f"[WARN] Failed to terminate process cleanly: {e}")
proc.kill()

Copilot uses AI. Check for mistakes.

def run_repl_test(binary, flags, file, timeout=TIMEOUT):
    """Runs REPL tests from file, returns True if all pass."""
    argv = [binary] + shlex.split(flags)

    proc = subprocess.Popen(
        argv,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    )

    try:
        cases = parse_tests(file)
        # The REPL prints a prompt on startup; discard it before sending input.
        read_until_prompt(proc, timeout=timeout)

        # Stop at the first failing case; remaining cases are not run.
        for case in cases:
            if not _run_single_test(proc, case, timeout):
                return False

        return True
    finally:
        _close_process(proc, timeout)
113 changes: 113 additions & 0 deletions runner/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import subprocess
import sys
import glob as _glob

from functional import run_program_test
from repl import run_repl_test


class TestRunner:
    """Manages test execution and result tracking."""

    def __init__(self, binary):
        self.binary = binary  # path to the executable under test
        self.flags = ""       # current command-line flags
        self.total = 0        # number of tests executed
        self.passed = 0       # number of tests that passed

    @staticmethod
    def glob(pattern):
        """Returns sorted list of files matching pattern."""
        return sorted(_glob.glob(pattern))

    def _record_test(self, success):
        """Records test result."""
        self.total += 1
        if success:
            self.passed += 1

    def with_flags(self, flags_str, test_fn):
        """Temporarily changes flags for test execution."""
        old_flags = self.flags
        self.flags = flags_str
        try:
            test_fn()
        finally:
            self.flags = old_flags

    def simple(self, file):
        """Runs test expecting exit code 0."""
        result = run_program_test(self.binary, self.flags, file, expect_exit=0)
        self._record_test(result)

    def exit_code(self, code, file):
        """Runs test expecting specific exit code."""
        result = run_program_test(self.binary, self.flags, file, expect_exit=code)
        self._record_test(result)

    def repl(self, file):
        """Runs REPL test."""
        result = run_repl_test(self.binary, self.flags, file)
        self._record_test(result)

    def _run_globbed(self, pattern, flags, run_one):
        """Runs run_one(file) for every glob match, under temporary flags.

        Deduplicates the save/set/restore-flags logic that simple_run_tests
        and repl_tests previously reimplemented instead of reusing with_flags.
        """
        def run_all():
            for file in TestRunner.glob(pattern):
                run_one(file)
        self.with_flags(flags, run_all)

    def simple_run_tests(self, test_files_name, flags=""):
        """Runs every matching file as an exit-code-0 test."""
        self._run_globbed(test_files_name, flags, self.simple)

    def repl_tests(self, test_files_name, flags=""):
        """Runs every matching file as a REPL test, printing a header each."""
        def run_one(file):
            print(f"\n--- REPL TEST: {file} ---\n")
            self.repl(file)
        self._run_globbed(test_files_name, flags, run_one)
Comment on lines +53 to +70
Copy link

Copilot AI Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code duplication between 'simple_run_tests' and 'repl_tests' methods is significant. Both methods follow the same pattern: save old flags, set new flags, iterate over files, call a test method, and restore flags. This should be refactored into a generic helper method to improve maintainability.

Suggested change
def simple_run_tests(self, test_files_name, flags=""):
old_flags = self.flags
self.flags = flags
try:
for file in TestRunner.glob(test_files_name):
self.simple(file)
finally:
self.flags = old_flags
def repl_tests(self, test_files_name, flags=""):
old_flags = self.flags
self.flags = flags
try:
for file in TestRunner.glob(test_files_name):
print(f"\n--- REPL TEST: {file} ---\n")
self.repl(file)
finally:
self.flags = old_flags
def _run_tests(self, test_files_name, flags, per_file_fn):
"""Generic helper to run tests over globbed files with temporary flags."""
def run_all():
for file in TestRunner.glob(test_files_name):
per_file_fn(file)
self.with_flags(flags, run_all)
def simple_run_tests(self, test_files_name, flags=""):
"""Runs simple tests (expecting exit code 0) over all matching files."""
def per_file(file):
self.simple(file)
self._run_tests(test_files_name, flags, per_file)
def repl_tests(self, test_files_name, flags=""):
"""Runs REPL tests over all matching files, printing a header for each."""
def per_file(file):
print(f"\n--- REPL TEST: {file} ---\n")
self.repl(file)
self._run_tests(test_files_name, flags, per_file)

Copilot uses AI. Check for mistakes.



def _build_target(program_name):
    """Builds target using dune, returns path or None on failure."""
    target = f"src/{program_name}.exe"
    build = subprocess.run(["dune", "build", target])
    if build.returncode != 0:
        return None
    return f"_build/default/{target}"


def main():
    """Main entry point for test runner."""
    if len(sys.argv) != 3:
        print("USAGE: runner.py PROGRAM TEST_SUITE")
        return 1

    binary = _build_target(sys.argv[1])
    if not binary:
        return 1

    runner = TestRunner(binary)

    # Names exposed to the test-suite script. NOTE: the suite file is
    # executed as trusted Python via exec() — do not point this at
    # untrusted input.
    suite_env = {
        "simple": runner.simple,
        "exit_code": runner.exit_code,
        "with_flags": runner.with_flags,
        "glob": TestRunner.glob,
        "repl": runner.repl,
        "simple_run_tests": runner.simple_run_tests,
        "repl_tests": runner.repl_tests,
    }

    with open(sys.argv[2]) as suite_file:
        suite_source = suite_file.read()
    exec(suite_source, suite_env)

    print(f"\nPassed: {runner.passed}/{runner.total}")
    if runner.passed == runner.total:
        return 0
    return 1


if __name__ == "__main__":
sys.exit(main())
7 changes: 7 additions & 0 deletions test/repl/non_fatal_error0001.fram
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
let f (x : Unit -> Unit) = x () ;;

f () ;;
# @stderr: error

42;;
# @stdout: 42
Loading