Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ _build

dbl.opam
.ocamlformat

runner/__pycache__/
138 changes: 138 additions & 0 deletions runner/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""Common utilities for test runners."""

# Default timeout for tests (in seconds)
TIMEOUT = 5.0

# REPL prompt
PROMPT = ">"
Comment on lines +3 to +7
Copy link

Copilot AI Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The REPL prompt is hardcoded as ">". If the actual REPL uses a different prompt or includes additional characters (like spaces, colors, or other formatting), the prompt detection will fail. Consider making the prompt configurable or documenting the expected prompt format.

Suggested change
# Default timeout for tests (in seconds)
TIMEOUT = 5.0
# REPL prompt
PROMPT = ">"
import os
# Default timeout for tests (in seconds)
TIMEOUT = 5.0
# REPL prompt (configurable via REPL_PROMPT environment variable)
PROMPT = os.environ.get("REPL_PROMPT", ">")

Copilot uses AI. Check for mistakes.


def _parse_expectation_line(line):
"""Parses a single expectation line (# @...).

Supports:
# @stdout: text - expect text in stdout
# @stderr: text - expect text in stderr
# @stdout-empty - expect empty stdout
# @stderr-empty - expect empty stderr

Args:
line: Line to parse

Returns:
Tuple (kind, text) or (None, None) if not an expectation line
"""
if not line.startswith("# @"):
return None, None

rest = line[3:].strip()

# Check for empty expectations
if rest in ("stdout-empty", "stderr-empty"):
return rest, ""

if ":" not in rest:
return None, None

kind, text = rest.split(":", 1)
return kind.strip(), text.strip()

def parse_tests(path):
    """Parses REPL test file with code lines and # @ expectations.

    Format:
        code_line_1
        # @stdout: expected_output
        # @stderr: expected_error
        # @stdout-empty (expect empty stdout)
        # @stderr-empty (expect empty stderr)
        code_line_2
        ...

    Note: blank lines are ignored, so an intentionally empty input line
    cannot be expressed in this format.

    Args:
        path: Path to test file

    Returns:
        List of tests, each with 'code' and 'expect' keys
    """
    def new_case(code_line):
        # Each test starts with a code line and an empty expectation set.
        return {
            "code": code_line,
            "expect": {"stdout": [], "stderr": [], "stdout-empty": False, "stderr-empty": False},
        }

    collected = []
    pending = None

    with open(path) as handle:
        for raw in handle:
            stripped = raw.rstrip()
            if not stripped:
                continue

            if stripped.startswith("#"):
                # Annotations attach to the most recent code line; ones that
                # appear before any code line are silently dropped.
                if pending is None:
                    continue
                kind, text = _parse_expectation_line(stripped)
                if not kind:
                    continue
                if kind in ("stdout-empty", "stderr-empty"):
                    pending["expect"][kind] = True
                elif kind in pending["expect"]:
                    pending["expect"][kind].append(text)
            else:
                # A new code line flushes the previous test and opens a new one.
                if pending is not None:
                    collected.append(pending)
                pending = new_case(stripped)

    if pending is not None:
        collected.append(pending)
    return collected

def parse_expectations(file):
    """Flattens every test's expectations in *file* into one dictionary.

    All expected stdout/stderr snippets are concatenated, and the empty-stream
    flags are combined with OR.

    NOTE(review): because of the OR, a single # @stdout-empty / # @stderr-empty
    anywhere in the file marks the whole stream as expected-empty even if other
    tests expect output on it — confirm this aggregation is intended.

    Args:
        file: Path to test file

    Returns:
        Dictionary with 'stdout', 'stderr', 'stdout-empty', 'stderr-empty' keys
    """
    merged = {"stdout": [], "stderr": [], "stdout-empty": False, "stderr-empty": False}
    for case in parse_tests(file):
        expect = case["expect"]
        merged["stdout"] += expect["stdout"]
        merged["stderr"] += expect["stderr"]
        if expect["stdout-empty"]:
            merged["stdout-empty"] = True
        if expect["stderr-empty"]:
            merged["stderr-empty"] = True
    return merged

def check_expectations(expectations, stdout, stderr):
    """Checks if output matches expectations for both stdout and stderr.

    Supports:
      - Checking for presence of text in stdout/stderr
      - Checking for empty stdout/stderr

    Args:
        expectations: Dictionary with 'stdout', 'stderr', 'stdout-empty', 'stderr-empty' keys
        stdout: Actual stdout output
        stderr: Actual stderr output

    Returns:
        True if all expectations are met, False otherwise
    """
    # Empty-stream expectations: any non-whitespace output is a failure.
    if expectations["stdout-empty"] and stdout.strip():
        print(f"[FAILED] Expected empty stdout, but got:\n{stdout}")
        return False

    if expectations["stderr-empty"] and stderr.strip():
        print(f"[FAILED] Expected empty stderr, but got:\n{stderr}")
        return False

    # Presence expectations: every snippet must occur somewhere in its stream.
    for needle in expectations["stdout"]:
        if needle not in stdout:
            print(f"[FAILED stdout] '{needle}' not in:\n{stdout}")
            return False

    for needle in expectations["stderr"]:
        if needle not in stderr:
            print(f"[FAILED stderr] '{needle}' not in:\n{stderr}")
            return False

    return True

41 changes: 41 additions & 0 deletions runner/functional.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import subprocess
import time

from common import TIMEOUT, parse_expectations, check_expectations

def run_program_test(binary, flags, file, expect_exit, timeout=TIMEOUT):
    """Runs program test and checks exit code and output expectations.

    Args:
        binary: Path to executable (must be directly executable — the command
            is no longer routed through a shell)
        flags: Command line flags (split with shell-like quoting rules)
        file: Test file path
        expect_exit: Expected exit code
        timeout: Timeout in seconds (default: TIMEOUT constant)

    Returns:
        True if exit code and all output expectations match, False otherwise.
    """
    import shlex  # local import: keeps this fix self-contained

    # Build an argv list instead of an f-string + shell=True so that file
    # names or flags containing shell metacharacters (spaces, ';', '$', ...)
    # cannot be interpreted by a shell (injection / quoting bugs).
    cmd = [binary] + shlex.split(flags) + [file]
    expectations = parse_expectations(file)

    try:
        start_time = time.time()
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - start_time
        print(f"TIMEOUT: {file} (exceeded {timeout}s timeout, ran for {elapsed:.2f}s)")
        return False

    # Check exit code
    if proc.returncode != expect_exit:
        print(f"BAD EXIT: {file} got {proc.returncode} expected {expect_exit}")
        print(proc.stderr)
        return False

    # Check expectations
    return check_expectations(expectations, proc.stdout, proc.stderr)
84 changes: 84 additions & 0 deletions runner/repl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import subprocess
import time
import shlex

from common import TIMEOUT, PROMPT, parse_tests, check_expectations

def read_until_prompt(proc, timeout=TIMEOUT):
    """Reads output until prompt is found or timeout occurs.

    Reads proc.stdout one character at a time and accumulates output until it
    ends with PROMPT. The returned string still includes the trailing prompt.

    Args:
        proc: Process whose stdout is a text stream
        timeout: Maximum seconds to wait for the prompt

    Returns:
        Accumulated output, including the trailing prompt.

    Raises:
        EOFError: stdout closed before the prompt appeared (previously this
            was misreported as a TimeoutError, hiding the real cause).
        TimeoutError: prompt not seen within *timeout* seconds.
        OSError: underlying read failures now propagate instead of being
            swallowed and relabeled as timeouts.
    """
    output = ""
    start_time = time.time()

    while time.time() - start_time < timeout:
        c = proc.stdout.read(1)
        if not c:
            # Distinguish a closed stream from a slow one.
            raise EOFError(
                f"EOF while waiting for prompt (got {len(output)} chars)"
            )
        output += c
        if output.endswith(PROMPT):
            return output

    elapsed = time.time() - start_time
    raise TimeoutError(f"Timeout waiting for prompt after {timeout}s (elapsed: {elapsed:.2f}s, got {len(output)} chars)")

def maybe_read_stderr(proc, timeout=0.1):
    """Attempts to read one line of stderr, returning "" if none is ready.

    Best-effort: on platforms where select() cannot poll pipe handles
    (e.g. Windows), or on any polling error, this silently yields "".
    """
    try:
        import select
        readable, _, _ = select.select([proc.stderr], [], [], timeout)
        if readable:
            return proc.stderr.readline()
        return ""
    except Exception:
        return ""
Comment on lines +26 to +33
Copy link

Copilot AI Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 'import select' statement is inside the function and wrapped in a try-except block. If the 'select' module is not available (e.g., on Windows), this function will silently return an empty string without any indication that stderr reading is not working. Consider moving the import to the top of the file to fail fast if the module is unavailable, or add a warning log when falling back to the empty string.

Copilot uses AI. Check for mistakes.

def _run_single_test(proc, test, timeout):
    """Runs a single REPL test, returns True if passes.

    Args:
        proc: Running REPL process with pipes attached
        test: Dict with 'code' and 'expect' keys (see parse_tests)
        timeout: Seconds to wait for the REPL prompt
    """
    try:
        # Feed the code line to the REPL and wait for it to return to the
        # prompt before collecting the output.
        proc.stdin.write(test["code"] + "\n")
        proc.stdin.flush()

        stdout = read_until_prompt(proc, timeout=timeout)
        stderr = maybe_read_stderr(proc, timeout=0.1)

        return check_expectations(test["expect"], stdout, stderr)
    except TimeoutError as e:
        print(f"[TIMEOUT] Test '{test['code']}': {e}")
        return False
    except Exception as e:
        # Include the exception type so distinct failure modes (EOFError,
        # BrokenPipeError, decode errors, ...) are distinguishable in logs.
        print(f"[ERROR] Test '{test['code']}': {type(e).__name__}: {e}")
        return False


def _close_process(proc, timeout):
"""Gracefully closes REPL process."""
try:
proc.terminate()
proc.wait(timeout=timeout)
except Exception:
proc.kill()
Comment on lines +58 to +59
Copy link

Copilot AI Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another bare 'except Exception' that silently handles all errors. The exception handler on lines 58-59 during process termination catches all exceptions without logging or distinguishing between different error types. This could hide important cleanup failures.

Suggested change
except Exception:
proc.kill()
except subprocess.TimeoutExpired:
# If the process does not exit in time, force kill it.
proc.kill()
except Exception as e:
# Log unexpected termination errors, then ensure the process is killed.
print(f"[WARN] Failed to terminate process cleanly: {e}")
proc.kill()

Copilot uses AI. Check for mistakes.

def run_repl_test(binary, flags, file, timeout=TIMEOUT):
    """Runs REPL tests from file, returns True if all pass."""
    argv = [binary] + shlex.split(flags)

    proc = subprocess.Popen(
        argv,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    )

    try:
        cases = parse_tests(file)
        # The REPL prints a prompt on startup; discard it before sending input.
        read_until_prompt(proc, timeout=timeout)

        # Stop at the first failing case; remaining cases are not run.
        for case in cases:
            if not _run_single_test(proc, case, timeout):
                return False

        return True
    finally:
        _close_process(proc, timeout)
113 changes: 113 additions & 0 deletions runner/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import subprocess
import sys
import glob as _glob

from functional import run_program_test
from repl import run_repl_test


class TestRunner:
    """Manages test execution and result tracking."""

    def __init__(self, binary):
        self.binary = binary  # path to the executable under test
        self.flags = ""       # current command-line flags
        self.total = 0        # number of tests executed
        self.passed = 0       # number of tests that passed

    @staticmethod
    def glob(pattern):
        """Returns sorted list of files matching pattern."""
        return sorted(_glob.glob(pattern))

    def _record_test(self, success):
        """Records test result."""
        self.total += 1
        if success:
            self.passed += 1

    def with_flags(self, flags_str, test_fn):
        """Temporarily changes flags for test execution."""
        old_flags = self.flags
        self.flags = flags_str
        try:
            test_fn()
        finally:
            self.flags = old_flags

    def simple(self, file):
        """Runs test expecting exit code 0."""
        result = run_program_test(self.binary, self.flags, file, expect_exit=0)
        self._record_test(result)

    def exit_code(self, code, file):
        """Runs test expecting specific exit code."""
        result = run_program_test(self.binary, self.flags, file, expect_exit=code)
        self._record_test(result)

    def repl(self, file):
        """Runs REPL test."""
        result = run_repl_test(self.binary, self.flags, file)
        self._record_test(result)

    def _run_globbed(self, pattern, flags, run_one):
        """Runs run_one(file) for every glob match, under temporary flags.

        Deduplicates the save/set/restore-flags logic that simple_run_tests
        and repl_tests previously reimplemented instead of reusing with_flags.
        """
        def run_all():
            for file in TestRunner.glob(pattern):
                run_one(file)
        self.with_flags(flags, run_all)

    def simple_run_tests(self, test_files_name, flags=""):
        """Runs every matching file as an exit-code-0 test."""
        self._run_globbed(test_files_name, flags, self.simple)

    def repl_tests(self, test_files_name, flags=""):
        """Runs every matching file as a REPL test, printing a header each."""
        def run_one(file):
            print(f"\n--- REPL TEST: {file} ---\n")
            self.repl(file)
        self._run_globbed(test_files_name, flags, run_one)
Comment on lines +53 to +70
Copy link

Copilot AI Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code duplication between 'simple_run_tests' and 'repl_tests' methods is significant. Both methods follow the same pattern: save old flags, set new flags, iterate over files, call a test method, and restore flags. This should be refactored into a generic helper method to improve maintainability.

Suggested change
def simple_run_tests(self, test_files_name, flags=""):
old_flags = self.flags
self.flags = flags
try:
for file in TestRunner.glob(test_files_name):
self.simple(file)
finally:
self.flags = old_flags
def repl_tests(self, test_files_name, flags=""):
old_flags = self.flags
self.flags = flags
try:
for file in TestRunner.glob(test_files_name):
print(f"\n--- REPL TEST: {file} ---\n")
self.repl(file)
finally:
self.flags = old_flags
def _run_tests(self, test_files_name, flags, per_file_fn):
"""Generic helper to run tests over globbed files with temporary flags."""
def run_all():
for file in TestRunner.glob(test_files_name):
per_file_fn(file)
self.with_flags(flags, run_all)
def simple_run_tests(self, test_files_name, flags=""):
"""Runs simple tests (expecting exit code 0) over all matching files."""
def per_file(file):
self.simple(file)
self._run_tests(test_files_name, flags, per_file)
def repl_tests(self, test_files_name, flags=""):
"""Runs REPL tests over all matching files, printing a header for each."""
def per_file(file):
print(f"\n--- REPL TEST: {file} ---\n")
self.repl(file)
self._run_tests(test_files_name, flags, per_file)

Copilot uses AI. Check for mistakes.



def _build_target(program_name):
    """Builds target using dune, returns path or None on failure."""
    target = f"src/{program_name}.exe"
    build = subprocess.run(["dune", "build", target])
    if build.returncode != 0:
        return None
    return f"_build/default/{target}"


def main():
    """Main entry point for test runner."""
    if len(sys.argv) != 3:
        print("USAGE: runner.py PROGRAM TEST_SUITE")
        return 1

    binary = _build_target(sys.argv[1])
    if not binary:
        return 1

    runner = TestRunner(binary)

    # Names exposed to the test-suite script. NOTE: the suite file is
    # executed as trusted Python via exec() — do not point this at
    # untrusted input.
    suite_env = {
        "simple": runner.simple,
        "exit_code": runner.exit_code,
        "with_flags": runner.with_flags,
        "glob": TestRunner.glob,
        "repl": runner.repl,
        "simple_run_tests": runner.simple_run_tests,
        "repl_tests": runner.repl_tests,
    }

    with open(sys.argv[2]) as suite_file:
        suite_source = suite_file.read()
    exec(suite_source, suite_env)

    print(f"\nPassed: {runner.passed}/{runner.total}")
    if runner.passed == runner.total:
        return 0
    return 1


if __name__ == "__main__":
sys.exit(main())
7 changes: 7 additions & 0 deletions test/repl/non_fatal_error0001.fram
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
let f (x : Unit -> Unit) = x () ;;

f () ;;
# @stderr: error

42;;
# @stdout: 42
Loading