diff --git a/.gitignore b/.gitignore index 2c90d2b..035645d 100644 --- a/.gitignore +++ b/.gitignore @@ -24,5 +24,6 @@ Thumbs.db # Validation outputs reports/ +/logs/ *.log validation_results*.txt \ No newline at end of file diff --git a/README.md b/README.md index aba0005..65b2bc4 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,80 @@ cd validation uv sync # or: docker build -t mobilitydcat-validator . ``` +## CLI Usage + +Two separate workflows are available: +- Universal validation: `scripts/validate.py` +- Suite testing (positives/negatives expected outcomes): `scripts/validate_suite.py` + +Run validator help: +```bash +uv run scripts/validate.py --help +``` + +Run suite tester help: +```bash +uv run scripts/validate_suite.py --help +``` + +Minimal default run: +```bash +uv run scripts/validate.py +``` + +Defaults used in minimal run: +- `--data data/` +- `--shacl shacl/` +- `--vocab sample_data/vocabularies/` +- `--report-file logs/validation-report.txt` + +Example (directory validation): +```bash +uv run scripts/validate.py \ + --data sample_data/ \ + --shacl shacl/ +``` + +Optional tuning example: +```bash +uv run scripts/validate.py \ + --data sample_data/ \ + --shacl shacl/ \ + --timeout 30 \ + --max-files-report 50 +``` + +Report behavior: +- Terminal output is compact by default for large runs. +- Full violation details are always written to a report file. +- Default report path (if `--report-file` is omitted): `logs/validation-report.txt`. + +Supported RDF serializations: +- The validator accepts multiple RDF serializations in one run, including `.ttl`, `.rdf`, `.xml`, `.nt`, `.n3`, `.jsonld`, `.json`, `.trig`, and `.nq`. +- You can point `--data` to a directory containing mixed formats; all supported files are discovered and validated. 
+ +Common options: +- `--data`: Input RDF file or directory +- `--shacl`: SHACL file or directory +- `--vocab`: Vocabulary stubs directory +- `--verbose` / `--no-verbose`: Show or hide detailed violations in terminal +- `--progress` / `--no-progress`: Per-file progress while validating directories +- `--timeout`: Per-file validation timeout in seconds (`0` disables timeout) +- `--max-files-report`: Safety option to cap VALID/INVALID terminal output on large runs (`50` default, `0` means unlimited) +- `--report-file`: Path for full detailed validation report + +Suite testing workflow: +- Use `scripts/validate_suite.py` for `positives`/`negatives` test folders. +- Expected outcomes are inferred from directory names (`positives` => should conform, `negatives` => should violate). +- Files outside those folder patterns are reported as unclassified and fail the suite run. + +Why `--vocab` is important: +- Some SHACL checks rely on external controlled vocabularies being available as RDF resources at validation time. +- Typical examples are EU File Type, EU Frequency, and mobility theme terms that are referenced by URI in datasets. +- The validator loads all `.ttl` files from the `--vocab` directory and merges them into each data graph before running SHACL. +- This prevents false violations caused by missing vocabulary resources during class/range checks. +- In most cases you can use the default path; override `--vocab` only when validating against a different vocabulary source. + ## Structure ``` validation/ diff --git a/docs/README.md b/docs/README.md index eed4bb6..6d4857f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,6 +2,12 @@ Validation suite for mobilityDCAT-AP 1.1.0 compliance using SHACL shapes. 
+## Workflows + +This repository has two separate CLI workflows: +- Universal validation: `scripts/validate.py` (pure SHACL conformance checks) +- Suite testing: `scripts/validate_suite.py` (expects `positives`/`negatives` folder semantics) + ## Quick Start ```bash # Install dependencies @@ -14,28 +20,96 @@ uv run scripts/validate.py --data sample_data/baseline-dcat-ap/negatives/B-N-01- uv run scripts/validate.py --data sample_data/mobility/negatives/M-N-01-missing-mandatory-properties-dataset.ttl --shacl shacl/ -v ``` +## CLI Options + +Show all options: +```bash +uv run scripts/validate.py --help +``` + +Show suite tester options: +```bash +uv run scripts/validate_suite.py --help +``` + +Minimal default run: +```bash +uv run scripts/validate.py +``` + +Defaults used in minimal run: +- `--data data/` +- `--shacl shacl/` +- `--vocab sample_data/vocabularies/` +- `--report-file logs/validation-report.txt` + +Current key options: +- `--data`: Path to RDF file or directory +- `--shacl`: Path to SHACL file or directory +- `--vocab`: Path to vocabulary stubs directory (default: `sample_data/vocabularies`) +- `--verbose` / `--no-verbose`: Toggle detailed violation output in terminal +- `--progress` / `--no-progress`: Toggle per-file progress output for directory validation +- `--timeout`: Per-file timeout in seconds (`0` disables timeout) +- `--max-files-report`: Safety option to cap VALID/INVALID terminal output and keep VS Code responsive on large runs (`0` means unlimited) +- `--report-file`: Write full detailed report (default: `logs/validation-report.txt`) + +Why `--vocab` is important: +- Several shapes expect terms from external controlled vocabularies to be present as RDF resources. +- Common examples include EU file types, EU frequency values, and mobility theme concepts. +- The validator reads all `.ttl` files from the `--vocab` folder and merges them into each input graph before validation. 
+- This helps avoid false negatives/positives caused by unresolved vocabulary resources in class/range constraints. +- Keep the default in normal runs; set a custom `--vocab` path when you need to validate against another vocabulary snapshot. + +Notes: +- Terminal output is intentionally compact by default for stability on large runs. +- Full violation details are written to the report file. + +Supported RDF serializations: +- Validation supports multiple RDF serializations: `.ttl`, `.rdf`, `.xml`, `.nt`, `.n3`, `.jsonld`, `.json`, `.trig`, and `.nq`. +- A single directory run can include mixed serializations; all supported files are discovered automatically. + +Example with explicit report file: +```bash +uv run scripts/validate.py \ + --data sample_data/ \ + --shacl shacl/ \ + --report-file logs/validation-report.txt +``` + +Optional tuning example: +```bash +uv run scripts/validate.py \ + --data sample_data/ \ + --shacl shacl/ \ + --timeout 30 \ + --max-files-report 50 \ + --report-file logs/validation-report-latest.txt +``` + ## Run All Test Suites + +Use the dedicated suite runner so expectations are evaluated from `positives`/`negatives` paths: ```bash # All baseline DCAT-AP tests -uv run scripts/validate.py --data sample_data/baseline-dcat-ap/ --shacl shacl/ +uv run scripts/validate_suite.py --data sample_data/baseline-dcat-ap/ --shacl shacl/ # All mobility-specific tests -uv run scripts/validate.py --data sample_data/mobility/ --shacl shacl/ +uv run scripts/validate_suite.py --data sample_data/mobility/ --shacl shacl/ # All multilingual tests -uv run scripts/validate.py --data sample_data/multilingual/ --shacl shacl/ +uv run scripts/validate_suite.py --data sample_data/multilingual/ --shacl shacl/ # All partial graph tests -uv run scripts/validate.py --data sample_data/partial_graphs/ --shacl shacl/ +uv run scripts/validate_suite.py --data sample_data/partial_graphs/ --shacl shacl/ # All range constraint tests -uv run scripts/validate.py --data 
def write_detailed_report(report_file, data_root, results, errors):
    """Persist the complete, untruncated validation outcome as a text report.

    Args:
        report_file: Destination path; missing parent directories are created.
        data_root: Directory that result paths are made relative to.
        results: Validation results exposing ``conforms``, ``file_path`` and
            ``get_violations()``.
        errors: Load/validation errors; each one is rendered with ``str()``.
    """
    report_file.parent.mkdir(parents=True, exist_ok=True)

    conforming = [res for res in results if res.conforms]
    failing = [res for res in results if not res.conforms]
    rule = "-" * 80

    # Header block followed by the opening of the load-error section.
    out = [
        "MOBILITYDCAT-AP VALIDATION REPORT",
        f"Generated: {datetime.now().isoformat(timespec='seconds')}",
        f"Data root: {data_root}",
        f"Total: {len(results)} Valid: {len(conforming)} Invalid: {len(failing)} Errors: {len(errors)}",
        "",
        "LOAD ERRORS",
        rule,
    ]
    if errors:
        out.extend(str(err) for err in errors)
    else:
        out.append("None")
    out += ["", "INVALID FILE DETAILS", rule]

    # An empty section is marked explicitly so the report never looks cut off.
    if not failing:
        out += ["None", ""]

    for res in failing:
        shown_path = res.file_path.relative_to(data_root)
        found = res.get_violations()
        out.append(f"FILE: {shown_path}")
        out.append(f"VIOLATIONS: {len(found)}")

        for number, item in enumerate(found, 1):
            out.append(f" [{number}] Property: {item.get('property', 'unknown')}")
            out.append(f" Constraint: {item.get('constraint', 'Unknown')}")
            # Focus node and message are optional keys of a violation record.
            if 'focus' in item:
                out.append(f" Focus: {item['focus']}")
            if 'message' in item:
                out.append(f" Message: {item['message']}")

        # Blank separator after every file entry.
        out.append("")

    report_file.write_text("\n".join(out), encoding="utf-8")
return_details: + return result.conforms, result, None + return result.conforms -def validate_directory(data_dir, shacl_graph, verbose=False, vocab_graph=None): - """Validate all files in a directory""" + +def validate_directory( + data_dir, + shacl_graph, + verbose=False, + vocab_graph=None, + timeout=0, + progress=False, + max_files_report=50, + report_file=Path('logs/validation-report.txt'), +): + """Validate all files in a directory and report results""" rdf_files = discover_rdf_files(data_dir) if not rdf_files: @@ -54,7 +133,18 @@ def validate_directory(data_dir, shacl_graph, verbose=False, vocab_graph=None): print(f"Found {len(rdf_files)} file(s)\n") - results, errors = validate_multiple_files(rdf_files, shacl_graph, extra_graph=vocab_graph) + def progress_printer(file_path, index, total): + if progress: + rel_path = file_path.relative_to(data_dir) + print(f"[{index:>3}/{total}] Validating {rel_path}") + + results, errors = validate_multiple_files( + rdf_files, + shacl_graph, + extra_graph=vocab_graph, + timeout_seconds=timeout, + progress_callback=progress_printer if progress else None, + ) # Print errors first if errors: @@ -65,68 +155,70 @@ def validate_directory(data_dir, shacl_graph, verbose=False, vocab_graph=None): print(f"❌ {error}") print() - # Group results - passed = [r for r in results if r.passed()] - failed = [r for r in results if not r.passed()] + # Group results by conformance + valid = [r for r in results if r.conforms] + invalid = [r for r in results if not r.conforms] - # Show passed tests - if passed: + # Show valid files + if valid: print("=" * 80) - print("PASSED") + print("VALID") print("=" * 80) - for result in passed: + valid_to_print = valid[:max_files_report] if max_files_report and max_files_report > 0 else valid + for result in valid_to_print: rel_path = result.file_path.relative_to(data_dir) - violations = result.get_violations() - if violations: - print(f"✓ {str(rel_path):<45} Detected {len(violations)} violation(s)") - else: - 
print(f"✓ {str(rel_path):<45} Valid") + print(f"✓ {str(rel_path)}") + if len(valid) > len(valid_to_print): + omitted = len(valid) - len(valid_to_print) + print(f"... omitted {omitted} additional valid file(s). Use --max-files-report to adjust.") print() - # Show failed tests with details - if failed: + # Show invalid files with violation details + if invalid: print("=" * 80) - print("FAILED") + print("INVALID") print("=" * 80) - for result in failed: + invalid_to_print = invalid[:max_files_report] if max_files_report and max_files_report > 0 else invalid + for result in invalid_to_print: rel_path = result.file_path.relative_to(data_dir) - print(f"\n✗ {rel_path}") - - if result.is_positive_test(): - violations = result.get_violations() - if violations: - print(f" Expected: Valid") - print(f" Got: {len(violations)} violation(s) found\n") - for i, v in enumerate(violations, 1): - prop = v.get('property', 'unknown') - constraint = v.get('constraint', 'Unknown') - print(f" [{i}] Property: {prop}") - print(f" Constraint: {constraint}") - if 'message' in v: - msg = v['message'][:80] - print(f" Message: {msg}") - print() - else: - print(f" Expected: Invalid (should have violations)") - print(f" Got: Valid (no violations detected)\n") + violations = result.get_violations() + violation_count = len(violations) if violations else 0 + print(f"✗ {rel_path} (violations: {violation_count})") + if verbose and violations: + for i, v in enumerate(violations, 1): + prop = v.get('property', 'unknown') + constraint = v.get('constraint', 'Unknown') + print(f" [{i}] Property: {prop}") + print(f" Constraint: {constraint}") + if 'message' in v: + msg = v['message'][:80] + print(f" Message: {msg}") + print() + if len(invalid) > len(invalid_to_print): + omitted = len(invalid) - len(invalid_to_print) + print(f"... omitted {omitted} additional invalid file(s). 
Use --max-files-report to adjust.") + print() # Summary print("=" * 80) print("SUMMARY") print("=" * 80) total = len(results) - passed_count = len(passed) - failed_count = len(failed) + valid_count = len(valid) + invalid_count = len(invalid) error_count = len(errors) - print(f"Total: {total} file(s)") - print(f"✓ Passed: {passed_count}") - print(f"✗ Failed: {failed_count}") + print(f"Total: {total} file(s)") + print(f"✓ Valid: {valid_count}") + print(f"✗ Invalid: {invalid_count}") if error_count: - print(f"❌ Errors: {error_count}") + print(f"❌ Errors: {error_count}") print("=" * 80) - return failed_count == 0 and error_count == 0 + write_detailed_report(report_file, data_dir, results, errors) + print(f"Detailed report: {report_file}\n") + + return invalid_count == 0 and error_count == 0 def main(): @@ -137,7 +229,7 @@ def main(): Examples: uv run scripts/validate.py --data my-data.ttl --shacl shacl/ uv run scripts/validate.py --data data/ --shacl shacl/ - uv run scripts/validate.py --data data/ --shacl shacl/ --verbose + uv run scripts/validate.py --data data/ --shacl shacl/ --verbose --progress uv run scripts/validate.py --data data/ --shacl shacl/ --vocab sample_data/vocabularies/ Supported formats: .ttl, .rdf, .xml, .nt, .n3, .jsonld, .json, .trig, .nq """ @@ -167,7 +259,51 @@ def main(): parser.add_argument( '--verbose', '-v', action='store_true', - help='Show detailed violation information' + default=False, + help='Show detailed violation information (off by default for large runs)' + ) + + parser.add_argument( + '--no-verbose', + dest='verbose', + action='store_false', + help='Disable detailed violation information' + ) + + parser.add_argument( + '--progress', + dest='progress', + action='store_true', + default=False, + help='Enable per-file progress output while validating directories' + ) + + parser.add_argument( + '--timeout', + type=float, + default=0, + help='Per-file validation timeout in seconds (0 disables timeout)' + ) + + parser.add_argument( + 
'--no-progress', + dest='progress', + action='store_false', + help='Disable per-file progress output while validating directories' + ) + + parser.add_argument( + '--max-files-report', + type=int, + default=50, + help='Maximum number of valid/invalid files to print per section (0 means unlimited)' + ) + + parser.add_argument( + '--report-file', + type=Path, + default=Path('logs/validation-report.txt'), + help='Path to write full error and violation details' ) args = parser.parse_args() @@ -188,9 +324,33 @@ def main(): success = False if args.data.is_file(): - success = validate_single_file(args.data, shacl_graph, args.verbose, vocab_graph) + success, single_result, single_error = validate_single_file( + args.data, + shacl_graph, + args.verbose, + vocab_graph, + args.timeout, + return_details=True, + ) + # Single-file runs write a report with full details as well. + write_detailed_report( + args.report_file, + args.data.parent, + [single_result] if single_result else [], + [single_error] if single_error else [], + ) + print(f"Detailed report: {args.report_file}\n") elif args.data.is_dir(): - success = validate_directory(args.data, shacl_graph, args.verbose, vocab_graph) + success = validate_directory( + args.data, + shacl_graph, + args.verbose, + vocab_graph, + args.timeout, + args.progress, + args.max_files_report, + args.report_file, + ) else: print(f"❌ Data path not found: {args.data}") exit(1) diff --git a/scripts/validate_suite.py b/scripts/validate_suite.py new file mode 100644 index 0000000..5989d2b --- /dev/null +++ b/scripts/validate_suite.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 +"""Validation suite runner for positives/negatives expected outcomes.""" +import argparse +from datetime import datetime +from pathlib import Path + +from graph_loader import discover_rdf_files +from shacl_loader import load_shacl +from validate import load_vocab_graph +from validator import validate_multiple_files + + +def expected_conforms_for_path(file_path: Path): + 
def write_suite_report(report_file, data_root, entries, errors):
    """Write a full suite report including expectations and mismatches.

    Args:
        report_file: Destination path; missing parent directories are created.
        data_root: Directory that result paths are made relative to.
        entries: Dicts with the keys ``'result'`` (object exposing
            ``file_path``, ``conforms`` and ``get_violations()``),
            ``'expected'`` (``True``/``False``, or ``None`` when the expected
            outcome could not be inferred) and ``'passed'`` (bool).
        errors: Load/validation errors; each one is rendered with ``str()``.
    """
    report_file.parent.mkdir(parents=True, exist_ok=True)

    passes = [e for e in entries if e['passed']]
    # Unclassified entries (expected is None) always have passed == False, but
    # they are not expectation mismatches. Keep them out of the fail list so
    # the counts match the terminal summary and they are not described with a
    # bogus "EXPECTED: violates" label; they get their own section below.
    fails = [e for e in entries if not e['passed'] and e['expected'] is not None]
    unclassified = [e for e in entries if e['expected'] is None]

    separator = "-" * 80
    lines = [
        "MOBILITYDCAT-AP VALIDATION SUITE REPORT",
        f"Generated: {datetime.now().isoformat(timespec='seconds')}",
        f"Data root: {data_root}",
        f"Total: {len(entries)} Pass: {len(passes)} Fail: {len(fails)} Unclassified: {len(unclassified)} Errors: {len(errors)}",
        "",
        "LOAD ERRORS",
        separator,
    ]
    if errors:
        lines.extend(str(error) for error in errors)
    else:
        lines.append("None")
    lines.append("")

    lines.append("FAIL DETAILS")
    lines.append(separator)
    if fails:
        for entry in fails:
            result = entry['result']
            rel_path = result.file_path.relative_to(data_root)
            # 'expected' is a real bool here because None was filtered out.
            expected_label = 'conforms' if entry['expected'] else 'violates'
            actual_label = 'conforms' if result.conforms else 'violates'
            lines.append(f"FILE: {rel_path}")
            lines.append(f"EXPECTED: {expected_label}")
            lines.append(f"ACTUAL: {actual_label}")
            violations = result.get_violations()
            lines.append(f"VIOLATIONS: {len(violations)}")
            for i, violation in enumerate(violations, 1):
                lines.append(f" [{i}] Property: {violation.get('property', 'unknown')}")
                lines.append(f" Constraint: {violation.get('constraint', 'Unknown')}")
                # Focus node and message are optional keys of a violation record.
                if 'focus' in violation:
                    lines.append(f" Focus: {violation['focus']}")
                if 'message' in violation:
                    lines.append(f" Message: {violation['message']}")
            lines.append("")
    else:
        lines.append("None")
        lines.append("")

    lines.append("UNCLASSIFIED FILES")
    lines.append(separator)
    if unclassified:
        for entry in unclassified:
            lines.append(str(entry['result'].file_path.relative_to(data_root)))
    else:
        lines.append("None")

    report_file.write_text("\n".join(lines), encoding="utf-8")
raise SystemExit(1) + + vocab_graph = load_vocab_graph(args.vocab) + + if args.data.is_file(): + rdf_files = [args.data] + data_root = args.data.parent + elif args.data.is_dir(): + rdf_files = discover_rdf_files(args.data) + data_root = args.data + else: + print(f"❌ Data path not found: {args.data}") + raise SystemExit(1) + + if not rdf_files: + print(f"❌ No RDF files found in {args.data}") + raise SystemExit(1) + + print(f"Found {len(rdf_files)} file(s)\n") + + def progress_printer(file_path, index, total): + if args.progress: + rel_path = file_path.relative_to(data_root) + print(f"[{index:>3}/{total}] Validating {rel_path}") + + results, errors = validate_multiple_files( + rdf_files, + shacl_graph, + extra_graph=vocab_graph, + timeout_seconds=args.timeout, + progress_callback=progress_printer if args.progress else None, + ) + + entries = [] + for result in results: + expected = expected_conforms_for_path(result.file_path) + passed = expected is not None and result.conforms == expected + entries.append({'result': result, 'expected': expected, 'passed': passed}) + + passes = [e for e in entries if e['passed']] + fails = [e for e in entries if not e['passed'] and e['expected'] is not None] + unclassified = [e for e in entries if e['expected'] is None] + + if passes: + print("=" * 80) + print("PASS") + print("=" * 80) + to_print = passes[:args.max_files_report] if args.max_files_report and args.max_files_report > 0 else passes + for entry in to_print: + rel_path = entry['result'].file_path.relative_to(data_root) + print(f"✓ {rel_path}") + if len(passes) > len(to_print): + print(f"... omitted {len(passes) - len(to_print)} additional pass file(s). 
Use --max-files-report to adjust.") + print() + + if fails: + print("=" * 80) + print("FAIL") + print("=" * 80) + to_print = fails[:args.max_files_report] if args.max_files_report and args.max_files_report > 0 else fails + for entry in to_print: + rel_path = entry['result'].file_path.relative_to(data_root) + expected_label = 'conforms' if entry['expected'] else 'violates' + actual_label = 'conforms' if entry['result'].conforms else 'violates' + print(f"✗ {rel_path} (expected: {expected_label}, actual: {actual_label})") + if len(fails) > len(to_print): + print(f"... omitted {len(fails) - len(to_print)} additional fail file(s). Use --max-files-report to adjust.") + print() + + if unclassified: + print("=" * 80) + print("UNCLASSIFIED") + print("=" * 80) + to_print = unclassified[:args.max_files_report] if args.max_files_report and args.max_files_report > 0 else unclassified + for entry in to_print: + rel_path = entry['result'].file_path.relative_to(data_root) + print(f"! {rel_path} (expected outcome not inferable; use positives/negatives folder names)") + if len(unclassified) > len(to_print): + print(f"... omitted {len(unclassified) - len(to_print)} additional unclassified file(s). Use --max-files-report to adjust.") + print() + + print("=" * 80) + print("SUMMARY") + print("=" * 80) + print(f"Total: {len(entries)} file(s)") + print(f"✓ Pass: {len(passes)}") + print(f"✗ Fail: {len(fails)}") + print(f"! 
class ValidationTimeoutError(Exception):
    """Signals that a SHACL validation run overran its configured time limit."""
def _validate_with_timeout(data_graph, shacl_graph, inference='rdfs', timeout_seconds=0):
    """Invoke pySHACL, optionally bounded by a ``SIGALRM``-based timer.

    The timer is armed only when ``timeout_seconds`` is a positive number and
    the running platform exposes ``signal.SIGALRM``; in every other case the
    validation runs unbounded. Whatever ``SIGALRM`` handler and real-time
    interval timer were installed beforehand are put back before returning.

    Returns:
        Whatever ``pyshacl.validate`` returns.

    Raises:
        ValidationTimeoutError: When the alarm fires before validation ends.
    """
    def _run():
        # Single definition of the pySHACL call shared by both code paths.
        return validate(
            data_graph,
            shacl_graph=shacl_graph,
            inference=inference,
            abort_on_first=False
        )

    timer_disabled = not timeout_seconds or timeout_seconds <= 0
    if timer_disabled or not hasattr(signal, 'SIGALRM'):
        return _run()

    def _on_alarm(signum, frame):
        raise ValidationTimeoutError(f"Validation timed out after {timeout_seconds}s")

    saved_handler = signal.getsignal(signal.SIGALRM)
    saved_delay, saved_interval = signal.getitimer(signal.ITIMER_REAL)
    try:
        signal.signal(signal.SIGALRM, _on_alarm)
        signal.setitimer(signal.ITIMER_REAL, timeout_seconds)
        return _run()
    finally:
        # Restore the timer before the handler so a late alarm cannot hit
        # the temporary handler after it has been replaced.
        signal.setitimer(signal.ITIMER_REAL, saved_delay, saved_interval)
        signal.signal(signal.SIGALRM, saved_handler)
return ValidationResult(conforms, results_graph, report_text, file_path), None -def validate_multiple_files(file_paths, shacl_graph, inference='rdfs', extra_graph=None): +def validate_multiple_files( + file_paths, + shacl_graph, + inference='rdfs', + extra_graph=None, + timeout_seconds=0, + progress_callback=None, +): results = [] errors = [] - for file_path in file_paths: - result, error = validate_file(file_path, shacl_graph, inference, extra_graph) + total = len(file_paths) + for index, file_path in enumerate(file_paths, 1): + if progress_callback: + progress_callback(file_path, index, total) + result, error = validate_file( + file_path, + shacl_graph, + inference, + extra_graph, + timeout_seconds, + ) if result: results.append(result) else: