Skip to content

Commit 655f229

Browse files
authored
Merge branch 'main' into insert
2 parents cf81daa + 743151a commit 655f229

File tree

95 files changed

+3840
-2394
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+3840
-2394
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""Check that all GitHub pages JavaScript dependencies are up-to-date.""" # noqa: INP001
2+
3+
import base64
4+
import hashlib
5+
import json
6+
import re
7+
import sys
8+
import urllib.request
9+
from pathlib import Path
10+
11+
# Matches jsDelivr CDN URLs of the form:
#   https://cdn.jsdelivr.net/npm/<name>@<version>/<path>
# Group 1 captures the full URL; named groups capture the npm package name,
# the pinned version, and the file path within the package.
JSDELIVR_RE = re.compile(
    r"(https://cdn\.jsdelivr\.net/npm/"
    r"(?P<name>[^@/]+)@(?P<version>[^/]+)"
    r"/(?P<path>[^\"']+))"
)
16+
17+
18+
def fetch_json(url: str) -> dict:
    """Download and deserialize a JSON document from the given URL."""
    with urllib.request.urlopen(url, timeout=15) as response:  # noqa: S310 # Controlled input.
        raw = response.read()
    return json.loads(raw)
22+
23+
24+
def fetch_bytes(url: str) -> bytes:
    """Download the raw response body of the given URL as bytes."""
    opened = urllib.request.urlopen(url, timeout=30)  # noqa: S310 # Controlled input.
    with opened as response:
        return response.read()
28+
29+
30+
def get_latest_version(pkg: str) -> str:
    """Look up the newest published version of *pkg* on the npm registry."""
    registry_info = fetch_json(f"https://registry.npmjs.org/{pkg}")
    # The "latest" dist-tag is what `npm install <pkg>` would resolve to.
    dist_tags = registry_info["dist-tags"]
    return dist_tags["latest"]
34+
35+
36+
def sri_hash(content: bytes) -> str:
    """Calculate the SRI (Subresource Integrity) hash for the given content."""
    # SRI format: "<algorithm>-<base64 digest>"; sha384 is the common choice.
    encoded_digest = base64.b64encode(hashlib.sha384(content).digest())
    return f"sha384-{encoded_digest.decode('ascii')}"
40+
41+
42+
def scan_html(path: Path) -> list[re.Match[str]]:
    """Collect every jsDelivr JavaScript include found in the HTML file at *path*."""
    markup = path.read_text(encoding="utf-8", errors="ignore")
    matches = JSDELIVR_RE.finditer(markup)
    return list(matches)
46+
47+
48+
def main() -> None:
    """Perform the checks.

    Scans every HTML page under ``gh-pages`` for jsDelivr includes, compares
    each pinned version against the latest npm release, and exits with status 1
    when any outdated dependency is found.
    """
    outdated_found = False

    html_files = sorted(Path("gh-pages").rglob("*.html"), key=str)
    for html_path in html_files:
        matches = scan_html(html_path)
        if not matches:
            continue

        sys.stdout.write(f"\n📄 {html_path} ...\n\n")

        for match in matches:
            pkg = match.group("name")
            current_version = match.group("version")
            full_url = match.group(1)

            # A registry failure is reported as a warning, not a hard failure.
            try:
                latest_version = get_latest_version(pkg)
            except Exception as e:
                sys.stdout.write(f" ⚠️ {pkg}: npm lookup failed ({e})\n")
                continue

            if current_version == latest_version:
                sys.stdout.write(f" ✅ {pkg} {current_version}\n")
                continue

            # Version mismatch: remember it, then compute the SRI hash the
            # upgraded include would need.
            outdated_found = True
            latest_url = full_url.replace(f"@{current_version}/", f"@{latest_version}/")

            try:
                latest_sri = sri_hash(fetch_bytes(latest_url))
            except Exception as e:
                sys.stdout.write(f" ⚠️ {pkg}: failed to fetch latest file ({e})\n")
                continue

            sys.stdout.write(f" ❌ {pkg}\n")
            sys.stdout.write(f" Current: {current_version}\n")
            sys.stdout.write(f" Latest: {latest_version}\n")
            sys.stdout.write(f" Latest SRI: {latest_sri}\n")
            sys.stdout.write("\n")

    if outdated_found:
        sys.stdout.write("\n❗ Outdated dependencies detected\n")
        sys.exit(1)

    sys.stdout.write("\n🎉 All CDN dependencies are up to date\n")
97+
98+
99+
# Allow running the checker directly as a script.
if __name__ == "__main__":
    main()

.github/scripts/check_urls.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""Check that all test data URLs are still accessible.""" # noqa: INP001
2+
import ast
3+
import sys
4+
from collections.abc import Iterator
5+
from operator import itemgetter
6+
from pathlib import Path
7+
8+
from tests import _get_data_from_url, read_yaml_to_list_of_dicts
9+
10+
# URL prefixes that appear in tests but are namespace identifiers or
# documentation links rather than downloadable assets — skip checking them.
URL_PREFIXES_TO_IGNORE = (
    "http://ns.adobe.com/tiff/1.0/",
    "http://www.example.com",
    "https://example.com",
    "https://martin-thoma.com",
    "https://pypdf.readthedocs.io/",
    "https://www.example.com",
)

# Known URLs ending in ".pdf" whose payload intentionally does not start
# with the "%PDF-" magic bytes; exempt from the PDF sniff check.
PDF_URLS_WHICH_DO_NOT_LOOK_LIKE_PDFS = {
    "https://github.com/user-attachments/files/18381726/tika-957721.pdf",
}
22+
23+
24+
def get_urls_from_test_files() -> Iterator[str]:
    """Retrieve all URLs defined in the test files."""
    tests_directory = Path(__file__).parent.parent.parent / "tests"
    for test_file in sorted(tests_directory.rglob("test_*.py")):
        # Parse the test module and walk its AST for string literals.
        tree = ast.parse(source=test_file.read_text(encoding="utf-8"), filename=str(test_file))
        for node in ast.walk(tree):
            if not isinstance(node, ast.Constant):
                continue
            value = node.value
            if isinstance(value, str) and value.startswith(("http://", "https://")):
                yield value
37+
38+
39+
def get_urls_from_example_files() -> Iterator[str]:
    """Retrieve all URLs defined in the `example_files.yaml`."""
    yaml_path = Path(__file__).parent.parent.parent / "tests" / "example_files.yaml"
    for entry in read_yaml_to_list_of_dicts(yaml_path):
        yield entry["url"]
43+
44+
45+
def check_url(url: str) -> bool:
    """Check if the given URL appears to still be valid.

    Returns True for ignored prefixes and for URLs whose content downloads
    successfully and passes the sanity checks; False otherwise (with a
    diagnostic written to stderr).
    """
    # Namespace/documentation URLs are not downloadable assets.
    if url.startswith(URL_PREFIXES_TO_IGNORE):
        return True

    try:
        data = _get_data_from_url(url)
    except Exception as exception:
        sys.stderr.write(f"Error getting data from {url}: {exception}\n")
        return False

    # Tiny responses are almost certainly error pages, not real test assets.
    if len(data) < 75:
        sys.stderr.write(f"Not enough data from {url}: {data}\n")
        return False

    should_look_like_pdf = (
        url.lower().endswith(".pdf")
        and url not in PDF_URLS_WHICH_DO_NOT_LOOK_LIKE_PDFS
    )
    if should_look_like_pdf and not data.startswith(b"%PDF-"):
        sys.stderr.write(f"The file at {url} does not look like a PDF: {data[:50]}\n")
        return False

    sys.stdout.write(f"URL {url} looks good.\n")
    return True
70+
71+
72+
def main() -> bool:
    """Check if there are invalid URLs.

    Returns:
        True when at least one URL failed its check (suitable as a process
        exit status: truthy means failure), False when all URLs are valid.
    """
    # Deduplicate URLs from both sources with a single set union instead of
    # building the set with manual add() loops.
    urls: set[str] = {*get_urls_from_test_files(), *get_urls_from_example_files()}

    # Check every URL — no short-circuit — so all problems are reported at once.
    results = [check_url(url) for url in sorted(urls)]
    is_valid = all(results)
    return not is_valid
84+
85+
86+
# main() returns True on failure, which sys.exit maps to exit status 1.
if __name__ == "__main__":
    sys.exit(main())

.github/workflows/benchmark.yaml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,19 @@ permissions:
1010

1111
jobs:
1212
benchmark:
13-
name: Run pytest-benchmark
13+
name: "Benchmark ${{ matrix.name }}"
1414
runs-on: ubuntu-latest
1515
strategy:
1616
matrix:
17-
python-version: ["3.x"]
17+
python-version: ['3.x']
18+
include:
19+
- python-version: '3.x'
20+
name: 'CPython'
21+
- python-version: 'pypy3.11'
22+
name: 'PyPy 3.11'
1823
steps:
1924
- name: Checkout Code
20-
uses: actions/checkout@v5
25+
uses: actions/checkout@v6
2126
with:
2227
submodules: 'recursive'
2328
- name: Setup Python
@@ -36,7 +41,7 @@ jobs:
3641
- name: Store benchmark result
3742
uses: benchmark-action/github-action-benchmark@v1
3843
with:
39-
name: Python Benchmark with pytest-benchmark
44+
name: "${{ matrix.name }} Benchmark"
4045
tool: 'pytest'
4146
output-file-path: output.json
4247
# Use personal access token instead of GITHUB_TOKEN due to https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096

.github/workflows/create-github-release.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
runs-on: ubuntu-latest
1616
steps:
1717
- name: Checkout Repository
18-
uses: actions/checkout@v5
18+
uses: actions/checkout@v6
1919
- name: Prepare variables
2020
id: prepare_variables
2121
run: |
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
name: 'GitHub Pages Check'
on:
  workflow_dispatch:
  schedule:
    # Every Monday at 06:00 UTC.
    - cron: 0 6 * * 1

jobs:
  url-check:
    name: GitHub Pages check
    runs-on: ubuntu-latest
    steps:
      # The deployed pages live on the gh-pages branch; the checker script
      # lives on main, so both branches are checked out side by side.
      - name: Checkout GitHub Pages
        uses: actions/checkout@v6
        with:
          ref: 'gh-pages'
          path: 'gh-pages'
      - name: Checkout main (tools)
        uses: actions/checkout@v6
        with:
          ref: main
          path: main
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.x'
      - name: Check GitHub Pages
        # The script scans the gh-pages checkout relative to the workspace root.
        run: |
          export PYTHONPATH="$GITHUB_WORKSPACE"
          python main/.github/scripts/check_gh_pages_updates.py

0 commit comments

Comments
 (0)