Skip to content

Commit 655f229

Browse files
authored
Merge branch 'main' into insert
2 parents cf81daa + 743151a commit 655f229

File tree

95 files changed

+3840
-2394
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+3840
-2394
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""Check that all GitHub pages JavaScript dependencies are up-to-date.""" # noqa: INP001
2+
3+
import base64
4+
import hashlib
5+
import json
6+
import re
7+
import sys
8+
import urllib.request
9+
from pathlib import Path
10+
11+
# Matches jsDelivr CDN URLs of the form:
#   https://cdn.jsdelivr.net/npm/<name>@<version>/<path>
# Group 1 captures the full URL; named groups capture the npm package name,
# the pinned version, and the file path within the package.
JSDELIVR_RE = re.compile(
    r"(https://cdn\.jsdelivr\.net/npm/"
    r"(?P<name>[^@/]+)@(?P<version>[^/]+)"
    r"/(?P<path>[^\"']+))"
)
16+
17+
18+
def fetch_json(url: str) -> dict:
    """Download and deserialize a JSON document from the given URL."""
    with urllib.request.urlopen(url, timeout=15) as response:  # noqa: S310 # Controlled input.
        raw = response.read()
    return json.loads(raw)
22+
23+
24+
def fetch_bytes(url: str) -> bytes:
    """Download the raw response body of the given URL as bytes."""
    opened = urllib.request.urlopen(url, timeout=30)  # noqa: S310 # Controlled input.
    with opened as response:
        return response.read()
28+
29+
30+
def get_latest_version(pkg: str) -> str:
    """Look up the newest published version of *pkg* on the npm registry."""
    registry_info = fetch_json(f"https://registry.npmjs.org/{pkg}")
    # The "latest" dist-tag is what `npm install <pkg>` would resolve to.
    dist_tags = registry_info["dist-tags"]
    return dist_tags["latest"]
34+
35+
36+
def sri_hash(content: bytes) -> str:
    """Calculate the SRI (Subresource Integrity) hash for the given content."""
    # SRI format: "<algorithm>-<base64 digest>"; sha384 is the common choice.
    encoded_digest = base64.b64encode(hashlib.sha384(content).digest())
    return f"sha384-{encoded_digest.decode('ascii')}"
40+
41+
42+
def scan_html(path: Path) -> list[re.Match[str]]:
    """Collect every jsDelivr JavaScript include found in the HTML file at *path*."""
    markup = path.read_text(encoding="utf-8", errors="ignore")
    matches = JSDELIVR_RE.finditer(markup)
    return list(matches)
46+
47+
48+
def main() -> None:
    """Perform the checks.

    Scans every HTML page under ``gh-pages`` for jsDelivr includes, compares
    each pinned version against the latest npm release, and exits with status 1
    when any outdated dependency is found.
    """
    outdated_found = False

    html_files = sorted(Path("gh-pages").rglob("*.html"), key=str)
    for html_path in html_files:
        matches = scan_html(html_path)
        if not matches:
            continue

        sys.stdout.write(f"\n📄 {html_path} ...\n\n")

        for match in matches:
            pkg = match.group("name")
            current_version = match.group("version")
            full_url = match.group(1)

            # A registry failure is reported as a warning, not a hard failure.
            try:
                latest_version = get_latest_version(pkg)
            except Exception as e:
                sys.stdout.write(f" ⚠️ {pkg}: npm lookup failed ({e})\n")
                continue

            if current_version == latest_version:
                sys.stdout.write(f" ✅ {pkg} {current_version}\n")
                continue

            # Version mismatch: remember it, then compute the SRI hash the
            # upgraded include would need.
            outdated_found = True
            latest_url = full_url.replace(f"@{current_version}/", f"@{latest_version}/")

            try:
                latest_sri = sri_hash(fetch_bytes(latest_url))
            except Exception as e:
                sys.stdout.write(f" ⚠️ {pkg}: failed to fetch latest file ({e})\n")
                continue

            sys.stdout.write(f" ❌ {pkg}\n")
            sys.stdout.write(f" Current: {current_version}\n")
            sys.stdout.write(f" Latest: {latest_version}\n")
            sys.stdout.write(f" Latest SRI: {latest_sri}\n")
            sys.stdout.write("\n")

    if outdated_found:
        sys.stdout.write("\n❗ Outdated dependencies detected\n")
        sys.exit(1)

    sys.stdout.write("\n🎉 All CDN dependencies are up to date\n")
97+
98+
99+
# Allow running the checker directly as a script.
if __name__ == "__main__":
    main()

.github/scripts/check_urls.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""Check that all test data URLs are still accessible.""" # noqa: INP001
2+
import ast
3+
import sys
4+
from collections.abc import Iterator
5+
from operator import itemgetter
6+
from pathlib import Path
7+
8+
from tests import _get_data_from_url, read_yaml_to_list_of_dicts
9+
10+
# URL prefixes that appear in tests but are namespace identifiers or
# documentation links rather than downloadable assets — skip checking them.
URL_PREFIXES_TO_IGNORE = (
    "http://ns.adobe.com/tiff/1.0/",
    "http://www.example.com",
    "https://example.com",
    "https://martin-thoma.com",
    "https://pypdf.readthedocs.io/",
    "https://www.example.com",
)

# Known URLs ending in ".pdf" whose payload intentionally does not start
# with the "%PDF-" magic bytes; exempt from the PDF sniff check.
PDF_URLS_WHICH_DO_NOT_LOOK_LIKE_PDFS = {
    "https://github.com/user-attachments/files/18381726/tika-957721.pdf",
}
22+
23+
24+
def get_urls_from_test_files() -> Iterator[str]:
    """Retrieve all URLs defined in the test files."""
    tests_directory = Path(__file__).parent.parent.parent / "tests"
    for test_file in sorted(tests_directory.rglob("test_*.py")):
        # Parse the test module and walk its AST for string literals.
        tree = ast.parse(source=test_file.read_text(encoding="utf-8"), filename=str(test_file))
        for node in ast.walk(tree):
            if not isinstance(node, ast.Constant):
                continue
            value = node.value
            if isinstance(value, str) and value.startswith(("http://", "https://")):
                yield value
37+
38+
39+
def get_urls_from_example_files() -> Iterator[str]:
    """Retrieve all URLs defined in the `example_files.yaml`."""
    yaml_path = Path(__file__).parent.parent.parent / "tests" / "example_files.yaml"
    for entry in read_yaml_to_list_of_dicts(yaml_path):
        yield entry["url"]
43+
44+
45+
def check_url(url: str) -> bool:
    """Check if the given URL appears to still be valid.

    Returns True for ignored prefixes and for URLs whose content downloads
    successfully and passes the sanity checks; False otherwise (with a
    diagnostic written to stderr).
    """
    # Namespace/documentation URLs are not downloadable assets.
    if url.startswith(URL_PREFIXES_TO_IGNORE):
        return True

    try:
        data = _get_data_from_url(url)
    except Exception as exception:
        sys.stderr.write(f"Error getting data from {url}: {exception}\n")
        return False

    # Tiny responses are almost certainly error pages, not real test assets.
    if len(data) < 75:
        sys.stderr.write(f"Not enough data from {url}: {data}\n")
        return False

    should_look_like_pdf = (
        url.lower().endswith(".pdf")
        and url not in PDF_URLS_WHICH_DO_NOT_LOOK_LIKE_PDFS
    )
    if should_look_like_pdf and not data.startswith(b"%PDF-"):
        sys.stderr.write(f"The file at {url} does not look like a PDF: {data[:50]}\n")
        return False

    sys.stdout.write(f"URL {url} looks good.\n")
    return True
70+
71+
72+
def main() -> bool:
    """Check if there are invalid URLs.

    Returns:
        True when at least one URL failed its check (suitable as a process
        exit status: truthy means failure), False when all URLs are valid.
    """
    # Deduplicate URLs from both sources with a single set union instead of
    # building the set with manual add() loops.
    urls: set[str] = {*get_urls_from_test_files(), *get_urls_from_example_files()}

    # Check every URL — no short-circuit — so all problems are reported at once.
    results = [check_url(url) for url in sorted(urls)]
    is_valid = all(results)
    return not is_valid
84+
85+
86+
# main() returns True on failure, which sys.exit maps to exit status 1.
if __name__ == "__main__":
    sys.exit(main())

.github/workflows/benchmark.yaml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,19 @@ permissions:
1010

1111
jobs:
1212
benchmark:
13-
name: Run pytest-benchmark
13+
name: "Benchmark ${{ matrix.name }}"
1414
runs-on: ubuntu-latest
1515
strategy:
1616
matrix:
17-
python-version: ["3.x"]
17+
python-version: ['3.x']
18+
include:
19+
- python-version: '3.x'
20+
name: 'CPython'
21+
- python-version: 'pypy3.11'
22+
name: 'PyPy 3.11'
1823
steps:
1924
- name: Checkout Code
20-
uses: actions/checkout@v5
25+
uses: actions/checkout@v6
2126
with:
2227
submodules: 'recursive'
2328
- name: Setup Python
@@ -36,7 +41,7 @@ jobs:
3641
- name: Store benchmark result
3742
uses: benchmark-action/github-action-benchmark@v1
3843
with:
39-
name: Python Benchmark with pytest-benchmark
44+
name: "${{ matrix.name }} Benchmark"
4045
tool: 'pytest'
4146
output-file-path: output.json
4247
# Use personal access token instead of GITHUB_TOKEN due to https://github.community/t/github-action-not-triggering-gh-pages-upon-push/16096

.github/workflows/create-github-release.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
runs-on: ubuntu-latest
1616
steps:
1717
- name: Checkout Repository
18-
uses: actions/checkout@v5
18+
uses: actions/checkout@v6
1919
- name: Prepare variables
2020
id: prepare_variables
2121
run: |
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
name: 'GitHub Pages Check'
on:
  workflow_dispatch:
  schedule:
    # Every Monday at 06:00 UTC.
    - cron: 0 6 * * 1

jobs:
  url-check:
    name: GitHub Pages check
    runs-on: ubuntu-latest
    steps:
      # The deployed pages live on the gh-pages branch; the checker script
      # lives on main, so both branches are checked out side by side.
      - name: Checkout GitHub Pages
        uses: actions/checkout@v6
        with:
          ref: 'gh-pages'
          path: 'gh-pages'
      - name: Checkout main (tools)
        uses: actions/checkout@v6
        with:
          ref: main
          path: main
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.x'
      - name: Check GitHub Pages
        # The script scans the gh-pages checkout relative to the workspace root.
        run: |
          export PYTHONPATH="$GITHUB_WORKSPACE"
          python main/.github/scripts/check_gh_pages_updates.py

0 commit comments

Comments
 (0)