Skip to content

Commit 107c5bb

Browse files
Merge pull request #13 from hotosm/ci/stac-validation
Feature: CI to validate STAC items for base models
2 parents 1de8a8d + 0b4d07b commit 107c5bb

File tree

9 files changed

+379
-9
lines changed

9 files changed

+379
-9
lines changed

.github/workflows/build-model-images.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,13 @@ jobs:
2727
- name: Fetch base ref
2828
if: github.event_name == 'pull_request'
2929
run: git fetch origin ${{ github.event.pull_request.base.ref }}
30-
- uses: actions/setup-python@v5
30+
- uses: astral-sh/setup-uv@v5
3131
with:
3232
python-version: "3.13"
33-
- run: pip install pystac
33+
enable-cache: true
34+
- run: uv add pystac
3435
- id: models
35-
run: echo "matrix=$(python .github/scripts/detect_models.py)" >> $GITHUB_OUTPUT
36+
run: echo "matrix=$(uv run python .github/scripts/detect_models.py)" >> $GITHUB_OUTPUT
3637
env:
3738
EVENT_NAME: ${{ github.event_name }}
3839
BASE_SHA: ${{ github.event.pull_request.base.sha }}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
name: validate STAC items
2+
on:
3+
push:
4+
branches:
5+
- master
6+
paths:
7+
- "models/**/stac-item.json"
8+
- "fair/schemas/**"
9+
- "fair/stac/validators.py"
10+
pull_request:
11+
branches:
12+
- master
13+
paths:
14+
- "models/**/stac-item.json"
15+
- "fair/schemas/**"
16+
- "fair/stac/validators.py"
17+
defaults:
18+
run:
19+
shell: bash
20+
concurrency:
21+
group: ${{ github.workflow }}-${{ github.ref }}
22+
cancel-in-progress: true
23+
jobs:
24+
validate:
25+
name: validate stac items
26+
runs-on: ubuntu-latest
27+
steps:
28+
- name: Clone repo
29+
uses: actions/checkout@v4
30+
- name: Set up uv
31+
uses: astral-sh/setup-uv@v5
32+
with:
33+
python-version: "3.13"
34+
enable-cache: true
35+
- name: Install dependencies
36+
run: uv sync --group dev
37+
- name: Validate STAC items
38+
run: make validate-stac

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: init setup clean example lint format typecheck test build bump pre-commit
1+
.PHONY: init setup clean example lint format typecheck test build bump pre-commit validate-stac
22

33
init:
44
uv sync --group local
@@ -46,3 +46,6 @@ bump:
4646
pre-commit:
4747
uv run pre-commit install --hook-type commit-msg --hook-type pre-commit
4848

49+
validate-stac:
50+
uv run python scripts/validate_stac_items.py
51+
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"required_extensions": [
3+
"https://stac-extensions.github.io/mlm/v1.5.1/schema.json",
4+
"https://stac-extensions.github.io/version/v1.2.0/schema.json",
5+
"https://stac-extensions.github.io/classification/v2.0.0/schema.json",
6+
"https://stac-extensions.github.io/file/v2.1.0/schema.json",
7+
"https://stac-extensions.github.io/raster/v1.1.0/schema.json"
8+
],
9+
"required_properties": [
10+
"mlm:name",
11+
"mlm:architecture",
12+
"mlm:tasks",
13+
"mlm:framework",
14+
"mlm:framework_version",
15+
"mlm:pretrained",
16+
"mlm:input",
17+
"mlm:output",
18+
"mlm:hyperparameters",
19+
"keywords",
20+
"version",
21+
"license"
22+
],
23+
"non_empty_list_properties": [
24+
"mlm:tasks",
25+
"mlm:input",
26+
"mlm:output",
27+
"keywords"
28+
],
29+
"required_assets": {
30+
"model": [
31+
"mlm:artifact_type"
32+
],
33+
"source-code": [
34+
"mlm:entrypoint"
35+
],
36+
"mlm:training": [],
37+
"mlm:inference": []
38+
},
39+
"input_required_fields": [
40+
"pre_processing_function"
41+
],
42+
"output_required_fields": [
43+
"post_processing_function",
44+
"classification:classes"
45+
],
46+
"processing_function_fields": [
47+
"format",
48+
"expression"
49+
],
50+
"allowed_values": {
51+
"license": [
52+
"GPL-3.0-only",
53+
"MIT",
54+
"Apache-2.0",
55+
"BSD-3-Clause"
56+
],
57+
"mlm:framework": [
58+
"PyTorch",
59+
"TensorFlow"
60+
],
61+
"mlm:tasks": [
62+
"semantic-segmentation",
63+
"instance-segmentation",
64+
"object-detection",
65+
"classification"
66+
]
67+
}
68+
}

fair/stac/validators.py

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55

66
import pystac
77

8-
## TODO : add fAIr specific validation rules here , mainly for existence of those keys which are required to integrate
9-
# basemodels
8+
# TODO: extend the validation to the complete set of requirements based on the production STAC; currently only a
9+
# handful of checks are in place.
1010

1111

1212
def validate_mlm_schema(item: pystac.Item) -> list[str]:
@@ -23,6 +23,85 @@ def _load_keywords_schema() -> dict:
2323
return json.loads(ref.read_text(encoding="utf-8"))
2424

2525

26+
def _load_base_model_requirements() -> dict:
    """Return the parsed ``base_model_requirements.json`` resource from the ``fair.schemas`` package."""
    schema_dir = importlib.resources.files("fair.schemas")
    raw_text = schema_dir.joinpath("base_model_requirements.json").read_text(encoding="utf-8")
    return json.loads(raw_text)
29+
30+
31+
def _check_processing_fn(fn: object, path: str, required_fields: list[str], errors: list[str]) -> None:
32+
if not isinstance(fn, dict):
33+
errors.append(f"{path} must be an object")
34+
return
35+
for field in required_fields:
36+
if field not in fn:
37+
errors.append(f"{path} missing field: {field}")
38+
39+
40+
def _values_not_allowed(values: list, allowed: list) -> list:
    """Return the distinct entries of ``values`` that are absent from ``allowed``, in first-seen order."""
    rejected: list = []
    for value in values:
        # List membership instead of set arithmetic: JSON values may be unhashable (objects,
        # arrays), and building a set from them would raise TypeError instead of reporting them.
        if value not in allowed and value not in rejected:
            rejected.append(value)
    return rejected


def validate_base_model_item(item: pystac.Item) -> list[str]:
    """Validate a base-model STAC item against fAIr requirements from base_model_requirements.json.

    The checks run in a fixed order: required extensions, required properties, non-empty list
    properties, keywords, allowed property values, ``mlm:input`` / ``mlm:output`` entries, and
    required assets. Malformed values (null keywords, non-list lists, non-object entries,
    unhashable values) are reported as validation errors rather than raising.

    Args:
        item: The STAC item to validate.

    Returns:
        One human-readable message per problem found; an empty list when the item passes.
        Rejected values are rendered as lists in first-seen order so the output is deterministic.
    """
    reqs = _load_base_model_requirements()
    kw_schema = _load_keywords_schema()
    errors: list[str] = []

    declared = set(item.stac_extensions)
    errors.extend(f"Missing extension: {ext}" for ext in reqs["required_extensions"] if ext not in declared)

    props = item.properties
    # A property that is present but null is treated as missing.
    errors.extend(f"Missing property: {prop}" for prop in reqs["required_properties"] if props.get(prop) is None)

    for prop in reqs["non_empty_list_properties"]:
        val = props.get(prop)
        if val is None:
            # Absent/null values are the concern of the required-properties check, not this one.
            continue
        if not isinstance(val, list) or not val:
            errors.append(f"Property must be non-empty list: {prop}")

    keywords = props.get("keywords")
    if keywords is not None:
        allowed_kw = [
            *kw_schema["allowed_keywords"],
            *kw_schema["allowed_tasks"],
            *kw_schema.get("allowed_geometry_types", []),
        ]
        # A scalar is checked as a single keyword rather than being iterated character by character.
        unknown_kw = _values_not_allowed(keywords if isinstance(keywords, list) else [keywords], allowed_kw)
        if unknown_kw:
            errors.append(f"Unknown keywords: {unknown_kw}")

    for prop, allowed in reqs.get("allowed_values", {}).items():
        val = props.get(prop)
        if val is None:
            continue
        invalid = _values_not_allowed(val if isinstance(val, list) else [val], allowed)
        if invalid:
            errors.append(f"Invalid {prop} values: {invalid}. Allowed: {allowed}")

    proc_fields = reqs["processing_function_fields"]
    io_checks = (
        ("mlm:input", reqs["input_required_fields"], "pre_processing_function"),
        ("mlm:output", reqs["output_required_fields"], "post_processing_function"),
    )
    for prop, required, fn_field in io_checks:
        entries = props.get(prop)
        if not isinstance(entries, list):
            # Nothing to inspect entry by entry; a wrongly typed value is reported by the
            # non-empty-list check when the property is listed there.
            continue
        for i, entry in enumerate(entries):
            if not isinstance(entry, dict):
                errors.append(f"{prop}[{i}] must be an object")
                continue
            for field in required:
                if field not in entry:
                    errors.append(f"{prop}[{i}] missing: {field}")
                elif field == fn_field:
                    _check_processing_fn(entry[field], f"{prop}[{i}].{field}", proc_fields, errors)

    for asset_key, required_fields in reqs["required_assets"].items():
        asset = item.assets.get(asset_key)
        if asset is None:
            errors.append(f"Missing asset: {asset_key}")
            continue
        errors.extend(
            f"Asset '{asset_key}' missing field: {field}"
            for field in required_fields
            if field not in asset.extra_fields
        )

    return errors
103+
104+
26105
def validate_compatibility(
27106
base_model_item: pystac.Item,
28107
dataset_item: pystac.Item,
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# BuildKit uses <Dockerfile>.dockerignore automatically (no repo-level side effects)
2+
*
3+
!fair/
4+
!models/example_unet/

models/example_unet/stac-item.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
"mlm:tasks": [
5050
"semantic-segmentation"
5151
],
52-
"mlm:framework": "pytorch",
52+
"mlm:framework": "PyTorch",
5353
"mlm:framework_version": "2.1.0",
5454
"mlm:pretrained": true,
5555
"mlm:pretrained_source": "OAM-TCD (NeurIPS 2024, arxiv.org/abs/2407.11743)",
@@ -58,6 +58,7 @@
5858
"semantic-segmentation",
5959
"polygon"
6060
],
61+
"license": "GPL-3.0-only",
6162
"version": "1",
6263
"mlm:input": [
6364
{

scripts/validate_stac_items.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/usr/bin/env python
2+
"""Validate all models/*/stac-item.json against fAIr platform requirements."""
3+
4+
import glob
5+
import sys
6+
7+
import pystac
8+
9+
from fair.stac.validators import validate_base_model_item
10+
11+
12+
def main() -> int:
    """Validate every ``models/*/stac-item.json`` and print one OK/FAIL line per file.

    Paths are resolved relative to the current working directory. A file that cannot be
    loaded as a STAC item is reported as a failure and the remaining files are still checked.

    Returns:
        Process exit code: 0 when every item passes; 1 when no item is found or when any
        item fails to load or validate.
    """
    paths = sorted(glob.glob("models/*/stac-item.json"))
    if not paths:
        print("No stac-item.json found under models/")
        return 1

    failed = False
    for path in paths:
        try:
            item = pystac.Item.from_file(path)
        except Exception as exc:
            # CLI boundary: unreadable or malformed JSON must show up as a FAIL line for this
            # file instead of a traceback that hides the results for every remaining file.
            errors = [f"Could not load STAC item: {exc}"]
        else:
            errors = validate_base_model_item(item)

        if errors:
            failed = True
            print(f"FAIL {path}")
            for err in errors:
                print(f"  {err}")
        else:
            print(f"OK   {path}")

    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)