Merge pull request #13 from hotosm/ci/stac-validation

kshitijrajsharma · web-flow · commit 107c5bb3abd8 · 2026-02-25T15:50:43.000+01:00
Feature: CI to validate STAC items for base models
diff --git a/.github/workflows/build-model-images.yml b/.github/workflows/build-model-images.yml
@@ -27,12 +27,13 @@ jobs:
       - name: Fetch base ref
         if: github.event_name == 'pull_request'
         run: git fetch origin ${{ github.event.pull_request.base.ref }}
-      - uses: actions/setup-python@v5
+      - uses: astral-sh/setup-uv@v5
         with:
           python-version: "3.13"
-      - run: pip install pystac
+          enable-cache: true
+      - run: uv add pystac
       - id: models
-        run: echo "matrix=$(python .github/scripts/detect_models.py)" >> $GITHUB_OUTPUT
+        run: echo "matrix=$(uv run python .github/scripts/detect_models.py)" >> $GITHUB_OUTPUT
         env:
           EVENT_NAME: ${{ github.event_name }}
           BASE_SHA: ${{ github.event.pull_request.base.sha }}
diff --git a/.github/workflows/validate-stac.yml b/.github/workflows/validate-stac.yml
@@ -0,0 +1,38 @@
+name: validate STAC items
+on:
+  push:
+    branches:
+      - master
+    paths:
+      - "models/**/stac-item.json"
+      - "fair/schemas/**"
+      - "fair/stac/validators.py"
+  pull_request:
+    branches:
+      - master
+    paths:
+      - "models/**/stac-item.json"
+      - "fair/schemas/**"
+      - "fair/stac/validators.py"
+defaults:
+  run:
+    shell: bash
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+jobs:
+  validate:
+    name: validate stac items
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone repo
+        uses: actions/checkout@v4
+      - name: Set up uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: "3.13"
+          enable-cache: true
+      - name: Install dependencies
+        run: uv sync --group dev
+      - name: Validate STAC items
+        run: make validate-stac
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: init setup clean example lint format typecheck test build bump pre-commit
+.PHONY: init setup clean example lint format typecheck test build bump pre-commit validate-stac
 
 init:
 	uv sync --group local
@@ -46,3 +46,6 @@ bump:
 pre-commit:
 	uv run pre-commit install --hook-type commit-msg --hook-type pre-commit
 
+validate-stac:
+	uv run python scripts/validate_stac_items.py
+
diff --git a/fair/schemas/base_model_requirements.json b/fair/schemas/base_model_requirements.json
@@ -0,0 +1,68 @@
+{
+    "required_extensions": [
+        "https://stac-extensions.github.io/mlm/v1.5.1/schema.json",
+        "https://stac-extensions.github.io/version/v1.2.0/schema.json",
+        "https://stac-extensions.github.io/classification/v2.0.0/schema.json",
+        "https://stac-extensions.github.io/file/v2.1.0/schema.json",
+        "https://stac-extensions.github.io/raster/v1.1.0/schema.json"
+    ],
+    "required_properties": [
+        "mlm:name",
+        "mlm:architecture",
+        "mlm:tasks",
+        "mlm:framework",
+        "mlm:framework_version",
+        "mlm:pretrained",
+        "mlm:input",
+        "mlm:output",
+        "mlm:hyperparameters",
+        "keywords",
+        "version",
+        "license"
+    ],
+    "non_empty_list_properties": [
+        "mlm:tasks",
+        "mlm:input",
+        "mlm:output",
+        "keywords"
+    ],
+    "required_assets": {
+        "model": [
+            "mlm:artifact_type"
+        ],
+        "source-code": [
+            "mlm:entrypoint"
+        ],
+        "mlm:training": [],
+        "mlm:inference": []
+    },
+    "input_required_fields": [
+        "pre_processing_function"
+    ],
+    "output_required_fields": [
+        "post_processing_function",
+        "classification:classes"
+    ],
+    "processing_function_fields": [
+        "format",
+        "expression"
+    ],
+    "allowed_values": {
+        "license": [
+            "GPL-3.0-only",
+            "MIT",
+            "Apache-2.0",
+            "BSD-3-Clause"
+        ],
+        "mlm:framework": [
+            "PyTorch",
+            "TensorFlow"
+        ],
+        "mlm:tasks": [
+            "semantic-segmentation",
+            "instance-segmentation",
+            "object-detection",
+            "classification"
+        ]
+    }
+}
diff --git a/fair/stac/validators.py b/fair/stac/validators.py
@@ -5,8 +5,8 @@
 
 import pystac
 
-## TODO : add fAIr specific validation rules here , mainly for existence of those keys which are required to integrate
-# basemodels
+# TODO: extend the validation to the complete set of requirements based on the production STAC; currently only a
+#  handful of checks are in place.
 
 
 def validate_mlm_schema(item: pystac.Item) -> list[str]:
@@ -23,6 +23,85 @@ def _load_keywords_schema() -> dict:
     return json.loads(ref.read_text(encoding="utf-8"))
 
 
+def _load_base_model_requirements() -> dict:
+    """Read and parse ``base_model_requirements.json`` from the ``fair.schemas`` package resources."""
+    schemas = importlib.resources.files("fair.schemas")
+    raw = schemas.joinpath("base_model_requirements.json").read_text(encoding="utf-8")
+    return json.loads(raw)
+
+
+def _check_processing_fn(fn: object, path: str, required_fields: list[str], errors: list[str]) -> None:
+    """Append to *errors* one message per problem found in a processing-function object.
+
+    A non-dict *fn* yields a single "must be an object" message; otherwise one
+    "missing field" message is added for each name in *required_fields* that
+    is not a key of *fn*. *path* prefixes every message.
+    """
+    if isinstance(fn, dict):
+        errors.extend(f"{path} missing field: {name}" for name in required_fields if name not in fn)
+    else:
+        errors.append(f"{path} must be an object")
+
+
+def _check_io_entries(
+    entries: object,
+    prop: str,
+    required_fields: list[str],
+    processing_field: str,
+    proc_fields: list[str],
+    errors: list[str],
+) -> None:
+    """Check every entry of an ``mlm:input``/``mlm:output`` list, appending problems to *errors*."""
+    if not isinstance(entries, list):
+        # Nothing to iterate; absence or a wrong type is the concern of the property-level checks.
+        return
+    for i, entry in enumerate(entries):
+        if not isinstance(entry, dict):
+            # Guard first: ``field not in entry`` would raise or misbehave on scalars and strings.
+            errors.append(f"{prop}[{i}] must be an object")
+            continue
+        for field in required_fields:
+            if field not in entry:
+                errors.append(f"{prop}[{i}] missing: {field}")
+            elif field == processing_field:
+                _check_processing_fn(entry[field], f"{prop}[{i}].{field}", proc_fields, errors)
+
+
+def validate_base_model_item(item: pystac.Item) -> list[str]:
+    """Validate a base-model STAC item against fAIr requirements from base_model_requirements.json.
+
+    The checks, all driven by the requirements file, are: required STAC
+    extensions, required (non-null) properties, list properties that must be
+    non-empty lists, keywords against the keywords schema, allowed values,
+    required fields of each ``mlm:input``/``mlm:output`` entry (including the
+    pre/post-processing function objects), and required assets with their
+    required fields.
+
+    Malformed values (``null``, scalars where a list or object is expected,
+    unhashable entries) are reported as errors instead of raising.
+
+    Returns:
+        A list of human-readable error messages; empty when the item passes.
+        Offending values are listed in input order so output is reproducible.
+    """
+    reqs = _load_base_model_requirements()
+    kw_schema = _load_keywords_schema()
+    errors: list[str] = []
+
+    declared = set(item.stac_extensions)
+    errors.extend(f"Missing extension: {ext}" for ext in reqs["required_extensions"] if ext not in declared)
+
+    props = item.properties
+    # A property explicitly set to null counts as missing.
+    errors.extend(f"Missing property: {prop}" for prop in reqs["required_properties"] if props.get(prop) is None)
+
+    for prop in reqs["non_empty_list_properties"]:
+        val = props.get(prop)
+        if val is None:
+            # Absent values are left to the required-property check above.
+            continue
+        if not isinstance(val, list) or not val:
+            errors.append(f"Property must be non-empty list: {prop}")
+
+    allowed_kw = (
+        set(kw_schema["allowed_keywords"])
+        | set(kw_schema["allowed_tasks"])
+        | set(kw_schema.get("allowed_geometry_types", []))
+    )
+    keywords = props.get("keywords")
+    if isinstance(keywords, list):
+        # The isinstance test runs first so unhashable entries never reach the set lookup.
+        unknown_kw = [kw for kw in keywords if not (isinstance(kw, str) and kw in allowed_kw)]
+        if unknown_kw:
+            errors.append(f"Unknown keywords: {unknown_kw}")
+
+    for prop, allowed in reqs.get("allowed_values", {}).items():
+        val = props.get(prop)
+        if val is None:
+            continue
+        candidates = val if isinstance(val, list) else [val]
+        # List membership uses ==, so unhashable values are reported rather than raising TypeError.
+        invalid = [candidate for candidate in candidates if candidate not in allowed]
+        if invalid:
+            errors.append(f"Invalid {prop} values: {invalid}. Allowed: {allowed}")
+
+    proc_fields = reqs["processing_function_fields"]
+    _check_io_entries(
+        props.get("mlm:input"),
+        "mlm:input",
+        reqs["input_required_fields"],
+        "pre_processing_function",
+        proc_fields,
+        errors,
+    )
+    _check_io_entries(
+        props.get("mlm:output"),
+        "mlm:output",
+        reqs["output_required_fields"],
+        "post_processing_function",
+        proc_fields,
+        errors,
+    )
+
+    for asset_key, required_fields in reqs["required_assets"].items():
+        asset = item.assets.get(asset_key)
+        if asset is None:
+            errors.append(f"Missing asset: {asset_key}")
+            continue
+        errors.extend(
+            f"Asset '{asset_key}' missing field: {field}"
+            for field in required_fields
+            if field not in asset.extra_fields
+        )
+
+    return errors
+
+
 def validate_compatibility(
     base_model_item: pystac.Item,
     dataset_item: pystac.Item,
diff --git a/models/example_unet/Dockerfile.dockerignore b/models/example_unet/Dockerfile.dockerignore
@@ -0,0 +1,4 @@
+# BuildKit uses <Dockerfile>.dockerignore automatically (no repo-level side effects) 
+*
+!fair/
+!models/example_unet/
diff --git a/models/example_unet/stac-item.json b/models/example_unet/stac-item.json
@@ -49,7 +49,7 @@
         "mlm:tasks": [
             "semantic-segmentation"
         ],
-        "mlm:framework": "pytorch",
+        "mlm:framework": "PyTorch",
         "mlm:framework_version": "2.1.0",
         "mlm:pretrained": true,
         "mlm:pretrained_source": "OAM-TCD (NeurIPS 2024, arxiv.org/abs/2407.11743)",
@@ -58,6 +58,7 @@
             "semantic-segmentation",
             "polygon"
         ],
+        "license": "GPL-3.0-only",
         "version": "1",
         "mlm:input": [
             {
diff --git a/scripts/validate_stac_items.py b/scripts/validate_stac_items.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+"""Validate all models/*/stac-item.json against fAIr platform requirements."""
+
+import glob
+import sys
+
+import pystac
+
+from fair.stac.validators import validate_base_model_item
+
+
+def main() -> int:
+    """Validate every ``models/*/stac-item.json`` and return a process exit code.
+
+    Paths are resolved relative to the current working directory. Returns 0
+    when all items pass, and 1 when no item is found, an item cannot be
+    loaded, or an item fails validation.
+    """
+    paths = sorted(glob.glob("models/*/stac-item.json"))
+    if not paths:
+        print("No stac-item.json found under models/")
+        return 1
+
+    failed = False
+    for path in paths:
+        try:
+            item = pystac.Item.from_file(path)
+        except Exception as exc:  # noqa: BLE001 - top-level boundary: report and keep checking other items
+            # An unreadable or malformed file is a validation failure, not a reason to abort the whole run.
+            errors = [f"Could not load STAC item: {exc}"]
+        else:
+            errors = validate_base_model_item(item)
+
+        if errors:
+            failed = True
+            print(f"FAIL {path}")
+            for err in errors:
+                print(f"  {err}")
+        else:
+            print(f"OK   {path}")
+
+    return 1 if failed else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_validators.py b/tests/test_validators.py

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +# BuildKit uses <Dockerfile>.dockerignore automatically (no repo-level side effects)
 +*
 +!fair/
 +!models/example_unet/