Skip to content

Commit 03e5f2b

Browse files
committed
test: add no-network and install profile gates
1 parent 8bdd56e commit 03e5f2b

8 files changed

Lines changed: 412 additions & 20 deletions

File tree

.github/workflows/ci.yml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,47 @@ jobs:
154154
flags: ${{ matrix.install-profile }}-py${{ matrix.python-version }}
155155
token: ${{ secrets.CODECOV_TOKEN }}
156156

157+
profile-smoke:
158+
runs-on: ubuntu-latest
159+
strategy:
160+
fail-fast: false
161+
matrix:
162+
install-profile:
163+
- core
164+
- cli
165+
- nlp
166+
- nlp-advanced
167+
- ocr
168+
- distributed
169+
- web
170+
steps:
171+
- uses: actions/checkout@v4
172+
- name: Set up Python
173+
uses: actions/setup-python@v5
174+
with:
175+
python-version: "3.11"
176+
cache: "pip"
177+
178+
- name: Upgrade pip
179+
run: |
180+
python -m pip install --upgrade pip
181+
182+
- name: Install dependencies (core)
183+
if: matrix.install-profile == 'core'
184+
run: |
185+
pip install -e ".[test]"
186+
187+
- name: Install dependencies (profile)
188+
if: matrix.install-profile != 'core'
189+
run: |
190+
pip install -e ".[test,${{ matrix.install-profile }}]"
191+
192+
- name: Run install profile smoke test
193+
env:
194+
DATAFOG_INSTALL_PROFILE: ${{ matrix.install-profile }}
195+
run: |
196+
pytest tests/test_install_profiles.py -q
197+
157198
wheel-size:
158199
runs-on: ubuntu-latest
159200
steps:

datafog/engine.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -171,17 +171,13 @@ def _gliner_entities(text: str) -> list[Entity]:
171171
def _get_spacy_annotator():
172172
try:
173173
from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
174-
except ImportError:
175-
return _UnavailableAnnotator(
176-
"SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]"
177-
)
174+
except ImportError as exc:
175+
return _UnavailableAnnotator(str(exc))
178176

179177
try:
180178
return SpacyPIIAnnotator.create()
181-
except ImportError:
182-
return _UnavailableAnnotator(
183-
"SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]"
184-
)
179+
except ImportError as exc:
180+
return _UnavailableAnnotator(str(exc))
185181
except Exception as exc:
186182
return _UnavailableAnnotator(
187183
f"SpaCy engine initialization failed: {type(exc).__name__}: {exc}"
@@ -192,19 +188,13 @@ def _get_spacy_annotator():
192188
def _get_gliner_annotator():
193189
try:
194190
from .processing.text_processing.gliner_annotator import GLiNERAnnotator
195-
except ImportError:
196-
return _UnavailableAnnotator(
197-
"GLiNER engine requires the nlp-advanced extra. "
198-
"Install with: pip install datafog[nlp-advanced]"
199-
)
191+
except ImportError as exc:
192+
return _UnavailableAnnotator(str(exc))
200193

201194
try:
202195
annotator = GLiNERAnnotator.create()
203-
except ImportError:
204-
return _UnavailableAnnotator(
205-
"GLiNER engine requires the nlp-advanced extra. "
206-
"Install with: pip install datafog[nlp-advanced]"
207-
)
196+
except ImportError as exc:
197+
return _UnavailableAnnotator(str(exc))
208198
except Exception as exc:
209199
return _UnavailableAnnotator(
210200
f"GLiNER engine initialization failed: {type(exc).__name__}: {exc}"

datafog/models/spacy_nlp.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from uuid import uuid4
1010

1111
import spacy
12-
from rich.progress import track
1312

1413
from .annotator import AnnotationResult, AnnotatorRequest
1514

@@ -53,7 +52,7 @@ def annotate_text(self, text: str, language: str = "en") -> List[AnnotationResul
5352
)
5453
doc = self.nlp(annotator_request.text)
5554
results = []
56-
for ent in track(doc.ents, description="Processing entities"):
55+
for ent in doc.ents:
5756
result = AnnotationResult(
5857
start=ent.start_char,
5958
end=ent.end_char,

docs/v5-model-requirements.md

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
# v5 Model Selection Requirements
2+
3+
This sheet defines requirements for revisiting DataFog's optional model stack before
4+
locking the v5 core API around specific NLP/OCR backends. It is intentionally a
5+
requirements document, not a model recommendation list.
6+
7+
## Decision Goals
8+
9+
- Pick models that improve adoption by making the first successful result easy,
10+
trustworthy, and local by default.
11+
- Keep the core SDK fast and lightweight; model-backed engines remain optional.
12+
- Make model behavior explicit enough that users can defend it in privacy,
13+
security, and compliance reviews.
14+
- Preserve a clean path for future backend swaps without breaking the top-level
15+
v5 API.
16+
17+
## Must-Haves
18+
19+
### Runtime And Packaging
20+
21+
- No model downloads during import, install, or ordinary SDK calls.
22+
- All model downloads must be explicit CLI/API actions or user-provided local
23+
paths.
24+
- The core install must not require ML, OCR, Torch, TensorFlow, Java, Spark, or
25+
system OCR binaries.
26+
- Optional extras must map cleanly to real imports:
27+
- `nlp` for lightweight NLP engines.
28+
- `nlp-advanced` for heavier ML NER engines.
29+
- `ocr` for local image/OCR processing.
30+
- `distributed` for Spark-style processing.
31+
- Missing dependency and missing model errors must explain the exact install or
32+
download command.
33+
- Python 3.10, 3.11, and 3.12 must be supported for advertised optional model
34+
profiles. Python 3.13 support should be advertised only after explicit profile
35+
validation.
36+
- Models must work in offline mode after explicit download/cache preparation.
37+
38+
### Privacy And Trust
39+
40+
- No network access during inference.
41+
- No telemetry, remote callbacks, model hub lookups, or license checks during
42+
inference.
43+
- No raw PII should be written to logs, cache names, telemetry, exceptions, or
44+
debug traces by default.
45+
- Model metadata exposed by DataFog should identify model name/version/source
46+
without storing detected raw PII.
47+
- Reversible workflows must be opt-in and clearly separated from ordinary
48+
redaction.
49+
50+
### Detection Contract
51+
52+
- Model outputs must include enough structure for the public result contract:
53+
entity type, text/span, start/end offsets, confidence when available, and
54+
engine/source.
55+
- Spans must be deterministic for the same model, text, and settings.
56+
- Entity labels must be mappable into DataFog's canonical entity taxonomy without
57+
surprising users.
58+
- Model-backed engines must compose with regex detection without duplicating or
59+
overwriting high-confidence structured entities.
60+
- Failure modes must be predictable: unsupported language, missing model, missing
61+
optional dependency, and low-confidence results should all be distinguishable.
62+
63+
### Quality Gates
64+
65+
- Candidate models must be benchmarked on DataFog's target corpora before
66+
adoption.
67+
- Benchmarks must include precision/recall by entity type, not only aggregate F1.
68+
- Structured PII such as email, phone, IP address, SSN, credit cards, dates, and
69+
ZIP/postal codes should remain regex/validator-first unless a model clearly
70+
improves quality.
71+
- NER-style entities such as person, organization, location, address, and
72+
domain-specific identifiers need regression tests with realistic app/log data.
73+
- OCR models must be evaluated separately for text extraction quality and PII
74+
extraction quality after OCR.
75+
76+
### Operational Fit
77+
78+
- CPU inference must be acceptable for the default advertised workflow.
79+
- GPU-only models are not acceptable as default engines.
80+
- Model size, cold-start time, memory use, and cache footprint must be measured.
81+
- The model must have a usable open license for commercial SDK users.
82+
- The model or provider must have credible maintenance signals and versioned
83+
artifacts.
84+
85+
## Nice-To-Haves
86+
87+
- Strong multilingual support with per-language quality reporting.
88+
- Quantized or small variants that keep local inference practical.
89+
- ONNX or other portable runtime support for future non-Torch deployments.
90+
- Streaming/chunked inference support or predictable behavior across chunk
91+
boundaries.
92+
- Custom entity hints or user-provided label sets.
93+
- Confidence calibration good enough to expose threshold controls.
94+
- Batch inference APIs for logs, CSV, and JSONL workflows.
95+
- Clear model cards with training data notes, limitations, and intended use.
96+
- Support for local cache directories that can be controlled by environment
97+
variable or explicit config.
98+
- Graceful operation on Apple Silicon and common Linux CI runners.
99+
100+
## Disqualifiers
101+
102+
- Requires network access for inference.
103+
- Downloads weights implicitly from ordinary SDK calls.
104+
- License is unclear, non-commercial, or incompatible with SDK distribution.
105+
- Requires a hosted API for core value.
106+
- Requires GPU for reasonable first-use behavior.
107+
- Cannot return stable spans or forces only label-level output.
108+
- Emits raw text or entities through logging, telemetry, or callbacks.
109+
- Adds heavyweight dependencies to the core install.
110+
- Breaks Python version support we already advertise.
111+
112+
## Evaluation Matrix
113+
114+
Each candidate backend should be scored before adoption:
115+
116+
| Area | Required Evidence |
117+
| --- | --- |
118+
| Install footprint | Extra name, package deps, wheel size impact, system deps |
119+
| Runtime footprint | Cold start, warm latency, memory, CPU/GPU requirements |
120+
| Offline behavior | Explicit download path, local cache path, no-network test |
121+
| Quality | Precision/recall by entity type on DataFog corpora |
122+
| Span quality | Offset correctness and deduplication behavior |
123+
| Privacy | No raw PII logs/cache/telemetry, safe error messages |
124+
| Licensing | Model license, dependency licenses, commercial use notes |
125+
| Maintenance | Release cadence, Python compatibility, issue activity |
126+
| API fit | Entity taxonomy mapping, confidence support, batch/chunk support |
127+
| Docs fit | Model card, limitations, user-facing setup instructions |
128+
129+
## Candidate Backend Categories To Evaluate
130+
131+
- Regex plus validators for structured PII and secrets.
132+
- Lightweight NLP NER for person, organization, location, and address entities.
133+
- Advanced local NER models for broader entity coverage and multilingual support.
134+
- OCR text extraction engines for local images/PDF-derived images.
135+
- Document understanding models only if they outperform the OCR-plus-text-PII
  pipeline by a margin large enough to justify their footprint.
- User-provided backend hooks for teams that already have a preferred model.
138+
139+
## Recommended Selection Policy
140+
141+
- Default v5 behavior should remain regex/validator-first.
142+
- Model-backed engines should be opt-in by engine, policy, or extra.
143+
- DataFog should prefer smaller, reliable local models over maximum leaderboard
144+
scores if they improve install success and first-use latency.
145+
- Model choices should be version-pinned in docs and CI once advertised.
146+
- A model can be experimental in docs/examples before it becomes part of the
147+
supported contract.
148+
149+
## Open Questions
150+
151+
- Do we want one recommended advanced NER model, or a pluggable registry with a
152+
default?
153+
- Should OCR stay Tesseract-first, or should v5 introduce a newer local OCR
154+
default after benchmarking?
155+
- How much multilingual quality is required for v5.0.0 versus a later release?
156+
- Should Python 3.13 optional-profile support be a v4.5 compatibility release,
157+
a v5 launch requirement, or both?
158+
- What maximum model download size is acceptable for the default recommended
159+
advanced profile?

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
]
3737

3838
ocr_deps = [
39+
"numpy>=1.24.0",
3940
"pytesseract>=0.3.0",
4041
"Pillow>=12.2.0",
4142
"sentencepiece>=0.2.0",
@@ -51,6 +52,7 @@
5152
web_deps = [
5253
"fastapi>=0.100.0",
5354
"aiohttp>=3.13.4",
55+
"certifi>=2025.4.26",
5456
"requests>=2.33.0",
5557
]
5658

tests/test_install_profiles.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import os

import pytest

# These smoke tests exercise a specific optional-install profile, so they are
# meaningful only inside the matching CI job. Local/default runs skip the
# whole module unless DATAFOG_INSTALL_PROFILE is exported.
_ACTIVE_PROFILE = os.environ.get("DATAFOG_INSTALL_PROFILE")

pytestmark = pytest.mark.skipif(
    not _ACTIVE_PROFILE,
    reason="install profile smoke tests run only in profile-specific CI jobs",
)
9+
10+
11+
def _check_core() -> None:
    # Core profile: the top-level package alone must scan and redact.
    import datafog

    assert datafog.scan("Email jane@example.com").entities
    assert datafog.redact("Email jane@example.com").redacted_text


def _check_cli() -> None:
    # CLI profile: the client app object must be importable.
    from datafog.client import app

    assert app is not None


def _check_nlp() -> None:
    # nlp profile: spaCy plus the spaCy-backed annotators must import.
    import spacy  # noqa: F401

    from datafog.models.spacy_nlp import SpacyAnnotator
    from datafog.processing.text_processing.spacy_pii_annotator import (
        SpacyPIIAnnotator,
    )

    assert SpacyAnnotator is not None
    assert SpacyPIIAnnotator is not None


def _check_nlp_advanced() -> None:
    # nlp-advanced profile: heavy ML deps plus the GLiNER annotator must import.
    import gliner  # noqa: F401
    import torch  # noqa: F401
    import transformers  # noqa: F401

    from datafog.processing.text_processing.gliner_annotator import GLiNERAnnotator

    assert GLiNERAnnotator is not None


def _check_ocr() -> None:
    # ocr profile: image deps plus the OCR processors/service must import.
    import numpy  # noqa: F401
    import pytesseract  # noqa: F401
    from PIL import Image  # noqa: F401

    from datafog.processing.image_processing.donut_processor import DonutProcessor
    from datafog.processing.image_processing.pytesseract_processor import (
        PytesseractProcessor,
    )
    from datafog.services.image_service import ImageService

    assert DonutProcessor is not None
    assert ImageService is not None
    assert PytesseractProcessor is not None


def _check_distributed() -> None:
    # distributed profile: Spark helpers must import and pyspark must be present.
    from datafog.processing.spark_processing import pyspark_udfs
    from datafog.services.spark_service import SparkService

    pyspark_udfs.ensure_installed("pyspark")
    assert SparkService is not None


def _check_web() -> None:
    # web profile: HTTP-stack dependencies must import.
    import aiohttp  # noqa: F401
    import certifi  # noqa: F401
    import fastapi  # noqa: F401
    import requests  # noqa: F401


# Maps each DATAFOG_INSTALL_PROFILE value to its import-surface check.
_PROFILE_CHECKS = {
    "core": _check_core,
    "cli": _check_cli,
    "nlp": _check_nlp,
    "nlp-advanced": _check_nlp_advanced,
    "ocr": _check_ocr,
    "distributed": _check_distributed,
    "web": _check_web,
}


def test_install_profile_import_surface() -> None:
    """Verify the import surface advertised by the active install profile.

    The profile name is taken from DATAFOG_INSTALL_PROFILE (required; the
    module-level skipif guarantees it is set when this test runs). An
    unrecognized profile name fails the test outright.
    """
    profile = os.environ["DATAFOG_INSTALL_PROFILE"]
    check = _PROFILE_CHECKS.get(profile)
    if check is None:
        raise AssertionError(f"unknown DATAFOG_INSTALL_PROFILE: {profile}")
    check()

0 commit comments

Comments
 (0)