Skip to content

Commit b071193

Browse files
committed
fix: preserve explicit spaCy model behavior
1 parent 90fdd08 commit b071193

9 files changed

Lines changed: 13 additions & 15 deletions

File tree

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -60,13 +60,13 @@ jobs:
6060
if: matrix.install-profile == 'nlp'
6161
run: |
6262
pip install -e ".[test,cli,nlp]" -r requirements-test.txt
63-
python -m spacy download en_core_web_sm
63+
python -m spacy download en_core_web_lg
6464
6565
- name: Install dependencies (nlp-advanced)
6666
if: matrix.install-profile == 'nlp-advanced'
6767
run: |
6868
pip install -e ".[test,cli,nlp,nlp-advanced]" -r requirements-test.txt
69-
python -m spacy download en_core_web_sm
69+
python -m spacy download en_core_web_lg
7070
datafog download-model urchade/gliner_multi_pii-v1 --engine gliner
7171
7272
- name: Run tests (core)

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,7 @@ jobs:
128128
python -m pip install --upgrade pip
129129
pip install -e ".[all,test]"
130130
pip install -r requirements-test.txt
131-
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
131+
python -m spacy download en_core_web_lg
132132
datafog download-model urchade/gliner_multi_pii-v1 --engine gliner
133133
134134
- name: Run tests with segfault protection

datafog/client.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -181,7 +181,7 @@ def download_model(
181181
Download a model for specified engine.
182182
183183
Examples:
184-
spaCy: datafog download-model en_core_web_sm --engine spacy
184+
spaCy: datafog download-model en_core_web_lg --engine spacy
185185
GLiNER: datafog download-model urchade/gliner_multi_pii-v1 --engine gliner
186186
"""
187187
if engine == "spacy":

datafog/models/spacy_nlp.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313

1414
from .annotator import AnnotationResult, AnnotatorRequest
1515

16-
DEFAULT_SPACY_MODEL = "en_core_web_sm"
16+
DEFAULT_SPACY_MODEL = "en_core_web_lg"
1717

1818

1919
class SpacyAnnotator:
@@ -33,7 +33,7 @@ def load_model(self):
3333
self.nlp = spacy.load(self.model_name)
3434
except OSError as exc:
3535
raise ImportError(
36-
f"spaCy model '{self.model_name}' is not installed. "
36+
f"spaCy model {self.model_name!r} is not installed. "
3737
f"Download it explicitly with: datafog download-model {self.model_name} --engine spacy"
3838
) from exc
3939

@@ -83,7 +83,7 @@ def list_entities(model_name: str = DEFAULT_SPACY_MODEL) -> List[str]:
8383
nlp = spacy.load(model_name)
8484
except OSError as exc:
8585
raise ImportError(
86-
f"spaCy model '{model_name}' is not installed. "
86+
f"spaCy model {model_name!r} is not installed. "
8787
f"Download it explicitly with: datafog download-model {model_name} --engine spacy"
8888
) from exc
8989
return [ent for ent in nlp.pipe_labels["ner"]]

datafog/processing/image_processing/donut_processor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@ async def extract_text_from_image(self, image: "Image.Image") -> str:
110110
)
111111
except OSError as exc:
112112
raise RuntimeError(
113-
f"Donut model '{self.model_path}' is not available locally. "
113+
f"Donut model {self.model_path!r} is not available locally. "
114114
"Download it explicitly before using Donut OCR, or pass a local "
115115
"model path."
116116
) from exc

datafog/processing/spark_processing/pyspark_udfs.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111

1212
PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
1313
MAXIMAL_STRING_SIZE = 1000000
14-
DEFAULT_SPACY_MODEL = "en_core_web_sm"
14+
DEFAULT_SPACY_MODEL = "en_core_web_lg"
1515

1616

1717
def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:

datafog/processing/text_processing/gliner_annotator.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -87,7 +87,7 @@ def create(
8787
except Exception as e:
8888
logging.error(f"Failed to load GLiNER model {model_name}: {str(e)}")
8989
raise RuntimeError(
90-
f"GLiNER model '{model_name}' is not available locally. "
90+
f"GLiNER model {model_name!r} is not available locally. "
9191
"Download it explicitly with: "
9292
f"datafog download-model {model_name} --engine gliner"
9393
) from e

datafog/processing/text_processing/spacy_pii_annotator.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
"WORK_OF_ART",
2525
]
2626
MAXIMAL_STRING_SIZE = 1000000
27-
DEFAULT_SPACY_MODEL = "en_core_web_sm"
27+
DEFAULT_SPACY_MODEL = "en_core_web_lg"
2828

2929

3030
class SpacyPIIAnnotator(BaseModel):
@@ -47,7 +47,7 @@ def create(cls, model_name: str = DEFAULT_SPACY_MODEL) -> "SpacyPIIAnnotator":
4747
nlp = spacy.load(model_name)
4848
except OSError as exc:
4949
raise ImportError(
50-
f"spaCy model '{model_name}' is not installed. "
50+
f"spaCy model {model_name!r} is not installed. "
5151
f"Download it explicitly with: datafog download-model {model_name} --engine spacy"
5252
) from exc
5353

tests/test_runtime_dependency_safety.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -33,9 +33,7 @@ def load(_model_name):
3333

3434
monkeypatch.setitem(sys.modules, "spacy", FakeSpacy())
3535

36-
from datafog.processing.text_processing.spacy_pii_annotator import (
37-
SpacyPIIAnnotator,
38-
)
36+
from datafog.processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
3937

4038
with pytest.raises(ImportError, match="Download it explicitly"):
4139
SpacyPIIAnnotator.create()

0 commit comments

Comments
 (0)