Skip to content

Commit 6363c61

Browse files
authored
Merge pull request #1316 from PyThaiNLP/copilot/fix-duplicate-word-warning
fix: suppress gensim duplicate-word warnings when loading word2vec binary files
2 parents 1b4a014 + d0dbd6d commit 6363c61

File tree

3 files changed

+44
-12
lines changed

3 files changed

+44
-12
lines changed

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ The minimum requirement is now Python 3.9.
5757
### Deprecated
5858

5959
- `PYTHAINLP_DATA_DIR` env var; use `PYTHAINLP_DATA` instead
60-
(follows `NLTK_DATA` convention from NLTK) (#1306)
60+
(follows `NLTK_DATA` convention from NLTK)
61+
`PYTHAINLP_DATA_DIR` will be removed in a future version (#1306)
6162

6263
### Removed
6364

@@ -72,6 +73,8 @@ The minimum requirement is now Python 3.9.
7273
- Consonant cluster boundary issue in `royin` romanization (#1172)
7374
- Final consonant classification in `check_marttra()` (#1173)
7475
- Kho Khon alphabet issue in `tltk` transliteration (#1187)
76+
- Suppress Gensim duplicate-word warnings when loading word2vec
77+
binary files (#1316)
7578

7679
## [5.2.0] - 2025-12-20
7780

pythainlp/augment/word2vec/core.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,20 @@
44
from __future__ import annotations
55

66
import itertools
7+
import logging
78
from typing import TYPE_CHECKING, Callable, Union
89

910
if TYPE_CHECKING:
1011
from gensim.models.keyedvectors import KeyedVectors
1112

1213

14+
class _DuplicateWordFilter(logging.Filter):
15+
"""Suppress gensim's 'duplicate word' warnings for word2vec files."""
16+
17+
def filter(self, record: logging.LogRecord) -> bool:
18+
return "duplicate word" not in record.getMessage()
19+
20+
1321
class Word2VecAug:
1422
tokenizer: Callable[[str], list[str]]
1523
model: "KeyedVectors"
@@ -28,12 +36,19 @@ def __init__(
2836
import gensim.models.keyedvectors as word2vec
2937

3038
self.tokenizer: Callable[[str], list[str]] = tokenize
31-
if type == "file":
32-
self.model = word2vec.KeyedVectors.load_word2vec_format(model)
33-
elif type == "binary":
34-
self.model = word2vec.KeyedVectors.load_word2vec_format(
35-
model, binary=True, unicode_errors="ignore"
36-
)
39+
if type in ("file", "binary"):
40+
_filter = _DuplicateWordFilter()
41+
_gensim_kv_logger = logging.getLogger("gensim.models.keyedvectors")
42+
_gensim_kv_logger.addFilter(_filter)
43+
try:
44+
if type == "file":
45+
self.model = word2vec.KeyedVectors.load_word2vec_format(model)
46+
else:
47+
self.model = word2vec.KeyedVectors.load_word2vec_format(
48+
model, binary=True, unicode_errors="ignore"
49+
)
50+
finally:
51+
_gensim_kv_logger.removeFilter(_filter)
3752
else:
3853
self.model = model
3954
self.dict_wv: list[str] = list(self.model.key_to_index.keys())

pythainlp/word_vector/core.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44
from __future__ import annotations
55

6+
import logging
67
from collections.abc import Callable
78
from typing import TYPE_CHECKING
89

@@ -21,6 +22,13 @@
2122
_TK_EOL: str = "xxeol"
2223

2324

25+
class _DuplicateWordFilter(logging.Filter):
26+
"""Suppress gensim's 'duplicate word' warnings for word2vec files."""
27+
28+
def filter(self, record: logging.LogRecord) -> bool:
29+
return "duplicate word" not in record.getMessage()
30+
31+
2432
class WordVector:
2533
"""Word Vector class
2634
@@ -66,11 +74,17 @@ def load_wordvector(self, model_name: str) -> None:
6674
f" Python: pythainlp.corpus.download('{model_name}')\n"
6775
f" CLI: thainlp data get {model_name}"
6876
)
69-
self.model = KeyedVectors.load_word2vec_format(
70-
corpus_file,
71-
binary=True,
72-
unicode_errors="ignore",
73-
)
77+
_filter = _DuplicateWordFilter()
78+
_gensim_kv_logger = logging.getLogger("gensim.models.keyedvectors")
79+
_gensim_kv_logger.addFilter(_filter)
80+
try:
81+
self.model = KeyedVectors.load_word2vec_format(
82+
corpus_file,
83+
binary=True,
84+
unicode_errors="ignore",
85+
)
86+
finally:
87+
_gensim_kv_logger.removeFilter(_filter)
7488
self.WV_DIM = self.model.vector_size
7589

7690
if self.model_name == "thai2fit_wv":

0 commit comments

Comments
 (0)