diff --git a/CHANGELOG.md b/CHANGELOG.md index d9b591835..6f9b09915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Full release notes: -- Commit history: +- Commit history: + +## [5.3.1] - 2026-03-15 + +This release focuses on security issues related to corpus file loading. + +### Security + +- thai2fit: Use JSON model instead of pickle (#1325) +- Defensive corpus loading: validate fields before processing (#1327) +- w2p: Use npz model instead of pickle (#1328) ## [5.3.0] - 2026-03-10 @@ -202,6 +212,7 @@ The minimum requirement is now Python 3.9. - See +[5.3.1]: https://github.com/PyThaiNLP/pythainlp/compare/v5.3.0...v5.3.1 [5.3.0]: https://github.com/PyThaiNLP/pythainlp/compare/v5.2.0...v5.3.0 [5.2.0]: https://github.com/PyThaiNLP/pythainlp/compare/v5.1.2...v5.2.0 [5.1.2]: https://github.com/PyThaiNLP/pythainlp/compare/v5.1.1...v5.1.2 diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index cecec9d0a..f82ac0006 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -57,6 +57,7 @@ "correct", "is_offline_mode", "is_read_only_mode", + "is_unsafe_pickle_allowed", "pos_tag", "romanize", "spell", @@ -78,6 +79,10 @@ subword_tokenize, word_tokenize, ) -from pythainlp.tools.path import is_offline_mode, is_read_only_mode +from pythainlp.tools.path import ( + is_offline_mode, + is_read_only_mode, + is_unsafe_pickle_allowed, +) from pythainlp.transliterate import romanize, transliterate from pythainlp.util import collate, thai_strftime diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 075b1e62a..6bdfec631 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -319,7 +319,7 @@ def thai_dict() -> dict[str, list[str]]: for row in reader: word = row.get("word") meaning = row.get("meaning") - if not word or not word.strip() or not meaning or not meaning.strip(): + if not (word and word.strip() and meaning and meaning.strip()): warnings.warn( f"Skipping thai_dict entry with missing or empty field(s): {dict(row)!r}", UserWarning, @@ -406,7 +406,11 @@ def thai_synonyms() -> dict[str, Union[list[str], list[list[str]]]]: word = row.get("word") pos = row.get("pos") synonym = row.get("synonym") - if not word or not word.strip() or not pos or not pos.strip() or not synonym or not synonym.strip(): + if not ( + word and word.strip() + and pos and pos.strip() + and synonym and synonym.strip() + ): warnings.warn( f"Skipping thai_synonyms entry with missing or empty field(s): {dict(row)!r}", UserWarning, diff --git a/pythainlp/spell/words_spelling_correction.py b/pythainlp/spell/words_spelling_correction.py index 847d63fe1..c5f20c346 100644 --- a/pythainlp/spell/words_spelling_correction.py +++ b/pythainlp/spell/words_spelling_correction.py @@ -84,7 +84,9 @@ def _load_embeddings(self) -> tuple[list[str], NDArray[np.float32]]: """Loads embeddings matrix and vocabulary list.""" import numpy as np - input_matrix = np.load(os.path.join(self.model_dir, "embeddings.npy")) + input_matrix = np.load( + os.path.join(self.model_dir, "embeddings.npy"), allow_pickle=False + ) words = [] vocab_path = os.path.join(self.model_dir, "vocabulary.txt") with open(vocab_path, encoding="utf-8") as f: diff --git a/pythainlp/tag/_tag_perceptron.py b/pythainlp/tag/_tag_perceptron.py index 8712ffbda..ac96ecc78 100644 --- a/pythainlp/tag/_tag_perceptron.py +++ b/pythainlp/tag/_tag_perceptron.py @@ -166,8 +166,8 @@ def train( ``nr_iter`` controls the number of Perceptron training iterations. :param sentences: A list of (words, tags) tuples. - :param save_loc: If not ``None``, saves a pickled model in this \ - location. + :param save_loc: If not ``None``, saves the model as a JSON file in \ + this location. :param nr_iter: Number of training iterations. """ import random @@ -200,7 +200,7 @@ def train( random.shuffle(sentences_list) self.model.average_weights() - # save the model + # save the model as JSON if save_loc is not None: data: dict[str, Union[dict, list]] = {} data["weights"] = self.model.weights @@ -210,7 +210,7 @@ def train( json.dump(data, f, ensure_ascii=False) def load(self, loc: str) -> None: - """Load a pickled model. + """Load a saved model from a JSON file. :param str loc: model path """ try: diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py index 8cbf06e7e..fcb8d662d 100644 --- a/pythainlp/tools/__init__.py +++ b/pythainlp/tools/__init__.py @@ -8,6 +8,7 @@ "get_pythainlp_path", "is_offline_mode", "is_read_only_mode", + "is_unsafe_pickle_allowed", "safe_print", "warn_deprecation", ] @@ -20,4 +21,5 @@ get_pythainlp_path, is_offline_mode, is_read_only_mode, + is_unsafe_pickle_allowed, ) diff --git a/pythainlp/tools/path.py b/pythainlp/tools/path.py index ee8b473b2..2bd954c7d 100644 --- a/pythainlp/tools/path.py +++ b/pythainlp/tools/path.py @@ -102,6 +102,22 @@ def is_read_only_mode() -> bool: return False +def is_unsafe_pickle_allowed() -> bool: + """Return whether loading legacy pickle-based corpus files is allowed. + + Pickle deserialisation can execute arbitrary code if the file has been + tampered with, so it is **disabled by default**. + Set the ``PYTHAINLP_ALLOW_UNSAFE_PICKLE`` environment variable to + a truthy value (e.g. ``"1"``) only when you trust the corpus file and + understand the risk. + + :return: ``True`` if legacy pickle loading is allowed, ``False`` otherwise. + :rtype: bool + """ + val = os.getenv("PYTHAINLP_ALLOW_UNSAFE_PICKLE", "") + return val.strip().lower() in ("1", "true", "yes", "on") + + def is_offline_mode() -> bool: """Return whether PyThaiNLP is operating in offline mode. diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index c863f5b35..d2bb31f09 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -22,7 +22,7 @@ "-พจใงต้ืฮแาฐฒฤูศฅถฺฎหคสุขเึดฟำฝยลอ็ม" + " ณิฑชฉซทรํฬฏ–ัฃวก่ปผ์ฆบี๊ธฌญะไษ๋นโภ?" ) -_MODEL_NAME: str = "thai_w2p" +_MODEL_NAME: str = "thai_w2p_npz" class _Hparams: @@ -40,9 +40,9 @@ class _Hparams: hp: _Hparams = _Hparams() -def _load_vocab() -> tuple[ - dict[str, int], dict[int, str], dict[str, int], dict[int, str] -]: +def _load_vocab() -> ( + tuple[dict[str, int], dict[int, str], dict[str, int], dict[int, str]] +): g2idx = {g: idx for idx, g in enumerate(hp.graphemes)} idx2g = dict(enumerate(hp.graphemes)) @@ -60,7 +60,6 @@ class Thai_W2P: p2idx: dict[str, int] idx2p: dict[int, str] checkpoint: Optional[str] - variables: "NDArray" enc_emb: "NDArray" enc_w_ih: "NDArray" enc_w_hh: "NDArray" @@ -84,9 +83,7 @@ def __init__(self) -> None: self.p2idx: dict[str, int] self.idx2p: dict[int, str] self.g2idx, self.idx2g, self.p2idx, self.idx2p = _load_vocab() - self.checkpoint: Optional[str] = get_corpus_path( - _MODEL_NAME, version="0.2" - ) + self.checkpoint: Optional[str] = get_corpus_path(_MODEL_NAME, version="0.2") if not self.checkpoint: raise FileNotFoundError( f"corpus-not-found name={_MODEL_NAME!r}\n" @@ -98,55 +95,54 @@ def __init__(self) -> None: def _load_variables(self) -> None: import numpy as np - if self.checkpoint is None: raise RuntimeError("checkpoint path is not set") - self.variables: "NDArray" = np.load(self.checkpoint, allow_pickle=True) - # (29, 64). (len(graphemes), emb) - self.enc_emb: "NDArray" = self.variables.item().get( - "encoder.emb.weight" - ) - # (3*128, 64) - self.enc_w_ih: "NDArray" = self.variables.item().get( - "encoder.rnn.weight_ih_l0" - ) - # (3*128, 128) - self.enc_w_hh: "NDArray" = self.variables.item().get( - "encoder.rnn.weight_hh_l0" - ) - # (3*128,) - self.enc_b_ih: "NDArray" = self.variables.item().get( - "encoder.rnn.bias_ih_l0" - ) - # (3*128,) - self.enc_b_hh: "NDArray" = self.variables.item().get( - "encoder.rnn.bias_hh_l0" - ) - - # (74, 64). (len(phonemes), emb) - self.dec_emb: "NDArray" = self.variables.item().get( - "decoder.emb.weight" - ) - # (3*128, 64) - self.dec_w_ih: "NDArray" = self.variables.item().get( - "decoder.rnn.weight_ih_l0" - ) - # (3*128, 128) - self.dec_w_hh: "NDArray" = self.variables.item().get( - "decoder.rnn.weight_hh_l0" - ) - # (3*128,) - self.dec_b_ih: "NDArray" = self.variables.item().get( - "decoder.rnn.bias_ih_l0" - ) - # (3*128,) - self.dec_b_hh: "NDArray" = self.variables.item().get( - "decoder.rnn.bias_hh_l0" - ) - # (74, 128) - self.fc_w: "NDArray" = self.variables.item().get("decoder.fc.weight") - # (74,) - self.fc_b: "NDArray" = self.variables.item().get("decoder.fc.bias") + with np.load(self.checkpoint, allow_pickle=False) as variables: + # (29, 64). (len(graphemes), emb) + self.enc_emb: "NDArray" = variables[ + "encoder_emb_weight" + ] + # (3*128, 64) + self.enc_w_ih: "NDArray" = variables[ + "encoder_rnn_weight_ih_l0" + ] + # (3*128, 128) + self.enc_w_hh: "NDArray" = variables[ + "encoder_rnn_weight_hh_l0" + ] + # (3*128,) + self.enc_b_ih: "NDArray" = variables[ + "encoder_rnn_bias_ih_l0" + ] + # (3*128,) + self.enc_b_hh: "NDArray" = variables[ + "encoder_rnn_bias_hh_l0" + ] + + # (74, 64). (len(phonemes), emb) + self.dec_emb: "NDArray" = variables[ + "decoder_emb_weight" + ] + # (3*128, 64) + self.dec_w_ih: "NDArray" = variables[ + "decoder_rnn_weight_ih_l0" + ] + # (3*128, 128) + self.dec_w_hh: "NDArray" = variables[ + "decoder_rnn_weight_hh_l0" + ] + # (3*128,) + self.dec_b_ih: "NDArray" = variables[ + "decoder_rnn_bias_ih_l0" + ] + # (3*128,) + self.dec_b_hh: "NDArray" = variables[ + "decoder_rnn_bias_hh_l0" + ] + # (74, 128) + self.fc_w: "NDArray" = variables["decoder_fc_weight"] + # (74,) + self.fc_b: "NDArray" = variables["decoder_fc_bias"] def _sigmoid(self, x: "np.ndarray") -> "np.ndarray": import numpy as np diff --git a/tests/core/test_tools.py b/tests/core/test_tools.py index 78fe28eec..a87751d7d 100644 --- a/tests/core/test_tools.py +++ b/tests/core/test_tools.py @@ -8,7 +8,11 @@ import warnings from unittest.mock import patch -from pythainlp import is_offline_mode, is_read_only_mode +from pythainlp import ( + is_offline_mode, + is_read_only_mode, + is_unsafe_pickle_allowed, +) from pythainlp.tools import ( get_full_data_path, get_pythainlp_data_path, @@ -26,9 +30,7 @@ class ToolsTestCase(unittest.TestCase): def test_path(self): data_filename = "ttc_freq.txt" - self.assertTrue( - get_full_data_path(data_filename).endswith(data_filename) - ) + self.assertTrue(get_full_data_path(data_filename).endswith(data_filename)) self.assertIsInstance(get_pythainlp_data_path(), str) self.assertIsInstance(get_pythainlp_path(), str) @@ -272,3 +274,23 @@ def test_get_pythainlp_data_path_no_makedirs_in_read_only(self): "Data directory should not be created in read-only mode", ) + def test_is_unsafe_pickle_allowed(self): + """Test is_unsafe_pickle_allowed() reflects PYTHAINLP_ALLOW_UNSAFE_PICKLE env var.""" + # Truthy values + for truthy in ("1", "true", "True", "TRUE", "yes", "YES", "on", "ON"): + with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": truthy}): + self.assertTrue( + is_unsafe_pickle_allowed(), + f"Expected True for PYTHAINLP_ALLOW_UNSAFE_PICKLE={truthy!r}", + ) + # Falsy values + for falsy in ("", "0", "false", "False", "FALSE", "no", "NO", "off", "OFF"): + with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": falsy}): + self.assertFalse( + is_unsafe_pickle_allowed(), + f"Expected False for PYTHAINLP_ALLOW_UNSAFE_PICKLE={falsy!r}", + ) + # Unset: should default to False + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("PYTHAINLP_ALLOW_UNSAFE_PICKLE", None) + self.assertFalse(is_unsafe_pickle_allowed())