From 2c199662bece3b50c96a47e208341da331dcf5a2 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 14 Mar 2026 05:00:56 +0700 Subject: [PATCH 01/27] Update np.load to disallow allow_pickle Change np.load to disallow pickling for security. --- pythainlp/transliterate/w2p.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index c863f5b35..2e5165330 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -101,7 +101,7 @@ def _load_variables(self) -> None: if self.checkpoint is None: raise RuntimeError("checkpoint path is not set") - self.variables: "NDArray" = np.load(self.checkpoint, allow_pickle=True) + self.variables: "NDArray" = np.load(self.checkpoint, allow_pickle=False) # (29, 64). (len(graphemes), emb) self.enc_emb: "NDArray" = self.variables.item().get( "encoder.emb.weight" From c5e7b87181abc051918a0311380a16e01b9b9135 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 05:09:40 +0700 Subject: [PATCH 02/27] Fix np.load allow_pickle=False to work correctly with .npz format (#1329) * Initial plan * Fix np.load allow_pickle=False to work with .npz NpzFile format - Replace .item().get(key) with [key] dict-style access on NpzFile - Remove variables instance attribute; use local variable instead - Add type annotation for variables local var as np.lib.npyio.NpzFile - Add allow_pickle=False to embeddings.npy load in words_spelling_correction.py Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com> --- pythainlp/spell/words_spelling_correction.py | 4 +- pythainlp/transliterate/w2p.py | 50 ++++++-------------- 2 files changed, 18 insertions(+), 36 deletions(-) diff --git 
a/pythainlp/spell/words_spelling_correction.py b/pythainlp/spell/words_spelling_correction.py index 847d63fe1..c5f20c346 100644 --- a/pythainlp/spell/words_spelling_correction.py +++ b/pythainlp/spell/words_spelling_correction.py @@ -84,7 +84,9 @@ def _load_embeddings(self) -> tuple[list[str], NDArray[np.float32]]: """Loads embeddings matrix and vocabulary list.""" import numpy as np - input_matrix = np.load(os.path.join(self.model_dir, "embeddings.npy")) + input_matrix = np.load( + os.path.join(self.model_dir, "embeddings.npy"), allow_pickle=False + ) words = [] vocab_path = os.path.join(self.model_dir, "vocabulary.txt") with open(vocab_path, encoding="utf-8") as f: diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 2e5165330..7f52d331e 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -60,7 +60,6 @@ class Thai_W2P: p2idx: dict[str, int] idx2p: dict[int, str] checkpoint: Optional[str] - variables: "NDArray" enc_emb: "NDArray" enc_w_ih: "NDArray" enc_w_hh: "NDArray" @@ -101,52 +100,33 @@ def _load_variables(self) -> None: if self.checkpoint is None: raise RuntimeError("checkpoint path is not set") - self.variables: "NDArray" = np.load(self.checkpoint, allow_pickle=False) - # (29, 64). (len(graphemes), emb) - self.enc_emb: "NDArray" = self.variables.item().get( - "encoder.emb.weight" + variables: "np.lib.npyio.NpzFile" = np.load( + self.checkpoint, allow_pickle=False ) + # (29, 64). 
(len(graphemes), emb) + self.enc_emb: "NDArray" = variables["encoder.emb.weight"] # (3*128, 64) - self.enc_w_ih: "NDArray" = self.variables.item().get( - "encoder.rnn.weight_ih_l0" - ) + self.enc_w_ih: "NDArray" = variables["encoder.rnn.weight_ih_l0"] # (3*128, 128) - self.enc_w_hh: "NDArray" = self.variables.item().get( - "encoder.rnn.weight_hh_l0" - ) + self.enc_w_hh: "NDArray" = variables["encoder.rnn.weight_hh_l0"] # (3*128,) - self.enc_b_ih: "NDArray" = self.variables.item().get( - "encoder.rnn.bias_ih_l0" - ) + self.enc_b_ih: "NDArray" = variables["encoder.rnn.bias_ih_l0"] # (3*128,) - self.enc_b_hh: "NDArray" = self.variables.item().get( - "encoder.rnn.bias_hh_l0" - ) - + self.enc_b_hh: "NDArray" = variables["encoder.rnn.bias_hh_l0"] # (74, 64). (len(phonemes), emb) - self.dec_emb: "NDArray" = self.variables.item().get( - "decoder.emb.weight" - ) + self.dec_emb: "NDArray" = variables["decoder.emb.weight"] # (3*128, 64) - self.dec_w_ih: "NDArray" = self.variables.item().get( - "decoder.rnn.weight_ih_l0" - ) + self.dec_w_ih: "NDArray" = variables["decoder.rnn.weight_ih_l0"] # (3*128, 128) - self.dec_w_hh: "NDArray" = self.variables.item().get( - "decoder.rnn.weight_hh_l0" - ) + self.dec_w_hh: "NDArray" = variables["decoder.rnn.weight_hh_l0"] # (3*128,) - self.dec_b_ih: "NDArray" = self.variables.item().get( - "decoder.rnn.bias_ih_l0" - ) + self.dec_b_ih: "NDArray" = variables["decoder.rnn.bias_ih_l0"] # (3*128,) - self.dec_b_hh: "NDArray" = self.variables.item().get( - "decoder.rnn.bias_hh_l0" - ) + self.dec_b_hh: "NDArray" = variables["decoder.rnn.bias_hh_l0"] # (74, 128) - self.fc_w: "NDArray" = self.variables.item().get("decoder.fc.weight") + self.fc_w: "NDArray" = variables["decoder.fc.weight"] # (74,) - self.fc_b: "NDArray" = self.variables.item().get("decoder.fc.bias") + self.fc_b: "NDArray" = variables["decoder.fc.bias"] def _sigmoid(self, x: "np.ndarray") -> "np.ndarray": import numpy as np From 48b1f0fca08a94bea13735a6b342b2c8085c9284 Mon Sep 17 
00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Mar 2026 22:12:33 +0000 Subject: [PATCH 03/27] Initial plan From 8ce6415d75d3959d9950ecbf34041e33d6076699 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Mar 2026 22:21:02 +0000 Subject: [PATCH 04/27] Fix np.load allow_pickle for legacy .npy corpus (ValueError fix) The thai_w2p corpus v0.2 is stored as a .npy pickled dict. Loading it with allow_pickle=False raises ValueError. Detect file format by extension: - .npz: use allow_pickle=False (secure, for future corpus versions) - .npy (legacy): use allow_pickle=True + dict validation Also add `import os` for os.path.splitext(). Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com> --- pythainlp/transliterate/w2p.py | 51 ++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 7f52d331e..05e1b31c5 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -7,6 +7,7 @@ from __future__ import annotations +import os from typing import TYPE_CHECKING, Optional from pythainlp.corpus import get_corpus_path @@ -100,33 +101,53 @@ def _load_variables(self) -> None: if self.checkpoint is None: raise RuntimeError("checkpoint path is not set") - variables: "np.lib.npyio.NpzFile" = np.load( - self.checkpoint, allow_pickle=False - ) + + # .npz files store each array directly (no pickle needed). + # Legacy .npy files store a pickled dict as an object array. + # NOTE: allow_pickle=True is retained only for backward compatibility + # with the existing .npy corpus. When the corpus is republished in + # .npz format, the legacy branch can be removed. 
+ _, ext = os.path.splitext(self.checkpoint) + if ext.lower() == ".npz": + raw: "np.lib.npyio.NpzFile" = np.load( + self.checkpoint, allow_pickle=False + ) + weights: dict[str, "NDArray"] = dict(raw) + else: + legacy: "NDArray" = np.load( + self.checkpoint, allow_pickle=True + ) + weights = legacy.item() + if not isinstance(weights, dict): + raise ValueError( + f"Expected a dict in legacy corpus file, got " + f"{type(weights).__name__!r}" + ) + # (29, 64). (len(graphemes), emb) - self.enc_emb: "NDArray" = variables["encoder.emb.weight"] + self.enc_emb: "NDArray" = weights["encoder.emb.weight"] # (3*128, 64) - self.enc_w_ih: "NDArray" = variables["encoder.rnn.weight_ih_l0"] + self.enc_w_ih: "NDArray" = weights["encoder.rnn.weight_ih_l0"] # (3*128, 128) - self.enc_w_hh: "NDArray" = variables["encoder.rnn.weight_hh_l0"] + self.enc_w_hh: "NDArray" = weights["encoder.rnn.weight_hh_l0"] # (3*128,) - self.enc_b_ih: "NDArray" = variables["encoder.rnn.bias_ih_l0"] + self.enc_b_ih: "NDArray" = weights["encoder.rnn.bias_ih_l0"] # (3*128,) - self.enc_b_hh: "NDArray" = variables["encoder.rnn.bias_hh_l0"] + self.enc_b_hh: "NDArray" = weights["encoder.rnn.bias_hh_l0"] # (74, 64). 
(len(phonemes), emb) - self.dec_emb: "NDArray" = variables["decoder.emb.weight"] + self.dec_emb: "NDArray" = weights["decoder.emb.weight"] # (3*128, 64) - self.dec_w_ih: "NDArray" = variables["decoder.rnn.weight_ih_l0"] + self.dec_w_ih: "NDArray" = weights["decoder.rnn.weight_ih_l0"] # (3*128, 128) - self.dec_w_hh: "NDArray" = variables["decoder.rnn.weight_hh_l0"] + self.dec_w_hh: "NDArray" = weights["decoder.rnn.weight_hh_l0"] # (3*128,) - self.dec_b_ih: "NDArray" = variables["decoder.rnn.bias_ih_l0"] + self.dec_b_ih: "NDArray" = weights["decoder.rnn.bias_ih_l0"] # (3*128,) - self.dec_b_hh: "NDArray" = variables["decoder.rnn.bias_hh_l0"] + self.dec_b_hh: "NDArray" = weights["decoder.rnn.bias_hh_l0"] # (74, 128) - self.fc_w: "NDArray" = variables["decoder.fc.weight"] + self.fc_w: "NDArray" = weights["decoder.fc.weight"] # (74,) - self.fc_b: "NDArray" = variables["decoder.fc.bias"] + self.fc_b: "NDArray" = weights["decoder.fc.bias"] def _sigmoid(self, x: "np.ndarray") -> "np.ndarray": import numpy as np From d02d313b0fa7096f4a02098ad505ed9933493dee Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 03:27:40 +0000 Subject: [PATCH 05/27] Initial plan From a145055a3e0c8a70f10aa60c623ab322bcf18a61 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 03:34:40 +0000 Subject: [PATCH 06/27] Sync dev, add pickle warning, fix docstring and code style Co-authored-by: bact <128572+bact@users.noreply.github.com> --- CHANGELOG.md | 4 ++++ pythainlp/corpus/common.py | 8 ++++++-- pythainlp/tag/_tag_perceptron.py | 2 +- pythainlp/transliterate/w2p.py | 8 ++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d9b591835..e2ca45632 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,6 +100,10 @@ The minimum requirement is now Python 3.9. 
- Prevent path traversal and symlink attacks in archive extraction (#1225) +- Disallow pickle when loading numpy arrays to reduce security risk (#1328, #1329) +- Emit a warning when loading legacy corpus files using pickle (#1328) +- Defensive file loading in corpus utilities: validate fields before processing + (#1325, #1327) ## [5.2.0] - 2025-12-20 diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 075b1e62a..6bdfec631 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -319,7 +319,7 @@ def thai_dict() -> dict[str, list[str]]: for row in reader: word = row.get("word") meaning = row.get("meaning") - if not word or not word.strip() or not meaning or not meaning.strip(): + if not (word and word.strip() and meaning and meaning.strip()): warnings.warn( f"Skipping thai_dict entry with missing or empty field(s): {dict(row)!r}", UserWarning, @@ -406,7 +406,11 @@ def thai_synonyms() -> dict[str, Union[list[str], list[list[str]]]]: word = row.get("word") pos = row.get("pos") synonym = row.get("synonym") - if not word or not word.strip() or not pos or not pos.strip() or not synonym or not synonym.strip(): + if not ( + word and word.strip() + and pos and pos.strip() + and synonym and synonym.strip() + ): warnings.warn( f"Skipping thai_synonyms entry with missing or empty field(s): {dict(row)!r}", UserWarning, diff --git a/pythainlp/tag/_tag_perceptron.py b/pythainlp/tag/_tag_perceptron.py index 8712ffbda..895ad9ca2 100644 --- a/pythainlp/tag/_tag_perceptron.py +++ b/pythainlp/tag/_tag_perceptron.py @@ -210,7 +210,7 @@ def train( json.dump(data, f, ensure_ascii=False) def load(self, loc: str) -> None: - """Load a pickled model. + """Load a saved model from a JSON file. 
:param str loc: model path """ try: diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 05e1b31c5..75af049e6 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -8,6 +8,7 @@ from __future__ import annotations import os +import warnings from typing import TYPE_CHECKING, Optional from pythainlp.corpus import get_corpus_path @@ -114,6 +115,13 @@ def _load_variables(self) -> None: ) weights: dict[str, "NDArray"] = dict(raw) else: + warnings.warn( + f"Loading legacy corpus file {self.checkpoint!r} using pickle " + "(allow_pickle=True). This is a security risk. " + "The corpus should be republished in .npz format.", + UserWarning, + stacklevel=3, + ) legacy: "NDArray" = np.load( self.checkpoint, allow_pickle=True ) From 8c5f125a2af72383883a4363ce490af4ead332ff Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 10:41:10 +0700 Subject: [PATCH 07/27] Update CHANGELOG for release 5.3.1 This release focuses on security issues related to corpus file loading, including improved pickle handling and defensive file loading. --- CHANGELOG.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e2ca45632..ee300c83f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Full release notes: -- Commit history: +- Commit history: + +## [5.3.1] - 2026-03-15 + +This release focuses on security issues related to corpus file loading. + +### Security + +- Remove pickle from pythainlp/generate/thai2fit.py (#1325) +- Defensive file loading in corpus utilities: validate fields before processing + (#1327) +- Improved pickle handling; Emit a warning when loading legacy + corpus files using pickle (#1328) ## [5.3.0] - 2026-03-10 @@ -100,10 +112,6 @@ The minimum requirement is now Python 3.9. 
- Prevent path traversal and symlink attacks in archive extraction (#1225) -- Disallow pickle when loading numpy arrays to reduce security risk (#1328, #1329) -- Emit a warning when loading legacy corpus files using pickle (#1328) -- Defensive file loading in corpus utilities: validate fields before processing - (#1325, #1327) ## [5.2.0] - 2025-12-20 @@ -206,6 +214,7 @@ The minimum requirement is now Python 3.9. - See +[5.3.1]: https://github.com/PyThaiNLP/pythainlp/compare/v5.3.0...v5.3.1 [5.3.0]: https://github.com/PyThaiNLP/pythainlp/compare/v5.2.0...v5.3.0 [5.2.0]: https://github.com/PyThaiNLP/pythainlp/compare/v5.1.2...v5.2.0 [5.1.2]: https://github.com/PyThaiNLP/pythainlp/compare/v5.1.1...v5.1.2 From 44b420295b792a62585efcb08cad532039b82630 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 10:47:56 +0700 Subject: [PATCH 08/27] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- pythainlp/transliterate/w2p.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 75af049e6..6c2f2302b 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -110,10 +110,8 @@ def _load_variables(self) -> None: # .npz format, the legacy branch can be removed. 
_, ext = os.path.splitext(self.checkpoint) if ext.lower() == ".npz": - raw: "np.lib.npyio.NpzFile" = np.load( - self.checkpoint, allow_pickle=False - ) - weights: dict[str, "NDArray"] = dict(raw) + with np.load(self.checkpoint, allow_pickle=False) as raw: + weights: dict[str, "NDArray"] = dict(raw) else: warnings.warn( f"Loading legacy corpus file {self.checkpoint!r} using pickle " From dcb07e4a790dea4fe09d2dfc974410d718dc53dc Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 10:49:00 +0700 Subject: [PATCH 09/27] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee300c83f..1ddb87b72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Full release notes: -- Commit history: +- Commit history: ## [5.3.1] - 2026-03-15 From 8665e46df4a3e8180cb52b21a98183bb8fdd4717 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 10:49:40 +0700 Subject: [PATCH 10/27] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- pythainlp/tag/_tag_perceptron.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/tag/_tag_perceptron.py b/pythainlp/tag/_tag_perceptron.py index 895ad9ca2..ac96ecc78 100644 --- a/pythainlp/tag/_tag_perceptron.py +++ b/pythainlp/tag/_tag_perceptron.py @@ -166,8 +166,8 @@ def train( ``nr_iter`` controls the number of Perceptron training iterations. :param sentences: A list of (words, tags) tuples. - :param save_loc: If not ``None``, saves a pickled model in this \ - location. + :param save_loc: If not ``None``, saves the model as a JSON file in \ + this location. :param nr_iter: Number of training iterations.
""" import random @@ -200,7 +200,7 @@ def train( random.shuffle(sentences_list) self.model.average_weights() - # save the model + # save the model as JSON if save_loc is not None: data: dict[str, Union[dict, list]] = {} data["weights"] = self.model.weights From 913547196b3a93a246b9ee28eae1d39ea13673d8 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 10:54:54 +0700 Subject: [PATCH 11/27] Only load pickle file when PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE is set Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- pythainlp/transliterate/w2p.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 6c2f2302b..0a30c4089 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -113,10 +113,30 @@ def _load_variables(self) -> None: with np.load(self.checkpoint, allow_pickle=False) as raw: weights: dict[str, "NDArray"] = dict(raw) else: + # For security reasons, loading legacy .npy corpora via pickle is + # disabled by default. Users who understand and accept the risk + # may explicitly opt in via the PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE + # environment variable (set to "1", "true", or "yes"). + legacy_opt_in = os.getenv( + "PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE", "" + ).lower() in {"1", "true", "yes"} + if not legacy_opt_in: + raise RuntimeError( + "Refusing to load legacy .npy W2P corpus via pickle " + "(allow_pickle=True) by default, because this can lead " + "to arbitrary code execution if the file is tampered " + "with. Please migrate to a .npz corpus file. To " + "temporarily re-enable the legacy loader, set the " + "environment variable PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE " + 'to "1", "true", or "yes".' + ) + warnings.warn( f"Loading legacy corpus file {self.checkpoint!r} using pickle " - "(allow_pickle=True). This is a security risk.
" - "The corpus should be republished in .npz format.", + "(allow_pickle=True). This is a security risk and is " + "deprecated. Set PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE only if " + "you understand and accept the risk, and migrate to .npz " + "format as soon as possible.", UserWarning, stacklevel=3, ) From ba5e601ca99a105e9fdb5f3a5611ee4421e39517 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 11:12:06 +0700 Subject: [PATCH 12/27] Add PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE: 1 to unittest env --- .github/workflows/unittest.yml | 2 ++ pythainlp/transliterate/w2p.py | 38 +++++++++++++++------------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 16ecc2d34..c5e8e35be 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -137,6 +137,7 @@ jobs: if: ${{ ((matrix.python-version == env.PYTHON_VERSION_LATEST_2) || (matrix.python-version == env.PYTHON_VERSION_EARLIEST)) && !(matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST_2) }} env: PYTHONIOENCODING: utf-8 + PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE: 1 run: | pip install ".[compact]" python -m unittest tests.core tests.compact @@ -146,6 +147,7 @@ jobs: if: matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST_2 env: PYTHONIOENCODING: utf-8 + PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE: 1 run: | pip install ".[compact,extra]" coverage run -m unittest tests.core tests.compact tests.extra diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 0a30c4089..1290cd5c3 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -42,9 +42,9 @@ class _Hparams: hp: _Hparams = _Hparams() -def _load_vocab() -> tuple[ - dict[str, int], dict[int, str], dict[str, int], dict[int, str] -]: +def _load_vocab() -> ( + tuple[dict[str, int], dict[int, str], dict[str, int], dict[int, str]] +): g2idx = {g: idx for
idx, g in enumerate(hp.graphemes)} idx2g = dict(enumerate(hp.graphemes)) @@ -85,9 +85,7 @@ def __init__(self) -> None: self.p2idx: dict[str, int] self.idx2p: dict[int, str] self.g2idx, self.idx2g, self.p2idx, self.idx2p = _load_vocab() - self.checkpoint: Optional[str] = get_corpus_path( - _MODEL_NAME, version="0.2" - ) + self.checkpoint: Optional[str] = get_corpus_path(_MODEL_NAME, version="0.2") if not self.checkpoint: raise FileNotFoundError( f"corpus-not-found name={_MODEL_NAME!r}\n" @@ -113,36 +111,34 @@ def _load_variables(self) -> None: with np.load(self.checkpoint, allow_pickle=False) as raw: weights: dict[str, "NDArray"] = dict(raw) else: - # For security reasons, loading legacy .npy corpora via pickle is - # disabled by default. Users who understand and accept the risk - # may explicitly opt in via the PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE - # environment variable (set to "1", "true", or "yes"). + # NOTE: Loading legacy .npy file via pickle is disabled by default. + # Users may explicitly opt in via the + # PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE environment variable + # (set to "1"). legacy_opt_in = os.getenv( "PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE", "" ).lower() in {"1", "true", "yes"} if not legacy_opt_in: raise RuntimeError( "Refusing to load legacy .npy W2P corpus via pickle " - "(allow_pickle=True) by default, because this can lead " - "to arbitrary code execution if the file is tampered " - "with. Please migrate to a .npz corpus file. To " - "temporarily re-enable the legacy loader, set the " + "by default, because this can lead to arbitrary code " + "execution if the file is tampered with. " + "Please migrate to a .npz file. " + "To temporarily re-enable the legacy loader, set the " "environment variable PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE " - 'to "1", "true", or "yes".' + 'to "1".' ) warnings.warn( - f"Loading legacy corpus file {self.checkpoint!r} using pickle " - "(allow_pickle=True). This is a security risk and is " - "deprecated. 
Set PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE only if " + f"Loading legacy corpus file {self.checkpoint!r} " + "using pickle. This is a security risk and is deprecated. " + "Set PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE only if " "you understand and accept the risk, and migrate to .npz " "format as soon as possible.", UserWarning, stacklevel=3, ) - legacy: "NDArray" = np.load( - self.checkpoint, allow_pickle=True - ) + legacy: "NDArray" = np.load(self.checkpoint, allow_pickle=True) weights = legacy.item() if not isinstance(weights, dict): raise ValueError( From c8f9d56c54863a7f9e8915b165f689832c98a845 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 11:43:10 +0700 Subject: [PATCH 13/27] Change PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE to PYTHAINLP_ALLOW_UNSAFE_PICKLE Add is_unsafe_pickle_allowed() function --- .github/workflows/unittest.yml | 4 ++-- CHANGELOG.md | 5 +++-- README.md | 1 + README_TH.md | 1 + pythainlp/tools/__init__.py | 2 ++ pythainlp/tools/path.py | 23 +++++++++++++++++++++++ pythainlp/transliterate/w2p.py | 14 ++++---------- 7 files changed, 36 insertions(+), 14 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index c5e8e35be..5d8f50ab7 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -137,7 +137,7 @@ jobs: if: ${{ ((matrix.python-version == env.PYTHON_VERSION_LATEST_2) || (matrix.python-version == env.PYTHON_VERSION_EARLIEST)) && !(matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST_2) }} env: PYTHONIOENCODING: utf-8 - PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE: 1 + PYTHAINLP_ALLOW_UNSAFE_PICKLE: 1 run: | pip install ".[compact]" python -m unittest tests.core tests.compact @@ -146,7 +146,7 @@ jobs: if: matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST_2 env: PYTHONIOENCODING: utf-8 - PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE: 1 + PYTHAINLP_ALLOW_UNSAFE_PICKLE: 1 run: | pip install ".[compact,extra]" coverage run -m unittest 
tests.core tests.compact tests.extra diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ddb87b72..ad4d7ef7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,8 +26,9 @@ This release focuses on security issues related to corpus file loading. - Remove pickle from pythainlp/generate/thai2fit.py (#1325) - Defensive file loading in corpus utilities: validate fields before processing (#1327) -- Improved pickle handling; Emit a warning when loading legacy - corpus files using pickle (#1328) +- Improved pickle handling; + Only allow pickle loading when `PYTHAINLP_ALLOW_UNSAFE_PICKLE` is set; + Emit a warning when loading pickle (#1328) ## [5.3.0] - 2026-03-10 diff --git a/README.md b/README.md index 496bd40ca..7e9c19d88 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@ please inspect the `[project.optional-dependencies]` section of | `PYTHAINLP_OFFLINE` | Set to `1` to disable automatic corpus downloads. Explicit `download()` calls still work. | Current | | `PYTHAINLP_READ_ONLY` | Set to `1` to enable read-only mode, which prevents implicit background writes to PyThaiNLP's internal data directory (corpus downloads, catalog updates, directory creation). Explicit user-initiated saves to user-specified paths are unaffected. | Current | | `PYTHAINLP_READ_MODE` | Legacy alias for `PYTHAINLP_READ_ONLY`. Emits a `DeprecationWarning`. Setting both raises `ValueError`. | Deprecated; use `PYTHAINLP_READ_ONLY` | +| `PYTHAINLP_ALLOW_UNSAFE_PICKLE` | Set to `1` to enable loading of legacy pickle files. This is disabled by default as pickle files can execute arbitrary code. Only enable this if you trust the source of your data. 
| Current | ### Data directory diff --git a/README_TH.md b/README_TH.md index e2a36f54d..75d197fb9 100644 --- a/README_TH.md +++ b/README_TH.md @@ -113,6 +113,7 @@ pip install "pythainlp[extra1,extra2,...]" | `PYTHAINLP_OFFLINE` | ตั้งเป็น `1` เพื่อปิดการดาวน์โหลดคลังภาษาอัตโนมัติ การเรียก `download()` โดยตรงยังคงใช้งานได้ | ปัจจุบัน | | `PYTHAINLP_READ_ONLY` | ตั้งเป็น `1` เพื่อเปิดโหมดอ่านอย่างเดียว ป้องกันการเขียนในฉากหลังที่ผู้ใช้อาจไม่ทราบ (ดาวน์โหลดคลังภาษา, ปรับปรุงแค็ตตาล็อก, สร้างไดเรกทอรี) การบันทึกแฟ้มที่ผู้ใช้ระบุเองไม่ได้รับผลกระทบ | ปัจจุบัน | | `PYTHAINLP_READ_MODE` | ชื่อเดิมของ `PYTHAINLP_READ_ONLY` แสดง `DeprecationWarning` และหากตั้งค่าทั้งสองพร้อมกันจะเกิด `ValueError` | เลิกใช้แล้ว; ใช้ `PYTHAINLP_READ_ONLY` แทน | +| `PYTHAINLP_ALLOW_UNSAFE_PICKLE` | ตั้งเป็น `1` เพื่อให้อ่านแฟ้ม pickle ได้ การอ่านแฟ้มชนิดนี้ถูกตั้งปิดเป็นค่าปริยาย เนื่องจากการอ่านแฟ้ม pickle อาจไปเรียกคำสั่งไม่พึงประสงค์ได้ เปิดใช้เมื่อคุณเชื่อถือแหล่งข้อมูลของคุณ | ปัจจุบัน | ### ไดเรกทอรีข้อมูล diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py index 8cbf06e7e..1f9f60d83 100644 --- a/pythainlp/tools/__init__.py +++ b/pythainlp/tools/__init__.py @@ -6,6 +6,7 @@ "get_full_data_path", "get_pythainlp_data_path", "get_pythainlp_path", + "is_unsafe_pickle_allowed", "is_offline_mode", "is_read_only_mode", "safe_print", @@ -18,6 +19,7 @@ get_full_data_path, get_pythainlp_data_path, get_pythainlp_path, + is_unsafe_pickle_allowed, is_offline_mode, is_read_only_mode, ) diff --git a/pythainlp/tools/path.py b/pythainlp/tools/path.py index ee8b473b2..0d1339b5f 100644 --- a/pythainlp/tools/path.py +++ b/pythainlp/tools/path.py @@ -102,6 +102,29 @@ def is_read_only_mode() -> bool: return False +def is_unsafe_pickle_allowed() -> bool: + """Return whether loading legacy pickle-based corpus files is allowed. + + Pickle deserialisation can execute arbitrary code if the file has been + tampered with, so it is **disabled by default**. 
Set the + ``PYTHAINLP_ALLOW_UNSAFE_PICKLE`` environment variable to a truthy value + (``"1"``, ``"true"``, ``"yes"``) only when you trust the corpus file and + understand the risk. + + .. note:: + Do **not** cache the return value of this function (e.g. with + ``functools.lru_cache``). The env var must be re-read on every call + so that changes made after import are respected and no early call can + lock in a permissive value. + + :return: ``True`` if legacy pickle loading is allowed, ``False`` otherwise. + :rtype: bool + """ + return os.getenv("PYTHAINLP_ALLOW_UNSAFE_PICKLE", "").strip().lower() in { + "1", "true", "yes" + } + + def is_offline_mode() -> bool: """Return whether PyThaiNLP is operating in offline mode. diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 1290cd5c3..fd1239e46 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Optional from pythainlp.corpus import get_corpus_path +from pythainlp.tools import is_unsafe_pickle_allowed if TYPE_CHECKING: import numpy as np @@ -111,28 +112,21 @@ def _load_variables(self) -> None: with np.load(self.checkpoint, allow_pickle=False) as raw: weights: dict[str, "NDArray"] = dict(raw) else: - # NOTE: Loading legacy .npy file via pickle is disabled by default. - # Users may explicitly opt in via the - # PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE environment variable - # (set to "1"). - legacy_opt_in = os.getenv( - "PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE", "" - ).lower() in {"1", "true", "yes"} - if not legacy_opt_in: + if not is_unsafe_pickle_allowed(): raise RuntimeError( "Refusing to load legacy .npy W2P corpus via pickle " "by default, because this can lead to arbitrary code " "execution if the file is tampered with. " "Please migrate to a .npz file. 
" "To temporarily re-enable the legacy loader, set the " - "environment variable PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE " + "environment variable PYTHAINLP_ALLOW_UNSAFE_PICKLE " 'to "1".' ) warnings.warn( f"Loading legacy corpus file {self.checkpoint!r} " "using pickle. This is a security risk and is deprecated. " - "Set PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE only if " + "Set PYTHAINLP_ALLOW_UNSAFE_PICKLE only if " "you understand and accept the risk, and migrate to .npz " "format as soon as possible.", UserWarning, From 5405fd1d06587ba7c5acf9f2c799a4fec56a5e31 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 11:45:31 +0700 Subject: [PATCH 14/27] Sort import --- pythainlp/tools/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py index 1f9f60d83..fcb8d662d 100644 --- a/pythainlp/tools/__init__.py +++ b/pythainlp/tools/__init__.py @@ -6,9 +6,9 @@ "get_full_data_path", "get_pythainlp_data_path", "get_pythainlp_path", - "is_unsafe_pickle_allowed", "is_offline_mode", "is_read_only_mode", + "is_unsafe_pickle_allowed", "safe_print", "warn_deprecation", ] @@ -19,7 +19,7 @@ get_full_data_path, get_pythainlp_data_path, get_pythainlp_path, - is_unsafe_pickle_allowed, is_offline_mode, is_read_only_mode, + is_unsafe_pickle_allowed, ) From ee055c0da01eb97b56ef1fcc18ff937c4ae43c60 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 12:09:43 +0700 Subject: [PATCH 15/27] Add test for PYTHAINLP_ALLOW_UNSAFE_PICKLE --- .github/workflows/unittest.yml | 2 - tests/core/test_tools.py | 30 ++++++++++- tests/core/test_transliterate.py | 1 + tests/extra/testx_transliterate.py | 80 ++++++++++++++++++++++++++++-- 4 files changed, 105 insertions(+), 8 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 5d8f50ab7..16ecc2d34 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -137,7 +137,6 @@ 
jobs: if: ${{ ((matrix.python-version == env.PYTHON_VERSION_LATEST_2) || (matrix.python-version == env.PYTHON_VERSION_EARLIEST)) && !(matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST_2) }} env: PYTHONIOENCODING: utf-8 - PYTHAINLP_ALLOW_UNSAFE_PICKLE: 1 run: | pip install ".[compact]" python -m unittest tests.core tests.compact @@ -146,7 +145,6 @@ jobs: if: matrix.os == 'ubuntu-latest' && matrix.python-version == env.PYTHON_VERSION_LATEST_2 env: PYTHONIOENCODING: utf-8 - PYTHAINLP_ALLOW_UNSAFE_PICKLE: 1 run: | pip install ".[compact,extra]" coverage run -m unittest tests.core tests.compact tests.extra diff --git a/tests/core/test_tools.py b/tests/core/test_tools.py index 78fe28eec..50cfe13d3 100644 --- a/tests/core/test_tools.py +++ b/tests/core/test_tools.py @@ -8,7 +8,11 @@ import warnings from unittest.mock import patch -from pythainlp import is_offline_mode, is_read_only_mode +from pythainlp import ( + is_offline_mode, + is_read_only_mode, + is_unsafe_pickle_allowed, +) from pythainlp.tools import ( get_full_data_path, get_pythainlp_data_path, @@ -272,3 +276,27 @@ def test_get_pythainlp_data_path_no_makedirs_in_read_only(self): "Data directory should not be created in read-only mode", ) + def test_is_unsafe_pickle_allowed(self): + """Test is_unsafe_pickle_allowed() reflects PYTHAINLP_ALLOW_UNSAFE_PICKLE env var.""" + # Truthy values + for truthy in ("1", "true", "True", "TRUE", "yes", "YES"): + with patch.dict( + os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": truthy} + ): + self.assertTrue( + is_unsafe_pickle_allowed(), + f"Expected True for PYTHAINLP_ALLOW_UNSAFE_PICKLE={truthy!r}", + ) + # Falsy values + for falsy in ("", "0", "false", "False", "FALSE", "no", "NO", "on"): + with patch.dict( + os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": falsy} + ): + self.assertFalse( + is_unsafe_pickle_allowed(), + f"Expected False for PYTHAINLP_ALLOW_UNSAFE_PICKLE={falsy!r}", + ) + # Unset: should default to False + with 
patch.dict(os.environ, {}, clear=False): + os.environ.pop("PYTHAINLP_ALLOW_UNSAFE_PICKLE", None) + self.assertFalse(is_unsafe_pickle_allowed()) diff --git a/tests/core/test_transliterate.py b/tests/core/test_transliterate.py index 0585ba757..60b32d71d 100644 --- a/tests/core/test_transliterate.py +++ b/tests/core/test_transliterate.py @@ -102,3 +102,4 @@ def test_transliterate_iso11940(self): self.assertEqual( transliterate("ภาษาไทย", engine="iso_11940"), "p̣hās̛̄āịthy" ) + diff --git a/tests/extra/testx_transliterate.py b/tests/extra/testx_transliterate.py index 9d8da8bc5..964cc96d2 100644 --- a/tests/extra/testx_transliterate.py +++ b/tests/extra/testx_transliterate.py @@ -2,7 +2,10 @@ # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 +import os import unittest +import warnings +from unittest.mock import patch import torch @@ -184,11 +187,78 @@ def test_transliterate_wunsen(self): def test_pronunciate(self): self.assertEqual(pronunciate(""), "") remove("thai_w2p") - self.assertIsNotNone(pronunciate("คน", engine="w2p")) - self.assertIsNotNone(pronunciate("แมว", engine="w2p")) - self.assertIsNotNone(pronunciate("มข.", engine="w2p")) - self.assertIsNotNone(pronunciate("มช.", engine="w2p")) - self.assertIsNotNone(pronunciate("jks", engine="w2p")) + with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": "1"}): + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + self.assertIsNotNone(pronunciate("คน", engine="w2p")) + self.assertIsNotNone(pronunciate("แมว", engine="w2p")) + self.assertIsNotNone(pronunciate("มข.", engine="w2p")) + self.assertIsNotNone(pronunciate("มช.", engine="w2p")) + self.assertIsNotNone(pronunciate("jks", engine="w2p")) + + def test_pronunciate_w2p_pickle_blocked_by_default(self): + """Thai_W2P._load_variables must raise RuntimeError for a legacy .npy + corpus when PYTHAINLP_ALLOW_UNSAFE_PICKLE is not set. 
+ """ + try: + from pythainlp.transliterate.w2p import Thai_W2P + except (FileNotFoundError, RuntimeError): + self.skipTest("w2p module not ready (corpus missing or pickle blocked)") + + instance = object.__new__(Thai_W2P) + instance.checkpoint = "/fake/model.npy" + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("PYTHAINLP_ALLOW_UNSAFE_PICKLE", None) + with self.assertRaises(RuntimeError) as ctx: + instance._load_variables() + self.assertIn("PYTHAINLP_ALLOW_UNSAFE_PICKLE", str(ctx.exception)) + + def test_pronunciate_w2p_pickle_warning_when_allowed(self): + """Thai_W2P._load_variables must emit a UserWarning when loading a + legacy .npy corpus with PYTHAINLP_ALLOW_UNSAFE_PICKLE set. + """ + import numpy as np + + try: + from pythainlp.transliterate.w2p import Thai_W2P + except (FileNotFoundError, RuntimeError): + self.skipTest("w2p module not ready (corpus missing or pickle blocked)") + + keys = [ + "encoder.emb.weight", + "encoder.rnn.weight_ih_l0", + "encoder.rnn.weight_hh_l0", + "encoder.rnn.bias_ih_l0", + "encoder.rnn.bias_hh_l0", + "decoder.emb.weight", + "decoder.rnn.weight_ih_l0", + "decoder.rnn.weight_hh_l0", + "decoder.rnn.bias_ih_l0", + "decoder.rnn.bias_hh_l0", + "decoder.fc.weight", + "decoder.fc.bias", + ] + fake_weights = {k: np.zeros(1) for k in keys} + fake_array = np.empty((), dtype=object) + fake_array[()] = fake_weights + + instance = object.__new__(Thai_W2P) + instance.checkpoint = "/fake/model.npy" + with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": "1"}): + with patch("numpy.load", return_value=fake_array): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + instance._load_variables() + user_warnings = [ + x for x in w if issubclass(x.category, UserWarning) + ] + self.assertTrue( + any( + "PYTHAINLP_ALLOW_UNSAFE_PICKLE" in str(x.message) + for x in user_warnings + ), + "Expected a UserWarning mentioning PYTHAINLP_ALLOW_UNSAFE_PICKLE", + ) def test_puan(self): 
self.assertEqual(puan("แมว"), "แมว") From db08a00109209bf7223a84baf19062a54b19e4fb Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 12:14:21 +0700 Subject: [PATCH 16/27] Fix import --- pythainlp/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index cecec9d0a..f82ac0006 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -57,6 +57,7 @@ "correct", "is_offline_mode", "is_read_only_mode", + "is_unsafe_pickle_allowed", "pos_tag", "romanize", "spell", @@ -78,6 +79,10 @@ subword_tokenize, word_tokenize, ) -from pythainlp.tools.path import is_offline_mode, is_read_only_mode +from pythainlp.tools.path import ( + is_offline_mode, + is_read_only_mode, + is_unsafe_pickle_allowed, +) from pythainlp.transliterate import romanize, transliterate from pythainlp.util import collate, thai_strftime From a8bfe5165312c44a104c1e61636b1b26500909be Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 12:27:16 +0700 Subject: [PATCH 17/27] Set PYTHAINLP_ALLOW_UNSAFE_PICKLE for tone detector test --- tests/compact/testc_util.py | 55 +++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/tests/compact/testc_util.py b/tests/compact/testc_util.py index 31aacfa40..1c65ae8bf 100644 --- a/tests/compact/testc_util.py +++ b/tests/compact/testc_util.py @@ -4,7 +4,10 @@ """Unit tests for pythainlp.util module.""" +import os import unittest +import warnings +from unittest.mock import patch from pythainlp.util import ( check_khuap_klam, @@ -33,33 +36,39 @@ def test_rhyme(self): self.assertGreater(len(rhyme("แมว")), 2) def test_thai_word_tone_detector(self): - self.assertIsNotNone(thai_word_tone_detector("คนดี")) - self.assertEqual( - thai_word_tone_detector("ราคา"), [("รา", "m"), ("คา", "m")] - ) - # Edge cases: None and empty string - self.assertEqual(thai_word_tone_detector(None), []) - 
self.assertEqual(thai_word_tone_detector(""), []) + with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": "1"}): + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + self.assertIsNotNone(thai_word_tone_detector("คนดี")) + self.assertEqual( + thai_word_tone_detector("ราคา"), [("รา", "m"), ("คา", "m")] + ) + # Edge cases: None and empty string + self.assertEqual(thai_word_tone_detector(None), []) + self.assertEqual(thai_word_tone_detector(""), []) class KhuapKlamTestCaseC(unittest.TestCase): def test_check_khuap_klam(self): - # True consonant clusters (คำควบกล้ำแท้) - self.assertTrue(check_khuap_klam("กราบ")) - self.assertTrue(check_khuap_klam("ปลา")) - self.assertTrue(check_khuap_klam("เพราะ")) - self.assertTrue(check_khuap_klam("ตรง")) + with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": "1"}): + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + # True consonant clusters (คำควบกล้ำแท้) + self.assertTrue(check_khuap_klam("กราบ")) + self.assertTrue(check_khuap_klam("ปลา")) + self.assertTrue(check_khuap_klam("เพราะ")) + self.assertTrue(check_khuap_klam("ตรง")) - # False consonant clusters (คำควบกล้ำไม่แท้) - self.assertFalse(check_khuap_klam("จริง")) - self.assertFalse(check_khuap_klam("ทราย")) - self.assertFalse(check_khuap_klam("เศร้า")) + # False consonant clusters (คำควบกล้ำไม่แท้) + self.assertFalse(check_khuap_klam("จริง")) + self.assertFalse(check_khuap_klam("ทราย")) + self.assertFalse(check_khuap_klam("เศร้า")) - # Not a consonant cluster - self.assertIsNone(check_khuap_klam("แม่")) - self.assertIsNone(check_khuap_klam("ตา")) - self.assertIsNone(check_khuap_klam("มา")) - self.assertIsNone(check_khuap_klam("นา")) + # Not a consonant cluster + self.assertIsNone(check_khuap_klam("แม่")) + self.assertIsNone(check_khuap_klam("ตา")) + self.assertIsNone(check_khuap_klam("มา")) + self.assertIsNone(check_khuap_klam("นา")) - # Edge cases: empty string returns None - 
self.assertIsNone(check_khuap_klam("")) + # Edge cases: empty string returns None + self.assertIsNone(check_khuap_klam("")) From bec6fe1f09bccd8ca3f2fdd7feed7bb60b5fa375 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 12:45:37 +0700 Subject: [PATCH 18/27] Refactor is_unsafe_pickle_allowed() --- pythainlp/tools/path.py | 17 +++++------------ tests/core/test_tools.py | 16 +++++----------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/pythainlp/tools/path.py b/pythainlp/tools/path.py index 0d1339b5f..2bd954c7d 100644 --- a/pythainlp/tools/path.py +++ b/pythainlp/tools/path.py @@ -106,23 +106,16 @@ def is_unsafe_pickle_allowed() -> bool: """Return whether loading legacy pickle-based corpus files is allowed. Pickle deserialisation can execute arbitrary code if the file has been - tampered with, so it is **disabled by default**. Set the - ``PYTHAINLP_ALLOW_UNSAFE_PICKLE`` environment variable to a truthy value - (``"1"``, ``"true"``, ``"yes"``) only when you trust the corpus file and + tampered with, so it is **disabled by default**. + Set the ``PYTHAINLP_ALLOW_UNSAFE_PICKLE`` environment variable to + a truthy value (e.g. ``"1"``) only when you trust the corpus file and understand the risk. - .. note:: - Do **not** cache the return value of this function (e.g. with - ``functools.lru_cache``). The env var must be re-read on every call - so that changes made after import are respected and no early call can - lock in a permissive value. - :return: ``True`` if legacy pickle loading is allowed, ``False`` otherwise. 
:rtype: bool """ - return os.getenv("PYTHAINLP_ALLOW_UNSAFE_PICKLE", "").strip().lower() in { - "1", "true", "yes" - } + val = os.getenv("PYTHAINLP_ALLOW_UNSAFE_PICKLE", "") + return val.strip().lower() in ("1", "true", "yes", "on") def is_offline_mode() -> bool: diff --git a/tests/core/test_tools.py b/tests/core/test_tools.py index 50cfe13d3..a87751d7d 100644 --- a/tests/core/test_tools.py +++ b/tests/core/test_tools.py @@ -30,9 +30,7 @@ class ToolsTestCase(unittest.TestCase): def test_path(self): data_filename = "ttc_freq.txt" - self.assertTrue( - get_full_data_path(data_filename).endswith(data_filename) - ) + self.assertTrue(get_full_data_path(data_filename).endswith(data_filename)) self.assertIsInstance(get_pythainlp_data_path(), str) self.assertIsInstance(get_pythainlp_path(), str) @@ -279,19 +277,15 @@ def test_get_pythainlp_data_path_no_makedirs_in_read_only(self): def test_is_unsafe_pickle_allowed(self): """Test is_unsafe_pickle_allowed() reflects PYTHAINLP_ALLOW_UNSAFE_PICKLE env var.""" # Truthy values - for truthy in ("1", "true", "True", "TRUE", "yes", "YES"): - with patch.dict( - os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": truthy} - ): + for truthy in ("1", "true", "True", "TRUE", "yes", "YES", "on", "ON"): + with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": truthy}): self.assertTrue( is_unsafe_pickle_allowed(), f"Expected True for PYTHAINLP_ALLOW_UNSAFE_PICKLE={truthy!r}", ) # Falsy values - for falsy in ("", "0", "false", "False", "FALSE", "no", "NO", "on"): - with patch.dict( - os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": falsy} - ): + for falsy in ("", "0", "false", "False", "FALSE", "no", "NO", "off", "OFF"): + with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": falsy}): self.assertFalse( is_unsafe_pickle_allowed(), f"Expected False for PYTHAINLP_ALLOW_UNSAFE_PICKLE={falsy!r}", From fb27b5dea3c5f00b70b6ca30ace14ef78dbebde1 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 14 Mar 2026 12:48:22 +0700 
Subject: [PATCH 19/27] Refactor model loading to use .npz format only Updated model name and refactored variable loading to use .npz format exclusively, removing legacy .npy handling. --- pythainlp/transliterate/w2p.py | 89 +++++++++++++--------------------- 1 file changed, 35 insertions(+), 54 deletions(-) diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index fd1239e46..9e201cd9d 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -12,7 +12,6 @@ from typing import TYPE_CHECKING, Optional from pythainlp.corpus import get_corpus_path -from pythainlp.tools import is_unsafe_pickle_allowed if TYPE_CHECKING: import numpy as np @@ -25,7 +24,7 @@ "-พจใงต้ืฮแาฐฒฤูศฅถฺฎหคสุขเึดฟำฝยลอ็ม" + " ณิฑชฉซทรํฬฏ–ัฃวก่ปผ์ฆบี๊ธฌญะไษ๋นโภ?" ) -_MODEL_NAME: str = "thai_w2p" +_MODEL_NAME: str = "thai_w2p_npz" class _Hparams: @@ -98,72 +97,54 @@ def __init__(self) -> None: def _load_variables(self) -> None: import numpy as np - if self.checkpoint is None: raise RuntimeError("checkpoint path is not set") - - # .npz files store each array directly (no pickle needed). - # Legacy .npy files store a pickled dict as an object array. - # NOTE: allow_pickle=True is retained only for backward compatibility - # with the existing .npy corpus. When the corpus is republished in - # .npz format, the legacy branch can be removed. - _, ext = os.path.splitext(self.checkpoint) - if ext.lower() == ".npz": - with np.load(self.checkpoint, allow_pickle=False) as raw: - weights: dict[str, "NDArray"] = dict(raw) - else: - if not is_unsafe_pickle_allowed(): - raise RuntimeError( - "Refusing to load legacy .npy W2P corpus via pickle " - "by default, because this can lead to arbitrary code " - "execution if the file is tampered with. " - "Please migrate to a .npz file. " - "To temporarily re-enable the legacy loader, set the " - "environment variable PYTHAINLP_ALLOW_UNSAFE_PICKLE " - 'to "1".' 
- ) - - warnings.warn( - f"Loading legacy corpus file {self.checkpoint!r} " - "using pickle. This is a security risk and is deprecated. " - "Set PYTHAINLP_ALLOW_UNSAFE_PICKLE only if " - "you understand and accept the risk, and migrate to .npz " - "format as soon as possible.", - UserWarning, - stacklevel=3, - ) - legacy: "NDArray" = np.load(self.checkpoint, allow_pickle=True) - weights = legacy.item() - if not isinstance(weights, dict): - raise ValueError( - f"Expected a dict in legacy corpus file, got " - f"{type(weights).__name__!r}" - ) - + self.variables: "NDArray" = np.load(self.checkpoint, allow_pickle=False) # (29, 64). (len(graphemes), emb) - self.enc_emb: "NDArray" = weights["encoder.emb.weight"] + self.enc_emb: "NDArray" = self.variables[ + "encoder_emb_weight" + ] # (3*128, 64) - self.enc_w_ih: "NDArray" = weights["encoder.rnn.weight_ih_l0"] + self.enc_w_ih: "NDArray" = self.variables[ + "encoder_rnn_weight_ih_l0" + ] # (3*128, 128) - self.enc_w_hh: "NDArray" = weights["encoder.rnn.weight_hh_l0"] + self.enc_w_hh: "NDArray" = self.variables[ + "encoder_rnn_weight_hh_l0" + ] # (3*128,) - self.enc_b_ih: "NDArray" = weights["encoder.rnn.bias_ih_l0"] + self.enc_b_ih: "NDArray" = self.variables[ + "encoder_rnn_bias_ih_l0" + ] # (3*128,) - self.enc_b_hh: "NDArray" = weights["encoder.rnn.bias_hh_l0"] + self.enc_b_hh: "NDArray" = self.variables[ + "encoder_rnn_bias_hh_l0" + ] + # (74, 64). 
(len(phonemes), emb) - self.dec_emb: "NDArray" = weights["decoder.emb.weight"] + self.dec_emb: "NDArray" = self.variables[ + "decoder_emb_weight" + ] # (3*128, 64) - self.dec_w_ih: "NDArray" = weights["decoder.rnn.weight_ih_l0"] + self.dec_w_ih: "NDArray" = self.variables[ + "decoder_rnn_weight_ih_l0" + ] # (3*128, 128) - self.dec_w_hh: "NDArray" = weights["decoder.rnn.weight_hh_l0"] + self.dec_w_hh: "NDArray" = self.variables[ + "decoder_rnn_weight_hh_l0" + ] # (3*128,) - self.dec_b_ih: "NDArray" = weights["decoder.rnn.bias_ih_l0"] + self.dec_b_ih: "NDArray" = self.variables[ + "decoder_rnn_bias_ih_l0" + ] # (3*128,) - self.dec_b_hh: "NDArray" = weights["decoder.rnn.bias_hh_l0"] + self.dec_b_hh: "NDArray" = self.variables[ + "decoder_rnn_bias_hh_l0" + ] # (74, 128) - self.fc_w: "NDArray" = weights["decoder.fc.weight"] + self.fc_w: "NDArray" = self.variables["decoder_fc_weight"] # (74,) - self.fc_b: "NDArray" = weights["decoder.fc.bias"] + self.fc_b: "NDArray" = self.variables["decoder_fc_bias"] def _sigmoid(self, x: "np.ndarray") -> "np.ndarray": import numpy as np From ddb78b60947611e10cb787b29882b6b202fe8dd6 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 14 Mar 2026 12:51:44 +0700 Subject: [PATCH 20/27] Clean up w2p.py by removing os and warnings imports Removed unused imports from w2p.py --- pythainlp/transliterate/w2p.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 9e201cd9d..4011fffba 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -7,8 +7,6 @@ from __future__ import annotations -import os -import warnings from typing import TYPE_CHECKING, Optional from pythainlp.corpus import get_corpus_path From 17f66baf9310be7ecc5791d68ed8f44c67f35ba2 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 13:02:29 +0700 Subject: [PATCH 21/27] Remove PYTHAINLP_ALLOW_UNSAFE_PICKLE tests --- tests/compact/testc_util.py | 52 
+++++++++---------- tests/extra/testx_transliterate.py | 80 ++---------------------------- 2 files changed, 28 insertions(+), 104 deletions(-) diff --git a/tests/compact/testc_util.py b/tests/compact/testc_util.py index 1c65ae8bf..a49f632b1 100644 --- a/tests/compact/testc_util.py +++ b/tests/compact/testc_util.py @@ -36,39 +36,33 @@ def test_rhyme(self): self.assertGreater(len(rhyme("แมว")), 2) def test_thai_word_tone_detector(self): - with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": "1"}): - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - self.assertIsNotNone(thai_word_tone_detector("คนดี")) - self.assertEqual( - thai_word_tone_detector("ราคา"), [("รา", "m"), ("คา", "m")] - ) - # Edge cases: None and empty string - self.assertEqual(thai_word_tone_detector(None), []) - self.assertEqual(thai_word_tone_detector(""), []) + self.assertIsNotNone(thai_word_tone_detector("คนดี")) + self.assertEqual( + thai_word_tone_detector("ราคา"), [("รา", "m"), ("คา", "m")] + ) + # Edge cases: None and empty string + self.assertEqual(thai_word_tone_detector(None), []) + self.assertEqual(thai_word_tone_detector(""), []) class KhuapKlamTestCaseC(unittest.TestCase): def test_check_khuap_klam(self): - with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": "1"}): - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - # True consonant clusters (คำควบกล้ำแท้) - self.assertTrue(check_khuap_klam("กราบ")) - self.assertTrue(check_khuap_klam("ปลา")) - self.assertTrue(check_khuap_klam("เพราะ")) - self.assertTrue(check_khuap_klam("ตรง")) + # True consonant clusters (คำควบกล้ำแท้) + self.assertTrue(check_khuap_klam("กราบ")) + self.assertTrue(check_khuap_klam("ปลา")) + self.assertTrue(check_khuap_klam("เพราะ")) + self.assertTrue(check_khuap_klam("ตรง")) - # False consonant clusters (คำควบกล้ำไม่แท้) - self.assertFalse(check_khuap_klam("จริง")) - self.assertFalse(check_khuap_klam("ทราย")) - 
self.assertFalse(check_khuap_klam("เศร้า")) + # False consonant clusters (คำควบกล้ำไม่แท้) + self.assertFalse(check_khuap_klam("จริง")) + self.assertFalse(check_khuap_klam("ทราย")) + self.assertFalse(check_khuap_klam("เศร้า")) - # Not a consonant cluster - self.assertIsNone(check_khuap_klam("แม่")) - self.assertIsNone(check_khuap_klam("ตา")) - self.assertIsNone(check_khuap_klam("มา")) - self.assertIsNone(check_khuap_klam("นา")) + # Not a consonant cluster + self.assertIsNone(check_khuap_klam("แม่")) + self.assertIsNone(check_khuap_klam("ตา")) + self.assertIsNone(check_khuap_klam("มา")) + self.assertIsNone(check_khuap_klam("นา")) - # Edge cases: empty string returns None - self.assertIsNone(check_khuap_klam("")) + # Edge cases: empty string returns None + self.assertIsNone(check_khuap_klam("")) diff --git a/tests/extra/testx_transliterate.py b/tests/extra/testx_transliterate.py index 964cc96d2..9d8da8bc5 100644 --- a/tests/extra/testx_transliterate.py +++ b/tests/extra/testx_transliterate.py @@ -2,10 +2,7 @@ # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 -import os import unittest -import warnings -from unittest.mock import patch import torch @@ -187,78 +184,11 @@ def test_transliterate_wunsen(self): def test_pronunciate(self): self.assertEqual(pronunciate(""), "") remove("thai_w2p") - with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": "1"}): - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - self.assertIsNotNone(pronunciate("คน", engine="w2p")) - self.assertIsNotNone(pronunciate("แมว", engine="w2p")) - self.assertIsNotNone(pronunciate("มข.", engine="w2p")) - self.assertIsNotNone(pronunciate("มช.", engine="w2p")) - self.assertIsNotNone(pronunciate("jks", engine="w2p")) - - def test_pronunciate_w2p_pickle_blocked_by_default(self): - """Thai_W2P._load_variables must raise RuntimeError for a legacy .npy - corpus when PYTHAINLP_ALLOW_UNSAFE_PICKLE is not set. 
- """ - try: - from pythainlp.transliterate.w2p import Thai_W2P - except (FileNotFoundError, RuntimeError): - self.skipTest("w2p module not ready (corpus missing or pickle blocked)") - - instance = object.__new__(Thai_W2P) - instance.checkpoint = "/fake/model.npy" - with patch.dict(os.environ, {}, clear=False): - os.environ.pop("PYTHAINLP_ALLOW_UNSAFE_PICKLE", None) - with self.assertRaises(RuntimeError) as ctx: - instance._load_variables() - self.assertIn("PYTHAINLP_ALLOW_UNSAFE_PICKLE", str(ctx.exception)) - - def test_pronunciate_w2p_pickle_warning_when_allowed(self): - """Thai_W2P._load_variables must emit a UserWarning when loading a - legacy .npy corpus with PYTHAINLP_ALLOW_UNSAFE_PICKLE set. - """ - import numpy as np - - try: - from pythainlp.transliterate.w2p import Thai_W2P - except (FileNotFoundError, RuntimeError): - self.skipTest("w2p module not ready (corpus missing or pickle blocked)") - - keys = [ - "encoder.emb.weight", - "encoder.rnn.weight_ih_l0", - "encoder.rnn.weight_hh_l0", - "encoder.rnn.bias_ih_l0", - "encoder.rnn.bias_hh_l0", - "decoder.emb.weight", - "decoder.rnn.weight_ih_l0", - "decoder.rnn.weight_hh_l0", - "decoder.rnn.bias_ih_l0", - "decoder.rnn.bias_hh_l0", - "decoder.fc.weight", - "decoder.fc.bias", - ] - fake_weights = {k: np.zeros(1) for k in keys} - fake_array = np.empty((), dtype=object) - fake_array[()] = fake_weights - - instance = object.__new__(Thai_W2P) - instance.checkpoint = "/fake/model.npy" - with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": "1"}): - with patch("numpy.load", return_value=fake_array): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - instance._load_variables() - user_warnings = [ - x for x in w if issubclass(x.category, UserWarning) - ] - self.assertTrue( - any( - "PYTHAINLP_ALLOW_UNSAFE_PICKLE" in str(x.message) - for x in user_warnings - ), - "Expected a UserWarning mentioning PYTHAINLP_ALLOW_UNSAFE_PICKLE", - ) + self.assertIsNotNone(pronunciate("คน", 
engine="w2p")) + self.assertIsNotNone(pronunciate("แมว", engine="w2p")) + self.assertIsNotNone(pronunciate("มข.", engine="w2p")) + self.assertIsNotNone(pronunciate("มช.", engine="w2p")) + self.assertIsNotNone(pronunciate("jks", engine="w2p")) def test_puan(self): self.assertEqual(puan("แมว"), "แมว") From 1caf8f638b743eaaaf29e8c02fab205d362e3781 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 13:03:19 +0700 Subject: [PATCH 22/27] Update CHANGELOG.md --- CHANGELOG.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad4d7ef7e..4335e2d2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,9 +26,7 @@ This release focuses on security issues related to corpus file loading. - Remove pickle from pythainlp/generate/thai2fit.py (#1325) - Defensive file loading in corpus utilities: validate fields before processing (#1327) -- Improved pickle handling; - Only allow pickle loading when `PYTHAINLP_ALLOW_UNSAFE_PICKLE` is set; - Emit a warning when loading pickle (#1328) +- Improved pickle handling (#1328) ## [5.3.0] - 2026-03-10 From 7a731a68c887c40c5dd6327b9db5c15257bdd665 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 13:04:18 +0700 Subject: [PATCH 23/27] Fix imports --- tests/compact/testc_util.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/compact/testc_util.py b/tests/compact/testc_util.py index a49f632b1..31aacfa40 100644 --- a/tests/compact/testc_util.py +++ b/tests/compact/testc_util.py @@ -4,10 +4,7 @@ """Unit tests for pythainlp.util module.""" -import os import unittest -import warnings -from unittest.mock import patch from pythainlp.util import ( check_khuap_klam, From a52bdf1cdb61d40d60da940933628b40567a720f Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 13:13:23 +0700 Subject: [PATCH 24/27] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> 
--- tests/core/test_transliterate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/core/test_transliterate.py b/tests/core/test_transliterate.py index 60b32d71d..0585ba757 100644 --- a/tests/core/test_transliterate.py +++ b/tests/core/test_transliterate.py @@ -102,4 +102,3 @@ def test_transliterate_iso11940(self): self.assertEqual( transliterate("ภาษาไทย", engine="iso_11940"), "p̣hās̛̄āịthy" ) - From 9cdd39e0d6855ac6fddb73b889ebcc8808e873bb Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 13:15:00 +0700 Subject: [PATCH 25/27] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- pythainlp/transliterate/w2p.py | 92 +++++++++++++++++----------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 4011fffba..d2bb31f09 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -97,52 +97,52 @@ def _load_variables(self) -> None: import numpy as np if self.checkpoint is None: raise RuntimeError("checkpoint path is not set") - self.variables: "NDArray" = np.load(self.checkpoint, allow_pickle=False) - # (29, 64). (len(graphemes), emb) - self.enc_emb: "NDArray" = self.variables[ - "encoder_emb_weight" - ] - # (3*128, 64) - self.enc_w_ih: "NDArray" = self.variables[ - "encoder_rnn_weight_ih_l0" - ] - # (3*128, 128) - self.enc_w_hh: "NDArray" = self.variables[ - "encoder_rnn_weight_hh_l0" - ] - # (3*128,) - self.enc_b_ih: "NDArray" = self.variables[ - "encoder_rnn_bias_ih_l0" - ] - # (3*128,) - self.enc_b_hh: "NDArray" = self.variables[ - "encoder_rnn_bias_hh_l0" - ] - - # (74, 64). 
(len(phonemes), emb) - self.dec_emb: "NDArray" = self.variables[ - "decoder_emb_weight" - ] - # (3*128, 64) - self.dec_w_ih: "NDArray" = self.variables[ - "decoder_rnn_weight_ih_l0" - ] - # (3*128, 128) - self.dec_w_hh: "NDArray" = self.variables[ - "decoder_rnn_weight_hh_l0" - ] - # (3*128,) - self.dec_b_ih: "NDArray" = self.variables[ - "decoder_rnn_bias_ih_l0" - ] - # (3*128,) - self.dec_b_hh: "NDArray" = self.variables[ - "decoder_rnn_bias_hh_l0" - ] - # (74, 128) - self.fc_w: "NDArray" = self.variables["decoder_fc_weight"] - # (74,) - self.fc_b: "NDArray" = self.variables["decoder_fc_bias"] + with np.load(self.checkpoint, allow_pickle=False) as variables: + # (29, 64). (len(graphemes), emb) + self.enc_emb: "NDArray" = variables[ + "encoder_emb_weight" + ] + # (3*128, 64) + self.enc_w_ih: "NDArray" = variables[ + "encoder_rnn_weight_ih_l0" + ] + # (3*128, 128) + self.enc_w_hh: "NDArray" = variables[ + "encoder_rnn_weight_hh_l0" + ] + # (3*128,) + self.enc_b_ih: "NDArray" = variables[ + "encoder_rnn_bias_ih_l0" + ] + # (3*128,) + self.enc_b_hh: "NDArray" = variables[ + "encoder_rnn_bias_hh_l0" + ] + + # (74, 64). 
(len(phonemes), emb) + self.dec_emb: "NDArray" = variables[ + "decoder_emb_weight" + ] + # (3*128, 64) + self.dec_w_ih: "NDArray" = variables[ + "decoder_rnn_weight_ih_l0" + ] + # (3*128, 128) + self.dec_w_hh: "NDArray" = variables[ + "decoder_rnn_weight_hh_l0" + ] + # (3*128,) + self.dec_b_ih: "NDArray" = variables[ + "decoder_rnn_bias_ih_l0" + ] + # (3*128,) + self.dec_b_hh: "NDArray" = variables[ + "decoder_rnn_bias_hh_l0" + ] + # (74, 128) + self.fc_w: "NDArray" = variables["decoder_fc_weight"] + # (74,) + self.fc_b: "NDArray" = variables["decoder_fc_bias"] def _sigmoid(self, x: "np.ndarray") -> "np.ndarray": import numpy as np From b8f39fb00e7642f9ef76f9f9703a3de0246e6e77 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 13:23:39 +0700 Subject: [PATCH 26/27] Update CHANGELOG.md --- CHANGELOG.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4335e2d2a..6f9b09915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,10 +23,9 @@ This release focuses on security issues related to corpus file loading. ### Security -- Remove pickle from pythainlp/generate/thai2fit.py (#1325) -- Defensive file loading in corpus utilities: validate fields before processing - (#1327) -- Improved pickle handling (#1328) +- thai2fit: Use JSON model instead of pickle (#1325) +- Defensive corpus loading: validate fields before processing (#1327) +- w2p: Use npz model instead of pickle (#1328) ## [5.3.0] - 2026-03-10 From 500c6e55e0865b5a948027ffd4104c9e6b3de358 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 14 Mar 2026 13:27:23 +0700 Subject: [PATCH 27/27] Remove PYTHAINLP_ALLOW_UNSAFE_PICKLE from doc We no longer use pickle. Do not advertise this env var. Keep it internally for future use. 
(may remove in 6.0.0) --- README.md | 1 - README_TH.md | 1 - 2 files changed, 2 deletions(-) diff --git a/README.md b/README.md index 7e9c19d88..496bd40ca 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,6 @@ please inspect the `[project.optional-dependencies]` section of | `PYTHAINLP_OFFLINE` | Set to `1` to disable automatic corpus downloads. Explicit `download()` calls still work. | Current | | `PYTHAINLP_READ_ONLY` | Set to `1` to enable read-only mode, which prevents implicit background writes to PyThaiNLP's internal data directory (corpus downloads, catalog updates, directory creation). Explicit user-initiated saves to user-specified paths are unaffected. | Current | | `PYTHAINLP_READ_MODE` | Legacy alias for `PYTHAINLP_READ_ONLY`. Emits a `DeprecationWarning`. Setting both raises `ValueError`. | Deprecated; use `PYTHAINLP_READ_ONLY` | -| `PYTHAINLP_ALLOW_UNSAFE_PICKLE` | Set to `1` to enable loading of legacy pickle files. This is disabled by default as pickle files can execute arbitrary code. Only enable this if you trust the source of your data. 
| Current | ### Data directory diff --git a/README_TH.md b/README_TH.md index 75d197fb9..e2a36f54d 100644 --- a/README_TH.md +++ b/README_TH.md @@ -113,7 +113,6 @@ pip install "pythainlp[extra1,extra2,...]" | `PYTHAINLP_OFFLINE` | ตั้งเป็น `1` เพื่อปิดการดาวน์โหลดคลังภาษาอัตโนมัติ การเรียก `download()` โดยตรงยังคงใช้งานได้ | ปัจจุบัน | | `PYTHAINLP_READ_ONLY` | ตั้งเป็น `1` เพื่อเปิดโหมดอ่านอย่างเดียว ป้องกันการเขียนในฉากหลังที่ผู้ใช้อาจไม่ทราบ (ดาวน์โหลดคลังภาษา, ปรับปรุงแค็ตตาล็อก, สร้างไดเรกทอรี) การบันทึกแฟ้มที่ผู้ใช้ระบุเองไม่ได้รับผลกระทบ | ปัจจุบัน | | `PYTHAINLP_READ_MODE` | ชื่อเดิมของ `PYTHAINLP_READ_ONLY` แสดง `DeprecationWarning` และหากตั้งค่าทั้งสองพร้อมกันจะเกิด `ValueError` | เลิกใช้แล้ว; ใช้ `PYTHAINLP_READ_ONLY` แทน | -| `PYTHAINLP_ALLOW_UNSAFE_PICKLE` | ตั้งเป็น `1` เพื่อให้อ่านแฟ้ม pickle ได้ การอ่านแฟ้มชนิดนี้ถูกตั้งปิดเป็นค่าปริยาย เนื่องจากการอ่านแฟ้ม pickle อาจไปเรียกคำสั่งไม่พึงประสงค์ได้ เปิดใช้เมื่อคุณเชื่อถือแหล่งข้อมูลของคุณ | ปัจจุบัน | ### ไดเรกทอรีข้อมูล