Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2c19966
Update np.load to disallow allow_pickle
wannaphong Mar 13, 2026
c5e7b87
Fix np.load allow_pickle=False to work correctly with .npz format (#1…
Copilot Mar 13, 2026
48b1f0f
Initial plan
Copilot Mar 13, 2026
8ce6415
Fix np.load allow_pickle for legacy .npy corpus (ValueError fix)
Copilot Mar 13, 2026
90eaaca
Merge pull request #1330 from PyThaiNLP/copilot/sub-pr-1328-again
bact Mar 14, 2026
d02d313
Initial plan
Copilot Mar 14, 2026
ec12c6e
Merge branch 'dev' into copilot/sub-pr-1328
Copilot Mar 14, 2026
a145055
Sync dev, add pickle warning, fix docstring and code style
Copilot Mar 14, 2026
8c5f125
Update CHANGELOG for release 5.3.1
bact Mar 14, 2026
053afe5
Merge pull request #1331 from PyThaiNLP/copilot/sub-pr-1328
bact Mar 14, 2026
44b4202
Potential fix for pull request finding
bact Mar 14, 2026
dcb07e4
Potential fix for pull request finding
bact Mar 14, 2026
8665e46
Potential fix for pull request finding
bact Mar 14, 2026
9135471
Only load pickle file when PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE is set
bact Mar 14, 2026
ba5e601
Add PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE: 1 to unittest env
bact Mar 14, 2026
c8f9d56
Change PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE to PYTHAINLP_ALLOW_UNSAFE_PICKLE
bact Mar 14, 2026
5405fd1
Sort import
bact Mar 14, 2026
ee055c0
Add test for PYTHAINLP_ALLOW_UNSAFE_PICKLE
bact Mar 14, 2026
db08a00
Fix import
bact Mar 14, 2026
a8bfe51
Set PYTHAINLP_ALLOW_UNSAFE_PICKLE for tone detector test
bact Mar 14, 2026
bec6fe1
Refactor is_unsafe_pickle_allowed()
bact Mar 14, 2026
fb27b5d
Refactor model loading to use .npz format only
wannaphong Mar 14, 2026
ddb78b6
Clean up w2p.py by removing os and warnings imports
wannaphong Mar 14, 2026
17f66ba
Remove PYTHAINLP_ALLOW_UNSAFE_PICKLE tests
bact Mar 14, 2026
1caf8f6
Update CHANGELOG.md
bact Mar 14, 2026
7a731a6
Fix imports
bact Mar 14, 2026
a52bdf1
Potential fix for pull request finding
bact Mar 14, 2026
9cdd39e
Potential fix for pull request finding
bact Mar 14, 2026
b8f39fb
Update CHANGELOG.md
bact Mar 14, 2026
500c6e5
Remove PYTHAINLP_ALLOW_UNSAFE_PICKLE from doc
bact Mar 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,17 @@ and this project adheres to
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).

- Full release notes: <https://github.com/PyThaiNLP/pythainlp/releases>
- Commit history: <https://github.com/PyThaiNLP/pythainlp/compare/v5.2.0...5.3.0>
- Commit history: <https://github.com/PyThaiNLP/pythainlp/compare/v5.3.0...v5.3.1>

## [5.3.1] - 2026-03-15

This release focuses on security issues related to corpus file loading.

### Security

- thai2fit: Use JSON model instead of pickle (#1325)
- Defensive corpus loading: validate fields before processing (#1327)
- w2p: Use npz model instead of pickle (#1328)

## [5.3.0] - 2026-03-10

Expand Down Expand Up @@ -202,6 +212,7 @@ The minimum requirement is now Python 3.9.

- See <https://github.com/PyThaiNLP/pythainlp/releases/tag/v5.0.0>

[5.3.1]: https://github.com/PyThaiNLP/pythainlp/compare/v5.3.0...v5.3.1
[5.3.0]: https://github.com/PyThaiNLP/pythainlp/compare/v5.2.0...v5.3.0
[5.2.0]: https://github.com/PyThaiNLP/pythainlp/compare/v5.1.2...v5.2.0
[5.1.2]: https://github.com/PyThaiNLP/pythainlp/compare/v5.1.1...v5.1.2
Expand Down
7 changes: 6 additions & 1 deletion pythainlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
"correct",
"is_offline_mode",
"is_read_only_mode",
"is_unsafe_pickle_allowed",
"pos_tag",
"romanize",
"spell",
Expand All @@ -78,6 +79,10 @@
subword_tokenize,
word_tokenize,
)
from pythainlp.tools.path import is_offline_mode, is_read_only_mode
from pythainlp.tools.path import (
is_offline_mode,
is_read_only_mode,
is_unsafe_pickle_allowed,
)
from pythainlp.transliterate import romanize, transliterate
from pythainlp.util import collate, thai_strftime
8 changes: 6 additions & 2 deletions pythainlp/corpus/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def thai_dict() -> dict[str, list[str]]:
for row in reader:
word = row.get("word")
meaning = row.get("meaning")
if not word or not word.strip() or not meaning or not meaning.strip():
if not (word and word.strip() and meaning and meaning.strip()):
warnings.warn(
f"Skipping thai_dict entry with missing or empty field(s): {dict(row)!r}",
UserWarning,
Expand Down Expand Up @@ -406,7 +406,11 @@ def thai_synonyms() -> dict[str, Union[list[str], list[list[str]]]]:
word = row.get("word")
pos = row.get("pos")
synonym = row.get("synonym")
if not word or not word.strip() or not pos or not pos.strip() or not synonym or not synonym.strip():
if not (
word and word.strip()
and pos and pos.strip()
and synonym and synonym.strip()
):
warnings.warn(
f"Skipping thai_synonyms entry with missing or empty field(s): {dict(row)!r}",
UserWarning,
Expand Down
4 changes: 3 additions & 1 deletion pythainlp/spell/words_spelling_correction.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ def _load_embeddings(self) -> tuple[list[str], NDArray[np.float32]]:
"""Loads embeddings matrix and vocabulary list."""
import numpy as np

input_matrix = np.load(os.path.join(self.model_dir, "embeddings.npy"))
input_matrix = np.load(
os.path.join(self.model_dir, "embeddings.npy"), allow_pickle=False
)
words = []
vocab_path = os.path.join(self.model_dir, "vocabulary.txt")
with open(vocab_path, encoding="utf-8") as f:
Expand Down
8 changes: 4 additions & 4 deletions pythainlp/tag/_tag_perceptron.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ def train(
``nr_iter`` controls the number of Perceptron training iterations.

:param sentences: A list of (words, tags) tuples.
:param save_loc: If not ``None``, saves a pickled model in this \
location.
:param save_loc: If not ``None``, saves the model as a JSON file in \
this location.
:param nr_iter: Number of training iterations.
"""
import random
Expand Down Expand Up @@ -200,7 +200,7 @@ def train(
random.shuffle(sentences_list)
self.model.average_weights()

# save the model
# save the model as JSON
if save_loc is not None:
data: dict[str, Union[dict, list]] = {}
data["weights"] = self.model.weights
Expand All @@ -210,7 +210,7 @@ def train(
json.dump(data, f, ensure_ascii=False)

def load(self, loc: str) -> None:
"""Load a pickled model.
"""Load a saved model from a JSON file.
:param str loc: model path
"""
try:
Expand Down
2 changes: 2 additions & 0 deletions pythainlp/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"get_pythainlp_path",
"is_offline_mode",
"is_read_only_mode",
"is_unsafe_pickle_allowed",
"safe_print",
"warn_deprecation",
]
Expand All @@ -20,4 +21,5 @@
get_pythainlp_path,
is_offline_mode,
is_read_only_mode,
is_unsafe_pickle_allowed,
)
16 changes: 16 additions & 0 deletions pythainlp/tools/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,22 @@ def is_read_only_mode() -> bool:
return False


def is_unsafe_pickle_allowed() -> bool:
    """Return whether loading legacy pickle-based corpus files is allowed.

    Pickle deserialisation can execute arbitrary code if the file has been
    tampered with, so it is **disabled by default**.
    Set the ``PYTHAINLP_ALLOW_UNSAFE_PICKLE`` environment variable to
    a truthy value (e.g. ``"1"``) only when you trust the corpus file and
    understand the risk.

    :return: ``True`` if legacy pickle loading is allowed, ``False`` otherwise.
    :rtype: bool
    """
    # Accept the usual truthy spellings, case-insensitively and ignoring
    # surrounding whitespace; anything else (including unset) means "no".
    truthy = {"1", "true", "yes", "on"}
    flag = os.environ.get("PYTHAINLP_ALLOW_UNSAFE_PICKLE", "")
    return flag.strip().lower() in truthy


def is_offline_mode() -> bool:
"""Return whether PyThaiNLP is operating in offline mode.

Expand Down
106 changes: 51 additions & 55 deletions pythainlp/transliterate/w2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"-พจใงต้ืฮแาฐฒฤูศฅถฺฎหคสุขเึดฟำฝยลอ็ม" + " ณิฑชฉซทรํฬฏ–ัฃวก่ปผ์ฆบี๊ธฌญะไษ๋นโภ?"
)

_MODEL_NAME: str = "thai_w2p"
_MODEL_NAME: str = "thai_w2p_npz"


class _Hparams:
Expand All @@ -40,9 +40,9 @@ class _Hparams:
hp: _Hparams = _Hparams()


def _load_vocab() -> tuple[
dict[str, int], dict[int, str], dict[str, int], dict[int, str]
]:
def _load_vocab() -> (
tuple[dict[str, int], dict[int, str], dict[str, int], dict[int, str]]
):
g2idx = {g: idx for idx, g in enumerate(hp.graphemes)}
idx2g = dict(enumerate(hp.graphemes))

Expand All @@ -60,7 +60,6 @@ class Thai_W2P:
p2idx: dict[str, int]
idx2p: dict[int, str]
checkpoint: Optional[str]
variables: "NDArray"
enc_emb: "NDArray"
enc_w_ih: "NDArray"
enc_w_hh: "NDArray"
Expand All @@ -84,9 +83,7 @@ def __init__(self) -> None:
self.p2idx: dict[str, int]
self.idx2p: dict[int, str]
self.g2idx, self.idx2g, self.p2idx, self.idx2p = _load_vocab()
self.checkpoint: Optional[str] = get_corpus_path(
_MODEL_NAME, version="0.2"
)
self.checkpoint: Optional[str] = get_corpus_path(_MODEL_NAME, version="0.2")
if not self.checkpoint:
raise FileNotFoundError(
f"corpus-not-found name={_MODEL_NAME!r}\n"
Expand All @@ -98,55 +95,54 @@ def __init__(self) -> None:

def _load_variables(self) -> None:
import numpy as np

if self.checkpoint is None:
raise RuntimeError("checkpoint path is not set")
self.variables: "NDArray" = np.load(self.checkpoint, allow_pickle=True)
# (29, 64). (len(graphemes), emb)
self.enc_emb: "NDArray" = self.variables.item().get(
"encoder.emb.weight"
)
# (3*128, 64)
self.enc_w_ih: "NDArray" = self.variables.item().get(
"encoder.rnn.weight_ih_l0"
)
# (3*128, 128)
self.enc_w_hh: "NDArray" = self.variables.item().get(
"encoder.rnn.weight_hh_l0"
)
# (3*128,)
self.enc_b_ih: "NDArray" = self.variables.item().get(
"encoder.rnn.bias_ih_l0"
)
# (3*128,)
self.enc_b_hh: "NDArray" = self.variables.item().get(
"encoder.rnn.bias_hh_l0"
)

# (74, 64). (len(phonemes), emb)
self.dec_emb: "NDArray" = self.variables.item().get(
"decoder.emb.weight"
)
# (3*128, 64)
self.dec_w_ih: "NDArray" = self.variables.item().get(
"decoder.rnn.weight_ih_l0"
)
# (3*128, 128)
self.dec_w_hh: "NDArray" = self.variables.item().get(
"decoder.rnn.weight_hh_l0"
)
# (3*128,)
self.dec_b_ih: "NDArray" = self.variables.item().get(
"decoder.rnn.bias_ih_l0"
)
# (3*128,)
self.dec_b_hh: "NDArray" = self.variables.item().get(
"decoder.rnn.bias_hh_l0"
)
# (74, 128)
self.fc_w: "NDArray" = self.variables.item().get("decoder.fc.weight")
# (74,)
self.fc_b: "NDArray" = self.variables.item().get("decoder.fc.bias")
with np.load(self.checkpoint, allow_pickle=False) as variables:
# (29, 64). (len(graphemes), emb)
self.enc_emb: "NDArray" = variables[
"encoder_emb_weight"
]
# (3*128, 64)
self.enc_w_ih: "NDArray" = variables[
"encoder_rnn_weight_ih_l0"
]
# (3*128, 128)
self.enc_w_hh: "NDArray" = variables[
"encoder_rnn_weight_hh_l0"
]
# (3*128,)
self.enc_b_ih: "NDArray" = variables[
"encoder_rnn_bias_ih_l0"
]
# (3*128,)
self.enc_b_hh: "NDArray" = variables[
"encoder_rnn_bias_hh_l0"
]

# (74, 64). (len(phonemes), emb)
self.dec_emb: "NDArray" = variables[
"decoder_emb_weight"
]
# (3*128, 64)
self.dec_w_ih: "NDArray" = variables[
"decoder_rnn_weight_ih_l0"
]
# (3*128, 128)
self.dec_w_hh: "NDArray" = variables[
"decoder_rnn_weight_hh_l0"
]
# (3*128,)
self.dec_b_ih: "NDArray" = variables[
"decoder_rnn_bias_ih_l0"
]
# (3*128,)
self.dec_b_hh: "NDArray" = variables[
"decoder_rnn_bias_hh_l0"
]
# (74, 128)
self.fc_w: "NDArray" = variables["decoder_fc_weight"]
# (74,)
self.fc_b: "NDArray" = variables["decoder_fc_bias"]

def _sigmoid(self, x: "np.ndarray") -> "np.ndarray":
import numpy as np
Expand Down
30 changes: 26 additions & 4 deletions tests/core/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@
import warnings
from unittest.mock import patch

from pythainlp import is_offline_mode, is_read_only_mode
from pythainlp import (
is_offline_mode,
is_read_only_mode,
is_unsafe_pickle_allowed,
)
from pythainlp.tools import (
get_full_data_path,
get_pythainlp_data_path,
Expand All @@ -26,9 +30,7 @@
class ToolsTestCase(unittest.TestCase):
def test_path(self):
data_filename = "ttc_freq.txt"
self.assertTrue(
get_full_data_path(data_filename).endswith(data_filename)
)
self.assertTrue(get_full_data_path(data_filename).endswith(data_filename))
self.assertIsInstance(get_pythainlp_data_path(), str)
self.assertIsInstance(get_pythainlp_path(), str)

Expand Down Expand Up @@ -272,3 +274,23 @@ def test_get_pythainlp_data_path_no_makedirs_in_read_only(self):
"Data directory should not be created in read-only mode",
)

def test_is_unsafe_pickle_allowed(self):
    """Test is_unsafe_pickle_allowed() reflects PYTHAINLP_ALLOW_UNSAFE_PICKLE env var."""
    truthy_values = ("1", "true", "True", "TRUE", "yes", "YES", "on", "ON")
    falsy_values = ("", "0", "false", "False", "FALSE", "no", "NO", "off", "OFF")

    # Each truthy spelling of the env var must enable unsafe pickle loading.
    for value in truthy_values:
        with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": value}):
            self.assertTrue(
                is_unsafe_pickle_allowed(),
                f"Expected True for PYTHAINLP_ALLOW_UNSAFE_PICKLE={value!r}",
            )

    # Each falsy spelling must keep it disabled.
    for value in falsy_values:
        with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": value}):
            self.assertFalse(
                is_unsafe_pickle_allowed(),
                f"Expected False for PYTHAINLP_ALLOW_UNSAFE_PICKLE={value!r}",
            )

    # Unset: should default to False
    with patch.dict(os.environ, {}, clear=False):
        os.environ.pop("PYTHAINLP_ALLOW_UNSAFE_PICKLE", None)
        self.assertFalse(is_unsafe_pickle_allowed())
Loading