Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2c19966
Update np.load to disallow allow_pickle
wannaphong Mar 13, 2026
c5e7b87
Fix np.load allow_pickle=False to work correctly with .npz format (#1…
Copilot Mar 13, 2026
48b1f0f
Initial plan
Copilot Mar 13, 2026
8ce6415
Fix np.load allow_pickle for legacy .npy corpus (ValueError fix)
Copilot Mar 13, 2026
90eaaca
Merge pull request #1330 from PyThaiNLP/copilot/sub-pr-1328-again
bact Mar 14, 2026
d02d313
Initial plan
Copilot Mar 14, 2026
ec12c6e
Merge branch 'dev' into copilot/sub-pr-1328
Copilot Mar 14, 2026
a145055
Sync dev, add pickle warning, fix docstring and code style
Copilot Mar 14, 2026
8c5f125
Update CHANGELOG for release 5.3.1
bact Mar 14, 2026
053afe5
Merge pull request #1331 from PyThaiNLP/copilot/sub-pr-1328
bact Mar 14, 2026
44b4202
Potential fix for pull request finding
bact Mar 14, 2026
dcb07e4
Potential fix for pull request finding
bact Mar 14, 2026
8665e46
Potential fix for pull request finding
bact Mar 14, 2026
9135471
Only load pickle file when PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE is set
bact Mar 14, 2026
ba5e601
Add PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE: 1 to unittest env
bact Mar 14, 2026
c8f9d56
Change PYTHAINLP_W2P_ALLOW_LEGACY_PICKLE to PYTHAINLP_ALLOW_UNSAFE_PICKLE
bact Mar 14, 2026
5405fd1
Sort import
bact Mar 14, 2026
ee055c0
Add test for PYTHAINLP_ALLOW_UNSAFE_PICKLE
bact Mar 14, 2026
db08a00
Fix import
bact Mar 14, 2026
a8bfe51
Set PYTHAINLP_ALLOW_UNSAFE_PICKLE for tone detector test
bact Mar 14, 2026
bec6fe1
Refactor is_unsafe_pickle_allowed()
bact Mar 14, 2026
fb27b5d
Refactor model loading to use .npz format only
wannaphong Mar 14, 2026
ddb78b6
Clean up w2p.py by removing os and warnings imports
wannaphong Mar 14, 2026
17f66ba
Remove PYTHAINLP_ALLOW_UNSAFE_PICKLE tests
bact Mar 14, 2026
1caf8f6
Update CHANGELOG.md
bact Mar 14, 2026
7a731a6
Fix imports
bact Mar 14, 2026
a52bdf1
Potential fix for pull request finding
bact Mar 14, 2026
9cdd39e
Potential fix for pull request finding
bact Mar 14, 2026
b8f39fb
Update CHANGELOG.md
bact Mar 14, 2026
500c6e5
Remove PYTHAINLP_ALLOW_UNSAFE_PICKLE from doc
bact Mar 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,17 @@ and this project adheres to
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).

- Full release notes: <https://github.com/PyThaiNLP/pythainlp/releases>
- Commit history: <https://github.com/PyThaiNLP/pythainlp/compare/v5.2.0...5.3.0>
- Commit history: <https://github.com/PyThaiNLP/pythainlp/compare/v5.3.0...v5.3.1>

## [5.3.1] - 2026-03-15

This release focuses on security issues related to corpus file loading.

### Security

- thai2fit: Use JSON model instead of pickle (#1325)
- Defensive corpus loading: validate fields before processing (#1327)
- w2p: Use npz model instead of pickle (#1328)

## [5.3.0] - 2026-03-10

Expand Down Expand Up @@ -202,6 +212,7 @@ The minimum requirement is now Python 3.9.

- See <https://github.com/PyThaiNLP/pythainlp/releases/tag/v5.0.0>

[5.3.1]: https://github.com/PyThaiNLP/pythainlp/compare/v5.3.0...v5.3.1
[5.3.0]: https://github.com/PyThaiNLP/pythainlp/compare/v5.2.0...v5.3.0
[5.2.0]: https://github.com/PyThaiNLP/pythainlp/compare/v5.1.2...v5.2.0
[5.1.2]: https://github.com/PyThaiNLP/pythainlp/compare/v5.1.1...v5.1.2
Expand Down
7 changes: 6 additions & 1 deletion pythainlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
"correct",
"is_offline_mode",
"is_read_only_mode",
"is_unsafe_pickle_allowed",
"pos_tag",
"romanize",
"spell",
Expand All @@ -78,6 +79,10 @@
subword_tokenize,
word_tokenize,
)
from pythainlp.tools.path import is_offline_mode, is_read_only_mode
from pythainlp.tools.path import (
is_offline_mode,
is_read_only_mode,
is_unsafe_pickle_allowed,
)
from pythainlp.transliterate import romanize, transliterate
from pythainlp.util import collate, thai_strftime
8 changes: 6 additions & 2 deletions pythainlp/corpus/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def thai_dict() -> dict[str, list[str]]:
for row in reader:
word = row.get("word")
meaning = row.get("meaning")
if not word or not word.strip() or not meaning or not meaning.strip():
if not (word and word.strip() and meaning and meaning.strip()):
warnings.warn(
f"Skipping thai_dict entry with missing or empty field(s): {dict(row)!r}",
UserWarning,
Expand Down Expand Up @@ -406,7 +406,11 @@ def thai_synonyms() -> dict[str, Union[list[str], list[list[str]]]]:
word = row.get("word")
pos = row.get("pos")
synonym = row.get("synonym")
if not word or not word.strip() or not pos or not pos.strip() or not synonym or not synonym.strip():
if not (
word and word.strip()
and pos and pos.strip()
and synonym and synonym.strip()
):
warnings.warn(
f"Skipping thai_synonyms entry with missing or empty field(s): {dict(row)!r}",
UserWarning,
Expand Down
4 changes: 3 additions & 1 deletion pythainlp/spell/words_spelling_correction.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ def _load_embeddings(self) -> tuple[list[str], NDArray[np.float32]]:
"""Loads embeddings matrix and vocabulary list."""
import numpy as np

input_matrix = np.load(os.path.join(self.model_dir, "embeddings.npy"))
input_matrix = np.load(
os.path.join(self.model_dir, "embeddings.npy"), allow_pickle=False
)
words = []
vocab_path = os.path.join(self.model_dir, "vocabulary.txt")
with open(vocab_path, encoding="utf-8") as f:
Expand Down
8 changes: 4 additions & 4 deletions pythainlp/tag/_tag_perceptron.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ def train(
``nr_iter`` controls the number of Perceptron training iterations.

:param sentences: A list of (words, tags) tuples.
:param save_loc: If not ``None``, saves a pickled model in this \
location.
:param save_loc: If not ``None``, saves the model as a JSON file in \
this location.
:param nr_iter: Number of training iterations.
"""
import random
Expand Down Expand Up @@ -200,7 +200,7 @@ def train(
random.shuffle(sentences_list)
self.model.average_weights()

# save the model
# save the model as JSON
if save_loc is not None:
data: dict[str, Union[dict, list]] = {}
data["weights"] = self.model.weights
Expand All @@ -210,7 +210,7 @@ def train(
json.dump(data, f, ensure_ascii=False)

def load(self, loc: str) -> None:
"""Load a pickled model.
"""Load a saved model from a JSON file.
:param str loc: model path
"""
try:
Expand Down
2 changes: 2 additions & 0 deletions pythainlp/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"get_pythainlp_path",
"is_offline_mode",
"is_read_only_mode",
"is_unsafe_pickle_allowed",
"safe_print",
"warn_deprecation",
]
Expand All @@ -20,4 +21,5 @@
get_pythainlp_path,
is_offline_mode,
is_read_only_mode,
is_unsafe_pickle_allowed,
)
16 changes: 16 additions & 0 deletions pythainlp/tools/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,22 @@ def is_read_only_mode() -> bool:
return False


def is_unsafe_pickle_allowed() -> bool:
    """Return whether loading legacy pickle-based corpus files is allowed.

    Pickle deserialisation can execute arbitrary code if the file has been
    tampered with, so it is **disabled by default**.
    Set the ``PYTHAINLP_ALLOW_UNSAFE_PICKLE`` environment variable to
    a truthy value (e.g. ``"1"``) only when you trust the corpus file and
    understand the risk.

    :return: ``True`` if legacy pickle loading is allowed, ``False`` otherwise.
    :rtype: bool
    """
    # Accept the usual truthy spellings, case-insensitively and ignoring
    # surrounding whitespace; anything else (including unset) means "no".
    truthy = {"1", "true", "yes", "on"}
    flag = os.environ.get("PYTHAINLP_ALLOW_UNSAFE_PICKLE", "")
    return flag.strip().lower() in truthy


def is_offline_mode() -> bool:
"""Return whether PyThaiNLP is operating in offline mode.

Expand Down
106 changes: 51 additions & 55 deletions pythainlp/transliterate/w2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"-พจใงต้ืฮแาฐฒฤูศฅถฺฎหคสุขเึดฟำฝยลอ็ม" + " ณิฑชฉซทรํฬฏ–ัฃวก่ปผ์ฆบี๊ธฌญะไษ๋นโภ?"
)

_MODEL_NAME: str = "thai_w2p"
_MODEL_NAME: str = "thai_w2p_npz"


class _Hparams:
Expand All @@ -40,9 +40,9 @@ class _Hparams:
hp: _Hparams = _Hparams()


def _load_vocab() -> tuple[
dict[str, int], dict[int, str], dict[str, int], dict[int, str]
]:
def _load_vocab() -> (
tuple[dict[str, int], dict[int, str], dict[str, int], dict[int, str]]
):
g2idx = {g: idx for idx, g in enumerate(hp.graphemes)}
idx2g = dict(enumerate(hp.graphemes))

Expand All @@ -60,7 +60,6 @@ class Thai_W2P:
p2idx: dict[str, int]
idx2p: dict[int, str]
checkpoint: Optional[str]
variables: "NDArray"
enc_emb: "NDArray"
enc_w_ih: "NDArray"
enc_w_hh: "NDArray"
Expand All @@ -84,9 +83,7 @@ def __init__(self) -> None:
self.p2idx: dict[str, int]
self.idx2p: dict[int, str]
self.g2idx, self.idx2g, self.p2idx, self.idx2p = _load_vocab()
self.checkpoint: Optional[str] = get_corpus_path(
_MODEL_NAME, version="0.2"
)
self.checkpoint: Optional[str] = get_corpus_path(_MODEL_NAME, version="0.2")
if not self.checkpoint:
raise FileNotFoundError(
f"corpus-not-found name={_MODEL_NAME!r}\n"
Expand All @@ -98,55 +95,54 @@ def __init__(self) -> None:

def _load_variables(self) -> None:
import numpy as np

if self.checkpoint is None:
raise RuntimeError("checkpoint path is not set")
self.variables: "NDArray" = np.load(self.checkpoint, allow_pickle=True)
# (29, 64). (len(graphemes), emb)
self.enc_emb: "NDArray" = self.variables.item().get(
"encoder.emb.weight"
)
# (3*128, 64)
self.enc_w_ih: "NDArray" = self.variables.item().get(
"encoder.rnn.weight_ih_l0"
)
# (3*128, 128)
self.enc_w_hh: "NDArray" = self.variables.item().get(
"encoder.rnn.weight_hh_l0"
)
# (3*128,)
self.enc_b_ih: "NDArray" = self.variables.item().get(
"encoder.rnn.bias_ih_l0"
)
# (3*128,)
self.enc_b_hh: "NDArray" = self.variables.item().get(
"encoder.rnn.bias_hh_l0"
)

# (74, 64). (len(phonemes), emb)
self.dec_emb: "NDArray" = self.variables.item().get(
"decoder.emb.weight"
)
# (3*128, 64)
self.dec_w_ih: "NDArray" = self.variables.item().get(
"decoder.rnn.weight_ih_l0"
)
# (3*128, 128)
self.dec_w_hh: "NDArray" = self.variables.item().get(
"decoder.rnn.weight_hh_l0"
)
# (3*128,)
self.dec_b_ih: "NDArray" = self.variables.item().get(
"decoder.rnn.bias_ih_l0"
)
# (3*128,)
self.dec_b_hh: "NDArray" = self.variables.item().get(
"decoder.rnn.bias_hh_l0"
)
# (74, 128)
self.fc_w: "NDArray" = self.variables.item().get("decoder.fc.weight")
# (74,)
self.fc_b: "NDArray" = self.variables.item().get("decoder.fc.bias")
with np.load(self.checkpoint, allow_pickle=False) as variables:
# (29, 64). (len(graphemes), emb)
self.enc_emb: "NDArray" = variables[
"encoder_emb_weight"
]
# (3*128, 64)
self.enc_w_ih: "NDArray" = variables[
"encoder_rnn_weight_ih_l0"
]
# (3*128, 128)
self.enc_w_hh: "NDArray" = variables[
"encoder_rnn_weight_hh_l0"
]
# (3*128,)
self.enc_b_ih: "NDArray" = variables[
"encoder_rnn_bias_ih_l0"
]
# (3*128,)
self.enc_b_hh: "NDArray" = variables[
"encoder_rnn_bias_hh_l0"
]

# (74, 64). (len(phonemes), emb)
self.dec_emb: "NDArray" = variables[
"decoder_emb_weight"
]
# (3*128, 64)
self.dec_w_ih: "NDArray" = variables[
"decoder_rnn_weight_ih_l0"
]
# (3*128, 128)
self.dec_w_hh: "NDArray" = variables[
"decoder_rnn_weight_hh_l0"
]
# (3*128,)
self.dec_b_ih: "NDArray" = variables[
"decoder_rnn_bias_ih_l0"
]
# (3*128,)
self.dec_b_hh: "NDArray" = variables[
"decoder_rnn_bias_hh_l0"
]
# (74, 128)
self.fc_w: "NDArray" = variables["decoder_fc_weight"]
# (74,)
self.fc_b: "NDArray" = variables["decoder_fc_bias"]

def _sigmoid(self, x: "np.ndarray") -> "np.ndarray":
import numpy as np
Expand Down
30 changes: 26 additions & 4 deletions tests/core/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@
import warnings
from unittest.mock import patch

from pythainlp import is_offline_mode, is_read_only_mode
from pythainlp import (
is_offline_mode,
is_read_only_mode,
is_unsafe_pickle_allowed,
)
from pythainlp.tools import (
get_full_data_path,
get_pythainlp_data_path,
Expand All @@ -26,9 +30,7 @@
class ToolsTestCase(unittest.TestCase):
def test_path(self):
data_filename = "ttc_freq.txt"
self.assertTrue(
get_full_data_path(data_filename).endswith(data_filename)
)
self.assertTrue(get_full_data_path(data_filename).endswith(data_filename))
self.assertIsInstance(get_pythainlp_data_path(), str)
self.assertIsInstance(get_pythainlp_path(), str)

Expand Down Expand Up @@ -272,3 +274,23 @@ def test_get_pythainlp_data_path_no_makedirs_in_read_only(self):
"Data directory should not be created in read-only mode",
)

def test_is_unsafe_pickle_allowed(self):
    """Test is_unsafe_pickle_allowed() reflects PYTHAINLP_ALLOW_UNSAFE_PICKLE env var."""
    truthy_values = ("1", "true", "True", "TRUE", "yes", "YES", "on", "ON")
    falsy_values = ("", "0", "false", "False", "FALSE", "no", "NO", "off", "OFF")

    # Each truthy spelling of the env var must enable unsafe pickle loading.
    for value in truthy_values:
        with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": value}):
            self.assertTrue(
                is_unsafe_pickle_allowed(),
                f"Expected True for PYTHAINLP_ALLOW_UNSAFE_PICKLE={value!r}",
            )

    # Each falsy spelling must keep it disabled.
    for value in falsy_values:
        with patch.dict(os.environ, {"PYTHAINLP_ALLOW_UNSAFE_PICKLE": value}):
            self.assertFalse(
                is_unsafe_pickle_allowed(),
                f"Expected False for PYTHAINLP_ALLOW_UNSAFE_PICKLE={value!r}",
            )

    # Unset: should default to False
    with patch.dict(os.environ, {}, clear=False):
        os.environ.pop("PYTHAINLP_ALLOW_UNSAFE_PICKLE", None)
        self.assertFalse(is_unsafe_pickle_allowed())
Loading