Merge pull request #1320 from PyThaiNLP/copilot/identify-caching-opportunities

bact · web-flow · commit 59edaea51456 · 2026-03-10T01:04:42.000+07:00
Add LRU caching to corpus file loaders and pronounce utilities
diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py
@@ -11,6 +11,7 @@
 import sys
 import tarfile
 import zipfile
+from functools import lru_cache
 from importlib.resources import files
 from typing import TYPE_CHECKING
 
@@ -113,6 +114,7 @@ def path_pythainlp_corpus(filename: str) -> str:
     return os.path.join(corpus_path(), filename)
 
 
+@lru_cache(maxsize=None)
 def get_corpus(filename: str, comments: bool = True) -> frozenset[str]:
     """Read corpus data from file and return a frozenset.
 
@@ -183,6 +185,7 @@ def get_corpus(filename: str, comments: bool = True) -> frozenset[str]:
     return frozenset(filter(None, lines))
 
 
+@lru_cache(maxsize=None)
 def get_corpus_as_is(filename: str) -> list[str]:
     """Read corpus data from file, as it is, and return a list.
 
@@ -218,6 +221,15 @@ def get_corpus_as_is(filename: str) -> list[str]:
     return lines
 
 
+@lru_cache(maxsize=None)
+def _load_default_db() -> dict[str, Any]:
+    """Parse the packaged default_db.json once; later calls reuse the dict."""
+    db_resource = files("pythainlp.corpus").joinpath("default_db.json")
+    # utf-8-sig transparently drops a leading BOM if the file has one.
+    raw_json = db_resource.read_text(encoding="utf-8-sig")
+    return json.loads(raw_json)  # type: ignore[no-any-return]
+
+
 def get_corpus_default_db(name: str, version: str = "") -> Optional[str]:
     """Get model path from default_db.json
 
@@ -229,10 +241,7 @@ def get_corpus_default_db(name: str, version: str = "") -> Optional[str]:
     If you want to edit default_db.json, \
         you can edit pythainlp/corpus/default_db.json
     """
-    corpus_files = files("pythainlp.corpus")
-    default_db_file = corpus_files.joinpath("default_db.json")
-    text = default_db_file.read_text(encoding="utf-8-sig")
-    corpus_db = json.loads(text)
+    corpus_db = _load_default_db()
 
     if name in corpus_db:
         if version in corpus_db[name]["versions"]:
diff --git a/pythainlp/util/pronounce.py b/pythainlp/util/pronounce.py
@@ -4,7 +4,7 @@
 from __future__ import annotations
 
 import re
-from typing import Optional
+from functools import lru_cache
 
 from pythainlp import thai_consonants, thai_tonemarks
 from pythainlp.corpus import thai_words
@@ -13,9 +13,15 @@
 from pythainlp.util import remove_tonemark
 
 kv: KhaveeVerifier = KhaveeVerifier()
-all_thai_words_dict: Optional[list[str]] = None
 
 
+@lru_cache(maxsize=None)
+def _single_syllable_thai_words() -> list[str]:
+    """Build once, then reuse, the thai_words() entries that are one syllable."""
+    return [w for w in thai_words() if len(syllable_tokenize(w)) == 1]
+
+
+@lru_cache(maxsize=1024)
 def rhyme(word: str) -> list[str]:
     """Find Thai rhyme
 
@@ -31,16 +37,9 @@ def rhyme(word: str) -> list[str]:
         print(rhyme("จีบ"))
         # output: ['กลีบ', 'กีบ', 'ครีบ', ...]
     """
-    global all_thai_words_dict
-    list_sumpus = []
-    if all_thai_words_dict is None:
-        all_thai_words_dict = [
-            i for i in list(thai_words()) if len(syllable_tokenize(i)) == 1
-        ]
-    for i in all_thai_words_dict:
-        if kv.is_sumpus(word, i) and i != word:
-            list_sumpus.append(i)
-    return sorted(list_sumpus)
+    return sorted(
+        i for i in _single_syllable_thai_words() if kv.is_sumpus(word, i) and i != word
+    )
 
 
 _vowel_str: str = "".join(
@@ -130,31 +129,18 @@ def tone_to_spelling(t: str) -> str:
     return t
 
 
-def spelling(word: str) -> list[str]:
-    """Thai word to spelling
-
-    This funnction support Thai root word only.
-
-    :param str word: A Thai word
-    :return: spelling
-    :rtype: List[str]
-
-    :Example:
-    ::
-
-        from pythainlp.util import spelling
-
-        print(spelling("เรียน"))
-        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']
-
-        print(spelling("เฝ้า)
-        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
-    """
-    if not word or not isinstance(word, str):
-        return []
-    thai_vowel_tokenizer = Tokenizer(
+@lru_cache(maxsize=None)
+def _spelling_tokenizer() -> Tokenizer:
+    """Create the vowel/consonant Tokenizer (engine="longest") once and cache it."""
+    return Tokenizer(
         custom_dict=thai_vowel + list(thai_consonants), engine="longest"
     )
+
+
+@lru_cache(maxsize=1024)
+def _spelling_impl(word: str) -> list[str]:
+    """Cached implementation of spelling() for valid string inputs."""
+    thai_vowel_tokenizer = _spelling_tokenizer()
     word_pre = remove_tonemark(word).replace("็", "")
     tone = [tone_to_spelling(i) for i in word if i in thai_tonemarks]
     word_output = word_pre
@@ -179,3 +165,28 @@ def spelling(word: str) -> list[str]:
         return output + [word]
     else:
         return output + [word_pre, word]
+
+
+def spelling(word: str) -> list[str]:
+    """Thai word to spelling
+
+    This function supports Thai root words only.
+
+    :param str word: A Thai word
+    :return: spelling, as a new list on every call (safe to mutate)
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util import spelling
+
+        print(spelling("เรียน"))
+        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']
+
+        print(spelling("เฝ้า"))
+        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
+    """
+    if not word or not isinstance(word, str):
+        return []
+    return list(_spelling_impl(word))  # copy: never hand out the cached list
diff --git a/pythainlp/util/syllable.py b/pythainlp/util/syllable.py
@@ -194,7 +194,7 @@ def syllable_open_close_detector(syllable: str) -> str:
         print(syllable_open_close_detector("คะ"))
         # output: open
     """
-    consonants = [i for i in syllable if i in list(thai_consonants)]
+    consonants = [i for i in syllable if i in thai_consonants]
 
     if len(consonants) < 2:
         return "open"
@@ -225,7 +225,7 @@ def syllable_length(syllable: str) -> str:
         print(syllable_length("คะ"))
         # output: short
     """
-    consonants = [i for i in syllable if i in list(thai_consonants)]
+    consonants = [i for i in syllable if i in thai_consonants]
     if len(consonants) <= 3 and any((c in set(short)) for c in syllable):
         return "short"
 
@@ -236,7 +236,7 @@ def syllable_length(syllable: str) -> str:
 
 
 def _tone_mark_detector(syllable: str) -> str:
-    tone_mark = [i for i in syllable if i in list(thai_tonemarks)]
+    tone_mark = [i for i in syllable if i in thai_tonemarks]
     if tone_mark == []:
         return ""
 
@@ -245,7 +245,7 @@ def _tone_mark_detector(syllable: str) -> str:
 
 def _check_sonorant_syllable(syllable: str) -> bool:
     _sonorant = [i for i in syllable if i in thai_low_sonorants]
-    consonants = [i for i in syllable if i in list(thai_consonants)]
+    consonants = [i for i in syllable if i in thai_consonants]
 
     # Return False if no sonorants or not enough consonants
     if not _sonorant or len(consonants) < 2:
@@ -289,7 +289,7 @@ def tone_detector(syllable: str) -> str:
     """
     s = sound_syllable(syllable)
     # get consonants
-    consonants = [i for i in syllable if i in list(thai_consonants)]
+    consonants = [i for i in syllable if i in thai_consonants]
 
     # Handle syllables with no consonants (e.g., ฤ, ฦ)
     if len(consonants) == 0: