Skip to content

Commit 59edaea

Browse files
authored
Merge pull request #1320 from PyThaiNLP/copilot/identify-caching-opportunities
Add LRU caching to corpus file loaders and pronounce utilities
2 parents 7446da5 + 0aa4a10 commit 59edaea

File tree

3 files changed

+64
-44
lines changed

3 files changed

+64
-44
lines changed

pythainlp/corpus/core.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import sys
1212
import tarfile
1313
import zipfile
14+
from functools import lru_cache
1415
from importlib.resources import files
1516
from typing import TYPE_CHECKING
1617

@@ -113,6 +114,7 @@ def path_pythainlp_corpus(filename: str) -> str:
113114
return os.path.join(corpus_path(), filename)
114115

115116

117+
@lru_cache(maxsize=None)
116118
def get_corpus(filename: str, comments: bool = True) -> frozenset[str]:
117119
"""Read corpus data from file and return a frozenset.
118120
@@ -183,6 +185,7 @@ def get_corpus(filename: str, comments: bool = True) -> frozenset[str]:
183185
return frozenset(filter(None, lines))
184186

185187

188+
@lru_cache(maxsize=None)
186189
def get_corpus_as_is(filename: str) -> list[str]:
187190
"""Read corpus data from file, as it is, and return a list.
188191
@@ -218,6 +221,15 @@ def get_corpus_as_is(filename: str) -> list[str]:
218221
return lines
219222

220223

224+
@lru_cache(maxsize=None)
def _load_default_db() -> dict[str, Any]:
    """Read the bundled ``default_db.json`` corpus catalog and memoize it.

    The function takes no arguments, so the cache holds a single entry:
    the JSON resource is read and parsed on the first call only, and the
    same parsed object is handed back on every later call. Callers should
    therefore treat the returned mapping as read-only.

    :return: parsed content of ``default_db.json``
    :rtype: dict[str, Any]
    """
    resource = files("pythainlp.corpus").joinpath("default_db.json")
    # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if any.
    raw_json = resource.read_text(encoding="utf-8-sig")
    return json.loads(raw_json)  # type: ignore[no-any-return]
231+
232+
221233
def get_corpus_default_db(name: str, version: str = "") -> Optional[str]:
222234
"""Get model path from default_db.json
223235
@@ -229,10 +241,7 @@ def get_corpus_default_db(name: str, version: str = "") -> Optional[str]:
229241
If you want to edit default_db.json, \
230242
you can edit pythainlp/corpus/default_db.json
231243
"""
232-
corpus_files = files("pythainlp.corpus")
233-
default_db_file = corpus_files.joinpath("default_db.json")
234-
text = default_db_file.read_text(encoding="utf-8-sig")
235-
corpus_db = json.loads(text)
244+
corpus_db = _load_default_db()
236245

237246
if name in corpus_db:
238247
if version in corpus_db[name]["versions"]:

pythainlp/util/pronounce.py

Lines changed: 46 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from __future__ import annotations
55

66
import re
7-
from typing import Optional
7+
from functools import lru_cache
88

99
from pythainlp import thai_consonants, thai_tonemarks
1010
from pythainlp.corpus import thai_words
@@ -13,9 +13,15 @@
1313
from pythainlp.util import remove_tonemark
1414

1515
kv: KhaveeVerifier = KhaveeVerifier()
16-
all_thai_words_dict: Optional[list[str]] = None
1716

1817

18+
@lru_cache(maxsize=None)
def _single_syllable_thai_words() -> list[str]:
    """Return the words from ``thai_words()`` that tokenize to one syllable.

    The result is memoized, so the word list is scanned with
    ``syllable_tokenize`` only on the first call; later calls receive the
    same list object.

    :return: single-syllable Thai words
    :rtype: list[str]
    """
    monosyllables: list[str] = []
    for candidate in thai_words():
        # Keep only words that syllable_tokenize leaves as a single unit.
        if len(syllable_tokenize(candidate)) == 1:
            monosyllables.append(candidate)
    return monosyllables
22+
23+
24+
@lru_cache(maxsize=1024)
1925
def rhyme(word: str) -> list[str]:
2026
"""Find Thai rhyme
2127
@@ -31,16 +37,9 @@ def rhyme(word: str) -> list[str]:
3137
print(rhyme("จีบ"))
3238
# output: ['กลีบ', 'กีบ', 'ครีบ', ...]
3339
"""
34-
global all_thai_words_dict
35-
list_sumpus = []
36-
if all_thai_words_dict is None:
37-
all_thai_words_dict = [
38-
i for i in list(thai_words()) if len(syllable_tokenize(i)) == 1
39-
]
40-
for i in all_thai_words_dict:
41-
if kv.is_sumpus(word, i) and i != word:
42-
list_sumpus.append(i)
43-
return sorted(list_sumpus)
40+
return sorted(
41+
i for i in _single_syllable_thai_words() if kv.is_sumpus(word, i) and i != word
42+
)
4443

4544

4645
_vowel_str: str = "".join(
@@ -130,31 +129,18 @@ def tone_to_spelling(t: str) -> str:
130129
return t
131130

132131

133-
def spelling(word: str) -> list[str]:
134-
"""Thai word to spelling
135-
136-
This funnction support Thai root word only.
137-
138-
:param str word: A Thai word
139-
:return: spelling
140-
:rtype: List[str]
141-
142-
:Example:
143-
::
144-
145-
from pythainlp.util import spelling
146-
147-
print(spelling("เรียน"))
148-
# output: ['รอ', 'เอีย', 'นอ', 'เรียน']
149-
150-
print(spelling("เฝ้า)
151-
# output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
152-
"""
153-
if not word or not isinstance(word, str):
154-
return []
155-
thai_vowel_tokenizer = Tokenizer(
132+
@lru_cache(maxsize=None)
def _spelling_tokenizer() -> Tokenizer:
    """Build the vowel/consonant tokenizer for ``spelling()`` on first use.

    The tokenizer is constructed lazily and memoized, so every call after
    the first returns the same ``Tokenizer`` instance.

    :return: a longest-matching tokenizer whose dictionary is the vowel
        patterns in ``thai_vowel`` plus each character of
        ``thai_consonants``
    :rtype: Tokenizer
    """
    vocabulary = thai_vowel + list(thai_consonants)
    return Tokenizer(custom_dict=vocabulary, engine="longest")
138+
139+
140+
@lru_cache(maxsize=1024)
141+
def _spelling_impl(word: str) -> list[str]:
142+
"""Cached implementation of spelling() for valid string inputs."""
143+
thai_vowel_tokenizer = _spelling_tokenizer()
158144
word_pre = remove_tonemark(word).replace("็", "")
159145
tone = [tone_to_spelling(i) for i in word if i in thai_tonemarks]
160146
word_output = word_pre
@@ -179,3 +165,28 @@ def spelling(word: str) -> list[str]:
179165
return output + [word]
180166
else:
181167
return output + [word_pre, word]
168+
169+
170+
def spelling(word: str) -> list[str]:
    """Thai word to spelling

    This function supports Thai root words only.

    Results are memoized by the private ``_spelling_impl`` helper. A new
    list is returned on every call, so callers may modify the result in
    place without changing what later calls for the same word receive.

    :param str word: A Thai word
    :return: spelling; an empty list if *word* is empty or not a string
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import spelling

        print(spelling("เรียน"))
        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']

        print(spelling("เฝ้า"))
        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
    """
    # Validate before reaching the cached helper: lru_cache needs hashable
    # arguments, and empty or non-string input has no spelling anyway.
    if not word or not isinstance(word, str):
        return []
    # Shallow-copy the cached value; lru_cache returns the very same list
    # object on every hit, so handing it out directly would let one
    # caller's in-place edit leak into every later result.
    return list(_spelling_impl(word))

pythainlp/util/syllable.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def syllable_open_close_detector(syllable: str) -> str:
194194
print(syllable_open_close_detector("คะ"))
195195
# output: open
196196
"""
197-
consonants = [i for i in syllable if i in list(thai_consonants)]
197+
consonants = [i for i in syllable if i in thai_consonants]
198198

199199
if len(consonants) < 2:
200200
return "open"
@@ -225,7 +225,7 @@ def syllable_length(syllable: str) -> str:
225225
print(syllable_length("คะ"))
226226
# output: short
227227
"""
228-
consonants = [i for i in syllable if i in list(thai_consonants)]
228+
consonants = [i for i in syllable if i in thai_consonants]
229229
if len(consonants) <= 3 and any((c in set(short)) for c in syllable):
230230
return "short"
231231

@@ -236,7 +236,7 @@ def syllable_length(syllable: str) -> str:
236236

237237

238238
def _tone_mark_detector(syllable: str) -> str:
239-
tone_mark = [i for i in syllable if i in list(thai_tonemarks)]
239+
tone_mark = [i for i in syllable if i in thai_tonemarks]
240240
if tone_mark == []:
241241
return ""
242242

@@ -245,7 +245,7 @@ def _tone_mark_detector(syllable: str) -> str:
245245

246246
def _check_sonorant_syllable(syllable: str) -> bool:
247247
_sonorant = [i for i in syllable if i in thai_low_sonorants]
248-
consonants = [i for i in syllable if i in list(thai_consonants)]
248+
consonants = [i for i in syllable if i in thai_consonants]
249249

250250
# Return False if no sonorants or not enough consonants
251251
if not _sonorant or len(consonants) < 2:
@@ -289,7 +289,7 @@ def tone_detector(syllable: str) -> str:
289289
"""
290290
s = sound_syllable(syllable)
291291
# get consonants
292-
consonants = [i for i in syllable if i in list(thai_consonants)]
292+
consonants = [i for i in syllable if i in thai_consonants]
293293

294294
# Handle syllables with no consonants (e.g., ฤ, ฦ)
295295
if len(consonants) == 0:

0 commit comments

Comments
 (0)