Skip to content

Commit e57a2a4

Browse files
committed
Add legacy character normalization pre-process for Tibetan.
1 parent 2995839 commit e57a2a4

File tree

3 files changed

+47
-0
lines changed

3 files changed

+47
-0
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from logging import getLogger
2+
from os import path
3+
4+
from yaml import load as yload
5+
try:
6+
from yaml import CLoader as Loader
7+
except ImportError:
8+
from yaml import Loader
9+
10+
11+
logger = getLogger(__name__)

# Directory containing this module; the replacement table is read from the
# same directory.
MOD_BASEDIR = path.dirname(__file__)

# Legacy-to-current Roman replacement table, loaded once at import time.
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's locale encoding.
# NOTE(review): this uses the full (C)Loader rather than a safe loader. That
# is acceptable for a file located next to this module, but this call must
# never be pointed at untrusted YAML.
with open(
    path.join(MOD_BASEDIR, "tibetan_roman_preprocess.yml"),
    encoding="utf-8",
) as fh:
    pre_map = yload(fh, Loader=Loader)
17+
18+
19+
def post_normalize(ctx):
    """
    Preprocess Roman input to convert legacy mappings.

    Every key of the module-level ``pre_map`` table that occurs in
    ``ctx.src`` is replaced by its mapped value, one key at a time, in the
    table's iteration order.

    Per ``tibetan_roman_preprocess.yml``, occurrences of ṅ, ñ, ś, and ź
    (written as a base letter followed by a combining mark) are converted to
    ng, ny, sh, and zh, respectively, and U+02BE is converted to U+02BC.

    :param ctx: Object exposing ``src`` (the Roman text, rewritten in place)
        and ``orig`` (compared against ``src`` to decide whether to log).

    :return: None. The result is stored in ``ctx.src``.
    """
    for legacy, current in pre_map.items():
        ctx.src = ctx.src.replace(legacy, current)

    if ctx.orig != ctx.src:
        # Lazy %-style arguments: the message is only formatted when the
        # DEBUG level is actually enabled for this logger.
        logger.debug("Corrected Roman source: %s -> %s", ctx.orig, ctx.src)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
2+
# Tibetan 2015 changes (R2R)
3+
# Maps legacy romanization to the current forms: ng, ny, sh, zh (formerly a
# base letter plus combining diacritic) and alif (U+02BE -> U+02BC).
4+
"\u02BE": "\u02BC"
5+
"N\u0303": "Ny"
6+
"n\u0303": "ny"
7+
"N\u0307": "Ng"
8+
"n\u0307": "ng"
9+
"S\u0301": "Sh"
10+
"s\u0301": "sh"
11+
"Z\u0301": "Zh"
12+
"z\u0301": "zh"

scriptshifter/tables/data/tibetan.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ general:
77
case_sensitive: false
88

99
roman_to_script:
10+
hooks:
11+
post_normalize:
12+
-
13+
- tibetan.post_normalize
14+
1015
map:
1116

1217
# DELIMITER TSHEG (0F0C) SPACE (0020)

0 commit comments

Comments
 (0)