Add legacy character normalization pre-process for Tibetan.

scossu · scossu · commit e57a2a484188 · 2025-11-23T18:30:42.000-05:00
diff --git a/scriptshifter/hooks/tibetan/__init__.py b/scriptshifter/hooks/tibetan/__init__.py
@@ -0,0 +1,30 @@
+from logging import getLogger
+from os import path
+
+from yaml import load as yload
+try:
+    from yaml import CLoader as Loader
+except ImportError:
+    from yaml import Loader
+
+
+logger = getLogger(__name__)
+MOD_BASEDIR = path.dirname(__file__)
+# Explicit UTF-8 so loading never depends on the platform default encoding.
+_PRE_MAP_PATH = path.join(MOD_BASEDIR, "tibetan_roman_preprocess.yml")
+with open(_PRE_MAP_PATH, encoding="utf-8") as fh:
+    pre_map = yload(fh, Loader=Loader)
+
+
+def post_normalize(ctx):
+    """
+    Convert legacy Roman spellings in ``ctx.src`` to their current forms.
+
+    Decomposed ṅ, ñ, ś, ź (either case) become ng, ny, sh, zh, and U+02BE
+    becomes U+02BC. Precomposed input is presumably decomposed upstream.
+    """
+    before = ctx.src  # ctx.orig may already differ due to earlier steps.
+    for k, v in pre_map.items():
+        ctx.src = ctx.src.replace(k, v)
+    if before != ctx.src:  # Report only what this hook changed.
+        logger.debug("Corrected Roman source: %s -> %s", before, ctx.src)
diff --git a/scriptshifter/hooks/tibetan/tibetan_roman_preprocess.yml b/scriptshifter/hooks/tibetan/tibetan_roman_preprocess.yml
@@ -0,0 +1,12 @@
+---
+# Tibetan 2015 changes (R2R)
+# Corrects legacy spellings (keys are decomposed: base letter + combining mark) of ng, ny, sh, zh and alif
+"\u02BE": "\u02BC"
+"N\u0303": "Ny"
+"n\u0303": "ny"
+"N\u0307": "Ng"
+"n\u0307": "ng"
+"S\u0301": "Sh"
+"s\u0301": "sh"
+"Z\u0301": "Zh"
+"z\u0301": "zh"
diff --git a/scriptshifter/tables/data/tibetan.yml b/scriptshifter/tables/data/tibetan.yml
@@ -7,6 +7,11 @@ general:
   case_sensitive: false
 
 roman_to_script:
+  hooks:
+    post_normalize:
+      -
+        - tibetan.post_normalize
+
   map:
 
     # DELIMITER TSHEG (0F0C) SPACE (0020)