Skip to content

Commit 403cc77

Browse files
authored
Allow various compat decompositions in confusables data (#1308)
- Fixes part of unicode-org/properties#460.
- Also fixes a data error in the entry for U+1F16D, which included a stray TAB character.
1 parent cfcc775 commit 403cc77

File tree

8 files changed

+1622
-329
lines changed

8 files changed

+1622
-329
lines changed

unicodetools/data/security/dev/confusables.txt

Lines changed: 525 additions & 55 deletions
Large diffs are not rendered by default.

unicodetools/data/security/dev/confusablesSummary.txt

Lines changed: 1076 additions & 246 deletions
Large diffs are not rendered by default.

unicodetools/data/security/dev/data/confusablesSummaryIdentifier.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# confusablesSummaryIdentifier.txt
2-
# Date: 2025-12-07, 22:02:22 GMT
3-
# © 2025 Unicode®, Inc.
2+
# Date: 2026-02-20, 09:37:08 GMT
3+
# © 2026 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
66
#
@@ -18,7 +18,7 @@
1818

1919
# ' ` ʻ ʼ י ׳ ’
2020
(‎ ' ‎) 0027 APOSTROPHE
21-
← (‎ ` ‎) 0060 GRAVE ACCENT # →ˋ→→`→→‘→
21+
← (‎ ` ‎) 0060 GRAVE ACCENT # →`→→‘→
2222
← (‎ ʻ ‎) 02BB MODIFIER LETTER TURNED COMMA # →‘→
2323
← (‎ ʼ ‎) 02BC MODIFIER LETTER APOSTROPHE # →′→
2424
← (‎ י ‎) 05D9 HEBREW LETTER YOD

unicodetools/data/security/dev/data/source/confusables-source.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3577,10 +3577,8 @@ FF39 ; 0059 # ( Y → Y) FULLWIDTH LATIN CAPITAL LETTER Y → LATIN CAPITA
35773577
FF3A ; 005A # ( Z → Z) FULLWIDTH LATIN CAPITAL LETTER Z → LATIN CAPITAL LETTER Z
35783578
005A ; 0396
35793579
FF3B ; 005B # ( [ → [) FULLWIDTH LEFT SQUARE BRACKET → LEFT SQUARE BRACKET
3580-
FF3B ; 3014 # ( [ → 〔) FULLWIDTH LEFT SQUARE BRACKET → LEFT TORTOISE SHELL BRACKET
35813580
FF3C ; 005C # ( \ → \) FULLWIDTH REVERSE SOLIDUS → REVERSE SOLIDUS
35823581
FF3D ; 005D # ( ] → ]) FULLWIDTH RIGHT SQUARE BRACKET → RIGHT SQUARE BRACKET
3583-
FF3D ; 3015 # ( ] → 〕) FULLWIDTH RIGHT SQUARE BRACKET → RIGHT TORTOISE SHELL BRACKET
35843582
FF3E ; 005E # ( ^ → ^) FULLWIDTH CIRCUMFLEX ACCENT → CIRCUMFLEX ACCENT
35853583
FF3E ; FE3F # ( ^ → ︿) FULLWIDTH CIRCUMFLEX ACCENT → PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
35863584
FF3F ; 005F # ( _ → _) FULLWIDTH LOW LINE → LOW LINE
@@ -5424,7 +5422,7 @@ ABBB; 0473; V8_0; ꮻ => ѳ; CHEROKEE SMALL LETTER WI => CYRILLIC SMALL LETTER F
54245422
1F10D ; ⓪ ; V13_0 ; CIRCLED ZERO WITH SLASH
54255423
1F10E ; 21BA ; V13_0 ; CIRCLED ANTICLOCKWISE ARROW
54265424
1F10F; 0024 20E0 ; V11_0 ; CIRCLED DOLLAR SIGN WITH OVERLAID BACKSLASH
5427-
1F16D ; ㏄ 20DD ; V11_0 ; CIRCLED CC
5425+
1F16D ; ㏄ 20DD ; V11_0 ; CIRCLED CC
54285426
1F16E ; C 20E0 ; V11_0 ; CIRCLED C WITH OVERLAID BACKSLASH
54295427
# 1F16F ; 🚹 ; V11_0 ; CIRCLED HUMAN FIGURE
54305428

unicodetools/data/security/dev/data/source/confusables-winFonts.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1583,10 +1583,6 @@
15831583

15841584
3003 ; 2033 #* ( 〃 ~ ″ ) DITTO MARK ~ DOUBLE PRIME
15851585

1586-
3014 ; FF3B #* ( 〔 ~ [ ) LEFT TORTOISE SHELL BRACKET ~ FULLWIDTH LEFT SQUARE BRACKET
1587-
1588-
3015 ; FF3D #* ( 〕 ~ ] ) RIGHT TORTOISE SHELL BRACKET ~ FULLWIDTH RIGHT SQUARE BRACKET
1589-
15901586
301C ; FF5E #* ( 〜 ~ ~ ) WAVE DASH ~ FULLWIDTH TILDE
15911587

15921588
#3030 ; FE4B #* ( 〰 ~ ﹋ ) WAVY DASH ~ WAVY OVERLINE

unicodetools/data/security/dev/data/source/formatted-source.txt

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# formatted-source.txt
2-
# Date: 2026-02-06, 21:41:43 GMT
2+
# Date: 2026-02-20, 09:37:07 GMT
33
# © 2026 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -4734,10 +4734,6 @@
47344734

47354735
3005 ; 206A4 # ( 々 ~ 𠚤 ) IDEOGRAPHIC ITERATION MARK ~ CJK UNIFIED IDEOGRAPH-206A4
47364736

4737-
3014 ; FF3B #* ( 〔 ~ [ ) LEFT TORTOISE SHELL BRACKET ~ FULLWIDTH LEFT SQUARE BRACKET
4738-
4739-
3015 ; FF3D #* ( 〕 ~ ] ) RIGHT TORTOISE SHELL BRACKET ~ FULLWIDTH RIGHT SQUARE BRACKET
4740-
47414737
301C ; FF5E #* ( 〜 ~ ~ ) WAVE DASH ~ FULLWIDTH TILDE
47424738

47434739
302C ; 309A # ( 〬 ~ ゚ ) IDEOGRAPHIC DEPARTING TONE MARK ~ COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
@@ -4785,7 +4781,7 @@
47854781

47864782
312D ; 5E00 # ( ㄭ ~ 帀 ) BOPOMOFO LETTER IH ~ CJK UNIFIED IDEOGRAPH-5E00
47874783

4788-
33C4 0009 20DD ; 1F16D #* ( ~ 🅭 ) SQUARE CC, <CHARACTER TABULATION>, COMBINING ENCLOSING CIRCLE ~ CIRCLED CC
4784+
33C4 20DD ; 1F16D #* ( ㏄⃝ ~ 🅭 ) SQUARE CC, COMBINING ENCLOSING CIRCLE ~ CIRCLED CC
47894785

47904786
3588 ; 439B # ( 㖈 ~ 䎛 ) CJK UNIFIED IDEOGRAPH-3588 ~ CJK UNIFIED IDEOGRAPH-439B
47914787

unicodetools/data/security/dev/data/source/formatted-winFonts.txt

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# formatted-winFonts.txt
2-
# Date: 2025-11-13, 04:42:20 GMT
3-
# © 2025 Unicode®, Inc.
2+
# Date: 2026-02-20, 09:37:07 GMT
3+
# © 2026 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
66
#
@@ -973,10 +973,6 @@
973973

974974
3003 ; 2033 #* ( 〃 ~ ″ ) DITTO MARK ~ DOUBLE PRIME
975975

976-
3014 ; FF3B #* ( 〔 ~ [ ) LEFT TORTOISE SHELL BRACKET ~ FULLWIDTH LEFT SQUARE BRACKET
977-
978-
3015 ; FF3D #* ( 〕 ~ ] ) RIGHT TORTOISE SHELL BRACKET ~ FULLWIDTH RIGHT SQUARE BRACKET
979-
980976
301C ; FF5E #* ( 〜 ~ ~ ) WAVE DASH ~ FULLWIDTH TILDE
981977

982978
3078 ; 30D8 # ( へ ~ ヘ ) HIRAGANA LETTER HE ~ KATAKANA LETTER HE

unicodetools/src/main/java/org/unicode/text/UCD/GenerateConfusables.java

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -712,26 +712,33 @@ private static UnicodeSet getSkipNFKD() {
712712
}
713713
final int decompType = DEFAULT_UCD.getDecompositionType(cp);
714714
final String nfc = Default.nfc().normalize(cp);
715+
final String mapped = NFKD.normalize(cp);
715716
if (decompType == UCD_Types.CANONICAL) {
716717
nfcMap.put(cp, nfc);
717718
}
718719
if (decompType == UCD_Types.COMPAT_CIRCLE
719720
|| decompType == UCD_Types.COMPAT_SUPER
720721
|| decompType == UCD_Types.COMPAT_SUB
721722
|| decompType == UCD_Types.COMPAT_VERTICAL
722-
|| decompType == UCD_Types.COMPAT_SMALL
723-
|| decompType == UCD_Types.COMPAT_SQUARE
724723
|| decompType == UCD_Types.COMPAT_FRACTION
725-
|| decompType == UCD_Types.COMPAT_NARROW
726-
|| decompType == UCD_Types.COMPAT_WIDE
727-
|| decompType == UCD_Types.COMPAT_WIDE
724+
|| (0x3300 <= cp && cp <= 0x3357) // Skip squared Katakana words
725+
|| cp == 0x337F // Skip SQUARE CORPORATION
726+
|| cp == 0xFE58 // Skip SMALL EM DASH since it would merge two classes
727+
// Skip FULLWIDTH HYPHEN-MINUS since it would merge two classes
728+
|| cp == 0xFF0D
729+
|| (0x1F130 <= cp && cp <= 0x1F14F) // Skip squared Latin letters
730+
|| (0x1F200 <= cp && cp <= 0x1F23B) // Skip Enclosed Ideographic Supplement
731+
// Skip square compatibility characters that have a power of 2 or 3.
732+
// This needs to be fixed in the future so we can have confusability
733+
// with the decomposition using superscript 2 and 3.
734+
|| (decompType == UCD_Types.COMPAT_SQUARE
735+
&& (mapped.contains("2") || mapped.contains("3")))
728736
|| cp == '﬩'
729737
|| cp == '︒') {
730738
_skipNFKD.add(cp);
731739
continue;
732740
}
733741
final String source = UTF16.valueOf(cp);
734-
final String mapped = NFKD.normalize(cp);
735742
String kmapped = ModifiedNFKD.normalize(source);
736743
if (!kmapped.equals(source) && !kmapped.equals(nfc)) {
737744
if (kmapped.startsWith(" ") || kmapped.startsWith("\u0640")) {

0 commit comments

Comments
 (0)