Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .playwright-mcp/console-2026-06-16T22-12-05-217Z.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[ 2240ms] [WARNING] [antalytics] track() called before init(); event dropped @ https://platform.claude.com/_next/static/chunks/79931-0391105e084828ad.js:11
[ 2241ms] [WARNING] [antalytics] track() called before init(); event dropped @ https://platform.claude.com/_next/static/chunks/79931-0391105e084828ad.js:11
[ 3386ms] [WARNING] [Intercom] The App ID in your code snippet has not been set. Set it to your App ID found in settings to complete installation: https://app.intercom.com/a/apps/_/settings/web @ https://js.intercomcdn.com/frame-modern.01682e51.js:0
560 changes: 560 additions & 0 deletions .playwright-mcp/page-2026-06-16T22-12-07-474Z.yml

Large diffs are not rendered by default.

6 changes: 0 additions & 6 deletions claude-plugin/skills/academy-terms/data/terms.id.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,6 @@
],
"Computer Use": [
"Penggunaan Komputer"
],
"subagent": [
"subagen"
],
"subagents": [
"subagen"
]
},
"terms": {
Expand Down
11 changes: 11 additions & 0 deletions scripts/check-glossary.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,17 @@ for (const [lang, data] of Object.entries(languages)) {
console.log(` WARN [${lang}] _protected."${correct}" lists itself as a wrong form (no-op)`);
warnings++;
check1Issues++;
continue;
}
// Substring corruption: a wrong-form contained in its own correct term
// rewrites the correct term itself at restore time (unanchored replaceAll),
// e.g. "subagen" inside "subagent" → "subagentt". Hard error.
if (typeof wrong === 'string' && wrong.length > 0 && correct.includes(wrong)) {
console.log(
` ERROR [${lang}] _protected."${correct}" wrong-form "${wrong}" is a substring of the correct term — corrupts it on restore`,
);
errors++;
check1Issues++;
}
}
}
Expand Down
6 changes: 0 additions & 6 deletions src/data/id.json
Original file line number Diff line number Diff line change
Expand Up @@ -1174,12 +1174,6 @@
],
"Computer Use": [
"Penggunaan Komputer"
],
"subagent": [
"subagen"
],
"subagents": [
"subagen"
]
},
"exam_ui": {
Expand Down
52 changes: 42 additions & 10 deletions src/lib/protected-terms.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,40 @@
// Self-mapping (correct → correct) is a no-op; skip to avoid
// wasted iterations on long pages.
if (wrong === correct) continue;
// A wrong-form that is a SUBSTRING of its own correct term would corrupt
// the correct term on restore (e.g. wrong "subagen" inside correct
// "subagent" → "subagentt"). Longest-first sort can't save a true prefix,
// so drop these entirely. (check-glossary also rejects them at build.)
if (correct.includes(wrong)) continue;
map[wrong] = correct;
}
}
_protectedTermsSorted = Object.entries(map).sort((a, b) => b[0].length - a[0].length);
// Sort longest-first AND precompile a Unicode letter-boundary matcher for each
// non-CJK wrong-form: `(?<!\p{L})form(?!\p{L})` stops a form from matching
// INSIDE a longer word (e.g. Latin "Claudio" inside "Claudios") — the
// substring-corruption class the old blanket replaceAll could not avoid.
// Forms containing Kana/Han/Hangul are NOT anchored and keep the literal
// replaceAll, so CJK "기술" (skill) inside "기술자" (technician) is still
// rewritten. Also falls back to plain replaceAll if a form can't compile
// into a valid regex.
_protectedTermsSorted = Object.entries(map)
.sort((a, b) => b[0].length - a[0].length)
.map(([wrong, correct]) => {
let re = null;
// Letter-boundary anchoring is only safe for space-separated scripts
// (Latin/Cyrillic/Greek/…). CJK + Kana + Hangul have NO word separators,
// so a protected term is routinely adjacent to particles/ideographs even
// when it SHOULD be restored — anchoring there would BREAK legitimate
// restoration. For CJK forms keep the literal replaceAll; their compound
// corruption stays a per-dictionary data concern (check-glossary flags it).
const isCJK = /[぀-ヿ㐀-鿿가-힯豈-﫿ヲ-ᅵ]/.test(wrong);
if (!isCJK) {
try {
re = new RegExp('(?<!\\p{L})' + wrong.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + '(?!\\p{L})', 'gu');
} catch (_e) {
re = null; // invalid form → fall back to literal replaceAll below
}
}
return { wrong, correct, re };
});
const terms = Object.keys(protectedEntries);
_protectedKeepEnglish = terms.length > 0 ? terms.join(', ') : DEFAULT_PROTECTED_TERMS;

Expand All @@ -64,13 +94,13 @@
/**
* Fix mistranslated protected terms in the given text.
*
* Known limitation — CJK substring corruption: a Hangul/Hanzi/Kana wrong-form
* that happens to be a prefix of a legitimate longer word will still be
* replaced (e.g. wrong-form "기술" ("skill") inside "기술자" ("technician")
* yields "skill자"). The fix lives in the per-language dictionary itself:
* add the longer compound as its own entry (mapping to its correct form)
* and the longer-first sort below will match it before the shorter prefix.
* See `src/data/<lang>.json` `_protected` section.
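* Matching of non-CJK wrong-forms is Unicode letter-boundary-anchored (see
* buildProtectedTermsMap), so such a form no longer corrupts a longer word
* that merely CONTAINS it (Latin "subagen" in "subagent"). Forms containing
* Kana/Han/Hangul are still matched as plain substrings, so CJK "기술"/skill
* inside "기술자"/technician is still rewritten.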
* What anchoring CANNOT resolve is a wrong-form that is a legitimate STANDALONE
* word in the target language (e.g. "Claudio" is both GT's mistranslation of
* "Claude" AND a real Italian name) — those must be removed from the
* per-language `_protected` block; check-glossary flags the worst offenders.
*
* @param {string|null|undefined} text
* @returns {string}
Expand All @@ -83,8 +113,10 @@
if (typeof text !== 'string') return text;
if (_protectedTermsSorted.length === 0) return text;
let result = text;
for (const [wrong, correct] of _protectedTermsSorted) {
if (result.includes(wrong)) result = result.replaceAll(wrong, correct);
for (const { wrong, correct, re } of _protectedTermsSorted) {
// Cheap pre-filter: the literal must be present for either matcher to fire.
if (!result.includes(wrong)) continue;
result = re ? result.replace(re, correct) : result.replaceAll(wrong, correct);
}
// Collapse "Claude(Claude)"-style GT gloss duplicates the restore above can
// produce. Cheap paren guard keeps the common (no-paren) node off the regex.
Expand Down
26 changes: 26 additions & 0 deletions tests/protected-terms.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,29 @@ describe('no corruption of correct CJK prose (real shipped dictionaries)', () =>
});
}
});

// Regression suite for the restore engine's substring/boundary handling:
// self-containing wrong-forms, Latin letter-boundary anchoring, and the
// deliberately unanchored CJK path.
describe('restore engine — substring/boundary safety', () => {
  // Reset the protected-terms module state so a dictionary built in one test
  // does not leak into the next.
  beforeEach(() => resetProtectedTerms());

  test('drops a wrong-form that is a substring of its own correct term (no self-corruption)', () => {
    // "subagen" ⊂ "subagent": unanchored replaceAll would yield "subagentt".
    buildProtectedTermsMap('id', fakeTranslator({ subagent: ['subagen'], subagents: ['subagen'] }));
    // Already-correct text must come through untouched.
    expect(restoreProtectedTerms('Buat subagent baru')).toBe('Buat subagent baru');
    expect(restoreProtectedTerms('Daftar subagents di sini')).toBe('Daftar subagents di sini');
    // Letter-boundary anchoring alone would satisfy the two checks above, so
    // they cannot tell "dropped" from "anchored". A STANDALONE "subagen" only
    // stays unchanged if the entry was really removed from the map.
    expect(restoreProtectedTerms('Buat subagen baru')).toBe('Buat subagen baru');
  });

  test('Latin wrong-form is letter-boundary-anchored — never corrupts a longer word containing it', () => {
    // "plug" is not a case-sensitive substring of "Plugin", so the entry is
    // kept and only the standalone occurrence may be rewritten.
    buildProtectedTermsMap('xx', fakeTranslator({ Plugin: ['plug'] }));
    expect(restoreProtectedTerms('a plughole and a plug')).toBe('a plughole and a Plugin');
  });

  test('Latin standalone restoration still fires', () => {
    // A multi-word form followed by punctuation must still match: "." is not
    // a letter, so the trailing boundary holds.
    buildProtectedTermsMap('fr', fakeTranslator({ 'slash command': ['commande slash'] }));
    expect(restoreProtectedTerms('Tapez la commande slash.')).toBe('Tapez la slash command.');
  });

  test('CJK restoration is preserved when adjacent to a particle (CJK keeps substring matching)', () => {
    // The particle "는" is a letter directly after the term; a letter-boundary
    // anchor would block this legitimate restoration.
    buildProtectedTermsMap('ko', fakeTranslator({ Claude: ['클로드'] }));
    expect(restoreProtectedTerms('클로드는 유용합니다')).toBe('Claude는 유용합니다');
  });
});
Loading