From 92bac4e73082fa5b524c5ad49e7b541529f8422d Mon Sep 17 00:00:00 2001 From: Kuuuube <61125188+Kuuuube@users.noreply.github.com> Date: Sun, 3 Nov 2024 16:02:29 -0500 Subject: [PATCH] Add radical normalization preprocessor to Japanese, Chinese, and Cantonese (#1559) * Add radical normalization to japanese * Add radical normalization to chinese * Add CJK Strokes Range * Add to yue * Move normalizeRadicalCharacters to CJK-util * Fix tests --- .eslintrc.json | 1 + ext/js/language/CJK-util.js | 39 +++++++++++++++++++++++++ ext/js/language/language-descriptors.js | 8 +++++ types/ext/language-descriptors.d.ts | 13 +++++++-- 4 files changed, 59 insertions(+), 2 deletions(-) diff --git a/.eslintrc.json b/.eslintrc.json index 64e8525893..1eb1278c19 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -560,6 +560,7 @@ "ext/js/dom/css-style-applier.js", "ext/js/language/CJK-util.js", "ext/js/language/ja/japanese.js", + "ext/js/language/text-processors.js", "ext/js/language/text-utilities.js", "ext/js/templates/anki-template-renderer-content-manager.js", "ext/js/templates/anki-template-renderer.js", diff --git a/ext/js/language/CJK-util.js b/ext/js/language/CJK-util.js index 5c59afb58b..5c56468261 100644 --- a/ext/js/language/CJK-util.js +++ b/ext/js/language/CJK-util.js @@ -15,6 +15,8 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>.
*/ +import {basicTextProcessorOptions} from './text-processors.js'; + /** @type {import('CJK-util').CodepointRange} */ const CJK_UNIFIED_IDEOGRAPHS_RANGE = [0x4e00, 0x9fff]; /** @type {import('CJK-util').CodepointRange} */ @@ -94,3 +96,40 @@ export function isCodePointInRanges(codePoint, ranges) { } return false; } + +/** @type {import('CJK-util').CodepointRange} */ +export const KANGXI_RADICALS_RANGE = [0x2f00, 0x2fdf]; + +/** @type {import('CJK-util').CodepointRange} */ +export const CJK_RADICALS_SUPPLEMENT_RANGE = [0x2e80, 0x2eff]; + +/** @type {import('CJK-util').CodepointRange} */ +export const CJK_STROKES_RANGE = [0x31c0, 0x31ef]; + +/** @type {import('CJK-util').CodepointRange[]} */ +export const CJK_RADICALS_RANGES = [ + KANGXI_RADICALS_RANGE, + CJK_RADICALS_SUPPLEMENT_RANGE, + CJK_STROKES_RANGE, +]; + +/** + * @param {string} text + * @returns {string} + */ +export function normalizeRadicals(text) { + let result = ''; + for (let i = 0; i < text.length; i++) { + const codePoint = text[i].codePointAt(0); + result += codePoint && (isCodePointInRanges(codePoint, CJK_RADICALS_RANGES)) ? text[i].normalize('NFKD') : text[i]; + } + return result; +} + +/** @type {import('language').TextProcessor} */ +export const normalizeRadicalCharacters = { + name: 'Normalize radical characters', + description: '⼀ → 一 (U+2F00 → U+4E00)', + options: basicTextProcessorOptions, + process: (str, setting) => (setting ? 
normalizeRadicals(str) : str), +}; diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index a07437b711..02e2f35e1a 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -16,6 +16,7 @@ */ import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js'; +import {normalizeRadicalCharacters} from './CJK-util.js'; import {eszettPreprocessor} from './de/german-text-preprocessors.js'; import {germanTransforms} from './de/german-transforms.js'; import {englishTransforms} from './en/english-transforms.js'; @@ -212,6 +213,7 @@ const languageDescriptors = [ convertHalfWidthCharacters, alphabeticToHiragana, normalizeCombiningCharacters, + normalizeRadicalCharacters, alphanumericWidthVariants, convertHiraganaToKatakana, collapseEmphaticSequences, @@ -374,6 +376,9 @@ const languageDescriptors = [ iso639_3: 'yue', name: 'Cantonese', exampleText: '讀', + textPreprocessors: { + normalizeRadicalCharacters, + }, }, { iso: 'zh', @@ -382,6 +387,9 @@ const languageDescriptors = [ exampleText: '读', isTextLookupWorthy: isStringPartiallyChinese, readingNormalizer: normalizePinyin, + textPreprocessors: { + normalizeRadicalCharacters, + }, }, ]; diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index 62643e0c75..199ec4b587 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -135,6 +135,7 @@ type AllTextProcessors = { convertHalfWidthCharacters: TextProcessor; alphabeticToHiragana: TextProcessor; normalizeCombiningCharacters: TextProcessor; + normalizeRadicalCharacters: TextProcessor; alphanumericWidthVariants: BidirectionalConversionPreprocessor; convertHiraganaToKatakana: BidirectionalConversionPreprocessor; collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>; @@ -200,6 +201,14 @@ type AllTextProcessors = { normalizeDiacritics: TextProcessor<'old' | 'new' | 
'off'>; }; }; - yue: Record; - zh: Record; + yue: { + pre: { + normalizeRadicalCharacters: TextProcessor; + }; + }; + zh: { + pre: { + normalizeRadicalCharacters: TextProcessor; + }; + }; };