From 92bac4e73082fa5b524c5ad49e7b541529f8422d Mon Sep 17 00:00:00 2001
From: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
Date: Sun, 3 Nov 2024 16:02:29 -0500
Subject: [PATCH] Add radical normalization preprocessor to Japanese, Chinese,
and Cantonese (#1559)
* Add radical normalization to japanese
* Add radical normalization to chinese
* Add CJK Strokes Range
* Add to yue
* Move normalizeRadicalCharacters to CJK-util
* Fix tests
---
.eslintrc.json | 1 +
ext/js/language/CJK-util.js | 39 +++++++++++++++++++++++++
ext/js/language/language-descriptors.js | 8 +++++
types/ext/language-descriptors.d.ts | 13 +++++++--
4 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/.eslintrc.json b/.eslintrc.json
index 64e8525893..1eb1278c19 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -560,6 +560,7 @@
"ext/js/dom/css-style-applier.js",
"ext/js/language/CJK-util.js",
"ext/js/language/ja/japanese.js",
+ "ext/js/language/text-processors.js",
"ext/js/language/text-utilities.js",
"ext/js/templates/anki-template-renderer-content-manager.js",
"ext/js/templates/anki-template-renderer.js",
diff --git a/ext/js/language/CJK-util.js b/ext/js/language/CJK-util.js
index 5c59afb58b..5c56468261 100644
--- a/ext/js/language/CJK-util.js
+++ b/ext/js/language/CJK-util.js
@@ -15,6 +15,8 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
+import {basicTextProcessorOptions} from './text-processors.js';
+
/** @type {import('CJK-util').CodepointRange} */
const CJK_UNIFIED_IDEOGRAPHS_RANGE = [0x4e00, 0x9fff];
/** @type {import('CJK-util').CodepointRange} */
@@ -94,3 +96,40 @@ export function isCodePointInRanges(codePoint, ranges) {
}
return false;
}
+
+/** @type {import('CJK-util').CodepointRange} Bounds of the Unicode "Kangxi Radicals" block (U+2F00 to U+2FDF). */
+export const KANGXI_RADICALS_RANGE = [0x2f00, 0x2fdf];
+
+/** @type {import('CJK-util').CodepointRange} Bounds of the Unicode "CJK Radicals Supplement" block (U+2E80 to U+2EFF). */
+export const CJK_RADICALS_SUPPLEMENT_RANGE = [0x2e80, 0x2eff];
+
+/** @type {import('CJK-util').CodepointRange} Bounds of the Unicode "CJK Strokes" block (U+31C0 to U+31EF). */
+export const CJK_STROKES_RANGE = [0x31c0, 0x31ef];
+
+/** @type {import('CJK-util').CodepointRange[]} Ranges that normalizeRadicals tests each character against before NFKD-normalizing it. */
+export const CJK_RADICALS_RANGES = [
+ KANGXI_RADICALS_RANGE,
+ CJK_RADICALS_SUPPLEMENT_RANGE,
+ CJK_STROKES_RANGE,
+];
+
+/**
+ * @param {string} text
+ * @returns {string}
+ */
+export function normalizeRadicals(text) {
+ let result = '';
+ for (let i = 0; i < text.length; i++) {
+ const codePoint = text[i].codePointAt(0);
+ result += codePoint && (isCodePointInRanges(codePoint, CJK_RADICALS_RANGES)) ? text[i].normalize('NFKD') : text[i];
+ }
+ return result;
+}
+
+/** @type {import('language').TextProcessor} */
+export const normalizeRadicalCharacters = {
+ name: 'Normalize radical characters',
+ description: '⼀ → 一 (U+2F00 → U+4E00)',
+ options: basicTextProcessorOptions,
+ process: (str, setting) => (setting ? normalizeRadicals(str) : str),
+};
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index a07437b711..02e2f35e1a 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -16,6 +16,7 @@
*/
import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js';
+import {normalizeRadicalCharacters} from './CJK-util.js';
import {eszettPreprocessor} from './de/german-text-preprocessors.js';
import {germanTransforms} from './de/german-transforms.js';
import {englishTransforms} from './en/english-transforms.js';
@@ -212,6 +213,7 @@ const languageDescriptors = [
convertHalfWidthCharacters,
alphabeticToHiragana,
normalizeCombiningCharacters,
+ normalizeRadicalCharacters,
alphanumericWidthVariants,
convertHiraganaToKatakana,
collapseEmphaticSequences,
@@ -374,6 +376,9 @@ const languageDescriptors = [
iso639_3: 'yue',
name: 'Cantonese',
exampleText: '讀',
+ textPreprocessors: {
+ normalizeRadicalCharacters,
+ },
},
{
iso: 'zh',
@@ -382,6 +387,9 @@ const languageDescriptors = [
exampleText: '读',
isTextLookupWorthy: isStringPartiallyChinese,
readingNormalizer: normalizePinyin,
+ textPreprocessors: {
+ normalizeRadicalCharacters,
+ },
},
];
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 62643e0c75..199ec4b587 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -135,6 +135,7 @@ type AllTextProcessors = {
convertHalfWidthCharacters: TextProcessor;
alphabeticToHiragana: TextProcessor;
normalizeCombiningCharacters: TextProcessor;
+ normalizeRadicalCharacters: TextProcessor;
alphanumericWidthVariants: BidirectionalConversionPreprocessor;
convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
@@ -200,6 +201,14 @@ type AllTextProcessors = {
normalizeDiacritics: TextProcessor<'old' | 'new' | 'off'>;
};
};
- yue: Record;
- zh: Record;
+ yue: {
+ pre: {
+ normalizeRadicalCharacters: TextProcessor;
+ };
+ };
+ zh: {
+ pre: {
+ normalizeRadicalCharacters: TextProcessor;
+ };
+ };
};