Skip to content

Commit

Permalink
Add radical normalization preprocessor to Japanese, Chinese, and Cantonese (#1559)
Browse files Browse the repository at this point in the history

* Add radical normalization to japanese

* Add radical normalization to chinese

* Add CJK Strokes Range

* Add to yue

* Move normalizeRadicalCharacters to CJK-util

* Fix tests
  • Loading branch information
Kuuuube authored Nov 3, 2024
1 parent 7c198d4 commit 92bac4e
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 2 deletions.
1 change: 1 addition & 0 deletions .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,7 @@
"ext/js/dom/css-style-applier.js",
"ext/js/language/CJK-util.js",
"ext/js/language/ja/japanese.js",
"ext/js/language/text-processors.js",
"ext/js/language/text-utilities.js",
"ext/js/templates/anki-template-renderer-content-manager.js",
"ext/js/templates/anki-template-renderer.js",
Expand Down
39 changes: 39 additions & 0 deletions ext/js/language/CJK-util.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

import {basicTextProcessorOptions} from './text-processors.js';

/** @type {import('CJK-util').CodepointRange} */
const CJK_UNIFIED_IDEOGRAPHS_RANGE = [0x4e00, 0x9fff];
/** @type {import('CJK-util').CodepointRange} */
Expand Down Expand Up @@ -94,3 +96,40 @@ export function isCodePointInRanges(codePoint, ranges) {
}
return false;
}

/** @type {import('CJK-util').CodepointRange} */
export const KANGXI_RADICALS_RANGE = [0x2f00, 0x2fdf];

/** @type {import('CJK-util').CodepointRange} */
export const CJK_RADICALS_SUPPLEMENT_RANGE = [0x2e80, 0x2eff];

/** @type {import('CJK-util').CodepointRange} */
export const CJK_STROKES_RANGE = [0x31c0, 0x31ef];

/** @type {import('CJK-util').CodepointRange[]} */
export const CJK_RADICALS_RANGES = [
KANGXI_RADICALS_RANGE,
CJK_RADICALS_SUPPLEMENT_RANGE,
CJK_STROKES_RANGE,
];

/**
 * Rewrites radical characters in `text` to their NFKD-normalized form
 * (for example U+2F00 '⼀' becomes U+4E00 '一').
 *
 * The string is walked one UTF-16 code unit at a time. A unit is normalized only
 * when its code point lies in one of `CJK_RADICALS_RANGES`; every other unit is
 * copied through untouched.
 * @param {string} text
 * @returns {string}
 */
export function normalizeRadicals(text) {
    const normalizedUnits = text.split('').map((unit) => {
        const unitCodePoint = unit.codePointAt(0);
        // A falsy code point (undefined, or U+0000) is never a radical: keep the unit verbatim.
        if (!unitCodePoint) {
            return unit;
        }
        if (!isCodePointInRanges(unitCodePoint, CJK_RADICALS_RANGES)) {
            return unit;
        }
        return unit.normalize('NFKD');
    });
    return normalizedUnits.join('');
}

/**
 * Boolean-toggled text processor that rewrites radical characters via
 * `normalizeRadicals` when enabled and returns the input unchanged when disabled.
 * @type {import('language').TextProcessor<boolean>}
 */
export const normalizeRadicalCharacters = {
    name: 'Normalize radical characters',
    description: '⼀ → 一 (U+2F00 → U+4E00)',
    options: basicTextProcessorOptions,
    process: (str, setting) => {
        // Setting off: pass the text through as-is.
        if (!setting) {
            return str;
        }
        return normalizeRadicals(str);
    },
};
8 changes: 8 additions & 0 deletions ext/js/language/language-descriptors.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/

import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js';
import {normalizeRadicalCharacters} from './CJK-util.js';
import {eszettPreprocessor} from './de/german-text-preprocessors.js';
import {germanTransforms} from './de/german-transforms.js';
import {englishTransforms} from './en/english-transforms.js';
Expand Down Expand Up @@ -212,6 +213,7 @@ const languageDescriptors = [
convertHalfWidthCharacters,
alphabeticToHiragana,
normalizeCombiningCharacters,
normalizeRadicalCharacters,
alphanumericWidthVariants,
convertHiraganaToKatakana,
collapseEmphaticSequences,
Expand Down Expand Up @@ -374,6 +376,9 @@ const languageDescriptors = [
iso639_3: 'yue',
name: 'Cantonese',
exampleText: '讀',
textPreprocessors: {
normalizeRadicalCharacters,
},
},
{
iso: 'zh',
Expand All @@ -382,6 +387,9 @@ const languageDescriptors = [
exampleText: '读',
isTextLookupWorthy: isStringPartiallyChinese,
readingNormalizer: normalizePinyin,
textPreprocessors: {
normalizeRadicalCharacters,
},
},
];

Expand Down
13 changes: 11 additions & 2 deletions types/ext/language-descriptors.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ type AllTextProcessors = {
convertHalfWidthCharacters: TextProcessor<boolean>;
alphabeticToHiragana: TextProcessor<boolean>;
normalizeCombiningCharacters: TextProcessor<boolean>;
normalizeRadicalCharacters: TextProcessor<boolean>;
alphanumericWidthVariants: BidirectionalConversionPreprocessor;
convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
Expand Down Expand Up @@ -200,6 +201,14 @@ type AllTextProcessors = {
normalizeDiacritics: TextProcessor<'old' | 'new' | 'off'>;
};
};
yue: Record<string, never>;
zh: Record<string, never>;
yue: {
pre: {
normalizeRadicalCharacters: TextProcessor<boolean>;
};
};
zh: {
pre: {
normalizeRadicalCharacters: TextProcessor<boolean>;
};
};
};

0 comments on commit 92bac4e

Please sign in to comment.