Skip to content

Commit

Permalink
Add isTextLookupWorthy function for Chinese
Browse files Browse the repository at this point in the history
* add isChinese check and move shared CJK helpers

* fix lint

* fix lint

* fixes

---------

Co-authored-by: Darius Jahandarie <[email protected]>
Co-authored-by: Stefan Vukovic <[email protected]>

<rikaitan.link>YjYzNDFmMzEyZDgzMzJjY2ZmMGQ5MjhkOTM2ZTkyOTBkYTBlOTU4NAo=</rikaitan.link>
  • Loading branch information
Casheeew committed Jun 2, 2024
1 parent 81861fd commit 2060ba2
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 66 deletions.
3 changes: 3 additions & 0 deletions .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,7 @@
"ext/js/display/pronunciation-generator.js",
"ext/js/display/structured-content-generator.js",
"ext/js/dom/css-style-applier.js",
"ext/js/language/CJK-util.js",
"ext/js/language/ja/japanese.js",
"ext/js/language/text-utilities.js",
"ext/js/templates/anki-template-renderer-content-manager.js",
Expand Down Expand Up @@ -640,6 +641,7 @@
"ext/js/general/object-property-accessor.js",
"ext/js/general/regex-util.js",
"ext/js/language/ar/arabic-text-preprocessors.js",
"ext/js/language/CJK-util.js",
"ext/js/language/de/german-text-preprocessors.js",
"ext/js/language/de/german-transforms.js",
"ext/js/language/en/english-transforms.js",
Expand All @@ -661,6 +663,7 @@
"ext/js/language/sq/albanian-transforms.js",
"ext/js/language/text-processors.js",
"ext/js/language/translator.js",
"ext/js/language/zh/chinese.js",
"ext/js/media/audio-downloader.js",
"ext/js/media/media-util.js",
"ext/js/templates/template-patcher.js"
Expand Down
1 change: 1 addition & 0 deletions dev/jsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"japanese-util": ["../types/ext/japanese-util"],
"language": ["../types/ext/language"],
"language-descriptors": ["../types/ext/language-descriptors"],
"CJK-util": ["../types/ext/CJK-util"],
"ext/json-schema": ["../types/ext/json-schema"],
"language-transformer": ["../types/ext/language-transformer"],
"language-transformer-internal": ["../types/ext/language-transformer-internal"],
Expand Down
96 changes: 96 additions & 0 deletions ext/js/language/CJK-util.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Copyright (C) 2024 Ajatt-Tools and contributors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

/**
 * Inclusive code point ranges of the CJK ideograph blocks.
 *
 * Entries are listed as: the base unified block, extensions A through I, then the
 * two compatibility blocks. Extension I is listed alphabetically after H even though
 * its code points (0x2ebf0-0x2ee5f) sit numerically between extensions F and G.
 * @type {import('CJK-util').CodepointRange[]}
 */
export const CJK_IDEOGRAPH_RANGES = [
    [0x4e00, 0x9fff], // CJK Unified Ideographs
    [0x3400, 0x4dbf], // CJK Unified Ideographs Extension A
    [0x20000, 0x2a6df], // CJK Unified Ideographs Extension B
    [0x2a700, 0x2b73f], // CJK Unified Ideographs Extension C
    [0x2b740, 0x2b81f], // CJK Unified Ideographs Extension D
    [0x2b820, 0x2ceaf], // CJK Unified Ideographs Extension E
    [0x2ceb0, 0x2ebef], // CJK Unified Ideographs Extension F
    [0x30000, 0x3134f], // CJK Unified Ideographs Extension G
    [0x31350, 0x323af], // CJK Unified Ideographs Extension H
    [0x2ebf0, 0x2ee5f], // CJK Unified Ideographs Extension I
    [0xf900, 0xfaff], // CJK Compatibility Ideographs
    [0x2f800, 0x2fa1f], // CJK Compatibility Ideographs Supplement
];

/** @type {import('CJK-util').CodepointRange[]} */
export const FULLWIDTH_CHARACTER_RANGES = [
[0xff10, 0xff19], // Fullwidth numbers
[0xff21, 0xff3a], // Fullwidth upper case Latin letters
[0xff41, 0xff5a], // Fullwidth lower case Latin letters

[0xff01, 0xff0f], // Fullwidth punctuation 1
[0xff1a, 0xff1f], // Fullwidth punctuation 2
[0xff3b, 0xff3f], // Fullwidth punctuation 3
[0xff5b, 0xff60], // Fullwidth punctuation 4
[0xffe0, 0xffee], // Currency markers
];

/** @type {import('CJK-util').CodepointRange} */
export const CJK_PUNCTUATION_RANGE = [0x3000, 0x303f];

/**
 * Tests whether a code point lies inside a single range.
 * Both ends of the range are inclusive.
 * @param {number} codePoint The code point to test.
 * @param {import('CJK-util').CodepointRange} range The `[min, max]` pair to test against.
 * @returns {boolean} `true` when `min <= codePoint <= max`, otherwise `false`.
 */
export function isCodePointInRange(codePoint, range) {
    const [lowerBound, upperBound] = range;
    const isAtOrAboveLower = codePoint >= lowerBound;
    const isAtOrBelowUpper = codePoint <= upperBound;
    return isAtOrAboveLower && isAtOrBelowUpper;
}

/**
 * Tests whether a code point lies inside at least one of the given ranges.
 * Each range is inclusive on both ends; an empty list never matches.
 * @param {number} codePoint The code point to test.
 * @param {import('CJK-util').CodepointRange[]} ranges The `[min, max]` pairs to test against.
 * @returns {boolean} `true` as soon as one range contains the code point, otherwise `false`.
 */
export function isCodePointInRanges(codePoint, ranges) {
    return ranges.some(([lowerBound, upperBound]) => (
        codePoint >= lowerBound && codePoint <= upperBound
    ));
}
69 changes: 8 additions & 61 deletions ext/js/language/ja/japanese.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,60 +15,31 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

import {CJK_IDEOGRAPH_RANGES, isCodePointInRange, isCodePointInRanges} from '../CJK-util.js';


const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063;
const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3;
const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5;
const KATAKANA_SMALL_KE_CODE_POINT = 0x30f6;
const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc;

/** @type {import('japanese-util').CodepointRange} */
/** @type {import('CJK-util').CodepointRange} */
const HIRAGANA_RANGE = [0x3040, 0x309f];
/** @type {import('japanese-util').CodepointRange} */
/** @type {import('CJK-util').CodepointRange} */
const KATAKANA_RANGE = [0x30a0, 0x30ff];

/** @type {import('japanese-util').CodepointRange} */
/** @type {import('CJK-util').CodepointRange} */
const HIRAGANA_CONVERSION_RANGE = [0x3041, 0x3096];
/** @type {import('japanese-util').CodepointRange} */
/** @type {import('CJK-util').CodepointRange} */
const KATAKANA_CONVERSION_RANGE = [0x30a1, 0x30f6];

/** @type {import('japanese-util').CodepointRange[]} */
/** @type {import('CJK-util').CodepointRange[]} */
const KANA_RANGES = [HIRAGANA_RANGE, KATAKANA_RANGE];

/** @type {import('japanese-util').CodepointRange} */
const CJK_UNIFIED_IDEOGRAPHS_RANGE = [0x4e00, 0x9fff];
/** @type {import('japanese-util').CodepointRange} */
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE = [0x3400, 0x4dbf];
/** @type {import('japanese-util').CodepointRange} */
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE = [0x20000, 0x2a6df];
/** @type {import('japanese-util').CodepointRange} */
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_RANGE = [0x2a700, 0x2b73f];
/** @type {import('japanese-util').CodepointRange} */
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE = [0x2b740, 0x2b81f];
/** @type {import('japanese-util').CodepointRange} */
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE = [0x2b820, 0x2ceaf];
/** @type {import('japanese-util').CodepointRange} */
const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE = [0x2ceb0, 0x2ebef];
/** @type {import('japanese-util').CodepointRange} */
const CJK_COMPATIBILITY_IDEOGRAPHS_RANGE = [0xf900, 0xfaff];
/** @type {import('japanese-util').CodepointRange} */
const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE = [0x2f800, 0x2fa1f];
/** @type {import('japanese-util').CodepointRange[]} */
const CJK_IDEOGRAPH_RANGES = [
CJK_UNIFIED_IDEOGRAPHS_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE,
CJK_COMPATIBILITY_IDEOGRAPHS_RANGE,
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE,
];

/**
* Japanese character ranges, roughly ordered in order of expected frequency.
* @type {import('japanese-util').CodepointRange[]}
* @type {import('CJK-util').CodepointRange[]}
*/
const JAPANESE_RANGES = [
HIRAGANA_RANGE,
Expand Down Expand Up @@ -184,30 +155,6 @@ for (let i = 0, ii = kana.length; i < ii; i += 3) {
}
}


/**
* @param {number} codePoint
* @param {import('japanese-util').CodepointRange} range
* @returns {boolean}
*/
function isCodePointInRange(codePoint, [min, max]) {
return (codePoint >= min && codePoint <= max);
}

/**
* @param {number} codePoint
* @param {import('japanese-util').CodepointRange[]} ranges
* @returns {boolean}
*/
function isCodePointInRanges(codePoint, ranges) {
for (const [min, max] of ranges) {
if (codePoint >= min && codePoint <= max) {
return true;
}
}
return false;
}

/**
* @param {string} previousCharacter
* @returns {?string}
Expand Down
2 changes: 2 additions & 0 deletions ext/js/language/language-descriptors.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js
import {oldIrishTransforms} from './sga/old-irish-transforms.js';
import {albanianTransforms} from './sq/albanian-transforms.js';
import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
import {isStringPartiallyChinese} from './zh/chinese.js';

const capitalizationPreprocessors = {
decapitalize,
Expand Down Expand Up @@ -264,6 +265,7 @@ const languageDescriptors = [
iso: 'zh',
name: 'Chinese',
exampleText: '读',
isTextLookupWorthy: isStringPartiallyChinese,
},
];

Expand Down
62 changes: 62 additions & 0 deletions ext/js/language/zh/chinese.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Copyright (C) 2024 Ajatt-Tools and contributors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

import {CJK_IDEOGRAPH_RANGES, CJK_PUNCTUATION_RANGE, FULLWIDTH_CHARACTER_RANGES, isCodePointInRanges} from '../CJK-util.js';

/** @type {import('CJK-util').CodepointRange} */
const BOPOMOFO_RANGE = [0x3100, 0x312f];
/** @type {import('CJK-util').CodepointRange} */
const BOPOMOFO_EXTENDED_RANGE = [0x31a0, 0x31bf];
/** @type {import('CJK-util').CodepointRange} */
const IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION_RANGE = [0x16fe0, 0x16fff];
/** @type {import('CJK-util').CodepointRange} */
const SMALL_FORM_RANGE = [0xfe50, 0xfe6f];
/** @type {import('CJK-util').CodepointRange} */
const VERTICAL_FORM_RANGE = [0xfe10, 0xfe1f];


/**
* Chinese character ranges, roughly ordered in order of expected frequency.
* @type {import('CJK-util').CodepointRange[]}
*/
const CHINESE_RANGES = [
...CJK_IDEOGRAPH_RANGES,
CJK_PUNCTUATION_RANGE,

...FULLWIDTH_CHARACTER_RANGES,

BOPOMOFO_RANGE,
BOPOMOFO_EXTENDED_RANGE,
IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION_RANGE,
SMALL_FORM_RANGE,
VERTICAL_FORM_RANGE,
];


/**
 * Reports whether a string contains at least one character from CHINESE_RANGES.
 * A single matching character is enough; an empty string never matches.
 * @param {string} str The text to inspect.
 * @returns {boolean} `true` if any code point of `str` is within CHINESE_RANGES.
 */
export function isStringPartiallyChinese(str) {
    if (str.length > 0) {
        // for...of steps through the string one code point at a time, so a surrogate
        // pair is checked once as a single value above 0xffff.
        for (const character of str) {
            const codePoint = /** @type {number} */ (character.codePointAt(0));
            if (isCodePointInRanges(codePoint, CHINESE_RANGES)) {
                return true;
            }
        }
    }
    return false;
}
21 changes: 21 additions & 0 deletions types/ext/CJK-util.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*
* Copyright (C) 2023-2024 Ajatt-Tools and contributors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

/**
 * A range of code points stored as a two-element tuple.
 * Both the lower and the upper bound are inclusive.
 */
export type CodepointRange = [minInclusive: number, maxInclusive: number];
5 changes: 0 additions & 5 deletions types/ext/japanese-util.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

export type CodepointRange = [
minInclusive: number,
maxInclusive: number,
];

export type FuriganaGroup = {
isKana: boolean;
text: string;
Expand Down

0 comments on commit 2060ba2

Please sign in to comment.