Skip to content

Commit

Permalink
Simplify diacritic removal; modify Latin & Greek preprocessors (#724)
Browse files Browse the repository at this point in the history
* Simplified diacritic removal and added preprocessors to LA and GRC

* linted

* Clarified the name of removeAlphabeticDiacritics

* Add comment to removeAlphabeticDiacritics

Signed-off-by: Darius Jahandarie <[email protected]>

* Change to NFD

Signed-off-by: Matttttt <[email protected]>

* Remove trailing spaces in comment

Signed-off-by: Darius Jahandarie <[email protected]>

* Remove Latin preprocessors from .eslintrc.json

Signed-off-by: Matttttt <[email protected]>

* fix tests

---------

Signed-off-by: Darius Jahandarie <[email protected]>
Signed-off-by: Matttttt <[email protected]>
Co-authored-by: martholomew <[email protected]>
Co-authored-by: Darius Jahandarie <[email protected]>
Co-authored-by: Stefan Vukovic <[email protected]>
  • Loading branch information
4 people authored Apr 8, 2024
1 parent 2c5af21 commit 0663774
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 63 deletions.
1 change: 0 additions & 1 deletion .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,6 @@
"ext/js/language/ja/japanese-transforms.js",
"ext/js/language/ja/japanese-wanakana.js",
"ext/js/language/ja/japanese.js",
"ext/js/language/la/latin-text-preprocessors.js",
"ext/js/language/language-descriptors.js",
"ext/js/language/language-transformer.js",
"ext/js/language/language-transforms.js",
Expand Down
56 changes: 0 additions & 56 deletions ext/js/language/la/latin-text-preprocessors.js

This file was deleted.

10 changes: 6 additions & 4 deletions ext/js/language/language-descriptors.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,9 @@ import {englishTransforms} from './en/english-transforms.js';
import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';
import {japaneseTransforms} from './ja/japanese-transforms.js';
import {isStringPartiallyJapanese} from './ja/japanese.js';
import {removeLatinDiacritics} from './la/latin-text-preprocessors.js';
import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
import {albanianTransforms} from './sq/albanian-transforms.js';
import {capitalizeFirstLetter, decapitalize} from './text-preprocessors.js';
import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-preprocessors.js';

const capitalizationPreprocessors = {
decapitalize,
Expand Down Expand Up @@ -87,7 +86,10 @@ const languageDescriptors = [
iso: 'grc',
name: 'Ancient Greek',
exampleText: 'γράφω',
textPreprocessors: capitalizationPreprocessors
textPreprocessors: {
...capitalizationPreprocessors,
removeAlphabeticDiacritics
}
},
{
iso: 'hu',
Expand All @@ -113,7 +115,7 @@ const languageDescriptors = [
exampleText: 'legere',
textPreprocessors: {
...capitalizationPreprocessors,
removeLatinDiacritics
removeAlphabeticDiacritics
}
},
{
Expand Down
14 changes: 14 additions & 0 deletions ext/js/language/text-preprocessors.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,17 @@ export const capitalizeFirstLetter = {
options: basicTextPreprocessorOptions,
process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str)
};

/**
 * Text preprocessor that strips combining diacritical marks from alphabetic text.
 *
 * When the setting is enabled, the input is decomposed with Unicode NFD
 * normalization and every code point in the range U+0300-U+036F is removed.
 * When the setting is disabled, the input is returned unchanged.
 *
 * WARNING: Do NOT use this with languages written in Han characters. NFD also
 * rewrites CJK compatibility ideographs, which is undesirable there:
 * - '\u9038'.normalize('NFD') => '\u9038' (unchanged)
 * - '\ufa67'.normalize('NFD') => '\u9038' (compatibility ideograph replaced)
 * @type {import('language').TextPreprocessor<boolean>}
 */
export const removeAlphabeticDiacritics = {
    name: 'Remove Alphabetic Diacritics',
    description: 'ἄήé -> αηe',
    options: basicTextPreprocessorOptions,
    process: (str, setting) => {
        // Preprocessor disabled: pass the text through untouched.
        if (!setting) { return str; }
        // Split base letters from their combining marks, then drop the marks.
        const decomposed = str.normalize('NFD');
        return decomposed.replace(/[\u0300-\u036f]/g, '');
    }
};
6 changes: 4 additions & 2 deletions types/ext/language-descriptors.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,14 @@ type AllTextPreprocessors = {
removeArabicScriptDiacritics: TextPreprocessor<boolean>;
};
fr: CapitalizationPreprocessors;
grc: CapitalizationPreprocessors;
grc: CapitalizationPreprocessors & {
removeAlphabeticDiacritics: TextPreprocessor<boolean>;
};
hu: CapitalizationPreprocessors;
id: CapitalizationPreprocessors;
it: CapitalizationPreprocessors;
la: CapitalizationPreprocessors & {
removeLatinDiacritics: TextPreprocessor<boolean>;
removeAlphabeticDiacritics: TextPreprocessor<boolean>;
};
ja: {
convertHalfWidthCharacters: TextPreprocessor<boolean>;
Expand Down

0 comments on commit 0663774

Please sign in to comment.