From 26e68dde5951ca7c807726f3db8a8deb4c4432e2 Mon Sep 17 00:00:00 2001 From: D-Sketon <2055272094@qq.com> Date: Sun, 8 Dec 2024 14:46:59 +0800 Subject: [PATCH] feat(compat): implement deburr (#876) --- benchmarks/performance/deburr.bench.ts | 10 + src/_internal/burredLetters.ts | 194 ++++++++++ src/_internal/comboMarks.ts | 119 ++++++ src/_internal/deburredLetters.ts | 195 ++++++++++ src/compat/index.ts | 1 + src/compat/string/deburr.spec.ts | 40 ++ src/compat/string/deburr.ts | 25 ++ src/string/deburr.spec.ts | 514 +------------------------ 8 files changed, 587 insertions(+), 511 deletions(-) create mode 100644 src/_internal/burredLetters.ts create mode 100644 src/_internal/comboMarks.ts create mode 100644 src/_internal/deburredLetters.ts create mode 100644 src/compat/string/deburr.spec.ts create mode 100644 src/compat/string/deburr.ts diff --git a/benchmarks/performance/deburr.bench.ts b/benchmarks/performance/deburr.bench.ts index 29977d6df..f87c8f758 100644 --- a/benchmarks/performance/deburr.bench.ts +++ b/benchmarks/performance/deburr.bench.ts @@ -1,8 +1,10 @@ import { bench, describe } from 'vitest'; import { deburr as deburrToolkit_ } from 'es-toolkit'; +import { deburr as deburrCompatToolkit_ } from 'es-toolkit/compat'; import { deburr as deburrLodash_ } from 'lodash'; const deburrToolkit = deburrToolkit_; +const deburrCompatToolkit = deburrCompatToolkit_; const deburrLodash = deburrLodash_; const longWord = 'déjà vu'.repeat(1000); @@ -11,6 +13,10 @@ describe('deburr', () => { deburrLodash('déjà vu'); }); + bench('es-toolkit/compat/deburr', () => { + deburrCompatToolkit('déjà vu'); + }); + bench('es-toolkit/deburr', () => { deburrToolkit('déjà vu'); }); @@ -19,6 +25,10 @@ describe('deburr', () => { deburrLodash(longWord); }); + bench('es-toolkit/compat/deburr - long words', () => { + deburrCompatToolkit(longWord); + }); + bench('es-toolkit/deburr - long words', () => { deburrToolkit(longWord); }); diff --git a/src/_internal/burredLetters.ts 
b/src/_internal/burredLetters.ts new file mode 100644 index 000000000..8a86bfbc8 --- /dev/null +++ b/src/_internal/burredLetters.ts @@ -0,0 +1,194 @@ +export const burredLetters = [ + // Latin-1 Supplement letters. + '\xc0', + '\xc1', + '\xc2', + '\xc3', + '\xc4', + '\xc5', + '\xc6', + '\xc7', + '\xc8', + '\xc9', + '\xca', + '\xcb', + '\xcc', + '\xcd', + '\xce', + '\xcf', + '\xd0', + '\xd1', + '\xd2', + '\xd3', + '\xd4', + '\xd5', + '\xd6', + '\xd8', + '\xd9', + '\xda', + '\xdb', + '\xdc', + '\xdd', + '\xde', + '\xdf', + '\xe0', + '\xe1', + '\xe2', + '\xe3', + '\xe4', + '\xe5', + '\xe6', + '\xe7', + '\xe8', + '\xe9', + '\xea', + '\xeb', + '\xec', + '\xed', + '\xee', + '\xef', + '\xf0', + '\xf1', + '\xf2', + '\xf3', + '\xf4', + '\xf5', + '\xf6', + '\xf8', + '\xf9', + '\xfa', + '\xfb', + '\xfc', + '\xfd', + '\xfe', + '\xff', + // Latin Extended-A letters. + '\u0100', + '\u0101', + '\u0102', + '\u0103', + '\u0104', + '\u0105', + '\u0106', + '\u0107', + '\u0108', + '\u0109', + '\u010a', + '\u010b', + '\u010c', + '\u010d', + '\u010e', + '\u010f', + '\u0110', + '\u0111', + '\u0112', + '\u0113', + '\u0114', + '\u0115', + '\u0116', + '\u0117', + '\u0118', + '\u0119', + '\u011a', + '\u011b', + '\u011c', + '\u011d', + '\u011e', + '\u011f', + '\u0120', + '\u0121', + '\u0122', + '\u0123', + '\u0124', + '\u0125', + '\u0126', + '\u0127', + '\u0128', + '\u0129', + '\u012a', + '\u012b', + '\u012c', + '\u012d', + '\u012e', + '\u012f', + '\u0130', + '\u0131', + '\u0132', + '\u0133', + '\u0134', + '\u0135', + '\u0136', + '\u0137', + '\u0138', + '\u0139', + '\u013a', + '\u013b', + '\u013c', + '\u013d', + '\u013e', + '\u013f', + '\u0140', + '\u0141', + '\u0142', + '\u0143', + '\u0144', + '\u0145', + '\u0146', + '\u0147', + '\u0148', + '\u0149', + '\u014a', + '\u014b', + '\u014c', + '\u014d', + '\u014e', + '\u014f', + '\u0150', + '\u0151', + '\u0152', + '\u0153', + '\u0154', + '\u0155', + '\u0156', + '\u0157', + '\u0158', + '\u0159', + '\u015a', + '\u015b', + '\u015c', + '\u015d', + 
'\u015e', + '\u015f', + '\u0160', + '\u0161', + '\u0162', + '\u0163', + '\u0164', + '\u0165', + '\u0166', + '\u0167', + '\u0168', + '\u0169', + '\u016a', + '\u016b', + '\u016c', + '\u016d', + '\u016e', + '\u016f', + '\u0170', + '\u0171', + '\u0172', + '\u0173', + '\u0174', + '\u0175', + '\u0176', + '\u0177', + '\u0178', + '\u0179', + '\u017a', + '\u017b', + '\u017c', + '\u017d', + '\u017e', + '\u017f', +]; diff --git a/src/_internal/comboMarks.ts b/src/_internal/comboMarks.ts new file mode 100644 index 000000000..fa9c8806a --- /dev/null +++ b/src/_internal/comboMarks.ts @@ -0,0 +1,119 @@ +/** List of combining diacritical marks. */ +export const comboMarks = [ + '\u0300', + '\u0301', + '\u0302', + '\u0303', + '\u0304', + '\u0305', + '\u0306', + '\u0307', + '\u0308', + '\u0309', + '\u030a', + '\u030b', + '\u030c', + '\u030d', + '\u030e', + '\u030f', + '\u0310', + '\u0311', + '\u0312', + '\u0313', + '\u0314', + '\u0315', + '\u0316', + '\u0317', + '\u0318', + '\u0319', + '\u031a', + '\u031b', + '\u031c', + '\u031d', + '\u031e', + '\u031f', + '\u0320', + '\u0321', + '\u0322', + '\u0323', + '\u0324', + '\u0325', + '\u0326', + '\u0327', + '\u0328', + '\u0329', + '\u032a', + '\u032b', + '\u032c', + '\u032d', + '\u032e', + '\u032f', + '\u0330', + '\u0331', + '\u0332', + '\u0333', + '\u0334', + '\u0335', + '\u0336', + '\u0337', + '\u0338', + '\u0339', + '\u033a', + '\u033b', + '\u033c', + '\u033d', + '\u033e', + '\u033f', + '\u0340', + '\u0341', + '\u0342', + '\u0343', + '\u0344', + '\u0345', + '\u0346', + '\u0347', + '\u0348', + '\u0349', + '\u034a', + '\u034b', + '\u034c', + '\u034d', + '\u034e', + '\u034f', + '\u0350', + '\u0351', + '\u0352', + '\u0353', + '\u0354', + '\u0355', + '\u0356', + '\u0357', + '\u0358', + '\u0359', + '\u035a', + '\u035b', + '\u035c', + '\u035d', + '\u035e', + '\u035f', + '\u0360', + '\u0361', + '\u0362', + '\u0363', + '\u0364', + '\u0365', + '\u0366', + '\u0367', + '\u0368', + '\u0369', + '\u036a', + '\u036b', + '\u036c', + '\u036d', + 
'\u036e', + '\u036f', + '\ufe20', + '\ufe21', + '\ufe22', + '\ufe23', +]; diff --git a/src/_internal/deburredLetters.ts b/src/_internal/deburredLetters.ts new file mode 100644 index 000000000..63562bd9f --- /dev/null +++ b/src/_internal/deburredLetters.ts @@ -0,0 +1,195 @@ +/** List of converted Latin Unicode letters. */ +export const deburredLetters = [ + // Converted Latin-1 Supplement letters. + 'A', + 'A', + 'A', + 'A', + 'A', + 'A', + 'Ae', + 'C', + 'E', + 'E', + 'E', + 'E', + 'I', + 'I', + 'I', + 'I', + 'D', + 'N', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'U', + 'U', + 'U', + 'U', + 'Y', + 'Th', + 'ss', + 'a', + 'a', + 'a', + 'a', + 'a', + 'a', + 'ae', + 'c', + 'e', + 'e', + 'e', + 'e', + 'i', + 'i', + 'i', + 'i', + 'd', + 'n', + 'o', + 'o', + 'o', + 'o', + 'o', + 'o', + 'u', + 'u', + 'u', + 'u', + 'y', + 'th', + 'y', + // Converted Latin Extended-A letters. + 'A', + 'a', + 'A', + 'a', + 'A', + 'a', + 'C', + 'c', + 'C', + 'c', + 'C', + 'c', + 'C', + 'c', + 'D', + 'd', + 'D', + 'd', + 'E', + 'e', + 'E', + 'e', + 'E', + 'e', + 'E', + 'e', + 'E', + 'e', + 'G', + 'g', + 'G', + 'g', + 'G', + 'g', + 'G', + 'g', + 'H', + 'h', + 'H', + 'h', + 'I', + 'i', + 'I', + 'i', + 'I', + 'i', + 'I', + 'i', + 'I', + 'i', + 'IJ', + 'ij', + 'J', + 'j', + 'K', + 'k', + 'k', + 'L', + 'l', + 'L', + 'l', + 'L', + 'l', + 'L', + 'l', + 'L', + 'l', + 'N', + 'n', + 'N', + 'n', + 'N', + 'n', + "'n", + 'N', + 'n', + 'O', + 'o', + 'O', + 'o', + 'O', + 'o', + 'Oe', + 'oe', + 'R', + 'r', + 'R', + 'r', + 'R', + 'r', + 'S', + 's', + 'S', + 's', + 'S', + 's', + 'S', + 's', + 'T', + 't', + 'T', + 't', + 'T', + 't', + 'U', + 'u', + 'U', + 'u', + 'U', + 'u', + 'U', + 'u', + 'U', + 'u', + 'U', + 'u', + 'W', + 'w', + 'Y', + 'y', + 'Y', + 'Z', + 'z', + 'Z', + 'z', + 'Z', + 'z', + 's', +]; diff --git a/src/compat/index.ts b/src/compat/index.ts index 3a629b5fd..1eb2ef915 100644 --- a/src/compat/index.ts +++ b/src/compat/index.ts @@ -163,6 +163,7 @@ export { matches } from './predicate/matches.ts'; 
export { matchesProperty } from './predicate/matchesProperty.ts'; export { camelCase } from './string/camelCase.ts'; +export { deburr } from './string/deburr.ts'; export { endsWith } from './string/endsWith.ts'; export { escape } from './string/escape.ts'; export { kebabCase } from './string/kebabCase.ts'; diff --git a/src/compat/string/deburr.spec.ts b/src/compat/string/deburr.spec.ts new file mode 100644 index 000000000..32e4f393d --- /dev/null +++ b/src/compat/string/deburr.spec.ts @@ -0,0 +1,40 @@ +import { describe, expect, it } from 'vitest'; +import { deburr } from './deburr'; +import { burredLetters } from '../../_internal/burredLetters'; +import { comboMarks } from '../../_internal/comboMarks'; +import { deburredLetters } from '../../_internal/deburredLetters'; +import { map } from '../array/map'; +import { constant } from '../util/constant'; +import { stubString } from '../util/stubString'; + +describe('deburr', () => { + it('should convert Latin Unicode letters to basic Latin', () => { + const actual = map(burredLetters, deburr); + expect(actual).toEqual(deburredLetters); + }); + + it('should not deburr Latin mathematical operators', () => { + const operators = ['\xd7', '\xf7']; + const actual = map(operators, deburr); + + expect(actual).toEqual(operators); + }); + + it('should deburr combining diacritical marks', () => { + const expected = map(comboMarks, constant('ei')); + + const actual = map(comboMarks, chr => deburr(`e${chr}i`)); + + expect(actual).toEqual(expected); + }); + + it('should return an empty string for empty values', () => { + // eslint-disable-next-line no-sparse-arrays + const values = [, null, undefined, '']; + const expected = map(values, stubString); + + const actual = map(values, (value, index) => (index ? 
deburr(value as any) : deburr())); + + expect(actual).toEqual(expected); + }); +}); diff --git a/src/compat/string/deburr.ts b/src/compat/string/deburr.ts new file mode 100644 index 000000000..ea781953e --- /dev/null +++ b/src/compat/string/deburr.ts @@ -0,0 +1,25 @@ +import { deburr as deburrToolkit } from '../../string/deburr.ts'; +import { toString } from '../util/toString.ts'; + +/** + * Converts a string by replacing special characters and diacritical marks with their ASCII equivalents. + * For example, "Crème brûlée" becomes "Creme brulee". + * + * @param {string} [str] - The input string to be deburred; it is passed through `toString` first, so an omitted, `null` or `undefined` value yields an empty string. + * @returns {string} - The deburred string with special characters replaced by their ASCII equivalents. + * + * @example + * // Basic usage: + * deburr('Æthelred') // returns 'Aethelred' + * + * @example + * // Handling diacritical marks: + * deburr('München') // returns 'Munchen' + * + * @example + * // Special characters: + * deburr('Crème brûlée') // returns 'Creme brulee' + */ +export function deburr(str?: string): string { + return deburrToolkit(toString(str)); +} diff --git a/src/string/deburr.spec.ts b/src/string/deburr.spec.ts index fd2d86ae2..18104370e 100644 --- a/src/string/deburr.spec.ts +++ b/src/string/deburr.spec.ts @@ -1,516 +1,8 @@ import { describe, expect, it } from 'vitest'; import { deburr } from './deburr'; - -const burredLetters = [ - // Latin-1 Supplement letters. 
- '\xc0', - '\xc1', - '\xc2', - '\xc3', - '\xc4', - '\xc5', - '\xc6', - '\xc7', - '\xc8', - '\xc9', - '\xca', - '\xcb', - '\xcc', - '\xcd', - '\xce', - '\xcf', - '\xd0', - '\xd1', - '\xd2', - '\xd3', - '\xd4', - '\xd5', - '\xd6', - '\xd8', - '\xd9', - '\xda', - '\xdb', - '\xdc', - '\xdd', - '\xde', - '\xdf', - '\xe0', - '\xe1', - '\xe2', - '\xe3', - '\xe4', - '\xe5', - '\xe6', - '\xe7', - '\xe8', - '\xe9', - '\xea', - '\xeb', - '\xec', - '\xed', - '\xee', - '\xef', - '\xf0', - '\xf1', - '\xf2', - '\xf3', - '\xf4', - '\xf5', - '\xf6', - '\xf8', - '\xf9', - '\xfa', - '\xfb', - '\xfc', - '\xfd', - '\xfe', - '\xff', - // Latin Extended-A letters. - '\u0100', - '\u0101', - '\u0102', - '\u0103', - '\u0104', - '\u0105', - '\u0106', - '\u0107', - '\u0108', - '\u0109', - '\u010a', - '\u010b', - '\u010c', - '\u010d', - '\u010e', - '\u010f', - '\u0110', - '\u0111', - '\u0112', - '\u0113', - '\u0114', - '\u0115', - '\u0116', - '\u0117', - '\u0118', - '\u0119', - '\u011a', - '\u011b', - '\u011c', - '\u011d', - '\u011e', - '\u011f', - '\u0120', - '\u0121', - '\u0122', - '\u0123', - '\u0124', - '\u0125', - '\u0126', - '\u0127', - '\u0128', - '\u0129', - '\u012a', - '\u012b', - '\u012c', - '\u012d', - '\u012e', - '\u012f', - '\u0130', - '\u0131', - '\u0132', - '\u0133', - '\u0134', - '\u0135', - '\u0136', - '\u0137', - '\u0138', - '\u0139', - '\u013a', - '\u013b', - '\u013c', - '\u013d', - '\u013e', - '\u013f', - '\u0140', - '\u0141', - '\u0142', - '\u0143', - '\u0144', - '\u0145', - '\u0146', - '\u0147', - '\u0148', - '\u0149', - '\u014a', - '\u014b', - '\u014c', - '\u014d', - '\u014e', - '\u014f', - '\u0150', - '\u0151', - '\u0152', - '\u0153', - '\u0154', - '\u0155', - '\u0156', - '\u0157', - '\u0158', - '\u0159', - '\u015a', - '\u015b', - '\u015c', - '\u015d', - '\u015e', - '\u015f', - '\u0160', - '\u0161', - '\u0162', - '\u0163', - '\u0164', - '\u0165', - '\u0166', - '\u0167', - '\u0168', - '\u0169', - '\u016a', - '\u016b', - '\u016c', - '\u016d', - '\u016e', - '\u016f', - 
'\u0170', - '\u0171', - '\u0172', - '\u0173', - '\u0174', - '\u0175', - '\u0176', - '\u0177', - '\u0178', - '\u0179', - '\u017a', - '\u017b', - '\u017c', - '\u017d', - '\u017e', - '\u017f', -]; - -/** List of combining diacritical marks. */ -const comboMarks = [ - '\u0300', - '\u0301', - '\u0302', - '\u0303', - '\u0304', - '\u0305', - '\u0306', - '\u0307', - '\u0308', - '\u0309', - '\u030a', - '\u030b', - '\u030c', - '\u030d', - '\u030e', - '\u030f', - '\u0310', - '\u0311', - '\u0312', - '\u0313', - '\u0314', - '\u0315', - '\u0316', - '\u0317', - '\u0318', - '\u0319', - '\u031a', - '\u031b', - '\u031c', - '\u031d', - '\u031e', - '\u031f', - '\u0320', - '\u0321', - '\u0322', - '\u0323', - '\u0324', - '\u0325', - '\u0326', - '\u0327', - '\u0328', - '\u0329', - '\u032a', - '\u032b', - '\u032c', - '\u032d', - '\u032e', - '\u032f', - '\u0330', - '\u0331', - '\u0332', - '\u0333', - '\u0334', - '\u0335', - '\u0336', - '\u0337', - '\u0338', - '\u0339', - '\u033a', - '\u033b', - '\u033c', - '\u033d', - '\u033e', - '\u033f', - '\u0340', - '\u0341', - '\u0342', - '\u0343', - '\u0344', - '\u0345', - '\u0346', - '\u0347', - '\u0348', - '\u0349', - '\u034a', - '\u034b', - '\u034c', - '\u034d', - '\u034e', - '\u034f', - '\u0350', - '\u0351', - '\u0352', - '\u0353', - '\u0354', - '\u0355', - '\u0356', - '\u0357', - '\u0358', - '\u0359', - '\u035a', - '\u035b', - '\u035c', - '\u035d', - '\u035e', - '\u035f', - '\u0360', - '\u0361', - '\u0362', - '\u0363', - '\u0364', - '\u0365', - '\u0366', - '\u0367', - '\u0368', - '\u0369', - '\u036a', - '\u036b', - '\u036c', - '\u036d', - '\u036e', - '\u036f', - '\ufe20', - '\ufe21', - '\ufe22', - '\ufe23', -]; - -/** List of converted Latin Unicode letters. */ -const deburredLetters = [ - // Converted Latin-1 Supplement letters. 
- 'A', - 'A', - 'A', - 'A', - 'A', - 'A', - 'Ae', - 'C', - 'E', - 'E', - 'E', - 'E', - 'I', - 'I', - 'I', - 'I', - 'D', - 'N', - 'O', - 'O', - 'O', - 'O', - 'O', - 'O', - 'U', - 'U', - 'U', - 'U', - 'Y', - 'Th', - 'ss', - 'a', - 'a', - 'a', - 'a', - 'a', - 'a', - 'ae', - 'c', - 'e', - 'e', - 'e', - 'e', - 'i', - 'i', - 'i', - 'i', - 'd', - 'n', - 'o', - 'o', - 'o', - 'o', - 'o', - 'o', - 'u', - 'u', - 'u', - 'u', - 'y', - 'th', - 'y', - // Converted Latin Extended-A letters. - 'A', - 'a', - 'A', - 'a', - 'A', - 'a', - 'C', - 'c', - 'C', - 'c', - 'C', - 'c', - 'C', - 'c', - 'D', - 'd', - 'D', - 'd', - 'E', - 'e', - 'E', - 'e', - 'E', - 'e', - 'E', - 'e', - 'E', - 'e', - 'G', - 'g', - 'G', - 'g', - 'G', - 'g', - 'G', - 'g', - 'H', - 'h', - 'H', - 'h', - 'I', - 'i', - 'I', - 'i', - 'I', - 'i', - 'I', - 'i', - 'I', - 'i', - 'IJ', - 'ij', - 'J', - 'j', - 'K', - 'k', - 'k', - 'L', - 'l', - 'L', - 'l', - 'L', - 'l', - 'L', - 'l', - 'L', - 'l', - 'N', - 'n', - 'N', - 'n', - 'N', - 'n', - "'n", - 'N', - 'n', - 'O', - 'o', - 'O', - 'o', - 'O', - 'o', - 'Oe', - 'oe', - 'R', - 'r', - 'R', - 'r', - 'R', - 'r', - 'S', - 's', - 'S', - 's', - 'S', - 's', - 'S', - 's', - 'T', - 't', - 'T', - 't', - 'T', - 't', - 'U', - 'u', - 'U', - 'u', - 'U', - 'u', - 'U', - 'u', - 'U', - 'u', - 'U', - 'u', - 'W', - 'w', - 'Y', - 'y', - 'Y', - 'Z', - 'z', - 'Z', - 'z', - 'Z', - 'z', - 's', -]; +import { burredLetters } from '../_internal/burredLetters'; +import { comboMarks } from '../_internal/comboMarks'; +import { deburredLetters } from '../_internal/deburredLetters'; describe('deburr', () => { it('should convert examples correctly', () => {