Skip to content

Commit

Permalink
Merge pull request #18 from MarvNC:remove-punctuation-readings
Browse files (browse the repository at this point in the history)
Strip Punctuation From Ruby Text
  • Loading branch information
MarvNC authored Jan 21, 2024
2 parents 6f589ff + 4635f94 commit 95d44f4
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 15 deletions.
14 changes: 7 additions & 7 deletions src/test/parseCantoneseReadings.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ const testCases = [
{ text: '我', reading: 'ngo5' },
{ text: '講', reading: 'gong2' },
{ text: '咩', reading: 'me1' },
{ text: '?', reading: '?' },
{ text: '?', reading: '' },
],
},
{
Expand All @@ -61,7 +61,7 @@ const testCases = [
{ text: '斷', reading: 'tyun5' },
{ text: '韌', reading: 'jan6' },
{ text: '帶', reading: 'daai2' },
{ text: '。', reading: '.' },
{ text: '。', reading: '' },
],
},
{
Expand Down Expand Up @@ -118,15 +118,15 @@ const testCases = [
{ text: '啲', reading: 'di1' },
{ text: '巴', reading: 'baa1' },
{ text: '士', reading: 'si2' },
{ text: ',', reading: ',' },
{ text: ',', reading: '' },
{ text: '一', reading: 'jat1' },
{ text: '定', reading: 'ding6' },
{ text: '經', reading: 'ging1' },
{ text: '地', reading: 'dei6' },
{ text: '鐵', reading: 'tit3' },
{ text: '站', reading: 'zaam6' },
{ text: '㗎', reading: 'gaa3' },
{ text: '。', reading: '.' },
{ text: '。', reading: '' },
],
},
{
Expand Down Expand Up @@ -161,14 +161,14 @@ const testCases = [
{ text: '經', reading: 'ging1' },
{ text: '之', reading: 'zi1' },
{ text: '路', reading: 'lou6' },
{ text: ',', reading: ',' },
{ text: ',', reading: '' },
{ text: '有', reading: 'jau5' },
{ text: '好', reading: 'hou2' },
{ text: '有', reading: 'jau5' },
{ text: '唔', reading: 'm4' },
{ text: '好', reading: 'hou2' },
{ text: '囉', reading: 'lo1' },
{ text: '。', reading: '.' },
{ text: '。', reading: '' },
],
},
{
Expand All @@ -190,7 +190,7 @@ const testCases = [
{ text: '白', reading: 'baak6' },
{ text: '兔', reading: 'tou3' },
{ text: '會', reading: 'wui6' },
{ text: '》', reading: '.' },
{ text: '》', reading: '' },
{ text: '。', reading: '' },
],
},
Expand Down
24 changes: 16 additions & 8 deletions src/util/textHandling/parseCantoneseReadings.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,31 +32,39 @@ function parseCantoneseReadings(rawText, readings) {
const isTextHanzi = isHanzi(text);
const isTextAlphanumeric = isJyuutping(text);
const isTextPunctuation = isPunctuation(text);
const isReadingJyuutping = isJyuutping(reading);
const isReadingJyutping = isJyuutping(reading);
const isReadingPunctuation = isPunctuation(reading);
// Ideal case
if (
!!text &&
!!reading &&
((isTextHanzi && isReadingJyuutping) ||
(isTextPunctuation && isReadingPunctuation) ||
((isTextHanzi && isReadingJyutping) ||
// Case where for example text is 'bu' and reading is 'bu4'
(isTextAlphanumeric && isReadingJyuutping) ||
// Where both are special characters
(!isTextAlphanumeric && !isTextHanzi && !isReadingJyuutping))
(isTextAlphanumeric && isReadingJyutping))
) {
resultArray.push({ text, reading });
textIndex++;
readingIndex++;
} else if (
!!text &&
((isTextPunctuation && isReadingJyuutping) ||
((isTextPunctuation && isReadingJyutping) ||
(!!text && reading === undefined) ||
(!isTextAlphanumeric && !isTextHanzi && isReadingJyuutping))
(!isTextAlphanumeric && !isTextHanzi && isReadingJyutping))
) {
// Send empty string to reading
resultArray.push({ text, reading: '' });
textIndex++;
} else if (
!!text &&
!!reading &&
((isTextPunctuation && isReadingPunctuation) ||
// Where both are special characters
(!isTextAlphanumeric && !isTextHanzi && !isReadingJyutping))
) {
// Don't add the punctuation but consume it
resultArray.push({ text, reading: '' });
textIndex++;
readingIndex++;
} else {
throw new Error(
`Unexpected text "${text}" and reading "${reading}" at index ${i} in ${rawText}: ${readings}`
Expand Down

0 comments on commit 95d44f4

Please sign in to comment.