From 3538d9d50ad4df663722780c413e988006eec27b Mon Sep 17 00:00:00 2001 From: Itsupera Date: Fri, 15 Jul 2022 12:41:25 +0200 Subject: [PATCH] Fix case of invalid reading, skipping it --- furigana/furigana.py | 104 ++++++++++++++++++++++------------------- tests/test_furigana.py | 8 ++++ 2 files changed, 63 insertions(+), 49 deletions(-) diff --git a/furigana/furigana.py b/furigana/furigana.py index f69ac63..9b07a99 100644 --- a/furigana/furigana.py +++ b/furigana/furigana.py @@ -36,62 +36,68 @@ def split_okurigana(text, hiragana, reversed=False): i = 0 # cursor on the text j = 0 # cursor on the hiragana - while i < len(text): - start_i = i - start_j = j - - logging.debug(f'Taking care non kanji parts. i={i}, j={j} ("{text[i]}" / "{hiragana[j]}")') - if not is_kanji_or_number(text[i]): - while i < len(text) and j < len(hiragana) and not is_kanji_or_number(text[i]): - # Increment the hiragana cursor, except for punctuation (not kana nor kanji), - # which is absent from the hiragana str ! - if is_kana_character(text[i]): - if not hiragana_matches_text_char(hiragana[j], text[i]): - # Try parsing in reverse order - if not reversed: - return split_okurigana(text[::-1], hiragana[::-1], reversed=True) - - logging.error(f"Kana {hiragana[j]} did not match character {text[i]} ! {text} {hiragana}") - - # Fallback by returning all the remaining text with all the hiragana as furigana - split.append(Text(text[start_i:], hiragana[start_j:])) - return split - j += 1 + # Some entries may contain mistakes, + # such as 爆売れ with ウレ as the reading (with mecab-ipadic-neologd) + if len(hiragana) < len(text): + # Discard the furigana for that word + split = [Text(text, None)] + else: + while i < len(text): + start_i = i + start_j = j - i += 1 + logging.debug(f'Taking care non kanji parts. i={i}, j={j} ("{text[i]}" / "{hiragana[j]}")') + if not is_kanji_or_number(text[i]): + while i < len(text) and j < len(hiragana) and not is_kanji_or_number(text[i]): + # Increment the hiragana cursor, except for punctuation (not kana nor kanji), + # which is absent from the hiragana str ! + if is_kana_character(text[i]): + if not hiragana_matches_text_char(hiragana[j], text[i]): + # Try parsing in reverse order + if not reversed: + return split_okurigana(text[::-1], hiragana[::-1], reversed=True) + + logging.error(f"Kana {hiragana[j]} did not match character {text[i]} ! {text} {hiragana}") + + # Fallback by returning all the remaining text with all the hiragana as furigana + split.append(Text(text[start_i:], hiragana[start_j:])) + return split + j += 1 + + i += 1 - logging.debug(f'Reached end of non kanji part. i={i}, j={j} ("{text[start_i:i]}" / "{hiragana[start_j:j]}")') - split.append(Text(text[start_i:i], None)) + logging.debug(f'Reached end of non kanji part. i={i}, j={j} ("{text[start_i:i]}" / "{hiragana[start_j:j]}")') + split.append(Text(text[start_i:i], None)) + + if i >= len(text): + break + + start_i = i + start_j = j + + # find next kana + logging.debug(f'Find next kana in text "{text[i:]}". i={i}') + while i < len(text) and not is_kana_character(text[i]): + i += 1 if i >= len(text): + logging.debug(f'Only kanji left. i={i}, j={j} ("{text[start_i:i]}" / "{hiragana[start_j:len(hiragana)]}")') + split.append(Text(text[start_i:i], hiragana[start_j:len(hiragana)])) break - start_i = i - start_j = j + logging.debug(f'Get reading for "{text[start_i:i]}". j={j}') + while ( + j < len(hiragana) + and ( + not hiragana_matches_text_char(hiragana[j], text[i]) + or j - start_j < i - start_i # every kanji has at least one sound associated with it + ) + ): + j += 1 + + logging.debug(f'Got reading "{hiragana[start_j:j]}" for "{text[start_i:i]}"') - # find next kana - logging.debug(f'Find next kana in text "{text[i:]}". i={i}') - while i < len(text) and not is_kana_character(text[i]): - i += 1 - - if i >= len(text): - logging.debug(f'Only kanji left. i={i}, j={j} ("{text[start_i:i]}" / "{hiragana[start_j:len(hiragana)]}")') - split.append(Text(text[start_i:i], hiragana[start_j:len(hiragana)])) - break - - logging.debug(f'Get reading for "{text[start_i:i]}". j={j}') - while ( - j < len(hiragana) - and ( - not hiragana_matches_text_char(hiragana[j], text[i]) - or j - start_j < i - start_i # every kanji has at least one sound associated with it - ) - ): - j += 1 - - logging.debug(f'Got reading "{hiragana[start_j:j]}" for "{text[start_i:i]}"') - - split.append(Text(text[start_i:i], hiragana[start_j:j])) + split.append(Text(text[start_i:i], hiragana[start_j:j])) # If we did a reverse parsing, reverse the results if reversed: diff --git a/tests/test_furigana.py b/tests/test_furigana.py index 389ee36..266ca1d 100644 --- a/tests/test_furigana.py +++ b/tests/test_furigana.py @@ -120,6 +120,14 @@ def test_split_furigana(text, expected_split): ("20", "にじゅう", [ Text(text='20', furigana="にじゅう"), ]), + ( + # Some entries may have an incorrect reading + # that does not match in terms of length. + # In this case we just ignore the reading + "爆売れ", "うれ", [ + Text(text='爆売れ', furigana=None), + ] + ), ]) def test_split_okurigana(text, hiragana, expected_split): assert split_okurigana(text, hiragana) == expected_split