diff --git a/README.md b/README.md
index 67a61fa..c9d60a5 100644
--- a/README.md
+++ b/README.md
@@ -51,9 +51,10 @@ split_sentences(
   - string: single text segmentation
   - list/tuple of strings: batch texts segmentation
 - **backend: Morpheme analyzer backend**
-  - `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` and use first found analyzer (default)
+  - `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` → `punct` and use first found analyzer (default)
   - `backend='mecab'`: find `mecab` → `konlpy.tag.Mecab` and use first found analyzer
   - `backend='pecab'`: use `pecab` analyzer
+  - `backend='punct'`: split sentences only near punctuation marks
 - **num_workers: The number of multiprocessing workers**
   - `num_workers='auto'`: use multiprocessing with the maximum number of workers if possible (default)
   - `num_workers=1`: don't use multiprocessing
@@ -739,9 +740,10 @@ split_morphemes(
   - string: single text segmentation
   - list/tuple of strings: batch texts segmentation
 - **backend: Morpheme analyzer backend.**
-  - `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` and use first found analyzer (default)
+  - `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` → `punct` and use first found analyzer (default)
   - `backend='mecab'`: find `mecab` → `konlpy.tag.Mecab` and use first found analyzer
   - `backend='pecab'`: use `pecab` analyzer
+  - `backend='punct'`: split sentences only near punctuation marks
 - **num_workers: The number of multiprocessing workers**
   - `num_workers='auto'`: use multiprocessing with the maximum number of workers if possible (default)
   - `num_workers=1`: don't use multiprocessing
@@ -817,9 +819,10 @@ summarize_sentences(
   - string: single text segmentation
   - list/tuple of strings: batch texts segmentation
 - **backend: Morpheme analyzer backend.**
-  - `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` and use first found analyzer (default)
+  - `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` → `punct` and use first found analyzer (default)
   - `backend='mecab'`: find `mecab` → `konlpy.tag.Mecab` and use first found analyzer
   - `backend='pecab'`: use `pecab` analyzer
+  - `backend='punct'`: split sentences only near punctuation marks
 - **num_workers: The number of multiprocessing workers**
   - `num_workers='auto'`: use multiprocessing with the maximum number of workers if possible (default)
   - `num_workers=1`: don't use multiprocessing
diff --git a/bench/test_kss.py b/bench/test_kss.py
index 74cb916..1b456e1 100644
--- a/bench/test_kss.py
+++ b/bench/test_kss.py
@@ -8,7 +8,7 @@
     parser.add_argument("datasets", nargs="+")
     parser.add_argument("--write_result")
     parser.add_argument("--write_err")
-    parser.add_argument("--backend", default="mecab", choices=["mecab", "pecab"])
+    parser.add_argument("--backend", default="mecab", choices=["mecab", "pecab", "punct"])
     args = parser.parse_args()

     split_sentences("foo-bar", backend=args.backend)  # warm-up
diff --git a/kss/__init__.py b/kss/__init__.py
index 08c7416..e42ad00 100644
--- a/kss/__init__.py
+++ b/kss/__init__.py
@@ -6,4 +6,4 @@ from kss._modules.summarization.summarize_sentences import summarize_sentences

 __ALL__ = [split_sentences, split_morphemes, summarize_sentences]

-__version__ = "4.3.2"
+__version__ = "4.4.0"
diff --git a/kss/_modules/morphemes/analyzers.py b/kss/_modules/morphemes/analyzers.py
index aeebac0..f9b0343 100644
--- a/kss/_modules/morphemes/analyzers.py
+++ b/kss/_modules/morphemes/analyzers.py
@@ -10,6 +10,8 @@


 class Analyzer(ABC):
+    _analyzer, _backend = None, None
+
     def pos(self, text: str, drop_space: bool) -> Any:
         raise NotImplementedError

@@ -66,3 +68,26 @@ def pos(self, text: str, drop_space: bool) -> List[Tuple[str, str]]:
             output = self._drop_space(output)

         return output
+
+
+class CharacterAnalyzer(Analyzer):
+    _analyzer, _backend = None, "character"
+
+    @lru_cache(maxsize=500)
+    def pos(self, text: str, drop_space: bool) -> List[Tuple[str, str]]:
+        """
+        Get pos information.
+
+        Args:
+            text (str): input text
+            drop_space (bool): drop all spaces or not.
+
+        Returns:
+            List[Tuple[str, str]]: output of analysis.
+        """
+        output = [(char, "-") for char in text]
+
+        if drop_space:
+            output = self._drop_space(output)
+
+        return output
diff --git a/kss/_modules/sentences/sentence_postprocessor.py b/kss/_modules/sentences/sentence_postprocessor.py
index 1f54c47..6958a49 100644
--- a/kss/_modules/sentences/sentence_postprocessor.py
+++ b/kss/_modules/sentences/sentence_postprocessor.py
@@ -24,7 +24,7 @@ def postprocess(
         Returns:
             List[str]: postprocessed output setences in string
         """
-
+        output_sentences = self._remove_space_before_emoji(output_sentences)
         output_sentences = self._merge_broken_sub_sentence_in_quotes_or_brackets(
             output_sentences
         )
@@ -491,3 +491,27 @@ def _remove_empty_sentence(output_sentences: List[List[Syllable]]):
             for sentence in output_sentences
             if len("".join([i.text for i in sentence]).strip()) != 0
         ]
+
+    @staticmethod
+    def _remove_space_before_emoji(
+        output_sentences: List[List[Syllable]],
+    ) -> List[List[Syllable]]:
+        """
+        Remove a space character before emoji.
+        The space character was appended in preprocessing step.
+
+        Args:
+            output_sentences (List[List[Syllable]]): split list of syllables
+
+        Returns:
+            List[List[Syllable]]: list of syllables without space before emoji
+        """
+        for sentence_idx, output_sentence in enumerate(output_sentences):
+            for syllable_idx, output_syllable in enumerate(output_sentence):
+                if (
+                    output_syllable.check_pos("EMOJI")
+                    and output_syllable.prev.text == " "
+                ):
+                    output_syllable.prev = output_syllable.prev.prev
+                    output_sentences[sentence_idx].pop(syllable_idx - 1)
+        return output_sentences
diff --git a/kss/_modules/sentences/sentence_preprocessor.py b/kss/_modules/sentences/sentence_preprocessor.py
index 37e9030..4196608 100644
--- a/kss/_modules/sentences/sentence_preprocessor.py
+++ b/kss/_modules/sentences/sentence_preprocessor.py
@@ -49,6 +49,7 @@ def preprocess(self, input_morphemes: List[Tuple[str, str]]) -> List[Syllable]:
         """
         syllables = self._convert_morphemes_to_syllables(input_morphemes)
         syllables = self._correct_wrong_tags(syllables)
+        syllables = self._append_space_before_emoji(syllables)
         return syllables

     def _convert_morphemes_to_syllables(
@@ -159,3 +160,28 @@ def _change_poses(syllable: Syllable, *poses: str):
         for pos in poses:
             _next.pos = pos
             _next = _next.next
+
+    @staticmethod
+    def _append_space_before_emoji(syllables: List[Syllable]) -> List[Syllable]:
+        """
+        Append a space character before emoji character.
+        This could be helpful for tokenizing sentences which contain emoji.
+
+        Args:
+            syllables (List[Syllable]): input syllables
+
+        Returns:
+            List[Syllable]: preprocessed syllables
+        """
+        new_syllables = []
+        for syllable in syllables:
+            if syllable.check_pos("EMOJI") and not syllable.prev.check_pos("EMOJI"):
+                space_syllable = Syllable(" ", "SP")
+                if syllable.prev is not None:
+                    space_syllable.prev = syllable.prev
+                space_syllable.next = syllable
+                syllable.prev = space_syllable
+
+                new_syllables.append(space_syllable)
+            new_syllables.append(syllable)
+        return new_syllables
diff --git a/kss/_modules/sentences/sentence_splitter.py b/kss/_modules/sentences/sentence_splitter.py
index efb6e45..4ef7183 100644
--- a/kss/_modules/sentences/sentence_splitter.py
+++ b/kss/_modules/sentences/sentence_splitter.py
@@ -6,7 +6,7 @@

 from kss._elements.subclasses import Syllable
 from kss._modules.sentences.sentence_processor import SentenceProcessor
-from kss._utils.const import papers
+from kss._utils.const import sf_exception


 class SentenceSplitter(SentenceProcessor):
@@ -155,14 +155,7 @@ def _sf(self) -> bool:
         # 종결부호 분할 규칙
         if self._check_pos("SF") and (
             self._check_next_pos("SP")
-            or self._check_next_skip_sp_pos(
-                (
-                    "SY",
-                    "SSO",
-                    "EMOJI",
-                    "JAMO",
-                )
-            )
+            or self._check_next_skip_sp_pos(("SY", "SSO", "EMOJI", "JAMO"))
         ):
             # 예외 1
             available = not self._prev_skip(("SP", "SF")).text.isnumeric()
@@ -220,8 +213,9 @@ def _sf(self) -> bool:
             available = available and not self._check_next_is_unavailable_split()

             # 예외 9
-            available = available and (
-                not self._check_multiple_prev_texts_from_before(*papers)
+            available = available and not (
+                self._check_text(".")
+                and self._check_multiple_prev_texts_from_before(*sf_exception)
             )

             return available
diff --git a/kss/_modules/sentences/split_sentences.py b/kss/_modules/sentences/split_sentences.py
index 3378eee..b99e79f 100644
--- a/kss/_modules/sentences/split_sentences.py
+++ b/kss/_modules/sentences/split_sentences.py
@@ -33,7 +33,7 @@ def split_sentences(

     Args:
         text (Union[str, List[str], Tuple[str]]): single text or list/tuple of texts
-        backend (str): morpheme analyzer backend. 'mecab', 'pecab' are supported.
+        backend (str): morpheme analyzer backend. 'mecab', 'pecab', 'punct' are supported.
         num_workers (Union[int, str])): the number of multiprocessing workers
         strip (bool): strip all sentences or not

@@ -112,8 +112,11 @@ def _split_sentences(
                 current_sentence_syllables = [syllable]
                 syllable_added = True

-            elif splitter.check_split_start():
-                split_mode = True
+            else:
+                if backend._backend == "character":
+                    split_mode = splitter._sf()
+                else:
+                    split_mode = splitter.check_split_start()

         else:
             end_split, end_split_exception = splitter.check_split_end()
diff --git a/kss/_utils/const.py b/kss/_utils/const.py
index 55de403..6ba60ba 100644
--- a/kss/_utils/const.py
+++ b/kss/_utils/const.py
@@ -183,7 +183,7 @@
     r"[a-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+|[a-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+\.[a-z]"
 )

-papers = [
+sf_exception = [
     " no",
     " No",
     " pp",
@@ -214,10 +214,43 @@
     " Para",
     " comp",
     " Comp",
+    "Capt",
+    " capt",
+    " dept",
+    "Dept",
+    "Mr",
+    " mr",
+    "Miss",
+    "Mrs",
+    " mrs",
+    "Ms",
+    " ms",
+    "Dr",
+    " dr",
+    "Prof",
+    " prof",
+    "Rev",
+    " rev",
+    "St",
+    " st",
+    " Co",
+    " co",
+    " MM",
+    " mm",
+    " Messrs",
+    " messrs",
+    " Mlle",
+    " mlle",
+    " Mme",
+    " mme",
+    " def",
+    " Def",
+    " viz",
+    " Viz",
 ]

 for i in range(0, 10):
-    papers += [
+    sf_exception += [
         f"{i}항",
         f"{i}조",
         f"{i}호",
@@ -226,6 +259,7 @@
         f"{i}권",
         f"{i}쪽",
         f"{i}장",
+        f"{i}",
     ]

@@ -270,8 +304,4 @@
     "'n'",
     "N' ",
     "n' ",
-    "Capt.",
-    "capt.",
-    "dept.",
-    "Dept.",
 }
diff --git a/kss/_utils/sanity_checks.py b/kss/_utils/sanity_checks.py
index 40b4e25..06baa87 100644
--- a/kss/_utils/sanity_checks.py
+++ b/kss/_utils/sanity_checks.py
@@ -9,10 +9,16 @@
     MecabAnalyzer,
     PecabAnalyzer,
     Analyzer,
+    CharacterAnalyzer,
 )
 from kss._utils.logging import logger

-MECAB_INFORM, KONLPY_MECAB_INFORM, PECAB_INFORM = False, False, False
+MECAB_INFORM, KONLPY_MECAB_INFORM, PECAB_INFORM, PUNCT_INFORM = (
+    False,
+    False,
+    False,
+    False,
+)

 _mecab_info_linux_macos = "https://github.com/hyunwoongko/python-mecab-kor"
 _konlpy_info_linux_macos = "https://konlpy.org/en/latest/api/konlpy.tag/#mecab-class"
@@ -29,7 +35,9 @@ def _message_by_user_os(linux_macos: str, windows: str) -> str:
         return windows


-def _check_value(param: Any, param_name: str, predicate: Callable, suggestion: str) -> Any:
+def _check_value(
+    param: Any, param_name: str, predicate: Callable, suggestion: str
+) -> Any:
     """
     Check param value

@@ -149,20 +157,24 @@ def _check_analyzer_backend(backend: str) -> Analyzer:
     Returns:
         Analyzer: morpheme analyzer backend.
     """
-    global MECAB_INFORM, KONLPY_MECAB_INFORM, PECAB_INFORM
+    global MECAB_INFORM, KONLPY_MECAB_INFORM, PECAB_INFORM, PUNCT_INFORM

     if isinstance(backend, str):
         backend = backend.lower()

-    if backend not in ["auto", "mecab", "pecab"]:
+    if backend not in ["auto", "mecab", "pecab", "punct"]:
         raise ValueError(
             f"Oops! '{backend}' is not supported value for `backend`.\n"
-            f"Currently kss only supports ['auto', 'pecab', 'mecab'] for this.\n"
+            f"Currently kss only supports ['auto', 'pecab', 'mecab', 'punct'] for this.\n"
             f"Please check `backend` parameter again ;)\n"
         )

     mecab_backend = MecabAnalyzer()
     pecab_backend = PecabAnalyzer()
+    punct_backend = CharacterAnalyzer()
+
+    if backend == "punct":
+        return punct_backend

     if backend == "mecab":
         if mecab_backend._backend is not None:
@@ -236,22 +248,29 @@
             return pecab_backend

     else:
-        raise ImportError(
-            _message_by_user_os(
-                linux_macos="You don't have any available morpheme analyzer backend (mecab, pecab).\n"
-                "You need to install one of mecab, konlpy.tag.Mecab and pecab to use Kss.\n"
+        if not PUNCT_INFORM:
+
+            installation_help_message = _message_by_user_os(
+                linux_macos="For your information, Kss also supports mecab backend.\n"
+                "We recommend you to install mecab or konlpy.tag.Mecab for faster execution of Kss.\n"
                 "Please refer to following web sites for details:\n"
                 f"- mecab: {_mecab_info_linux_macos}\n"
-                f"- konlpy.tag.Mecab: {_konlpy_info_windows}\n"
-                f"- pecab: {_pecab_info}\n",
-                windows="You don't have any available morpheme analyzer backend (mecab, pecab).\n"
-                "You need to install one of mecab, konlpy.tag.Mecab and pecab to use Kss.\n"
+                f"- konlpy.tag.Mecab: {_konlpy_info_linux_macos}\n",
+                windows="For your information, Kss also supports mecab backend.\n"
+                "We recommend you to install mecab or konlpy.tag.Mecab for faster execution of Kss.\n"
                 "Please refer to following web sites for details:\n"
                 f"- mecab: {_mecab_info_windows}\n"
-                f"- konlpy.tag.Mecab: {_konlpy_info_windows}\n"
-                f"- pecab: {_pecab_info}\n",
-            ),
-        )
+                f"- konlpy.tag.Mecab: {_konlpy_info_windows}\n",
+            )
+
+            logger.warning(
+                "Because there's no supported morpheme analyzer, "
+                "Kss will take punct as a backend. :D\n"
+                f"{installation_help_message}"
+            )
+            PUNCT_INFORM = True
+
+        return punct_backend


 def _check_num_workers(
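
Usage sketch for the new backend: a minimal example written against the `split_sentences` API documented in the README hunks above. The sample sentence is an illustrative assumption, and outputs are omitted since they depend on which analyzers are installed.

from kss import split_sentences

text = "회의는 3시에 끝났어요! 내일 또 봐요 😊 조심히 가세요."

# `punct` needs no morpheme analyzer: CharacterAnalyzer tags every character
# as ("<char>", "-"), and splitting relies on the sentence-final punctuation
# rule (`_sf`) alone.
print(split_sentences(text, backend="punct"))

# `auto` now ends its fallback chain with `punct`:
# mecab → konlpy.tag.Mecab → pecab → punct
print(split_sentences(text, backend="auto"))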