4.4.0

hyunwoongko committed Jan 11, 2023
1 parent eaa0ec1 commit 63b5073

Showing 10 changed files with 167 additions and 43 deletions.
9 changes: 6 additions & 3 deletions README.md
@@ -51,9 +51,10 @@ split_sentences(
- string: single text segmentation
- list/tuple of strings: batch texts segmentation
- **backend: Morpheme analyzer backend**
- `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` and use first found analyzer (default)
- `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` → `punct` and use first found analyzer (default)
- `backend='mecab'`: find `mecab` → `konlpy.tag.Mecab` and use first found analyzer
- `backend='pecab'`: use `pecab` analyzer
- `backend='punct'`: split sentences only near punctuation marks
- **num_workers: The number of multiprocessing workers**
- `num_workers='auto'`: use multiprocessing with the maximum number of workers if possible (default)
- `num_workers=1`: don't use multiprocessing
@@ -739,9 +740,10 @@ split_morphemes(
- string: single text segmentation
- list/tuple of strings: batch texts segmentation
- **backend: Morpheme analyzer backend.**
- `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` and use first found analyzer (default)
- `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` → `punct` and use first found analyzer (default)
- `backend='mecab'`: find `mecab` → `konlpy.tag.Mecab` and use first found analyzer
- `backend='pecab'`: use `pecab` analyzer
- `backend='punct'`: split sentences only near punctuation marks
- **num_workers: The number of multiprocessing workers**
- `num_workers='auto'`: use multiprocessing with the maximum number of workers if possible (default)
- `num_workers=1`: don't use multiprocessing
@@ -817,9 +819,10 @@ summarize_sentences(
- string: single text segmentation
- list/tuple of strings: batch texts segmentation
- **backend: Morpheme analyzer backend.**
- `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` and use first found analyzer (default)
- `backend='auto'`: find `mecab` → `konlpy.tag.Mecab` → `pecab` → `punct` and use first found analyzer (default)
- `backend='mecab'`: find `mecab` → `konlpy.tag.Mecab` and use first found analyzer
- `backend='pecab'`: use `pecab` analyzer
- `backend='punct'`: split sentences only near punctuation marks
- **num_workers: The number of multiprocessing workers**
- `num_workers='auto'`: use multiprocessing with the maximum number of workers if possible (default)
- `num_workers=1`: don't use multiprocessing
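The new `punct` option threads through all three public functions documented above. A minimal usage sketch of the new backend (the printed result is the expected behavior of a punctuation-only split, not output captured from this exact build):

```python
from kss import split_sentences

# 'punct' needs no morpheme analyzer: it splits only near punctuation
# marks, trading some accuracy for speed and zero extra dependencies.
print(split_sentences("안녕하세요. 반갑습니다!", backend="punct"))
# expected: ['안녕하세요.', '반갑습니다!']
```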
2 changes: 1 addition & 1 deletion bench/test_kss.py
@@ -8,7 +8,7 @@
parser.add_argument("datasets", nargs="+")
parser.add_argument("--write_result")
parser.add_argument("--write_err")
parser.add_argument("--backend", default="mecab", choices=["mecab", "pecab"])
parser.add_argument("--backend", default="mecab", choices=["mecab", "pecab", "punct"])
args = parser.parse_args()

split_sentences("foo-bar", backend=args.backend) # warm-up
2 changes: 1 addition & 1 deletion kss/__init__.py
@@ -6,4 +6,4 @@
from kss._modules.summarization.summarize_sentences import summarize_sentences

__ALL__ = [split_sentences, split_morphemes, summarize_sentences]
__version__ = "4.3.2"
__version__ = "4.4.0"
25 changes: 25 additions & 0 deletions kss/_modules/morphemes/analyzers.py
@@ -10,6 +10,8 @@


class Analyzer(ABC):
_analyzer, _backend = None, None

def pos(self, text: str, drop_space: bool) -> Any:
raise NotImplementedError

@@ -66,3 +68,26 @@ def pos(self, text: str, drop_space: bool) -> List[Tuple[str, str]]:
output = self._drop_space(output)

return output


class CharacterAnalyzer(Analyzer):
_analyzer, _backend = None, "character"

@lru_cache(maxsize=500)
def pos(self, text: str, drop_space: bool) -> List[Tuple[str, str]]:
"""
Get pos information.
Args:
text (str): input text
drop_space (bool): drop all spaces or not.
Returns:
List[Tuple[str, str]]: output of analysis.
"""
output = [(char, "-") for char in text]

if drop_space:
output = self._drop_space(output)

return output
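Since `CharacterAnalyzer` tags every character with the placeholder pos `"-"`, its output is fully predictable from the code above; a quick sketch (import path taken from this diff's file header):

```python
from kss._modules.morphemes.analyzers import CharacterAnalyzer

analyzer = CharacterAnalyzer()

# Each character becomes a (char, "-") pair; with drop_space=True the
# inherited _drop_space helper would filter out the space pairs.
print(analyzer.pos("하이 :)", drop_space=False))
# expected: [('하', '-'), ('이', '-'), (' ', '-'), (':', '-'), (')', '-')]
```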
26 changes: 25 additions & 1 deletion kss/_modules/sentences/sentence_postprocessor.py
@@ -24,7 +24,7 @@ def postprocess(
Returns:
List[str]: postprocessed output sentences in string
"""

output_sentences = self._remove_space_before_emoji(output_sentences)
output_sentences = self._merge_broken_sub_sentence_in_quotes_or_brackets(
output_sentences
)
@@ -491,3 +491,27 @@ def _remove_empty_sentence(output_sentences: List[List[Syllable]]):
for sentence in output_sentences
if len("".join([i.text for i in sentence]).strip()) != 0
]

@staticmethod
def _remove_space_before_emoji(
output_sentences: List[List[Syllable]],
) -> List[List[Syllable]]:
"""
Remove a space character before emoji.
The space character was appended in the preprocessing step.
Args:
output_sentences (List[List[Syllable]]): split list of syllables
Returns:
List[List[Syllable]]: list of syllables without space before emoji
"""
for sentence_idx, output_sentence in enumerate(output_sentences):
for syllable_idx, output_syllable in enumerate(output_sentence):
if (
output_syllable.check_pos("EMOJI")
and output_syllable.prev.text == " "
):
output_syllable.prev = output_syllable.prev.prev
output_sentences[sentence_idx].pop(syllable_idx - 1)
return output_sentences
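The removal above maintains both the Python list and the linked chain of syllables. A generic sketch of the same unlink pattern, using a hypothetical `Node` class rather than kss's `Syllable`:

```python
class Node:
    """Minimal doubly linked node standing in for kss's Syllable."""
    def __init__(self, text: str):
        self.text = text
        self.prev = None
        self.next = None

def unlink(node: Node) -> None:
    # Bridge the neighbors so `node` drops out of the chain entirely.
    if node.prev is not None:
        node.prev.next = node.next
    if node.next is not None:
        node.next.prev = node.prev
```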
26 changes: 26 additions & 0 deletions kss/_modules/sentences/sentence_preprocessor.py
@@ -49,6 +49,7 @@ def preprocess(self, input_morphemes: List[Tuple[str, str]]) -> List[Syllable]:
"""
syllables = self._convert_morphemes_to_syllables(input_morphemes)
syllables = self._correct_wrong_tags(syllables)
syllables = self._append_space_before_emoji(syllables)
return syllables

def _convert_morphemes_to_syllables(
@@ -159,3 +160,28 @@ def _change_poses(syllable: Syllable, *poses: str):
for pos in poses:
_next.pos = pos
_next = _next.next

@staticmethod
def _append_space_before_emoji(syllables: List[Syllable]) -> List[Syllable]:
"""
Append a space character before each emoji character.
This could be helpful for tokenizing sentences that contain emoji.
Args:
syllables (List[Syllable]): input syllables
Returns:
List[Syllable]: preprocessed syllables
"""
new_syllables = []
for syllable in syllables:
if syllable.check_pos("EMOJI") and not syllable.prev.check_pos("EMOJI"):
space_syllable = Syllable(" ", "SP")
if syllable.prev is not None:
space_syllable.prev = syllable.prev
space_syllable.next = syllable
syllable.prev = space_syllable

new_syllables.append(space_syllable)
new_syllables.append(syllable)
return new_syllables
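A string-level illustration of the same idea (kss actually operates on linked `Syllable` objects, so this regex version is only an approximation, and the emoji range below is deliberately narrow):

```python
import re

# Match a whole run of emoji so only one space is inserted per run,
# mirroring the `not syllable.prev.check_pos("EMOJI")` guard above.
EMOJI_RUN = re.compile(r"[\U0001F600-\U0001F64F]+")

def add_space_before_emoji(text: str) -> str:
    return EMOJI_RUN.sub(lambda m: " " + m.group(0), text)

print(add_space_before_emoji("좋았어😀😀"))  # '좋았어 😀😀'
```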
16 changes: 5 additions & 11 deletions kss/_modules/sentences/sentence_splitter.py
@@ -6,7 +6,7 @@

from kss._elements.subclasses import Syllable
from kss._modules.sentences.sentence_processor import SentenceProcessor
from kss._utils.const import papers
from kss._utils.const import sf_exception


class SentenceSplitter(SentenceProcessor):
@@ -155,14 +155,7 @@ def _sf(self) -> bool:
# 종결부호 분할 규칙
if self._check_pos("SF") and (
self._check_next_pos("SP")
or self._check_next_skip_sp_pos(
(
"SY",
"SSO",
"EMOJI",
"JAMO",
)
)
or self._check_next_skip_sp_pos(("SY", "SSO", "EMOJI", "JAMO"))
):
# 예외 1
available = not self._prev_skip(("SP", "SF")).text.isnumeric()
@@ -220,8 +213,9 @@ def _sf(self) -> bool:
available = available and not self._check_next_is_unavailable_split()

# 예외 9
available = available and (
not self._check_multiple_prev_texts_from_before(*papers)
available = available and not (
self._check_text(".")
and self._check_multiple_prev_texts_from_before(*sf_exception)
)

return available
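The reworked exception 9 fires only when the terminator is a period, so the `sf_exception` entries guard abbreviations like "Dr." without affecting "!" or "?" splits. A hedged sketch of the intended effect (expected output, not verified against this exact build):

```python
from kss import split_sentences

# 'Dr' is in sf_exception, so the period right after it should no
# longer be treated as the end of a sentence.
print(split_sentences("Dr. Kim을 만났다. 반가웠다."))
# expected: ['Dr. Kim을 만났다.', '반가웠다.']
```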
9 changes: 6 additions & 3 deletions kss/_modules/sentences/split_sentences.py
@@ -33,7 +33,7 @@ def split_sentences(
Args:
text (Union[str, List[str], Tuple[str]]): single text or list/tuple of texts
backend (str): morpheme analyzer backend. 'mecab', 'pecab' are supported.
backend (str): morpheme analyzer backend. 'mecab', 'pecab', 'punct' are supported.
num_workers (Union[int, str])): the number of multiprocessing workers
strip (bool): strip all sentences or not
@@ -112,8 +112,11 @@ def _split_sentences(
current_sentence_syllables = [syllable]
syllable_added = True

elif splitter.check_split_start():
split_mode = True
else:
if backend._backend == "character":
split_mode = splitter._sf()
else:
split_mode = splitter.check_split_start()

else:
end_split, end_split_exception = splitter.check_split_end()
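With the character backend, only the terminating-punctuation rule `_sf` can open a split, so text lacking terminators stays in one piece. A sketch of the expected contrast (illustrative output, not captured from this build):

```python
from kss import split_sentences

# No terminator after '먹었다', so the punctuation-only backend keeps
# the whole string as a single sentence; morpheme-aware backends may
# split on the verb ending instead.
print(split_sentences("밥을 먹었다 그리고 잤다.", backend="punct"))
# expected: ['밥을 먹었다 그리고 잤다.']
```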
42 changes: 36 additions & 6 deletions kss/_utils/const.py
@@ -183,7 +183,7 @@
r"[a-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+|[a-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+\.[a-z]"
)

papers = [
sf_exception = [
" no",
" No",
" pp",
@@ -214,10 +214,43 @@
" Para",
" comp",
" Comp",
"Capt",
" capt",
" dept",
"Dept",
"Mr",
" mr",
"Miss",
"Mrs",
" mrs",
"Ms",
" ms",
"Dr",
" dr",
"Prof",
" prof",
"Rev",
" rev",
"St",
" st",
" Co",
" co",
" MM",
" mm",
" Messrs",
" messrs",
" Mlle",
" mlle",
" Mme",
" mme",
" def",
" Def",
" viz",
" Viz",
]

for i in range(0, 10):
papers += [
sf_exception += [
f"{i}항",
f"{i}조",
f"{i}호",
@@ -226,6 +259,7 @@
f"{i}권",
f"{i}쪽",
f"{i}장",
f"{i}",
]


@@ -270,8 +304,4 @@
"'n'",
"N' ",
"n' ",
"Capt.",
"capt.",
"dept.",
"Dept.",
}
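A minimal reconstruction of how the digit loop extends `sf_exception` (names mirror `kss/_utils/const.py`; the seed list here is truncated to a small sample):

```python
# Truncated sample of the abbreviation seed list.
sf_exception = [" no", " pp", "Dr", "Prof", "Mrs"]

# Each digit spawns Korean citation suffixes such as '3항', '3조', '3호',
# so references like '제3조.' do not end a sentence at the period.
for i in range(0, 10):
    sf_exception += [f"{i}항", f"{i}조", f"{i}호"]

print("3조" in sf_exception)  # True
```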
53 changes: 36 additions & 17 deletions kss/_utils/sanity_checks.py
@@ -9,10 +9,16 @@
MecabAnalyzer,
PecabAnalyzer,
Analyzer,
CharacterAnalyzer,
)
from kss._utils.logging import logger

MECAB_INFORM, KONLPY_MECAB_INFORM, PECAB_INFORM = False, False, False
MECAB_INFORM, KONLPY_MECAB_INFORM, PECAB_INFORM, PUNCT_INFORM = (
False,
False,
False,
False,
)

_mecab_info_linux_macos = "https://github.com/hyunwoongko/python-mecab-kor"
_konlpy_info_linux_macos = "https://konlpy.org/en/latest/api/konlpy.tag/#mecab-class"
@@ -29,7 +35,9 @@ def _message_by_user_os(linux_macos: str, windows: str) -> str:
return windows


def _check_value(param: Any, param_name: str, predicate: Callable, suggestion: str) -> Any:
def _check_value(
param: Any, param_name: str, predicate: Callable, suggestion: str
) -> Any:
"""
Check param value
@@ -149,20 +157,24 @@ def _check_analyzer_backend(backend: str) -> Analyzer:
Returns:
Analyzer: morpheme analyzer backend.
"""
global MECAB_INFORM, KONLPY_MECAB_INFORM, PECAB_INFORM
global MECAB_INFORM, KONLPY_MECAB_INFORM, PECAB_INFORM, PUNCT_INFORM

if isinstance(backend, str):
backend = backend.lower()

if backend not in ["auto", "mecab", "pecab"]:
if backend not in ["auto", "mecab", "pecab", "punct"]:
raise ValueError(
f"Oops! '{backend}' is not supported value for `backend`.\n"
f"Currently kss only supports ['auto', 'pecab', 'mecab'] for this.\n"
f"Currently kss only supports ['auto', 'pecab', 'mecab', 'punct'] for this.\n"
f"Please check `backend` parameter again ;)\n"
)

mecab_backend = MecabAnalyzer()
pecab_backend = PecabAnalyzer()
punct_backend = CharacterAnalyzer()

if backend == "punct":
return punct_backend

if backend == "mecab":
if mecab_backend._backend is not None:
@@ -236,22 +248,29 @@

return pecab_backend
else:
raise ImportError(
_message_by_user_os(
linux_macos="You don't have any available morpheme analyzer backend (mecab, pecab).\n"
"You need to install one of mecab, konlpy.tag.Mecab and pecab to use Kss.\n"
"Please refer to following web sites for details:\n"
f"- mecab: {_mecab_info_linux_macos}\n"
f"- konlpy.tag.Mecab: {_konlpy_info_linux_macos}\n"
f"- pecab: {_pecab_info}\n",
windows="You don't have any available morpheme analyzer backend (mecab, pecab).\n"
"You need to install one of mecab, konlpy.tag.Mecab and pecab to use Kss.\n"
"Please refer to following web sites for details:\n"
f"- mecab: {_mecab_info_windows}\n"
f"- konlpy.tag.Mecab: {_konlpy_info_windows}\n"
f"- pecab: {_pecab_info}\n",
),
)
if not PUNCT_INFORM:

installation_help_message = _message_by_user_os(
linux_macos="For your information, Kss also supports mecab backend.\n"
"We recommend you to install mecab or konlpy.tag.Mecab for faster execution of Kss.\n"
"Please refer to following web sites for details:\n"
f"- mecab: {_mecab_info_linux_macos}\n"
f"- konlpy.tag.Mecab: {_konlpy_info_linux_macos}\n",
windows="For your information, Kss also supports mecab backend.\n"
"We recommend you to install mecab or konlpy.tag.Mecab for faster execution of Kss.\n"
"Please refer to following web sites for details:\n"
f"- mecab: {_mecab_info_windows}\n"
f"- konlpy.tag.Mecab: {_konlpy_info_windows}\n",
)

logger.warning(
"Because there's no supported C++ morpheme analyzer, "
"Kss will take pecab as a backend. :D\n"
f"{installation_help_message}"
)
PUNCT_INFORM = True

return punct_backend


def _check_num_workers(
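The practical effect of this file's changes: `backend='auto'` now degrades gracefully instead of raising `ImportError`. A sketch of what a caller sees on a machine with neither mecab nor pecab installed (expected behavior, warning text abbreviated):

```python
from kss import split_sentences

# With no morpheme analyzer available, 'auto' falls back to the
# punctuation-only backend and logs a one-time installation hint
# (guarded by the PUNCT_INFORM flag) instead of raising ImportError.
print(split_sentences("설치 없이도 됩니다. 바로 쓰세요."))
# expected: ['설치 없이도 됩니다.', '바로 쓰세요.']
```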
