-
Notifications
You must be signed in to change notification settings - Fork 3
/
basic_mecab_controller.py
142 lines (115 loc) · 4.37 KB
/
basic_mecab_controller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Copyright: Ren Tatsumoto <tatsu at autistici.org> and contributors
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
import functools
import os
import subprocess
from collections.abc import Sequence
from typing import Optional
try:
from .mecab_exe_finder import IS_WIN, SUPPORT_DIR, find_executable
except ImportError:
from mecab_exe_finder import IS_WIN, SUPPORT_DIR, find_executable
# Value for mecab's --input-buffer-size option. Kept as a string because it is
# concatenated directly into a command-line argument.
INPUT_BUFFER_SIZE = "819200"
# Location of the mecabrc file inside the support directory.
MECAB_RC_PATH = os.path.join(SUPPORT_DIR, "mecabrc")
@functools.cache
def startup_info():
    """
    Build the ``startupinfo`` value that is passed to ``subprocess.Popen``.

    On Windows this is a ``subprocess.STARTUPINFO`` with the ``STARTF_USESHOWWINDOW`` flag set;
    on every other platform it is ``None``. The result is cached, so repeated calls
    return the same object.
    """
    if not IS_WIN:
        return None
    # Prevents a console window from popping up on Windows
    info = subprocess.STARTUPINFO()
    info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    return info
@functools.cache
def find_best_dic_dir():
    """
    Choose the system dictionary directory to pass to mecab.

    Known install locations of mecab-ipadic-neologd are probed first, followed by those of mecab-ipadic.
    The first location that exists as a directory is returned; if none exists, SUPPORT_DIR is returned.
    The result is cached after the first call.
    """
    candidates = (
        "/usr/lib/mecab/dic/mecab-ipadic-neologd",
        "/usr/local/lib/mecab/dic/mecab-ipadic-neologd",
        "/opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd",
        "/usr/lib/mecab/dic/ipadic",
        "/usr/local/lib/mecab/dic/ipadic",  # for `brew install mecab-ipadic`
        "/opt/homebrew/lib/mecab/dic/ipadic",
    )
    # filter() is lazy, so probing stops at the first existing directory.
    found = next(filter(os.path.isdir, candidates), None)
    if found is not None:
        return found
    return SUPPORT_DIR
def normalize_for_platform(popen: list[str]) -> list[str]:
    """
    Return the command-line items adjusted for the current platform.

    On Windows every item is passed through ``os.path.normpath`` and a new list is returned;
    on other platforms the given list is returned unchanged.
    """
    if not IS_WIN:
        return popen
    return [os.path.normpath(item) for item in popen]
def check_mecab_rc():
    """
    Make sure a mecabrc file exists at MECAB_RC_PATH, creating an empty one if it is missing.
    """
    if os.path.isfile(MECAB_RC_PATH):
        return
    # create mecabrc if it doesn't exist
    with open(MECAB_RC_PATH, "w") as rc_file:
        rc_file.write("")
def expr_to_bytes(expr: str) -> bytes:
    """
    Encode an expression as UTF-8 and terminate it with a single newline.

    Characters that cannot be encoded (e.g. lone surrogates) are silently dropped.
    """
    encoded = expr.encode(encoding="utf-8", errors="ignore")
    return encoded + b"\n"
def mecab_output_to_str(outs: bytes) -> str:
    """
    Decode raw output bytes into text.

    Trailing carriage returns and line feeds are removed first. Bytes that are not valid UTF-8
    are replaced with U+FFFD instead of raising.
    """
    trimmed = outs.rstrip(b"\r\n")
    return str(trimmed, encoding="utf-8", errors="replace")
class BasicMecabController:
    """
    Runs the mecab executable on a single expression and returns its output as text.

    A new mecab process is started for every call to :meth:`run`.
    """

    # Default command line. It is evaluated once, when the class body is executed.
    _mecab_cmd: list[str] = [
        find_executable("mecab"),
        "--dicdir=" + find_best_dic_dir(),
        "--rcfile=" + MECAB_RC_PATH,
        "--userdic=" + os.path.join(SUPPORT_DIR, "user_dic.dic"),
        "--input-buffer-size=" + INPUT_BUFFER_SIZE,
    ]
    # Default extra arguments appended after the command.
    _mecab_args: list[str] = []
    _verbose: bool

    def __init__(
        self,
        mecab_cmd: Optional[list[str]] = None,
        mecab_args: Optional[list[str]] = None,
        verbose: bool = False,
    ) -> None:
        """
        Prepare the command line used by :meth:`run`.

        :param mecab_cmd: Executable and base options. Falls back to the class default when None or empty.
        :param mecab_args: Extra arguments appended to the command. Falls back to the class default when None or empty.
        :param verbose: When true, print the final command line.
        """
        super().__init__()
        check_mecab_rc()
        self._verbose = verbose
        # Concatenation builds a new list, so the class-level defaults are never mutated.
        self._mecab_cmd = normalize_for_platform((mecab_cmd or self._mecab_cmd) + (mecab_args or self._mecab_args))
        # NOTE: these assignments are process-wide and overwrite any existing values.
        # Presumably they let the mecab binary load shared libraries from SUPPORT_DIR -- TODO confirm.
        os.environ["DYLD_LIBRARY_PATH"] = SUPPORT_DIR
        os.environ["LD_LIBRARY_PATH"] = SUPPORT_DIR
        if self._verbose:
            print("mecab cmd:", self._mecab_cmd)

    def run(self, expr: str) -> str:
        """
        Feed ``expr`` to a fresh mecab process and return what it printed.

        :param expr: Text to analyze. It is sent to mecab's stdin as one UTF-8 line.
        :return: Decoded output with trailing line breaks removed. Because stderr is merged
            into stdout, error text printed by mecab is part of the returned string.
        :raises Exception: If the process could not be started (the OSError is attached as the cause).
        :raises RuntimeError: If the output contains mecab's "no such file or directory" error from tagger.cpp.
        """
        try:
            proc = subprocess.Popen(
                self._mecab_cmd,
                bufsize=-1,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                startupinfo=startup_info(),
            )
        except OSError as ex:
            # Chain explicitly so the underlying OS error stays visible in the traceback.
            raise Exception("Please ensure your Linux system has 64 bit binary support.") from ex

        try:
            # The second tuple item (stderr) is always None because stderr is redirected to stdout.
            outs, _ = proc.communicate(expr_to_bytes(expr), timeout=5)
        except subprocess.TimeoutExpired:
            # communicate() does not kill the child on timeout; do it here,
            # then collect whatever output was produced so far.
            proc.kill()
            outs, _ = proc.communicate()

        str_out = mecab_output_to_str(outs)
        if "tagger.cpp" in str_out and "no such file or directory" in str_out:
            raise RuntimeError("Please ensure your Windows user name contains only English characters.")
        return str_out
def main():
    """
    Run a handful of sample sentences through a default BasicMecabController and print each result.
    """
    samples = (
        "カリン、自分でまいた種は自分で刈り取れ",
        "昨日、林檎を2個買った。",
        "真莉、大好きだよん^^",
        "彼2000万も使った。",
        "彼二千三百六十円も使った。",
        "千葉",
        "昨日すき焼きを食べました",
        "二人の美人",
        "詳細はお気軽にお問い合わせ下さい。",
        "Lorem ipsum dolor sit amet. Съешь ещё этих мягких французских булок, да выпей же чаю.",
    )
    controller = BasicMecabController()
    for sentence in samples:
        output = controller.run(sentence)
        print(output)
# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()