-
Notifications
You must be signed in to change notification settings - Fork 1
/
process_templates.py
132 lines (109 loc) · 4.57 KB
/
process_templates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import json
import re
from typing import Tuple, Optional
import pathlib
from constants import WRITE_PATHS, SEGMENT_SEPARATOR, PROCESSED_DIR
from src.segmentation import remove_duplicate_seps
from src.utils.etc import determine_pos, read_redirects, is_usable_template, basic_json_read, clean_string
# Question marks at the end of templates are probably safe to remove in articles;
# however, it is not clear what exactly their purpose is.
def separate_form_template(form_template: str) -> Optional[Tuple[str, str]]:
    """Split a form template into its stem placeholder name and its ending.

    The stem name is the first substring matching ``основа``, optionally
    followed by ``-`` and by a single digit.  The ending is the trailing run
    of characters that contains no ``}``, passed through ``clean_string``;
    it is the empty string when the template ends with ``}``.

    Returns:
        ``None`` when the template contains no stem placeholder, otherwise
        the tuple ``(stem_name, cleaned_ending)``.
    """
    stem_match = re.search(r"основа-?\d?", form_template)
    if stem_match is None:
        return None

    # Everything after the last closing brace is treated as the ending.
    tail_match = re.search(r"[^\}]+$", form_template)
    tail = tail_match.group(0) if tail_match else ""
    return stem_match.group(0), clean_string(tail)
def process_template(template_page: dict):
    """Collect the stem/ending split for every usable form of a template page.

    Iterates over ``template_page["template"]`` and, for each value that
    mentions ``основа``, keeps only the first variant (variants are separated
    by ``<br>``, ``<br />`` or whitespace).  For pages whose part of speech is
    ``"noun"`` the first variant is handed to ``separate_form_template`` and
    the result is stored under the same key.

    Returns:
        A dict mapping form keys to the result of ``separate_form_template``;
        empty when nothing qualified.
    """
    pos = determine_pos(template_page, page_type="template")
    forms = {}
    for slot, raw in template_page["template"].items():
        if "основа" not in raw:
            continue

        # Several variants may share one slot; split them apart.
        if "<br>" in raw:
            variants = raw.split("<br>")
        elif "<br />" in raw:
            variants = raw.split("<br />")
        elif " " in raw:
            variants = raw.split()
        else:
            variants = [raw]

        first = variants[0]
        # NOTE(review): this is a membership test on the list of variants
        # (an element equal to "if"), not a substring test on the raw value.
        # Confirm whether a substring check was intended.
        if "if" in variants or "основа" not in first:
            continue
        if pos == "noun":
            forms[slot] = separate_form_template(first)
    return forms
# Irregular noun endings mapped to their morphemes.  The segmented form is
# the separator followed by the morphemes joined with the separator.
_NOUN_ENDING_MORPHEMES = {
    "еняток": ("ен", "ят", "ок"),
    "енят": ("ен", "ят"),
    "енятки": ("ен", "ят", "к", "и"),
    "еняткам": ("ен", "ят", "к", "ам"),
    # The separator between "ам" and "и" was missing here, unlike in the
    # parallel "енятами" entry below.
    "енятками": ("ен", "ят", "к", "ам", "и"),
    "енятках": ("ен", "ят", "к", "ах"),
    "енята": ("ен", "ят", "а"),
    "енятам": ("ен", "ят", "ам"),
    "енятами": ("ен", "ят", "ам", "и"),
    "енятах": ("ен", "ят", "ах"),
    # Rewritten to "ями"; presumably a typo in the source data.
    "яьми": ("ями",),
    # NOTE(review): kept as a single morpheme, exactly as before ("ев" + "и"
    # with no separator).  Confirm whether "ев" + sep + "и" was intended.
    "еви": ("еви",),
}


def process_ending_noun(ending, *, sep: str = SEGMENT_SEPARATOR):
    """Insert morpheme separators into a noun ending.

    Rules are applied in this order:

    1. Endings listed in ``_NOUN_ENDING_MORPHEMES`` use their fixed split.
    2. Endings longer than two characters starting with ``ён`` are split
       after ``ён``.
    3. A one-character ending is returned unchanged when it is ``ь`` or
       ``ъ``; otherwise it is prefixed with the separator.
    4. Endings longer than two characters starting with ``иц`` keep ``иц``
       unseparated from what precedes and are split after it.
    5. Otherwise a leading ``ь``/``и`` is followed by a separator, a leading
       ``к``/``ц``/``н`` is surrounded by separators, and anything else is
       simply prefixed with the separator.

    Args:
        ending: The ending to segment.
        sep: Separator placed between morphemes.

    Returns:
        The segmented ending.  An empty ending is returned unchanged
        (previously it raised ``IndexError``).
    """
    if not ending:
        return ending

    morphemes = _NOUN_ENDING_MORPHEMES.get(ending)
    if morphemes is not None:
        return sep + sep.join(morphemes)

    if len(ending) > 2 and ending.startswith("ён"):
        return sep + "ён" + sep + ending[2:]

    if len(ending) == 1:
        return ending if ending in "ьъ" else sep + ending

    if len(ending) > 2 and ending.startswith("иц"):
        return "иц" + sep + ending[2:]

    head, tail = ending[0], ending[1:]
    if head in "ьи":
        return head + sep + tail
    if head in "кцн":
        return sep + head + sep + tail
    return sep + ending
def process_ending(ending: str, pos: str, *, sep: str = SEGMENT_SEPARATOR) -> str:
    """Segment an ending into morphemes for the given part of speech.

    Steps, in order:

    1. An empty ending is returned unchanged.
    2. A trailing "е" written with a separate combining diaeresis (U+0308)
       is replaced by the single character "ё".
    3. For ``pos == "noun"`` the ending is segmented by
       ``process_ending_noun``; other values of ``pos`` skip this step.
    4. A trailing ``ся``/``сь`` on an ending longer than two characters is
       split off with the separator.
    5. The result is passed through ``remove_duplicate_seps``.

    Args:
        ending: The raw ending.
        pos: Part of speech of the template the ending belongs to.
        sep: Separator placed between morphemes.

    Returns:
        The segmented ending.
    """
    if not ending:
        return ending

    # Broken unicode: the old check compared one character with this
    # two-code-point literal (never true) and then assigned into a str
    # (TypeError).  The first entry is the literal the code used before;
    # the canonical decomposed order "е" + U+0308 is accepted as well.
    broken_yo = ('̈е', "\u0435\u0308")
    if ending.endswith(broken_yo):
        ending = ending[:-2] + "ё"

    if pos == "noun":
        ending = process_ending_noun(ending, sep=sep)

    if len(ending) > 2 and ending.endswith(("ся", "сь")):
        ending = ending[:-2] + sep + ending[-2:]

    return remove_duplicate_seps(ending, sep=sep)
# Export processed noun templates as JSON Lines (one template per line) to
# PROCESSED_DIR / "templates.jsonl".
redirects = read_redirects(WRITE_PATHS["template_redirect"])  # not used in the code visible here
full_noun_templates = []  # not used in the code visible here
broken_noun_templates = []  # not used in the code visible here
noun_templates = basic_json_read(WRITE_PATHS["template"])

destination = pathlib.Path(PROCESSED_DIR)
destination.mkdir(parents=True, exist_ok=True)

# The records are dumped with ensure_ascii=False, so the file holds raw
# non-ASCII text.  Pin the encoding so the output does not depend on the
# platform locale and cannot fail with UnicodeEncodeError.
with open(destination / "templates.jsonl", "w", encoding="utf-8") as file:
    for template in noun_templates:
        if determine_pos(template, page_type="template") != "noun":
            continue
        template_id = template["id"]
        template_title = template["title"]
        if not is_usable_template(template):
            continue

        temp = process_template(template)
        # Each form maps its stem name to the segmented ending.
        final_temp = {
            key: {value[0]: process_ending(value[1], "noun")}
            for key, value in temp.items()
        }
        if final_temp:
            record = {
                "id": template_id,
                "title": template_title,
                "template": final_temp,
            }
            file.write(json.dumps(record, ensure_ascii=False) + "\n")