-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert.py
executable file
·219 lines (198 loc) · 10.4 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/python3
import fileinput
import re
import sys
import csv
def is_even(n):
    """Return True when n leaves no remainder on division by 2."""
    remainder = n % 2
    return remainder == 0
def is_odd(n):
    """Return True when n leaves a non-zero remainder on division by 2."""
    remainder = n % 2
    return remainder != 0
def fix_badly_wrapped_lines(lastline, line):
    """
    Decide whether `line` yields a complete entry, re-joining it with `lastline` if it was wrapped.

    Returns a tuple of (bool, str):
      * (True, text)  -- `text` is a complete entry: either `line` on its own (right-stripped), or
        `lastline` + `line` joined without a space when a pronunciation was split across the two.
      * (False, text) -- `line` has no slashes, so it is a continuation; `text` is `lastline` and
        `line` joined with a single space.
      * (False, "")   -- `line` is the first half of a split pronunciation; nothing usable yet.

    The bool is used in the processing for loop, because we wait until the *next* complete line to
    process the *previous* complete line.
    """
    # Some lines have two pronunciations, therefore four slashes.
    # Counting a literal character needs no regex; str.count does it directly.
    slash_count = line.count("/")

    # A line with no slashes at all was part of the previous line
    if slash_count == 0:
        return False, lastline.rstrip() + " " + line.lstrip()

    # Any other line with an even number of slashes is probably okay
    if slash_count % 2 == 0:
        return True, line.rstrip()

    # A line with an odd number of slashes is tricky, because we might have two lines like this:
    # word /long-pronunciation-that-
    # got-split-across-lines/ definition
    # The rule that works with our input data is that if the previous line had an even number of
    # slashes, it was complete, so this line is the first line of a pair. But if the previous line
    # had an odd number of slashes, this line is the second line of a pair and now we have the complete text.
    if lastline.count("/") % 2 != 0:
        # Last line plus this line make a complete pair that can now be processed. We do NOT add a space
        # in this scenario, since pronunciation splits almost never should have had spaces in them.
        return True, lastline.rstrip() + line.lstrip()

    # Don't process this line yet since it's incomplete
    return False, ""
def fix_typoes(line):
    """Return `line` with the known "/ឬ " typo expanded to "/ ឬ "."""
    # There is one line containing "/ឬ " where it should have been "/ ឬ " with an extra space. This becomes
    # important in our pronunciation handling later.
    cramped = "/ឬ "
    spaced = "/ ឬ "
    return spaced.join(line.split(cramped))
most_recent_headword = ""
most_recent_sense_number = ""
def process(line):
global most_recent_headword
global most_recent_sense_number
# Break line down into headword, pronunciation, etc.
headword = ""
sense_number = ""
subentry = ""
pronunciation = ""
grammatical_info = ""
parenthesis_contents = ""
definition = ""
# First, extract pronunciation, taking into account that there could be more than one
parts = re.split("/", line)
if len(parts) > 1:
before = parts[0]
middle = "".join(parts[1:-1])
after = parts[-1]
pronunciation = middle.strip()
# Extract sense number if present
parts = re.split(r"(\d+)", before.rstrip(), 1) # \d also matches [០-៩] thanks to Unicode
if len(parts) > 1:
before, middle, _ = parts
headword = before
sense_number = middle.strip()
else:
headword = parts[0]
# If headword has leading spaces, this is a subentry
if re.match(r"^\s", headword):
subentry = headword.strip()
headword = most_recent_headword
sense_number = most_recent_sense_number
else:
headword = headword.strip()
most_recent_headword = headword
most_recent_sense_number = sense_number
# TODO: Detect examples (marked by ឧ.) in either source/etymology *or* definition, and strip them out.
# If a parenthesis had nothing but an example in it, then source/etymology should be empty and that goes under "example" instead.
# Extract source/etymology (found in parentheses) if present
parts = re.split(r"\((.*)\)", after.strip(), 1)
if len(parts) > 1:
before, middle, after = parts
grammatical_info = before.strip()
parenthesis_contents = middle.strip()
# A few lines have multiple parentheses, and the above regex is greedy
parenthesis_contents = parenthesis_contents.replace("(", "").replace(")", "")
definition = after.strip()
else:
# Some lines have "..." in them, so we need to search for a dot NOT preceded or followed by another dot
dots_matches = list(re.finditer(r"((?<!\.)\.(?!\.))", parts[0]))
if len(dots_matches) > 0:
last_match = dots_matches[-1]
grammatical_info = last_match.string[:last_match.end(1)]
definition = last_match.string[last_match.end(1):].strip()
else:
# No grammatical info found
definition = parts[0].strip()
else:
# Any lines that still don't contain a slash are not actual input data
return None
return (headword, sense_number, subentry, pronunciation, grammatical_info, parenthesis_contents, definition) # TODO: Also return examples
def run_test(input, expected_output):
    """
    Run process() on `input` and abort the program if the result is not `expected_output`.

    On a mismatch, a diagnostic is printed to stderr and the interpreter exits with status 2.
    Returns None when the result matches.
    """
    output = process(input)
    if output != expected_output:
        print("Unexpected results from", input, ": got", output, "and expected", expected_output, file=sys.stderr)
        # sys.exit() instead of the builtin exit(): the latter is injected by the site module for
        # interactive use and is not guaranteed to exist (e.g. under `python -S`).
        sys.exit(2)
def output_entry_details(outfile, pronunciation, grammatical_info, parenthesis_contents, definition):
    """
    Write the detail lines of one entry or subentry to `outfile`.

    Each non-empty field is printed on its own line behind its marker, in this order:
    \\ph (pronunciation), \\ps (grammatical info), \\etym (parenthesis contents), \\de (definition).
    Empty fields are skipped.
    """
    # TODO: Also handle examples
    marked_fields = (
        (r"\ph {}", pronunciation),
        (r"\ps {}", grammatical_info),
        (r"\etym {}", parenthesis_contents),
        (r"\de {}", definition),
    )
    for template, value in marked_fields:
        if value:
            print(template.format(value), file=outfile)
# The main entry that is still waiting for possible subentries, as
# (headword, sense_number, pronunciation, grammatical_info, parenthesis_contents, definition).
previous_headword_for_sfm = ("", "", "", "", "", "")  # TODO: Add seventh field here once we handle examples
# Subentries collected so far for the pending main entry, each as
# (subentry, pronunciation, grammatical_info, parenthesis_contents, definition).
previous_subentries = []


def output_to_sfm(outfile, headword, sense_number, subentry, pronunciation, grammatical_info, parenthesis_contents, definition):  # TODO: Add example parameter as well
    """
    Buffer one parsed entry and write the previously buffered entry to `outfile` once it is complete.

    A subentry whose headword and sense number match the pending main entry is appended to that
    entry's subentry list and nothing is written. Any other call writes the pending entry
    (\\lx, optional \\sn, its details, then each \\se subentry with its details, then a blank line)
    and makes the arguments of this call the new pending entry.

    The pending entry is only written if it holds data: the all-empty placeholder that exists before
    the first real entry (and the all-None record left behind by a final flush call) is skipped, so
    the output no longer starts with a spurious empty "\\lx " record.

    To flush the last entry, call once more with None for every field.
    """
    global previous_headword_for_sfm
    global previous_subentries

    if subentry and headword == previous_headword_for_sfm[0] and sense_number == previous_headword_for_sfm[1]:
        previous_subentries.append((subentry, pronunciation, grammatical_info, parenthesis_contents, definition))
        return

    # New headword, so we're done adding subentries. Print out the previous one,
    # unless it is just the empty placeholder with no subentries attached.
    if any(previous_headword_for_sfm) or previous_subentries:
        (prev_headword, prev_sense_number, prev_pronunciation, prev_grammatical_info, prev_parenthesis_contents, prev_definition) = previous_headword_for_sfm
        print(r"\lx {}".format(prev_headword), file=outfile)
        if prev_sense_number:
            print(r"\sn {}".format(prev_sense_number), file=outfile)
        output_entry_details(outfile, prev_pronunciation, prev_grammatical_info, prev_parenthesis_contents, prev_definition)
        for pending_sub in previous_subentries:
            (sub_lexeme, sub_pronunciation, sub_grammatical_info, sub_parenthesis_contents, sub_definition) = pending_sub  # TODO: Also handle examples
            print(r"\se {}".format(sub_lexeme), file=outfile)
            output_entry_details(outfile, sub_pronunciation, sub_grammatical_info, sub_parenthesis_contents, sub_definition)
        print("", file=outfile)

    previous_headword_for_sfm = (headword, sense_number, pronunciation, grammatical_info, parenthesis_contents, definition)
    previous_subentries = []
def process_input():
    """
    Convert every input file named on the command line (via fileinput) to CSV and SFM.

    For each input file, the output paths are the input path with ".txt" replaced by ".csv" and
    ".sfm". Leading lines without a slash are skipped as intro text, blank lines are skipped, badly
    wrapped lines are re-joined, and each completed line is parsed with process() and written to
    both outputs. Does nothing if there is no input at all.

    NOTE(review): if an input path does not contain ".txt", both output paths equal the input path
    and opening them for writing would truncate the input -- confirm callers always pass .txt files.
    NOTE(review): lastline / last_complete_line and the SFM entry buffered by output_to_sfm() are
    carried across file boundaries, so with several input files the final entry of one file is
    written to the outputs of the next file.
    """
    lastline = ""
    last_complete_line = ""
    outfile_csv = None
    outfile_sfm = None
    csvwriter = None
    at_start = True
    try:
        for line in fileinput.input():
            if fileinput.isfirstline():
                at_start = True
                csv_fname = fileinput.filename().replace(".txt", ".csv")
                sfm_fname = fileinput.filename().replace(".txt", ".sfm")
                # Close the outputs of the previous input file before opening new ones
                if outfile_csv is not None:
                    outfile_csv.close()
                if outfile_sfm is not None:
                    outfile_sfm.close()
                # newline="" lets the csv module control line endings itself
                outfile_csv = open(csv_fname, "w", newline="")
                csvwriter = csv.writer(outfile_csv)
                csvwriter.writerow(("Headword", "Sense number", "Subentry", "Pronunciation", "Part of Speech", "Source/Etymology", "Definition"))  # TODO: Also examples
                outfile_sfm = open(sfm_fname, "w")
            if at_start:
                if "/" not in line:
                    continue  # Skip initial set of slash-less lines, as that's the intro text
                # Now we've seen a slash, so process this line and all subsequent lines
                at_start = False
            if not line.strip():
                continue  # Skip blank lines
            ready, maybe_fixed_line = fix_badly_wrapped_lines(lastline, line)
            fixed_line = fix_typoes(maybe_fixed_line)
            if ready:
                # The *previous* complete line can no longer grow, so it is safe to process it now
                result = process(last_complete_line)
                if result is not None:
                    csvwriter.writerow(result)
                    output_to_sfm(outfile_sfm, *result)
            # If fix_badly_wrapped_lines returned an empty string, then we're not supposed to overwrite the previous completed line
            if maybe_fixed_line:
                last_complete_line = fixed_line
            lastline = line

        if csvwriter is None:
            # No input lines at all: no output files were opened, so there is nothing to flush
            return

        # Don't forget to process the last complete line, which won't have been output by the loop above
        last_result = process(last_complete_line)
        if last_result is not None:
            csvwriter.writerow(last_result)
            output_to_sfm(outfile_sfm, *last_result)
        # And one last call to output_to_sfm() to flush the last entry since it's still waiting for subentries that won't be arriving
        output_to_sfm(outfile_sfm, None, None, None, None, None, None, None)  # TODO: Add an eighth None here once we handle examples
    finally:
        # Close whatever is still open, also when an error interrupts the conversion
        for handle in (outfile_csv, outfile_sfm):
            if handle is not None:
                handle.close()
# Sanity-check process() against known lines before converting anything; run_test() aborts the
# program on the first mismatch. Each case is (input line, expected process() result).
for case in (
    ("បើក ១ /បើក/ កិ. ធ្វើឱ្យច្រហ, ឱ្យមានផ្លូវ, មានទំនង...។",
     ('បើក', '១', '', 'បើក', 'កិ.', '', 'ធ្វើឱ្យច្រហ, ឱ្យមានផ្លូវ, មានទំនង...។')),
    ("បស្ចិមទិស /បាស់-ចិម-ទឹស/ ឬ /ប៉ាច់-ចិម-ម៉ៈ-ទឹស/ ន. ",
     ('បស្ចិមទិស', '', '', 'បាស់-ចិម-ទឹស ឬ ប៉ាច់-ចិម-ម៉ៈ-ទឹស', 'ន.', '', '')),
    ("ប្អ៊ឹះ /ប្អ៊ឹះ/ គុ. ឬ កិ.វិ.",
     ('ប្អ៊ឹះ', '', '', 'ប្អ៊ឹះ', 'គុ. ឬ កិ.វិ.', '', '')),
    # TODO: Add test to demonstrate handling examples (marked by ឧ.)
):
    run_test(*case)

# Self-tests passed, so convert the input files given on the command line
process_input()