#!/usr/bin/env python3
#
# Main program for extracting a dictionary from wiktionary. This has
# mostly been used with enwiktionary, but should be usable with other
# wiktionaries as well.
#
# Copyright (c) 2018-2023 Tatu Ylonen. See LICENSE and https://ylonen.org
#
# For pre-extracted data files, see https://kaikki.org/dictionary/
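#
# Typical invocations (illustrative only; the dump file name and output
# paths below are placeholders, and the flags correspond to the --options
# defined later in this script):
#
#   python wiktwords enwiktionary-<date>-pages-articles.xml.bz2 \
#       --all --all-languages --out wiktextract-data.json
#
#   python wiktwords --cache /tmp/wikt-cache --page pages/Words/th/the.txt \
#       --out the.json
#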
import os
import re
import sys
import html
import json
import pstats
import hashlib
import cProfile
import argparse
import collections
from wikitextprocessor import Wtp
from wiktextract.inflection import set_debug_cell_text
from wiktextract import (WiktionaryConfig, parse_wiktionary,
                         reprocess_wiktionary, parse_page,
                         extract_namespace)
from wiktextract import extract_thesaurus_data
from wiktextract import extract_categories
# Pages whose titles have any of these prefixes are ignored.
IGNORE_PREFIXES = None

# Pages with these prefixes are captured.
RECOGNIZED_PREFIXES = None
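
# Populate the two prefix sets above from the namespace names known to the
# parsing context (ctx.NAMESPACE_DATA, which may be localized for
# non-English dumps).  The sets are filled only on the first call.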
def init_prefixes(ctx: Wtp) -> None:
    global IGNORE_PREFIXES, RECOGNIZED_PREFIXES
    if IGNORE_PREFIXES is None:
        IGNORE_PREFIXES = {
            ctx.NAMESPACE_DATA.get("Index", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Help", {}).get("name"),
            ctx.NAMESPACE_DATA.get("MediaWiki", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Citations", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Concordance", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Rhymes", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Thread", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Summary", {}).get("name"),
            ctx.NAMESPACE_DATA.get("File", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Transwiki", {}).get("name"),
        }
    if RECOGNIZED_PREFIXES is None:
        RECOGNIZED_PREFIXES = {
            ctx.NAMESPACE_DATA.get("Category", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Appendix", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Project", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Thesaurus", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Module", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Template", {}).get("name"),
            ctx.NAMESPACE_DATA.get("Reconstruction", {}).get("name"),
        }

def capture_page(model, orig_title, text, pages_dir):
    """Checks if the page needs special handling (and maybe saving).
    Returns True if the page should be processed normally as a
    dictionary entry."""
    assert isinstance(model, str)
    assert isinstance(orig_title, str)
    assert isinstance(text, str)
    assert pages_dir is None or isinstance(pages_dir, str)

    analyze = True
    title = orig_title
    m = re.match(r"^([A-Z][a-z][-a-zA-Z0-9_]+):(.+)$", title)
    if not m:
        # Shorten overly long titles and add a hash so paths stay unique
        if len(title) > 100:
            h = hashlib.sha256()
            h.update(title.encode("utf-8"))
            title = title[:100] + "-" + h.hexdigest()[:10]
        title = "Words:" + title[:2] + "/" + title
        analyze = True
    else:
        prefix, tail = m.groups()
        if prefix in IGNORE_PREFIXES:
            analyze = False
        elif prefix not in RECOGNIZED_PREFIXES:
            print("UNRECOGNIZED PREFIX", title)
            analyze = False

    if pages_dir is not None:
        title = re.sub(r"//", "__slashslash__", title)
        title = re.sub(r":", "/", title)
        path = pages_dir + "/" + title + ".txt"
        path = re.sub(r"/\.+", lambda m: re.sub(r"\.", "__dot__", m.group(0)),
                      path)
        path = re.sub(r"//+", "/", path)
        dirpath = os.path.dirname(path)
        try:
            os.makedirs(dirpath, exist_ok=True)
            with open(path, "w", encoding="utf-8") as f:
                f.write("TITLE: {}\n".format(orig_title))
                text = html.unescape(text)
                f.write(text)
        except OSError as err:
            print("OSError: {}, "
                  "when writing file name {!r}, for "
                  "title: {!r}".format(err, path, orig_title))

    return analyze

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Multilingual Wiktionary data extractor")
    parser.add_argument("path", type=str, nargs="?", default=None,
                        help="Input file (.../enwiktionary-<date>-"
                             "pages-articles.xml.bz2)")
    parser.add_argument("--out", type=str, default=None,
                        help="Path where to write output (- for stdout)")
    parser.add_argument("--errors", type=str,
                        help="File in which to save error information")
    parser.add_argument("--dump-file-language-code", type=str, default="en",
                        help="Language code of the dump file.")
    parser.add_argument("--language", type=str, action="append", default=[],
                        help="Language code to capture (can specify multiple "
                             "times, defaults to English [en] and "
                             "Translingual [mul])")
    parser.add_argument("--all-languages", action="store_true", default=False,
                        help="Extract words for all languages")
    parser.add_argument("--list-languages", action="store_true", default=False,
                        help="Print list of supported languages")
    parser.add_argument("--pages-dir", type=str, default=None,
                        help="Directory under which to save all pages")
    parser.add_argument("--all", action="store_true", default=False,
                        help="Capture everything for the selected languages")
    parser.add_argument("--translations", action="store_true", default=False,
                        help="Capture translations")
    parser.add_argument("--pronunciations", action="store_true", default=False,
                        help="Capture pronunciation information")
    parser.add_argument("--linkages", action="store_true", default=False,
                        help="Capture linkages (hypernyms, synonyms, etc)")
    parser.add_argument("--compounds", action="store_true", default=False,
                        help="Capture compound words using each word")
    parser.add_argument("--redirects", action="store_true", default=False,
                        help="Capture redirects")
    parser.add_argument("--examples", action="store_true", default=False,
                        help="Capture usage examples")
    parser.add_argument("--etymologies", action="store_true", default=False,
                        help="Capture etymologies")
    parser.add_argument("--inflections", action="store_true", default=False,
                        help="Capture inflection tables")
    parser.add_argument("--statistics", action="store_true", default=False,
                        help="Print statistics")
    parser.add_argument("--page", type=str,
                        help="Parse a single Wiktionary page (for debugging)")
    parser.add_argument("--cache", type=str,
                        help="File prefix where phase1 results are saved; "
                             "speeds up processing a single page tremendously")
    parser.add_argument("--num-threads", type=int, default=None,
                        help="Number of parallel processes (default: #cpus)")
    parser.add_argument("--verbose", action="store_true", default=False,
                        help="Print verbose status messages (for debugging)")
    parser.add_argument("--human-readable", action="store_true", default=False,
                        help="Write output in human-readable JSON")
    parser.add_argument("--override", type=str, action="append",
                        help="Override module by one in file (for debugging)")
    parser.add_argument("--use-thesaurus", action="store_true", default=False,
                        help="Include thesaurus in single page mode")
    parser.add_argument("--profile", action="store_true", default=False,
                        help="Enable CPU time profiling")
    parser.add_argument("--categories-file", type=str,
                        help="Extract category tree as JSON in this file")
    parser.add_argument("--modules-file", type=str,
                        help="Extract all modules and save in this .tar file")
    parser.add_argument("--templates-file", type=str,
                        help="Extract all templates and save in this .tar file")
    parser.add_argument("--redirects-file", type=str,
                        help="Optional file containing sound file redirect "
                             "names from Wikimedia Commons and what "
                             "they point to")
    parser.add_argument("--inflection-tables-file", type=str, default=None,
                        help="Extract expanded tables in this file "
                             "(for test data)")
    parser.add_argument("--debug-cell-text", type=str, default=None,
                        help="Print out debug messages when encountering "
                             "this text")
    args = parser.parse_args()
    if args.debug_cell_text:
        # Importing debug_cell_text from wiktextract.inflection does not
        # work because the debug_cell_text here would be only a reference,
        # and assigning to it just changes the thing it is pointing at.
        # Instead of importing the whole inflection module and doing
        # wiktextract.inflection.debug_cell_text = ..., a simple setter
        # function does the same thing.
        set_debug_cell_text(args.debug_cell_text)

    # The --all option turns on capturing all data types
    if args.all and (not args.pages_dir or args.out):
        args.translations = True
        args.pronunciations = True
        args.linkages = True
        args.compounds = True
        args.redirects = True
        args.examples = True
        args.etymologies = True
        args.inflections = True

    # Default to English and Translingual if language not specified.
    if not args.language:
        args.language = ["en", "mul"]
    if args.all_languages:
        args.language = None
        print("Capturing words for all available languages")
    else:
        print("Capturing words for:", ", ".join(args.language))

    if args.num_threads and args.num_threads > 1:
        import multiprocessing
        if multiprocessing.get_start_method() == "spawn":
            print("--num-threads not supported on this OS (no stable "
                  "implementation of fork() available)")
            sys.exit(1)
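
    # Note: unless writing to stdout or a /dev/ path, output goes first to
    # "<out>.tmp" and is renamed over the final path only after processing
    # finishes (see the os.rename() call near the end of this script).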
    # Open output file.
    out_path = args.out
    if not out_path and args.pages_dir:
        out_f = None
    elif out_path and out_path != "-":
        if out_path.startswith("/dev/"):
            out_tmp_path = out_path
        else:
            out_tmp_path = out_path + ".tmp"
        out_f = open(out_tmp_path, "w", buffering=1024 * 1024,
                     encoding="utf-8")
    else:
        out_tmp_path = out_path
        out_f = sys.stdout
    word_count = 0

    config = WiktionaryConfig(
        dump_file_lang_code=args.dump_file_language_code,
        capture_language_codes=args.language,
        capture_translations=args.translations,
        capture_pronunciation=args.pronunciations,
        capture_linkages=args.linkages,
        capture_compounds=args.compounds,
        capture_redirects=args.redirects,
        capture_examples=args.examples,
        capture_etymologies=args.etymologies,
        capture_inflections=args.inflections,
        verbose=args.verbose,
        expand_tables=args.inflection_tables_file)
    if args.language:
        new_lang_codes = []
        for x in args.language:
            if x not in config.LANGUAGES_BY_CODE:
                if x in config.LANGUAGES_BY_NAME:
                    new_lang_codes.append(config.LANGUAGES_BY_NAME[x])
                else:
                    print("Invalid language:", x)
                    sys.exit(1)
            else:
                new_lang_codes.append(x)
        config.capture_language_codes = new_lang_codes

    if args.language:
        lang_names = []
        for x in args.language:
            if x in config.LANGUAGES_BY_CODE:
                lang_names.extend(config.LANGUAGES_BY_CODE[x])
            else:
                lang_names.extend(config.LANGUAGES_BY_CODE[
                    config.LANGUAGES_BY_NAME[x]])
        lang_names = [re.escape(x) for x in lang_names]
        lang_names_re = r"==\s*("
        lang_names_re += "|".join(lang_names)
        lang_names_re += r")"
        lang_names_re = re.compile(lang_names_re)
    # Create expansion context
    ctx = Wtp(cache_file=args.cache, num_threads=args.num_threads,
              lang_code=args.dump_file_language_code,
              languages_by_code=config.LANGUAGES_BY_CODE)

    # We are now having problems with "Module:no globals", which causes
    # infinite Python recursion on a number of pages (it may be a sandbox
    # problem that it is not properly reset).  Override that debugging
    # module to be empty.
    ctx.add_page("Scribunto", "Module:no globals", "", transient=True)
    # If --list-languages has been specified, just print the list of
    # supported languages
    if args.list_languages:
        print("Supported languages:")
        for lang_name, lang_code in config.LANGUAGES_BY_NAME.items():
            print(f"  {lang_name}: {lang_code}")
        sys.exit(0)

    if not args.path and not args.cache:
        print("The PATH argument for wiktionary dump file is normally "
              "mandatory.")
        print("Alternatively, --cache with --page can be used.")
        sys.exit(1)
    if args.override:
        for path in args.override:
            with open(path, "r", encoding="utf-8") as f:
                text = f.read()
            m = re.match(r"(?s)^TITLE: ([^\n]*)\n", text)
            if m:
                title = m.group(1)
                text = text[m.end():]
            else:
                print("First line of file supplied with --override must be "
                      "\"TITLE: <page title>\"")
                print("(The page title for this would normally start "
                      "with Module:)")
                sys.exit(1)
            # Load it as a transient page, overriding the normal page
            ctx.add_page("Scribunto", title, text, transient=True)
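
    # Callback invoked once for every extracted word entry; writes each
    # entry as a JSON object (one per line unless --human-readable) and
    # flushes periodically so progress is visible in the output file.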
    def word_cb(data):
        global word_count
        word_count += 1
        if out_f is not None:
            if args.human_readable:
                out_f.write(json.dumps(data, indent=2, sort_keys=True,
                                       ensure_ascii=False))
            else:
                out_f.write(json.dumps(data, ensure_ascii=False))
            out_f.write("\n")
            if not out_path or out_path == "-" or word_count % 1000 == 0:
                out_f.flush()
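
    # Callback invoked for every raw page in the dump; delegates to
    # capture_page() above, which optionally saves the wikitext under
    # --pages-dir and tells the parser whether to analyze the page.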
    def capture_cb(model, title, text):
        return capture_page(model, title, text, args.pages_dir)
    # Load redirects to ctx if given
    if args.redirects_file:
        with open(args.redirects_file) as f:
            config.redirects = json.load(f)

    if args.profile:
        pr = cProfile.Profile()
        pr.enable()

    init_prefixes(ctx)
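
    # Main processing: either a full dump parse (args.path), a single page
    # (args.page, typically together with --cache), or a reprocessing pass
    # from an existing cache file when neither is given.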
    try:
        if args.path:
            # Parse the normal full Wiktionary data dump
            parse_wiktionary(ctx, args.path, config, word_cb, capture_cb,
                             (args.page is not None),  # phase1_only
                             (args.pages_dir is not None and
                              not args.out))  # dont_parse

        if args.page:
            # Parse a single Wiktionary page (extracted using --pages-dir)
            if not args.cache:
                print("NOTE: you probably want to use --cache with --page or "
                      "otherwise processing will be very slow.")
            # Load the page wikitext from the given file
            with open(args.page, "r", encoding="utf-8") as f:
                text = f.read()
            m = re.match(r"(?s)^TITLE: ([^\n]*)\n", text)
            if m:
                title = m.group(1)
                text = text[m.end():]
            else:
                title = "Test page"
            # Extract Thesaurus data (this is a bit slow for a single page,
            # but needed for debugging linkages with thesaurus extraction).
            # This is disabled by default to speed up single page testing.
            if args.use_thesaurus:
                config.thesaurus_data = extract_thesaurus_data(ctx, config)
            # Parse the page
            ret = parse_page(ctx, title, text, config)
            for x in ret:
                word_cb(x)
            # Merge errors from ctx to config, so that we can also use
            # --errors with single page extraction
            config.merge_return(ctx.to_return())

        if not args.path and not args.page:
            # Parse again from the cache file
            reprocess_wiktionary(ctx, config, word_cb, capture_cb,
                                 dont_parse=(bool(args.pages_dir) and
                                             not bool(args.out)))
    finally:
        if out_path and out_path != "-" and out_f is not None:
            out_f.close()
    if args.modules_file:
        extract_namespace(ctx, "Module", args.modules_file)
    if args.templates_file:
        extract_namespace(ctx, "Template", args.templates_file)

    if args.categories_file:
        print("Extracting category tree")
        tree = extract_categories(ctx, config)
        sys.stdout.flush()
        with open(args.categories_file, "w") as f:
            json.dump(tree, f, indent=2, sort_keys=True)

    if args.profile:
        pr.disable()
        ps = pstats.Stats(pr).sort_stats(pstats.SortKey.CUMULATIVE)
        ps.print_stats()

    if out_f is not None and out_path != out_tmp_path:
        try:
            os.remove(out_path)
        except FileNotFoundError:
            pass
        os.rename(out_tmp_path, out_path)
    if args.statistics:
        print("")
        print("LANGUAGE COUNTS")
        for k, cnt in sorted(config.language_counts.items(),
                             key=lambda x: -x[1]):
            print("  {:>7d} {}".format(cnt, k))
            if cnt < 1000:
                break
        print("  ...")
        print("")
        print("")
        print("POS HEADER USAGE")
        for k, cnt in sorted(config.pos_counts.items(),
                             key=lambda x: -x[1]):
            print("  {:>7d} {}".format(cnt, k))
        print("")
        print("POS SUBSECTION HEADER USAGE")
        for k, cnt in sorted(config.section_counts.items(),
                             key=lambda x: -x[1]):
            print("  {:>7d} {}".format(cnt, k))
        print("")
        print("{} WORDS CAPTURED".format(word_count))
    if args.errors:
        with open(args.errors, "w", encoding="utf-8") as f:
            json.dump({
                "errors": config.errors,
                "warnings": config.warnings,
                "debugs": config.debugs,
            }, f, sort_keys=True, indent=2)
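

# The helpers below print simple frequency listings (a count per key plus
# sample values).  They are not called from the main flow above and appear
# to be left over from earlier debugging/statistics code.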
def dump_un(title, limit, counts, samples):
    counts_ht = {}
    for k, v in counts.items():
        counts_ht[k] = v
    lst = list(sorted(counts_ht.items(), reverse=True,
                      key=lambda x: x[1]))
    if lst:
        print(title)
        for k, cnt in lst[:limit]:
            print("{:5d} {}".format(cnt, k))
            for kk, vv in samples[k].items():
                for sample in vv:
                    print("    {}".format(sample))

def dump_val(title, limit, counts):
    counts_ht = collections.defaultdict(int)
    for la, la_val in counts.items():
        for name, name_val in la_val.items():
            for value, cnt in name_val.items():
                counts_ht[name, value] += cnt
    for la, la_val in counts.items():
        for name, name_val in la_val.items():
            for value, v in name_val.items():
                counts_ht[name, value] = v
    lst = list(sorted(counts_ht.items(), reverse=True,
                      key=lambda x: x[1]))
    if lst:
        print("")
        print(title)
        for (k, kk), v in lst[:limit]:
            print("  {:5d} {}: {}".format(v, k, kk))