#!/usr/bin/env python3
"""
This script is meant to serve as a functional example of how the xigt
library can be used to process Xigt-XML files.

:author: Ryan Georgi <[email protected]>
"""

import glob
import re
from argparse import ArgumentParser
from collections import defaultdict

from xigt import xigtpath
from xigt.codecs import xigtxml


def wordlist(filelist, gloss=None, meta=None):
    r"""
    This function takes a list of Xigt-XML ODIN files, looks for the
    'normalized' ODIN tier, and counts the contents of all gloss and
    meta lines. It tokenizes simply by matching runs of word characters
    (using regex's `\w` escape) so as to pull out hyphenated and dotted
    gloss-line tokens.

    The output is written to file as a wordlist, reverse-sorted by count.

    :param filelist: List of input files to process.
    :type filelist: list[str]
    :param gloss: Path to use for the output gloss wordlist.
    :type gloss: str
    :param meta: Path to use for the output meta wordlist.
    :type meta: str
    """
    gloss_words = defaultdict(int)
    meta_words = defaultdict(int)

    # -------------------------------------------
    # Iterate over all the paths in the list of files.
    # -------------------------------------------
    for path in filelist:
        with open(path, 'r', encoding='utf-8') as f:
            # Load the XigtCorpus, using transient mode (the most
            # memory-efficient option).
            xc = xigtxml.load(f, mode='transient')

            # Now, iterate over each `Igt` instance in each file.
            for igt in xc:
                # Use a xigtpath expression to find the `tier` item that is
                # a child of this node, with state="normalized" as an
                # attribute.
                norm_tier = xigtpath.find(igt, './tier[@state="normalized"]')

                # Skip instances that lack a normalized tier rather than
                # crashing on None.
                if norm_tier is None:
                    continue

                # Next, since the `tag` attribute can be G+CR or M+AC etc.,
                # grab all lines with a tag that starts with the desired
                # tag letter.
                gloss_lines = [item for item in norm_tier
                               if item.attributes['tag'].startswith("G")]
                meta_lines = [item for item in norm_tier
                              if item.attributes['tag'].startswith("M")]
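
                # (For reference: the tag letters mark ODIN line types, so a
                # normalized tier might hold items tagged "L", "G+CR",
                # "M+AC", etc.; only the G* and M* lines are kept here.)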

                # Define a local function to update the wordlists for gloss
                # and meta lines.
                def update_count(lines, words):
                    for line in lines:
                        if line.value():
                            for w in line.value().split():
                                for sub_w in re.findall(r'\w+', w):  # <-- tokenize
                                    if sub_w.strip():
                                        words[sub_w.lower()] += 1  # <-- lowercase, and add
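
                # For example, a hypothetical gloss token "3SG.M-eat" would
                # increment the counts for '3sg', 'm', and 'eat'.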

                # Update the counts.
                update_count(gloss_lines, gloss_words)
                update_count(meta_lines, meta_words)

    # Define a function to write out the wordlist objects to files.
    # Here, we reverse-sort by frequency of the word, and tab-delimit
    # the columns.
    def write_items(words, path):
        if path:
            with open(path, 'w', encoding='utf-8') as f:
                items = sorted(words.items(),
                               key=lambda x: (x[1], x[0]),
                               reverse=True)
                for w, count in items:
                    f.write('{}\t{}\n'.format(w, count))

    write_items(gloss_words, gloss)
    write_items(meta_words, meta)
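
    # Each line of an output wordlist is tab-delimited, e.g. (with a
    # hypothetical count): "the\t42".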


# -------------------------------------------
# Main
# -------------------------------------------
if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('INFILES', nargs='+')
    p.add_argument('-g', '--gloss', help='Path for the output gloss wordlist.')
    p.add_argument('-m', '--meta', help='Path for the output meta wordlist.')

    args = p.parse_args()
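
    # Expand any glob patterns in the arguments ourselves, so that quoted
    # or shell-unexpanded wildcards are still handled.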
    filelist = []
    for infile in args.INFILES:
        filelist.extend(glob.glob(infile))

    wordlist(filelist, args.gloss, args.meta)