phase4.py
import json
import logging
import os
import shutil
import subprocess
import tempfile
import time
from itertools import chain
from os.path import join

import pandas as pd

from lib.general_helpers import prepare_output_dir, find_cached_output

# Phase 4: Thesaurus preprocessing
# Depends on: Phase 2
# Input: Thesaurus by DATEV
# Output: File with a list of (word, synset) tuples
# Expected format of the thesaurus file:
# [ { "Class": string, "Concept": number, "Keys": [string], ... } ]
# See `ipynbs/02_thesaurus_stats.ipynb` for the steps to convert the original
# DATEV file into the required format.
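# Illustrative example of that format (hypothetical values, not taken from the
# real DATEV data; only entries with Class == "synonym" are used below):
# [
#   {"Class": "synonym", "Concept": 101, "Keys": ["Auto", "Wagen", "Fahrzeug"]},
#   {"Class": "antonym", "Concept": 102, "Keys": ["groß", "klein"]}
# ]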


def run_phase4(_config):
    _output_dir = prepare_output_dir(_config)
    try:
        run_phase4_core(_config["inputs"], _config["params"], _output_dir)
        return _output_dir
    except Exception:
        # Clean up the partially written output directory, then re-raise
        # with the original traceback intact.
        shutil.rmtree(_output_dir)
        raise


def run_phase4_core(_inputs, params, _output_dir):
    thesaurus_file_name = _inputs["thesaurus"]
    word_embeddings_file_name = _inputs["word_embeddings"]
    with open(thesaurus_file_name) as f:
        thesaurus_concepts = json.load(f)
    synonym_concepts = [c for c in thesaurus_concepts if c["Class"] == "synonym"]

    # Preprocessing steps 1 and 2
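    # Example (hypothetical keys): with all three actions enabled, the keys
    # ["Kfz-Steuer", "Auto Steuer", "Straße"] would shrink to ["strasse"]:
    # lowercasing first, then dropping keys containing a space or hyphen,
    # then replacing ß with ss.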
    for synonym_concept in synonym_concepts:
        if "transform_to_lowercase" in params["actions"]:
            synonym_concept["Keys"] = [key.lower() for key
                                       in synonym_concept["Keys"]]
        if "remove_keys_with_space_or_hyphen" in params["actions"]:
            synonym_concept["Keys"] = [key for key
                                       in synonym_concept["Keys"]
                                       if " " not in key
                                       and "-" not in key]
        elif "remove_keys_with_space_keep_hyphen" in params["actions"]:
            synonym_concept["Keys"] = [key for key
                                       in synonym_concept["Keys"]
                                       if " " not in key]
        if "replace_all_sz" in params["actions"]:
            synonym_concept["Keys"] = [key.replace("ß", "ss") for key
                                       in synonym_concept["Keys"]]
    synonym_concepts = [x for x in synonym_concepts if len(x["Keys"]) > 0]

    # Preprocessing step 3
    # We sort synonym_concepts by the number of keys per concept (ascending)
    # so that on the conversion to dict, a word occurring in several concepts
    # maps to the concept with the most keys.
    # Example: `dict([('B', 1), ('B', 2), ('A', 2), ('C', 3)])` maps 'B' to 2.
    # (Note: "sysnet" in the action name is a typo for "synset"; it is kept
    # as-is because existing configs reference this exact key.)
    if "create_n_1_mapping_largest_sysnet_wins_key" in params["actions"]:
        synonym_concepts = sorted(
            synonym_concepts,
            key=lambda _concept: len(_concept["Keys"]))
        list_of_keylists = [
            [(key, x["Concept"]) for key in x["Keys"]]
            for x in synonym_concepts]
        wordlist = list(chain.from_iterable(list_of_keylists))
        # wordset is the mapping we want: every key (word) appears exactly once
        wordset = dict(wordlist)  # {word: concept_number}
    else:
        raise NotImplementedError
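    # Worked example (hypothetical concepts): given
    #   {"Concept": 1, "Keys": ["bank"]} and
    #   {"Concept": 2, "Keys": ["bank", "kreditinstitut"]},
    # sorting ascending by key count yields [concept 1, concept 2], so the
    # later dict entry wins and "bank" maps to concept 2, the larger synset.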

    # Preprocessing step 4
    if "remove_keys_not_in_corpus" in params["actions"]:
        thesaurus_words_with_concepts = tempfile.NamedTemporaryFile(mode="w+")
        sorted_thesaurus_words_without_concepts = tempfile.NamedTemporaryFile("w+")
        for word, concept in wordset.items():
            thesaurus_words_with_concepts.write("{} {}\n".format(word, concept))
        thesaurus_words_with_concepts.flush()
        os.fsync(thesaurus_words_with_concepts.fileno())
        # cmd1: extract the word column of the thesaurus and sort it
        # so that `comm` below can operate on it
        cmd1 = subprocess.getoutput(
            "cat {} | awk '{{print $1;}}' | sort > {}".format(
                thesaurus_words_with_concepts.name,
                sorted_thesaurus_words_without_concepts.name))
        logging.info(time.strftime("%H:%M:%S") + " cmd1 " + cmd1)
        word_embeddings_sorted_word_list = tempfile.NamedTemporaryFile("w+")
        # cmd2: extract the sorted vocabulary of the word-embeddings file,
        # skipping its first (header) line
        cmd2 = subprocess.getoutput(
            "cat {} | tail -n +2 | awk '{{print $1;}}' | sort > {}".format(
                join(word_embeddings_file_name, "main.txt"),
                word_embeddings_sorted_word_list.name
            ))
        logging.info(time.strftime("%H:%M:%S") + " cmd2 " + cmd2)
        thesaurus_words_also_in_corpus = tempfile.NamedTemporaryFile("w+")
        # cmd3: keep only the words present in both sorted lists
        cmd3 = subprocess.getoutput(
            "comm -12 {} {} > {}".format(
                sorted_thesaurus_words_without_concepts.name,
                word_embeddings_sorted_word_list.name,
                thesaurus_words_also_in_corpus.name
            ))
        logging.info(time.strftime("%H:%M:%S") + " cmd3 " + cmd3)
        sorted_thesaurus_words_without_concepts.close()
        word_embeddings_sorted_word_list.close()
        thesaurus_words_also_in_corpus_with_concepts = tempfile.NamedTemporaryFile("w+")
        # cmd4: re-attach the concept numbers to the surviving words
        cmd4 = subprocess.getoutput(
            "awk 'NR==FNR{{a[$0]=$0}}NR>FNR{{if($1==a[$1])print $0}}' {} {} > {}".format(
                thesaurus_words_also_in_corpus.name,
                thesaurus_words_with_concepts.name,
                thesaurus_words_also_in_corpus_with_concepts.name
            ))
        logging.info(time.strftime("%H:%M:%S") + " cmd4 " + cmd4)
        thesaurus_words_also_in_corpus.close()
        thesaurus_words_with_concepts.close()
        thesaurus_after4 = pd.read_table(
            thesaurus_words_also_in_corpus_with_concepts.name,
            sep=" ",
            header=None,
            names=["word", "synset"],
            index_col=0)
        thesaurus_words_also_in_corpus_with_concepts.close()
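        # For reference, a minimal pure-Python sketch of the same filter
        # (assuming "main.txt" has a one-line header followed by
        # space-separated "word vector..." rows); kept as a comment only:
        #
        #   with open(join(word_embeddings_file_name, "main.txt")) as emb:
        #       next(emb)  # skip the header line
        #       corpus_words = {line.split(" ", 1)[0] for line in emb}
        #   thesaurus_after4 = pd.DataFrame(
        #       sorted((w, c) for w, c in wordset.items() if w in corpus_words),
        #       columns=["word", "synset"]).set_index("word")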
    else:
        raise NotImplementedError

    # Preprocessing step 5
    if "remove_synsets_with_less_than_two_keys" in params["actions"]:
        # Keep only rows whose synset still has at least two member words.
        synset_value_counts = thesaurus_after4["synset"].value_counts()
        matches = thesaurus_after4["synset"].isin(
            synset_value_counts[synset_value_counts > 1].index)
        thesaurus_after5 = thesaurus_after4[matches]
    else:
        raise NotImplementedError
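    # Example (hypothetical rows): if synset 7 appears on one row and synset 9
    # on two rows, only the two synset-9 rows survive; a synset with a single
    # remaining key carries no synonym information.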
    thesaurus_after5.to_csv(join(_output_dir, "main.txt"), sep=" ")


def phase4(_config):
    logging.info("Phase 4 starting")
    _cached_output_dir = find_cached_output(
        _config["output"], _config["inputs"], _config["params"])
    if _cached_output_dir is not None:
        return _cached_output_dir
    return run_phase4(_config)
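
# Hypothetical usage (config keys inferred from this module; paths are
# placeholders, and the exact schema lives with the pipeline driver):
#
#   config = {
#       "inputs": {"thesaurus": "data/thesaurus.json",
#                  "word_embeddings": "out/phase2"},
#       "params": {"actions": ["transform_to_lowercase",
#                              "remove_keys_with_space_or_hyphen",
#                              "replace_all_sz",
#                              "create_n_1_mapping_largest_sysnet_wins_key",
#                              "remove_keys_not_in_corpus",
#                              "remove_synsets_with_less_than_two_keys"]},
#       "output": "out/phase4",
#   }
#   phase4(config)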