# %%
import json
import time
import random
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import datasets
from iso639 import Lang
from iso639.exceptions import InvalidLanguageValue
import wikipediaapi
from duckduckgo_search import DDGS
import matplotlib.pyplot as plt
import seaborn as sns
# %%
# load all manifestos
processed_paths = Path("data/processed/").glob("*.csv")
df = pd.DataFrame([])
for path in tqdm(processed_paths):
    df_one = pd.read_csv(path)
    df = pd.concat([df, df_one])
# %%
# add metadata
def add_metadata(df: pd.DataFrame, select_cols: list|None = None, drop_cols: list|None = None) -> pd.DataFrame:
"""
good preset for select_cols:
["id", "created", "metadata.party_abbrev", "metadata.party_name"]
"""
ds = datasets.load_from_disk("data/interim/mp_1990")
if select_cols:
meta = ds.flatten().select_columns(select_cols).to_pandas()
else:
meta = ds.flatten().to_pandas()
if drop_cols:
meta = meta.drop(columns=drop_cols)
df_meta = df.merge(meta, how="left", on="id")
return df_meta
df_meta = add_metadata(df, drop_cols=["text", "added"])
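# Alternative call using the select_cols preset from the docstring; a hedged
# sketch that is not run here, df_meta_slim is just an illustrative name:
# df_meta_slim = add_metadata(df, select_cols=["id", "created", "metadata.party_abbrev", "metadata.party_name"])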
# %%
# add language tag
def add_language_tag(df: pd.DataFrame) -> datasets.Dataset:
    """
    Map the full language names in the dataset metadata to ISO 639-1 codes.

    Note: the df argument is currently unused; the function loads the dataset
    itself and returns the re-tagged copy.
    """
    ds = datasets.load_from_disk("data/interim/mp_1990")
    langs_in_ds = pd.DataFrame.from_records(ds["metadata"])["language"].unique().tolist()
    exceptions = {"Greek": "Modern Greek (1453-)"}
    lang2iso = {}
    for lang in langs_in_ds:
        try:
            iso_code = Lang(lang.title()).pt1
            lang2iso.update({lang: iso_code})
        except InvalidLanguageValue:
            if lang.title() in exceptions:
                alt_lang = exceptions[lang.title()]
                iso_code = Lang(alt_lang).pt1
                lang2iso.update({lang: iso_code})
            else:
                lang2iso.update({lang: "NaN"})

    def convert_language_tag(obs):
        lang_orig = obs["metadata"]["language"]
        obs["metadata"]["language"] = lang2iso[lang_orig]
        return obs

    ds = ds.map(convert_language_tag)
    return ds
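# Hedged usage sketch (this function is not called in the original flow);
# ds_iso is just an illustrative name:
# ds_iso = add_language_tag(df_meta)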
# %%
# cutoff for very LF entities
ent_counts = df_meta.groupby("norm_text").size().rename("count").sort_values(ascending=False).reset_index()
ent_nonunique = ent_counts.query("count > 2")
print(f"Goes from {len(ent_counts)} unique entities to {len(ent_nonunique)} with more than 1 occurance")
# merge
df_fil = pd.merge(df_meta, ent_nonunique, on="norm_text", how="right")
print(f"Goes from {len(df_meta)} entity instances to {len(df_fil)}")
# year information
df_fil["created"] = pd.to_datetime(df_fil["created"])
df_fil["year"] = [date.year for date in df_fil["created"]]
# filter by label
allowed_labels = ["ORG", "LOC", "MISC", "GPE", "NORP", "PER", "PERSON", "placeName", "orgName", "LANGUAGE", "ORGANIZATION", "NAT_REL_POL", "geogName", "persName"]
df_fil_lab = df_fil.query("label in @allowed_labels")
print(f"Goes from {len(df_fil)} to {len(df_fil_lab)} when dropping garbage labels")
# drop non-alphanumeric
def calculate_alpha_percentage(input_string):
    # Count alphabetical characters (whitespace also counts as alphabetical here)
    alpha_count = 0
    total_count = len(input_string)
    # Iterate through each character in the string
    for char in input_string:
        if char.isalpha() or char.isspace():
            alpha_count += 1
    # Calculate the percentage
    if total_count == 0:
        return 0.0
    percentage = (alpha_count / total_count) * 100
    return percentage
percent_alpha = df_fil_lab["norm_text"].apply(calculate_alpha_percentage).tolist()
df_fil_lab["percent_alpha"] = percent_alpha
df_alpha = df_fil_lab.query("percent_alpha > 33.3")
print(f"Goes from {len(df_fil_lab)} to {len(df_alpha)} after removing non-alphabetical entities")
# def justify_threshold():
#     """
#     Go through the top 50 and identify exceptions
#     """
#     df_fil_lab.query("percent_alpha < 66")[["norm_text", "count", "percent_alpha"]].drop_duplicates(subset=["norm_text"]).sort_values("percent_alpha", ascending=False).head(50)
#     df_fil_lab.query("percent_alpha < 66")[["norm_text", "count", "percent_alpha"]].drop_duplicates(subset=["norm_text"]).sort_values("count", ascending=False).head(50)
#     exceptions = ["d66", "ja21", "f.d.p.", "ε.ε.", "e.e.", "g7", "g8", "g20", "g-20", "c02", "covid-19", "i+d+i"]
# top n named entities per country
df_top = pd.DataFrame([])
for (country_name, year), df_country in df_alpha.groupby(["metadata.country_name", "year"]):
    top_ne = df_country["norm_text"].value_counts().head(50).index.tolist()
    df_one = df_country.query("norm_text in @top_ne")
    df_top = pd.concat([df_top, df_one])
print(f"Goes from {len(df_alpha)} filtered NEs to {len(df_top)} top NEs")
print(f"Got {len(df_top["norm_text"].unique())} unique NEs")
# %%
# dictionary to search wikipedia with
# # part 1: absolute matches
# # skipped: ambiguous tokens like "PS" can refer to entirely different entities across languages
# absolute_matches = {}
# grouped_text_lang = df_top.groupby('norm_text')['metadata.language'].unique()
# for token in tqdm(ent_nonunique["norm_text"].unique()):
#     if token in grouped_text_lang:
#         valid_langs = grouped_text_lang[token]
#         translation = {lang: token for lang in valid_langs}
#         absolute_matches[token] = translation
# %%
# part 2: filling in the gaps
def search_iteration(query, sleep=True) -> dict | str | None:
    """
    Returns
    -------
    dict : if the search ran and found a Wikipedia link
    None : if the search ran and did not find one
    str : if there was an error
    """
    if sleep:
        time.sleep(random.uniform(1, 5))
    try:
        results = DDGS().text(query, max_results=5)
        first_wiki_link = None
        for page in results:
            if "wikipedia.org" in page["href"]:
                first_wiki_link = page
                break
        return first_wiki_link
    except Exception as e:
        return f"Error: {str(e)}"
SKIP_TO = 0
finished_iterations = 0
last_save = 0
results = []
error_checking = []
for name, df_group in tqdm(df_top.groupby(["norm_text", "metadata.language"])):
    if finished_iterations < SKIP_TO:
        finished_iterations += 1
        continue
    token, lang = name
    # first round: raw text
    wiki_1st = search_iteration(token)
    results.append({"token": token, "lang": lang, "round": 1, "wiki": wiki_1st})
    error_checking.append(wiki_1st)
    # second round: raw text + context
    query = f"{token} {lang}"
    wiki_2nd = search_iteration(query)
    results.append({"token": token, "lang": lang, "round": 2, "wiki": wiki_2nd})
    error_checking.append(wiki_2nd)
    # third round: raw text + context + wiki
    if not wiki_1st and not wiki_2nd:
        query = f"{token} {lang} wiki"
        wiki_3rd = search_iteration(query)
        results.append({"token": token, "lang": lang, "round": 3, "wiki": wiki_3rd})
        error_checking.append(wiki_3rd)
    # save every ten iterations
    if finished_iterations % 10 == 0:
        # save the last 10 named entities
        with open(f"data/ddg/dump_{finished_iterations}.json", "w") as fout:
            json.dump(results, fout)
        # if more than 50% are errors, break and live to see another day
        error_rate = sum([isinstance(obj, str) for obj in error_checking]) / len(error_checking)
        if error_rate > 0.5:
            raise ValueError(f"Too many errors. Try again from iteration: {last_save}")
        # reset results
        results = []
        error_checking = []
        last_save = finished_iterations
    # long pause every 100 iterations
    if (finished_iterations + 1) % 100 == 0:
        print("Pausing for a while...")
        time.sleep(random.uniform(30, 60))
    # finished iterations
    finished_iterations += 1
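# %%
# Minimal sketch (added here, not part of the original run) for stitching the
# saved dump_*.json files back into one dataframe once the loop has finished.
# Assumes each dump is a list of {"token", "lang", "round", "wiki"} dicts as
# written above; dump_paths, all_hits and df_hits are illustrative names.
dump_paths = sorted(Path("data/ddg/").glob("dump_*.json"))
all_hits = []
for dump_path in dump_paths:
    with open(dump_path) as fin:
        all_hits.extend(json.load(fin))
df_hits = pd.DataFrame(all_hits)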
# %%
counter = 0
# count the number of (norm_text, language) groups the search loop has to cover
for name, df_group in tqdm(df_top.groupby(["norm_text", "metadata.language"])):
    counter += 1
# %%
# fucked_mask = df_fil[df_fil["metadata.language"] == "french"]["norm_text"].str.contains("eu")
# fucked_france = df_fil[df_fil["metadata.language"] == "french"][fucked_mask]
# %%
# new wiki api
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia('manifesto-neighbourhoods/0.0 (https://github.com/centre-for-humanities-computing/manifesto-neighbourhoods; [email protected])', 'en')
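# Hedged lookup sketch: resolve a page title against the Wikipedia API;
# "NATO" is just an illustrative title, not one taken from the search results above.
page = wiki_wiki.page("NATO")
if page.exists():
    print(page.fullurl)
    print(page.summary[:300])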
# %%
# edge cases
df_fil[df_fil["metadata.language"] == "danish"].query("count > 5").query("label == 'ORG'").head(20)
# %%
# inspect the entity "cer" in its manifesto context
cer = ds.filter(lambda x: x["id"] == "53321_201602")
cer[0]["text"][13275 - 100: 13278 + 200]
# %%
# inspect the Danish NATO mention in its manifesto context
nato = ds.filter(lambda x : x["id"] == "13229_199803")
nato[0]["text"][7603 - 100 : 7607 + 200]
# %%
# regions dictionary
# %%
# non-interesting
non_interesting_overlap = ["europa", "eu", "nederland", "lietuva", "estado", "europees", "sozial", "españa", "deutschland", "македонија", "ireland", "österreich", "nederlands", "ps", "labour"]
df_fil.query("norm_text not in @non_interesting_overlap")["norm_text"].value_counts().head(20)
# %%
for gr_name, group in df_fil.groupby("metadata.language"):
    print(gr_name)
    print(group["norm_text"].value_counts().head(20))
    print("\n")
# %%