"""
Query Executor class and methods
"""
# import pprint
import re
from typing import Dict, List, Optional
import requests
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from prettytable import PrettyTable
from GPT3Extractor import gpt3Extractor
from lib.utils import RELATIONS
from SpanBertExtractor import spanBertExtractor
# Plain text is extracted only from the <p> tags of each webpage (see processText).
class QueryExecutor:
"Creates a QueryExecutor object"
def __init__(self, args) -> None:
"""
Initialize a QueryExecutor object
        Instance Variables:
        q: the query string
        r: the relation to extract
        t: the extraction confidence threshold
        k: the number of tuples that we request in the output
        spanbert: whether or not to use SpanBERT
        gpt3: whether or not to use GPT-3
        custom_search_key: the Google Custom Search API key
        google_engine_id: the Google Custom Search Engine ID
        openai_secret_key: the OpenAI Secret Key
        engine: the Google Custom Search Engine
        seen_urls: the set of URLs that we have already seen
        used_queries: the set of queries that we have already used
        extractor: the extractor object (either spanBertExtractor or gpt3Extractor)
"""
self.q = args.q
self.r = args.r
self.t = args.t
self.k = args.k
self.spanbert = args.spanbert
self.gpt3 = args.gpt3
self.custom_search_key = args.custom_search_key
self.google_engine_id = args.google_engine_id
self.openai_secret_key = args.openai_secret_key
self.engine = build("customsearch", "v1", developerKey=args.custom_search_key)
self.seen_urls = set()
self.used_queries = set([self.q])
self.extractor = (
gpt3Extractor(r=self.r, openai_key=self.openai_secret_key)
if self.gpt3
else spanBertExtractor(r=self.r, t=self.t)
)
def printQueryParams(self) -> None:
"""
Prints the query parameters
Parameters:
None
Returns:
None
"""
print("Parameters:")
print(f"Client key = {self.custom_search_key}")
print(f"Engine key = {self.google_engine_id}")
print(f"OpenAI key = {self.openai_secret_key}")
print(f"Relation = {RELATIONS[self.r]}")
if self.spanbert:
print("Method = spanbert")
print(f"Threshold = {self.t}")
if self.gpt3:
print("Method = gpt3")
print("Threshold = XXX")
print(f"Query = {self.q}")
print(f"# of Tuples = {self.k}")
return
    def getQueryResult(self, query: str, k: int) -> List:
        """
        Get the top k results for a given query from the Google Custom Search API
        Source: https://github.com/googleapis/google-api-python-client/blob/main/samples/customsearch/main.py
        """
        full_res = (
            self.engine.cse()
            .list(
                q=query,
                cx=self.google_engine_id,
            )
            .execute()
        )
        return full_res["items"][:k]
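    # Illustrative note, not from the original file: each element of the returned
    # "items" list is a Custom Search result dict; parseResult below relies only on
    # its "link" field. A result roughly looks like (values are placeholders):
    #   {"title": "...", "link": "https://example.com/page", "snippet": "..."}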
    def processText(self, url: str) -> Optional[str]:
        """
        Get the plain text from a given URL
        If webpage retrieval fails (e.g. because of a timeout), it is skipped (None returned)
        Extracts the plain text from the URL using Beautiful Soup.
        If the resulting plain text is longer than 10,000 characters, it is truncated.
        Only the text in the <p> tags is processed.
        Parameters:
            url (str) - the URL to process
        Returns:
            Optional[str] - the preprocessed plain text, or None if retrieval/extraction fails
        """
try:
print(" Fetching text from url ...")
page = requests.get(url, timeout=5)
except requests.exceptions.Timeout:
print(f"Error processing {url}: The request timed out. Moving on...")
return None
try:
soup = BeautifulSoup(page.content, "html.parser")
html_blocks = soup.find_all("p")
text = ""
for block in html_blocks:
text += block.get_text()
if text != "":
text_len = len(text)
print(
f" Trimming webpage content from {text_len} to 10000 characters"
)
preprocessed_text = (text[:10000]) if text_len > 10000 else text
print(
f" Webpage length (num characters): {len(preprocessed_text)}"
)
# Removing redundant newlines and some whitespace characters.
preprocessed_text = re.sub("\t+", " ", preprocessed_text)
preprocessed_text = re.sub("\n+", " ", preprocessed_text)
preprocessed_text = re.sub(" +", " ", preprocessed_text)
preprocessed_text = preprocessed_text.replace("\u200b", "")
return preprocessed_text
else:
return None
except Exception as e:
print(f"Error processing {url}: {e}. Moving on ...")
return None
def parseResult(self, result: Dict[str, str]) -> None:
"""
Parse the result of a query.
Exposed function for use by main function.
Parameters:
        result (dict) - one item from the result list returned by a query
Returns:
None
"""
url = result["link"]
if url not in self.seen_urls:
self.seen_urls.add(url)
text = self.processText(url)
if not text:
return None
self.extractor.get_relations(text)
return
def checkContinue(self) -> bool:
"""
        Evaluate whether we have extracted at least k tuples, i.e. whether to continue or halt.
Parameters: None
Returns: bool (True if we need to find more relations, else False)
"""
return len(self.extractor.relations) < self.k
def getNewQuery(self) -> Optional[str]:
"""
Creates a new query.
        Select from the set of extracted tuples a tuple y such that
        y has not been used for querying yet.
        Create a query q from tuple y by concatenating
        the attribute values together.
        If no such tuple y exists, then stop/return None.
        (ISE has "stalled" before retrieving k high-confidence tuples.)
Parameters:
None
Returns:
query (str) if available; else None
"""
        if self.gpt3:
            # Iterating through extracted tuples
            for relation in list(self.extractor.relations):
                # Constructing query
                tmp_query = " ".join(relation)
                # Checking if query has been used
                if tmp_query not in self.used_queries:
                    # Adding query to used queries
                    self.used_queries.add(tmp_query)
                    # Setting new query
                    self.q = tmp_query
                    return self.q
            # No valid query found
            return None
        elif self.spanbert:
            # Sort tuples by confidence (descending)
            rels = sorted(
                self.extractor.relations.items(), key=lambda item: item[1], reverse=True
            )
            # TODO: remove after testing
            # pp = pprint.PrettyPrinter(indent=4)
            # pp.pprint(rels)
            for subj_obj, _pred in rels:
                tmp_query = " ".join(subj_obj)
                # Checking if query has been used
                if tmp_query not in self.used_queries:
                    # Adding query to used queries
                    self.used_queries.add(tmp_query)
                    # Setting new query
                    self.q = tmp_query
                    return self.q
            # No valid query found
            return None
        return None
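    # Illustrative note, not from the original file: with SpanBERT, relations are
    # assumed to map (subject, object) pairs to confidences, e.g.
    #   {("Bill Gates", "Microsoft"): 0.98}
    # so the highest-confidence unused pair becomes the next query,
    # here "Bill Gates Microsoft" via " ".join(subj_obj).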
def printRelations(self) -> None:
"""
Print the results of the query, relations in table format
If -spanbert, sort by confidence (descending)
Parameters:
None
Returns:
None
"""
print(
f"================== ALL RELATIONS for {RELATIONS[self.r]} ( {len(self.extractor.relations)} ) ================="
)
table = PrettyTable()
table.align = "l"
if self.gpt3:
table.field_names = ["Subject", "Object"]
table.add_rows(self.extractor.relations)
else:
table.field_names = ["Confidence", "Subject", "Object"]
for subj_obj, pred in self.extractor.relations.items():
table.add_row([pred, subj_obj[0], subj_obj[1]])
table.sortby = "Confidence"
table.reversesort = True
print(table)
return
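# A minimal driver sketch, not part of the original file: it assumes command-line
# parsing normally happens in a separate main module, and the credential strings
# and relation id below are placeholders chosen for illustration only. It shows
# the intended iterative set expansion (ISE) loop over the methods above.
if __name__ == "__main__":
    import argparse

    # Hypothetical arguments mirroring the attributes read in __init__.
    args = argparse.Namespace(
        q="bill gates microsoft",      # seed query (placeholder)
        r=2,                           # relation id (assumed encoding)
        t=0.7,                         # extraction confidence threshold
        k=10,                          # number of tuples requested
        spanbert=True,
        gpt3=False,
        custom_search_key="<GOOGLE_API_KEY>",
        google_engine_id="<ENGINE_ID>",
        openai_secret_key="<OPENAI_KEY>",
    )
    executor = QueryExecutor(args)
    executor.printQueryParams()
    # Query, parse the results, then re-query with a new unused tuple until
    # k tuples are extracted or no new query can be formed.
    while executor.checkContinue():
        for result in executor.getQueryResult(executor.q, executor.k):
            executor.parseResult(result)
        if not executor.checkContinue():
            break
        if executor.getNewQuery() is None:
            # ISE has stalled before reaching k tuples.
            break
    executor.printRelations()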