-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathemails.py
69 lines (49 loc) · 2.31 KB
/
emails.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
## emails.py
import math, sys
from SPARQLWrapper import SPARQLWrapper, JSON
from queries.peopleKeywordCount import peopleKeywordCount
from queries.createPeopleKeywordQuery import createPeopleKeywordQuery
import csv

# Number of results requested per query; the script pages through the full
# result set in steps of this size.
PAGE_SIZE = 10000

# get the year to query from the user
# (str() keeps this working on Python 2, where input() evaluates what was
# typed and may hand back an int; strip() drops stray whitespace that would
# otherwise end up in the query and in the output filename)
year = str(input("Enter year to query: ")).strip()

## when making name file, change to + "names.csv"
filename = year + "emails.csv"

# there are too many results to get all at once, so first ask the database
# how many results there are for the year we are interested in, then work
# out how many PAGE_SIZE-sized queries are needed to get all of them
# (float() stops the division from truncating on Python 2; int() is needed
# because math.ceil returns a float there and range() requires an int)
limit = float(peopleKeywordCount(year))
numQueries = int(math.ceil(limit / PAGE_SIZE))

# setting up the query
# specifying the web address of the database
# setting the return format to JSON - JavaScript Object Notation
sparql = SPARQLWrapper("http://abstractsearch.agu.org:8890/sparql")
sparql.setReturnFormat(JSON)

# the csv module needs a binary-mode file on Python 2 but a text-mode file
# opened with newline='' on Python 3 (writing rows to a 'wb' file there
# raises TypeError on the first writerow call)
if sys.version_info[0] < 3:
    csvfile = open(filename, 'wb')
else:
    csvfile = open(filename, 'w', newline='')

with csvfile:
    # set delimiter to comma - separates into different columns
    resultwriter = csv.writer(csvfile, delimiter=',',
                              quoting=csv.QUOTE_MINIMAL)

    # keep looping and querying until we get all the results
    for page in range(numQueries):
        offset = page * PAGE_SIZE
        query = createPeopleKeywordQuery(year, str(offset))
        sparql.setQuery(query)
        results = sparql.query().convert()

        # rows are written one at a time as each page arrives instead of
        # being collected into arrays first
        for result in results["results"]["bindings"]:
            keywordURL = result["keyword"]["value"]
            email = result["mbox"]["value"]

            # the keyword code is the part of the URL after "keywords/";
            # values without that marker (including the "None" placeholder)
            # have no code to write, so skip them rather than fail on [1]
            keyCode = keywordURL.split("keywords/")
            if len(keyCode) < 2:
                continue
            resultwriter.writerow([keyCode[1], email])