-
Notifications
You must be signed in to change notification settings - Fork 3
/
stitcher_urls.py
189 lines (169 loc) · 6.14 KB
/
stitcher_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import requests
import user_agent
import re
import string
import logging
import time
from webapp import podrex_db_utils as db
from bs4 import BeautifulSoup
from scipy.stats import exponnorm
# Matches any single ASCII punctuation character; used to strip punctuation
# out of podcast names before they are searched for.
punc_regex = re.compile("[" + re.escape(string.punctuation) + "]")
# Request headers shared by every search; the User-Agent is generated once at
# import time so all requests made by this process present the same browser.
headers = {
    "User-Agent": user_agent.generate_user_agent(
        device_type="desktop",
        os=None,
        navigator=None,
        platform=None,
    ),
}
def get_podcast_name(conn):
    """
    Gets the name of one podcast that has no stitcher url yet

    Parameters
        conn: active psycopg2 connection
    Returns
        (podcast_name, itunes_url) on success
            podcast_name (str): podcast name with ASCII punctuation removed
            itunes_url (str): itunes_url for matching later
        None when no row is available or the lookup fails; the transaction
        is rolled back in that case
    """
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT podcast_name, itunes_url "
                       "FROM stitcher "
                       "WHERE stitcher_url IS NULL "
                       "LIMIT 1")
        result = cursor.fetchone()
        if result is None:
            # No row matched: report it explicitly instead of tripping over
            # None[0] and logging a misleading TypeError traceback.
            logging.warning("failed to get name: no row with a NULL "
                            "stitcher_url")
            conn.rollback()
            return None
        podcast_name = punc_regex.sub("", result[0])
        itunes_url = result[1]
        return podcast_name, itunes_url
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt and SystemExit
        # still propagate and can stop the script.
        logging.exception("failed to get name")
        conn.rollback()
        return None
    finally:
        # Single cleanup point for both the success and failure paths.
        cursor.close()
def google_url_constructor(podcast_name):
    """
    Constructs a google search url from a podcast name

    The search is restricted to stitcher.com and the whitespace-separated
    words of the name are appended as "+"-joined search terms.

    Parameters
        podcast_name (str): podcast name to search
    Returns
        google_url (str): google url to search
    """
    from urllib.parse import quote_plus

    # Percent-encode each word so characters such as "&", "#" or "+" stay
    # part of the search term instead of being read as url syntax.
    search_terms = "+".join(quote_plus(word) for word in podcast_name.split())
    google_url = "https://www.google.com/search?q=site%3Astitcher.com"
    return google_url + "+" + search_terms
def google_request(google_url, headers, timeout=None):
    """
    Returns google result from a request to google_url

    Parameters
        google_url (str): google search url
        headers (dict): headers to use for consistent search
        timeout (float or tuple, optional): passed straight to requests.get;
            the default of None keeps the previous behaviour of waiting
            indefinitely, so pass a number of seconds to bound the wait
    Returns
        google_result (requests object)
    """
    return requests.get(google_url, headers=headers, timeout=timeout)
def parse_google_result(google_result):
    """
    Returns the url and title of the first google search result

    Parameters
        google_result (requests object): response whose .text is the html of
            the results page
    Returns
        search_url, search_name, parse_success
            (href, title, True) when a top result is found; the title is the
                link content up to the first "|"
            (True, True, False) when the page says the search
                "did not match any documents"
            (None, None, False) when the page could not be parsed at all
    """
    soup = BeautifulSoup(google_result.text, "html.parser")
    try:
        top_result = soup.find("h3", {"class": "r"}).find("a")
        search_url = top_result.attrs["href"]
        search_name = top_result.decode_contents().split("|")[0]
        return search_url, search_name, True
    except (AttributeError, KeyError):
        # AttributeError: the <h3 class="r"> or its <a> was not found (None);
        # KeyError: the link has no href attribute.
        page_text = "".join(p.decode_contents() for p in soup.find_all("p"))
        if "did not match any documents" in page_text:
            return True, True, False
        logging.exception("failed to find top google result in parsing")
        return None, None, False
def update_db(conn, itunes_url, search_url, search_name):
    """
    Updates a row in the db with stitcher url and name

    Parameters
        conn: active psycopg2 connection
        itunes_url (str): itunes_url on which to match db row
        search_url (str): parsed url, stored in the stitcher_url column
        search_name (str): parsed name, stored in the search_name column
    Returns
        True on success (update committed),
        False on failure (transaction rolled back and the error logged)
    """
    cursor = conn.cursor()
    try:
        cursor.execute("UPDATE stitcher SET search_name = (%s), "
                       "stitcher_url = (%s) "
                       "WHERE itunes_url = (%s)",
                       [search_name, search_url, itunes_url])
        conn.commit()
        return True
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt and SystemExit
        # are not reported as a failed update.
        conn.rollback()
        logging.exception("failed to update db on {}".format(itunes_url))
        return False
    finally:
        # Single cleanup point for both outcomes.
        cursor.close()
def _pause_between_requests():
    """Sleeps for a randomised interval so searches are not evenly spaced"""
    # Drawing without a size argument gives a single value rather than the
    # one-element array that size=1 produces, so time.sleep receives a
    # plain float. max() guards against a negative draw.
    delay = float(exponnorm.rvs(2, loc=45, scale=1))
    time.sleep(max(delay, 0.0))


def _flag_podcast(conn, itunes_url, flag, include_search_name=False):
    """Stores a marker value in place of a stitcher url and commits it"""
    cursor = conn.cursor()
    try:
        if include_search_name:
            cursor.execute("UPDATE stitcher SET search_name = (%s), "
                           "stitcher_url = (%s) "
                           "WHERE itunes_url = (%s)",
                           [flag, flag, itunes_url])
        else:
            cursor.execute("UPDATE stitcher SET stitcher_url = (%s) "
                           "WHERE itunes_url = (%s)",
                           [flag, itunes_url])
        conn.commit()
    except Exception:
        # Leave the connection usable, then let the caller see the error.
        conn.rollback()
        raise
    finally:
        cursor.close()


def process_podcast(conn, log_file):
    """
    Wrapper function to process a podcast

    Fetches one podcast without a stitcher url, searches google for it and
    stores the outcome:
        top result found      -> url and name saved through update_db
        no matching documents -> row marked 'no result'
        non-200 response or
        unparseable page      -> row marked 'problem'
        503 response          -> nothing stored, sleeps for an hour
    Every other path ends with a randomised pause before returning.

    Parameters
        conn: active psycopg2 connection
        log_file (writeable file object): log file to write errors
    Returns
        None
    """
    lookup = get_podcast_name(conn)
    if lookup is None:
        # get_podcast_name returns None when it cannot supply a podcast;
        # unpacking that would raise a TypeError.
        print("no podcast available to process")
        log_file.write("no podcast available to process\n")
        _pause_between_requests()
        return None
    podcast_name, itunes_url = lookup

    google_url = google_url_constructor(podcast_name)
    google_result = google_request(google_url, headers)

    if google_result.status_code == 503:
        print("YOU'VE BEEN DISCOVERED!!!!")
        time.sleep(3600)
        return None

    if google_result.status_code != 200:
        print("failure on {}".format(podcast_name))
        log_file.write("failure on {}\n".format(podcast_name))
        # Committed like the other markers so the row is not picked again.
        _flag_podcast(conn, itunes_url, "problem")
        _pause_between_requests()
        return None

    search_url, search_name, parse_success = parse_google_result(google_result)
    if not parse_success:
        # parse_google_result signals "no results" with True in place of
        # the url, and an unparseable page with None.
        if search_url is True:
            print("no results for {}".format(podcast_name))
            log_file.write("no results for {}\n".format(podcast_name))
            _flag_podcast(conn, itunes_url, "no result",
                          include_search_name=True)
        else:
            print("failure on {}\n{}".format(podcast_name, google_result.text))
            log_file.write("failure on {}\n{}\n".format(podcast_name,
                                                        google_result.text))
            _flag_podcast(conn, itunes_url, "problem")
        _pause_between_requests()
        return None

    if update_db(conn, itunes_url, search_url, search_name):
        print("success on {}".format(podcast_name))
        log_file.write("success on {}\n".format(podcast_name))
    else:
        print("failure on {}".format(podcast_name))
        log_file.write("failure on {}\n".format(podcast_name))
    _pause_between_requests()
    return None
if __name__ == "__main__":
    conn = db.connect_db()
    try:
        # Runs until interrupted or until an error escapes process_podcast.
        while True:
            # The log is reopened in append mode on every pass, so it is
            # closed (and its entries written out) after each podcast.
            with open("stitcher_log.log", "a") as log_file:
                process_podcast(conn, log_file)
    finally:
        # Release the database connection however the loop ends.
        # NOTE(review): assumes db.connect_db() returns a connection object
        # with close(), as the "psycopg2 connection" docstrings suggest.
        conn.close()