-
Notifications
You must be signed in to change notification settings - Fork 0
/
transient_metadata_extraction.py
46 lines (41 loc) · 1.26 KB
/
transient_metadata_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from urllib.request import urlopen
from html.parser import HTMLParser
import csv
from pipeline_utils import crts_url_list
# Module-level scratch state shared by metaDataHTMLParser and
# transient_metadata_extraction(). Each name starts out as its own empty list.
data_list: list = []  # rows finished so far; one list of text nodes per closed 'tr'
data_list_item: list = []  # text nodes of the row currently being collected
meta_data_list: list = []  # one entry per processed URL, each a list of cleaned rows
class metaDataHTMLParser(HTMLParser):
    """HTML parser that groups text nodes into one list per closed table row.

    The parser keeps no state on the instance: text is accumulated in the
    module-level ``data_list_item`` list, and every closing ``tr`` tag moves
    that list into the module-level ``data_list``. All instances therefore
    share the same buffers.
    """

    def handle_data(self, data):
        """Append the text node *data* to the row currently being built."""
        data_list_item.append(str(data))

    def handle_endtag(self, tag):
        """Store the collected row and start a new one when a ``tr`` closes."""
        global data_list_item
        # Any closing tag other than a table row leaves the buffers untouched.
        if tag != 'tr':
            return
        finished_row = data_list_item
        data_list.append(finished_row)
        print(finished_row)
        # Rebind (rather than clear) so the stored row is not emptied.
        data_list_item = []
def _clean_row(row):
    """Remove the junk cells from one parsed table row, in place.

    The first element is dropped, the second element loses its last two
    characters and becomes the new first element, and the element that then
    sits at index 12 is removed.

    Raises:
        IndexError: if the row is too short for any of these steps.
    """
    # Replace the first two cells by the trimmed second cell
    # (presumably the transient ID -- TODO confirm against the page layout).
    row[0:2] = [row[1][:-2]]
    del row[12]


def transient_metadata_extraction():
    """Download every page in ``crts_url_list`` and return its cleaned rows.

    Each page is fed through ``metaDataHTMLParser``, which fills the
    module-level ``data_list`` with one list of text nodes per table row.
    The first collected row of every page is discarded, the remaining rows
    are cleaned with ``_clean_row`` and gathered into a single flat list.

    Side effects:
        * ``data_list`` is reset to an empty list after each page.
        * The cleaned rows of each page are appended (as one list per page)
          to the module-level ``meta_data_list``, which keeps growing across
          calls.

    Returns:
        list: the cleaned rows of all pages fetched by *this* call, in URL
        order. Rows left in ``meta_data_list`` by earlier calls are not
        included.

    Raises:
        IndexError: if a row is too short to be cleaned.
        Any exception raised by ``urlopen`` propagates unchanged.
    """
    global data_list
    rows = []
    for url in crts_url_list:
        # Close the HTTP response as soon as the payload has been read.
        with urlopen(url) as response:
            # NOTE(review): str() on the bytes payload yields its repr
            # ("b'...'" with escapes spelled out) instead of decoded text.
            # The clean-up offsets in _clean_row look tuned to that form,
            # so it is kept as-is -- confirm before switching to .decode().
            page_text = str(response.read())

        metaDataHTMLParser().feed(page_text)

        # Skip the first collected row (presumably the table header --
        # TODO confirm) and hand the shared buffer back empty.
        page_rows = data_list[1:]
        data_list = []

        for row in page_rows:
            _clean_row(row)

        meta_data_list.append(page_rows)
        rows.extend(page_rows)
    return rows