-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget-all-link-data.py
33 lines (25 loc) · 982 Bytes
/
get-all-link-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from get_documentation_page_data import fetch_data
import json
from urllib.parse import urlparse, unquote
def generate_filename_from_url(url):
# Parse the URL to get the path component
path = urlparse(url).path
# Unquote URL-encoded characters
path = unquote(path)
# Split the path into segments and filter out empty segments
segments = [segment for segment in path.split('/') if segment]
# Use the last few segments of the path to form a filename
filename = '-'.join(segments)
# Append a file extension, e.g., .html
filename += '.json'
return filename
with open("./unique_sitemap_links.json","r") as file:
links = json.load(file)
full_data =[]
for idx,link in enumerate(links[:100]):
filename = generate_filename_from_url(link)
print(idx,link)
data = fetch_data(url = link)
full_data.append(data)
with open("./page_data/full_data_100.json","w") as file:
json.dump(full_data,file)