# archive_local_send_to_ia.py
# Fetch pages and their linked resources locally, then upload each snapshot
# to the Internet Archive.

import os
import sys
import time
import logging
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from internetarchive import upload, get_session

# Set up logging
logging.basicConfig(level=logging.INFO)
logging.getLogger('internetarchive').setLevel(logging.DEBUG)
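# DEBUG logging on the 'internetarchive' logger surfaces the upload requests
# and responses, which helps diagnose failed submissions.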

def check_ia_config():
    """Verify that the Internet Archive configuration can be loaded."""
    try:
        get_session()
        print("Internet Archive configuration is valid.")
    except Exception as e:
        print(f"Error with Internet Archive configuration: {str(e)}")
        sys.exit(1)
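
# Note: get_session() can usually build a session even when no credentials are
# configured, so this check mainly catches a malformed config file rather than
# missing keys; an upload can still fail later for permission reasons.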

def fetch_url(url):
    """Download a URL and return its raw bytes, or None on failure."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # don't archive error pages as if they succeeded
        return response.content
    except Exception as e:
        print(f"Error fetching {url}: {str(e)}")
        return None

def fetch_resources(url):
    """Fetch a page's HTML plus every absolutely-referenced image, script, and stylesheet."""
    content = fetch_url(url)
    if content is None:
        return None
    resources = {
        'html': content,
        'resources': {}
    }
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all(['img', 'script', 'link']):
        src = tag.get('src') or tag.get('href')
        # Only absolute http(s) URLs are fetched; relative references are skipped.
        if src and src.startswith(('http://', 'https://')):
            resource_content = fetch_url(src)
            if resource_content:
                resources['resources'][src] = resource_content
    return resources
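
# A hedged variant (hypothetical, not called anywhere above): resolving relative
# references with urllib.parse.urljoin would also capture resources such as
# src="/static/app.js" that fetch_resources currently skips.
def fetch_resources_resolving_relative(url):
    """Like fetch_resources, but resolves relative src/href values against the page URL."""
    from urllib.parse import urljoin
    content = fetch_url(url)
    if content is None:
        return None
    resources = {'html': content, 'resources': {}}
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all(['img', 'script', 'link']):
        src = tag.get('src') or tag.get('href')
        if src:
            absolute = urljoin(url, src)  # relative paths resolve; absolute URLs pass through
            if absolute.startswith(('http://', 'https://')):
                resource_content = fetch_url(absolute)
                if resource_content:
                    resources['resources'][absolute] = resource_content
    return resources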

def package_for_archive(url, resources):
    """Write the fetched HTML and resources into a timestamped directory."""
    if resources is None:
        return None
    domain = urlparse(url).netloc
    timestamp = int(time.time())
    dir_name = f"{domain}_{timestamp}"
    os.makedirs(dir_name, exist_ok=True)
    with open(f"{dir_name}/index.html", 'wb') as f:
        f.write(resources['html'])
    for resource_url, content in resources['resources'].items():
        filename = os.path.basename(urlparse(resource_url).path)
        if not filename:  # URLs whose path ends in '/' have no basename to save under
            continue
        with open(f"{dir_name}/{filename}", 'wb') as f:
            f.write(content)
    return dir_name
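
# Note: resources that share a basename (e.g. two different ".../style.css"
# URLs) overwrite each other in the package directory; a hash or counter
# suffix on the filename would be needed to keep both.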

def submit_to_archive(package_dir):
    """Upload the packaged directory to the Internet Archive."""
    try:
        # "your_specific_collection" is a placeholder and must be replaced with
        # a collection you can write to.
        r = upload(package_dir,
                   files=[f"{package_dir}/{f}" for f in os.listdir(package_dir)],
                   metadata={"collection": "your_specific_collection", "mediatype": "web"})
        print(f"Submitted {package_dir} to Internet Archive")
        return r
    except Exception as e:
        print(f"Error submitting {package_dir} to Internet Archive: {str(e)}")
        if hasattr(e, 'response') and e.response is not None:
            print(f"Response status code: {e.response.status_code}")
            print(f"Response content: {e.response.text}")
        return None
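
# Note: upload()'s first argument is the archive.org item identifier, which has
# to be unique across all of archive.org; the domain_timestamp directory name is
# usually unique but is not sanitized for characters IA identifiers disallow.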

def process_url(url):
    resources = fetch_resources(url)
    if resources:
        package_dir = package_for_archive(url, resources)
        if package_dir:
            submit_to_archive(package_dir)
    time.sleep(5)  # Wait 5 seconds between submissions

# Check Internet Archive configuration
check_ia_config()

# Read URLs from file
with open('selected_urls.txt', 'r') as f:
    urls = f.read().splitlines()

# Process up to 100 URLs
urls_to_process = urls[:100]

# Uncomment the following line to test with a single URL
# process_url("http://example.com")

# Process URLs in parallel
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(process_url, urls_to_process)
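
# Usage (assumes Internet Archive credentials are already configured, e.g. via
# the `ia configure` command that ships with the internetarchive package):
#   python archive_local_send_to_ia.py
# selected_urls.txt is expected to contain one URL per line.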