forked from duckduckgrayduck/internet-archive-export-add-on
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
59 lines (50 loc) · 2.34 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
""" This program uses the internetarchive python library and DocumentCloud's addon system"""
import os.path
import shutil
import subprocess
from internetarchive import upload
from documentcloud.addon import AddOn
# ID of the add-on that Archive.main starts as a follow-up run (via a POST to
# "addon_runs/") when the "filecoin" option is set.
# NOTE(review): presumably the Filecoin add-on's ID on DocumentCloud — confirm.
FILECOIN_ID = 104
class Archive(AddOn):
    """Add-On that uploads the selected documents' PDFs to the Internet Archive.

    Based on DocumentCloud HelloWorld template Add-On.
    """

    @staticmethod
    def _pdf_filename(title, document_id):
        """Return ``<title>-<document_id>.pdf`` as a single safe path component.

        Path separators in the title are replaced with dashes so that the
        file is always created directly inside the scratch directory.
        """
        safe_title = str(title)
        for separator in (os.sep, os.altsep):
            if separator:  # os.altsep is None on POSIX
                safe_title = safe_title.replace(separator, "-")
        return f"{safe_title}-{document_id}.pdf"

    def main(self):
        """Upload every selected document to a single Internet Archive item.

        At the present time all items are uploaded to Document Cloud's
        Internet Archive page, which can be found here:
        https://archive.org/details/@documentcloudupload
        If you fork the project and create your own repo secrets, the code
        will upload to your Internet Archive account. The credentials are
        read from the ``TOKEN`` (username) and ``KEY`` (password) environment
        variables.
        NOTE(review): the original notes name the secrets IA_USER / IA_PASS;
        presumably the workflow maps them onto TOKEN / KEY — confirm.

        The ``ia configure`` command is run to set up the Internet Archive
        configuration. See
        https://archive.org/services/docs/api/internetarchive/quickstart.html

        Reads ``item_name`` (required) and ``filecoin`` (optional) from
        ``self.data``. When ``filecoin`` is truthy and at least one document
        was uploaded, a follow-up run of add-on ``FILECOIN_ID`` is requested
        for the uploaded document IDs.

        Raises:
            KeyError: if ``item_name`` is missing from ``self.data`` or the
                ``TOKEN`` / ``KEY`` environment variables are not set.
        """
        if not self.documents:
            self.set_message("Please select at least one document")
            return

        # Item names in the Internet Archive cannot include spaces, so spaces -> dashes.
        item_name = self.data["item_name"].replace(" ", "-")

        # Pull the Internet Archive username & password from the workflow environment.
        ia_user = os.environ["TOKEN"]
        ia_pass = os.environ["KEY"]

        # Set up the config file for Internet Archive API access. The arguments
        # are passed as a list (no shell) so that credentials containing spaces,
        # quotes or other shell metacharacters are handed to `ia` verbatim.
        # The exit status is intentionally not checked, as before.
        subprocess.call(
            ["ia", "configure", "--username", ia_user, "--password", ia_pass]
        )

        save_path = "./out"
        os.makedirs(save_path, exist_ok=True)
        doc_ids = []
        try:
            for document in self.get_documents():
                document_id = str(document.id)
                full_path = os.path.join(
                    save_path, self._pdf_filename(document.title, document_id)
                )
                with open(full_path, "wb") as file:
                    file.write(document.pdf)
                upload(item_name, files=full_path)
                doc_ids.append(document_id)

            if self.data.get("filecoin") and doc_ids:
                self.client.post(
                    "addon_runs/",
                    json={
                        "addon": FILECOIN_ID,
                        "parameters": {},
                        "documents": doc_ids,
                    },
                )
        finally:
            # The temporary directory is deleted even when a write or upload
            # fails, so no PDFs are left behind on error.
            shutil.rmtree("./out/")
# Run the add-on when this file is executed as a script.
if __name__ == "__main__":
    Archive().main()