#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import shutil
import time
import random

import requests as req
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# cfscrape doesn't work anymore, so we use cloudscraper instead.
# import cfscrape
import cloudscraper

from packer import Packer
from variant import Variant
from sweepers.factory import SweeperFactory
class Collector:
    """
    Collector can collect chapters, scrape chapter URLs and download their images.
    Everything is staged in a temp dir named: collections
    """

    TMP_COLLECTIONS_DIR = "collections"
    def __init__(self, options, dry_run, clean, parallel, reverse, start_from, use_proxies=True):
        """Initialize the Collector object

        :param options: <dict> Scraping options (per-variant URLs, filters, referer)
        :param dry_run: <bool> Will only print and not download
        :param clean: <bool> Remove downloaded chapter dirs when cleaning up
        :param parallel: <bool> Download chapters in parallel
        :param reverse: <bool> Passed to the sweeper (reverse the sweep order)
        :param start_from: Passed to the sweeper (chapter to start from)
        :param use_proxies: <bool> Passed to the sweeper (route requests through proxies)
        :return: None
        """
        super().__init__()
        self.dry_run = dry_run
        self.clean_after = clean
        self.parallel = parallel
        self.options = options
        self.reverse = reverse
        self.use_proxies = use_proxies
        self.packer = Packer()
        self.collection_path = self.TMP_COLLECTIONS_DIR
        self.sweeper = None
        self.start_from = start_from
        self._init_referrer()

    def _init_referrer(self):
        """Create the requests session and cloudscraper, honoring an optional referer."""
        self.session = req.session()
        if self.options is not None:
            if (
                "referer" in self.options
                and self.options["referer"] is not None
                and self.options["referer"] != ""
            ):
                print("= Added the referer:", self.options["referer"])
                self.session.headers.update({"referer": self.options["referer"]})
        self.scraper = cloudscraper.create_scraper(sess=self.session)
    def _tear_down_all(self):
        shutil.rmtree(self.TMP_COLLECTIONS_DIR, ignore_errors=True)
        print("=" * 75)

    def _tear_down_collection(self):
        if not os.path.exists(str(self.TMP_COLLECTIONS_DIR)):
            return
        for collection in tqdm(os.listdir(self.TMP_COLLECTIONS_DIR), desc="# Cleaning"):
            coll_path = os.path.join(self.TMP_COLLECTIONS_DIR, collection)
            if not os.path.isdir(coll_path):
                continue
            for chap in os.listdir(coll_path):
                chapter = os.path.join(coll_path, chap)
                if os.path.isdir(chapter) and os.path.isfile(
                    os.path.join(chapter, "1.jpg")
                ):
                    self._tear_down_chapter(chapter)
        print("=" * 75)

    def _tear_down_chapter(self, name):
        abs_path = os.path.abspath(name)
        print("### Removing: ", abs_path)
        shutil.rmtree(abs_path)

    def clean_scraper(self):
        print("### Something went wrong. Cleaning scraper...")
        self.session.close()
        self.scraper.close()
        self.session = req.session()
        self.scraper = cloudscraper.create_scraper(sess=self.session)
    def collect(self):
        """Collect all chapters and their images for every configured variant

        :return: None
        """
        tos = self.options["to"]
        self._collect(tos, Variant.TO)
        mas = self.options["ma"]
        self._collect(mas, Variant.MA)
        mts = self.options["mt"]
        self._collect(mts, Variant.MT)

    def _collect(self, options, variant):
        """Collect every URL configured for a single variant

        :return: None
        """
        print("=" * 75)
        urls = options["urls"]
        if len(urls) == 0:
            print("- No URLs in:", variant)
            print("=" * 75)
            return
        print(f"# Found {len(urls)} URLs in {variant}...")
        print("# Opted for simple collection. CPU count:", os.cpu_count())
        for url in urls:
            print("# Initializing sweeper...")
            self.sweeper = SweeperFactory(
                main_url=url,
                dry_run=self.dry_run,
                filters=options["filter"],
                reverse=self.reverse,
                start_from=self.start_from,
                use_proxies=self.use_proxies,
            ).create_sweeper(variant)
            print("# Starting up the sweeper...")
            self.sweeper.init()
            print("# Sweeping...")
            self.sweeper.sweep(save_chapter=self._save_chapter)
            print("# Stopping sweeper...")
            self.sweeper.stop()
            # print("# Saving collection...")
            # self._save_collection()
        print("=" * 75)
    def pack(self):
        self.packer.pack_all(self.collection_path)

    def pack_collections(self):
        self.packer.pack_collections(self.TMP_COLLECTIONS_DIR)

    def clean(self):
        if self.clean_after:
            self._tear_down_collection()

    def close(self):
        if self.scraper is not None:
            self.scraper.close()
        if self.sweeper is not None:
            self.sweeper.close()
    def _save_collection(self):
        """Saves all chapters

        :return: None
        """
        print("=" * 75)
        print("## Saving chapters...")
        # create collection dir
        self.collection_path = self.sweeper.get_name_path(self.TMP_COLLECTIONS_DIR)
        os.makedirs(self.collection_path, exist_ok=True)
        if not self.parallel:
            print("## Opted for simple downloads. CPU count:", os.cpu_count())
            for ch_name, _ in tqdm(
                self.sweeper.chapters.items(), desc="## Saving chapters"
            ):
                self._save_chapter(ch_name, self.sweeper.get_chapter_imgs(ch_name))
        else:
            print("## Opted for parallel downloads. CPU count:", os.cpu_count())
            # leave one core free, but never ask for fewer than one worker
            workers = max(1, (os.cpu_count() or 2) - 1)
            with ThreadPoolExecutor(max_workers=workers) as executor:
                futures = [
                    executor.submit(
                        self._save_chapter,
                        ch_name,
                        self.sweeper.get_chapter_imgs(ch_name),
                    )
                    for ch_name, _ in self.sweeper.chapter_imgs.items()
                ]
                for future in as_completed(futures):
                    # re-raise any exception that occurred in a worker thread
                    future.result()
        print("=" * 75)
    def _save_chapter(self, chapter_name, imgs):
        """Saves chapter

        :return: None
        """
        print("=" * 75)
        print(f"## Saving chapter: {chapter_name} ...")
        # create dirs for imgs
        col_dir = os.path.join(self.TMP_COLLECTIONS_DIR, self.sweeper.name)
        chapter_dir = os.path.join(col_dir, chapter_name)
        os.makedirs(chapter_dir, exist_ok=True)
        # download images
        for img in tqdm(imgs, desc=f"### {chapter_name}", ascii=True):
            img_name, img_url = img
            img_path = os.path.join(chapter_dir, img_name)
            if os.path.exists(img_path):
                continue
            self._save_img(img_url=img_url, img_path=img_path)
        # pack the chapter
        self.pack_collections()
    def _save_img(self, img_url, img_path):
        """Download a single image, retrying with a fresh scraper when a batch fails."""
        ok = False
        for _ in range(0, 10):
            for _ in range(0, 10):
                # sleep randomly so that we mask network behavior & retry
                time.sleep(random.uniform(0.2, 1))
                try:
                    r = self.scraper.get(img_url, stream=True, timeout=(60, 60))
                except req.RequestException:
                    # transient network error; retry on the next iteration
                    continue
                if r.status_code == 200:
                    with open(img_path, "wb") as f:
                        for chunk in r.iter_content(1024):
                            f.write(chunk)
                    ok = True
                    break
            if ok:
                break
            else:
                print("!!! ERROR downloading IMG: ", img_url)
                self.clean_scraper()
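# ---------------------------------------------------------------------------
# Example usage -- a minimal sketch, not part of the original module.
# The real shape of `options` is defined elsewhere in the project; the keys
# below only mirror what this file reads (a top-level "referer" plus one
# entry per variant -- "to", "ma", "mt" -- each holding "urls" and "filter").
# The placeholder values (empty URL lists, None filter, start_from=None) are
# assumptions for illustration only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_options = {
        "referer": "",
        "to": {"urls": [], "filter": None},
        "ma": {"urls": [], "filter": None},
        "mt": {"urls": [], "filter": None},
    }
    collector = Collector(
        example_options,
        dry_run=True,      # only print, do not download
        clean=False,
        parallel=False,
        reverse=False,
        start_from=None,
    )
    try:
        collector.collect()
        collector.pack_collections()
    finally:
        collector.clean()
        collector.close()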