Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make downloading of books multithreaded, controlled via -j/--jobs #97

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ You can download to an absolute path, say `C:/ebooks/springer/`
```bash
python3 main.py -f C:/ebooks/springer/
```
Parallel downloading (for example, with 3 jobs) can be enabled with
```bash
python3 main.py -j 3
```

### Download all German books (PDF and EPUB)
To download the German books use
Expand Down
140 changes: 98 additions & 42 deletions helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
# "Queue" was renamed to "queue" in Python 3; fall back for Python 2.
# Catch only ImportError -- a bare except would also swallow
# KeyboardInterrupt/SystemExit raised during interpreter startup.
try:
    import queue
except ImportError:
    import Queue as queue
import threading
import tempfile


BOOK_TITLE = 'Book Title'
Expand Down Expand Up @@ -38,19 +44,19 @@ def print_invalid_categories(invalid_categories):
# Remove duplicates
invalid_categories = series[~series.duplicated()]
s = 'categories' if len(invalid_categories) > 1 else 'category'
print("The following invalid book {} will be ignored:".format(s))
tqdm.write("The following invalid book {} will be ignored:".format(s))
for i, name in enumerate(invalid_categories):
print(" {}. {}".format((i + 1), name))
print('')
tqdm.write(" {}. {}".format((i + 1), name))
tqdm.write('')


def print_summary(books, invalid_categories, args):
    """Report which titles will be downloaded and which categories were dropped."""
    # Lift pandas' row cap so a verbose listing is never truncated.
    pd.set_option('display.max_rows', None)
    if args.verbose:
        # tqdm.write keeps output from clobbering any active progress bars.
        listing = books.loc[:, (BOOK_TITLE, CATEGORY)]
        tqdm.write(str(listing))
        count = len(books.index)
        tqdm.write("\n{} titles ready to be downloaded...".format(count))
    print_invalid_categories(invalid_categories)


Expand All @@ -76,37 +82,78 @@ def indices_of_categories(categories, books):
return books.index[t].tolist(), invalid_categories


def download_book(request, output_file, patch, jobnum):
    """Stream one book file (PDF or EPUB, per *patch*) to *output_file*.

    Downloads into a temp file first and atomically moves it into place so a
    partial download never masquerades as a finished book.  Polls the
    cooperative-cancellation flag (``cancelled``) set on the current thread
    by the worker machinery and aborts cleanly when it is raised.

    request     -- a completed requests.Response for the book page; only its
                   ``.url`` is used to derive the actual download URL
    output_file -- final destination path; skipped if it already exists
    patch       -- dict with 'url' (path replacement) and 'ext' (file suffix)
    jobnum      -- 1-based worker index, used to position this job's tqdm bar
    """
    new_url = request.url.replace('%2F', '/').replace('/book/', patch['url']) + patch['ext']
    t = threading.current_thread()
    with requests.get(new_url, stream=True) as req:
        if req.status_code == 200 and not os.path.exists(output_file):
            path = create_path('./tmp')
            # Content-Length may be absent (chunked transfer); default to 0
            # so tqdm shows an unbounded bar instead of raising KeyError.
            file_size = int(req.headers.get('Content-Length', 0))
            chunk_size = 1024
            num_bars = file_size // chunk_size
            tmp_file = None
            try:
                with tempfile.NamedTemporaryFile(dir=path, mode='wb',
                                                 delete=False) as out_file:
                    tmp_file = out_file.name
                    for chunk in tqdm(req.iter_content(chunk_size=chunk_size),
                                      total=num_bars, unit='KB',
                                      desc='Job {}: {}'.format(
                                          jobnum, os.path.basename(output_file)),
                                      leave=False, position=jobnum):
                        if t.cancelled:
                            # Close before unlink so removal works on Windows.
                            out_file.close()
                            os.unlink(tmp_file)
                            return
                        out_file.write(chunk)
                shutil.move(tmp_file, output_file)
            except Exception:
                # Don't leave a stale temp file behind on a failed download.
                if tmp_file is not None and os.path.exists(tmp_file):
                    os.unlink(tmp_file)
                raise


def make_worker(items_queue, progress, jobnum):
    """Build a parameterized worker function for one download thread.

    items_queue -- queue.Queue of dicts (folder, name, title, patch, url)
    progress    -- overall tqdm progress bar, bumped once per finished item
    jobnum      -- 1-based index of this worker, used for its tqdm bar slot
    """
    def worker():
        """Drain *items_queue*, downloading each queued book file."""
        t = threading.current_thread()
        t.cancelled = False  # polled by download_book for cooperative abort
        request = None  # book-page response, reused across patches of a book
        title = ''  # last title seen, for error reporting
        while True:
            try:
                if t.cancelled:
                    break
                item = items_queue.get(True, 0.1)
                dest_folder = item['folder']
                bookname = item['name']
                title = item['title']
                patch = item['patch']
                url = item['url']
                output_file = get_book_path_if_new(dest_folder, bookname, patch)
                if output_file is not None:
                    request = requests.get(url) if request is None else request
                    download_book(request, output_file, patch, jobnum)
            except (OSError, IOError) as e:
                # tqdm.write requires a str; passing the exception object
                # itself would raise a TypeError and kill the worker.
                tqdm.write(str(e))
                title = title.encode('ascii', 'ignore').decode('ascii')
                tqdm.write('* Problem downloading: {}, so skipping it.'
                           .format(title))
                request = None  # enforce a fresh GET for the next item
                # then continue to download the next book
            except queue.Empty:
                # Queue stayed empty for the 0.1 s timeout -- assume the
                # producer is done and let this thread exit.
                return
            items_queue.task_done()
            if not t.cancelled:
                progress.update(1)
    return worker


def download_books(books, folder, patches, jobs):
assert MAX_FILENAME_LEN >= MIN_FILENAME_LEN, \
'Please change MAX_FILENAME_LEN to a value greater than {}'.format(
MIN_FILENAME_LEN
)
max_length = get_max_filename_length(folder)
longest_name = books[CATEGORY].map(len).max()
if max_length - longest_name < MIN_FILENAME_LEN:
print('The download directory path is too lengthy:')
print('{}'.format(os.path.abspath(folder)))
print('Please choose a shorter one')
tqdm.write('The download directory path is too lengthy:')
tqdm.write('{}'.format(os.path.abspath(folder)))
tqdm.write('Please choose a shorter one')
exit(-1)
books = books[
[
Expand All @@ -118,28 +165,37 @@ def download_books(books, folder, patches):
'English Package Name'
]
]
for url, title, author, edition, isbn, category in tqdm(books.values, desc='Overall Progress'):
pbar = tqdm(total=len(books.values)*len(patches), desc='Overall Progress', leave=True, position=0)
q = queue.Queue()
threads = []
for i in range(jobs):
t = threading.Thread(target=make_worker(q, pbar, i+1))
t.daemon = True
t.start()
threads.append(t)
for url, title, author, edition, isbn, category in books.values:
dest_folder = create_path(os.path.join(folder, category))
length = max_length - len(category) - 2
if length > MAX_FILENAME_LEN:
length = MAX_FILENAME_LEN
bookname = compose_bookname(title, author, edition, isbn, length)
request = None
for patch in patches:
try:
output_file = get_book_path_if_new(dest_folder, bookname, patch)
if output_file is not None:
request = requests.get(url) if request is None else request
download_book(request, output_file, patch)
except (OSError, IOError) as e:
print(e)
title = title.encode('ascii', 'ignore').decode('ascii')
print('* Problem downloading: {}, so skipping it.'
.format(title))
time.sleep(30)
request = None # Enforce new get request
# then continue to download the next book

q.put(dict(folder=dest_folder, name=bookname, patch=patch, title=title, url=url))
try:
while True:
if not q.empty():
time.sleep(0.1)
else:
break
except KeyboardInterrupt:
for t in threads:
t.cancelled = True
for t in threads:
t.join()
raise
finally:
pbar.close()
q.join()

replacements = {'/':'-', '\\':'-', ':':'-', '*':'', '>':'', '<':'', '?':'', \
'|':'', '"':''}
Expand Down
7 changes: 6 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,18 @@
'-i','--index', nargs='+', dest='book_index',
help='list of book indices to download'
)
parser.add_argument(
'-j', '--jobs', dest='jobs', type=int, default=1,
help='number of parallel download jobs'
)
parser.add_argument(
'-v','--verbose', action='store_true', help='show more details'
)

args = parser.parse_args()
folder = create_path(args.folder if args.folder else './downloads')

assert args.jobs > 0, '-j or --jobs must be > 0'
assert args.language in ('en', 'de'), '-l or --language must be "en" or "de"'
if args.language == 'en':
table_url = 'https://resource-cms.springernature.com/springer-cms/rest/v1/content/17858272/data/'
Expand Down Expand Up @@ -85,6 +90,6 @@
books = filter_books(books, sorted(indices))
books.index = [i + 2 for i in books.index] # Recorrect indices
print_summary(books, invalid_categories, args)
download_books(books, folder, patches)
download_books(books, folder, patches, args.jobs)

print('\nFinish downloading.')