Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make downloading of books multithreaded, controlled via -j/--jobs #97

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ You can download to an absolute path, say `C:/ebooks/springer/`
```bash
python3 main.py -f C:/ebooks/springer/
```
Parallel downloading (for example, with 3 jobs) can be enabled with
```bash
python3 main.py -j 3
```

### Download all German books (PDF and EPUB)
To download the German books use
Expand Down
140 changes: 98 additions & 42 deletions helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
# "Queue" was renamed to "queue" in Python 3; fall back for Python 2.
# Catch only ImportError -- a bare except would also swallow
# KeyboardInterrupt/SystemExit raised during interpreter startup.
try:
    import queue
except ImportError:
    import Queue as queue
import threading
import tempfile


BOOK_TITLE = 'Book Title'
Expand Down Expand Up @@ -38,19 +44,19 @@ def print_invalid_categories(invalid_categories):
# Remove duplicates
invalid_categories = series[~series.duplicated()]
s = 'categories' if len(invalid_categories) > 1 else 'category'
print("The following invalid book {} will be ignored:".format(s))
tqdm.write("The following invalid book {} will be ignored:".format(s))
for i, name in enumerate(invalid_categories):
print(" {}. {}".format((i + 1), name))
print('')
tqdm.write(" {}. {}".format((i + 1), name))
tqdm.write('')


def print_summary(books, invalid_categories, args):
    """Report which titles will be downloaded and which categories were dropped."""
    # Lift pandas' row cap so a verbose listing is never truncated.
    pd.set_option('display.max_rows', None)
    if args.verbose:
        # tqdm.write keeps output from clobbering any active progress bars.
        listing = books.loc[:, (BOOK_TITLE, CATEGORY)]
        tqdm.write(str(listing))
        count = len(books.index)
        tqdm.write("\n{} titles ready to be downloaded...".format(count))
    print_invalid_categories(invalid_categories)


Expand All @@ -76,37 +82,78 @@ def indices_of_categories(categories, books):
return books.index[t].tolist(), invalid_categories


def download_book(request, output_file, patch, jobnum):
    """Stream one book file (PDF or EPUB, per *patch*) to *output_file*.

    Downloads into a temp file first and atomically moves it into place so a
    partial download never masquerades as a finished book.  Polls the
    cooperative-cancellation flag (``cancelled``) set on the current thread
    by the worker machinery and aborts cleanly when it is raised.

    request     -- a completed requests.Response for the book page; only its
                   ``.url`` is used to derive the actual download URL
    output_file -- final destination path; skipped if it already exists
    patch       -- dict with 'url' (path replacement) and 'ext' (file suffix)
    jobnum      -- 1-based worker index, used to position this job's tqdm bar
    """
    new_url = request.url.replace('%2F', '/').replace('/book/', patch['url']) + patch['ext']
    t = threading.current_thread()
    with requests.get(new_url, stream=True) as req:
        if req.status_code == 200 and not os.path.exists(output_file):
            path = create_path('./tmp')
            # Content-Length may be absent (chunked transfer); default to 0
            # so tqdm shows an unbounded bar instead of raising KeyError.
            file_size = int(req.headers.get('Content-Length', 0))
            chunk_size = 1024
            num_bars = file_size // chunk_size
            tmp_file = None
            try:
                with tempfile.NamedTemporaryFile(dir=path, mode='wb',
                                                 delete=False) as out_file:
                    tmp_file = out_file.name
                    for chunk in tqdm(req.iter_content(chunk_size=chunk_size),
                                      total=num_bars, unit='KB',
                                      desc='Job {}: {}'.format(
                                          jobnum, os.path.basename(output_file)),
                                      leave=False, position=jobnum):
                        if t.cancelled:
                            # Close before unlink so removal works on Windows.
                            out_file.close()
                            os.unlink(tmp_file)
                            return
                        out_file.write(chunk)
                shutil.move(tmp_file, output_file)
            except Exception:
                # Don't leave a stale temp file behind on a failed download.
                if tmp_file is not None and os.path.exists(tmp_file):
                    os.unlink(tmp_file)
                raise


def make_worker(items_queue, progress, jobnum):
    """Build a parameterized worker function for one download thread.

    items_queue -- queue.Queue of dicts (folder, name, title, patch, url)
    progress    -- overall tqdm progress bar, bumped once per finished item
    jobnum      -- 1-based index of this worker, used for its tqdm bar slot
    """
    def worker():
        """Drain *items_queue*, downloading each queued book file."""
        t = threading.current_thread()
        t.cancelled = False  # polled by download_book for cooperative abort
        request = None  # book-page response, reused across patches of a book
        title = ''  # last title seen, for error reporting
        while True:
            try:
                if t.cancelled:
                    break
                item = items_queue.get(True, 0.1)
                dest_folder = item['folder']
                bookname = item['name']
                title = item['title']
                patch = item['patch']
                url = item['url']
                output_file = get_book_path_if_new(dest_folder, bookname, patch)
                if output_file is not None:
                    request = requests.get(url) if request is None else request
                    download_book(request, output_file, patch, jobnum)
            except (OSError, IOError) as e:
                # tqdm.write requires a str; passing the exception object
                # itself would raise a TypeError and kill the worker.
                tqdm.write(str(e))
                title = title.encode('ascii', 'ignore').decode('ascii')
                tqdm.write('* Problem downloading: {}, so skipping it.'
                           .format(title))
                request = None  # enforce a fresh GET for the next item
                # then continue to download the next book
            except queue.Empty:
                # Queue stayed empty for the 0.1 s timeout -- assume the
                # producer is done and let this thread exit.
                return
            items_queue.task_done()
            if not t.cancelled:
                progress.update(1)
    return worker


def download_books(books, folder, patches, jobs):
assert MAX_FILENAME_LEN >= MIN_FILENAME_LEN, \
'Please change MAX_FILENAME_LEN to a value greater than {}'.format(
MIN_FILENAME_LEN
)
max_length = get_max_filename_length(folder)
longest_name = books[CATEGORY].map(len).max()
if max_length - longest_name < MIN_FILENAME_LEN:
print('The download directory path is too lengthy:')
print('{}'.format(os.path.abspath(folder)))
print('Please choose a shorter one')
tqdm.write('The download directory path is too lengthy:')
tqdm.write('{}'.format(os.path.abspath(folder)))
tqdm.write('Please choose a shorter one')
exit(-1)
books = books[
[
Expand All @@ -118,28 +165,37 @@ def download_books(books, folder, patches):
'English Package Name'
]
]
for url, title, author, edition, isbn, category in tqdm(books.values, desc='Overall Progress'):
pbar = tqdm(total=len(books.values)*len(patches), desc='Overall Progress', leave=True, position=0)
q = queue.Queue()
threads = []
for i in range(jobs):
t = threading.Thread(target=make_worker(q, pbar, i+1))
t.daemon = True
t.start()
threads.append(t)
for url, title, author, edition, isbn, category in books.values:
dest_folder = create_path(os.path.join(folder, category))
length = max_length - len(category) - 2
if length > MAX_FILENAME_LEN:
length = MAX_FILENAME_LEN
bookname = compose_bookname(title, author, edition, isbn, length)
request = None
for patch in patches:
try:
output_file = get_book_path_if_new(dest_folder, bookname, patch)
if output_file is not None:
request = requests.get(url) if request is None else request
download_book(request, output_file, patch)
except (OSError, IOError) as e:
print(e)
title = title.encode('ascii', 'ignore').decode('ascii')
print('* Problem downloading: {}, so skipping it.'
.format(title))
time.sleep(30)
request = None # Enforce new get request
# then continue to download the next book

q.put(dict(folder=dest_folder, name=bookname, patch=patch, title=title, url=url))
try:
while True:
if not q.empty():
time.sleep(0.1)
else:
break
except KeyboardInterrupt:
for t in threads:
t.cancelled = True
for t in threads:
t.join()
raise
finally:
pbar.close()
q.join()

replacements = {'/':'-', '\\':'-', ':':'-', '*':'', '>':'', '<':'', '?':'', \
'|':'', '"':''}
Expand Down
7 changes: 6 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,18 @@
'-i','--index', nargs='+', dest='book_index',
help='list of book indices to download'
)
parser.add_argument(
'-j', '--jobs', dest='jobs', type=int, default=1,
help='number of parallel download jobs'
)
parser.add_argument(
'-v','--verbose', action='store_true', help='show more details'
)

args = parser.parse_args()
folder = create_path(args.folder if args.folder else './downloads')

assert args.jobs > 0, '-j or --jobs must be > 0'
assert args.language in ('en', 'de'), '-l or --language must be "en" or "de"'
if args.language == 'en':
table_url = 'https://resource-cms.springernature.com/springer-cms/rest/v1/content/17858272/data/'
Expand Down Expand Up @@ -85,6 +90,6 @@
books = filter_books(books, sorted(indices))
books.index = [i + 2 for i in books.index] # Recorrect indices
print_summary(books, invalid_categories, args)
download_books(books, folder, patches)
download_books(books, folder, patches, args.jobs)

print('\nFinish downloading.')