Merge pull request #1 from mattbriggs/multithread
Multithread update
mattbriggs authored Jan 17, 2020
2 parents 44835cb + 1b391ec commit 8635c31
Showing 3 changed files with 71 additions and 22 deletions.
11 changes: 9 additions & 2 deletions README.md
@@ -2,7 +2,8 @@
 
 `pdf_link_check.py` checks the hyperlinks in an Portable Document Format (PDF) file. The script is a command line app.
 
-Release: V1.0.0 2020.1.14
+Release: V1.1.0 2020.1.17
+
 
## Install dependencies

@@ -35,10 +36,16 @@ The script requires the following dependencies:
 
 - Python module: CSV
 
-Part of the Python core packages. No need to install with PIP. CSV standars for comma separated value.
+Part of the Python core packages. No need to install with PIP. CSV stands for comma separated value.
 
 For more information, see [CSV File Reading and Writing](https://docs.python.org/3/library/csv.html)
 
+- Python module: Threading
+
+Part of the Python core packages. No need to install with PIP.
+
+For more information, see [threading — Thread-based parallelism](https://docs.python.org/3.6/library/threading.html)
+
 
 ## Use `pdf_link_check.py`
80 changes: 61 additions & 19 deletions pdf_link_check.py
@@ -3,34 +3,49 @@
 PDFLinkCheck.py checks the hyperlinks in an Portable Document Format (PDF)
 file.
-Release V1.0.0 2020.1.13
+Release V1.1.0 2020.1.17
 '''
 
 import csv
+import threading
 import PyPDF2 as pypdf
 import requests
 
+def get_split(numtosplit):
+    '''Split a number into four equal(ish) sections. Number of pages must be greater
+    than 13.'''
+    if numtosplit > 13:
+        sections = []
+        breaksize = int(numtosplit/4)
+        sec1_start = 0
+        sec1_end = breaksize
+        sec2_start = breaksize + 1
+        sec2_end = breaksize * 2
+        sec3_start = sec2_end + 1
+        sec3_end = breaksize * 3
+        sec4_start = sec3_end + 1
+        sec4_end = numtosplit
 
-def main():
-    '''Main logic of the script:
-    - Get input PDF and output CSV location.
-    - Extract pages from the PDF.
-    - For each page, find annotations, and URIs in the annotations.
-    - Get the URIs.
-    - For each URI try to make a web request and get the response code.
-    - Record the page number, URI, and response code result or NA for timeouts.
-    - Save the report.
-    '''
+        sections = [(sec1_start, sec1_end),
+                    (sec2_start, sec2_end),
+                    (sec3_start, sec3_end),
+                    (sec4_start, sec4_end)]
 
-    print("Starting")
-    pdf_file = input("Add PDF file > ")
-    report_out = input("Save Report (CSV) > ")
-    pdf = pypdf.PdfFileReader(pdf_file)
-    pages = pdf.numPages
+        return sections
 
-    link_report = [["page", "url", "status", "request-error"]]
+    raise ValueError("Number too small to split into four sections.")
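
Pulled out of the diff, the new splitting helper can be run on its own. A minimal sketch (condensed from the hunk above; `numtosplit // 4` mirrors the original's `int(numtosplit/4)`):

```python
def get_split(numtosplit):
    """Split a page count into four roughly equal (start, end) ranges.

    Mirrors the helper added in this commit: counts of 13 or fewer are
    rejected, and each pair is meant to feed range(start, end).
    """
    if numtosplit > 13:
        breaksize = numtosplit // 4  # same as int(numtosplit / 4)
        return [(0, breaksize),
                (breaksize + 1, breaksize * 2),
                (breaksize * 2 + 1, breaksize * 3),
                (breaksize * 3 + 1, numtosplit)]
    raise ValueError("Number too small to split into four sections.")

print(get_split(100))  # [(0, 25), (26, 50), (51, 75), (76, 100)]
```

Note that because `range(start, end)` stops before `end`, the boundary indices themselves (25, 50, and 75 in this example) fall between sections.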


+def get_links_from_page(indexstart, indexend, reportlist, pdf):
+    ''' - Extract pages from the PDF using the incoming range.
+    - For each page, find annotations, and URIs in the annotations.
+    - Get the URIs.
+    - For each URI try to make a web request and get the response code.
+    - Record the page number, URI, and response code result or NA for
+    timeouts.
+    '''
 
-    for i in range(pages):
+    for i in range(indexstart, indexend):
         page_obj = pdf.getPage(i)
         page_no = i + 1
         annots = page_obj["/Annots"]
@@ -51,7 +66,34 @@ def main():
                 request_error = str(e)
             print("{} : {} : {}".format(page_no, raw_url, code))
             record = [page_no, raw_url, code, request_error]
-            link_report.append(record)
+            reportlist.append(record)
+    return reportlist

+def main():
+    '''Main logic of the script:
+    - Get input PDF and output CSV location.
+    - Get the number of pages, and split into four equal sections
+    - Get the range for each section, and send each section range to the parser
+    running its own thread.
+    - Save the report.
+    '''
+
+    print("Starting")
+    pdf_file = input("Add PDF file > ")
+    report_out = input("Save Report (CSV) > ")
+    pdf = pypdf.PdfFileReader(pdf_file)
+    pages = pdf.numPages
+    link_report = [["page", "uri", "status", "request-error"]]
+    if pages < 13:
+        link_report.append(get_links_from_page(0, pages, pdf))
+    else:
+        split = get_split(pages)
+        threads = []
+        for i in range(4):
+            th = threading.Thread(target=get_links_from_page, args=(split[i][0], split[i][1], link_report, pdf))
+            th.start()
+            threads.append(th)
+        [th.join() for th in threads]
 
     # Generate CSV output
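
The fan-out pattern in the new `main()` — four threads appending to one shared list, then joined — can be sketched with a stand-in worker (`collect_squares` is illustrative, not part of the script; `list.append` is atomic under CPython's GIL, which is what makes the shared `link_report` safe):

```python
import threading

def collect_squares(start, end, results):
    """Stand-in worker: record (i, i * i) for each i in [start, end)."""
    for i in range(start, end):
        results.append((i, i * i))

results = []
threads = []
for lo, hi in [(0, 5), (5, 10), (10, 15), (15, 20)]:
    th = threading.Thread(target=collect_squares, args=(lo, hi, results))
    th.start()              # launch the worker on its section
    threads.append(th)
for th in threads:
    th.join()               # wait for all four workers, as main() does

print(len(results))  # 20
```

Because the workers run concurrently, the order of records in `results` is nondeterministic; the script's CSV report inherits the same property.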

2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
 requests==2.20.0
-PyPDF2==1.26.0
+PyPDF2==1.26.0
