[IMP] add typing to handlers, update docstrings and pdfminer url

py-pdf · Nov 2, 2024 · a210021 · a210021
1 parent c2266cb
commit a210021
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 14 deletions.
diff --git a/camelot/cli.py b/camelot/cli.py
@@ -38,7 +38,9 @@ def set_config(self, key, value):
 
 @click.group(name="camelot")
 @click.version_option(version=__version__)
-@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
+@click.option(
+    "-q", "--quiet", is_flag=False, default=False, help="Suppress logs and warnings."
+)
 @click.option(
     "-p",
     "--pages",

diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -6,6 +6,7 @@
 import os
 import sys
 from pathlib import Path
+from typing import Any
 
 from pypdf import PdfReader
 from pypdf import PdfWriter
@@ -119,15 +120,17 @@ def _get_pages(self, pages):
             result.extend(range(p["start"], p["end"] + 1))
         return sorted(set(result))
 
-    def _save_page(self, filepath: StrByteType | Path, page, temp):
+    def _save_page(self, filepath: StrByteType | Path, page: int, temp: str):
         """Saves specified page from PDF into a temporary directory.
 
         Parameters
         ----------
+        filepath : str
+            Filepath or URL of the PDF file.
         page : int
             Page number.
-        layout_kwargs : dict, optional (default: {})
-            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
+        temp : str
+            Tmp directory.
 
 
         Returns
@@ -178,10 +181,10 @@ def _save_page(self, filepath: StrByteType | Path, page, temp):
 
     def parse(
         self,
-        flavor="lattice",
-        suppress_stdout=False,
-        parallel=False,
-        layout_kwargs=None,
+        flavor: str = "lattice",
+        suppress_stdout: bool = False,
+        parallel: bool = False,
+        layout_kwargs: dict[str, Any] | None = None,
         **kwargs,
     ):
         """Extract tables by calling parser.get_tables on all single page PDFs.
@@ -197,7 +200,7 @@ def parse(
             Process pages in parallel using all available cpu cores.
         layout_kwargs : dict, optional (default: {})
             A dict of `pdfminer.layout.LAParams
-            <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+            <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.
         kwargs : dict
             See camelot.read_pdf kwargs.
 
@@ -241,19 +244,21 @@ def parse(
 
         return TableList(sorted(tables))
 
-    def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
+    def _parse_page(
+        self, page: int, tempdir: str, parser, suppress_stdout: bool, layout_kwargs
+    ):
         """Extract tables by calling parser.get_tables on a single page PDF.
 
         Parameters
         ----------
-        page : str
+        page : int
             Page number to parse
         parser : Lattice, Stream, Network or Hybrid
             The parser to use.
         suppress_stdout : bool
             Suppress logs and warnings.
         layout_kwargs : dict, optional (default: {})
-            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+            A dict of `pdfminer.layout.LAParams <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.
 
         Returns
         -------

diff --git a/camelot/io.py b/camelot/io.py
@@ -46,7 +46,7 @@ def read_pdf(
         Process pages in parallel using all available cpu cores.
     layout_kwargs : dict, optional (default: {})
         A dict of `pdfminer.layout.LAParams
-        <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+        <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.
     table_areas : list, optional (default: None)
         List of table area strings of the form x1,y1,x2,y2
         where (x1, y1) -> left-top and (x2, y2) -> right-bottom

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -15,7 +15,7 @@
 def get_text_from_pdf(filename):
     """Method to extract text object from pdf."""
     # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
-    # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
+    # https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html
     document = open(filename, "rb")
     # Create resource manager
     rsrcmgr = PDFResourceManager()