From a2100216c10dcee6a97b419b2b4c92cdbb578be5 Mon Sep 17 00:00:00 2001 From: bosd Date: Sat, 2 Nov 2024 11:10:09 +0100 Subject: [PATCH] [IMP] add typing to handlers, update docstrings and pdfminer url --- camelot/cli.py | 4 +++- camelot/handlers.py | 27 ++++++++++++++++----------- camelot/io.py | 2 +- tests/test_utils.py | 2 +- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index 8ff57135..6c0e2a1a 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -38,7 +38,9 @@ def set_config(self, key, value): @click.group(name="camelot") @click.version_option(version=__version__) -@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.") +@click.option( + "-q", "--quiet", is_flag=False, default=False, help="Suppress logs and warnings." +) @click.option( "-p", "--pages", diff --git a/camelot/handlers.py b/camelot/handlers.py index 7d76f960..a59ec38b 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -6,6 +6,7 @@ import os import sys from pathlib import Path +from typing import Any from pypdf import PdfReader from pypdf import PdfWriter @@ -119,15 +120,17 @@ def _get_pages(self, pages): result.extend(range(p["start"], p["end"] + 1)) return sorted(set(result)) - def _save_page(self, filepath: StrByteType | Path, page, temp): + def _save_page(self, filepath: StrByteType | Path, page: int, temp: str): """Saves specified page from PDF into a temporary directory. Parameters ---------- + filepath : str + Filepath or URL of the PDF file. page : int Page number. - layout_kwargs : dict, optional (default: {}) - A dict of `pdfminer.layout.LAParams `_ kwargs. # noqa + temp : str + Tmp directory. 
Returns @@ -178,10 +181,10 @@ def _save_page(self, filepath: StrByteType | Path, page, temp): def parse( self, - flavor="lattice", - suppress_stdout=False, - parallel=False, - layout_kwargs=None, + flavor: str = "lattice", + suppress_stdout: bool = False, + parallel: bool = False, + layout_kwargs: dict[str, Any] | None = None, **kwargs, ): """Extract tables by calling parser.get_tables on all single page PDFs. @@ -197,7 +200,7 @@ def parse( Process pages in parallel using all available cpu cores. layout_kwargs : dict, optional (default: {}) A dict of `pdfminer.layout.LAParams - `_ kwargs. + `_ kwargs. kwargs : dict See camelot.read_pdf kwargs. @@ -241,19 +244,21 @@ def parse( return TableList(sorted(tables)) - def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs): + def _parse_page( + self, page: int, tempdir: str, parser, suppress_stdout: bool, layout_kwargs + ): """Extract tables by calling parser.get_tables on a single page PDF. Parameters ---------- - page : str + page : int Page number to parse parser : Lattice, Stream, Network or Hybrid The parser to use. suppress_stdout : bool Suppress logs and warnings. layout_kwargs : dict, optional (default: {}) - A dict of `pdfminer.layout.LAParams `_ kwargs. + A dict of `pdfminer.layout.LAParams `_ kwargs. Returns ------- diff --git a/camelot/io.py b/camelot/io.py index 931688cd..6bef46a1 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -46,7 +46,7 @@ def read_pdf( Process pages in parallel using all available cpu cores. layout_kwargs : dict, optional (default: {}) A dict of `pdfminer.layout.LAParams - `_ kwargs. + `_ kwargs. 
table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom diff --git a/tests/test_utils.py b/tests/test_utils.py index 95ff6166..85ae6005 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -15,7 +15,7 @@ def get_text_from_pdf(filename): """Method to extract text object from pdf.""" # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file - # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis + # https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html document = open(filename, "rb") # Create resource manager rsrcmgr = PDFResourceManager()