Skip to content

Commit

Permalink
[IMP] add typing to handlers, update docstrings and pdfminer url
Browse files Browse the repository at this point in the history
  • Loading branch information
bosd committed Nov 2, 2024
1 parent c2266cb commit a210021
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 14 deletions.
4 changes: 3 additions & 1 deletion camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ def set_config(self, key, value):

@click.group(name="camelot")
@click.version_option(version=__version__)
@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
@click.option(
"-q", "--quiet", is_flag=False, default=False, help="Suppress logs and warnings."
)
@click.option(
"-p",
"--pages",
Expand Down
27 changes: 16 additions & 11 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import sys
from pathlib import Path
from typing import Any

from pypdf import PdfReader
from pypdf import PdfWriter
Expand Down Expand Up @@ -119,15 +120,17 @@ def _get_pages(self, pages):
result.extend(range(p["start"], p["end"] + 1))
return sorted(set(result))

def _save_page(self, filepath: StrByteType | Path, page, temp):
def _save_page(self, filepath: StrByteType | Path, page: int, temp: str):
"""Saves specified page from PDF into a temporary directory.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
page : int
Page number.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. # noqa
temp : str
Tmp directory.
Returns
Expand Down Expand Up @@ -178,10 +181,10 @@ def _save_page(self, filepath: StrByteType | Path, page, temp):

def parse(
self,
flavor="lattice",
suppress_stdout=False,
parallel=False,
layout_kwargs=None,
flavor: str = "lattice",
suppress_stdout: bool = False,
parallel: bool = False,
layout_kwargs: dict[str, Any] | None = None,
**kwargs,
):
"""Extract tables by calling parser.get_tables on all single page PDFs.
Expand All @@ -197,7 +200,7 @@ def parse(
Process pages in parallel using all available cpu cores.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
<https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.
kwargs : dict
See camelot.read_pdf kwargs.
Expand Down Expand Up @@ -241,19 +244,21 @@ def parse(

return TableList(sorted(tables))

def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
def _parse_page(
self, page: int, tempdir: str, parser, suppress_stdout: bool, layout_kwargs
):
"""Extract tables by calling parser.get_tables on a single page PDF.
Parameters
----------
page : str
page : int
Page number to parse
parser : Lattice, Stream, Network or Hybrid
The parser to use.
suppress_stdout : bool
Suppress logs and warnings.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
A dict of `pdfminer.layout.LAParams <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.
Returns
-------
Expand Down
2 changes: 1 addition & 1 deletion camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def read_pdf(
Process pages in parallel using all available cpu cores.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
<https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
Expand Down
2 changes: 1 addition & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
def get_text_from_pdf(filename):
"""Method to extract text object from pdf."""
# https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
# https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
# https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html
document = open(filename, "rb")
# Create resource manager
rsrcmgr = PDFResourceManager()
Expand Down

0 comments on commit a210021

Please sign in to comment.