From a58b7f7ef59663e4144328527cfe468a5ef0b959 Mon Sep 17 00:00:00 2001 From: Alexander Densley Date: Wed, 4 Dec 2024 20:32:37 -0700 Subject: [PATCH] add image_density and image_height to python package --- py_zerox/pyzerox/core/zerox.py | 5 ++++- py_zerox/pyzerox/processor/pdf.py | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index c17bba8..6b7c8dc 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -7,6 +7,7 @@ import aiofiles import aiofiles.os as async_os import asyncio +from ..constants import PDFConversionDefaultOptions # Package Imports from ..processor import ( @@ -26,6 +27,8 @@ async def zerox( cleanup: bool = True, concurrency: int = 10, file_path: Optional[str] = "", + image_density: int = PDFConversionDefaultOptions.DPI, + image_height: tuple[Optional[int], int] = PDFConversionDefaultOptions.SIZE, maintain_format: bool = False, model: str = "gpt-4o-mini", output_dir: Optional[str] = None, @@ -130,7 +133,7 @@ async def zerox( **subset_pdf_create_kwargs) # Convert the file to a series of images, below function returns a list of image paths in page order - images = await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory) + images = await convert_pdf_to_images(image_density=image_density, image_height=image_height, local_path=local_path, temp_dir=temp_directory) if maintain_format: for image in images: diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index c3b3fa6..af36629 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -11,14 +11,14 @@ from ..models import litellmmodel -async def convert_pdf_to_images(local_path: str, temp_dir: str) -> List[str]: +async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str) -> List[str]: """Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order.""" options = { "pdf_path": local_path, "output_folder": temp_dir, - "dpi": PDFConversionDefaultOptions.DPI, + "dpi": image_density, "fmt": PDFConversionDefaultOptions.FORMAT, - "size": PDFConversionDefaultOptions.SIZE, + "size": image_height, "thread_count": PDFConversionDefaultOptions.THREAD_COUNT, "use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO, "paths_only": True,