diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index c17bba8..6b7c8dc 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -7,6 +7,7 @@ import aiofiles import aiofiles.os as async_os import asyncio +from ..constants import PDFConversionDefaultOptions # Package Imports from ..processor import ( @@ -26,6 +27,8 @@ async def zerox( cleanup: bool = True, concurrency: int = 10, file_path: Optional[str] = "", + image_density: int = PDFConversionDefaultOptions.DPI, + image_height: tuple[Optional[int], int] = PDFConversionDefaultOptions.SIZE, maintain_format: bool = False, model: str = "gpt-4o-mini", output_dir: Optional[str] = None, @@ -130,7 +133,7 @@ async def zerox( **subset_pdf_create_kwargs) # Convert the file to a series of images, below function returns a list of image paths in page order - images = await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory) + images = await convert_pdf_to_images(image_density=image_density, image_height=image_height, local_path=local_path, temp_dir=temp_directory) if maintain_format: for image in images: diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index c3b3fa6..af36629 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -11,14 +11,14 @@ from ..models import litellmmodel -async def convert_pdf_to_images(local_path: str, temp_dir: str) -> List[str]: +async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str) -> List[str]: """Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order.""" options = { "pdf_path": local_path, "output_folder": temp_dir, - "dpi": PDFConversionDefaultOptions.DPI, + "dpi": image_density, "fmt": PDFConversionDefaultOptions.FORMAT, - "size": PDFConversionDefaultOptions.SIZE, + "size": image_height, "thread_count": PDFConversionDefaultOptions.THREAD_COUNT, "use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO, "paths_only": True,