From 57f21f04083eb1316fd88ff7206f4ab85c08d403 Mon Sep 17 00:00:00 2001 From: Alexander Densley Date: Wed, 4 Dec 2024 18:31:34 -0700 Subject: [PATCH 1/8] add support for selecting density in node --- node-zerox/src/index.ts | 6 ++++-- node-zerox/src/utils.ts | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/node-zerox/src/index.ts b/node-zerox/src/index.ts index d7550d1..c3f79e7 100644 --- a/node-zerox/src/index.ts +++ b/node-zerox/src/index.ts @@ -23,6 +23,7 @@ export const zerox = async ({ cleanup = true, concurrency = 10, correctOrientation = true, + density = 300, filePath, llmParams = {}, maintainFormat = false, @@ -72,8 +73,8 @@ export const zerox = async ({ tempDir || os.tmpdir(), `zerox-temp-${rand}` ); - const sourceDirectory = path.join(tempDirectory, 'source') - const processedDirectory = path.join(tempDirectory, 'processed') + const sourceDirectory = path.join(tempDirectory, "source"); + const processedDirectory = path.join(tempDirectory, "processed"); await fs.ensureDir(sourceDirectory); await fs.ensureDir(processedDirectory); @@ -105,6 +106,7 @@ export const zerox = async ({ // Convert the file to a series of images await convertPdfToImages({ correctOrientation, + density, localPath: pdfPath, maxTesseractWorkers, pagesToConvertAsImages, diff --git a/node-zerox/src/utils.ts b/node-zerox/src/utils.ts index 2f92490..0b04d1f 100644 --- a/node-zerox/src/utils.ts +++ b/node-zerox/src/utils.ts @@ -205,6 +205,7 @@ const determineOptimalRotation = async ({ // Convert each page to a png, correct orientation, and save that image to tmp export const convertPdfToImages = async ({ correctOrientation, + density, localPath, maxTesseractWorkers, pagesToConvertAsImages, @@ -213,6 +214,7 @@ export const convertPdfToImages = async ({ trimEdges, }: { correctOrientation: boolean; + density: number; localPath: string; maxTesseractWorkers: number; pagesToConvertAsImages: number | number[]; @@ -221,7 +223,7 @@ export const convertPdfToImages = async ({ trimEdges: boolean; }) => { const options = { - density: 300, + density, format: "png", height: 2048, preserveAspectRatio: true, From e04920c5c3c74d7bd615210c5323893c229ea9a8 Mon Sep 17 00:00:00 2001 From: Alexander Densley Date: Wed, 4 Dec 2024 18:33:20 -0700 Subject: [PATCH 2/8] revert styling --- node-zerox/src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-zerox/src/index.ts b/node-zerox/src/index.ts index c3f79e7..596d72c 100644 --- a/node-zerox/src/index.ts +++ b/node-zerox/src/index.ts @@ -73,8 +73,8 @@ export const zerox = async ({ tempDir || os.tmpdir(), `zerox-temp-${rand}` ); - const sourceDirectory = path.join(tempDirectory, "source"); - const processedDirectory = path.join(tempDirectory, "processed"); + const sourceDirectory = path.join(tempDirectory, 'source'); + const processedDirectory = path.join(tempDirectory, 'processed'); await fs.ensureDir(sourceDirectory); await fs.ensureDir(processedDirectory); From 1c5652070fe62c9e1f3cbca67cb2e3f6b0f6c1dc Mon Sep 17 00:00:00 2001 From: Alexander Densley Date: Wed, 4 Dec 2024 18:33:59 -0700 Subject: [PATCH 3/8] revert stylings part 2 --- node-zerox/src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-zerox/src/index.ts b/node-zerox/src/index.ts index 596d72c..fb86e75 100644 --- a/node-zerox/src/index.ts +++ b/node-zerox/src/index.ts @@ -73,8 +73,8 @@ export const zerox = async ({ tempDir || os.tmpdir(), `zerox-temp-${rand}` ); - const sourceDirectory = path.join(tempDirectory, 'source'); - const processedDirectory = path.join(tempDirectory, 'processed'); + const sourceDirectory = path.join(tempDirectory, 'source') + const processedDirectory = path.join(tempDirectory, 'processed') await fs.ensureDir(sourceDirectory); await fs.ensureDir(processedDirectory); From d7a3776da68fb63844c466168e69d292d0918113 Mon Sep 17 00:00:00 2001 From: Alexander Densley Date: Wed, 4 Dec 2024 18:37:04 -0700 Subject: [PATCH 4/8] add height and update args --- node-zerox/src/index.ts | 4 +++- node-zerox/src/types.ts | 2 ++ node-zerox/src/utils.ts | 4 +++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/node-zerox/src/index.ts b/node-zerox/src/index.ts index fb86e75..958f9bf 100644 --- a/node-zerox/src/index.ts +++ b/node-zerox/src/index.ts @@ -21,10 +21,11 @@ import Tesseract from "tesseract.js"; export const zerox = async ({ cleanup = true, - concurrency = 10, + concurrency = 10, correctOrientation = true, density = 300, filePath, + height = 2048, llmParams = {}, maintainFormat = false, maxTesseractWorkers = -1, @@ -107,6 +108,7 @@ export const zerox = async ({ await convertPdfToImages({ correctOrientation, density, + height, localPath: pdfPath, maxTesseractWorkers, pagesToConvertAsImages, diff --git a/node-zerox/src/types.ts b/node-zerox/src/types.ts index ccd491d..89580d8 100644 --- a/node-zerox/src/types.ts +++ b/node-zerox/src/types.ts @@ -2,7 +2,9 @@ export interface ZeroxArgs { cleanup?: boolean; concurrency?: number; correctOrientation?: boolean; + density?: number; filePath: string; + height?: number; llmParams?: LLMParams; maintainFormat?: boolean; maxTesseractWorkers?: number; diff --git a/node-zerox/src/utils.ts b/node-zerox/src/utils.ts index 0b04d1f..321bad9 100644 --- a/node-zerox/src/utils.ts +++ b/node-zerox/src/utils.ts @@ -206,6 +206,7 @@ const determineOptimalRotation = async ({ export const convertPdfToImages = async ({ correctOrientation, density, + height, localPath, maxTesseractWorkers, pagesToConvertAsImages, @@ -215,6 +216,7 @@ export const convertPdfToImages = async ({ }: { correctOrientation: boolean; density: number; + height: number; localPath: string; maxTesseractWorkers: number; pagesToConvertAsImages: number | number[]; @@ -225,7 +227,7 @@ export const convertPdfToImages = async ({ const options = { density, format: "png", - height: 2048, + height, preserveAspectRatio: true, saveFilename: path.basename(localPath, path.extname(localPath)), savePath: tempDir, From b6ac15185b145e8b1c2045c18a779e322f286f52 Mon Sep 17 00:00:00 2001 From: Alexander Densley Date: Wed, 4 Dec 2024 18:37:38 -0700 Subject: [PATCH 5/8] remove space --- node-zerox/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-zerox/src/index.ts b/node-zerox/src/index.ts index 958f9bf..fbeb4ab 100644 --- a/node-zerox/src/index.ts +++ b/node-zerox/src/index.ts @@ -21,7 +21,7 @@ import Tesseract from "tesseract.js"; export const zerox = async ({ cleanup = true, - concurrency = 10, + concurrency = 10, correctOrientation = true, density = 300, filePath, From 71cf1e1ef0d37ead87e850f67cc2622750b0dc5f Mon Sep 17 00:00:00 2001 From: Alexander Densley Date: Wed, 4 Dec 2024 19:12:22 -0700 Subject: [PATCH 6/8] clarify props --- node-zerox/src/index.ts | 8 ++++---- node-zerox/src/types.ts | 4 ++-- node-zerox/src/utils.ts | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/node-zerox/src/index.ts b/node-zerox/src/index.ts index fbeb4ab..eb9726b 100644 --- a/node-zerox/src/index.ts +++ b/node-zerox/src/index.ts @@ -23,9 +23,9 @@ export const zerox = async ({ cleanup = true, concurrency = 10, correctOrientation = true, - density = 300, + imageDensity = 300, filePath, - height = 2048, + imageHeight = 2048, llmParams = {}, maintainFormat = false, maxTesseractWorkers = -1, @@ -107,8 +107,8 @@ export const zerox = async ({ // Convert the file to a series of images await convertPdfToImages({ correctOrientation, - density, - height, + imageDensity, + imageHeight, localPath: pdfPath, maxTesseractWorkers, pagesToConvertAsImages, diff --git a/node-zerox/src/types.ts b/node-zerox/src/types.ts index 89580d8..a1c9a29 100644 --- a/node-zerox/src/types.ts +++ b/node-zerox/src/types.ts @@ -2,9 +2,9 @@ export interface ZeroxArgs { cleanup?: boolean; concurrency?: number; correctOrientation?: boolean; - density?: number; + imageDensity?: number; filePath: string; - height?: number; + imageHeight?: number; llmParams?: LLMParams; maintainFormat?: boolean; maxTesseractWorkers?: number; diff --git a/node-zerox/src/utils.ts b/node-zerox/src/utils.ts index 321bad9..7ac774d 100644 --- a/node-zerox/src/utils.ts +++ b/node-zerox/src/utils.ts @@ -205,8 +205,8 @@ const determineOptimalRotation = async ({ // Convert each page to a png, correct orientation, and save that image to tmp export const convertPdfToImages = async ({ correctOrientation, - density, - height, + imageDensity, + imageHeight, localPath, maxTesseractWorkers, pagesToConvertAsImages, @@ -215,8 +215,8 @@ export const convertPdfToImages = async ({ trimEdges, }: { correctOrientation: boolean; - density: number; - height: number; + imageDensity: number; + imageHeight: number; localPath: string; maxTesseractWorkers: number; pagesToConvertAsImages: number | number[]; @@ -225,9 +225,9 @@ export const convertPdfToImages = async ({ trimEdges: boolean; }) => { const options = { - density, + density: imageDensity, format: "png", - height, + height: imageHeight, preserveAspectRatio: true, saveFilename: path.basename(localPath, path.extname(localPath)), savePath: tempDir, From 89e4cc3d4b5a2cb1ceccefefeda58129d83134d2 Mon Sep 17 00:00:00 2001 From: Alexander Densley Date: Wed, 4 Dec 2024 19:18:59 -0700 Subject: [PATCH 7/8] sort --- node-zerox/src/index.ts | 2 +- node-zerox/src/types.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/node-zerox/src/index.ts b/node-zerox/src/index.ts index eb9726b..a7d59b0 100644 --- a/node-zerox/src/index.ts +++ b/node-zerox/src/index.ts @@ -23,8 +23,8 @@ export const zerox = async ({ cleanup = true, concurrency = 10, correctOrientation = true, - imageDensity = 300, filePath, + imageDensity = 300, imageHeight = 2048, llmParams = {}, maintainFormat = false, diff --git a/node-zerox/src/types.ts b/node-zerox/src/types.ts index a1c9a29..2faf3af 100644 --- a/node-zerox/src/types.ts +++ b/node-zerox/src/types.ts @@ -2,8 +2,8 @@ export interface ZeroxArgs { cleanup?: boolean; concurrency?: number; correctOrientation?: boolean; - imageDensity?: number; filePath: string; + imageDensity?: number; imageHeight?: number; llmParams?: LLMParams; maintainFormat?: boolean; From a58b7f7ef59663e4144328527cfe468a5ef0b959 Mon Sep 17 00:00:00 2001 From: Alexander Densley Date: Wed, 4 Dec 2024 20:32:37 -0700 Subject: [PATCH 8/8] add image_density and image_height to python package --- py_zerox/pyzerox/core/zerox.py | 5 ++++- py_zerox/pyzerox/processor/pdf.py | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index c17bba8..6b7c8dc 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -7,6 +7,7 @@ import aiofiles import aiofiles.os as async_os import asyncio +from ..constants import PDFConversionDefaultOptions # Package Imports from ..processor import ( @@ -26,6 +27,8 @@ async def zerox( cleanup: bool = True, concurrency: int = 10, file_path: Optional[str] = "", + image_density: int = PDFConversionDefaultOptions.DPI, + image_height: tuple[Optional[int], int] = PDFConversionDefaultOptions.SIZE, maintain_format: bool = False, model: str = "gpt-4o-mini", output_dir: Optional[str] = None, @@ -130,7 +133,7 @@ async def zerox( **subset_pdf_create_kwargs) # Convert the file to a series of images, below function returns a list of image paths in page order - images = await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory) + images = await convert_pdf_to_images(image_density=image_density, image_height=image_height, local_path=local_path, temp_dir=temp_directory) if maintain_format: for image in images: diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index c3b3fa6..af36629 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -11,14 +11,14 @@ from ..models import litellmmodel -async def convert_pdf_to_images(local_path: str, temp_dir: str) -> List[str]: +async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str) -> List[str]: """Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order.""" options = { "pdf_path": local_path, "output_folder": temp_dir, - "dpi": PDFConversionDefaultOptions.DPI, + "dpi": image_density, "fmt": PDFConversionDefaultOptions.FORMAT, - "size": PDFConversionDefaultOptions.SIZE, + "size": image_height, "thread_count": PDFConversionDefaultOptions.THREAD_COUNT, "use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO, "paths_only": True,