Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat. Postprocessing control - custom page separator, postprocess function etc #40

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
40 changes: 27 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ Refer to the [LiteLLM Documentation](https://docs.litellm.ai/docs/providers) for

```python
from pyzerox import zerox

import os
import json
import asyncio
Expand Down Expand Up @@ -195,7 +196,7 @@ file_path = 'path/to/vertex_ai_service_account.json'

# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
vertex_credentials = json.load(file)

# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
Expand All @@ -209,15 +210,23 @@ kwargs = {"vertex_credentials": vertex_credentials}

# Define main async entrypoint
async def main():
file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported
file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported

## process only some pages or all
select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed)

## process only some pages or all
select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed)
output_file_path = "output.md" ## filepath to save the consolidated output file (markdown by default). Pass None to skip saving any output file
page_separator = "\n\n" ## The separator to use between pages when writing the output to `output_file_path`

output_dir = "./output_test" ## directory to save the consolidated markdown file
result = await zerox(file_path=file_path, model=model, output_dir=output_dir,
custom_system_prompt=custom_system_prompt,select_pages=select_pages, **kwargs)
return result
## function to apply on model's text output (on each page). Function should take input as string and return output also as string.
## By default uses Zerox's format_markdown function to format text as markdown
# post_process_function = lambda x: x.strip() ## To skip any post processing pass None, which would just keep the raw text output from the model.

result = await zerox(file_path = file_path, model = model, output_file_path = output_file_path,
custom_system_prompt = custom_system_prompt, select_pages = select_pages,
# post_process_function = post_process_function,
**kwargs)
return result


# run the main function:
Expand All @@ -236,10 +245,12 @@ async def zerox(
file_path: Optional[str] = "",
maintain_format: bool = False,
model: str = "gpt-4o-mini",
output_dir: Optional[str] = None,
output_file_path: Optional[str] = None,
page_separator: str = "\n\n",
temp_dir: Optional[str] = None,
custom_system_prompt: Optional[str] = None,
select_pages: Optional[Union[int, Iterable[int]]] = None,
post_process_function: Optional[Callable[[str], str]] = format_markdown,
**kwargs
) -> ZeroxOutput:
...
Expand All @@ -258,22 +269,25 @@ Parameters
- **model** (str, optional):
The model to use for generating completions. Defaults to "gpt-4o-mini".
Refer to LiteLLM Providers for the correct model name, as it may differ depending on the provider.
- **output_dir** (Optional[str], optional):
The directory to save the markdown output. Defaults to None.
- **output_file_path** (Optional[str], optional):
The path to save the markdown output (e.g., "output.md"). Any required directories will be created. Defaults to None.
- **page_separator** (str, optional):
The separator to use between pages when writing the output to `output_file_path`. Defaults to "\n\n".
- **temp_dir** (str, optional):
The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted before zerox uses it.
- **custom_system_prompt** (str, optional):
The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning. Defaults to None.
- **select_pages** (Optional[Union[int, Iterable[int]]], optional):
Pages to process; can be a single page number or an iterable of page numbers. Defaults to None.
- **post_process_function** (Optional[Callable[[str], str]], optional):
A function to post-process the text output from the model for each page. It should take a string as input and return a string as output. Defaults to Zerox's `format_markdown` function, which formats the output in markdown. Pass None to skip post-processing.
- **kwargs** (dict, optional):
Additional keyword arguments to pass to the litellm.completion method.
Refer to the LiteLLM Documentation and Completion Input for details.

Returns

- ZeroxOutput:
Contains the markdown content generated by the model and also some metadata (refer below).
Contains the output content (markdown as default) generated by the model and also some metadata (refer below).

### Example Output (Output from "azure/gpt-4o-mini"):

Expand Down
9 changes: 6 additions & 3 deletions py_zerox/pyzerox/core/types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Optional, Dict, Any, Union, Iterable
from typing import List, Optional, Dict, Any, Union, Iterable, Callable
from ..processor import format_markdown
from dataclasses import dataclass, field


Expand All @@ -12,11 +13,13 @@ class ZeroxArgs:
cleanup: bool = True
concurrency: int = 10
maintain_format: bool = False
model: str = "gpt-4o-mini",
output_dir: Optional[str] = None
model: str = "gpt-4o-mini"
output_file_path: Optional[str] = None
page_separator: Optional[str] = None
temp_dir: Optional[str] = None
custom_system_prompt: Optional[str] = None
select_pages: Optional[Union[int, Iterable[int]]] = None
post_process_function: Optional[Callable[[str], str]] = format_markdown
kwargs: Dict[str, Any] = field(default_factory=dict)

@dataclass
Expand Down
58 changes: 40 additions & 18 deletions py_zerox/pyzerox/core/zerox.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import aioshutil as async_shutil
import tempfile
import warnings
from typing import List, Optional, Union, Iterable
from typing import List, Optional, Union, Iterable, Callable
from datetime import datetime
import aiofiles
import aiofiles.os as async_os
Expand All @@ -15,6 +15,7 @@
process_page,
process_pages_in_batches,
create_selected_pages_pdf,
format_markdown,
)
from ..errors import FileUnavailable
from ..constants.messages import Messages
Expand All @@ -28,14 +29,16 @@ async def zerox(
file_path: Optional[str] = "",
maintain_format: bool = False,
model: str = "gpt-4o-mini",
output_dir: Optional[str] = None,
output_file_path: Optional[str] = None,
page_separator: Optional[str] = None,
temp_dir: Optional[str] = None,
custom_system_prompt: Optional[str] = None,
select_pages: Optional[Union[int, Iterable[int]]] = None,
post_process_function: Optional[Callable[[str], str]] = format_markdown,
**kwargs
) -> ZeroxOutput:
"""
API to perform OCR to markdown using Vision models.
API to perform OCR to markdown (default) using Vision models.
Please setup the environment variables for the model and model provider before using this API. Refer: https://docs.litellm.ai/docs/providers

:param cleanup: Whether to cleanup the temporary files after processing, defaults to True
Expand All @@ -48,24 +51,28 @@ async def zerox(
:type maintain_format: bool, optional
:param model: The model to use for generating completions, defaults to "gpt-4o-mini". Note - Refer: https://docs.litellm.ai/docs/providers to pass correct model name as according to provider it might be different from actual name.
:type model: str, optional
:param output_dir: The directory to save the markdown output, defaults to None
:type output_dir: str, optional
:param output_file_path: The path to save the output file (Example "output.md"). Any required directories will be created, defaults to None
:type output_file_path: str, optional
:param temp_dir: The directory to store temporary files, defaults to some named folder in system's temp directory. If it already exists, the contents will be deleted before zerox uses it.
:type temp_dir: str, optional
:param page_separator: The separator to use between pages (at the end of each page) when writing the output to "output_file_path", can include a {page_no} placeholder to insert the page number. Uses "\\n\\n<=== Page {page_no} ===>\\n\\n" by default. defaults to None
:type page_separator: str, None
:param custom_system_prompt: The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning, defaults to None
:type custom_system_prompt: str, optional
:param select_pages: Pages to process, can be a single page number or an iterable of page numbers, defaults to None
:type select_pages: int or Iterable[int], optional
:param post_process_function: A function to post-process the text output from the model for each page. It should take a string as input and return a string as output, defaults to the "format_markdown" function (zerox's default for markdown formatting). Pass None to skip any post processing on the text output of the model.
:type post_process_function: Callable[[str], str], optional

:param kwargs: Additional keyword arguments to pass to the model.completion -> litellm.completion method. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input
:return: The markdown content generated by the model.
:return: The content generated by the model after Zerox's postprocessing (if provided).
"""


input_token_count = 0
output_token_count = 0
prior_page = ""
aggregated_markdown: List[str] = []
aggregated_output: List[str] = []
start_time = datetime.now()

# File Path Validators
Expand All @@ -84,14 +91,17 @@ async def zerox(
warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING)

# If select_pages is a single integer, convert it to a list for consistency

if isinstance(select_pages, int):
select_pages = [select_pages]

# Sort the pages to maintain consistency
if select_pages is not None:
select_pages = sorted(select_pages)

# Ensure the output directory exists

# Ensure the directory for output_file_path exists
output_dir = os.path.dirname(output_file_path) if output_file_path else None
if output_dir:
await async_os.makedirs(output_dir, exist_ok=True)

Expand Down Expand Up @@ -139,10 +149,11 @@ async def zerox(
input_token_count,
output_token_count,
prior_page,
post_process_function,
)

if result:
aggregated_markdown.append(result)
aggregated_output.append(result)
else:
results = await process_pages_in_batches(
images,
Expand All @@ -152,19 +163,30 @@ async def zerox(
input_token_count,
output_token_count,
prior_page,
post_process_function,
)

aggregated_markdown = [result[0] for result in results if isinstance(result[0], str)]
aggregated_output = [result[0] for result in results if isinstance(result[0], str)]

## add token usage
input_token_count += sum([result[1] for result in results])
output_token_count += sum([result[2] for result in results])

# Write the aggregated markdown to a file
if output_dir:
result_file_path = os.path.join(output_dir, f"{file_name}.md")
async with aiofiles.open(result_file_path, "w") as f:
await f.write("\n\n".join(aggregated_markdown))
# Write the aggregated output to a file
if output_file_path:
if not page_separator and not isinstance(page_separator, str):
page_separator = "\n\n<=== Page {page_no} ===>\n\n"

async with aiofiles.open(output_file_path, "w") as f:
for i, page_content in enumerate(aggregated_output):
await f.write(page_content)

# Replace {page_no} with the actual page number in page_separator
if "{page_no}" in page_separator:
page_no_text = page_separator.format(page_no=(select_pages[i] if select_pages else i + 1))
await f.write(f"{page_no_text}")
else:
await f.write(page_separator)

# Cleanup the downloaded PDF file
if cleanup and os.path.exists(temp_directory):
Expand All @@ -176,16 +198,16 @@ async def zerox(

# Adjusting the formatted_pages logic to account for select_pages to output the correct page numbers
if select_pages is not None:
# Map aggregated markdown to the selected pages
# Map aggregated_output to the selected pages
formatted_pages = [
Page(content=content, page=select_pages[i], content_length=len(content))
for i, content in enumerate(aggregated_markdown)
for i, content in enumerate(aggregated_output)
]
else:
# Default behavior when no select_pages is provided
formatted_pages = [
Page(content=content, page=i + 1, content_length=len(content))
for i, content in enumerate(aggregated_markdown)
for i, content in enumerate(aggregated_output)
]

return ZeroxOutput(
Expand Down
15 changes: 13 additions & 2 deletions py_zerox/pyzerox/models/modellitellm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

DEFAULT_SYSTEM_PROMPT = Prompts.DEFAULT_SYSTEM_PROMPT


class litellmmodel(BaseModel):
## setting the default system prompt
_system_prompt = DEFAULT_SYSTEM_PROMPT
Expand All @@ -30,12 +29,24 @@ def __init__(
:type model: str, optional

:param kwargs: Additional keyword arguments to pass to self.completion -> litellm.completion. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input

Note: kwargs params starting with "__zxmetaconfig" are treated as meta config params and are not passed to litellm backend.
"""
super().__init__(model=model, **kwargs)

## create another dict having the keys starting with "__zxmetaconfig"
self.meta_config = {k: v for k, v in self.kwargs.items() if k.startswith("__zxmetaconfig")}

## remove the meta config keys from kwargs
self.kwargs = {k: v for k, v in self.kwargs.items() if not k.startswith("__zxmetaconfig")}

## calling custom methods to validate the environment and model
self.validate_environment()
self.validate_model()

## way to override vision validation
if self.meta_config.get("__zxmetaconfig_validate_vision_capability", True):
self.validate_model()

self.validate_access()

@property
Expand Down
18 changes: 14 additions & 4 deletions py_zerox/pyzerox/processor/pdf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import os
import asyncio
from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, Callable
from pdf2image import convert_from_path

# Package Imports
Expand Down Expand Up @@ -40,6 +40,7 @@ async def process_page(
input_token_count: int = 0,
output_token_count: int = 0,
prior_page: str = "",
post_process_function: Optional[Callable[[str], str]] = format_markdown,
semaphore: Optional[asyncio.Semaphore] = None,
) -> Tuple[str, int, int, str]:
"""Process a single page of a PDF"""
Expand All @@ -54,6 +55,7 @@ async def process_page(
input_token_count,
output_token_count,
prior_page,
post_process_function,
)

image_path = os.path.join(temp_directory, image)
Expand All @@ -66,12 +68,18 @@ async def process_page(
prior_page=prior_page,
)

formatted_markdown = format_markdown(completion.content)
## post process the completion
if post_process_function:
output_text = post_process_function(completion.content)
else:
## skip post processing
output_text = completion.content

input_token_count += completion.input_tokens
output_token_count += completion.output_tokens
prior_page = formatted_markdown
prior_page = output_text

return formatted_markdown, input_token_count, output_token_count, prior_page
return output_text, input_token_count, output_token_count, prior_page

except Exception as error:
logging.error(f"{Messages.FAILED_TO_PROCESS_IMAGE} Error:{error}")
Expand All @@ -86,6 +94,7 @@ async def process_pages_in_batches(
input_token_count: int = 0,
output_token_count: int = 0,
prior_page: str = "",
post_process_function: Optional[Callable[[str], str]] = format_markdown,
):
# Create a semaphore to limit the number of concurrent tasks
semaphore = asyncio.Semaphore(concurrency)
Expand All @@ -99,6 +108,7 @@ async def process_pages_in_batches(
input_token_count,
output_token_count,
prior_page,
post_process_function,
semaphore,
)
for image in images
Expand Down