diff --git a/README.md b/README.md index a235c56..3c18abf 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,7 @@ Refer to the [LiteLLM Documentation](https://docs.litellm.ai/docs/providers) for ```python from pyzerox import zerox + import os import json import asyncio @@ -195,7 +196,7 @@ file_path = 'path/to/vertex_ai_service_account.json' # Load the JSON file with open(file_path, 'r') as file: - vertex_credentials = json.load(file) + vertex_credentials = json.load(file) # Convert to JSON string vertex_credentials_json = json.dumps(vertex_credentials) @@ -209,15 +210,23 @@ kwargs = {"vertex_credentials": vertex_credentials} # Define main async entrypoint async def main(): - file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported + file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported + + ## process only some pages or all + select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed) - ## process only some pages or all - select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed) + output_file_path = "output.md" ## filepath to save the consolidated output file (markdown by default). Pass None to skip saving any output file + page_separator = "\n\n" ## The separator to use between pages when writing the output to `output_file_path` - output_dir = "./output_test" ## directory to save the consolidated markdown file - result = await zerox(file_path=file_path, model=model, output_dir=output_dir, - custom_system_prompt=custom_system_prompt,select_pages=select_pages, **kwargs) - return result + ## function to apply on model's text output (on each page). Function should take input as string and return output also as string. 
+ ## By default uses Zerox's format_markdown function to format text as markdown + # post_process_function = lambda x: x.strip() ## To skip any post processing pass None, which would just keep the raw text output from the model. + + result = await zerox(file_path = file_path, model = model, output_file_path = output_file_path, + custom_system_prompt = custom_system_prompt, select_pages = select_pages, + # post_process_function = post_process_function, + **kwargs) + return result # run the main function: @@ -236,10 +245,12 @@ async def zerox( file_path: Optional[str] = "", maintain_format: bool = False, model: str = "gpt-4o-mini", - output_dir: Optional[str] = None, + output_file_path: Optional[str] = None, + page_separator: str = "\n\n", temp_dir: Optional[str] = None, custom_system_prompt: Optional[str] = None, select_pages: Optional[Union[int, Iterable[int]]] = None, + post_process_function: Optional[Callable[[str], str]] = format_markdown, **kwargs ) -> ZeroxOutput: ... @@ -258,22 +269,25 @@ Parameters - **model** (str, optional): The model to use for generating completions. Defaults to "gpt-4o-mini". Refer to LiteLLM Providers for the correct model name, as it may differ depending on the provider. -- **output_dir** (Optional[str], optional): - The directory to save the markdown output. Defaults to None. +- **output_file_path** (Optional[str], optional): + The path to save the markdown output (e.g., "output.md"). Any required directories will be created. Defaults to None. +- **page_separator** (str, optional): + The separator to use between pages when writing the output to `output_file_path`. Defaults to "\n\n". - **temp_dir** (str, optional): The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted before zerox uses it. 
- **custom_system_prompt** (str, optional): The system prompt to use for the model, this overrides the default system prompt of zerox.Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning. Defaults to None. - **select_pages** (Optional[Union[int, Iterable[int]]], optional): Pages to process, can be a single page number or an iterable of page numbers, Defaults to None +- **post_process_function** (Optional[Callable[[str], str]], optional): + A function to post-process the text output from the model for each page. It should take a string as input and return a string as output. Defaults to Zerox's `format_markdown` function, which formats the output in markdown. Pass None to skip post-processing. - **kwargs** (dict, optional): Additional keyword arguments to pass to the litellm.completion method. Refer to the LiteLLM Documentation and Completion Input for details. Returns - - ZeroxOutput: - Contains the markdown content generated by the model and also some metadata (refer below). + Contains the output content (markdown as default) generated by the model and also some metadata (refer below). 
### Example Output (Output from "azure/gpt-4o-mini"): diff --git a/py_zerox/pyzerox/core/types.py b/py_zerox/pyzerox/core/types.py index ffe251d..62cc1a3 100644 --- a/py_zerox/pyzerox/core/types.py +++ b/py_zerox/pyzerox/core/types.py @@ -1,4 +1,5 @@ -from typing import List, Optional, Dict, Any, Union, Iterable +from typing import List, Optional, Dict, Any, Union, Iterable, Callable +from ..processor import format_markdown from dataclasses import dataclass, field @@ -12,11 +13,13 @@ class ZeroxArgs: cleanup: bool = True concurrency: int = 10 maintain_format: bool = False - model: str = "gpt-4o-mini", - output_dir: Optional[str] = None + model: str = "gpt-4o-mini" + output_file_path: Optional[str] = None + page_separator: Optional[str] = None temp_dir: Optional[str] = None custom_system_prompt: Optional[str] = None select_pages: Optional[Union[int, Iterable[int]]] = None + post_process_function: Optional[Callable[[str], str]] = format_markdown kwargs: Dict[str, Any] = field(default_factory=dict) @dataclass diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index 6ea7423..d6fef74 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -2,7 +2,7 @@ import aioshutil as async_shutil import tempfile import warnings -from typing import List, Optional, Union, Iterable +from typing import List, Optional, Union, Iterable, Callable from datetime import datetime import aiofiles import aiofiles.os as async_os @@ -15,6 +15,7 @@ process_page, process_pages_in_batches, create_selected_pages_pdf, + format_markdown, ) from ..errors import FileUnavailable from ..constants.messages import Messages @@ -28,14 +29,16 @@ async def zerox( file_path: Optional[str] = "", maintain_format: bool = False, model: str = "gpt-4o-mini", - output_dir: Optional[str] = None, + output_file_path: Optional[str] = None, + page_separator: Optional[str] = None, temp_dir: Optional[str] = None, custom_system_prompt: Optional[str] = None, select_pages: 
Optional[Union[int, Iterable[int]]] = None, + post_process_function: Optional[Callable[[str], str]] = format_markdown, **kwargs ) -> ZeroxOutput: """ - API to perform OCR to markdown using Vision models. + API to perform OCR to markdown (default) using Vision models. Please setup the environment variables for the model and model provider before using this API. Refer: https://docs.litellm.ai/docs/providers :param cleanup: Whether to cleanup the temporary files after processing, defaults to True @@ -48,24 +51,28 @@ async def zerox( :type maintain_format: bool, optional :param model: The model to use for generating completions, defaults to "gpt-4o-mini". Note - Refer: https://docs.litellm.ai/docs/providers to pass correct model name as according to provider it might be different from actual name. :type model: str, optional - :param output_dir: The directory to save the markdown output, defaults to None - :type output_dir: str, optional + :param output_file_path: The path to save the output file (Example "output.md"). Any required directories will be created, defaults to None + :type output_file_path: str, optional :param temp_dir: The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted for zerox uses it. :type temp_dir: str, optional + :param page_separator: The separator to use between pages (at the end of each page) when writing the output to "output_file_path", can include a {page_no} placeholder to insert the page number. Uses "\\n\\n<=== Page {page_no} ===>\\n\\n" by default. Defaults to None + :type page_separator: str, None :param custom_system_prompt: The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. 
When set, it will raise a friendly warning, defaults to None :type custom_system_prompt: str, optional :param select_pages: Pages to process, can be a single page number or an iterable of page numbers, defaults to None :type select_pages: int or Iterable[int], optional + :param post_process_function: A function to post-process the text output from the model for each page. It should take string as an input and return string as an output, defaults to "format_markdown" function (zerox's default for markdown formatting). Pass None to skip any post processing on the text output of the model. + :type post_process_function: Callable[[str], str], optional :param kwargs: Additional keyword arguments to pass to the model.completion -> litellm.completion method. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input - :return: The markdown content generated by the model. + :return: The content generated by the model after Zerox's postprocessing (if provided). 
""" input_token_count = 0 output_token_count = 0 prior_page = "" - aggregated_markdown: List[str] = [] + aggregated_output: List[str] = [] start_time = datetime.now() # File Path Validators @@ -84,6 +91,7 @@ async def zerox( warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING) # If select_pages is a single integer, convert it to a list for consistency + if isinstance(select_pages, int): select_pages = [select_pages] @@ -91,7 +99,9 @@ async def zerox( if select_pages is not None: select_pages = sorted(select_pages) - # Ensure the output directory exists + + # Ensure the directory for output_file_path exists + output_dir = os.path.dirname(output_file_path) if output_file_path else None if output_dir: await async_os.makedirs(output_dir, exist_ok=True) @@ -139,10 +149,11 @@ async def zerox( input_token_count, output_token_count, prior_page, + post_process_function, ) if result: - aggregated_markdown.append(result) + aggregated_output.append(result) else: results = await process_pages_in_batches( images, @@ -152,19 +163,30 @@ async def zerox( input_token_count, output_token_count, prior_page, + post_process_function, ) - aggregated_markdown = [result[0] for result in results if isinstance(result[0], str)] + aggregated_output = [result[0] for result in results if isinstance(result[0], str)] ## add token usage input_token_count += sum([result[1] for result in results]) output_token_count += sum([result[2] for result in results]) - # Write the aggregated markdown to a file - if output_dir: - result_file_path = os.path.join(output_dir, f"{file_name}.md") - async with aiofiles.open(result_file_path, "w") as f: - await f.write("\n\n".join(aggregated_markdown)) + # Write the aggregated output to a file + if output_file_path: + if not page_separator and not isinstance(page_separator, str): + page_separator = "\n\n<=== Page {page_no} ===>\n\n" + + async with aiofiles.open(output_file_path, "w") as f: + for i, page_content in enumerate(aggregated_output): + await 
f.write(page_content) + + # Replace {page_no} with the actual page number in page_separator + if "{page_no}" in page_separator: + page_no_text = page_separator.format(page_no=(select_pages[i] if select_pages else i + 1)) + await f.write(f"{page_no_text}") + else: + await f.write(page_separator) # Cleanup the downloaded PDF file if cleanup and os.path.exists(temp_directory): @@ -176,16 +198,16 @@ async def zerox( # Adjusting the formatted_pages logic to account for select_pages to output the correct page numbers if select_pages is not None: - # Map aggregated markdown to the selected pages + # Map aggregated_output to the selected pages formatted_pages = [ Page(content=content, page=select_pages[i], content_length=len(content)) - for i, content in enumerate(aggregated_markdown) + for i, content in enumerate(aggregated_output) ] else: # Default behavior when no select_pages is provided formatted_pages = [ Page(content=content, page=i + 1, content_length=len(content)) - for i, content in enumerate(aggregated_markdown) + for i, content in enumerate(aggregated_output) ] return ZeroxOutput( diff --git a/py_zerox/pyzerox/models/modellitellm.py b/py_zerox/pyzerox/models/modellitellm.py index bda4828..b544bb7 100644 --- a/py_zerox/pyzerox/models/modellitellm.py +++ b/py_zerox/pyzerox/models/modellitellm.py @@ -14,7 +14,6 @@ DEFAULT_SYSTEM_PROMPT = Prompts.DEFAULT_SYSTEM_PROMPT - class litellmmodel(BaseModel): ## setting the default system prompt _system_prompt = DEFAULT_SYSTEM_PROMPT @@ -30,12 +29,24 @@ def __init__( :type model: str, optional :param kwargs: Additional keyword arguments to pass to self.completion -> litellm.completion. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input + + Note: kwargs params starting with "__zxmetaconfig" are treated as meta config params and are not passed to litellm backend. 
""" super().__init__(model=model, **kwargs) + ## create another dict having the keys starting with "__zxmetaconfig" + self.meta_config = {k: v for k, v in self.kwargs.items() if k.startswith("__zxmetaconfig")} + + ## remove the meta config keys from kwargs + self.kwargs = {k: v for k, v in self.kwargs.items() if not k.startswith("__zxmetaconfig")} + ## calling custom methods to validate the environment and model self.validate_environment() - self.validate_model() + + ## way to override vision validation + if self.meta_config.get("__zxmetaconfig_validate_vision_capability", True): + self.validate_model() + self.validate_access() @property diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index c3b3fa6..9f681aa 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -1,7 +1,7 @@ import logging import os import asyncio -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Callable from pdf2image import convert_from_path # Package Imports @@ -40,6 +40,7 @@ async def process_page( input_token_count: int = 0, output_token_count: int = 0, prior_page: str = "", + post_process_function: Optional[Callable[[str], str]] = format_markdown, semaphore: Optional[asyncio.Semaphore] = None, ) -> Tuple[str, int, int, str]: """Process a single page of a PDF""" @@ -54,6 +55,7 @@ async def process_page( input_token_count, output_token_count, prior_page, + post_process_function, ) image_path = os.path.join(temp_directory, image) @@ -66,12 +68,18 @@ async def process_page( prior_page=prior_page, ) - formatted_markdown = format_markdown(completion.content) + ## post process the completion + if post_process_function: + output_text = post_process_function(completion.content) + else: + ## skip post processing + output_text = completion.content + input_token_count += completion.input_tokens output_token_count += completion.output_tokens - prior_page = formatted_markdown + prior_page = output_text 
- return formatted_markdown, input_token_count, output_token_count, prior_page + return output_text, input_token_count, output_token_count, prior_page except Exception as error: logging.error(f"{Messages.FAILED_TO_PROCESS_IMAGE} Error:{error}") @@ -86,6 +94,7 @@ async def process_pages_in_batches( input_token_count: int = 0, output_token_count: int = 0, prior_page: str = "", + post_process_function: Optional[Callable[[str], str]] = format_markdown, ): # Create a semaphore to limit the number of concurrent tasks semaphore = asyncio.Semaphore(concurrency) @@ -99,6 +108,7 @@ async def process_pages_in_batches( input_token_count, output_token_count, prior_page, + post_process_function, semaphore, ) for image in images