Skip to content

Commit

Permalink
add pypdf parser to xpacks (#7808)
Browse files Browse the repository at this point in the history
GitOrigin-RevId: 4e2b4780259639833d9702f006a966dc326eb527
  • Loading branch information
berkecanrizai authored and Manul from Pathway committed Dec 9, 2024
1 parent 450e983 commit 74bd3a2
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 1 deletion.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ xpack-llm-docs = [
"openparse == 0.5.6",
"unstructured >= 0.16",
"pdf2image",
"pypdf",
]
xpack-sharepoint = [
"Office365-REST-Python-Client >= 2.5.3",
Expand Down
78 changes: 78 additions & 0 deletions python/pathway/xpacks/llm/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import io
import logging
import os
import re
import subprocess
import tempfile
import warnings
Expand Down Expand Up @@ -742,6 +743,83 @@ async def __wrapped__(self, contents: bytes) -> list[tuple[str, dict]]:
return docs


class PypdfParser(pw.UDF):
    """
    Parse PDF document using ``pypdf`` library.
    Optionally, applies additional text cleanups for readability.

    Every page of the document becomes one ``(text, metadata)`` tuple in the
    result; the metadata dict carries the ``page_number`` reported by ``pypdf``.

    Args:
        - apply_text_cleanup: Apply text cleanup for line breaks and repeated spaces.
        - cache_strategy: Defines the caching mechanism. To enable caching,
            a valid :py:class:`~pathway.udfs.CacheStrategy` should be provided.
            Defaults to None.
    """

    def __init__(
        self,
        apply_text_cleanup: bool = True,
        cache_strategy: udfs.CacheStrategy | None = None,
    ):
        # Import eagerly so that a missing optional dependency is reported when
        # the parser is constructed rather than when the first row is parsed.
        with optional_imports("xpack-llm-docs"):
            from pypdf import PdfReader  # noqa:F401

        super().__init__(cache_strategy=cache_strategy)
        self.apply_text_cleanup = apply_text_cleanup

    def __wrapped__(self, contents: bytes) -> list[tuple[str, dict]]:
        """Extract the text of every page of the PDF given as raw bytes.

        Args:
            contents: Raw bytes of the PDF document.

        Returns:
            One ``(text, metadata)`` tuple per page, in document order. The
            text is cleaned up when ``apply_text_cleanup`` is set.
        """
        from pypdf import PdfReader

        # Go through the ``io`` module (imported at the top of this file)
        # instead of depending on a bare ``BytesIO`` name being in scope.
        pdf = PdfReader(stream=io.BytesIO(contents))

        docs: list[tuple[str, dict]] = []
        # Placeholder for document-level metadata shared by every page.
        file_metadata: dict = {}

        logger.info(
            f"PypdfParser starting to parse a document of length: {len(pdf.pages)}"
        )

        for page in pdf.pages:
            text: str = page.extract_text()

            if self.apply_text_cleanup:
                text = self._clean_text(text)

            page_metadata: dict = file_metadata | {"page_number": page.page_number}

            docs.append((text, page_metadata))

        logger.info(
            f"PypdfParser completed parsing, total number of pages: {len(pdf.pages)}"
        )

        return docs

    def _clean_text(self, text: str) -> str:
        """Run all cleanup passes in order and return the cleaned text."""
        text_wo_lines = self._clean_text_lines(text)
        simplified_text = self._remove_empty_space(text_wo_lines)
        formatted_text = self._replace_newline_with_space_if_lower(simplified_text)
        return formatted_text

    def _clean_text_lines(self, text: str) -> str:
        """Strip whitespace that follows a newline.

        The regex drops whitespace between a newline and a token starting with
        an uppercase ASCII letter or a digit; afterwards a single space
        directly following any newline is removed as well.
        """
        return re.sub(
            r"(?<=\n)\s*([A-Z][^ ]*|[\d][^ ]*)", lambda m: m.group(1), text
        ).replace("\n ", "\n")

    def _remove_empty_space(self, text: str) -> str:
        """Collapse every run of two or more spaces into a single space."""
        # Replacing " " with " " would be a no-op; a regex handles runs of any
        # length in one pass.
        return re.sub(r" {2,}", " ", text)

    def _replace_newline_with_space_if_lower(self, text: str) -> str:
        """Remove unnecessary line breaks.

        A newline directly followed by a lowercase character is treated as a
        soft wrap inside a sentence and replaced with a space; newlines before
        any other word character are kept.
        """

        def replace_newline(match: re.Match) -> str:
            if match.group(1).islower():
                return " " + match.group(1)
            return "\n" + match.group(1)

        modified_text = re.sub(r"\n(\w)", replace_newline, text)
        return modified_text


async def parse_images(
images: list[Image.Image],
llm: pw.UDF,
Expand Down
34 changes: 33 additions & 1 deletion python/pathway/xpacks/llm/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@

import pathway as pw
from pathway.tests.utils import assert_table_equality
from pathway.xpacks.llm.parsers import OpenParse, ParseUnstructured, ParseUtf8
from pathway.xpacks.llm.parsers import (
OpenParse,
ParseUnstructured,
ParseUtf8,
PypdfParser,
)

for _ in range(10):
try:
Expand Down Expand Up @@ -126,3 +131,30 @@ def remove_line_breaks(t: str) -> str:
expected_table = pw.debug.table_from_pandas(expected_df)

assert_table_equality(result, expected_table)


def test_parse_pypdf(tmp_path: Path):
    """Check that ``PypdfParser`` returns the text a PDF was generated from.

    A PDF is built from a known string, fed to the parser as raw bytes through
    a one-row table, and the text of the first parsed chunk is compared with
    the original string.
    """
    # NOTE: the adjacent literals are joined without separating spaces, so the
    # words at the joins are fused (e.g. "eiusmodtempor"). The test only checks
    # that the text round-trips, so the string is kept exactly as it is.
    txt = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
        "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,"
        "quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
    )

    class Schema(pw.Schema):
        raw: bytes

    source_pdf = _create_temp_pdf_with_text(txt, tmp_path)
    with open(source_pdf, "rb") as handle:
        pdf_bytes = handle.read()

    parser = PypdfParser()

    input_table = pw.debug.table_from_pandas(
        pd.DataFrame([{"raw": pdf_bytes}]), schema=Schema
    )
    # First index selects the first (text, metadata) chunk, second its text.
    parsed = input_table.select(ret=parser(pw.this.raw)[0][0])

    expected = pw.debug.table_from_pandas(pd.DataFrame([{"ret": txt}]))
    assert_table_equality(parsed, expected)

0 comments on commit 74bd3a2

Please sign in to comment.