add pypdf parser to xpacks (#7808)

GitOrigin-RevId: 4e2b4780259639833d9702f006a966dc326eb527
pathwaycom · Dec 9, 2024 · 74bd3a2 · 74bd3a2
1 parent 450e983
commit 74bd3a2
Show file tree

Hide file tree

Showing 3 changed files with 112 additions and 1 deletion.
diff --git a/pyproject.toml b/pyproject.toml
@@ -74,6 +74,7 @@ xpack-llm-docs = [
     "openparse == 0.5.6",
     "unstructured >= 0.16",
     "pdf2image",
+    "pypdf",
 ]
 xpack-sharepoint = [
     "Office365-REST-Python-Client >= 2.5.3",

diff --git a/python/pathway/xpacks/llm/parsers.py b/python/pathway/xpacks/llm/parsers.py
@@ -11,6 +11,7 @@
 import io
 import logging
 import os
+import re
 import subprocess
 import tempfile
 import warnings
@@ -742,6 +743,83 @@ async def __wrapped__(self, contents: bytes) -> list[tuple[str, dict]]:
         return docs
 
 
+class PypdfParser(pw.UDF):
+    """
+    Parse a PDF document using the ``pypdf`` library.
+    Optionally applies additional text cleanup for readability.
+
+    Args:
+        - apply_text_cleanup: Apply text cleanup for line breaks and repeated spaces.
+        - cache_strategy: Defines the caching mechanism. To enable caching,
+            a valid :py:class:`~pathway.udfs.CacheStrategy` should be provided.
+            Defaults to None.
+    """
+
+    def __init__(
+        self,
+        apply_text_cleanup: bool = True,
+        cache_strategy: udfs.CacheStrategy | None = None,
+    ):
+        with optional_imports("xpack-llm-docs"):
+            from pypdf import PdfReader  # noqa:F401
+
+        super().__init__(cache_strategy=cache_strategy)
+        self.apply_text_cleanup = apply_text_cleanup
+
+    def __wrapped__(self, contents: bytes) -> list[tuple[str, dict]]:
+        from pypdf import PdfReader
+
+        pdf = PdfReader(stream=BytesIO(contents))
+
+        docs: list[tuple[str, dict]] = []
+        file_metadata: dict = {}
+
+        logger.info(
+            f"PypdfParser starting to parse a document of length: {len(pdf.pages)}"
+        )
+
+        for page in pdf.pages:
+            text: str = page.extract_text()
+
+            if self.apply_text_cleanup:
+                text = self._clean_text(text)
+
+            page_metadata: dict = file_metadata | {"page_number": page.page_number}
+
+            docs.append((text, page_metadata))
+
+        logger.info(
+            f"PypdfParser completed parsing, total number of pages: {len(pdf.pages)}"
+        )
+
+        return docs
+
+    def _clean_text(self, text: str):
+        text_wo_lines = self._clean_text_lines(text)
+        simplified_text = self._remove_empty_space(text_wo_lines)
+        formatted_text = self._replace_newline_with_space_if_lower(simplified_text)
+        return formatted_text
+
+    def _clean_text_lines(self, text: str) -> str:
+        return re.sub(
+            r"(?<=\n)\s*([A-Z][^ ]*|[\d][^ ]*)", lambda m: m.group(1), text
+        ).replace("\n ", "\n")
+
+    def _remove_empty_space(self, text: str) -> str:
+        return text.replace("   ", " ")
+
+    def _replace_newline_with_space_if_lower(self, text: str) -> str:
+        """Replace a newline with a space when a lowercase letter follows it."""
+
+        def replace_newline(match: re.Match):
+            if match.group(1).islower():
+                return " " + match.group(1)
+            return "\n" + match.group(1)
+
+        modified_text = re.sub(r"\n(\w)", replace_newline, text)
+        return modified_text
+
+
 async def parse_images(
     images: list[Image.Image],
     llm: pw.UDF,

diff --git a/python/pathway/xpacks/llm/tests/test_parsers.py b/python/pathway/xpacks/llm/tests/test_parsers.py
@@ -12,7 +12,12 @@
 
 import pathway as pw
 from pathway.tests.utils import assert_table_equality
-from pathway.xpacks.llm.parsers import OpenParse, ParseUnstructured, ParseUtf8
+from pathway.xpacks.llm.parsers import (
+    OpenParse,
+    ParseUnstructured,
+    ParseUtf8,
+    PypdfParser,
+)
 
 for _ in range(10):
     try:
@@ -126,3 +131,30 @@ def remove_line_breaks(t: str) -> str:
     expected_table = pw.debug.table_from_pandas(expected_df)
 
     assert_table_equality(result, expected_table)
+
+
+def test_parse_pypdf(tmp_path: Path):
+    parser = PypdfParser()
+
+    txt = (
+        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
+        "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,"
+        "quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
+    )
+
+    pdf_path = _create_temp_pdf_with_text(txt, tmp_path)
+
+    with open(pdf_path, "rb") as pdf_file:
+        raw_pdf_data = pdf_file.read()
+
+    input_df = pd.DataFrame([dict(raw=raw_pdf_data)])
+
+    class Schema(pw.Schema):
+        raw: bytes
+
+    input_table = pw.debug.table_from_pandas(input_df, schema=Schema)
+    result = input_table.select(ret=parser(pw.this.raw)[0][0])
+
+    assert_table_equality(
+        result, pw.debug.table_from_pandas(pd.DataFrame([dict(ret=txt)]))
+    )