Skip to content

Commit

Permalink
Add tabular reader (#47)
Browse files Browse the repository at this point in the history
* tabular reader

* tabular reader

* tabular reader

* tabular reader

* tabular reader

* tabular reader

---------

Co-authored-by: Yue Fei <[email protected]>
  • Loading branch information
Ceceliachenen and moria97 authored Jun 5, 2024
1 parent 128f424 commit 115a695
Show file tree
Hide file tree
Showing 10 changed files with 579 additions and 114 deletions.
244 changes: 130 additions & 114 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ openinference-instrumentation = "^0.1.7"
llama-index-llms-huggingface = "^0.2.0"
pytest-asyncio = "^0.23.7"
pytest-cov = "^5.0.0"
xlrd = "^2.0.1"

[tool.poetry.scripts]
pai_rag = "pai_rag.main:main"
Expand Down
76 changes: 76 additions & 0 deletions src/pai_rag/docs/tabular_doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Tabular processing with PAI-RAG

## PaiCSVReader

PaiCSVReader(concat_rows=True, header=0)

### Parameters:

**concat_rows:** _bool, default=True._
Whether to concatenate rows into one document.

Note: `PaiCSVReader` has no `row_joiner` parameter; when `concat_rows=True`, rows are joined with `"\n"`.

**header:** _None or int, list of int, default 0._
row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row
positions will be combined into a MultiIndex. Use None if there is no header.

### Functions:

load_data(file: Path, extra_info: Optional[Dict] = None)

## PaiPandasCSVReader

PaiPandasCSVReader(concat_rows=True, row_joiner="\n", pandas_config={})

### Parameters:

**concat_rows:** _bool, default=True._
Whether to concatenate rows into one document.

**row_joiner:** _str, default="\n"._
The separator used to join rows.

**pandas_config:** _dict, default={}._
The configuration of pandas.read_csv.
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html for more information.
Set to empty dict by default, this means pandas will try to figure out the separators, table head, etc. on its own.

#### one important parameter:

**header:** _None or int, list of int, default 0._
Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row
positions will be combined into a MultiIndex. Use None if there is no header.

### Functions:

load_data(file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None)

## PaiPandasExcelReader

PaiPandasExcelReader(concat_rows=True, row_joiner="\n", pandas_config={})

### Parameters:

**concat_rows:** _bool, default=True._
Whether to concatenate rows into one document.

**row_joiner:** _str, default="\n"._
The separator used to join rows.

**pandas_config:** _dict, default={}._
The configuration of pandas.read_excel.
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html for more information.
Set to empty dict by default, this means pandas will try to figure out the separators, table head, etc. on its own.

#### one important parameter:

**header:** _None or int, list of int, default 0._
Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row
positions will be combined into a MultiIndex. Use None if there is no header.

### Functions:

load_data(file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None)
Only the first sheet of the workbook is processed.
151 changes: 151 additions & 0 deletions src/pai_rag/integrations/readers/pai_csv_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""Tabular parser-CSV parser.
Contains parsers for tabular data files.
"""

from pathlib import Path
from typing import Any, Dict, List, Optional
from fsspec import AbstractFileSystem

import pandas as pd
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document


class PaiCSVReader(BaseReader):
    """CSV parser based on the stdlib ``csv`` module.

    Reads a CSV file and converts its rows into ``Document`` objects. The
    configured header row(s) are zipped column-wise with each data row to
    produce a dict-like textual representation per row.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
        header (object): None or int, list of int, default 0.
            Row (0-indexed) to use for the column labels of the parsed rows.
            If a list of integers is passed, those row positions are combined
            column-wise into tuple keys. Use None if there is no header.
    """

    def __init__(
        self, *args: Any, concat_rows: bool = True, header: object = 0, **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._header = header

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a csv file.

        Args:
            file (Path): path of the CSV file to read.
            extra_info (Optional[Dict]): extra metadata merged into each
                document's metadata (overrides the default keys on collision).

        Returns:
            List[Document]: a single document containing all rows joined with
            "\\n" when ``concat_rows`` is True, otherwise one document per row.
        """
        # ``csv`` is stdlib and always importable; no ImportError guard needed.
        import csv

        text_list = []
        headers = []
        data_lines = []
        # Work on a local copy of the header spec so repeated load_data()
        # calls behave identically (previously self._header was mutated here,
        # changing the reader's behavior on subsequent calls).
        header_rows = self._header
        data_line_start_index = 1
        if isinstance(header_rows, list):
            data_line_start_index = max(header_rows) + 1
        elif isinstance(header_rows, int):
            data_line_start_index = header_rows + 1
            header_rows = [header_rows]

        with open(file) as fp:
            csv_reader = csv.reader(fp)

            if header_rows is None:
                # No header configured: emit each row as comma-joined text.
                for row in csv_reader:
                    text_list.append(", ".join(row))
            else:
                for i, row in enumerate(csv_reader):
                    if i in header_rows:
                        headers.append(row)
                    elif i >= data_line_start_index:
                        data_lines.append(row)
                # Combine (possibly multi-row) headers column-wise into
                # tuple keys, mirroring a pandas MultiIndex.
                headers = [tuple(group) for group in zip(*headers)]
                for line in data_lines:
                    # Rows whose width differs from the header are skipped.
                    if len(line) == len(headers):
                        data_entry = str(dict(zip(headers, line)))
                        text_list.append(data_entry)

        metadata = {"filename": file.name, "extension": file.suffix}
        if extra_info:
            metadata = {**metadata, **extra_info}

        if self._concat_rows:
            return [Document(text="\n".join(text_list), metadata=metadata)]
        else:
            return [Document(text=text, metadata=metadata) for text in text_list]


class PaiPandasCSVReader(BaseReader):
    r"""Pandas-based CSV parser.

    Parses CSVs using the separator detection from the pandas ``read_csv``
    function. If special parameters are required, use the ``pandas_config``
    dict.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
        row_joiner (str): Separator to use for joining each row.
            Only used when ``concat_rows=True``.
            Set to "\n" by default.
        pandas_config (Optional[dict]): Options for the ``pandas.read_csv``
            function call. Refer to
            https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
            for more information. Defaults to None (treated as an empty dict),
            which means pandas will try to figure out the separators, table
            head, etc. on its own.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        pandas_config: Optional[dict] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        # None sentinel instead of a mutable `{}` default: a shared default
        # dict would be aliased across every reader instance.
        self._pandas_config = pandas_config if pandas_config is not None else {}

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """Parse a csv file into documents.

        Args:
            file (Path): path of the CSV file to read.
            extra_info (Optional[Dict]): metadata attached to each document.
            fs (Optional[AbstractFileSystem]): optional fsspec filesystem to
                open the file through; falls back to the local filesystem.

        Returns:
            List[Document]: one document with all rows joined by
            ``row_joiner`` when ``concat_rows`` is True, else one per row.
        """
        if fs:
            with fs.open(file) as f:
                df = pd.read_csv(f, **self._pandas_config)
        else:
            df = pd.read_csv(file, **self._pandas_config)

        # Render each row as a str(dict) of column -> stringified value.
        text_list = df.apply(
            lambda row: str(dict(zip(df.columns, row.astype(str)))), axis=1
        ).tolist()

        if self._concat_rows:
            return [
                Document(
                    text=(self._row_joiner).join(text_list), metadata=extra_info or {}
                )
            ]
        else:
            return [
                Document(text=text, metadata=extra_info or {}) for text in text_list
            ]
121 changes: 121 additions & 0 deletions src/pai_rag/integrations/readers/pai_excel_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""Tabular parser-Excel parser.
Contains parsers for tabular data files.
"""

from pathlib import Path
from typing import Any, Dict, List, Optional
from fsspec import AbstractFileSystem
from openpyxl import load_workbook

import pandas as pd
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document


class PaiPandasExcelReader(BaseReader):
    r"""Pandas-based Excel parser.

    Only the first sheet of the workbook is processed. Merged cells are
    expanded so every covered cell carries the merged region's value.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
        row_joiner (str): Separator to use for joining each row.
            Only used when ``concat_rows=True``.
            Set to "\n" by default.
        pandas_config (Optional[dict]): Options for the ``pandas.read_excel``
            function call. Refer to
            https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information. Defaults to None (treated as an empty dict),
            which means pandas will try to figure out the separators, table
            head, etc. on its own.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        pandas_config: Optional[dict] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        # None sentinel instead of a mutable `{}` default: a shared default
        # dict would be aliased across every reader instance.
        self._pandas_config = pandas_config if pandas_config is not None else {}

    def read_xlsx(
        self,
        file: Path,
        fs: Optional[AbstractFileSystem] = None,
    ):
        """Parse Excel file (first sheet only) into a DataFrame.

        Merged regions in the sheet are back-filled: every cell covered by a
        merged range is assigned the range's top-left value.

        Args:
            file (Path): path of the .xlsx/.xls file.
            fs (Optional[AbstractFileSystem]): optional fsspec filesystem.

        Returns:
            pandas.DataFrame: the parsed first sheet.
        """
        if fs:
            with fs.open(file) as f:
                excel = pd.ExcelFile(load_workbook(f), engine="openpyxl")
        else:
            excel = pd.ExcelFile(load_workbook(file), engine="openpyxl")
        sheet_name = excel.sheet_names[0]
        sheet = excel.book[sheet_name]
        df = excel.parse(sheet_name, **self._pandas_config)

        # Largest header row index configured for read_excel; used below to
        # translate sheet coordinates into DataFrame coordinates.
        header_max = 0
        if (
            "header" in self._pandas_config
            and self._pandas_config["header"] is not None
            and isinstance(self._pandas_config["header"], list)
        ):
            header_max = max(self._pandas_config["header"])
        elif (
            "header" in self._pandas_config
            and self._pandas_config["header"] is not None
            and isinstance(self._pandas_config["header"], int)
        ):
            header_max = self._pandas_config["header"]

        for item in sheet.merged_cells:
            top_col, top_row, bottom_col, bottom_row = item.bounds
            base_value = item.start_cell.value
            # Convert 1-based index to 0-based index
            top_row -= 1
            top_col -= 1
            # Since the header rows are consumed by read_excel, sheet row
            # coordinates must be shifted up by the header height unless
            # header=None was requested explicitly.
            # NOTE(review): merged regions overlapping the header rows end up
            # with negative indices here — confirm intended for such sheets.
            if (
                "header" in self._pandas_config
                and self._pandas_config["header"] is not None
            ) or "header" not in self._pandas_config:
                top_row -= header_max + 1
                bottom_row -= header_max + 1

            df.iloc[top_row:bottom_row, top_col:bottom_col] = base_value
        return df

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """Parse Excel file. Only the first sheet is processed.

        Args:
            file (Path): path of the Excel file to read.
            extra_info (Optional[Dict]): metadata attached to each document.
            fs (Optional[AbstractFileSystem]): optional fsspec filesystem.

        Returns:
            List[Document]: one document with all rows joined by
            ``row_joiner`` when ``concat_rows`` is True, else one per row.
        """
        df = self.read_xlsx(file, fs)

        # Render each row as a str(dict) of column -> stringified value.
        text_list = df.apply(
            lambda row: str(dict(zip(df.columns, row.astype(str)))), axis=1
        ).tolist()

        if self._concat_rows:
            return [
                Document(
                    text=(self._row_joiner).join(text_list), metadata=extra_info or {}
                )
            ]
        else:
            return [
                Document(text=text, metadata=extra_info or {}) for text in text_list
            ]
11 changes: 11 additions & 0 deletions src/pai_rag/modules/datareader/datareader_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from pai_rag.integrations.readers.pai_pdf_reader import PaiPDFReader
from pai_rag.integrations.readers.llama_parse_reader import LlamaParseDirectoryReader
from pai_rag.integrations.readers.html.html_reader import HtmlReader
from pai_rag.integrations.readers.pai_csv_reader import PaiPandasCSVReader
from pai_rag.integrations.readers.pai_excel_reader import PaiPandasExcelReader
from llama_index.readers.database import DatabaseReader
from llama_index.core import SimpleDirectoryReader
import logging
Expand All @@ -25,6 +27,15 @@ def _create_new_instance(self, new_params: Dict[str, Any]):
enable_image_ocr=self.reader_config.get("enable_image_ocr", False),
model_dir=self.reader_config.get("easyocr_model_dir", None),
),
".csv": PaiPandasCSVReader(
concat_rows=self.reader_config.get("concat_rows", False),
),
".xlsx": PaiPandasExcelReader(
concat_rows=self.reader_config.get("concat_rows", False),
),
".xls": PaiPandasExcelReader(
concat_rows=self.reader_config.get("concat_rows", False),
),
}
return self

Expand Down
Loading

0 comments on commit 115a695

Please sign in to comment.