Skip to content

Commit

Permalink
Add tabular reader (#47)
Browse files Browse the repository at this point in the history
* tabular reader

* tabular reader

* tabular reader

* tabular reader

* tabular reader

* tabular reader

---------

Co-authored-by: Yue Fei <[email protected]>
  • Loading branch information
Ceceliachenen and moria97 authored Jun 5, 2024
1 parent 128f424 commit 115a695
Show file tree
Hide file tree
Showing 10 changed files with 579 additions and 114 deletions.
244 changes: 130 additions & 114 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ openinference-instrumentation = "^0.1.7"
llama-index-llms-huggingface = "^0.2.0"
pytest-asyncio = "^0.23.7"
pytest-cov = "^5.0.0"
xlrd = "^2.0.1"

[tool.poetry.scripts]
pai_rag = "pai_rag.main:main"
Expand Down
76 changes: 76 additions & 0 deletions src/pai_rag/docs/tabular_doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Tabular processing with PAI-RAG

## PaiCSVReader

PaiCSVReader(concat_rows=True, header=0)

### Parameters:

**concat_rows:** _bool, default=True._
Whether to concatenate rows into one document.

Note: `PaiCSVReader` has no `row_joiner` parameter; when `concat_rows=True`, rows are joined with `"\n"`.

**header:** _None or int, list of int, default 0._
row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row
positions will be combined into a MultiIndex. Use None if there is no header.

### Functions:

load_data(file: Path, extra_info: Optional[Dict] = None)

## PaiPandasCSVReader

PaiPandasCSVReader(concat_rows=True, row_joiner="\n", pandas_config={})

### Parameters:

**concat_rows:** _bool, default=True._
Whether to concatenate rows into one document.

**row_joiner:** _str, default="\n"._
The separator used to join rows.

**pandas_config:** _dict, default={}._
The configuration of pandas.read_csv.
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html for more information.
Set to empty dict by default, this means pandas will try to figure out the separators, table head, etc. on its own.

#### one important parameter:

**header:** _None or int, list of int, default 0._
Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row
positions will be combined into a MultiIndex. Use None if there is no header.

### Functions:

load_data(file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None)

## PaiPandasExcelReader

PaiPandasExcelReader(concat_rows=True, row_joiner="\n", pandas_config={})

### Parameters:

**concat_rows:** _bool, default=True._
Whether to concatenate rows into one document.

**row_joiner:** _str, default="\n"._
The separator used to join rows.

**pandas_config:** _dict, default={}._
The configuration of pandas.read_excel.
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html for more information.
Set to empty dict by default, this means pandas will try to figure out the separators, table head, etc. on its own.

#### one important parameter:

**header:** _None or int, list of int, default 0._
Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row
positions will be combined into a MultiIndex. Use None if there is no header.

### Functions:

load_data(file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None)
Only the first sheet of the workbook is processed.
151 changes: 151 additions & 0 deletions src/pai_rag/integrations/readers/pai_csv_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""Tabular parser-CSV parser.
Contains parsers for tabular data files.
"""

from pathlib import Path
from typing import Any, Dict, List, Optional
from fsspec import AbstractFileSystem

import pandas as pd
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document


class PaiCSVReader(BaseReader):
    """CSV parser based on the stdlib ``csv`` module.

    Reads a CSV file and converts its rows into ``Document`` objects. The
    configured header row(s) are zipped column-wise with each data row to
    produce a dict-like textual representation per row.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
        header (object): None or int, list of int, default 0.
            Row (0-indexed) to use for the column labels of the parsed rows.
            If a list of integers is passed, those row positions are combined
            column-wise into tuple keys. Use None if there is no header.
    """

    def __init__(
        self, *args: Any, concat_rows: bool = True, header: object = 0, **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._header = header

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a csv file.

        Args:
            file (Path): path of the CSV file to read.
            extra_info (Optional[Dict]): extra metadata merged into each
                document's metadata (overrides the default keys on collision).

        Returns:
            List[Document]: a single document containing all rows joined with
            "\\n" when ``concat_rows`` is True, otherwise one document per row.
        """
        # ``csv`` is stdlib and always importable; no ImportError guard needed.
        import csv

        text_list = []
        headers = []
        data_lines = []
        # Work on a local copy of the header spec so repeated load_data()
        # calls behave identically (previously self._header was mutated here,
        # changing the reader's behavior on subsequent calls).
        header_rows = self._header
        data_line_start_index = 1
        if isinstance(header_rows, list):
            data_line_start_index = max(header_rows) + 1
        elif isinstance(header_rows, int):
            data_line_start_index = header_rows + 1
            header_rows = [header_rows]

        with open(file) as fp:
            csv_reader = csv.reader(fp)

            if header_rows is None:
                # No header configured: emit each row as comma-joined text.
                for row in csv_reader:
                    text_list.append(", ".join(row))
            else:
                for i, row in enumerate(csv_reader):
                    if i in header_rows:
                        headers.append(row)
                    elif i >= data_line_start_index:
                        data_lines.append(row)
                # Combine (possibly multi-row) headers column-wise into
                # tuple keys, mirroring a pandas MultiIndex.
                headers = [tuple(group) for group in zip(*headers)]
                for line in data_lines:
                    # Rows whose width differs from the header are skipped.
                    if len(line) == len(headers):
                        data_entry = str(dict(zip(headers, line)))
                        text_list.append(data_entry)

        metadata = {"filename": file.name, "extension": file.suffix}
        if extra_info:
            metadata = {**metadata, **extra_info}

        if self._concat_rows:
            return [Document(text="\n".join(text_list), metadata=metadata)]
        else:
            return [Document(text=text, metadata=metadata) for text in text_list]


class PaiPandasCSVReader(BaseReader):
    r"""Pandas-based CSV parser.

    Parses CSVs using the separator detection from the pandas ``read_csv``
    function. If special parameters are required, use the ``pandas_config``
    dict.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
        row_joiner (str): Separator to use for joining each row.
            Only used when ``concat_rows=True``.
            Set to "\n" by default.
        pandas_config (Optional[dict]): Options for the ``pandas.read_csv``
            function call. Refer to
            https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
            for more information. Defaults to None (treated as an empty dict),
            which means pandas will try to figure out the separators, table
            head, etc. on its own.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        pandas_config: Optional[dict] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        # None sentinel instead of a mutable `{}` default: a shared default
        # dict would be aliased across every reader instance.
        self._pandas_config = pandas_config if pandas_config is not None else {}

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """Parse a csv file into documents.

        Args:
            file (Path): path of the CSV file to read.
            extra_info (Optional[Dict]): metadata attached to each document.
            fs (Optional[AbstractFileSystem]): optional fsspec filesystem to
                open the file through; falls back to the local filesystem.

        Returns:
            List[Document]: one document with all rows joined by
            ``row_joiner`` when ``concat_rows`` is True, else one per row.
        """
        if fs:
            with fs.open(file) as f:
                df = pd.read_csv(f, **self._pandas_config)
        else:
            df = pd.read_csv(file, **self._pandas_config)

        # Render each row as a str(dict) of column -> stringified value.
        text_list = df.apply(
            lambda row: str(dict(zip(df.columns, row.astype(str)))), axis=1
        ).tolist()

        if self._concat_rows:
            return [
                Document(
                    text=(self._row_joiner).join(text_list), metadata=extra_info or {}
                )
            ]
        else:
            return [
                Document(text=text, metadata=extra_info or {}) for text in text_list
            ]
121 changes: 121 additions & 0 deletions src/pai_rag/integrations/readers/pai_excel_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""Tabular parser-Excel parser.
Contains parsers for tabular data files.
"""

from pathlib import Path
from typing import Any, Dict, List, Optional
from fsspec import AbstractFileSystem
from openpyxl import load_workbook

import pandas as pd
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document


class PaiPandasExcelReader(BaseReader):
    r"""Pandas-based Excel parser.

    Only the first sheet of the workbook is processed. Merged cells are
    expanded so every covered cell carries the merged region's value.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
        row_joiner (str): Separator to use for joining each row.
            Only used when ``concat_rows=True``.
            Set to "\n" by default.
        pandas_config (Optional[dict]): Options for the ``pandas.read_excel``
            function call. Refer to
            https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information. Defaults to None (treated as an empty dict),
            which means pandas will try to figure out the separators, table
            head, etc. on its own.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        pandas_config: Optional[dict] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        # None sentinel instead of a mutable `{}` default: a shared default
        # dict would be aliased across every reader instance.
        self._pandas_config = pandas_config if pandas_config is not None else {}

    def read_xlsx(
        self,
        file: Path,
        fs: Optional[AbstractFileSystem] = None,
    ):
        """Parse Excel file (first sheet only) into a DataFrame.

        Merged regions in the sheet are back-filled: every cell covered by a
        merged range is assigned the range's top-left value.

        Args:
            file (Path): path of the .xlsx/.xls file.
            fs (Optional[AbstractFileSystem]): optional fsspec filesystem.

        Returns:
            pandas.DataFrame: the parsed first sheet.
        """
        if fs:
            with fs.open(file) as f:
                excel = pd.ExcelFile(load_workbook(f), engine="openpyxl")
        else:
            excel = pd.ExcelFile(load_workbook(file), engine="openpyxl")
        sheet_name = excel.sheet_names[0]
        sheet = excel.book[sheet_name]
        df = excel.parse(sheet_name, **self._pandas_config)

        # Largest header row index configured for read_excel; used below to
        # translate sheet coordinates into DataFrame coordinates.
        header_max = 0
        if (
            "header" in self._pandas_config
            and self._pandas_config["header"] is not None
            and isinstance(self._pandas_config["header"], list)
        ):
            header_max = max(self._pandas_config["header"])
        elif (
            "header" in self._pandas_config
            and self._pandas_config["header"] is not None
            and isinstance(self._pandas_config["header"], int)
        ):
            header_max = self._pandas_config["header"]

        for item in sheet.merged_cells:
            top_col, top_row, bottom_col, bottom_row = item.bounds
            base_value = item.start_cell.value
            # Convert 1-based index to 0-based index
            top_row -= 1
            top_col -= 1
            # Since the header rows are consumed by read_excel, sheet row
            # coordinates must be shifted up by the header height unless
            # header=None was requested explicitly.
            # NOTE(review): merged regions overlapping the header rows end up
            # with negative indices here — confirm intended for such sheets.
            if (
                "header" in self._pandas_config
                and self._pandas_config["header"] is not None
            ) or "header" not in self._pandas_config:
                top_row -= header_max + 1
                bottom_row -= header_max + 1

            df.iloc[top_row:bottom_row, top_col:bottom_col] = base_value
        return df

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """Parse Excel file. Only the first sheet is processed.

        Args:
            file (Path): path of the Excel file to read.
            extra_info (Optional[Dict]): metadata attached to each document.
            fs (Optional[AbstractFileSystem]): optional fsspec filesystem.

        Returns:
            List[Document]: one document with all rows joined by
            ``row_joiner`` when ``concat_rows`` is True, else one per row.
        """
        df = self.read_xlsx(file, fs)

        # Render each row as a str(dict) of column -> stringified value.
        text_list = df.apply(
            lambda row: str(dict(zip(df.columns, row.astype(str)))), axis=1
        ).tolist()

        if self._concat_rows:
            return [
                Document(
                    text=(self._row_joiner).join(text_list), metadata=extra_info or {}
                )
            ]
        else:
            return [
                Document(text=text, metadata=extra_info or {}) for text in text_list
            ]
11 changes: 11 additions & 0 deletions src/pai_rag/modules/datareader/datareader_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from pai_rag.integrations.readers.pai_pdf_reader import PaiPDFReader
from pai_rag.integrations.readers.llama_parse_reader import LlamaParseDirectoryReader
from pai_rag.integrations.readers.html.html_reader import HtmlReader
from pai_rag.integrations.readers.pai_csv_reader import PaiPandasCSVReader
from pai_rag.integrations.readers.pai_excel_reader import PaiPandasExcelReader
from llama_index.readers.database import DatabaseReader
from llama_index.core import SimpleDirectoryReader
import logging
Expand All @@ -25,6 +27,15 @@ def _create_new_instance(self, new_params: Dict[str, Any]):
enable_image_ocr=self.reader_config.get("enable_image_ocr", False),
model_dir=self.reader_config.get("easyocr_model_dir", None),
),
".csv": PaiPandasCSVReader(
concat_rows=self.reader_config.get("concat_rows", False),
),
".xlsx": PaiPandasExcelReader(
concat_rows=self.reader_config.get("concat_rows", False),
),
".xls": PaiPandasExcelReader(
concat_rows=self.reader_config.get("concat_rows", False),
),
}
return self

Expand Down
Loading

0 comments on commit 115a695

Please sign in to comment.