-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* tabular reader * tabular reader * tabular reader * tabular reader * tabular reader * tabular reader --------- Co-authored-by: Yue Fei <[email protected]>
- Loading branch information
1 parent
b78ab56
commit 6a29d81
Showing
10 changed files
with
579 additions
and
114 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# Tabular processing with PAI-RAG | ||
|
||
## PaiCSVReader | ||
|
||
PaiCSVReader(concat_rows=True, row_joiner="\n", header=0) | ||
|
||
### Parameters: | ||
|
||
**concat_rows:** _bool, default=True._ | ||
Whether to concatenate rows into one document. | ||
|
||
**row_joiner:** _str, default="\n"._ | ||
The separator used to join rows. | ||
|
||
**header:** _None or int, list of int, default 0._ | ||
Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed, those row | ||
positions will be combined into a MultiIndex. Use None if there is no header. | ||
|
||
### Functions: | ||
|
||
load_data(file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None) | ||
|
||
## PaiPandasCSVReader | ||
|
||
PaiPandasCSVReader(concat_rows=True, row_joiner="\n", pandas_config={}) | ||
|
||
### Parameters: | ||
|
||
**concat_rows:** _bool, default=True._ | ||
Whether to concatenate rows into one document. | ||
|
||
**row_joiner:** _str, default="\n"._ | ||
The separator used to join rows. | ||
|
||
**pandas_config:** _dict, default={}._ | ||
The configuration of pandas.read_csv. | ||
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html for more information. | ||
Set to an empty dict by default, which means pandas will try to figure out the separators, table header, etc. on its own. | ||
|
||
#### one important parameter: | ||
|
||
**header:** _None or int, list of int, default 0._ | ||
Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row | ||
positions will be combined into a MultiIndex. Use None if there is no header. | ||
|
||
### Functions: | ||
|
||
load_data(file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None) | ||
|
||
## PaiPandasExcelReader | ||
|
||
PaiPandasExcelReader(concat_rows=True, row_joiner="\n", pandas_config={}) | ||
|
||
### Parameters: | ||
|
||
**concat_rows:** _bool, default=True._ | ||
Whether to concatenate rows into one document. | ||
|
||
**row_joiner:** _str, default="\n"._ | ||
The separator used to join rows. | ||
|
||
**pandas_config:** _dict, default={}._ | ||
The configuration of pandas.read_excel. | ||
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html for more information. | ||
Set to an empty dict by default, which means pandas will try to figure out the separators, table header, etc. on its own. | ||
|
||
#### one important parameter: | ||
|
||
**header:** _None or int, list of int, default 0._ | ||
Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row | ||
positions will be combined into a MultiIndex. Use None if there is no header. | ||
|
||
### Functions: | ||
|
||
load_data(file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None) | ||
Only the first sheet of the workbook is processed. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
"""Tabular parser-CSV parser. | ||
Contains parsers for tabular data files. | ||
""" | ||
|
||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional | ||
from fsspec import AbstractFileSystem | ||
|
||
import pandas as pd | ||
from llama_index.core.readers.base import BaseReader | ||
from llama_index.core.schema import Document | ||
|
||
|
||
class PaiCSVReader(BaseReader):
    r"""CSV parser built on the standard-library ``csv`` module.

    When a header is configured, every data row is rendered as the string
    form of a dict mapping column labels to cell values. Column labels are
    tuples holding one entry per header row, so a single header row yields
    1-tuples such as ``('name',)``. When ``header`` is None, every row is
    rendered as its cells joined by ", ".

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.
        header (object): None or int, list of int, default 0.
            Row (0-indexed) to use for the column labels. If a list of
            integers is passed, those rows are combined into multi-level
            labels. Use None if there is no header.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        header: object = 0,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        self._header = header

    def _header_indexes(self) -> Optional[List[int]]:
        """Return the configured header row positions as a list, or None."""
        header = self._header
        if header is None:
            return None
        if isinstance(header, int):
            return [header]
        if isinstance(header, (list, tuple, set, frozenset)):
            if not header:
                raise ValueError("header must contain at least one row index.")
            return list(header)
        raise TypeError("header must be None, an int, or a list of ints.")

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """Parse csv file.

        Args:
            file (Path): path of the CSV file to read.
            extra_info (Optional[Dict]): extra metadata merged over the
                default ``filename`` / ``extension`` entries.
            fs (Optional[AbstractFileSystem]): filesystem used to open
                ``file``; the local filesystem is used when omitted.

        Returns:
            List[Document]: a single Document holding all rows when
            ``concat_rows`` is True, otherwise one Document per row.
        """
        # Kept local so this class does not rely on a module-level import.
        import csv

        # Resolved into a local: the configured value on ``self`` is never
        # rewritten, so repeated calls see the same configuration.
        header_indexes = self._header_indexes()
        header_rows: List[List[str]] = []
        data_rows: List[List[str]] = []
        text_list: List[str] = []

        # newline="" lets the csv module do its own line-ending handling,
        # which is required for quoted fields that contain line breaks.
        opener = fs.open if fs else open
        with opener(file, "r", newline="") as fp:
            csv_reader = csv.reader(fp)
            if header_indexes is None:
                text_list = [", ".join(row) for row in csv_reader]
            else:
                first_data_index = max(header_indexes) + 1
                for i, row in enumerate(csv_reader):
                    if i in header_indexes:
                        header_rows.append(row)
                    elif i >= first_data_index:
                        data_rows.append(row)

        if header_indexes is not None:
            # Transpose the header rows: one tuple of labels per column.
            column_keys = [tuple(group) for group in zip(*header_rows)]
            # Rows whose width differs from the header are skipped.
            text_list = [
                str(dict(zip(column_keys, row)))
                for row in data_rows
                if len(row) == len(column_keys)
            ]

        file_path = Path(file)
        metadata = {"filename": file_path.name, "extension": file_path.suffix}
        if extra_info:
            metadata = {**metadata, **extra_info}

        if self._concat_rows:
            return [
                Document(text=self._row_joiner.join(text_list), metadata=metadata)
            ]
        return [Document(text=text, metadata=metadata) for text in text_list]
|
||
|
||
class PaiPandasCSVReader(BaseReader):
    r"""Pandas-based CSV parser.

    Parses CSVs using the separator detection from Pandas `read_csv` function.
    If special parameters are required, use the `pandas_config` dict.

    Every row of the parsed DataFrame is rendered as the string form of a
    dict mapping column labels to the cell values converted to ``str``.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.
        pandas_config (dict): Options for the `pandas.read_csv` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
            for more information.
            Defaults to None, which is treated as an empty dict; pandas will
            then try to figure out the separators, table head, etc. on its own.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        pandas_config: Optional[dict] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        # Copy so that instances never share (or alias) one config dict.
        self._pandas_config = dict(pandas_config) if pandas_config else {}

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """Parse csv file.

        Args:
            file (Path): path of the CSV file to read.
            extra_info (Optional[Dict]): metadata attached to every returned
                Document; an empty dict is used when omitted.
            fs (Optional[AbstractFileSystem]): filesystem used to open
                ``file``; when omitted the path is handed to pandas directly.

        Returns:
            List[Document]: a single Document holding all rows when
            ``concat_rows`` is True, otherwise one Document per row.
        """
        if fs:
            with fs.open(file) as f:
                df = pd.read_csv(f, **self._pandas_config)
        else:
            df = pd.read_csv(file, **self._pandas_config)

        # Row-wise conversion is kept on purpose: each row is cast to str as
        # a whole, so its formatting follows the row's combined dtype.
        text_list = df.apply(
            lambda row: str(dict(zip(df.columns, row.astype(str)))), axis=1
        ).tolist()

        metadata = extra_info or {}
        if self._concat_rows:
            return [
                Document(text=self._row_joiner.join(text_list), metadata=metadata)
            ]
        return [Document(text=text, metadata=metadata) for text in text_list]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
"""Tabular parser-Excel parser. | ||
Contains parsers for tabular data files. | ||
""" | ||
|
||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional | ||
from fsspec import AbstractFileSystem | ||
from openpyxl import load_workbook | ||
|
||
import pandas as pd | ||
from llama_index.core.readers.base import BaseReader | ||
from llama_index.core.schema import Document | ||
|
||
|
||
class PaiPandasExcelReader(BaseReader):
    r"""Pandas-based Excel parser.

    Only the first sheet of the workbook is processed. Merged cell ranges
    are back-filled so that every data cell covered by a merge carries the
    value of the range's top-left cell.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.
        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.
        pandas_config (dict): Options for the `pandas.read_excel` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information.
            Defaults to None, which is treated as an empty dict; pandas will
            then try to figure out the table head, etc. on its own.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        pandas_config: Optional[dict] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        # Copy so that instances never share (or alias) one config dict.
        self._pandas_config = dict(pandas_config) if pandas_config else {}

    def read_xlsx(
        self,
        file: Path,
        fs: Optional[AbstractFileSystem] = None,
    ) -> pd.DataFrame:
        """Parse the first sheet of an Excel file into a DataFrame.

        Args:
            file (Path): path of the workbook to read.
            fs (Optional[AbstractFileSystem]): filesystem used to open
                ``file``; when omitted the path is handed to openpyxl directly.

        Returns:
            pd.DataFrame: the parsed sheet with merged ranges back-filled.
        """
        if fs:
            with fs.open(file) as f:
                excel = pd.ExcelFile(load_workbook(f), engine="openpyxl")
        else:
            excel = pd.ExcelFile(load_workbook(file), engine="openpyxl")
        sheet_name = excel.sheet_names[0]
        sheet = excel.book[sheet_name]
        df = excel.parse(sheet_name, **self._pandas_config)

        # A missing "header" key behaves like header=0 (one header row).
        header = self._pandas_config.get("header", 0)
        if isinstance(header, list):
            header_max = max(header)
        elif isinstance(header, int):
            header_max = header
        else:
            header_max = 0
        # Number of leading sheet rows that are not DataFrame data rows.
        row_offset = 0 if header is None else header_max + 1

        # NOTE(review): the mapping below assumes sheet rows/columns line up
        # with DataFrame positions apart from the header offset; options such
        # as skiprows/usecols/index_col would break that - confirm with callers.
        for item in sheet.merged_cells:
            top_col, top_row, bottom_col, bottom_row = item.bounds
            base_value = item.start_cell.value
            # Bounds are 1-based and inclusive; convert to 0-based half-open
            # positions relative to the first data row.
            first_row = top_row - 1 - row_offset
            stop_row = bottom_row - row_offset
            first_col = top_col - 1

            if stop_row <= 0:
                # The range lies entirely inside the header rows.
                continue
            # Clamp so a range that starts in the header is not interpreted
            # by iloc as a negative (count-from-the-end) index.
            first_row = max(first_row, 0)

            df.iloc[first_row:stop_row, first_col:bottom_col] = base_value
        return df

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """Parse Excel file. Only the first sheet is processed.

        Args:
            file (Path): path of the workbook to read.
            extra_info (Optional[Dict]): metadata attached to every returned
                Document; an empty dict is used when omitted.
            fs (Optional[AbstractFileSystem]): filesystem used to open ``file``.

        Returns:
            List[Document]: a single Document holding all rows when
            ``concat_rows`` is True, otherwise one Document per row.
        """
        df = self.read_xlsx(file, fs)

        # Row-wise conversion is kept on purpose: each row is cast to str as
        # a whole, so its formatting follows the row's combined dtype.
        text_list = df.apply(
            lambda row: str(dict(zip(df.columns, row.astype(str)))), axis=1
        ).tolist()

        metadata = extra_info or {}
        if self._concat_rows:
            return [
                Document(text=self._row_joiner.join(text_list), metadata=metadata)
            ]
        return [Document(text=text, metadata=metadata) for text in text_list]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.