Skip to content

Commit

Permalink
fix html reader
Browse files Browse the repository at this point in the history
  • Loading branch information
Ceceliachenen committed Dec 5, 2024
1 parent f495a8b commit 5ee5f63
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 13 deletions.
9 changes: 0 additions & 9 deletions src/pai_rag/integrations/readers/pai_html_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,6 @@ def _convert_table_to_pai_table(self, table):

if current_row_index == 0:
max_cols += col_span
# print("row_span", row_span)
# print("max_rows", max_rows)
# print("current_row_index", current_row_index)
# print("table_matrix", table_matrix)
# print("current_col_index", current_col_index)
# print("row_cells", row_cells)
# print("max_cols", max_cols)
for i in range(1, row_span):
if current_row_index + i >= max_rows:
row_cells = [""] * max_cols
Expand Down Expand Up @@ -241,8 +234,6 @@ def load(
"""

md_content = self.convert_html_to_markdown(file_path)
with open("tests/testdata/data/test_back_data/test_html_pai.md", "w") as f:
f.write(md_content)
logger.info(f"[PaiHtmlReader] successfully processed html file {file_path}.")
docs = []
if metadata and extra_info:
Expand Down
7 changes: 3 additions & 4 deletions src/pai_rag/utils/markdown_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ def get_columns(self):
else:
data_col_start_index = 0
return [
[row[i] for i in range(data_col_start_index, self.get_col_numbers())]
for row in self.data
[row[col] for row in self.data]
for col in range(data_col_start_index, self.get_col_numbers())
]


Expand Down Expand Up @@ -114,11 +114,10 @@ def convert_table_to_markdown(table: PaiTable, total_cols: int) -> str:
if len(table.get_column_headers()) > 0:
headers = table.get_column_headers()
rows = table.get_columns()
total_cols = table.get_row_numbers()
else:
headers = table.get_row_headers()
rows = table.get_rows()
print(headers)
print(rows)
if headers:
for header in headers:
markdown.append("| " + " | ".join(header) + " |")
Expand Down
23 changes: 23 additions & 0 deletions tests/data_readers/test_html_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
from pathlib import Path
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_html_reader import PaiHtmlReader

# Three directory levels above this test file (tests/data_readers/ -> tests/ -> parent of tests/);
# used below as the base for locating src/pai_rag/config/settings.toml.
BASE_DIR = Path(__file__).parent.parent.parent


def test_pai_html_reader():
    """Load the HTML fixture directory through PaiDataReader and check the document count.

    The data reader is resolved from the project's ``settings.toml``, a
    ``PaiHtmlReader`` is assigned as the handler for the ``.html`` extension,
    and the fixture directory is loaded. Both the config file and the fixture
    directory are anchored at ``BASE_DIR`` so the test does not depend on the
    current working directory pytest was launched from.
    """
    config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
    config = RagConfigManager.from_file(config_file).get_value()
    directory_reader = resolve(
        cls=PaiDataReader,
        reader_config=config.data_reader,
    )
    # Resolve the fixtures relative to BASE_DIR (same as the config file above)
    # instead of relying on the process working directory.
    input_dir = os.path.join(BASE_DIR, "tests/testdata/data/html_data")

    directory_reader.file_readers[".html"] = PaiHtmlReader()

    documents = directory_reader.load_data(file_path_or_directory=input_dir)
    # The assertion pins the number of documents produced from the fixture directory.
    assert len(documents) == 5

0 comments on commit 5ee5f63

Please sign in to comment.