diff --git a/src/pai_rag/integrations/readers/pai_html_reader.py b/src/pai_rag/integrations/readers/pai_html_reader.py index 48f7d937..1f8e7ead 100644 --- a/src/pai_rag/integrations/readers/pai_html_reader.py +++ b/src/pai_rag/integrations/readers/pai_html_reader.py @@ -100,13 +100,6 @@ def _convert_table_to_pai_table(self, table): if current_row_index == 0: max_cols += col_span - # print("row_span", row_span) - # print("max_rows", max_rows) - # print("current_row_index", current_row_index) - # print("table_matrix", table_matrix) - # print("current_col_index", current_col_index) - # print("row_cells", row_cells) - # print("max_cols", max_cols) for i in range(1, row_span): if current_row_index + i >= max_rows: row_cells = [""] * max_cols @@ -241,8 +234,6 @@ def load( """ md_content = self.convert_html_to_markdown(file_path) - with open("tests/testdata/data/test_back_data/test_html_pai.md", "w") as f: - f.write(md_content) logger.info(f"[PaiHtmlReader] successfully processed html file {file_path}.") docs = [] if metadata and extra_info: diff --git a/src/pai_rag/utils/markdown_utils.py b/src/pai_rag/utils/markdown_utils.py index 08663587..c997905c 100644 --- a/src/pai_rag/utils/markdown_utils.py +++ b/src/pai_rag/utils/markdown_utils.py @@ -48,8 +48,8 @@ def get_columns(self): else: data_col_start_index = 0 return [ - [row[i] for i in range(data_col_start_index, self.get_col_numbers())] - for row in self.data + [row[col] for row in self.data] + for col in range(data_col_start_index, self.get_col_numbers()) ] @@ -114,11 +114,10 @@ def convert_table_to_markdown(table: PaiTable, total_cols: int) -> str: if len(table.get_column_headers()) > 0: headers = table.get_column_headers() rows = table.get_columns() + total_cols = table.get_row_numbers() else: headers = table.get_row_headers() rows = table.get_rows() - print(headers) - print(rows) if headers: for header in headers: markdown.append("| " + " | ".join(header) + " |") diff --git a/tests/data_readers/test_html_reader.py 
b/tests/data_readers/test_html_reader.py
new file mode 100644
index 00000000..f4e8f228
--- /dev/null
+++ b/tests/data_readers/test_html_reader.py
@@ -0,0 +1,29 @@
+import os
+from pathlib import Path
+from pai_rag.core.rag_config_manager import RagConfigManager
+from pai_rag.core.rag_module import resolve
+from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
+from pai_rag.integrations.readers.pai_html_reader import PaiHtmlReader
+
+# Repository root: this file lives at <root>/tests/data_readers/.
+BASE_DIR = Path(__file__).parent.parent.parent
+
+
+def test_pai_html_reader():
+    """Load the HTML fixture directory with PaiHtmlReader and check the count."""
+    config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
+    config = RagConfigManager.from_file(config_file).get_value()
+    directory_reader = resolve(
+        cls=PaiDataReader,
+        reader_config=config.data_reader,
+    )
+    # Anchor the fixture directory to BASE_DIR, like config_file above, so the
+    # test does not depend on the directory pytest is launched from.
+    input_dir = os.path.join(BASE_DIR, "tests/testdata/data/html_data")
+
+    # Register the reader under test for the .html extension.
+    directory_reader.file_readers[".html"] = PaiHtmlReader()
+
+    documents = directory_reader.load_data(file_path_or_directory=input_dir)
+    # Expected document count for the fixture directory (contents not shown here).
+    assert len(documents) == 5