forked from run-llama/llama_index
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix issue with irregular table (run-llama#9130) (run-llama#9249)
* Fix an issue where unstructured extracts a table whose rows have varying column counts, causing pandas to crash (run-llama#9130) * Consider a table with the wrong layout (likely HTML positioning done with a table) as Text. Add a test to verify that node_parser unstruct_element handles tables correctly * Update the irregular-table check to also work when the irregularity happens beyond lines 0-1. Add a test for a table that contains an empty cell. Add a test for a table where not all lines contain the same number of columns. --------- Co-authored-by: Pierre <[email protected]>
- Loading branch information
Showing
2 changed files
with
120 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
import pytest | ||
from llama_index.node_parser.relational.unstructured_element import ( | ||
UnstructuredElementNodeParser, | ||
) | ||
from llama_index.schema import Document, IndexNode, TextNode | ||
|
||
try: | ||
from unstructured.partition.html import partition_html | ||
except ImportError: | ||
partition_html = None # type: ignore | ||
|
||
try: | ||
from lxml import html | ||
except ImportError: | ||
html = None # type: ignore | ||
|
||
|
||
@pytest.mark.skipif(partition_html is None, reason="unstructured not installed")
@pytest.mark.skipif(html is None, reason="lxml not installed")
def test_html_table_extraction() -> None:
    """Check that HTML with regular and irregular tables parses into nodes.

    The fixture document contains, in order:

    * a table whose first row has one cell and whose second row has two;
    * a paragraph of plain text;
    * a table with a header row and three data rows, two cells each;
    * the same table, except that its third row has four cells;
    * a two-row, two-column table whose last cell is empty.

    Parsing must complete without raising and must yield exactly four
    nodes of types ``TextNode``, ``IndexNode``, ``TextNode``, ``TextNode``
    (checked with ``isinstance``, in that order).
    """
    test_data = Document(
        text="""
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
</head>
<body>
<table>
<tr>
<td>My title center</td>
</tr>
<tr>
<td>Design Website like its 2000</td>
<td>Yeah!</td>
</tr>
</table>
<p>
Test paragraph
</p>
<table>
<tr>
<td>Year</td>
<td>Benefits</td>
</tr>
<tr>
<td>2020</td>
<td>12,000</td>
</tr>
<tr>
<td>2021</td>
<td>10,000</td>
</tr>
<tr>
<td>2022</td>
<td>130,000</td>
</tr>
</table>
<table>
<tr>
<td>Year</td>
<td>Benefits</td>
</tr>
<tr>
<td>2020</td>
<td>12,000</td>
</tr>
<tr>
<td>2021</td>
<td>10,000</td>
<td>2021</td>
<td>10,000</td>
</tr>
<tr>
<td>2022</td>
<td>130,000</td>
</tr>
</table>
<table>
<tr>
<td>age</td>
<td>group</td>
</tr>
<tr>
<td>yellow</td>
<td></td>
</tr>
</table>
</body>
</html>
"""
    )

    node_parser = UnstructuredElementNodeParser()

    nodes = node_parser.get_nodes_from_documents([test_data])

    # Expected node classes, in document order. The failure messages carry
    # the diagnostic detail that the former debug ``print`` calls provided.
    expected_types = (TextNode, IndexNode, TextNode, TextNode)
    assert len(nodes) == len(expected_types), (
        f"expected {len(expected_types)} nodes, got {len(nodes)}: {nodes}"
    )
    for position, (node, expected_type) in enumerate(zip(nodes, expected_types)):
        assert isinstance(node, expected_type), (
            f"node {position} should be a {expected_type.__name__}, "
            f"got {type(node).__name__}"
        )