Skip to content

Commit

Permalink
BUG: Fix pd.read_html handling of rowspan in table header (#60464)
Browse files Browse the repository at this point in the history
* BUG: Fix pd.read_html handling of rowspan in table header

* BUG: Fix docstring error in _expand_colspan_rowspan

* BUG: Update return type for _expand_colspan_rowspan

* BUG: Address review and add note to whatsnew
  • Loading branch information
snitish authored Dec 3, 2024
1 parent e631442 commit d9dfaa9
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 20 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where a ``rowspan`` in the header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
- Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
Expand Down
58 changes: 38 additions & 20 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,15 +454,26 @@ def row_is_all_th(row):
while body_rows and row_is_all_th(body_rows[0]):
header_rows.append(body_rows.pop(0))

header = self._expand_colspan_rowspan(header_rows, section="header")
body = self._expand_colspan_rowspan(body_rows, section="body")
footer = self._expand_colspan_rowspan(footer_rows, section="footer")
header, rem = self._expand_colspan_rowspan(header_rows, section="header")
body, rem = self._expand_colspan_rowspan(
body_rows,
section="body",
remainder=rem,
overflow=len(footer_rows) > 0,
)
footer, _ = self._expand_colspan_rowspan(
footer_rows, section="footer", remainder=rem, overflow=False
)

return header, body, footer

def _expand_colspan_rowspan(
self, rows, section: Literal["header", "footer", "body"]
) -> list[list]:
self,
rows,
section: Literal["header", "footer", "body"],
remainder: list[tuple[int, str | tuple, int]] | None = None,
overflow: bool = True,
) -> tuple[list[list], list[tuple[int, str | tuple, int]]]:
"""
Given a list of <tr>s, return a list of text rows.
Expand All @@ -471,12 +482,20 @@ def _expand_colspan_rowspan(
rows : list of node-like
List of <tr>s
section : the section that the rows belong to (header, body or footer).
remainder: list[tuple[int, str | tuple, int]] | None
Any remainder from the expansion of previous section
overflow: bool
If true, return any partial rows as 'remainder'. If not, use up any
partial rows. True by default.
Returns
-------
list of list
Each returned row is a list of str text, or tuple (text, link)
if extract_links is not None.
remainder
Remaining partial rows if any. If overflow is False, an empty list
is returned.
Notes
-----
Expand All @@ -485,9 +504,7 @@ def _expand_colspan_rowspan(
"""
all_texts = [] # list of rows, each a list of str
text: str | tuple
remainder: list[
tuple[int, str | tuple, int]
] = [] # list of (index, text, nrows)
remainder = remainder if remainder is not None else []

for tr in rows:
texts = [] # the output for this row
Expand Down Expand Up @@ -528,19 +545,20 @@ def _expand_colspan_rowspan(
all_texts.append(texts)
remainder = next_remainder

# Append rows that only appear because the previous row had non-1
# rowspan
while remainder:
next_remainder = []
texts = []
for prev_i, prev_text, prev_rowspan in remainder:
texts.append(prev_text)
if prev_rowspan > 1:
next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
all_texts.append(texts)
remainder = next_remainder
if not overflow:
# Append rows that only appear because the previous row had non-1
# rowspan
while remainder:
next_remainder = []
texts = []
for prev_i, prev_text, prev_rowspan in remainder:
texts.append(prev_text)
if prev_rowspan > 1:
next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
all_texts.append(texts)
remainder = next_remainder

return all_texts
return all_texts, remainder

def _handle_hidden_tables(self, tbl_list, attr_name: str):
"""
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html):

tm.assert_frame_equal(result, expected)

# Regression test: a header cell with rowspan="2" sits in a one-row header,
# so the span reaches into the first body row of the table.
def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html):
# GH60210
# The table below has one all-<th> row ("A" with rowspan=2, and "B"),
# followed by two <td> rows. The first <td> row supplies only one cell
# ("1") because the other column is still covered by the spanning "A".

result = flavor_read_html(
StringIO(
"""
<table>
<tr>
<th rowspan="2">A</th>
<th>B</th>
</tr>
<tr>
<td>1</td>
</tr>
<tr>
<td>C</td>
<td>2</td>
</tr>
</table>
"""
)
)[0]

# Expected frame: columns come from the <th> row, and the spanned "A" is
# repeated as the first value of the first data row, ahead of the parsed 1.
expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"])

tm.assert_frame_equal(result, expected)

def test_header_inferred_from_rows_with_only_th(self, flavor_read_html):
# GH17054
result = flavor_read_html(
Expand Down

0 comments on commit d9dfaa9

Please sign in to comment.