BUG: Fix pd.read_html handling of rowspan in table header (#60464)

* BUG: Fix pd.read_html handling of rowspan in table header * BUG: Fix docstring error in _expand_colspan_rowspan * BUG: Update return type for _expand_colspan_rowspan * BUG: Address review and add note to whatsnew
pandas-dev · Dec 3, 2024 · d9dfaa9 · d9dfaa9
1 parent e631442
commit d9dfaa9
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 20 deletions.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -701,6 +701,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
+- Bug in :meth:`read_html` where a ``rowspan`` in a header row causes incorrect conversion to a ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)

diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -454,15 +454,26 @@ def row_is_all_th(row):
             while body_rows and row_is_all_th(body_rows[0]):
                 header_rows.append(body_rows.pop(0))
 
-        header = self._expand_colspan_rowspan(header_rows, section="header")
-        body = self._expand_colspan_rowspan(body_rows, section="body")
-        footer = self._expand_colspan_rowspan(footer_rows, section="footer")
+        header, rem = self._expand_colspan_rowspan(header_rows, section="header")
+        body, rem = self._expand_colspan_rowspan(
+            body_rows,
+            section="body",
+            remainder=rem,
+            overflow=len(footer_rows) > 0,
+        )
+        footer, _ = self._expand_colspan_rowspan(
+            footer_rows, section="footer", remainder=rem, overflow=False
+        )
 
         return header, body, footer
 
     def _expand_colspan_rowspan(
-        self, rows, section: Literal["header", "footer", "body"]
-    ) -> list[list]:
+        self,
+        rows,
+        section: Literal["header", "footer", "body"],
+        remainder: list[tuple[int, str | tuple, int]] | None = None,
+        overflow: bool = True,
+    ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]:
         """
         Given a list of <tr>s, return a list of text rows.
 
@@ -471,12 +482,20 @@ def _expand_colspan_rowspan(
         rows : list of node-like
             List of <tr>s
         section : the section that the rows belong to (header, body or footer).
+        remainder : list[tuple[int, str | tuple, int]] | None
+            Any remainder from the expansion of the previous section.
+        overflow : bool
+            If True, return any partial rows as 'remainder'. If False, use up
+            any partial rows. True by default.
 
         Returns
         -------
         list of list
             Each returned row is a list of str text, or tuple (text, link)
             if extract_links is not None.
+        remainder
+            Remaining partial rows if any. If overflow is False, an empty list
+            is returned.
 
         Notes
         -----
@@ -485,9 +504,7 @@ def _expand_colspan_rowspan(
         """
         all_texts = []  # list of rows, each a list of str
         text: str | tuple
-        remainder: list[
-            tuple[int, str | tuple, int]
-        ] = []  # list of (index, text, nrows)
+        remainder = remainder if remainder is not None else []
 
         for tr in rows:
             texts = []  # the output for this row
@@ -528,19 +545,20 @@ def _expand_colspan_rowspan(
             all_texts.append(texts)
             remainder = next_remainder
 
-        # Append rows that only appear because the previous row had non-1
-        # rowspan
-        while remainder:
-            next_remainder = []
-            texts = []
-            for prev_i, prev_text, prev_rowspan in remainder:
-                texts.append(prev_text)
-                if prev_rowspan > 1:
-                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
-            all_texts.append(texts)
-            remainder = next_remainder
+        if not overflow:
+            # Append rows that only appear because the previous row had non-1
+            # rowspan
+            while remainder:
+                next_remainder = []
+                texts = []
+                for prev_i, prev_text, prev_rowspan in remainder:
+                    texts.append(prev_text)
+                    if prev_rowspan > 1:
+                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
+                all_texts.append(texts)
+                remainder = next_remainder
 
-        return all_texts
+        return all_texts, remainder
 
     def _handle_hidden_tables(self, tbl_list, attr_name: str):
         """

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html):
+        # GH60210
+
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <th rowspan="2">A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <td>1</td>
+                </tr>
+                <tr>
+                    <td>C</td>
+                    <td>2</td>
+                </tr>
+            </table>
+        """
+            )
+        )[0]
+
+        expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"])
+
+        tm.assert_frame_equal(result, expected)
+
     def test_header_inferred_from_rows_with_only_th(self, flavor_read_html):
         # GH17054
         result = flavor_read_html(