Flatten MultiIndex headers to an Index of tuples in read_html (also fixes tests)

abmyii · abmyii · commit 09356966b520 · 2022-04-25T02:36:04.000+01:00
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -31,6 +31,8 @@
 
 from pandas.core.construction import create_series_with_explicit_dtype
 from pandas.core.frame import DataFrame
+from pandas.core.indexes.base import Index
+from pandas.core.indexes.multi import MultiIndex
 
 from pandas.io.common import (
     file_exists,
@@ -490,7 +492,8 @@ def _expand_colspan_rowspan(
         Returns
         -------
         list of list
-            Each returned row is a list of str text.
+            Each returned row is a list of str text, or of (text, link)
+            tuples if extract_links is "all" or equals the current section.
 
         Notes
         -----
@@ -522,10 +525,8 @@ def _expand_colspan_rowspan(
                 # Append the text from this <td>, colspan times
                 text = _remove_whitespace(self._text_getter(td))
                 if self.extract_links == "all" or self.extract_links == section:
-                    # All cells will be tuples except for the headers for
-                    # consistency in selection (e.g. using .str indexing)
                     href = self._href_getter(td)
-                    text = (text, href) if href else (text,)
+                    text = (text, href) if href else (text, None)
                 rowspan = int(self._attr_getter(td, "rowspan") or 1)
                 colspan = int(self._attr_getter(td, "colspan") or 1)
 
@@ -874,7 +875,13 @@ def _data_to_frame(**kwargs):
     # fill out elements of body that are "ragged"
     _expand_elements(body)
     with TextParser(body, header=header, **kwargs) as tp:
-        return tp.read()
+        df = tp.read()
+
+        # Cast MultiIndex header to an Index of tuples.
+        # This maintains consistency of selection (e.g. df.columns.str[1])
+        if isinstance(df.columns, MultiIndex):
+            df.columns = Index(df.columns)
+        return df
 
 
 _valid_parsers = {
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -135,6 +135,29 @@ def gh_13141_data(self):
           </table>
           """
 
+    @pytest.fixture
+    def gh_13141_expected(self):
+        return {
+            "head_ignore": ["HTTP", "FTP", "Linkless"],
+            "head_extract": [
+                ("HTTP", np.nan),
+                ("FTP", np.nan),
+                ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
+            ],
+            "body_ignore": ["Wikipedia", "Debian", "Linkless"],
+            "body_extract": [
+                ("Wikipedia", "https://en.wikipedia.org/"),
+                ("Debian", "ftp://ftp.us.debian.org/"),
+                ("Linkless", None),
+            ],
+            "footer_ignore": ["Footer", None, None],
+            "footer_extract": [
+                ("Footer", "https://en.wikipedia.org/wiki/Page_footer"),
+                None,
+                None,
+            ],
+        }
+
     @pytest.fixture(autouse=True, scope="function")
     def set_defaults(self, flavor):
         self.read_html = partial(read_html, flavor=flavor)
@@ -1309,118 +1332,30 @@ def test_parse_path_object(self, datapath):
         df2 = self.read_html(file_path)[0]
         tm.assert_frame_equal(df1, df2)
 
-    def test_extract_links(self, gh_13141_data):
-        # GH 13141:
-        # read_html argument to interpret hyperlinks as links (not merely text)
-        result = self.read_html(
-            gh_13141_data,
-            extract_links="all",
-        )[0]
-
-        expected = DataFrame(
-            [
-                [
-                    ("Wikipedia", "https://en.wikipedia.org/"),
-                    ("Debian", "ftp://ftp.us.debian.org/"),
-                    ("Linkless",),
-                ],
-                [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None],
-            ],
-            columns=(
-                ("HTTP", np.nan),
-                ("FTP", np.nan),
-                ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
-            ),
-        )
-
-        tm.assert_frame_equal(result, expected)
-
-    def test_extract_links_header(self, gh_13141_data):
-        result = self.read_html(
-            gh_13141_data,
-            extract_links="header",
-        )[0]
-
-        expected = DataFrame(
-            [
-                [
-                    "Wikipedia",
-                    "Debian",
-                    "Linkless",
-                ],
-                ["Footer", None, None],
-            ],
-            columns=(
-                ("HTTP", np.nan),
-                ("FTP", np.nan),
-                ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
-            ),
-        )
-
-        tm.assert_frame_equal(result, expected)
-
-    def test_extract_links_footer(self, gh_13141_data):
-        result = self.read_html(
-            gh_13141_data,
-            extract_links="footer",
-        )[0]
-
-        expected = DataFrame(
-            [
-                [
-                    "Wikipedia",
-                    "Debian",
-                    "Linkless",
-                ],
-                [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None],
-            ],
-            columns=(
-                "HTTP",
-                "FTP",
-                "Linkless",
-            ),
-        )
-
+    @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"])
+    def test_extract_links(self, gh_13141_data, gh_13141_expected, arg):
+        data_exp = gh_13141_expected["body_ignore"]
+        foot_exp = gh_13141_expected["footer_ignore"]
+        head_exp = gh_13141_expected["head_ignore"]
+        if arg == "all":
+            data_exp = gh_13141_expected["body_extract"]
+            foot_exp = gh_13141_expected["footer_extract"]
+            head_exp = gh_13141_expected["head_extract"]
+        elif arg == "body":
+            data_exp = gh_13141_expected["body_extract"]
+        elif arg == "footer":
+            foot_exp = gh_13141_expected["footer_extract"]
+        elif arg == "header":
+            head_exp = gh_13141_expected["head_extract"]
+
+        result = self.read_html(gh_13141_data, extract_links=arg)[0]
+        expected = DataFrame([data_exp, foot_exp], columns=head_exp)
         tm.assert_frame_equal(result, expected)
 
-    def test_extract_links_body(self, gh_13141_data):
-        result = self.read_html(
-            gh_13141_data,
-            extract_links="body",
-        )[0]
-
-        expected = DataFrame(
-            [
-                [
-                    ("Wikipedia", "https://en.wikipedia.org/"),
-                    ("Debian", "ftp://ftp.us.debian.org/"),
-                    ("Linkless",),
-                ],
-                ["Footer", None, None],
-            ],
-            columns=(
-                "HTTP",
-                "FTP",
-                "Linkless",
-            ),
-        )
-
-        tm.assert_frame_equal(result, expected)
-
-    def test_extract_links_bad(self):
-        html = """
-          <table>
-            <tr>
-              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
-            </tr>
-            <tr>
-              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
-            </tr>
-          </table>
-        """
+    def test_extract_links_bad(self, gh_13141_data):
         msg = (
             "`extract_links` must be one of "
             '{None, "header", "footer", "body", "all"}, got "incorrect"'
         )
         with pytest.raises(ValueError, match=msg):
-            read_html(html, extract_links="incorrect")
+            read_html(gh_13141_data, extract_links="incorrect")