Skip to content

Commit 0935696

Browse files
committed
Fix for MultiIndex headers (also fixes tests)
1 parent 1555fbd commit 0935696

File tree

2 files changed

+55
-113
lines changed

2 files changed

+55
-113
lines changed

pandas/io/html.py

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,8 @@
3131

3232
from pandas.core.construction import create_series_with_explicit_dtype
3333
from pandas.core.frame import DataFrame
34+
from pandas.core.indexes.base import Index
35+
from pandas.core.indexes.multi import MultiIndex
3436

3537
from pandas.io.common import (
3638
file_exists,
@@ -490,7 +492,8 @@ def _expand_colspan_rowspan(
490492
Returns
491493
-------
492494
list of list
493-
Each returned row is a list of str text.
495+
Each returned row is a list of str text, or tuple (text, link)
496+
if extract_links is not None.
494497
495498
Notes
496499
-----
@@ -522,10 +525,8 @@ def _expand_colspan_rowspan(
522525
# Append the text from this <td>, colspan times
523526
text = _remove_whitespace(self._text_getter(td))
524527
if self.extract_links == "all" or self.extract_links == section:
525-
# All cells will be tuples except for the headers for
526-
# consistency in selection (e.g. using .str indexing)
527528
href = self._href_getter(td)
528-
text = (text, href) if href else (text,)
529+
text = (text, href) if href else (text, None)
529530
rowspan = int(self._attr_getter(td, "rowspan") or 1)
530531
colspan = int(self._attr_getter(td, "colspan") or 1)
531532

@@ -874,7 +875,13 @@ def _data_to_frame(**kwargs):
874875
# fill out elements of body that are "ragged"
875876
_expand_elements(body)
876877
with TextParser(body, header=header, **kwargs) as tp:
877-
return tp.read()
878+
df = tp.read()
879+
880+
# Cast MultiIndex header to an Index of tuples.
881+
# This maintains consistency of selection (e.g. df.columns.str[1])
882+
if isinstance(df.columns, MultiIndex):
883+
df.columns = Index(df.columns)
884+
return df
878885

879886

880887
_valid_parsers = {

pandas/tests/io/test_html.py

Lines changed: 43 additions & 108 deletions
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,29 @@ def gh_13141_data(self):
135135
</table>
136136
"""
137137

138+
@pytest.fixture
139+
def gh_13141_expected(self):
140+
return {
141+
"head_ignore": ["HTTP", "FTP", "Linkless"],
142+
"head_extract": [
143+
("HTTP", np.nan),
144+
("FTP", np.nan),
145+
("Linkless", "https://en.wiktionary.org/wiki/linkless"),
146+
],
147+
"body_ignore": ["Wikipedia", "Debian", "Linkless"],
148+
"body_extract": [
149+
("Wikipedia", "https://en.wikipedia.org/"),
150+
("Debian", "ftp://ftp.us.debian.org/"),
151+
("Linkless", None),
152+
],
153+
"footer_ignore": ["Footer", None, None],
154+
"footer_extract": [
155+
("Footer", "https://en.wikipedia.org/wiki/Page_footer"),
156+
None,
157+
None,
158+
],
159+
}
160+
138161
@pytest.fixture(autouse=True, scope="function")
139162
def set_defaults(self, flavor):
140163
self.read_html = partial(read_html, flavor=flavor)
@@ -1309,118 +1332,30 @@ def test_parse_path_object(self, datapath):
13091332
df2 = self.read_html(file_path)[0]
13101333
tm.assert_frame_equal(df1, df2)
13111334

1312-
def test_extract_links(self, gh_13141_data):
1313-
# GH 13141:
1314-
# read_html argument to interpret hyperlinks as links (not merely text)
1315-
result = self.read_html(
1316-
gh_13141_data,
1317-
extract_links="all",
1318-
)[0]
1319-
1320-
expected = DataFrame(
1321-
[
1322-
[
1323-
("Wikipedia", "https://en.wikipedia.org/"),
1324-
("Debian", "ftp://ftp.us.debian.org/"),
1325-
("Linkless",),
1326-
],
1327-
[("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None],
1328-
],
1329-
columns=(
1330-
("HTTP", np.nan),
1331-
("FTP", np.nan),
1332-
("Linkless", "https://en.wiktionary.org/wiki/linkless"),
1333-
),
1334-
)
1335-
1336-
tm.assert_frame_equal(result, expected)
1337-
1338-
def test_extract_links_header(self, gh_13141_data):
1339-
result = self.read_html(
1340-
gh_13141_data,
1341-
extract_links="header",
1342-
)[0]
1343-
1344-
expected = DataFrame(
1345-
[
1346-
[
1347-
"Wikipedia",
1348-
"Debian",
1349-
"Linkless",
1350-
],
1351-
["Footer", None, None],
1352-
],
1353-
columns=(
1354-
("HTTP", np.nan),
1355-
("FTP", np.nan),
1356-
("Linkless", "https://en.wiktionary.org/wiki/linkless"),
1357-
),
1358-
)
1359-
1360-
tm.assert_frame_equal(result, expected)
1361-
1362-
def test_extract_links_footer(self, gh_13141_data):
1363-
result = self.read_html(
1364-
gh_13141_data,
1365-
extract_links="footer",
1366-
)[0]
1367-
1368-
expected = DataFrame(
1369-
[
1370-
[
1371-
"Wikipedia",
1372-
"Debian",
1373-
"Linkless",
1374-
],
1375-
[("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None],
1376-
],
1377-
columns=(
1378-
"HTTP",
1379-
"FTP",
1380-
"Linkless",
1381-
),
1382-
)
1383-
1335+
@pytest.mark.parametrize("arg", ["all", "body", "header", "footer"])
1336+
def test_extract_links(self, gh_13141_data, gh_13141_expected, arg):
1337+
data_exp = gh_13141_expected["body_ignore"]
1338+
foot_exp = gh_13141_expected["footer_ignore"]
1339+
head_exp = gh_13141_expected["head_ignore"]
1340+
if arg == "all":
1341+
data_exp = gh_13141_expected["body_extract"]
1342+
foot_exp = gh_13141_expected["footer_extract"]
1343+
head_exp = gh_13141_expected["head_extract"]
1344+
elif arg == "body":
1345+
data_exp = gh_13141_expected["body_extract"]
1346+
elif arg == "footer":
1347+
foot_exp = gh_13141_expected["footer_extract"]
1348+
elif arg == "header":
1349+
head_exp = gh_13141_expected["head_extract"]
1350+
1351+
result = self.read_html(gh_13141_data, extract_links=arg)[0]
1352+
expected = DataFrame([data_exp, foot_exp], columns=head_exp)
13841353
tm.assert_frame_equal(result, expected)
13851354

1386-
def test_extract_links_body(self, gh_13141_data):
1387-
result = self.read_html(
1388-
gh_13141_data,
1389-
extract_links="body",
1390-
)[0]
1391-
1392-
expected = DataFrame(
1393-
[
1394-
[
1395-
("Wikipedia", "https://en.wikipedia.org/"),
1396-
("Debian", "ftp://ftp.us.debian.org/"),
1397-
("Linkless",),
1398-
],
1399-
["Footer", None, None],
1400-
],
1401-
columns=(
1402-
"HTTP",
1403-
"FTP",
1404-
"Linkless",
1405-
),
1406-
)
1407-
1408-
tm.assert_frame_equal(result, expected)
1409-
1410-
def test_extract_links_bad(self):
1411-
html = """
1412-
<table>
1413-
<tr>
1414-
<th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
1415-
</tr>
1416-
<tr>
1417-
<td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
1418-
</tr>
1419-
</table>
1420-
"""
1355+
def test_extract_links_bad(self, gh_13141_data):
14211356
msg = (
14221357
"`extract_links` must be one of "
14231358
'{None, "header", "footer", "body", "all"}, got "incorrect"'
14241359
)
14251360
with pytest.raises(ValueError, match=msg):
1426-
read_html(html, extract_links="incorrect")
1361+
read_html(gh_13141_data, extract_links="incorrect")

0 commit comments

Comments
 (0)