Skip to content

Commit 1555fbd

Browse files
committed
Test all extract_link options
1 parent 1c8c891 commit 1555fbd

File tree

1 file changed

+92
-34
lines changed

1 file changed

+92
-34
lines changed

pandas/tests/io/test_html.py

Lines changed: 92 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,28 @@ def spam_data(self, datapath):
113113
def banklist_data(self, datapath):
114114
return datapath("io", "data", "html", "banklist.html")
115115

116+
@pytest.fixture
117+
def gh_13141_data(self):
118+
return """
119+
<table>
120+
<tr>
121+
<th>HTTP</th>
122+
<th>FTP</th>
123+
<th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
124+
</tr>
125+
<tr>
126+
<td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
127+
<td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
128+
<td>Linkless</td>
129+
</tr>
130+
<tfoot>
131+
<tr>
132+
<td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
133+
</tr>
134+
</tfoot>
135+
</table>
136+
"""
137+
116138
@pytest.fixture(autouse=True, scope="function")
117139
def set_defaults(self, flavor):
118140
self.read_html = partial(read_html, flavor=flavor)
@@ -1287,25 +1309,12 @@ def test_parse_path_object(self, datapath):
12871309
df2 = self.read_html(file_path)[0]
12881310
tm.assert_frame_equal(df1, df2)
12891311

1290-
def test_extract_links_body(self):
1312+
def test_extract_links(self, gh_13141_data):
12911313
# GH 13141:
12921314
# read_html argument to interpret hyperlinks as links (not merely text)
12931315
result = self.read_html(
1294-
"""
1295-
<table>
1296-
<tr>
1297-
<th>HTTP</th>
1298-
<th>FTP</th>
1299-
<th><a href="https://en.wiktionary.org/wiki/linkless">None</a></th>
1300-
</tr>
1301-
<tr>
1302-
<td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
1303-
<td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
1304-
<td>Linkless</td>
1305-
</tr>
1306-
</table>
1307-
""",
1308-
extract_links="body",
1316+
gh_13141_data,
1317+
extract_links="all",
13091318
)[0]
13101319

13111320
expected = DataFrame(
@@ -1314,37 +1323,86 @@ def test_extract_links_body(self):
13141323
("Wikipedia", "https://en.wikipedia.org/"),
13151324
("Debian", "ftp://ftp.us.debian.org/"),
13161325
("Linkless",),
1317-
]
1326+
],
1327+
[("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None],
1328+
],
1329+
columns=(
1330+
("HTTP", np.nan),
1331+
("FTP", np.nan),
1332+
("Linkless", "https://en.wiktionary.org/wiki/linkless"),
1333+
),
1334+
)
1335+
1336+
tm.assert_frame_equal(result, expected)
1337+
1338+
def test_extract_links_header(self, gh_13141_data):
1339+
result = self.read_html(
1340+
gh_13141_data,
1341+
extract_links="header",
1342+
)[0]
1343+
1344+
expected = DataFrame(
1345+
[
1346+
[
1347+
"Wikipedia",
1348+
"Debian",
1349+
"Linkless",
1350+
],
1351+
["Footer", None, None],
1352+
],
1353+
columns=(
1354+
("HTTP", np.nan),
1355+
("FTP", np.nan),
1356+
("Linkless", "https://en.wiktionary.org/wiki/linkless"),
1357+
),
1358+
)
1359+
1360+
tm.assert_frame_equal(result, expected)
1361+
1362+
def test_extract_links_footer(self, gh_13141_data):
1363+
result = self.read_html(
1364+
gh_13141_data,
1365+
extract_links="footer",
1366+
)[0]
1367+
1368+
expected = DataFrame(
1369+
[
1370+
[
1371+
"Wikipedia",
1372+
"Debian",
1373+
"Linkless",
1374+
],
1375+
[("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None],
13181376
],
13191377
columns=(
13201378
"HTTP",
13211379
"FTP",
1322-
"None",
1380+
"Linkless",
13231381
),
13241382
)
13251383

13261384
tm.assert_frame_equal(result, expected)
13271385

1328-
def test_extract_links_header(self):
1329-
# GH 13141:
1330-
# read_html argument to interpret hyperlinks as links (not merely text)
1386+
def test_extract_links_body(self, gh_13141_data):
13311387
result = self.read_html(
1332-
"""
1333-
<table>
1334-
<tr>
1335-
<th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
1336-
</tr>
1337-
<tr>
1338-
<td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
1339-
</tr>
1340-
</table>
1341-
""",
1342-
extract_links="header",
1388+
gh_13141_data,
1389+
extract_links="body",
13431390
)[0]
13441391

13451392
expected = DataFrame(
1346-
[["Wikipedia"]],
1347-
columns=(("Linkless", "https://en.wiktionary.org/wiki/linkless"),),
1393+
[
1394+
[
1395+
("Wikipedia", "https://en.wikipedia.org/"),
1396+
("Debian", "ftp://ftp.us.debian.org/"),
1397+
("Linkless",),
1398+
],
1399+
["Footer", None, None],
1400+
],
1401+
columns=(
1402+
"HTTP",
1403+
"FTP",
1404+
"Linkless",
1405+
),
13481406
)
13491407

13501408
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)