@@ -113,6 +113,28 @@ def spam_data(self, datapath):
113
113
def banklist_data (self , datapath ):
114
114
return datapath ("io" , "data" , "html" , "banklist.html" )
115
115
116
@pytest.fixture
def gh_13141_data(self):
    """
    HTML table shared by the ``extract_links`` tests (GH 13141).

    The markup contains a header row whose third cell is a hyperlink, one
    body row with two linked cells and one plain-text cell, and a
    ``<tfoot>`` row holding a single linked cell.
    """
    markup = """
    <table>
        <tr>
            <th>HTTP</th>
            <th>FTP</th>
            <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
        </tr>
        <tr>
            <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
            <td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
            <td>Linkless</td>
        </tr>
        <tfoot>
            <tr>
                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
            </tr>
        </tfoot>
    </table>
    """
    return markup
137
+
116
138
@pytest .fixture (autouse = True , scope = "function" )
117
139
def set_defaults (self , flavor ):
118
140
self .read_html = partial (read_html , flavor = flavor )
@@ -1287,25 +1309,12 @@ def test_parse_path_object(self, datapath):
1287
1309
df2 = self .read_html (file_path )[0 ]
1288
1310
tm .assert_frame_equal (df1 , df2 )
1289
1311
1290
- def test_extract_links_body (self ):
1312
+ def test_extract_links (self , gh_13141_data ):
1291
1313
# GH 13141:
1292
1314
# read_html argument to interpret hyperlinks as links (not merely text)
1293
1315
result = self .read_html (
1294
- """
1295
- <table>
1296
- <tr>
1297
- <th>HTTP</th>
1298
- <th>FTP</th>
1299
- <th><a href="https://en.wiktionary.org/wiki/linkless">None</a></th>
1300
- </tr>
1301
- <tr>
1302
- <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
1303
- <td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
1304
- <td>Linkless</td>
1305
- </tr>
1306
- </table>
1307
- """ ,
1308
- extract_links = "body" ,
1316
+ gh_13141_data ,
1317
+ extract_links = "all" ,
1309
1318
)[0 ]
1310
1319
1311
1320
expected = DataFrame (
@@ -1314,37 +1323,86 @@ def test_extract_links_body(self):
1314
1323
("Wikipedia" , "https://en.wikipedia.org/" ),
1315
1324
("Debian" , "ftp://ftp.us.debian.org/" ),
1316
1325
("Linkless" ,),
1317
- ]
1326
+ ],
1327
+ [("Footer" , "https://en.wikipedia.org/wiki/Page_footer" ), None , None ],
1328
+ ],
1329
+ columns = (
1330
+ ("HTTP" , np .nan ),
1331
+ ("FTP" , np .nan ),
1332
+ ("Linkless" , "https://en.wiktionary.org/wiki/linkless" ),
1333
+ ),
1334
+ )
1335
+
1336
+ tm .assert_frame_equal (result , expected )
1337
+
1338
def test_extract_links_header(self, gh_13141_data):
    # GH 13141:
    # with extract_links="header" the expected column labels are
    # (text, link) pairs, using np.nan where the header cell has no
    # hyperlink; body and footer cells are expected as plain text
    tables = self.read_html(gh_13141_data, extract_links="header")
    parsed = tables[0]

    column_labels = (
        ("HTTP", np.nan),
        ("FTP", np.nan),
        ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
    )
    body_row = ["Wikipedia", "Debian", "Linkless"]
    # The footer row has a single cell, so the other two columns are None.
    footer_row = ["Footer", None, None]
    wanted = DataFrame([body_row, footer_row], columns=column_labels)

    tm.assert_frame_equal(parsed, wanted)
1361
+
1362
def test_extract_links_footer(self, gh_13141_data):
    # GH 13141:
    # with extract_links="footer" only the <tfoot> cell is expected as a
    # (text, link) tuple; header and body cells are expected as plain text
    result = self.read_html(
        gh_13141_data,
        extract_links="footer",
    )[0]

    expected = DataFrame(
        [
            ["Wikipedia", "Debian", "Linkless"],
            # The footer row has a single cell, so the other columns are None.
            [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None],
        ],
        # Plain labels that match the header text of the table exactly: the
        # third header cell reads "Linkless", with no trailing whitespace.
        columns=(
            "HTTP",
            "FTP",
            "Linkless",
        ),
    )

    tm.assert_frame_equal(result, expected)
1327
1385
1328
- def test_extract_links_header (self ):
1329
- # GH 13141:
1330
- # read_html argument to interpret hyperlinks as links (not merely text)
1386
def test_extract_links_body(self, gh_13141_data):
    # GH 13141:
    # with extract_links="body" the body-row cells are expected as tuples
    # (a one-element tuple for the cell without a hyperlink); header labels
    # and the footer cell are expected as plain text
    tables = self.read_html(gh_13141_data, extract_links="body")
    parsed = tables[0]

    body_row = [
        ("Wikipedia", "https://en.wikipedia.org/"),
        ("Debian", "ftp://ftp.us.debian.org/"),
        ("Linkless",),
    ]
    # The footer row has a single cell, so the other two columns are None.
    footer_row = ["Footer", None, None]
    wanted = DataFrame(
        [body_row, footer_row],
        columns=("HTTP", "FTP", "Linkless"),
    )

    tm.assert_frame_equal(parsed, wanted)
0 commit comments