@@ -135,6 +135,29 @@ def gh_13141_data(self):
135
135
</table>
136
136
"""
137
137
138
@pytest.fixture
def gh_13141_expected(self):
    """Expected header/body/footer cells for the GH 13141 sample table.

    Returns a dict with two variants per table section:

    * ``"<section>_extract"`` -- cells as ``(text, link)`` tuples; footer
      cells with no content are plain ``None``.
    * ``"<section>_ignore"`` -- the same cells reduced to their text only.

    NOTE(review): header cells without a link carry ``np.nan`` while the
    link-less body cell carries ``None``; presumably this mirrors what the
    parser produces for column labels vs. data cells -- confirm.
    """

    def texts(cells):
        # Drop the link part of every (text, link) pair; empty cells stay None.
        return [None if cell is None else cell[0] for cell in cells]

    head_extract = [
        ("HTTP", np.nan),
        ("FTP", np.nan),
        ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
    ]
    body_extract = [
        ("Wikipedia", "https://en.wikipedia.org/"),
        ("Debian", "ftp://ftp.us.debian.org/"),
        ("Linkless", None),
    ]
    footer_extract = [
        ("Footer", "https://en.wikipedia.org/wiki/Page_footer"),
        None,
        None,
    ]

    return {
        "head_ignore": texts(head_extract),
        "head_extract": head_extract,
        "body_ignore": texts(body_extract),
        "body_extract": body_extract,
        "footer_ignore": texts(footer_extract),
        "footer_extract": footer_extract,
    }
160
+
138
161
@pytest .fixture (autouse = True , scope = "function" )
139
162
def set_defaults (self , flavor ):
140
163
self .read_html = partial (read_html , flavor = flavor )
@@ -1309,118 +1332,30 @@ def test_parse_path_object(self, datapath):
1309
1332
df2 = self .read_html (file_path )[0 ]
1310
1333
tm .assert_frame_equal (df1 , df2 )
1311
1334
1312
@pytest.mark.parametrize("arg", ["all", "body", "header", "footer"])
def test_extract_links(self, gh_13141_data, gh_13141_expected, arg):
    """Check ``extract_links=arg`` on the GH 13141 sample table.

    GH 13141: ``read_html`` argument to interpret hyperlinks as links
    (not merely text).

    One parametrized test replaces the separate all/header/footer/body
    tests. For each table section the expected cells are the "extract"
    variant when ``arg`` is ``"all"`` or names that section, and the
    "ignore" (text-only) variant otherwise.

    Parameters
    ----------
    gh_13141_data : fixture
        HTML source of the sample table.
    gh_13141_expected : fixture
        Dict of expected cells keyed ``"<prefix>_ignore"`` /
        ``"<prefix>_extract"`` for prefixes ``head``, ``body``, ``footer``.
    arg : str
        Value passed as ``extract_links``.
    """

    def expected_cells(prefix, section):
        # The fixture keys use "head" while the argument value is "header",
        # hence the separate prefix and section names.
        variant = "extract" if arg in ("all", section) else "ignore"
        return gh_13141_expected[prefix + "_" + variant]

    head_exp = expected_cells("head", "header")
    data_exp = expected_cells("body", "body")
    foot_exp = expected_cells("footer", "footer")

    result = self.read_html(gh_13141_data, extract_links=arg)[0]
    # Body row first, footer row second, header cells as the column labels.
    expected = DataFrame([data_exp, foot_exp], columns=head_exp)
    tm.assert_frame_equal(result, expected)
1409
-
1410
def test_extract_links_bad(self, gh_13141_data):
    """An unsupported ``extract_links`` value must raise ``ValueError``.

    Reuses the shared ``gh_13141_data`` HTML fixture instead of an inline
    table.

    NOTE(review): this calls the module-level ``read_html`` rather than
    ``self.read_html``, so no flavor is passed -- presumably fine because
    the argument is rejected regardless of parser; confirm.
    """
    import re

    msg = (
        "`extract_links` must be one of "
        '{None, "header", "footer", "body", "all"}, got "incorrect"'
    )
    # ``match`` is applied as a regular expression; escape the message so
    # its braces and other punctuation are matched literally.
    with pytest.raises(ValueError, match=re.escape(msg)):
        read_html(gh_13141_data, extract_links="incorrect")
0 commit comments