File tree Expand file tree Collapse file tree 2 files changed +48
-1
lines changed Expand file tree Collapse file tree 2 files changed +48
-1
lines changed Original file line number Diff line number Diff line change @@ -605,7 +605,13 @@ def _build_doc(self):
605
605
else :
606
606
udoc = bdoc
607
607
from_encoding = self .encoding
608
- return BeautifulSoup (udoc , features = "html5lib" , from_encoding = from_encoding )
608
+
609
+ soup = BeautifulSoup (udoc , features = "html5lib" , from_encoding = from_encoding )
610
+
611
+ for br in soup .find_all ("br" ):
612
+ br .replace_with ("\n " + br .text )
613
+
614
+ return soup
609
615
610
616
611
617
def _build_xpath_expr (attrs ) -> str :
@@ -746,6 +752,10 @@ def _build_doc(self):
746
752
else :
747
753
if not hasattr (r , "text_content" ):
748
754
raise XMLSyntaxError ("no text parsed from document" , 0 , 0 , 0 )
755
+
756
+ for br in r .xpath ("*//br" ):
757
+ br .tail = "\n " + (br .tail or '' )
758
+
749
759
return r
750
760
751
761
def _parse_thead_tr (self , table ):
Original file line number Diff line number Diff line change @@ -1273,3 +1273,40 @@ def test_parse_path_object(self, datapath):
1273
1273
df1 = self .read_html (file_path_string )[0 ]
1274
1274
df2 = self .read_html (file_path )[0 ]
1275
1275
tm .assert_frame_equal (df1 , df2 )
1276
+
1277
+ def test_parse_br_as_space (self ):
1278
+ # GH 29528: pd.read_html() converts <br> to a space
1279
+ result = self .read_html ("""
1280
+ <table>
1281
+ <tr>
1282
+ <th>A</th>
1283
+ </tr>
1284
+ <tr>
1285
+ <td>word1<br>word2</td>
1286
+ </tr>
1287
+ </table>
1288
+ """ )[0 ]
1289
+
1290
+ expected = DataFrame (data = [["word1 word2" ]], columns = ["A" ])
1291
+
1292
+ tm .assert_frame_equal (result , expected )
1293
+
1294
+ def test_parse_br_tail_retained (self ):
1295
+ # Ensure text after a <br> is retained when the <br> is replaced with a space.
1296
+ # See:
1297
+ # https://stackoverflow.com/q/33281217 and
1298
+ # https://stackoverflow.com/questions/12545897/convert-br-to-end-line/48628074#comment84810813_34640357
1299
+ result = self .read_html ("""
1300
+ <table>
1301
+ <tr>
1302
+ <th>A</th>
1303
+ </tr>
1304
+ <tr>
1305
+ <td>word1<br>word2</td>
1306
+ </tr>
1307
+ </table>
1308
+ """ )[0 ]
1309
+
1310
+ expected = DataFrame (data = [["word1 word2" ]], columns = ["A" ])
1311
+
1312
+ tm .assert_frame_equal (result , expected )
You can’t perform that action at this time.
0 commit comments