Skip to content

Commit a6d5ac1

Browse files
committed
BUG: Convert <br> to space
1 parent cdb725f commit a6d5ac1

File tree

2 files changed

+48
-1
lines changed

2 files changed

+48
-1
lines changed

pandas/io/html.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,13 @@ def _build_doc(self):
605605
else:
606606
udoc = bdoc
607607
from_encoding = self.encoding
608-
return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
608+
609+
soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
610+
611+
for br in soup.find_all("br"):
612+
br.replace_with("\n" + br.text)
613+
614+
return soup
609615

610616

611617
def _build_xpath_expr(attrs) -> str:
@@ -746,6 +752,10 @@ def _build_doc(self):
746752
else:
747753
if not hasattr(r, "text_content"):
748754
raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
755+
756+
for br in r.xpath("*//br"):
757+
br.tail = "\n" + (br.tail or '')
758+
749759
return r
750760

751761
def _parse_thead_tr(self, table):

pandas/tests/io/test_html.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1273,3 +1273,40 @@ def test_parse_path_object(self, datapath):
12731273
df1 = self.read_html(file_path_string)[0]
12741274
df2 = self.read_html(file_path)[0]
12751275
tm.assert_frame_equal(df1, df2)
1276+
1277+
def test_parse_br_as_space(self):
1278+
# GH 29528: pd.read_html() should convert <br> to a space
1279+
result = self.read_html("""
1280+
<table>
1281+
<tr>
1282+
<th>A</th>
1283+
</tr>
1284+
<tr>
1285+
<td>word1<br>word2</td>
1286+
</tr>
1287+
</table>
1288+
""")[0]
1289+
1290+
expected = DataFrame(data=[["word1 word2"]], columns=["A"])
1291+
1292+
tm.assert_frame_equal(result, expected)
1293+
1294+
def test_parse_br_tail_retained(self):
1295+
# Ensure the text following a <br> is retained when the <br> is replaced with a space.
1296+
# See:
1297+
# https://stackoverflow.com/q/33281217 and
1298+
# https://stackoverflow.com/questions/12545897/convert-br-to-end-line/48628074#comment84810813_34640357
1299+
result = self.read_html("""
1300+
<table>
1301+
<tr>
1302+
<th>A</th>
1303+
</tr>
1304+
<tr>
1305+
<td>word1<br>word2</td>
1306+
</tr>
1307+
</table>
1308+
""")[0]
1309+
1310+
expected = DataFrame(data=[["word1 word2"]], columns=["A"])
1311+
1312+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)