BUG: Convert <br> to space

abmyii · abmyii · commit a6d5ac1c156e · 2022-02-13T15:18:30.000Z
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -605,7 +605,13 @@ def _build_doc(self):
         else:
             udoc = bdoc
             from_encoding = self.encoding
-        return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
+
+        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
+
+        for br in soup.find_all("br"):
+            br.replace_with("\n" + br.text)
+
+        return soup
 
 
 def _build_xpath_expr(attrs) -> str:
@@ -746,6 +752,10 @@ def _build_doc(self):
         else:
             if not hasattr(r, "text_content"):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
+
+        for br in r.xpath("*//br"):
+            br.tail = "\n" + (br.tail or '')
+
         return r
 
     def _parse_thead_tr(self, table):
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1273,3 +1273,40 @@ def test_parse_path_object(self, datapath):
         df1 = self.read_html(file_path_string)[0]
         df2 = self.read_html(file_path)[0]
         tm.assert_frame_equal(df1, df2)
+
+    def test_parse_br_as_space(self):
+        # GH 29528: pd.read_html() should convert <br> to a space
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                </tr>
+                <tr>
+                    <td>word1<br>word2</td>
+                </tr>
+            </table>
+        """)[0]
+
+        expected = DataFrame(data=[["word1 word2"]], columns=["A"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_parse_br_tail_retained(self):
+        # Ensure text after a <br> is retained when the tag is replaced with a space.
+        # See:
+        #   https://stackoverflow.com/q/33281217 and
+        #   https://stackoverflow.com/questions/12545897/convert-br-to-end-line/48628074#comment84810813_34640357
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                </tr>
+                <tr>
+                    <td>word1<br>word2</td>
+                </tr>
+            </table>
+        """)[0]
+
+        expected = DataFrame(data=[["word1 word2"]], columns=["A"])
+
+        tm.assert_frame_equal(result, expected)