From a6d5ac1c156e8d6f6fabe4d9949d3318bc77db20 Mon Sep 17 00:00:00 2001
From: Abdurrahmaan Iqbal <abdurrahmaaniqbal@hotmail.com>
Date: Sun, 13 Feb 2022 15:12:03 +0000
Subject: [PATCH 1/4] BUG: Convert <br> to space

---
 pandas/io/html.py            | 12 +++++++++++-
 pandas/tests/io/test_html.py | 37 ++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 2947b22f85d61..5f807989bdb6c 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -605,7 +605,13 @@ def _build_doc(self):
         else:
             udoc = bdoc
             from_encoding = self.encoding
-        return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
+
+        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
+
+        for br in soup.find_all("br"):
+            br.replace_with("\n" + br.text)
+
+        return soup
 
 
 def _build_xpath_expr(attrs) -> str:
@@ -746,6 +752,10 @@ def _build_doc(self):
         else:
             if not hasattr(r, "text_content"):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
+
+        for br in r.xpath("*//br"):
+            br.tail = "\n" + (br.tail or '')
+
         return r
 
     def _parse_thead_tr(self, table):
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 1363a0b04ee0a..ccdf305bc256e 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1273,3 +1273,40 @@ def test_parse_path_object(self, datapath):
         df1 = self.read_html(file_path_string)[0]
         df2 = self.read_html(file_path)[0]
         tm.assert_frame_equal(df1, df2)
+
+    def test_parse_br_as_space(self):
+        # GH 29528: pd.read_html() convert <br> to space
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                </tr>
+                <tr>
+                    <td>world1<br>word2</td>
+                </tr>
+            </table>
+        """)[0]
+
+        expected = DataFrame(data=[["word1 word2"]], columns=["A"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_parse_br_tail_retained(self):
+        # Ensure text after br are retained when they are replaced with a space.
+        # See:
+        #   https://stackoverflow.com/q/33281217 and
+        #   https://stackoverflow.com/questions/12545897/convert-br-to-end-line/48628074#comment84810813_34640357
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                </tr>
+                <tr>
+                    <td>world1<br>word2</td>
+                </tr>
+            </table>
+        """)[0]
+
+        expected = DataFrame(data=[["word1 word2"]], columns=["A"])
+
+        tm.assert_frame_equal(result, expected)

From a106516f207db36822bfd601062d869d2361cefa Mon Sep 17 00:00:00 2001
From: Abdurrahmaan Iqbal <abdurrahmaaniqbal@hotmail.com>
Date: Sun, 13 Feb 2022 16:59:11 +0000
Subject: [PATCH 2/4] TST: Update read_html expectations for <br> spacing, fix typo, drop duplicate test

---
 pandas/tests/io/test_html.py | 42 ++++++++++--------------------------
 1 file changed, 11 insertions(+), 31 deletions(-)

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 47658c32139ad..8c2b4977f80e3 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -611,17 +611,17 @@ def try_remove_ws(x):
         )
         assert df.shape == ground_truth.shape
         old = [
-            "First Vietnamese American BankIn Vietnamese",
-            "Westernbank Puerto RicoEn Espanol",
-            "R-G Premier Bank of Puerto RicoEn Espanol",
-            "EurobankEn Espanol",
-            "Sanderson State BankEn Espanol",
-            "Washington Mutual Bank(Including its subsidiary Washington "
+            "First Vietnamese American Bank In Vietnamese",
+            "Westernbank Puerto Rico En Espanol",
+            "R-G Premier Bank of Puerto Rico En Espanol",
+            "Eurobank En Espanol",
+            "Sanderson State Bank En Espanol",
+            "Washington Mutual Bank (Including its subsidiary Washington "
             "Mutual Bank FSB)",
-            "Silver State BankEn Espanol",
-            "AmTrade International BankEn Espanol",
-            "Hamilton Bank, NAEn Espanol",
-            "The Citizens Savings BankPioneer Community Bank, Inc.",
+            "Silver State Bank En Espanol",
+            "AmTrade International Bank En Espanol",
+            "Hamilton Bank, NA En Espanol",
+            "The Citizens Savings Bank Pioneer Community Bank, Inc.",
         ]
         new = [
             "First Vietnamese American Bank",
@@ -1295,27 +1295,7 @@ def test_parse_br_as_space(self):
                     <th>A</th>
                 </tr>
                 <tr>
-                    <td>world1<br>word2</td>
-                </tr>
-            </table>
-        """)[0]
-
-        expected = DataFrame(data=[["word1 word2"]], columns=["A"])
-
-        tm.assert_frame_equal(result, expected)
-
-    def test_parse_br_tail_retained(self):
-        # Ensure text after br are retained when they are replaced with a space.
-        # See:
-        #   https://stackoverflow.com/q/33281217 and
-        #   https://stackoverflow.com/questions/12545897/convert-br-to-end-line/48628074#comment84810813_34640357
-        result = self.read_html("""
-            <table>
-                <tr>
-                    <th>A</th>
-                </tr>
-                <tr>
-                    <td>world1<br>word2</td>
+                    <td>word1<br>word2</td>
                 </tr>
             </table>
         """)[0]

From 830f0e6e649df8527bb6bda11dfd9666c6068acd Mon Sep 17 00:00:00 2001
From: Abdurrahmaan Iqbal <abdurrahmaaniqbal@hotmail.com>
Date: Sun, 13 Feb 2022 17:07:51 +0000
Subject: [PATCH 3/4] STY: Apply black formatting

---
 pandas/io/html.py            | 2 +-
 pandas/tests/io/test_html.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index 9f724d3ff9eab..5405f1d2c042f 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -767,7 +767,7 @@ def _build_doc(self):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
 
         for br in r.xpath("*//br"):
-            br.tail = "\n" + (br.tail or '')
+            br.tail = "\n" + (br.tail or "")
 
         return r
 
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 8c2b4977f80e3..19fd1b552f87d 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1289,7 +1289,8 @@ def test_parse_path_object(self, datapath):
 
     def test_parse_br_as_space(self):
         # GH 29528: pd.read_html() convert <br> to space
-        result = self.read_html("""
+        result = self.read_html(
+            """
             <table>
                 <tr>
                     <th>A</th>
@@ -1298,7 +1299,8 @@ def test_parse_br_as_space(self):
                     <td>word1<br>word2</td>
                 </tr>
             </table>
-        """)[0]
+        """
+        )[0]
 
         expected = DataFrame(data=[["word1 word2"]], columns=["A"])
 

From 0c9e02716c5c34e51d9039a544ee8531108a3554 Mon Sep 17 00:00:00 2001
From: Abdurrahmaan Iqbal <abdurrahmaaniqbal@hotmail.com>
Date: Sun, 13 Feb 2022 17:10:56 +0000
Subject: [PATCH 4/4] Add whatsnew entry

---
 doc/source/whatsnew/v1.5.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index a35ca589065d8..ace1110eb7fdb 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -353,6 +353,7 @@ I/O
 - Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`)
 - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
 - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`)
+- Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`)
 
 Period
 ^^^^^^