From a6d5ac1c156e8d6f6fabe4d9949d3318bc77db20 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 13 Feb 2022 15:12:03 +0000 Subject: [PATCH 1/4] BUG: Convert <br>
to space --- pandas/io/html.py | 12 +++++++++++- pandas/tests/io/test_html.py | 37 ++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 2947b22f85d61..5f807989bdb6c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -605,7 +605,13 @@ def _build_doc(self): else: udoc = bdoc from_encoding = self.encoding - return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) + + soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) + + for br in soup.find_all("br"): + br.replace_with("\n" + br.text) + + return soup def _build_xpath_expr(attrs) -> str: @@ -746,6 +752,10 @@ def _build_doc(self): else: if not hasattr(r, "text_content"): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) + + for br in r.xpath("*//br"): + br.tail = "\n" + (br.tail or '') + return r def _parse_thead_tr(self, table): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 1363a0b04ee0a..ccdf305bc256e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1273,3 +1273,40 @@ def test_parse_path_object(self, datapath): df1 = self.read_html(file_path_string)[0] df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) + + def test_parse_br_as_space(self): + # GH 29528: pd.read_html() convert
<br> to space + result = self.read_html(""" + + + + + + + +
A
world1
word2
+ """)[0] + + expected = DataFrame(data=[["word1 word2"]], columns=["A"]) + + tm.assert_frame_equal(result, expected) + + def test_parse_br_tail_retained(self): + # Ensure text after br are retained when they are replaced with a space. + # See: + # https://stackoverflow.com/q/33281217 and + # https://stackoverflow.com/questions/12545897/convert-br-to-end-line/48628074#comment84810813_34640357 + result = self.read_html(""" + + + + + + + +
A
world1
word2
+ """)[0] + + expected = DataFrame(data=[["word1 word2"]], columns=["A"]) + + tm.assert_frame_equal(result, expected) From a106516f207db36822bfd601062d869d2361cefa Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 13 Feb 2022 16:59:11 +0000 Subject: [PATCH 2/4] Fix test broken by change --- pandas/tests/io/test_html.py | 42 ++++++++++-------------------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 47658c32139ad..8c2b4977f80e3 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -611,17 +611,17 @@ def try_remove_ws(x): ) assert df.shape == ground_truth.shape old = [ - "First Vietnamese American BankIn Vietnamese", - "Westernbank Puerto RicoEn Espanol", - "R-G Premier Bank of Puerto RicoEn Espanol", - "EurobankEn Espanol", - "Sanderson State BankEn Espanol", - "Washington Mutual Bank(Including its subsidiary Washington " + "First Vietnamese American Bank In Vietnamese", + "Westernbank Puerto Rico En Espanol", + "R-G Premier Bank of Puerto Rico En Espanol", + "Eurobank En Espanol", + "Sanderson State Bank En Espanol", + "Washington Mutual Bank (Including its subsidiary Washington " "Mutual Bank FSB)", - "Silver State BankEn Espanol", - "AmTrade International BankEn Espanol", - "Hamilton Bank, NAEn Espanol", - "The Citizens Savings BankPioneer Community Bank, Inc.", + "Silver State Bank En Espanol", + "AmTrade International Bank En Espanol", + "Hamilton Bank, NA En Espanol", + "The Citizens Savings Bank Pioneer Community Bank, Inc.", ] new = [ "First Vietnamese American Bank", @@ -1295,27 +1295,7 @@ def test_parse_br_as_space(self): A - world1
word2 - - - """)[0] - - expected = DataFrame(data=[["word1 word2"]], columns=["A"]) - - tm.assert_frame_equal(result, expected) - - def test_parse_br_tail_retained(self): - # Ensure text after br are retained when they are replaced with a space. - # See: - # https://stackoverflow.com/q/33281217 and - # https://stackoverflow.com/questions/12545897/convert-br-to-end-line/48628074#comment84810813_34640357 - result = self.read_html(""" - - - - - - +
A
world1
word2
word1
word2
""")[0] From 830f0e6e649df8527bb6bda11dfd9666c6068acd Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 13 Feb 2022 17:07:51 +0000 Subject: [PATCH 3/4] Black --- pandas/io/html.py | 2 +- pandas/tests/io/test_html.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 9f724d3ff9eab..5405f1d2c042f 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -767,7 +767,7 @@ def _build_doc(self): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) for br in r.xpath("*//br"): - br.tail = "\n" + (br.tail or '') + br.tail = "\n" + (br.tail or "") return r diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 8c2b4977f80e3..19fd1b552f87d 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1289,7 +1289,8 @@ def test_parse_path_object(self, datapath): def test_parse_br_as_space(self): # GH 29528: pd.read_html() convert
<br> to space - result = self.read_html(""" + result = self.read_html( + """ @@ -1298,7 +1299,8 @@ def test_parse_br_as_space(self):
Aword1
word2
- """)[0] + """ + )[0] expected = DataFrame(data=[["word1 word2"]], columns=["A"]) From 0c9e02716c5c34e51d9039a544ee8531108a3554 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 13 Feb 2022 17:10:56 +0000 Subject: [PATCH 4/4] Add whatsnew entry --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a35ca589065d8..ace1110eb7fdb 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -353,6 +353,7 @@ I/O - Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) +- Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`) Period ^^^^^^