From ae5b1c97c6f0d03eb77da52643f60f2a55eeee5d Mon Sep 17 00:00:00 2001 From: Dimitra Karadima Date: Fri, 18 Feb 2022 15:17:33 +0200 Subject: [PATCH 1/4] BUG: error in read_excel with some ods files #45598 --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/excel/_odfreader.py | 8 ++++++-- pandas/tests/io/data/excel/test_newlines.ods | Bin 0 -> 2261 bytes pandas/tests/io/excel/test_odf.py | 13 +++++++++++++ 4 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/io/data/excel/test_newlines.ods diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index c8b2617ffc535..6c950217c1b48 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -359,6 +359,7 @@ I/O - Bug in :meth:`DataFrame.to_csv` not respecting ``float_format`` for ``Float64`` dtype (:issue:`45991`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) +- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) Period ^^^^^^ diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 6adce02dc50f0..cff7eefbc90f4 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -102,7 +102,11 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]: table: list[list[Scalar]] = [] for sheet_row in sheet_rows: - sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] + sheet_cells = [ + x + for x in sheet_row.childNodes + if "qname" in dir(x) and x.qname in cell_names + ] empty_cells = 0 table_row: list[Scalar] = [] @@ -231,5 +235,5 @@ def _get_cell_string_value(self, cell) -> str: # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704 
value.append(self._get_cell_string_value(fragment)) else: - value.append(str(fragment)) + value.append(str(fragment).strip("\n")) return "".join(value) diff --git a/pandas/tests/io/data/excel/test_newlines.ods b/pandas/tests/io/data/excel/test_newlines.ods new file mode 100644 index 0000000000000000000000000000000000000000..262529800351ca679217ff41f97373c8f2a3d7b4 GIT binary patch literal 2261 zcmZ`)2Ut_d77kr$;u_F}MUj#qB@`(e1;oYhgb)Z#kf56Y2}DRBBy%Ax*$cmQWPwqEFfZ7P_S(9J@x^g&i&3eckZ2k&dfP;{vQb!64nN+w>Up& z5`;fkvuyw%fPg1pDPaLv0f5ZYu zERg~s2Lxg4OnA>wgZG6fP! z@cW+A!t|rWMkN{8GBnOGBFlg!9_RIRSDBc(u5BvXJck4vGkf6R>(U8I4~a3Gf0#BA zT6^c&Xux=CTOFz*L9`9KZ^~nQhR29X;Ne4TGJZ?ZTAA$bk8XPCBJ|q)5K5Ze2J+Ij zojm`hRAE>8)Wa}SU7+5u`E}diUSwt>XD9O29j<$moWBN7 zmPG(IN5vtHB1_>%g~hjT(Z^CSKMF+Yz2eqRdv&>@K@tV_S2XD8ry>H!S2kAs~YW8#@w5^-~XF4^@&mSMspvrMia=~)mawx9a zM7|t~s*2+7W=_zTLo${^ z!4tj7KiYBOHH2NGu4MfDs;u+dQ`Lz=NrQ-h@%UJDz>WeJEkja&sORc3(^qWO=bWCc zGvUa*Z?zS;^!@>ctkd2$oQHu8O!$Q3mGagm-X}<{T>f%bc16UbML z7?e&H1(ZRhGEKtqQ~{ln@IZ9PjnU!^R4M3o8>Hle_L$Q=@7T3OV;uM@Q&c`-zcWnV zwvWXO+igEnV?6H~iKtM}IOdi-u~X$j!GEp{2RwHNoiA|TUlR?npj@!5+(VG|ZQUVq zTJV!15L9IJdD`vGaZ=G~_y_yp%Ze9i%x_ylA0+d zR<_hGYP3?yFSR{sBEx9i6-NDKoO}I=FZM&Pa!1C3Ou$bEDB<*3ZM~~dZ$-tP3!Vel z$8uN_AC}n5PO(i zE}M$bq6(|@ul5ozfxCs26eYJG$wzwn`%4%awZthjNb+7p(uui)C1QUcjd+Bq7!p(g zN8NdTxse;Gv7_4vHb?DEFX082yz%(?m5+MK_Fw^?N&-D(3f3){j#UJMEJ-}bcc z#tE)%`sd_3FKu-4omWgFITIHhvlA(t$lTArTwyk$m9-FRY$KHf^`dxdtCkMV8Q2qU zMa|0RgNOye&849#k*JWuCySeW9+Kc6)6<_wS!0+V(8Qc8oSdaAQ#@1M&}uOX(6h*owM{F)9$k<##BD@)QmYk7^_g%9n?;Nvx65=9z`G!a$!+-*QptU`?I;zH>UwuDrtA zqysh+C*usE=E{#3hKz&Bz>gZjdAW=zx--No&zW<5+)V-}IC+nIt0Pgxvp8+_g=iNN zf@_tq?$}1q_j2?u>B}UQ9hAx=sY$sNY`=RKIPE6*MX{I;><#t15CZvgMdH9B$bvVb z1GUiZQjH#24AV&l}I!Qr8lJv z^)M*9nK5lhrCf?KKu5!ZOa=;z2NuBeYoTAivX<Win|<^q!+~teRm=J8>Y?nLQg3nf!1}^TUj1^TkH(TepeG_7 zB+{SkrGN`%Er5OP4eHIDXf9{zMgT%}6|DQT%JOr^M)vLCYhvn0gNiT1> zHh>R+TxOh-t~ctoP%5H5+$`8xkAve4jvV$|88-aWEh;1w*WM&y^HN&z)Kr8SeZE<< z_9_oApZg$PoNYhfq_y~ntC;+i2ekTV1T(U=k2IdEj&>_3GQ0kDy Date: 
Mon, 28 Feb 2022 16:34:26 +0200 Subject: [PATCH 2/4] BUG: use hasattr instead of dir --- pandas/io/excel/_odfreader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 4c175f1f15780..856ce52a6d6b6 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -115,7 +115,7 @@ def get_sheet_data( sheet_cells = [ x for x in sheet_row.childNodes - if "qname" in dir(x) and x.qname in cell_names + if hasattr(x, "qname") and x.qname in cell_names ] empty_cells = 0 table_row: list[Scalar | NaTType] = [] From 4751041ed26279465a1a468401161d83b35fa25e Mon Sep 17 00:00:00 2001 From: Dimitra Karadima Date: Thu, 3 Mar 2022 12:52:30 +0200 Subject: [PATCH 3/4] DOC: add issue number in new test case --- pandas/tests/io/excel/test_odf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 1894e1656c632..1eaae62f369a6 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -39,6 +39,7 @@ def test_read_writer_table(): def test_read_newlines_between_xml_elements_table(): + # GH#45598 # Also test reading table from an text OpenDocument file # (.ods) that contains newlines between xml elements. 
expected = pd.DataFrame( From a706cb3624c48e1c3b94ce6be4c1a6aa470144ea Mon Sep 17 00:00:00 2001 From: Dimitra Karadima Date: Thu, 3 Mar 2022 14:43:33 +0200 Subject: [PATCH 4/4] DOC: remove comment --- pandas/tests/io/excel/test_odf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 1eaae62f369a6..25079b235d332 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -40,8 +40,6 @@ def test_read_writer_table(): def test_read_newlines_between_xml_elements_table(): # GH#45598 - # Also test reading table from an text OpenDocument file - # (.ods) that contains newlines between xml elements. expected = pd.DataFrame( [[1.0, 4.0, 7], [np.nan, np.nan, 8], [3.0, 6.0, 9]], columns=["Column 1", "Column 2", "Column 3"],