From d3c7200ae513bc0a1e9d08c075a9043e047a554a Mon Sep 17 00:00:00 2001 From: Yves Delley Date: Tue, 15 Aug 2023 23:53:07 +0200 Subject: [PATCH 1/7] added unit-test to highlight issue #54564 --- pandas/tests/io/data/excel/test6.xls | Bin 0 -> 5632 bytes pandas/tests/io/excel/test_xlrd.py | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 pandas/tests/io/data/excel/test6.xls diff --git a/pandas/tests/io/data/excel/test6.xls b/pandas/tests/io/data/excel/test6.xls new file mode 100644 index 0000000000000000000000000000000000000000..e43a1a67510d8e9b6daae5b50670337c3c0c9e9e GIT binary patch literal 5632 zcmeHLO=uHQ5dPjKX_L~V*_aAeD4}50+KYJc(ry*BCrj0XB7&wiptPhCdeMWe)*J=F zlj5b&iy&1m{-A>1JO~v%2!fF4(UYLiLv{UTHYrIDAw?)w-Yh$Bc4pq2+3)@AzV~j) zEzLjbDq{@|B81<53@tT!LpfG`PJ(vdm&a-urA(EjO@u%!%8qP_;CcLQtMsMxcNq>I zhF_BdUwEI!MT|`4bD0gKgPKFuNDc$!26w4RV4G%@QhrYP7Ud7r^M&#u)!a}%fD}sR z6qdY^zWrI%a zPrHj>z1@@-hs&4Kiyyst^zf@P&KPpdq2KhIJGX_NdDYTz7E5-Yl?Q7;?d*>_T*WkZ z{UCATcHkxJ2nBe!y0RIFry$_)?Um?x#c5NW3qd(VG$a!@uw!{<&`II3f-_XnNg=+L zECLn*i-1MIB481)2v`Ix0-Fng;9r=3y-Zv71QFhH1|Ww4a<21>x0IItPXvx(iq|wZ ztcQ?iS->UseDkyw5mQA*#!@p>JuejB6O3NK1jaOviDs(T`EXyj8ej6LGd6k%HjIrt TnzgMve(n1=8>E5XU&a3eowT}k literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 509029861715e..dc086d02db153 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,5 +1,6 @@ import io +import numpy as np import pytest import pandas as pd @@ -44,6 +45,17 @@ def test_read_xlsx_fails(datapath): pd.read_excel(path, engine="xlrd") +def test_nan_in_xls(datapath): + # GH 54564 + path = datapath("io", "data", "excel", "test6.xls") + + expected = pd.DataFrame(np.r_[:3, np.nan].reshape(2, 2)) + + result = pd.read_excel(path) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "file_header", [ From 87fd93332dc40a66a81bb195a2afff57e7a73aec Mon Sep 17 00:00:00 2001 From: Yves Delley Date: Wed, 16 Aug 2023 00:13:06 +0200 Subject: [PATCH 2/7] fixed #54564 --- pandas/io/excel/_xlrd.py | 14 +++++++++++--- pandas/tests/io/excel/test_xlrd.py | 4 ++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index c68a0ab516e05..43f3519d1a389 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -120,9 +120,17 @@ def _parse_cell(cell_contents, cell_typ): elif cell_typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less surprising - val = int(cell_contents) - if val == cell_contents: - cell_contents = val + try: + val = int(cell_contents) + except Exception: + # GH54564 - if the cell contents are NaN/Inf, we get an exception; + # that is just another case where we don't want to convert. + # The exception filter is quite general on purpose: whenever + # the cell content cannot be converted to int - just don't. + pass + else: + if val == cell_contents: + cell_contents = val return cell_contents data = [] diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index dc086d02db153..ce65fda5ad6c9 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -49,9 +49,9 @@ def test_nan_in_xls(datapath): # GH 54564 path = datapath("io", "data", "excel", "test6.xls") - expected = pd.DataFrame(np.r_[:3, np.nan].reshape(2, 2)) + expected = pd.DataFrame(np.r_[:3, np.nan].reshape(2, 2)).astype(float) - result = pd.read_excel(path) + result = pd.read_excel(path, header=None).astype(float) tm.assert_frame_equal(result, expected) From d0333185118fd8b23ff8dab8a3949292b1cc4102 Mon Sep 17 00:00:00 2001 From: Yves Delley Date: Wed, 16 Aug 2023 00:21:13 +0200 Subject: [PATCH 3/7] added whatsnew entry --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d1a689dc60830..ca8ab076f156e 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -749,6 +749,7 @@ I/O - Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`) - Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) - Bug in :func:`read_csv`, with ``engine="pyarrow"`` erroring when specifying a ``dtype`` with ``index_col`` (:issue:`53229`) +- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`) - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) From 2cf1b38041e4fcb6f44215fb4e0a1a499c6ec063 Mon Sep 17 00:00:00 2001 From: Yves Delley Date: Thu, 17 Aug 2023 19:42:57 +0200 Subject: [PATCH 4/7] anticipate this fix going into v2.2.0 instead of v2.1.0 --- doc/source/whatsnew/v2.1.0.rst | 1 - doc/source/whatsnew/v2.2.0.rst | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ca8ab076f156e..d1a689dc60830 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -749,7 +749,6 @@ I/O - Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`) - Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) - Bug in :func:`read_csv`, with ``engine="pyarrow"`` erroring when specifying a ``dtype`` with ``index_col`` (:issue:`53229`) -- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`) - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c35473b852eb9..d8f94608b0b7b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -166,8 +166,7 @@ MultiIndex I/O ^^^ -- -- +- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) Period ^^^^^^ From b77352949f41f0a2aadd58ecce778467dc122b52 Mon Sep 17 00:00:00 2001 From: Yves Delley Date: Fri, 18 Aug 2023 10:44:47 +0200 Subject: [PATCH 5/7] LBYL instead of EAFP --- pandas/io/excel/_xlrd.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 43f3519d1a389..2c318e380bce0 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,6 +1,7 @@ from __future__ import annotations from datetime import time +import math from typing import TYPE_CHECKING import numpy as np @@ -120,15 +121,12 @@ def _parse_cell(cell_contents, cell_typ): elif cell_typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less surprising - try: - val = int(cell_contents) - except Exception: + if math.isfinite(cell_contents): # GH54564 - if the cell contents are NaN/Inf, we get an exception; # that is just another case where we don't want to convert. # The exception filter is quite general on purpose: whenever # the cell content cannot be converted to int - just don't. - pass - else: + val = int(cell_contents) if val == cell_contents: cell_contents = val return cell_contents From f0e92aa79f2164db8403deaa7e08b9f00481a008 Mon Sep 17 00:00:00 2001 From: Yves Delley Date: Fri, 18 Aug 2023 14:58:11 +0200 Subject: [PATCH 6/7] address review request --- pandas/io/excel/_xlrd.py | 5 +---- pandas/tests/io/excel/test_xlrd.py | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 2c318e380bce0..a444970792e6e 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -122,10 +122,7 @@ def _parse_cell(cell_contents, cell_typ): # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less surprising if math.isfinite(cell_contents): - # GH54564 - if the cell contents are NaN/Inf, we get an exception; - # that is just another case where we don't want to convert. - # The exception filter is quite general on purpose: whenever - # the cell content cannot be converted to int - just don't. + # GH54564 - don't attempt to convert NaN/Inf val = int(cell_contents) if val == cell_contents: cell_contents = val diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index ce65fda5ad6c9..efef18641041c 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -49,9 +49,9 @@ def test_nan_in_xls(datapath): # GH 54564 path = datapath("io", "data", "excel", "test6.xls") - expected = pd.DataFrame(np.r_[:3, np.nan].reshape(2, 2)).astype(float) + expected = pd.DataFrame({0: np.r_[0, 2], 1: np.r_[1, np.nan]}) - result = pd.read_excel(path, header=None).astype(float) + result = pd.read_excel(path, header=None) tm.assert_frame_equal(result, expected) From aef4d2bd01f5453619bf54cee33620ec8d92800d Mon Sep 17 00:00:00 2001 From: Yves Delley Date: Mon, 21 Aug 2023 13:57:53 +0200 Subject: [PATCH 7/7] fix tests on windows --- pandas/tests/io/excel/test_xlrd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index efef18641041c..6d5008ca9ee68 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -49,7 +49,7 @@ def test_nan_in_xls(datapath): # GH 54564 path = datapath("io", "data", "excel", "test6.xls") - expected = pd.DataFrame({0: np.r_[0, 2], 1: np.r_[1, np.nan]}) + expected = pd.DataFrame({0: np.r_[0, 2].astype("int64"), 1: np.r_[1, np.nan]}) result = pd.read_excel(path, header=None)