From bc3e5d3a36cd5dffd951eb3fb9e66a0cca4e250d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 13 Feb 2021 11:24:55 -0500 Subject: [PATCH 1/5] DEP: Remove xlrd as being the default reader for xlsx --- doc/source/user_guide/io.rst | 10 ++++------ doc/source/whatsnew/v1.3.0.rst | 10 ++++++++++ pandas/io/excel/_base.py | 14 +++++--------- pandas/io/excel/_util.py | 7 ------- requirements-dev.txt | 1 + 5 files changed, 20 insertions(+), 22 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d6934a3ca2a6c..3aca1600cbf46 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2853,14 +2853,12 @@ See the :ref:`cookbook` for some advanced strategies. The `xlrd `__ package is now only for reading old-style ``.xls`` files. - Before pandas 1.2.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` + Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` would result in using the ``xlrd`` engine in many cases, including new - Excel 2007+ (``.xlsx``) files. - If `openpyxl `__ is installed, - many of these cases will now default to using the ``openpyxl`` engine. - See the :func:`read_excel` documentation for more details. + Excel 2007+ (``.xlsx``) files. pandas will now default to using the + `openpyxl `__ engine. - Thus, it is strongly encouraged to install ``openpyxl`` to read Excel 2007+ + It is strongly encouraged to install ``openpyxl`` to read Excel 2007+ (``.xlsx``) files. **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.** This is no longer supported, switch to using ``openpyxl`` instead. diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 799bc88ffff4e..8e3978ed9fe1a 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -8,6 +8,16 @@ including other versions of pandas. {{ header }} +.. 
warning:: + + When reading new Excel 2007+ (``.xlsx``) files, the default argument + ``engine=None`` to :func:`~pandas.read_excel` will now result in using the + `openpyxl <https://openpyxl.readthedocs.io/en/stable/>`_ engine in all cases + when the option :attr:`io.excel.xlsx.reader` is set to ``"auto"``. + Previously, some cases would use the + `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine instead. See + :ref:`What's new 1.2.0 <whatsnew_120>` for background on this change. + .. --------------------------------------------------------------------------- Enhancements diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f12a530ea6c34..ed912cbb10814 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -129,11 +129,9 @@ ``pyxlsb`` will be used. .. versionadded:: 1.3.0 - - Otherwise if `openpyxl `_ is installed, - then ``openpyxl`` will be used. - - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This - case will raise a ``ValueError`` in a future version of pandas. + - Otherwise ``openpyxl`` will be used. + + .. versionchanged:: 1.3.0 converters : dict, default None Dict of functions for converting values in certain columns. Keys can @@ -997,7 +995,7 @@ class ExcelFile: Parameters ---------- path_or_buffer : str, path object (pathlib.Path or py._path.local.LocalPath), - a file-like object, xlrd workbook or openpypl workbook. + a file-like object, xlrd workbook or openpyxl workbook. If a string or path object, expected to be a path to a .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None @@ -1111,9 +1109,7 @@ def __init__( stacklevel = 2 warnings.warn( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " - f"only the xls format is supported. As a result, the " - f"openpyxl engine will be used if it is installed and the " - f"engine argument is not specified. Install " + f"only the xls format is supported.
Install " f"openpyxl instead.", FutureWarning, stacklevel=stacklevel, diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 01ccc9d15a6a3..4e1d572aae569 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -62,13 +62,6 @@ def get_default_engine(ext, mode="reader"): _default_writers["xlsx"] = "xlsxwriter" return _default_writers[ext] else: - if ( - import_optional_dependency("openpyxl", errors="ignore") is None - and import_optional_dependency("xlrd", errors="ignore") is not None - ): - # if no openpyxl but xlrd installed, return xlrd - # the version is handled elsewhere - _default_readers["xlsx"] = "xlrd" return _default_readers[ext] diff --git a/requirements-dev.txt b/requirements-dev.txt index be60c90aef8aa..483b0291b1569 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -58,6 +58,7 @@ beautifulsoup4>=4.6.0 html5lib lxml openpyxl +# TODO xlrd xlsxwriter xlwt From 2b9ce01ebc941edcc2b07b4beb1576ed475f1a8d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 13 Feb 2021 11:38:49 -0500 Subject: [PATCH 2/5] fixup --- pandas/io/excel/_base.py | 2 +- requirements-dev.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ed912cbb10814..f95e87b7a2b12 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -130,7 +130,7 @@ .. versionadded:: 1.3.0 - Otherwise ``openpyxl`` will be used. - + .. 
versionchanged:: 1.3.0 converters : dict, default None diff --git a/requirements-dev.txt b/requirements-dev.txt index 483b0291b1569..be60c90aef8aa 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -58,7 +58,6 @@ beautifulsoup4>=4.6.0 html5lib lxml openpyxl -# TODO xlrd xlsxwriter xlwt From 9473ba33fb06ca99600e1b6445e7792c5d74fc15 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 13 Feb 2021 12:13:04 -0500 Subject: [PATCH 3/5] fixup --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 3aca1600cbf46..67c74f9a04618 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2856,7 +2856,7 @@ See the :ref:`cookbook` for some advanced strategies. Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` would result in using the ``xlrd`` engine in many cases, including new Excel 2007+ (``.xlsx``) files. pandas will now default to using the - `openpyxl `__ engine. + `openpyxl `__ engine. It is strongly encouraged to install ``openpyxl`` to read Excel 2007+ (``.xlsx``) files. 
From bc7f00dc70e373933fe8ebe029f8bf3f24f80ad4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 16 Feb 2021 16:46:55 -0500 Subject: [PATCH 4/5] Add test for default engine --- pandas/tests/io/excel/test_readers.py | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a594718bd62d9..7ee193787ac85 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -117,6 +117,29 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch): monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "read_excel", func) + def test_engine_used(self, read_ext, engine, monkeypatch): + def parser(self, *args, **kwargs): + return self.engine + + monkeypatch.setattr(pd.ExcelFile, "parse", parser) + + expected_defaults = { + "xlsx": "openpyxl", + "xlsm": "openpyxl", + "xlsb": "pyxlsb", + "xls": "xlrd", + "ods": "odf", + } + + with open("test1" + read_ext, "rb") as f: + result = pd.read_excel(f) + + if engine is not None: + expected = engine + else: + expected = expected_defaults[read_ext[1:]] + assert result == expected + def test_usecols_int(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["A", "B", "C"]) @@ -1164,6 +1187,24 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch): monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "ExcelFile", func) + def test_engine_used(self, read_ext, engine, monkeypatch): + expected_defaults = { + "xlsx": "openpyxl", + "xlsm": "openpyxl", + "xlsb": "pyxlsb", + "xls": "xlrd", + "ods": "odf", + } + + with pd.ExcelFile("test1" + read_ext) as excel: + result = excel.engine + + if engine is not None: + expected = engine + else: + expected = expected_defaults[read_ext[1:]] + assert result == expected + def test_excel_passes_na(self, read_ext): with pd.ExcelFile("test4" + read_ext) as excel: parsed = pd.read_excel( From 
53392756ecf4d7403d6ea899241ffd578bc47b8a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 16 Feb 2021 16:47:42 -0500 Subject: [PATCH 5/5] Add issue number --- pandas/tests/io/excel/test_readers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 7ee193787ac85..71abb11d2616d 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -118,6 +118,7 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch): monkeypatch.setattr(pd, "read_excel", func) def test_engine_used(self, read_ext, engine, monkeypatch): + # GH 38884 def parser(self, *args, **kwargs): return self.engine