From c4bf8fb0ed7970abd078f482e69591f084a683cd Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Tue, 4 May 2021 15:55:02 -0500 Subject: [PATCH 01/17] Fix #41225. --- pandas/io/excel/_base.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d26a991ba2820..9edbeaa8e1634 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1014,9 +1014,15 @@ def close(self): return content -XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" -ZIP_SIGNATURE = b"PK\x03\x04" -PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) +SIGNATURES = { + "biff2": b"\x09\x00\x04\x00", + "biff3": b"\x09\x02\x06\x00", + "biff4": b"\x09\x04\x06\x00", + "biff5": b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", + "biff8": b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", + "zip": b"PK\x03\x04", +} +PEEK_SIZE = max(map(len, SIGNATURES.values())) @doc(storage_options=_shared_docs["storage_options"]) @@ -1037,8 +1043,8 @@ def inspect_excel_format( Returns ------- - str - Format of file. + str or None + Format of file (if it can be determined) Raises ------ @@ -1063,10 +1069,14 @@ def inspect_excel_format( peek = buf stream.seek(0) - if peek.startswith(XLS_SIGNATURE): + if any( + peek.startswith(signature) + for (file_format, signature) in SIGNATURES.items() + if file_format.startswith('biff') + ): return "xls" - elif not peek.startswith(ZIP_SIGNATURE): - raise ValueError("File is not a recognized excel file") + elif not peek.startswith(SIGNATURES["zip"]): + return None # ZipFile typing is overly-strict # https://github.com/python/typeshed/issues/4212 @@ -1174,8 +1184,12 @@ def __init__( ext = inspect_excel_format( content_or_path=path_or_buffer, storage_options=storage_options ) + if ext is None: + raise ValueError( + "Excel file format cannot be determined, you must specify " + "an engine manually." + ) - # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) if engine == "auto": engine = get_default_engine(ext, mode="reader") @@ -1190,12 +1204,13 @@ def __init__( path_or_buffer, storage_options=storage_options ) - if ext != "xls" and xlrd_version >= "2": + # Pass through if ext is None, otherwise check if ext valid for xlrd + if ext and ext != "xls" and xlrd_version >= "2": raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." ) - elif ext != "xls": + elif ext and ext != "xls": caller = inspect.stack()[1] if ( caller.filename.endswith( From 95bf3258f723d249296066208c5bffa8d007a99f Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Wed, 5 May 2021 08:59:43 -0500 Subject: [PATCH 02/17] Adjust return type. --- pandas/io/excel/_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9edbeaa8e1634..27428225e95f5 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -10,6 +10,7 @@ from typing import ( Any, Mapping, + Optional, cast, ) import warnings @@ -1029,7 +1030,7 @@ def close(self): def inspect_excel_format( content_or_path: FilePathOrBuffer, storage_options: StorageOptions = None, -) -> str: +) -> Optional[str]: """ Inspect the path or content of an excel file and get its format. From 416ddd9539e41abaeff2418a7a9889b595e1ab96 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Wed, 5 May 2021 09:23:27 -0500 Subject: [PATCH 03/17] Update tests. --- pandas/tests/io/excel/test_readers.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c4b3221e1d3a7..5f87931636515 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from xlrd import XLRDError import pandas.util._test_decorators as td @@ -718,9 +719,15 @@ def test_missing_file_raises(self, read_ext): def test_corrupt_bytes_raises(self, read_ext, engine): bad_stream = b"foo" - if engine is None or engine == "xlrd": + if engine is None: error = ValueError - msg = "File is not a recognized excel file" + msg = ( + "Excel file format cannot be determined, you must " + "specify an engine manually." + ) + elif engine == "xlrd": + error = XLRDError + msg = "Unsupported format, or corrupt file.*" else: error = BadZipFile msg = "File is not a zip file" From 44053bb728172b31788b896f10e6ffe2e4ccdba1 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Wed, 5 May 2021 09:24:27 -0500 Subject: [PATCH 04/17] Properly stylize strings. --- pandas/io/excel/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 27428225e95f5..ea6f27a057780 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1073,7 +1073,7 @@ def inspect_excel_format( if any( peek.startswith(signature) for (file_format, signature) in SIGNATURES.items() - if file_format.startswith('biff') + if file_format.startswith("biff") ): return "xls" elif not peek.startswith(SIGNATURES["zip"]): From 1b1b6481597c84d2fe98ad6aac9b0957e56c06f7 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Wed, 5 May 2021 09:52:12 -0500 Subject: [PATCH 05/17] Correct expected exception message. --- pandas/tests/io/excel/test_readers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 5f87931636515..69f5436a7476a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -727,7 +727,10 @@ def test_corrupt_bytes_raises(self, read_ext, engine): ) elif engine == "xlrd": error = XLRDError - msg = "Unsupported format, or corrupt file.*" + msg = ( + "Unsupported format, or corrupt file: Expected BOF " + "record; found b'foo'" + ) else: error = BadZipFile msg = "File is not a zip file" From 3c11c74b2a603e7c66251ead40670a53fabb1f24 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Wed, 5 May 2021 11:14:29 -0500 Subject: [PATCH 06/17] Add relevant whatsnew entry. --- doc/source/whatsnew/v1.2.5.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 60e146b2212eb..09c7239c5c3ec 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -34,8 +34,7 @@ Bug fixes Other ~~~~~ - -- +- Loosen XLS signatures used in :func:`read_excel` to determine if the `xlrd` engine should be used (:issue:`41225`) - .. --------------------------------------------------------------------------- From 9d0e8fadc0d9f932ceed359b276c12df820d551d Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Wed, 5 May 2021 13:25:57 -0500 Subject: [PATCH 07/17] Address requested cosmetic changes. --- doc/source/whatsnew/v1.2.5.rst | 3 ++- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/io/excel/_base.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 09c7239c5c3ec..60e146b2212eb 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -34,7 +34,8 @@ Bug fixes Other ~~~~~ -- Loosen XLS signatures used in :func:`read_excel` to determine if the `xlrd` engine should be used (:issue:`41225`) + +- - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 61bea198e42db..89f7789ee50df 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -831,6 +831,7 @@ I/O - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) +- Loosen XLS signatures used in :func:`read_excel` to determine if the `xlrd` engine should be used (:issue:`41225`) Period ^^^^^^ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ea6f27a057780..4f5d242bbcbca 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -10,7 +10,7 @@ from typing import ( Any, Mapping, - Optional, + Union, cast, ) import warnings @@ -1030,7 +1030,7 @@ def close(self): def inspect_excel_format( content_or_path: FilePathOrBuffer, storage_options: StorageOptions = None, -) -> Optional[str]: +) -> Union[str, None]: """ Inspect the path or content of an excel file and get its format. From 3d7980813bb4bf99cb6640f23b95591a0abf9b30 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Wed, 5 May 2021 13:58:09 -0500 Subject: [PATCH 08/17] Import exception when needed. --- pandas/tests/io/excel/test_readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 69f5436a7476a..3736e50948dee 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -9,7 +9,6 @@ import numpy as np import pytest -from xlrd import XLRDError import pandas.util._test_decorators as td @@ -726,6 +725,7 @@ def test_corrupt_bytes_raises(self, read_ext, engine): "specify an engine manually." ) elif engine == "xlrd": + from xlrd import XLRDError error = XLRDError msg = ( "Unsupported format, or corrupt file: Expected BOF " From cbfa5634dab2c3aed144bc58dea2ea09fffe974b Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Wed, 5 May 2021 14:54:22 -0500 Subject: [PATCH 09/17] Use new type hint mechanism for multiple output types. --- pandas/io/excel/_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 4f5d242bbcbca..09d69df71504b 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -10,7 +10,6 @@ from typing import ( Any, Mapping, - Union, cast, ) import warnings @@ -1030,7 +1029,7 @@ def close(self): def inspect_excel_format( content_or_path: FilePathOrBuffer, storage_options: StorageOptions = None, -) -> Union[str, None]: +) -> str | None: """ Inspect the path or content of an excel file and get its format. From c113b20c65b575076f46a9a517ab93e376f5bf53 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Thu, 6 May 2021 08:16:11 -0500 Subject: [PATCH 10/17] Address minor documentation recommendations. --- doc/source/whatsnew/v1.3.0.rst | 5 ++--- pandas/io/excel/_base.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 89f7789ee50df..c132eee603844 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -195,7 +195,7 @@ Other enhancements - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) - Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) -- :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`) +- :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`) and older .xls files (:issue:`41225`) - :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) - :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) @@ -224,7 +224,6 @@ Other enhancements - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`) - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) -- .. --------------------------------------------------------------------------- @@ -831,7 +830,7 @@ I/O - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) -- Loosen XLS signatures used in :func:`read_excel` to determine if the `xlrd` engine should be used (:issue:`41225`) +- :func:`read_excel` now raises the specified engine's exception for incorrect file types if the excel format cannot be determined by pandas (:issue:`41225`) Period ^^^^^^ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 09d69df71504b..f2a02d9ca43fb 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1044,7 +1044,7 @@ def inspect_excel_format( Returns ------- str or None - Format of file (if it can be determined) + Format of file if it can be determined. Raises ------ From f023c9d2a80f21eb190d11021790bd05f1317d8e Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Thu, 6 May 2021 12:53:51 -0500 Subject: [PATCH 11/17] Add tests. --- pandas/io/excel/_base.py | 6 +++--- pandas/tests/io/excel/test_xlrd.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f2a02d9ca43fb..56df68ae9ebe2 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1015,9 +1015,9 @@ def close(self): SIGNATURES = { - "biff2": b"\x09\x00\x04\x00", - "biff3": b"\x09\x02\x06\x00", - "biff4": b"\x09\x04\x06\x00", + "biff2": b"\x09\x00\x04\x00\x07\x00\x10\x00", + "biff3": b"\x09\x02\x06\x00\x00\x00\x10\x00", + "biff4": b"\x09\x04\x06\x00\x00\x00\x10\x00", "biff5": b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "biff8": b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "zip": b"PK\x03\x04", diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index c0d8acf8ab562..4232493b5f5c0 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -77,3 +77,22 @@ def test_read_excel_warning_with_xlsx_file(datapath): else: with tm.assert_produces_warning(None): pd.read_excel(path, "Sheet1", engine=None) + + +def test_read_old_xls_files(): + # GH 41226 + import io + import struct + + headers = { + "biff2": b"\x09\x00\x04\x00\x07\x00\x10\x00", + "biff3": b"\x09\x02\x06\x00\x00\x00\x10\x00", + "biff4": b"\x09\x04\x06\x00\x00\x00\x10\x00", + "biff5": b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", + } + body = b"\x00" * 16 + b"\x3e\x00\x03\x00\xfe\xff" # Required for biff5 + for file_format, header in headers.items(): + f = io.BytesIO(header + body) + with pytest.raises(struct.error, match="unpack requires a buffer "): + # If struct.error is raised, file has passed xlrd's filetype checks + pd.read_excel(f) From 955f67916d6d0ed633d2af08118ea763793d6348 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Mon, 10 May 2021 14:30:01 -0500 Subject: [PATCH 12/17] Reword based on feedback. --- doc/source/whatsnew/v1.3.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c132eee603844..920371e30c250 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -830,7 +830,8 @@ I/O - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) -- :func:`read_excel` now raises the specified engine's exception for incorrect file types if the excel format cannot be determined by pandas (:issue:`41225`) +- Bug in read_excel would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`) +- Period ^^^^^^ From d6206bc5d104a8fc61c64d8fd3a28f0c53780a52 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Mon, 10 May 2021 14:31:00 -0500 Subject: [PATCH 13/17] Refactor to test whether returns . --- pandas/tests/io/excel/test_xlrd.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 4232493b5f5c0..0acbefca30806 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,3 +1,4 @@ +import io import pytest from pandas.compat._optional import import_optional_dependency @@ -7,6 +8,7 @@ from pandas.tests.io.excel import xlrd_version from pandas.io.excel import ExcelFile +from pandas.io.excel._base import inspect_excel_format xlrd = pytest.importorskip("xlrd") xlwt = pytest.importorskip("xlwt") @@ -79,20 +81,16 @@ def test_read_excel_warning_with_xlsx_file(datapath): pd.read_excel(path, "Sheet1", engine=None) -def test_read_old_xls_files(): +@pytest.mark.parametrize( + "file_header", + [ + b"\x09\x00\x04\x00\x07\x00\x10\x00", + b"\x09\x02\x06\x00\x00\x00\x10\x00", + b"\x09\x04\x06\x00\x00\x00\x10\x00", + b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", + ] +) +def test_read_old_xls_files(file_header): # GH 41226 - import io - import struct - - headers = { - "biff2": b"\x09\x00\x04\x00\x07\x00\x10\x00", - "biff3": b"\x09\x02\x06\x00\x00\x00\x10\x00", - "biff4": b"\x09\x04\x06\x00\x00\x00\x10\x00", - "biff5": b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", - } - body = b"\x00" * 16 + b"\x3e\x00\x03\x00\xfe\xff" # Required for biff5 - for file_format, header in headers.items(): - f = io.BytesIO(header + body) - with pytest.raises(struct.error, match="unpack requires a buffer "): - # If struct.error is raised, file has passed xlrd's filetype checks - pd.read_excel(f) + f = io.BytesIO(file_header) + assert inspect_excel_format(f) == "xls" From c306f697efafa5b69f862f85b2a71ac49e5cbf84 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Mon, 10 May 2021 14:31:25 -0500 Subject: [PATCH 14/17] Refactor to remove extraneous key information into comments. --- pandas/io/excel/_base.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 56df68ae9ebe2..8c9f9b39795b6 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1014,15 +1014,14 @@ def close(self): return content -SIGNATURES = { - "biff2": b"\x09\x00\x04\x00\x07\x00\x10\x00", - "biff3": b"\x09\x02\x06\x00\x00\x00\x10\x00", - "biff4": b"\x09\x04\x06\x00\x00\x00\x10\x00", - "biff5": b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", - "biff8": b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", - "zip": b"PK\x03\x04", -} -PEEK_SIZE = max(map(len, SIGNATURES.values())) +XLS_SIGNATURES = ( + b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2 + b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3 + b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4 + b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary +) +ZIP_SIGNATURE = b"PK\x03\x04" +PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE, ))) @doc(storage_options=_shared_docs["storage_options"]) @@ -1069,13 +1068,9 @@ def inspect_excel_format( peek = buf stream.seek(0) - if any( - peek.startswith(signature) - for (file_format, signature) in SIGNATURES.items() - if file_format.startswith("biff") - ): + if any(peek.startswith(sig) for sig in XLS_SIGNATURES): return "xls" - elif not peek.startswith(SIGNATURES["zip"]): + elif not peek.startswith(ZIP_SIGNATURE): return None # ZipFile typing is overly-strict From 8cc480eea087d6ef84bf5a7d0438bc75f1c7dd14 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Thu, 13 May 2021 09:19:07 -0500 Subject: [PATCH 15/17] Address pre-commit. --- pandas/io/excel/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8c9f9b39795b6..5f62e897ff256 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1021,7 +1021,7 @@ def close(self): b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary ) ZIP_SIGNATURE = b"PK\x03\x04" -PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE, ))) +PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,))) @doc(storage_options=_shared_docs["storage_options"]) From 03d6393884b71dae64a1f7ca9b6abc6d8d5bdc93 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Thu, 13 May 2021 09:33:23 -0500 Subject: [PATCH 16/17] Pre-commit fixes. --- pandas/tests/io/excel/test_readers.py | 1 + pandas/tests/io/excel/test_xlrd.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 3736e50948dee..1feb154782e66 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -726,6 +726,7 @@ def test_corrupt_bytes_raises(self, read_ext, engine): ) elif engine == "xlrd": from xlrd import XLRDError + error = XLRDError msg = ( "Unsupported format, or corrupt file: Expected BOF " diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 0acbefca30806..67c7634f2677b 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,4 +1,5 @@ import io + import pytest from pandas.compat._optional import import_optional_dependency @@ -88,7 +89,7 @@ def test_read_excel_warning_with_xlsx_file(datapath): b"\x09\x02\x06\x00\x00\x00\x10\x00", b"\x09\x04\x06\x00\x00\x00\x10\x00", b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", - ] + ], ) def test_read_old_xls_files(file_header): # GH 41226 From 9e9a8ffb375b5c7a24f58545187f79a4deebbff2 Mon Sep 17 00:00:00 2001 From: Geoffrey Eisenbarth Date: Fri, 14 May 2021 07:58:08 -0500 Subject: [PATCH 17/17] Address requested changes in the whatsnew. --- doc/source/whatsnew/v1.3.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c60be05c5211f..196a5b5cd136e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -197,7 +197,7 @@ Other enhancements - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) - Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) -- :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`) and older .xls files (:issue:`41225`) +- :func:`pandas.read_excel` can now auto detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`) - :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) - :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) @@ -843,7 +843,7 @@ I/O - Bug in :func:`read_csv` and :func:`read_excel` not respecting dtype for duplicated column name when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`) - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`) - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) -- Bug in read_excel would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`) +- Bug in :func:`read_excel` would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`) - Period