From 2bdf91ec4761f7b183266c5ef35f31c3532622b6 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Jun 2023 08:21:23 -0400 Subject: [PATCH 1/9] BUG: Make bz2 import optional --- pandas/compat/compressors.py | 30 +++++++++++++++++++----------- pandas/io/common.py | 10 +++++++++- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/pandas/compat/compressors.py b/pandas/compat/compressors.py index a4f39c4e34bd4..1f31e34c092c9 100644 --- a/pandas/compat/compressors.py +++ b/pandas/compat/compressors.py @@ -4,11 +4,17 @@ from __future__ import annotations -import bz2 from pickle import PickleBuffer from pandas.compat._constants import PY310 +try: + import bz2 + + has_bz2 = True +except ImportError: + has_bz2 = False + try: import lzma @@ -41,17 +47,19 @@ def flatten_buffer( return memoryview(b).tobytes("A") -class BZ2File(bz2.BZ2File): - if not PY310: +if has_bz2: - def write(self, b) -> int: - # Workaround issue where `bz2.BZ2File` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. - return super().write(flatten_buffer(b)) + class BZ2File(bz2.BZ2File): + if not PY310: + + def write(self, b) -> int: + # Workaround issue where `bz2.BZ2File` expects `len` + # to return the number of bytes in `b` by converting + # `b` into something that meets that constraint with + # minimal copying. + # + # Note: This is fixed in Python 3.10. + return super().write(flatten_buffer(b)) if has_lzma: diff --git a/pandas/io/common.py b/pandas/io/common.py index 43780a08a4339..d96f784b07c26 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -59,7 +59,6 @@ ) from pandas.compat import get_lzma_file from pandas.compat._optional import import_optional_dependency -from pandas.compat.compressors import BZ2File as _BZ2File from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -73,6 +72,11 @@ from pandas.core.indexes.api import MultiIndex from pandas.core.shared_docs import _shared_docs +try: + from pandas.compat.compressors import BZ2File as _BZ2File +except ImportError: + _BZ2File = None + _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") _RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") @@ -766,6 +770,10 @@ def get_handle( elif compression == "bz2": # Overload of "BZ2File" to handle pickle protocol 5 # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" + if _BZ2File is None: + raise ImportError( + "bz2 compression requires the bz2 module to be installed" + ) handle = _BZ2File( # type: ignore[call-overload] handle, mode=ioargs.mode, From 5ff0eb088d7434cb9710871effaffb435b2ab5f5 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Jun 2023 09:21:33 -0400 Subject: [PATCH 2/9] CLN: Create `get_bz2_file` to match `get_lzma_file` --- pandas/_testing/_io.py | 8 +++++--- pandas/compat/__init__.py | 23 +++++++++++++++++++++++ pandas/io/common.py | 16 +++++----------- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index d79968a580e40..ab3da383c43a6 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -1,6 +1,5 @@ from __future__ import annotations -import bz2 from functools import wraps import gzip import io @@ -13,7 +12,10 @@ ) import zipfile -from pandas.compat import get_lzma_file +from pandas.compat import ( + get_bz2_file, + get_lzma_file, +) from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -410,7 +412,7 @@ def write_to_compressed(compression, path, data, dest: str = "test"): elif compression == "gzip": compress_method = gzip.GzipFile elif compression == "bz2": - compress_method = bz2.BZ2File + compress_method = get_bz2_file() elif compression == "zstd": compress_method = import_optional_dependency("zstandard").open elif compression == "xz": diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 00957c45a7fbe..de4b91e44da19 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -154,6 +154,29 @@ def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: return pandas.compat.compressors.LZMAFile +def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: + """ + Importing the `BZ2File` class from the `bz2` module. + + Returns + ------- + class + The `BZ2File` class from the `bz2` module. + + Raises + ------ + RuntimeError + If the `bz2` module was not imported correctly, or didn't exist. + """ + if not pandas.compat.compressors.has_bz2: + raise RuntimeError( + "bz2 module not available. " + "A Python re-install with the proper dependencies, " + "might be required to solve this issue." + ) + return pandas.compat.compressors.BZ2File + + __all__ = [ "is_numpy_dev", "pa_version_under7p0", diff --git a/pandas/io/common.py b/pandas/io/common.py index d96f784b07c26..6199491be71a5 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -57,7 +57,10 @@ StorageOptions, WriteBuffer, ) -from pandas.compat import get_lzma_file +from pandas.compat import ( + get_bz2_file, + get_lzma_file, +) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -72,11 +75,6 @@ from pandas.core.indexes.api import MultiIndex from pandas.core.shared_docs import _shared_docs -try: - from pandas.compat.compressors import BZ2File as _BZ2File -except ImportError: - _BZ2File = None - _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") _RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") @@ -770,11 +768,7 @@ def get_handle( elif compression == "bz2": # Overload of "BZ2File" to handle pickle protocol 5 # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" - if _BZ2File is None: - raise ImportError( - "bz2 compression requires the bz2 module to be installed" - ) - handle = _BZ2File( # type: ignore[call-overload] + handle = get_bz2_file()( # type: ignore[call-overload] handle, mode=ioargs.mode, **compression_args, From 957d0436a10f7afbc4ea1d6f9d88b22db78e511d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Jun 2023 09:35:33 -0400 Subject: [PATCH 3/9] DOC: Add bz2 bugfix to changelog --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7b9efd7f593dd..c4c62227e4a21 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -449,6 +449,7 @@ I/O - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`) - Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`) +- Bug where ``bz2`` was treated as a hard requirement (:issue:`53857`) Period ^^^^^^ From 1ab1045f8dc759877e81e37b0134e5c3ec7357ae Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Jun 2023 17:23:39 -0400 Subject: [PATCH 4/9] TST: Test bz2 non-import works --- pandas/tests/test_common.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 4860ee235c03d..db72c796e0c51 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -7,6 +7,9 @@ import numpy as np import pytest +sys.modules.pop("bz2", None) # Remove 'bz2' from available modules for testing +from pandas.compat import get_bz2_file + import pandas as pd from pandas import Series import pandas._testing as tm @@ -15,6 +18,13 @@ from pandas.util.version import Version +def test_bz2_nonimport(): + assert "bz2" not in sys.modules + msg = "bz2 module not available." + with pytest.raises(RuntimeError, match=msg): + get_bz2_file() + + def test_get_callable_name(): getname = com.get_callable_name From 3ac3ced0d7b17d00045abbeb2410b36cb1403d0b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Jun 2023 19:35:39 -0400 Subject: [PATCH 5/9] TST: Test bz2 non-import from subprocess --- pandas/tests/test_common.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index db72c796e0c51..6753008851338 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -7,9 +7,6 @@ import numpy as np import pytest -sys.modules.pop("bz2", None) # Remove 'bz2' from available modules for testing -from pandas.compat import get_bz2_file - import pandas as pd from pandas import Series import pandas._testing as tm @@ -18,13 +15,6 @@ from pandas.util.version import Version -def test_bz2_nonimport(): - assert "bz2" not in sys.modules - msg = "bz2 module not available." - with pytest.raises(RuntimeError, match=msg): - get_bz2_file() - - def test_get_callable_name(): getname = com.get_callable_name @@ -255,3 +245,19 @@ def test_str_size(): ] result = subprocess.check_output(call).decode()[-4:-1].strip("\n") assert int(result) == int(expected) + + +def test_bz2_missing_import(): + # Check whether bz2 missing import is handled correctly (issue #53857) + code = ( + "import pytest\n" + "sys.modules.pop('bz2', None)\n" + "import pandas\n" + "assert 'bz2' not in sys.modules\n" + "from pandas.compat import get_bz2_file\n" + "msg = 'bz2 module not available.'\n" + "with pytest.raises(RuntimeError, match=msg):\n" + "\tget_bz2_file()" + ) + call = [sys.executable, "-c", code] + subprocess.check_output(call) From cd16368e11c7fb265d38f72d019261cb39a65fd3 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 27 Jun 2023 17:13:44 -0400 Subject: [PATCH 6/9] TST: Fix bz2 non-import test --- pandas/tests/test_common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 6753008851338..dc7f0578b41aa 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -250,10 +250,11 @@ def test_str_size(): def test_bz2_missing_import(): # Check whether bz2 missing import is handled correctly (issue #53857) code = ( + "import sys\n" + "assert 'bz2' in sys.modules\n" + "sys.modules['bz2'] = None\n" "import pytest\n" - "sys.modules.pop('bz2', None)\n" - "import pandas\n" - "assert 'bz2' not in sys.modules\n" + "import pandas as pd\n" "from pandas.compat import get_bz2_file\n" "msg = 'bz2 module not available.'\n" "with pytest.raises(RuntimeError, match=msg):\n" From c66f157320ac5ab9a88a68ca2f6ffa6055fb8cb1 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 27 Jun 2023 18:03:08 -0400 Subject: [PATCH 7/9] TST: Fix indentation issues in bz2 import test --- pandas/tests/test_common.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index dc7f0578b41aa..cb32c98df2c56 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -3,6 +3,7 @@ import string import subprocess import sys +import textwrap import numpy as np import pytest @@ -249,16 +250,16 @@ def test_str_size(): def test_bz2_missing_import(): # Check whether bz2 missing import is handled correctly (issue #53857) - code = ( - "import sys\n" - "assert 'bz2' in sys.modules\n" - "sys.modules['bz2'] = None\n" - "import pytest\n" - "import pandas as pd\n" - "from pandas.compat import get_bz2_file\n" - "msg = 'bz2 module not available.'\n" - "with pytest.raises(RuntimeError, match=msg):\n" - "\tget_bz2_file()" - ) + code = """ + import sys + sys.modules['bz2'] = None + import pytest + import pandas as pd + from pandas.compat import get_bz2_file + msg = 'bz2 module not available.' + with pytest.raises(RuntimeError, match=msg): + get_bz2_file() + """ + code = textwrap.dedent(code) call = [sys.executable, "-c", code] subprocess.check_output(call) From d346bdf6e20f1fd97a356057044f74f7486cd83b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 27 Jun 2023 18:21:58 -0400 Subject: [PATCH 8/9] MAINT: Clean up merge commit --- pandas/_testing/_io.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 26cc69646d41f..20dec2836eb3b 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -1,6 +1,5 @@ from __future__ import annotations -from functools import wraps import gzip import io import tarfile From e73d25281881a41e5214e56cb0493b71bc206f0e Mon Sep 17 00:00:00 2001 From: Miles Cranmer Date: Fri, 7 Jul 2023 13:51:25 -0400 Subject: [PATCH 9/9] Mark bz2 missing test with `single_cpu` --- pandas/tests/test_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index cb32c98df2c56..e7ae4af3f8640 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -248,6 +248,7 @@ def test_str_size(): assert int(result) == int(expected) +@pytest.mark.single_cpu def test_bz2_missing_import(): # Check whether bz2 missing import is handled correctly (issue #53857) code = """