From 56a69cdb21bff584b7be814694a42fa00ec456f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 27 May 2022 18:17:40 -0400 Subject: [PATCH 1/5] CLN: do not suppress errors when closing file handles --- pandas/io/common.py | 145 +++++++++++++++----------------------------- 1 file changed, 49 insertions(+), 96 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index fdee1600c2a32..ed35e5aebda13 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -10,7 +10,6 @@ from io import ( BufferedIOBase, BytesIO, - FileIO, RawIOBase, StringIO, TextIOBase, @@ -114,11 +113,8 @@ def close(self) -> None: self.handle.flush() self.handle.detach() self.created_handles.remove(self.handle) - try: - for handle in self.created_handles: - handle.close() - except (OSError, ValueError): - pass + for handle in self.created_handles: + handle.close() self.created_handles = [] self.is_wrapped = False @@ -763,21 +759,20 @@ def get_handle( # TAR Encoding elif compression == "tar": - if "mode" not in compression_args: - compression_args["mode"] = ioargs.mode - if is_path: - handle = _BytesTarFile.open(name=handle, **compression_args) + compression_args.setdefault("mode", ioargs.mode) + if isinstance(handle, str): + handle = _BytesTarFile(name=handle, **compression_args) else: - handle = _BytesTarFile.open(fileobj=handle, **compression_args) + handle = _BytesTarFile(fileobj=handle, **compression_args) assert isinstance(handle, _BytesTarFile) - if handle.mode == "r": + if "r" in handle.tar_buffer.mode: handles.append(handle) - files = handle.getnames() + files = handle.tar_buffer.getnames() if len(files) == 1: - file = handle.extractfile(files[0]) + file = handle.tar_buffer.extractfile(files[0]) assert file is not None handle = file - elif len(files) == 0: + elif not files: raise ValueError(f"Zero files found in TAR archive {path_or_buf}") else: raise ValueError( @@ -876,115 +871,73 @@ def get_handle( ) -# error: Definition of "__exit__" in base class "TarFile" is incompatible with -# definition in base class "BytesIO" [misc] -# error: Definition of "__enter__" in base class "TarFile" is incompatible with -# definition in base class "BytesIO" [misc] -# error: Definition of "__enter__" in base class "TarFile" is incompatible with -# definition in base class "BinaryIO" [misc] -# error: Definition of "__enter__" in base class "TarFile" is incompatible with -# definition in base class "IO" [misc] -# error: Definition of "read" in base class "TarFile" is incompatible with -# definition in base class "BytesIO" [misc] -# error: Definition of "read" in base class "TarFile" is incompatible with -# definition in base class "IO" [misc] -class _BytesTarFile(tarfile.TarFile, BytesIO): # type: ignore[misc] +class _BytesTarFile(BytesIO): """ Wrapper for standard library class TarFile and allow the returned file-like handle to accept byte strings via `write` method. - BytesIO provides attributes of file-like object and TarFile.addfile writes - bytes strings into a member of the archive. + Note: technically only needed when writing to a tar file. """ # GH 17778 def __init__( self, - name: str | bytes | os.PathLike[str] | os.PathLike[bytes], - mode: Literal["r", "a", "w", "x"], - fileobj: FileIO, + name: str | None = None, + mode: Literal["r", "a", "w", "x"] = "r", + fileobj: BaseBuffer | None = None, archive_name: str | None = None, **kwargs, - ): + ) -> None: + super().__init__() self.archive_name = archive_name - self.multiple_write_buffer: BytesIO | None = None - self._closing = False - - super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs) + self.name = name + self.tar_buffer = tarfile.TarFile.open( + name=name, mode=self.extend_mode(mode), fileobj=fileobj, **kwargs + ) - @classmethod - def open(cls, name=None, mode="r", **kwargs): + def extend_mode(self, mode: str) -> str: mode = mode.replace("b", "") - return super().open(name=name, mode=cls.extend_mode(name, mode), **kwargs) - - @classmethod - def extend_mode( - cls, name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], mode: str - ) -> str: if mode != "w": return mode - if isinstance(name, (os.PathLike, str)): - filename = Path(name) - if filename.suffix == ".gz": - return mode + ":gz" - elif filename.suffix == ".xz": - return mode + ":xz" - elif filename.suffix == ".bz2": - return mode + ":bz2" + if self.name is not None: + suffix = Path(self.name).suffix + if suffix in (".gz", ".xz", ".bz2"): + mode = f"{mode}:{suffix[1:]}" return mode - def infer_filename(self): + def infer_filename(self) -> str | None: """ If an explicit archive_name is not given, we still want the file inside the zip file not to be named something.tar, because that causes confusion (GH39465). """ - if isinstance(self.name, (os.PathLike, str)): - # error: Argument 1 to "Path" has - # incompatible type "Union[str, PathLike[str], PathLike[bytes]]"; - # expected "Union[str, PathLike[str]]" [arg-type] - filename = Path(self.name) # type: ignore[arg-type] - if filename.suffix == ".tar": - return filename.with_suffix("").name - if filename.suffix in [".tar.gz", ".tar.bz2", ".tar.xz"]: - return filename.with_suffix("").with_suffix("").name - return filename.name - return None + if self.name is None: + return None - def write(self, data): - # buffer multiple write calls, write on flush - if self.multiple_write_buffer is None: - self.multiple_write_buffer = BytesIO() - self.multiple_write_buffer.write(data) + filename = Path(self.name) + if filename.suffix == ".tar": + return filename.with_suffix("").name + elif filename.suffix in (".tar.gz", ".tar.bz2", ".tar.xz"): + return filename.with_suffix("").with_suffix("").name + return filename.name - def flush(self) -> None: - # write to actual handle and close write buffer - if self.multiple_write_buffer is None or self.multiple_write_buffer.closed: + def close(self) -> None: + if self.closed: + # already closed return - - # TarFile needs a non-empty string - archive_name = self.archive_name or self.infer_filename() or "tar" - with self.multiple_write_buffer: - value = self.multiple_write_buffer.getvalue() + if self.getvalue(): + # need to write data to tar file + # TarFile needs a non-empty string + archive_name = self.archive_name or self.infer_filename() or "tar" tarinfo = tarfile.TarInfo(name=archive_name) - tarinfo.size = len(value) - self.addfile(tarinfo, BytesIO(value)) - - def close(self): - self.flush() + tarinfo.size = len(self.getvalue()) + self.seek(0) + with self.tar_buffer: + self.tar_buffer.addfile(tarinfo, self) + else: + # were reading from tar file + self.tar_buffer.close() super().close() - @property - def closed(self): - if self.multiple_write_buffer is None: - return False - return self.multiple_write_buffer.closed and super().closed - - @closed.setter - def closed(self, value): - if not self._closing and value: - self._closing = True - self.close() - # error: Definition of "__exit__" in base class "ZipFile" is incompatible with # definition in base class "BytesIO" [misc] From 5fb2a032132a2b625466ca0ed0766442cd4243c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 27 May 2022 20:27:02 -0400 Subject: [PATCH 2/5] more cleanups --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/common.py | 168 ++++++++++++++------------------- pandas/tests/io/test_common.py | 12 +++ 3 files changed, 86 insertions(+), 95 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4a0b9a97a9d11..5ce0d8a41a34a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -814,6 +814,7 @@ I/O - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) - :meth:`to_html` now excludes the ``border`` attribute from ```` elements when ``border`` keyword is set to ``False``. +- Most I/O methods do no longer suppress ``OSError`` and ``ValueError`` when closing file handles (:issue:`47136`) - Period diff --git a/pandas/io/common.py b/pandas/io/common.py index ed35e5aebda13..738f1bb208346 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,6 +1,10 @@ """Common IO api utilities""" from __future__ import annotations +from abc import ( + ABC, + abstractmethod, +) import bz2 import codecs from collections import abc @@ -708,8 +712,7 @@ def get_handle( # GZ Compression if compression == "gzip": - if is_path: - assert isinstance(handle, str) + if isinstance(handle, str): # error: Incompatible types in assignment (expression has type # "GzipFile", variable has type "Union[str, BaseBuffer]") handle = gzip.GzipFile( # type: ignore[assignment] @@ -738,18 +741,18 @@ def get_handle( # ZIP Compression elif compression == "zip": - # error: Argument 1 to "_BytesZipFile" has incompatible type "Union[str, - # BaseBuffer]"; expected "Union[Union[str, PathLike[str]], + # error: Argument 1 to "_BytesZipFile" has incompatible type + # "Union[str, BaseBuffer]"; expected "Union[Union[str, PathLike[str]], # ReadBuffer[bytes], WriteBuffer[bytes]]" handle = _BytesZipFile( handle, ioargs.mode, **compression_args # type: ignore[arg-type] ) - if handle.mode == "r": + if handle.buffer.mode == "r": handles.append(handle) - zip_names = handle.namelist() + zip_names = handle.buffer.namelist() if len(zip_names) == 1: - handle = handle.open(zip_names.pop()) - elif len(zip_names) == 0: + handle = handle.buffer.open(zip_names.pop()) + elif not zip_names: raise ValueError(f"Zero files found in ZIP file {path_or_buf}") else: raise ValueError( @@ -763,13 +766,18 @@ def get_handle( if isinstance(handle, str): handle = _BytesTarFile(name=handle, **compression_args) else: - handle = _BytesTarFile(fileobj=handle, **compression_args) + # error: Argument "fileobj" to "_BytesTarFile" has incompatible + # type "BaseBuffer"; expected "Union[ReadBuffer[bytes], + # WriteBuffer[bytes], None]" + handle = _BytesTarFile( + fileobj=handle, **compression_args # type: ignore[arg-type] + ) assert isinstance(handle, _BytesTarFile) - if "r" in handle.tar_buffer.mode: + if "r" in handle.buffer.mode: handles.append(handle) - files = handle.tar_buffer.getnames() + files = handle.buffer.getnames() if len(files) == 1: - file = handle.tar_buffer.extractfile(files[0]) + file = handle.buffer.extractfile(files[0]) assert file is not None handle = file elif not files: @@ -871,28 +879,54 @@ def get_handle( ) -class _BytesTarFile(BytesIO): +# error: Definition of "__enter__" in base class "IOBase" is incompatible +# with definition in base class "BinaryIO" +class _BufferedWriter(BytesIO, ABC): # type: ignore[misc] """ - Wrapper for standard library class TarFile and allow the returned file-like - handle to accept byte strings via `write` method. - - Note: technically only needed when writing to a tar file. + Some objects do not support multiple .write() calls (TarFile and ZipFile). + This wrapper writes to the underlying buffer on close. """ - # GH 17778 + @abstractmethod + def write_to_buffer(self): + ... + + def close(self) -> None: + if self.closed: + # already closed + return + if self.getvalue(): + # write to buffer + self.seek(0) + # error: "_BufferedWriter" has no attribute "buffer" + with self.buffer: # type: ignore[attr-defined] + self.write_to_buffer() + else: + # error: "_BufferedWriter" has no attribute "buffer" + self.buffer.close() # type: ignore[attr-defined] + super().close() + + +class _BytesTarFile(_BufferedWriter): def __init__( self, name: str | None = None, mode: Literal["r", "a", "w", "x"] = "r", - fileobj: BaseBuffer | None = None, + fileobj: ReadBuffer[bytes] | WriteBuffer[bytes] | None = None, archive_name: str | None = None, **kwargs, ) -> None: super().__init__() self.archive_name = archive_name self.name = name - self.tar_buffer = tarfile.TarFile.open( - name=name, mode=self.extend_mode(mode), fileobj=fileobj, **kwargs + # error: Argument "fileobj" to "open" of "TarFile" has incompatible + # type "Union[ReadBuffer[bytes], WriteBuffer[bytes], None]"; expected + # "Optional[IO[bytes]]" + self.buffer = tarfile.TarFile.open( + name=name, + mode=self.extend_mode(mode), + fileobj=fileobj, # type: ignore[arg-type] + **kwargs, ) def extend_mode(self, mode: str) -> str: @@ -920,47 +954,15 @@ def infer_filename(self) -> str | None: return filename.with_suffix("").with_suffix("").name return filename.name - def close(self) -> None: - if self.closed: - # already closed - return - if self.getvalue(): - # need to write data to tar file - # TarFile needs a non-empty string - archive_name = self.archive_name or self.infer_filename() or "tar" - tarinfo = tarfile.TarInfo(name=archive_name) - tarinfo.size = len(self.getvalue()) - self.seek(0) - with self.tar_buffer: - self.tar_buffer.addfile(tarinfo, self) - else: - # were reading from tar file - self.tar_buffer.close() - super().close() + def write_to_buffer(self) -> None: + # TarFile needs a non-empty string + archive_name = self.archive_name or self.infer_filename() or "tar" + tarinfo = tarfile.TarInfo(name=archive_name) + tarinfo.size = len(self.getvalue()) + self.buffer.addfile(tarinfo, self) -# error: Definition of "__exit__" in base class "ZipFile" is incompatible with -# definition in base class "BytesIO" [misc] -# error: Definition of "__enter__" in base class "ZipFile" is incompatible with -# definition in base class "BytesIO" [misc] -# error: Definition of "__enter__" in base class "ZipFile" is incompatible with -# definition in base class "BinaryIO" [misc] -# error: Definition of "__enter__" in base class "ZipFile" is incompatible with -# definition in base class "IO" [misc] -# error: Definition of "read" in base class "ZipFile" is incompatible with -# definition in base class "BytesIO" [misc] -# error: Definition of "read" in base class "ZipFile" is incompatible with -# definition in base class "IO" [misc] -class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc] - """ - Wrapper for standard library class ZipFile and allow the returned file-like - handle to accept byte strings via `write` method. - - BytesIO provides attributes of file-like object and ZipFile.writestr writes - bytes strings into a member of the archive. - """ - - # GH 17778 +class _BytesZipFile(_BufferedWriter): def __init__( self, file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], @@ -968,56 +970,32 @@ def __init__( archive_name: str | None = None, **kwargs, ) -> None: + super().__init__() mode = mode.replace("b", "") self.archive_name = archive_name - self.multiple_write_buffer: StringIO | BytesIO | None = None - kwargs_zip: dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} - kwargs_zip.update(kwargs) + kwargs.setdefault("compression", zipfile.ZIP_DEFLATED) + # error: Argument 1 to "ZipFile" has incompatible type "Union[ + # Union[str, PathLike[str]], ReadBuffer[bytes], WriteBuffer[bytes]]"; + # expected "Union[Union[str, PathLike[str]], IO[bytes]]" + self.buffer = zipfile.ZipFile(file, mode, **kwargs) # type: ignore[arg-type] - # error: Argument 1 to "__init__" of "ZipFile" has incompatible type - # "Union[_PathLike[str], Union[str, Union[IO[Any], RawIOBase, BufferedIOBase, - # TextIOBase, TextIOWrapper, mmap]]]"; expected "Union[Union[str, - # _PathLike[str]], IO[bytes]]" - super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] - - def infer_filename(self): + def infer_filename(self) -> str | None: """ If an explicit archive_name is not given, we still want the file inside the zip file not to be named something.zip, because that causes confusion (GH39465). """ - if isinstance(self.filename, (os.PathLike, str)): - filename = Path(self.filename) + if isinstance(self.buffer.filename, (os.PathLike, str)): + filename = Path(self.buffer.filename) if filename.suffix == ".zip": return filename.with_suffix("").name return filename.name return None - def write(self, data): - # buffer multiple write calls, write on flush - if self.multiple_write_buffer is None: - self.multiple_write_buffer = ( - BytesIO() if isinstance(data, bytes) else StringIO() - ) - self.multiple_write_buffer.write(data) - - def flush(self) -> None: - # write to actual handle and close write buffer - if self.multiple_write_buffer is None or self.multiple_write_buffer.closed: - return - + def write_to_buffer(self) -> None: # ZipFile needs a non-empty string archive_name = self.archive_name or self.infer_filename() or "zip" - with self.multiple_write_buffer: - super().writestr(archive_name, self.multiple_write_buffer.getvalue()) - - def close(self): - self.flush() - super().close() - - @property - def closed(self): - return self.fp is None + self.buffer.writestr(archive_name, self.getvalue()) class _CSVMMapWrapper(abc.Iterator): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 22399917f2bf7..5beda35d52eb4 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -600,3 +600,15 @@ def test_fail_mmap(): with pytest.raises(UnsupportedOperation, match="fileno"): with BytesIO() as buffer: icom.get_handle(buffer, "rb", memory_map=True) + + +def test_close_on_error(): + # GH 47136 + class TestError: + def close(self): + raise OSError("test") + + with pytest.raises(OSError, match="test"): + with BytesIO() as buffer: + with icom.get_handle(buffer, "rb") as handles: + handles.created_handles.append(TestError()) From e51f75b83d4107ef177c0da83551edd2450f1b2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 28 May 2022 16:22:40 -0400 Subject: [PATCH 3/5] move to 1.4.3 --- doc/source/whatsnew/v1.4.3.rst | 2 +- doc/source/whatsnew/v1.5.0.rst | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index f0e208a117068..1d2f9b9880e7e 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -29,7 +29,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Most I/O methods do no longer suppress ``OSError`` and ``ValueError`` when closing file handles (:issue:`47136`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 5ce0d8a41a34a..4a0b9a97a9d11 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -814,7 +814,6 @@ I/O - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) - :meth:`to_html` now excludes the ``border`` attribute from ``
`` elements when ``border`` keyword is set to ``False``. -- Most I/O methods do no longer suppress ``OSError`` and ``ValueError`` when closing file handles (:issue:`47136`) - Period From cab26215ae5a2bb8efe9d0b7e3d0b9dfa90828d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sun, 29 May 2022 09:24:54 -0400 Subject: [PATCH 4/5] only cleanup --- doc/source/whatsnew/v1.4.3.rst | 2 +- pandas/io/common.py | 7 +++++-- pandas/tests/io/test_common.py | 12 ------------ 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 1d2f9b9880e7e..f0e208a117068 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -29,7 +29,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Most I/O methods do no longer suppress ``OSError`` and ``ValueError`` when closing file handles (:issue:`47136`) +- - .. --------------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 738f1bb208346..33f83d7c66433 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -117,8 +117,11 @@ def close(self) -> None: self.handle.flush() self.handle.detach() self.created_handles.remove(self.handle) - for handle in self.created_handles: - handle.close() + try: + for handle in self.created_handles: + handle.close() + except (OSError, ValueError): + pass self.created_handles = [] self.is_wrapped = False diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 5beda35d52eb4..22399917f2bf7 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -600,15 +600,3 @@ def test_fail_mmap(): with pytest.raises(UnsupportedOperation, match="fileno"): with BytesIO() as buffer: icom.get_handle(buffer, "rb", memory_map=True) - - -def test_close_on_error(): - # GH 47136 - class TestError: - def close(self): - raise OSError("test") - - with pytest.raises(OSError, match="test"): - with BytesIO() as buffer: - with icom.get_handle(buffer, "rb") as handles: - handles.created_handles.append(TestError()) From 087170196d267253c04391cce69d10af988a7011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sun, 29 May 2022 16:19:50 -0400 Subject: [PATCH 5/5] add test case --- pandas/tests/io/test_compression.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 98e136a9c4ba6..125d078ff39b1 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -334,3 +334,9 @@ def test_tar_gz_to_different_filename(): expected = "foo,bar\n1,2\n" assert content == expected + + +def test_tar_no_error_on_close(): + with io.BytesIO() as buffer: + with icom._BytesTarFile(fileobj=buffer, mode="w"): + pass