From c1823ef5eef28b32e860e94dcd7b1b7373113697 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 6 Dec 2021 09:21:18 +0000 Subject: [PATCH 01/33] Add reproduction test for .tar.gz archives co-authored-by: Margarete Dippel --- pandas/tests/io/parser/test_compression.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 5aa0edfd8b46a..e0857eb8f6ce8 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -162,6 +162,14 @@ def test_invalid_compression(all_parsers, invalid_compression): parser.read_csv("test_file.zip", **compress_kwargs) +@skip_pyarrow +def test_compression_tar_archive(all_parsers, csv_dir_path): + parser = all_parsers + path = os.path.join(csv_dir_path, "tar_csv.tar.gz") + df = parser.read_csv(path) + assert list(df.columns) == ["a"] + + def test_ignore_compression_extension(all_parsers): parser = all_parsers df = DataFrame({"a": [0, 1]}) From 9a85cbad219bfd39100c1ea24f63703ba797ca0f Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 6 Dec 2021 12:37:54 +0000 Subject: [PATCH 02/33] add support for .tar archives python's `tarfile` supports gzip, xz and bz2 encoding, so we don't need to make any special cases for that. 
co-authored-by: Margarete Dippel --- pandas/io/common.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index 844304396a23f..bca4bb5478f89 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,6 +18,7 @@ import mmap import os from pathlib import Path +import tarfile import tempfile from typing import ( IO, @@ -520,6 +521,9 @@ def infer_compression( # Cannot infer compression of a buffer, assume no compression return None + if ".tar" in filepath_or_buffer: + return "tar" + # Infer compression from the filename/URL extension for compression, extension in _compression_to_extension.items(): if filepath_or_buffer.lower().endswith(extension): @@ -747,6 +751,21 @@ def get_handle( f"Only one file per ZIP: {zip_names}" ) + # TAR Encoding + elif compression == "tar": + tar = tarfile.open(handle, "r:*") + handles.append(tar) + files = tar.getnames() + if len(files) == 1: + handle = tar.extractfile(files[0]) + elif len(files) == 0: + raise ValueError(f"Zero files found in TAR archive {path_or_buf}") + else: + raise ValueError( + "Multiple files found in TAR archive. " + f"Only one file per TAR archive: {files}" + ) + # XZ Compression elif compression == "xz": handle = get_lzma_file()(handle, ioargs.mode) From e6730613e279b86d4a673668421e6a947548c280 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 6 Dec 2021 12:49:50 +0000 Subject: [PATCH 03/33] update doc comments --- pandas/core/frame.py | 8 ++++---- pandas/io/common.py | 6 +++--- pandas/io/json/_json.py | 8 ++++---- pandas/io/parsers/readers.py | 6 +++--- pandas/io/pickle.py | 4 ++-- pandas/io/xml.py | 12 ++++++------ 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 01f817300a01a..1f6336015dc43 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3020,11 +3020,11 @@ def to_xml( layout of elements and attributes from original output. 
This argument requires ``lxml`` to be installed. Only XSLT 1.0 scripts and not later versions is currently supported. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', 'zip', 'tar', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, zip or xz if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. If using 'zip', the ZIP file must contain only one data + gzip, bz2, zip, xz or tar if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, and no decompression + otherwise. If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. {storage_options} diff --git a/pandas/io/common.py b/pandas/io/common.py index bca4bb5478f89..0cc344bb7be07 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -263,7 +263,7 @@ def _get_filepath_or_buffer( ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer - compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional + compression : {{'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional @@ -497,9 +497,9 @@ def infer_compression( ---------- filepath_or_buffer : str or file handle File path or object. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None} If 'infer' and `filepath_or_buffer` is path-like, then detect - compression from the following extensions: '.gz', '.bz2', '.zip', + compression from the following extensions: '.gz', '.bz2', '.zip', '.tar', or '.xz' (otherwise no compression). 
Returns diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 62f542de3437f..25002e31f28e0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -475,11 +475,11 @@ def read_json( ``JsonReader`` is a context manager. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, zip or xz if path_or_buf is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. If using 'zip', the ZIP file must contain only one data + gzip, bz2, zip, xz or tar if path_or_buf is a string ending in + '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, and no decompression + otherwise. If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. nrows : int, optional diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 82f8ee553df8e..127ea6378ab82 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -279,11 +279,11 @@ .. versionchanged:: 1.2 ``TextFileReader`` is a context manager. -compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' +compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the - following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - decompression). If using 'zip', the ZIP file must contain only one data + following extensions: '.gz', '.bz2', '.zip', '.tar', '.xz' (otherwise no + decompression). If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. thousands : str, optional Thousands separator. 
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 5e0a3e1646883..29cacc12b1da0 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -134,9 +134,9 @@ def read_pickle( .. versionchanged:: 1.0.0 Accept URL. URL is not limited to S3 and GCS. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' If 'infer' and 'path_or_url' is path-like, then detect compression from - the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + the following extensions: '.gz', '.bz2', '.zip', '.tar', or '.xz' (otherwise no compression) If 'infer' and 'path_or_url' is not path-like, then use None (= no decompression). diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 3c3b4afa2c57d..5aae9cb3a380a 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -68,9 +68,9 @@ class _XMLFrameParser: URL, file, file-like object, or a raw string containing XSLT, `etree` does not support XSLT but retained for consistency. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'zip', 'tar', 'xz', None}, default 'infer' Compression type for on-the-fly decompression of on-disk data. - If 'infer', then use extension for gzip, bz2, zip or xz. + If 'infer', then use extension for gzip, bz2, zip, tar or xz. storage_options : dict, optional Extra options that make sense for a particular storage connection, @@ -801,11 +801,11 @@ def read_xml( transformation and not the original XML document. Only XSLT 1.0 scripts and not later versions is currently supported. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. 
If 'infer', then use - gzip, bz2, zip or xz if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. If using 'zip', the ZIP file must contain only one data + gzip, bz2, zip, xz, or tar if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', '.xz', or containing '.tar' respectively, and no decompression + otherwise. If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. {storage_options} From a0d63865ef2c903ec233fcddba71003d4ecc94c0 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 6 Dec 2021 13:11:54 +0000 Subject: [PATCH 04/33] fix: pep8 errors --- pandas/core/frame.py | 8 +++++--- pandas/io/json/_json.py | 5 +++-- pandas/io/xml.py | 5 +++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1f6336015dc43..f13ec10b12391 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3020,11 +3020,13 @@ def to_xml( layout of elements and attributes from original output. This argument requires ``lxml`` to be installed. Only XSLT 1.0 scripts and not later versions is currently supported. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'tar', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', + 'zip', 'tar', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip, xz or tar if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, and no decompression - otherwise. If using 'zip' or 'tar', the archive must contain only one data + '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, + and no decompression otherwise. + If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. 
{storage_options} diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 25002e31f28e0..83cbac48e6a2b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -478,8 +478,9 @@ def read_json( compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip, xz or tar if path_or_buf is a string ending in - '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, and no decompression - otherwise. If using 'zip' or 'tar', the archive must contain only one data + '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, + and no decompression otherwise. + If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. nrows : int, optional diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 5aae9cb3a380a..9bf78c84377ed 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -804,8 +804,9 @@ def read_xml( compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip, xz, or tar if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', '.xz', or containing '.tar' respectively, and no decompression - otherwise. If using 'zip' or 'tar', the archive must contain only one data + '.gz', '.bz2', '.zip', '.xz', or containing '.tar' respectively, + and no decompression otherwise. + If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. 
{storage_options} From 6a8edef38437769491c645e284ec2696e13b8182 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 7 Dec 2021 10:14:44 +0000 Subject: [PATCH 05/33] refactor: flip _compression_to_extension around to support multiple extensions on same compression co-authored-by: Margarete Dippel --- pandas/io/common.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 0cc344bb7be07..837f0dd4903b9 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -446,7 +446,8 @@ def file_path_to_url(path: str) -> str: return urljoin("file:", pathname2url(path)) -_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} +_extension_to_compression = {".gz": "gzip", ".bz2": "bz2", ".zip": "zip", ".xz": "xz"} +_supported_compressions = set(_extension_to_compression.values()) def get_compression_method( @@ -525,20 +526,18 @@ def infer_compression( return "tar" # Infer compression from the filename/URL extension - for compression, extension in _compression_to_extension.items(): + for extension, compression in _extension_to_compression.items(): if filepath_or_buffer.lower().endswith(extension): return compression return None # Compression has been specified. 
Check that it's valid - if compression in _compression_to_extension: + if compression in _supported_compressions: return compression # https://github.com/python/mypy/issues/5492 # Unsupported operand types for + ("List[Optional[str]]" and "List[str]") - valid = ["infer", None] + sorted( - _compression_to_extension - ) # type: ignore[operator] + valid = ["infer", None] + sorted(_supported_compressions) # type: ignore[operator] msg = ( f"Unrecognized compression type: {compression}\n" f"Valid compression types are {valid}" @@ -683,7 +682,7 @@ def get_handle( ioargs.encoding, ioargs.mode, errors, - ioargs.compression["method"] not in _compression_to_extension, + ioargs.compression["method"] not in _supported_compressions, ) is_path = isinstance(handle, str) From d4e40c97b67ab0ad98e5435b0ef0710310fb032e Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 7 Dec 2021 10:18:03 +0000 Subject: [PATCH 06/33] refactor: detect tar files using existing extension mapping co-authored-by: Margarete Dippel --- pandas/io/common.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 837f0dd4903b9..fbbeda9cc5d64 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -446,7 +446,16 @@ def file_path_to_url(path: str) -> str: return urljoin("file:", pathname2url(path)) -_extension_to_compression = {".gz": "gzip", ".bz2": "bz2", ".zip": "zip", ".xz": "xz"} +_extension_to_compression = { + ".tar": "tar", + ".tar.gz": "tar", + ".tar.bz2": "tar", + ".tar.xz": "tar", + ".gz": "gzip", + ".bz2": "bz2", + ".zip": "zip", + ".xz": "xz", +} _supported_compressions = set(_extension_to_compression.values()) @@ -522,9 +531,6 @@ def infer_compression( # Cannot infer compression of a buffer, assume no compression return None - if ".tar" in filepath_or_buffer: - return "tar" - # Infer compression from the filename/URL extension for extension, compression in _extension_to_compression.items(): if 
filepath_or_buffer.lower().endswith(extension): From 5f22df77accc57cfcd9864052db5b58e5dcae2e3 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 7 Dec 2021 13:51:59 +0000 Subject: [PATCH 07/33] feat: add support for writing tar files co-authored-by: Margarete Dippel --- pandas/_testing/_io.py | 10 +++ pandas/conftest.py | 4 +- pandas/core/generic.py | 4 ++ pandas/io/common.py | 101 +++++++++++++++++++++++++--- pandas/io/json/_json.py | 3 +- pandas/io/pickle.py | 3 +- pandas/tests/io/test_compression.py | 15 ++++- pandas/tests/io/test_gcs.py | 9 +++ pandas/tests/io/test_pickle.py | 8 ++- pandas/tests/io/test_stata.py | 4 ++ 10 files changed, 142 insertions(+), 19 deletions(-) diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 2c8e1b0daaeaa..5775129736339 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -3,6 +3,8 @@ import bz2 from functools import wraps import gzip +import io +import tarfile from typing import ( TYPE_CHECKING, Any, @@ -387,6 +389,14 @@ def write_to_compressed(compression, path, data, dest="test"): mode = "w" args = (dest, data) method = "writestr" + elif compression == "tar": + compress_method = tarfile.TarFile + mode = "w" + file = tarfile.TarInfo(name=dest) + bytes = io.BytesIO(data) + file.size = len(data) + args = (file, bytes) + method = "addfile" elif compression == "gzip": compress_method = gzip.GzipFile elif compression == "bz2": diff --git a/pandas/conftest.py b/pandas/conftest.py index eb9a952250f36..c2e41bb5693d0 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -267,7 +267,7 @@ def other_closed(request): return request.param -@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) +@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz", "tar"]) def compression(request): """ Fixture for trying common compression types in compression tests. 
@@ -275,7 +275,7 @@ def compression(request): return request.param -@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) +@pytest.fixture(params=["gzip", "bz2", "zip", "xz", "tar"]) def compression_only(request): """ Fixture for trying common compression types in compression tests excluding diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 57f151feeae80..a56bc43d14455 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2341,6 +2341,7 @@ def to_json( default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool_t = False, compression: CompressionOptions = "infer", + mode: str = "w", index: bool_t = True, indent: int | None = None, storage_options: StorageOptions = None, @@ -2604,6 +2605,7 @@ def to_json( default_handler=default_handler, lines=lines, compression=compression, + mode=mode, index=index, indent=indent, storage_options=storage_options, @@ -2923,6 +2925,7 @@ def to_pickle( self, path, compression: CompressionOptions = "infer", + mode: str = "wb", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: @@ -2990,6 +2993,7 @@ def to_pickle( self, path, compression=compression, + mode=mode, protocol=protocol, storage_options=storage_options, ) diff --git a/pandas/io/common.py b/pandas/io/common.py index fbbeda9cc5d64..27605233c5f49 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -10,6 +10,7 @@ from io import ( BufferedIOBase, BytesIO, + FileIO, RawIOBase, StringIO, TextIOBase, @@ -758,18 +759,22 @@ def get_handle( # TAR Encoding elif compression == "tar": - tar = tarfile.open(handle, "r:*") - handles.append(tar) - files = tar.getnames() - if len(files) == 1: - handle = tar.extractfile(files[0]) - elif len(files) == 0: - raise ValueError(f"Zero files found in TAR archive {path_or_buf}") + if is_path: + handle = _BytesTarFile.open(name=handle, mode=ioargs.mode) else: - raise ValueError( - "Multiple files found in TAR archive. 
" - f"Only one file per TAR archive: {files}" - ) + handle = _BytesTarFile.open(fileobj=handle, mode=ioargs.mode) + if handle.mode == "r": + handles.append(handle) + files = handle.getnames() + if len(files) == 1: + handle = handle.extractfile(files[0]) + elif len(files) == 0: + raise ValueError(f"Zero files found in TAR archive {path_or_buf}") + else: + raise ValueError( + "Multiple files found in TAR archive. " + f"Only one file per TAR archive: {files}" + ) # XZ Compression elif compression == "xz": @@ -852,6 +857,80 @@ def get_handle( ) +class _BytesTarFile(tarfile.TarFile, BytesIO): + + # GH 17778 + def __init__( + self, + name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], + mode: str, + fileobj: FileIO, + archive_name: str | None = None, + **kwargs, + ): + self.archive_name = archive_name + self.multiple_write_buffer: StringIO | BytesIO | None = None + self._closing = False + + super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs) + + @classmethod + def open(cls, mode="r", **kwargs): + mode = mode.replace("b", "") + return super().open(mode=mode, **kwargs) + + def infer_filename(self): + """ + If an explicit archive_name is not given, we still want the file inside the zip + file not to be named something.tar, because that causes confusion (GH39465). 
+ """ + if isinstance(self.name, (os.PathLike, str)): + filename = Path(self.name) + if filename.suffix == ".tar": + return filename.with_suffix("").name + if filename.suffix in [".tar.gz", ".tar.bz2", ".tar.xz"]: + return filename.with_suffix("").with_suffix("").name + return filename.name + return None + + def write(self, data): + # buffer multiple write calls, write on flush + if self.multiple_write_buffer is None: + self.multiple_write_buffer = ( + BytesIO() if isinstance(data, bytes) else StringIO() + ) + self.multiple_write_buffer.write(data) + + def flush(self) -> None: + # write to actual handle and close write buffer + if self.multiple_write_buffer is None or self.multiple_write_buffer.closed: + return + + # TarFile needs a non-empty string + archive_name = self.archive_name or self.infer_filename() or "tar" + with self.multiple_write_buffer: + value = self.multiple_write_buffer.getvalue() + tarinfo = tarfile.TarInfo(name=archive_name) + tarinfo.size = len(value) + self.addfile(tarinfo, io.BytesIO(value)) + + def close(self): + self.flush() + super().close() + + @property + def closed(self): + if self.multiple_write_buffer is None: + return False + return self.multiple_write_buffer.closed and super().closed + + @closed.setter + def closed(self, value): + if not self._closing and value: + self._closing = True + self.close() + + # error: Definition of "__exit__" in base class "ZipFile" is incompatible with # definition in base class "BytesIO" [misc] # error: Definition of "__enter__" in base class "ZipFile" is incompatible with diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 83cbac48e6a2b..6f6ee951f392a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -83,6 +83,7 @@ def to_json( default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool = False, compression: CompressionOptions = "infer", + mode: str = "w", index: bool = True, indent: int = 0, storage_options: StorageOptions = None, @@ -127,7 +128,7 
@@ def to_json( if path_or_buf is not None: # apply compression and byte/text conversion with get_handle( - path_or_buf, "w", compression=compression, storage_options=storage_options + path_or_buf, mode, compression=compression, storage_options=storage_options ) as handles: handles.handle.write(s) else: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 29cacc12b1da0..d2837c49b3c63 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -25,6 +25,7 @@ def to_pickle( obj: Any, filepath_or_buffer: FilePath | WriteBuffer[bytes], compression: CompressionOptions = "infer", + mode: str = "wb", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ): @@ -95,7 +96,7 @@ def to_pickle( with get_handle( filepath_or_buffer, - "wb", + mode, compression=compression, is_text=False, storage_options=storage_options, diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 3c278cb48e20f..056bc85241215 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -14,6 +14,10 @@ import pandas.io.common as icom +def flip(my_dict: dict): + return {value: key for key, value in my_dict.items()} + + @pytest.mark.parametrize( "obj", [ @@ -26,8 +30,13 @@ ) @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_compression_size(obj, method, compression_only): + kwargs = {} + + if compression_only == "tar": + kwargs["mode"] = "w:gz" + with tm.ensure_clean() as path: - getattr(obj, method)(path, compression=compression_only) + getattr(obj, method)(path, compression=compression_only, **kwargs) compressed_size = os.path.getsize(path) getattr(obj, method)(path, compression=None) uncompressed_size = os.path.getsize(path) @@ -72,7 +81,7 @@ def test_dataframe_compression_defaults_to_infer( ): # GH22004 input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) - extension = icom._compression_to_extension[compression_only] + extension = 
flip(icom._extension_to_compression)[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) @@ -92,7 +101,7 @@ def test_series_compression_defaults_to_infer( ): # GH22004 input = pd.Series([0, 5, -2, 10], name="X") - extension = icom._compression_to_extension[compression_only] + extension = flip(icom._extension_to_compression)[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) if "squeeze" in read_kwargs: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 2e8e4a9017dbc..30ddf0e9e9b75 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -1,5 +1,6 @@ from io import BytesIO import os +import tarfile import zipfile import numpy as np @@ -104,6 +105,14 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): ) as res: for res_info, exp_info in zip(res.infolist(), exp.infolist()): assert res_info.CRC == exp_info.CRC + elif compression == "tar": + with tarfile.open(fileobj=BytesIO(result)) as exp, tarfile.open( + fileobj=BytesIO(expected) + ) as res: + for res_info, exp_info in zip(res.getmembers(), exp.getmembers()): + assert ( + res.extractfile(res_info).read() == exp.extractfile(exp_info).read() + ) else: assert result == expected diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index aa80df1bcbd38..41c2734fdf149 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -21,6 +21,7 @@ from pathlib import Path import pickle import shutil +import tarfile from warnings import ( catch_warnings, filterwarnings, @@ -306,13 +307,18 @@ def compress_file(self, src_path, dest_path, compression): elif compression == "zip": with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, os.path.basename(src_path)) + elif 
compression == "tar": + with open(src_path, "rb") as fh: + with tarfile.open(dest_path, mode="w") as tar: + tarinfo = tar.gettarinfo(src_path, os.path.basename(src_path)) + tar.addfile(tarinfo, fh) elif compression == "xz": f = get_lzma_file()(dest_path, "w") else: msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) - if compression != "zip": + if compression not in ["zip", "tar"]: with open(src_path, "rb") as fh, f: f.write(fh.read()) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index eb457d74c6a01..8f396ff78c047 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -5,6 +5,7 @@ import io import os import struct +import tarfile import warnings import zipfile @@ -1899,6 +1900,9 @@ def test_compression(compression, version, use_dict, infer): elif compression == "zip": with zipfile.ZipFile(path, "r") as comp: fp = io.BytesIO(comp.read(comp.filelist[0])) + elif compression == "tar": + with tarfile.open(path) as tar: + fp = io.BytesIO(tar.extractfile(tar.getnames()[0]).read()) elif compression == "bz2": with bz2.open(path, "rb") as comp: fp = io.BytesIO(comp.read()) From c6573efdabb07ab5074a6b3b06ab63d9ac29a04e Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 15 Dec 2021 10:21:16 +0000 Subject: [PATCH 08/33] feat: assure it respects .gz endings --- pandas/io/common.py | 20 ++++++++++++++++-- pandas/tests/io/parser/test_compression.py | 24 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 27605233c5f49..58735f4b5d0dd 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -875,9 +875,25 @@ def __init__( super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs) @classmethod - def open(cls, mode="r", **kwargs): + def open(cls, name=None, mode="r", **kwargs): mode = mode.replace("b", "") - return super().open(mode=mode, **kwargs) + return super().open(name=name, mode=cls.extend_mode(name, 
mode), **kwargs) + + @classmethod + def extend_mode( + cls, name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], mode: str + ) -> str: + if mode != "w": + return mode + if isinstance(name, (os.PathLike, str)): + filename = Path(name) + if filename.suffix == ".gz": + return mode + ":gz" + elif filename.suffix == ".xz": + return mode + ":xz" + elif filename.suffix == ".bz2": + return mode + ":bz2" + return mode def infer_filename(self): """ diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index e0857eb8f6ce8..a411fca91ecf0 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -5,6 +5,7 @@ import os from pathlib import Path +import tarfile import zipfile import pytest @@ -180,3 +181,26 @@ def test_ignore_compression_extension(all_parsers): Path(path_zip).write_text(Path(path_csv).read_text()) tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) + + +@skip_pyarrow +def test_writes_tar_gz(all_parsers): + parser = all_parsers + data = DataFrame( + { + "Country": ["Venezuela", "Venezuela"], + "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], + } + ) + with tm.ensure_clean("test.tar.gz") as tar_path: + data.to_csv(tar_path, index=False) + + # test that read_csv infers .tar.gz to gzip: + tm.assert_frame_equal(parser.read_csv(tar_path), data) + + # test that file is indeed gzipped: + with tarfile.open(tar_path, "r:gz") as tar: + result = parser.read_csv( + tar.extractfile(tar.getnames()[0]), compression="infer" + ) + tm.assert_frame_equal(result, data) From a4ac382898e07e406b9ab4e2b0c147a9d32f09b7 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 15 Dec 2021 11:12:25 +0000 Subject: [PATCH 09/33] feat: add "tar" entry to compressionoptions --- pandas/_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 95277e97eae98..d35cfe50a3ae8 100644 --- a/pandas/_typing.py +++ 
b/pandas/_typing.py @@ -243,7 +243,7 @@ def closed(self) -> bool: # compression keywords and compression CompressionDict = Dict[str, Any] CompressionOptions = Optional[ - Union[Literal["infer", "gzip", "bz2", "zip", "xz"], CompressionDict] + Union[Literal["infer", "gzip", "bz2", "zip", "xz", "tar"], CompressionDict] ] From e66826b9e1bb430f3df704634e6006af3133516a Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 15 Dec 2021 11:29:30 +0000 Subject: [PATCH 10/33] chore: add whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 372f991d96a22..79703d1ea5c44 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -196,6 +196,29 @@ representation of :class:`DataFrame` objects (:issue:`4889`). .. _whatsnew_140.enhancements.other: +Reading directly from TAR archives +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing +directly on TAR archives (:issue:`44787`). + +.. code-block:: python + + df = pd.read_csv("./movement.tar.gz") + # ... + df.to_csv("./out.tar.gz") + +This supports ``.tar``, ``.tar.gz``, ``.tar.bz2`` and ``.tar.xz`` archives. +The used compression method is inferred from the filename. +If the compression method cannot be inferred, use the ``compression`` argument: + +.. code-block:: python + + df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"}) # noqa F821 + +(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open) + + Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`concat` will preserve the ``attrs`` when it is the same for all objects and discard the ``attrs`` when they are different.
(:issue:`41828`) From 941be377d28f61dcfdc7afdd87bb84c04034194f Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 15 Dec 2021 11:35:33 +0000 Subject: [PATCH 11/33] fix: test_compression_size_fh --- pandas/tests/io/test_compression.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 056bc85241215..29f7c35b90395 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -56,7 +56,11 @@ def test_compression_size(obj, method, compression_only): @pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - with icom.get_handle(path, "w", compression=compression_only) as handles: + with icom.get_handle( + path, + "w:gz" if compression_only == "tar" else "w", + compression=compression_only, + ) as handles: getattr(obj, method)(handles.handle) assert not handles.handle.closed compressed_size = os.path.getsize(path) From 0468e5f38286c8644e4d91a1b6e91ea1e42a8d07 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 08:38:20 +0100 Subject: [PATCH 12/33] add tarfile to shared compression docs --- pandas/core/shared_docs.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index f79fd3ed09f8d..d54e62fa34f19 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -407,11 +407,11 @@ ] = """compression : str or dict, default 'infer' For on-the-fly compression of the output data. If 'infer' and '%s' path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). Set to + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). Set to ``None`` for no compression. 
Can also be a dict with key ``'method'`` set - to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or ``tarfile.TarFile``, respectively. As an example, the following could be passed for faster compression and to create a reproducible gzip archive: ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.""" @@ -421,12 +421,12 @@ ] = """compression : str or dict, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using - 'zip', the ZIP file must contain only one data file to be read in. Set to + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). If using + 'zip' or 'tar', the ZIP file must contain only one data file to be read in. Set to ``None`` for no decompression. Can also be a dict with key ``'method'`` set - to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or ``tarfile.TarFile``, respectively. 
As an example, the following could be passed for Zstandard decompression using a custom compression dictionary: ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.""" From 2531ee0b704c514f448eb934c38cb02da30dc86b Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 09:00:41 +0100 Subject: [PATCH 13/33] fix formatting --- pandas/core/shared_docs.py | 30 +++++++++++++++++++----------- pandas/io/common.py | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index d54e62fa34f19..af881f4d4bbaf 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -407,12 +407,16 @@ ] = """compression : str or dict, default 'infer' For on-the-fly compression of the output data. If 'infer' and '%s' path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). Set to - ``None`` for no compression. Can also be a dict with key ``'method'`` set + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' + (otherwise no compression). + Set to ``None`` for no compression. + Can also be a dict with key ``'method'`` set to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other - key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or ``tarfile.TarFile``, respectively. As an - example, the following could be passed for faster compression and to create + key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or + ``tarfile.TarFile``, respectively. 
+ As an example, the following could be passed for faster compression and to create a reproducible gzip archive: ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.""" @@ -421,13 +425,17 @@ ] = """compression : str or dict, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). If using - 'zip' or 'tar', the ZIP file must contain only one data file to be read in. Set to - ``None`` for no decompression. Can also be a dict with key ``'method'`` set + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' + (otherwise no compression). + If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in. + Set to ``None`` for no decompression. + Can also be a dict with key ``'method'`` set to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other - key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or ``tarfile.TarFile``, respectively. As an - example, the following could be passed for Zstandard decompression using a + key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or + ``tarfile.TarFile``, respectively. 
+ As an example, the following could be passed for Zstandard decompression using a custom compression dictionary: ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.""" diff --git a/pandas/io/common.py b/pandas/io/common.py index 04779a84cf613..c14872d8d5c0b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -19,8 +19,8 @@ import mmap import os from pathlib import Path -import tarfile import re +import tarfile from typing import ( IO, Any, From 57eba0ad7b63c984fb69c8c2a7e1a40ae3721bd6 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 08:27:49 +0000 Subject: [PATCH 14/33] pass through "mode" via compression args --- pandas/core/generic.py | 4 ---- pandas/io/common.py | 6 ++++-- pandas/tests/io/test_compression.py | 6 ++---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 68d2b38264030..1e25b0f4eb176 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2345,7 +2345,6 @@ def to_json( default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool_t = False, compression: CompressionOptions = "infer", - mode: str = "w", index: bool_t = True, indent: int | None = None, storage_options: StorageOptions = None, @@ -2607,7 +2606,6 @@ def to_json( default_handler=default_handler, lines=lines, compression=compression, - mode=mode, index=index, indent=indent, storage_options=storage_options, @@ -2949,7 +2947,6 @@ def to_pickle( self, path, compression: CompressionOptions = "infer", - mode: str = "wb", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: @@ -3007,7 +3004,6 @@ def to_pickle( self, path, compression=compression, - mode=mode, protocol=protocol, storage_options=storage_options, ) diff --git a/pandas/io/common.py b/pandas/io/common.py index c14872d8d5c0b..dfe01fb3467a7 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -763,10 +763,12 @@ def get_handle( # TAR Encoding elif 
compression == "tar": + if "mode" not in compression_args: + compression_args["mode"] = ioargs.mode if is_path: - handle = _BytesTarFile.open(name=handle, mode=ioargs.mode) + handle = _BytesTarFile.open(name=handle, **compression_args) else: - handle = _BytesTarFile.open(fileobj=handle, mode=ioargs.mode) + handle = _BytesTarFile.open(fileobj=handle, **compression_args) if handle.mode == "r": handles.append(handle) files = handle.getnames() diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 29f7c35b90395..c14807e5cf96f 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -30,13 +30,11 @@ def flip(my_dict: dict): ) @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_compression_size(obj, method, compression_only): - kwargs = {} - if compression_only == "tar": - kwargs["mode"] = "w:gz" + compression_only = {"method": "tar", "mode": "w:gz"} with tm.ensure_clean() as path: - getattr(obj, method)(path, compression=compression_only, **kwargs) + getattr(obj, method)(path, compression=compression_only) compressed_size = os.path.getsize(path) getattr(obj, method)(path, compression=None) uncompressed_size = os.path.getsize(path) From 38f7d541ae7bde7ac08339f6adf9d88947706206 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 08:36:41 +0000 Subject: [PATCH 15/33] fix pickle test --- pandas/tests/io/test_pickle.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 8d139a1a29bd6..3342aba90b76c 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -289,9 +289,7 @@ def get_random_path(): class TestCompression: - _extension_to_compression = { - ext: compression for compression, ext in icom._compression_to_extension.items() - } + _extension_to_compression = icom._extension_to_compression def compress_file(self, src_path, dest_path, 
compression): if compression is None: @@ -550,7 +548,7 @@ def test_pickle_binary_object_compression(compression): buffer.seek(0) # gzip and zip safe the filename: cannot compare the compressed content - assert buffer.getvalue() == reference or compression in ("gzip", "zip") + assert buffer.getvalue() == reference or compression in ("gzip", "zip", "tar") # read read_df = pd.read_pickle(buffer, compression=compression) From 887fd10f80e380d7bd907894927a8a368556dce1 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 08:49:58 +0000 Subject: [PATCH 16/33] add class comment --- pandas/io/common.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index dfe01fb3467a7..f3e82d6ad1fb0 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -874,6 +874,13 @@ def get_handle( class _BytesTarFile(tarfile.TarFile, BytesIO): + """ + Wrapper for standard library class TarFile and allow the returned file-like + handle to accept byte strings via `write` method. + + BytesIO provides attributes of file-like object and TarFile.addfile writes + bytes strings into a member of the archive. 
+ """ # GH 17778 def __init__( From 669d942d7eb4c54a1f83f874999197b5c2d09fe0 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 11:22:14 +0200 Subject: [PATCH 17/33] sort imports --- pandas/_testing/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 826efb18bafd1..1ef65f761c3f6 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -4,8 +4,8 @@ from functools import wraps import gzip import io -import tarfile import socket +import tarfile from typing import ( TYPE_CHECKING, Any, From 7d7d3c6d4d11c13ebc5dcc3212a32dead8f7f007 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 11:51:34 +0200 Subject: [PATCH 18/33] add _compression_to_extension back for backwards compatibility --- pandas/io/common.py | 1 + pandas/tests/io/test_compression.py | 8 ++------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index d44441b0586d7..f9cdc3f25c371 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -464,6 +464,7 @@ def file_path_to_url(path: str) -> str: ".zst": "zstd", } _supported_compressions = set(_extension_to_compression.values()) +_compression_to_extension = {value: key for key, value in _extension_to_compression.items()} def get_compression_method( diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index c14807e5cf96f..2d7daafb192a4 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -14,10 +14,6 @@ import pandas.io.common as icom -def flip(my_dict: dict): - return {value: key for key, value in my_dict.items()} - - @pytest.mark.parametrize( "obj", [ @@ -83,7 +79,7 @@ def test_dataframe_compression_defaults_to_infer( ): # GH22004 input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) - extension = flip(icom._extension_to_compression)[compression_only] + extension = 
icom._extension_to_compression[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) @@ -103,7 +99,7 @@ def test_series_compression_defaults_to_infer( ): # GH22004 input = pd.Series([0, 5, -2, 10], name="X") - extension = flip(icom._extension_to_compression)[compression_only] + extension = icom._extension_to_compression[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) if "squeeze" in read_kwargs: From 8b8b8ac1a39df3e9de1f7c389223548a7f1c27ba Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 11:51:57 +0200 Subject: [PATCH 19/33] fix some type warnings --- pandas/io/common.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index f9cdc3f25c371..38605b54d1e53 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -767,7 +767,7 @@ def get_handle( handle = _BytesTarFile.open(name=handle, **compression_args) else: handle = _BytesTarFile.open(fileobj=handle, **compression_args) - if handle.mode == "r": + if handle.mode == "r": # type: ignore[arg-type] handles.append(handle) files = handle.getnames() if len(files) == 1: @@ -871,6 +871,18 @@ def get_handle( ) +# error: Definition of "__exit__" in base class "TarFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "TarFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "TarFile" is incompatible with +# definition in base class "BinaryIO" [misc] +# error: Definition of "__enter__" in base class "TarFile" is incompatible with +# definition in base class "IO" [misc] +# error: Definition of "read" in base class "TarFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: 
Definition of "read" in base class "TarFile" is incompatible with +# definition in base class "IO" [misc] class _BytesTarFile(tarfile.TarFile, BytesIO): """ Wrapper for standard library class TarFile and allow the returned file-like From dd356f674ea6a135c2c1889682ce895bbe075e78 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 12:20:01 +0200 Subject: [PATCH 20/33] fix: formatting --- pandas/io/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 38605b54d1e53..c462b4f7f0d86 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -464,7 +464,9 @@ def file_path_to_url(path: str) -> str: ".zst": "zstd", } _supported_compressions = set(_extension_to_compression.values()) -_compression_to_extension = {value: key for key, value in _extension_to_compression.items()} +_compression_to_extension = { + value: key for key, value in _extension_to_compression.items() +} def get_compression_method( @@ -767,7 +769,7 @@ def get_handle( handle = _BytesTarFile.open(name=handle, **compression_args) else: handle = _BytesTarFile.open(fileobj=handle, **compression_args) - if handle.mode == "r": # type: ignore[arg-type] + if handle.mode == "r": # type: ignore[arg-type] handles.append(handle) files = handle.getnames() if len(files) == 1: From 514014ae7129fbbe36b5e181c708b477a81f1731 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 12:48:24 +0200 Subject: [PATCH 21/33] fix: mypy complaints --- pandas/io/common.py | 24 ++++++++++++++---------- pandas/tests/io/test_gcs.py | 16 ++++++++++------ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c462b4f7f0d86..015196f898a38 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -769,11 +769,14 @@ def get_handle( handle = _BytesTarFile.open(name=handle, **compression_args) else: handle = _BytesTarFile.open(fileobj=handle, **compression_args) - if handle.mode == "r": 
# type: ignore[arg-type] + assert isinstance(handle, _BytesTarFile) + if handle.mode == "r": handles.append(handle) files = handle.getnames() if len(files) == 1: - handle = handle.extractfile(files[0]) + file = handle.extractfile(files[0]) + assert file is not None + handle = file elif len(files) == 0: raise ValueError(f"Zero files found in TAR archive {path_or_buf}") else: @@ -885,7 +888,7 @@ def get_handle( # definition in base class "BytesIO" [misc] # error: Definition of "read" in base class "TarFile" is incompatible with # definition in base class "IO" [misc] -class _BytesTarFile(tarfile.TarFile, BytesIO): +class _BytesTarFile(tarfile.TarFile, BytesIO): # type: ignore[misc] """ Wrapper for standard library class TarFile and allow the returned file-like handle to accept byte strings via `write` method. @@ -897,14 +900,14 @@ class _BytesTarFile(tarfile.TarFile, BytesIO): # GH 17778 def __init__( self, - name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], - mode: str, + name: str | bytes | os.PathLike[str] | os.PathLike[bytes], + mode: Literal["r", "a", "w", "x"], fileobj: FileIO, archive_name: str | None = None, **kwargs, ): self.archive_name = archive_name - self.multiple_write_buffer: StringIO | BytesIO | None = None + self.multiple_write_buffer: BytesIO | None = None self._closing = False super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs) @@ -936,7 +939,10 @@ def infer_filename(self): file not to be named something.tar, because that causes confusion (GH39465). 
""" if isinstance(self.name, (os.PathLike, str)): - filename = Path(self.name) + # error: Argument 1 to "Path" has + # incompatible type "Union[str, PathLike[str], PathLike[bytes]]"; + # expected "Union[str, PathLike[str]]" [arg-type] + filename = Path(self.name) # type: ignore[arg-type] if filename.suffix == ".tar": return filename.with_suffix("").name if filename.suffix in [".tar.gz", ".tar.bz2", ".tar.xz"]: @@ -947,9 +953,7 @@ def infer_filename(self): def write(self, data): # buffer multiple write calls, write on flush if self.multiple_write_buffer is None: - self.multiple_write_buffer = ( - BytesIO() if isinstance(data, bytes) else StringIO() - ) + self.multiple_write_buffer = BytesIO() self.multiple_write_buffer.write(data) def flush(self) -> None: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index b41fb4c5e64cc..ef590b2cf48d3 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -108,13 +108,17 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): for res_info, exp_info in zip(res.infolist(), exp.infolist()): assert res_info.CRC == exp_info.CRC elif compression == "tar": - with tarfile.open(fileobj=BytesIO(result)) as exp, tarfile.open( + with tarfile.open(fileobj=BytesIO(result)) as tar_exp, tarfile.open( fileobj=BytesIO(expected) - ) as res: - for res_info, exp_info in zip(res.getmembers(), exp.getmembers()): - assert ( - res.extractfile(res_info).read() == exp.extractfile(exp_info).read() - ) + ) as tar_res: + for tar_res_info, tar_exp_info in zip( + tar_res.getmembers(), tar_exp.getmembers() + ): + actual_file = tar_res.extractfile(tar_res_info) + expected_file = tar_exp.extractfile(tar_exp_info) + assert (actual_file is None) == (expected_file is None) + if actual_file is not None and expected_file is not None: + assert actual_file.read() == expected_file.read() else: assert result == expected From 38971c7163f1fe829900f1870aca754e28ed98a3 Mon Sep 17 00:00:00 2001 From: Simon 
Knott Date: Sat, 9 Apr 2022 12:58:45 +0200 Subject: [PATCH 22/33] fix: more tests --- pandas/tests/io/test_compression.py | 4 ++-- pandas/tests/io/xml/test_xml.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 2d7daafb192a4..66e29a552f04a 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -79,7 +79,7 @@ def test_dataframe_compression_defaults_to_infer( ): # GH22004 input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) - extension = icom._extension_to_compression[compression_only] + extension = icom._compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) @@ -99,7 +99,7 @@ def test_series_compression_defaults_to_infer( ): # GH22004 input = pd.Series([0, 5, -2, 10], name="X") - extension = icom._extension_to_compression[compression_only] + extension = icom._compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) if "squeeze" in read_kwargs: diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index bfb6bb19452bd..6b47d81608621 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1370,6 +1370,7 @@ def test_wrong_compression(parser, compression, compression_only): "bz2": (OSError, "Invalid data stream"), "gzip": (OSError, "Not a gzipped file"), "zip": (BadZipFile, "File is not a zip file"), + "tar": (BadZipFile, "File is not a zip file"), } zstd = import_optional_dependency("zstandard", errors="ignore") if zstd is not None: From e35d361e1fa8ccaf2ef0667ac7c7a67a3ad8569d Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 13:13:30 +0200 Subject: [PATCH 23/33] fix: some error with xml --- 
pandas/tests/io/xml/test_xml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 6b47d81608621..277b6442a0a8c 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -6,6 +6,7 @@ ) from lzma import LZMAError import os +from tarfile import ReadError from urllib.error import HTTPError from zipfile import BadZipFile @@ -1370,7 +1371,7 @@ def test_wrong_compression(parser, compression, compression_only): "bz2": (OSError, "Invalid data stream"), "gzip": (OSError, "Not a gzipped file"), "zip": (BadZipFile, "File is not a zip file"), - "tar": (BadZipFile, "File is not a zip file"), + "tar": (ReadError, "file could not be opened successfully"), } zstd = import_optional_dependency("zstandard", errors="ignore") if zstd is not None: From c5088fc60a94bd74fe0becece986551a2a09188c Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 13:51:03 +0200 Subject: [PATCH 24/33] fix: interpreted text role --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index b102fa6696a65..9a8c0fc96f646 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -202,7 +202,7 @@ library to produce a tight representation of :class:`DataFrame` objects Reading directly from TAR archives ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -I/O methods like :function:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing +I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing directly on TAR archives (:issue:`44787`). .. 
code-block:: python From f6c51738704feb9cb5766d85531d1373aa95fa3f Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 13:52:34 +0200 Subject: [PATCH 25/33] move to v1.5 whatsnw --- doc/source/whatsnew/v1.4.0.rst | 23 ----------------------- doc/source/whatsnew/v1.5.0.rst | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9a8c0fc96f646..52aa9312d4c14 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -199,29 +199,6 @@ library to produce a tight representation of :class:`DataFrame` objects .. _whatsnew_140.enhancements.other: -Reading directly from TAR archives -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing -directly on TAR archives (:issue:`44787`). - -.. code-block:: python - - df = pd.read_csv("./movement.tar.gz") - # ... - df.to_csv("./out.tar.gz") - -This supports ``.tar``, ``.tar.gz``, ``.tar.bz`` and ``.tar.xz2`` archives. -The used compression method is inferred from the filename. -If the compression method cannot be inferred, use the ``compression`` argument: - -.. code-block:: python - - df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"}) # noqa F821 - -(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open) - - Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`concat` will preserve the ``attrs`` when it is the same for all objects and discard the ``attrs`` when they are different (:issue:`41828`) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4920622a15f3f..9923a0eb3b770 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -75,6 +75,31 @@ as seen in the following example. 1 2021-01-02 08:00:00 4 2 2021-01-02 16:00:00 5 +.. 
_whatsnew_150.enhancements.tar: + +Reading directly from TAR archives +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing +directly on TAR archives (:issue:`44787`). + +.. code-block:: python + + df = pd.read_csv("./movement.tar.gz") + # ... + df.to_csv("./out.tar.gz") + +This supports ``.tar``, ``.tar.gz``, ``.tar.bz`` and ``.tar.xz2`` archives. +The used compression method is inferred from the filename. +If the compression method cannot be inferred, use the ``compression`` argument: + +.. code-block:: python + + df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"}) # noqa F821 + +(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open) + + .. _whatsnew_150.enhancements.other: Other enhancements From 9a4fa074f32916bbba4f2eb76be1ed6e2835feeb Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 11 Apr 2022 10:24:46 +0200 Subject: [PATCH 26/33] add versionadded note --- pandas/core/shared_docs.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index f4afc14dbddc6..69b4bb4c35585 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -432,7 +432,11 @@ ``tarfile.TarFile``, respectively. As an example, the following could be passed for faster compression and to create a reproducible gzip archive: - ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.""" + ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files. + """ _shared_docs[ "decompression_options" @@ -451,7 +455,11 @@ ``tarfile.TarFile``, respectively. 
As an example, the following could be passed for Zstandard decompression using a custom compression dictionary: - ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.""" + ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files. + """ _shared_docs[ "replace" From 0c31aa8def7d50d418d2ef3750e2ee0017cfa5fc Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 11 Apr 2022 11:10:43 +0200 Subject: [PATCH 27/33] don't leave blank lines --- pandas/core/shared_docs.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 69b4bb4c35585..33b6d74e83fdc 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -435,8 +435,7 @@ ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. .. versionadded:: 1.5.0 - Added support for `.tar` files. - """ + Added support for `.tar` files.""" _shared_docs[ "decompression_options" @@ -458,8 +457,7 @@ ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. .. versionadded:: 1.5.0 - Added support for `.tar` files. 
- """ + Added support for `.tar` files.""" _shared_docs[ "replace" From 086c59883e169fe021873f25304e7bb09f2823fd Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 13 Apr 2022 12:03:01 +0200 Subject: [PATCH 28/33] add tests for zero files / multiple files --- pandas/tests/io/test_compression.py | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 66e29a552f04a..5ac0197d275dd 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -3,8 +3,10 @@ from pathlib import Path import subprocess import sys +import tarfile import textwrap import time +import zipfile import pytest @@ -262,3 +264,46 @@ def test_bzip_compression_level(obj, method): """ with tm.ensure_clean() as path: getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1}) + + +@pytest.mark.parametrize( + "suffix,archive", + [ + (".zip", zipfile.ZipFile), + (".tar", tarfile.TarFile), + ], +) +def test_empty_archive_zip(suffix, archive): + with tm.ensure_clean(filename=suffix) as path: + file = archive(path, "w") + file.close() + with pytest.raises(ValueError, match="Zero files found"): + pd.read_csv(path) + + +def test_ambiguous_archive_zip(): + with tm.ensure_clean(filename=".zip") as path: + file = zipfile.ZipFile(path, "w") + file.writestr("a.csv", "foo,bar") + file.writestr("b.csv", "foo,bar") + file.close() + with pytest.raises(ValueError, match="Multiple files found in ZIP file"): + pd.read_csv(path) + + +def test_ambiguous_archive_tar(): + with tm.ensure_clean_dir() as dir: + csvAPath = os.path.join(dir, "a.csv") + with open(csvAPath, "w") as a: + a.write("foo,bar\n") + csvBPath = os.path.join(dir, "b.csv") + with open(csvBPath, "w") as b: + b.write("foo,bar\n") + + tarpath = os.path.join(dir, "archive.tar") + with tarfile.TarFile(tarpath, "w") as tar: + tar.add(csvAPath, "a.csv") + tar.add(csvBPath, "b.csv") + + with 
pytest.raises(ValueError, match="Multiple files found in TAR archive"): + pd.read_csv(tarpath) From 861faf0e3cf86374946b56767f43067da0fde349 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 13 Apr 2022 14:00:55 +0200 Subject: [PATCH 29/33] move _compression_to_extension to tests --- pandas/io/common.py | 3 --- pandas/tests/io/formats/test_to_csv.py | 5 ++--- pandas/tests/io/json/test_compression.py | 5 ++--- pandas/tests/io/parser/test_compression.py | 5 ++--- pandas/tests/io/parser/test_network.py | 4 ++-- pandas/tests/io/parser/test_read_fwf.py | 4 ++-- pandas/tests/io/test_compression.py | 8 ++++++-- pandas/tests/io/test_gcs.py | 5 ++--- pandas/tests/io/test_stata.py | 6 +++--- pandas/tests/io/xml/test_to_xml.py | 4 ++-- 10 files changed, 23 insertions(+), 26 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 015196f898a38..15a8f2e114041 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -464,9 +464,6 @@ def file_path_to_url(path: str) -> str: ".zst": "zstd", } _supported_compressions = set(_extension_to_compression.values()) -_compression_to_extension = { - value: key for key, value in _extension_to_compression.items() -} def get_compression_method( diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index d3f8e27c47e98..b5096934af4cb 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -13,8 +13,7 @@ compat, ) import pandas._testing as tm - -import pandas.io.common as icom +from pandas.tests.io.test_compression import _compression_to_extension class TestToCSV: @@ -555,7 +554,7 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): # We'll complete file extension subsequently. filename = "test." 
- filename += icom._compression_to_extension[compression] + filename += _compression_to_extension[compression] df = DataFrame({"A": [1]}) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 42e7b6cd03f55..ab97fb1740496 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -6,8 +6,7 @@ import pandas as pd import pandas._testing as tm - -import pandas.io.common as icom +from pandas.tests.io.test_compression import _compression_to_extension def test_compression_roundtrip(compression): @@ -100,7 +99,7 @@ def test_to_json_compression(compression_only, read_infer, to_infer): # We'll complete file extension subsequently. filename = "test." - filename += icom._compression_to_extension[compression] + filename += _compression_to_extension[compression] df = pd.DataFrame({"A": [1]}) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index dd48663887d25..121784d5a45ed 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -12,8 +12,7 @@ from pandas import DataFrame import pandas._testing as tm - -import pandas.io.common as icom +from pandas.tests.io.test_compression import _compression_to_extension skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -96,7 +95,7 @@ def test_compression(request, parser_and_data, compression_only, buffer, filenam parser, data, expected = parser_and_data compress_type = compression_only - ext = icom._compression_to_extension[compress_type] + ext = _compression_to_extension[compress_type] filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 93924c9b670c2..0b16d1d9ec6b0 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -16,8 +16,8 @@ from pandas import 
DataFrame import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.feather_format import read_feather from pandas.io.parsers import read_csv @@ -35,7 +35,7 @@ def test_compressed_urls(salaries_table, mode, engine, compression_only): # test reading compressed urls with various engines and # extension inference - extension = icom._compression_to_extension[compression_only] + extension = _compression_to_extension[compression_only] base_url = ( "https://github.com/pandas-dev/pandas/raw/main/" "pandas/tests/io/parser/data/salaries.csv" diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index f3d41332502af..d6d787df39dfa 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -21,8 +21,8 @@ DatetimeIndex, ) import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.parsers import ( read_csv, read_fwf, @@ -656,7 +656,7 @@ def test_fwf_compression(compression_only, infer): 3333333333""".strip() compression = compression_only - extension = icom._compression_to_extension[compression] + extension = _compression_to_extension[compression] kwargs = {"widths": [5, 5], "names": ["one", "two"]} expected = read_fwf(StringIO(data), **kwargs) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 5ac0197d275dd..a9c48d07fc986 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -15,6 +15,10 @@ import pandas.io.common as icom +_compression_to_extension = { + value: key for key, value in icom._extension_to_compression.items() +} + @pytest.mark.parametrize( "obj", @@ -81,7 +85,7 @@ def test_dataframe_compression_defaults_to_infer( ): # GH22004 input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) - extension = 
icom._compression_to_extension[compression_only] + extension = _compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) @@ -101,7 +105,7 @@ def test_series_compression_defaults_to_infer( ): # GH22004 input = pd.Series([0, 5, -2, 10], name="X") - extension = icom._compression_to_extension[compression_only] + extension = _compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) if "squeeze" in read_kwargs: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index ef590b2cf48d3..6907d8978e603 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -15,10 +15,9 @@ read_parquet, ) import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension from pandas.util import _test_decorators as td -import pandas.io.common as icom - @pytest.fixture def gcs_buffer(monkeypatch): @@ -157,7 +156,7 @@ def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding) tm.assert_frame_equal(df, read_df) # write compressed file with implicit compression - file_ext = icom._compression_to_extension[compression_only] + file_ext = _compression_to_extension[compression_only] compression["method"] = "infer" path_gcs += f".{file_ext}" df.to_csv(path_gcs, compression=compression, encoding=encoding) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 2f5da142676c3..c21673af2d979 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -21,8 +21,8 @@ Series, ) from pandas.core.indexes.api import ensure_index +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.parsers import read_csv from pandas.io.stata import ( CategoricalConversionWarning, @@ 
-1850,7 +1850,7 @@ def test_compression(compression, version, use_dict, infer): if use_dict: file_ext = compression else: - file_ext = icom._compression_to_extension[compression] + file_ext = _compression_to_extension[compression] file_name += f".{file_ext}" compression_arg = compression if infer: @@ -2005,7 +2005,7 @@ def test_compression_roundtrip(compression): def test_stata_compression(compression_only, read_infer, to_infer): compression = compression_only - ext = icom._compression_to_extension[compression] + ext = _compression_to_extension[compression] filename = f"test.{ext}" df = DataFrame( diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 2b9735f64761c..d3247eb9dd47e 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -17,8 +17,8 @@ Index, ) import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -1292,7 +1292,7 @@ def test_compression_output(parser, compression_only): def test_filename_and_suffix_comp(parser, compression_only): - compfile = "xml." + icom._compression_to_extension[compression_only] + compfile = "xml." 
+ _compression_to_extension[compression_only] with tm.ensure_clean(filename=compfile) as path: geom_df.to_xml(path, parser=parser, compression=compression_only) From 9458ecbe116c83e04ff2bd111467caa45a16f650 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 13 Apr 2022 14:14:31 +0200 Subject: [PATCH 30/33] revert added "mode" argument --- pandas/io/json/_json.py | 3 +-- pandas/io/pickle.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 1c9de2b53b275..2a9ed9f15cd11 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -81,7 +81,6 @@ def to_json( default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool = False, compression: CompressionOptions = "infer", - mode: str = "w", index: bool = True, indent: int = 0, storage_options: StorageOptions = None, @@ -126,7 +125,7 @@ def to_json( if path_or_buf is not None: # apply compression and byte/text conversion with get_handle( - path_or_buf, mode, compression=compression, storage_options=storage_options + path_or_buf, "w", compression=compression, storage_options=storage_options ) as handles: handles.handle.write(s) else: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 3636ce661fa2e..2928d8c6520b0 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -28,7 +28,6 @@ def to_pickle( obj: Any, filepath_or_buffer: FilePath | WriteBuffer[bytes], compression: CompressionOptions = "infer", - mode: str = "wb", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: @@ -97,7 +96,7 @@ def to_pickle( with get_handle( filepath_or_buffer, - mode, + "wb", compression=compression, is_text=False, storage_options=storage_options, From d20f31505960515bc43fe0074e52ecd141f1940d Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 13 Apr 2022 14:29:44 +0200 Subject: [PATCH 31/33] add test to ensure that `compression.mode` works --- pandas/tests/io/test_compression.py | 15 
+++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index a9c48d07fc986..9daa9dfd94641 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,3 +1,4 @@ +import gzip import io import os from pathlib import Path @@ -311,3 +312,17 @@ def test_ambiguous_archive_tar(): with pytest.raises(ValueError, match="Multiple files found in TAR archive"): pd.read_csv(tarpath) + + +def test_tar_gz_to_different_filename(): + with tm.ensure_clean(filename=".foo") as file: + pd.DataFrame( + [["1", "2"]], + columns=["foo", "bar"], + ).to_csv(file, compression={"method": "tar", "mode": "w:gz"}, index=False) + with gzip.open(file) as uncompressed: + with tarfile.TarFile(fileobj=uncompressed) as archive: + members = archive.getmembers() + assert len(members) == 1 + content = archive.extractfile(members[0]).read() + assert content == b"foo,bar\n1,2\n" From 0d9ed18d1b9703be8f5545f1462709da0ac71f07 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Thu, 5 May 2022 15:24:16 +0200 Subject: [PATCH 32/33] compare strings, not bytes --- pandas/tests/io/test_compression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 9daa9dfd94641..a2da8d45171e2 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -324,5 +324,5 @@ def test_tar_gz_to_different_filename(): with tarfile.TarFile(fileobj=uncompressed) as archive: members = archive.getmembers() assert len(members) == 1 - content = archive.extractfile(members[0]).read() - assert content == b"foo,bar\n1,2\n" + content = archive.extractfile(members[0]).read().decode("utf8") + assert content == "foo,bar\n1,2\n" From 37370c2f7802fa52534abbda8a0a92e017d6bb2f Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Fri, 6 May 2022 11:25:34 +0200 Subject: [PATCH 33/33] replace carriage returns --- 
pandas/tests/io/test_compression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index a2da8d45171e2..35749aabdc39f 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -325,4 +325,5 @@ def test_tar_gz_to_different_filename(): members = archive.getmembers() assert len(members) == 1 content = archive.extractfile(members[0]).read().decode("utf8") + content = content.replace("\r\n", "\n") # windows assert content == "foo,bar\n1,2\n"