diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index f0ef198d09375..2ad78fc56d8df 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -100,6 +100,31 @@ as seen in the following example. 1 2021-01-02 08:00:00 4 2 2021-01-02 16:00:00 5 +.. _whatsnew_150.enhancements.tar: + +Reading directly from TAR archives +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing +directly on TAR archives (:issue:`44787`). + +.. code-block:: python + + df = pd.read_csv("./movement.tar.gz") + # ... + df.to_csv("./out.tar.gz") + +This supports ``.tar``, ``.tar.gz``, ``.tar.bz2`` and ``.tar.xz`` archives. +The used compression method is inferred from the filename. +If the compression method cannot be inferred, use the ``compression`` argument: + +.. code-block:: python + + df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"}) # noqa F821 + +(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open) + + .. 
_whatsnew_150.enhancements.other: Other enhancements diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 4c7e669f94734..1ef65f761c3f6 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -3,7 +3,9 @@ import bz2 from functools import wraps import gzip +import io import socket +import tarfile from typing import ( TYPE_CHECKING, Any, @@ -398,6 +400,14 @@ def write_to_compressed(compression, path, data, dest="test"): mode = "w" args = (dest, data) method = "writestr" + elif compression == "tar": + compress_method = tarfile.TarFile + mode = "w" + file = tarfile.TarInfo(name=dest) + bytes = io.BytesIO(data) + file.size = len(data) + args = (file, bytes) + method = "addfile" elif compression == "gzip": compress_method = gzip.GzipFile elif compression == "bz2": diff --git a/pandas/_typing.py b/pandas/_typing.py index 1debc4265508f..e71859e91785e 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -256,7 +256,7 @@ def closed(self) -> bool: # compression keywords and compression CompressionDict = Dict[str, Any] CompressionOptions = Optional[ - Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict] + Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"], CompressionDict] ] # types in DataFrameFormatter diff --git a/pandas/conftest.py b/pandas/conftest.py index 9d98478010c97..b84d6fc9c2b99 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -294,6 +294,7 @@ def other_closed(request): "bz2", "zip", "xz", + "tar", pytest.param("zstd", marks=td.skip_if_no("zstandard")), ] ) @@ -310,6 +311,7 @@ def compression(request): "bz2", "zip", "xz", + "tar", pytest.param("zstd", marks=td.skip_if_no("zstandard")), ] ) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 890b988378870..79d79c7037992 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -421,29 +421,43 @@ ] = """compression : str or dict, default 'infer' For on-the-fly compression of the output data. 
If 'infer' and '%s' path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). Set to - ``None`` for no compression. Can also be a dict with key ``'method'`` set - to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other - key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an - example, the following could be passed for faster compression and to create + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' + (otherwise no compression). + Set to ``None`` for no compression. + Can also be a dict with key ``'method'`` set + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other + key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or + ``tarfile.TarFile``, respectively. + As an example, the following could be passed for faster compression and to create a reproducible gzip archive: - ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.""" + ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files.""" _shared_docs[ "decompression_options" ] = """compression : str or dict, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using - 'zip', the ZIP file must contain only one data file to be read in. Set to - ``None`` for no decompression. Can also be a dict with key ``'method'`` set - to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other - key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. 
As an - example, the following could be passed for Zstandard decompression using a + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' + (otherwise no compression). + If using 'zip' or 'tar', the archive must contain only one data file to be read in. + Set to ``None`` for no decompression. + Can also be a dict with key ``'method'`` set + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other + key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or + ``tarfile.TarFile``, respectively. + As an example, the following could be passed for Zstandard decompression using a custom compression dictionary: - ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.""" + ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. + + .. versionadded:: 1.5.0 + Added support for ``.tar`` files.""" _shared_docs[ "replace" diff --git a/pandas/io/common.py b/pandas/io/common.py index 57015924ce77f..15a8f2e114041 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -10,6 +10,7 @@ from io import ( BufferedIOBase, BytesIO, + FileIO, RawIOBase, StringIO, TextIOBase, @@ -19,6 +20,7 @@ import os from pathlib import Path import re +import tarfile from typing import ( IO, Any, @@ -450,13 +452,18 @@ def file_path_to_url(path: str) -> str: return urljoin("file:", pathname2url(path)) -_compression_to_extension = { - "gzip": ".gz", - "bz2": ".bz2", - "zip": ".zip", - "xz": ".xz", - "zstd": ".zst", +_extension_to_compression = { + ".tar": "tar", + ".tar.gz": "tar", + ".tar.bz2": "tar", + ".tar.xz": "tar", + ".gz": "gzip", + ".bz2": "bz2", + ".zip": "zip", + ".xz": "xz", + ".zst": "zstd", } +_supported_compressions = set(_extension_to_compression.values()) def get_compression_method( @@ -532,20 +539,18 @@ def infer_compression( return None # Infer compression from the filename/URL extension - for compression, extension in 
_compression_to_extension.items(): + for extension, compression in _extension_to_compression.items(): if filepath_or_buffer.lower().endswith(extension): return compression return None # Compression has been specified. Check that it's valid - if compression in _compression_to_extension: + if compression in _supported_compressions: return compression # https://github.com/python/mypy/issues/5492 # Unsupported operand types for + ("List[Optional[str]]" and "List[str]") - valid = ["infer", None] + sorted( - _compression_to_extension - ) # type: ignore[operator] + valid = ["infer", None] + sorted(_supported_compressions) # type: ignore[operator] msg = ( f"Unrecognized compression type: {compression}\n" f"Valid compression types are {valid}" @@ -682,7 +687,7 @@ def get_handle( ioargs.encoding, ioargs.mode, errors, - ioargs.compression["method"] not in _compression_to_extension, + ioargs.compression["method"] not in _supported_compressions, ) is_path = isinstance(handle, str) @@ -753,6 +758,30 @@ def get_handle( f"Only one file per ZIP: {zip_names}" ) + # TAR Encoding + elif compression == "tar": + if "mode" not in compression_args: + compression_args["mode"] = ioargs.mode + if is_path: + handle = _BytesTarFile.open(name=handle, **compression_args) + else: + handle = _BytesTarFile.open(fileobj=handle, **compression_args) + assert isinstance(handle, _BytesTarFile) + if handle.mode == "r": + handles.append(handle) + files = handle.getnames() + if len(files) == 1: + file = handle.extractfile(files[0]) + assert file is not None + handle = file + elif len(files) == 0: + raise ValueError(f"Zero files found in TAR archive {path_or_buf}") + else: + raise ValueError( + "Multiple files found in TAR archive. 
" + f"Only one file per TAR archive: {files}" + ) + # XZ Compression elif compression == "xz": handle = get_lzma_file()(handle, ioargs.mode) @@ -844,6 +873,116 @@ def get_handle( ) +# error: Definition of "__exit__" in base class "TarFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "TarFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "TarFile" is incompatible with +# definition in base class "BinaryIO" [misc] +# error: Definition of "__enter__" in base class "TarFile" is incompatible with +# definition in base class "IO" [misc] +# error: Definition of "read" in base class "TarFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "read" in base class "TarFile" is incompatible with +# definition in base class "IO" [misc] +class _BytesTarFile(tarfile.TarFile, BytesIO): # type: ignore[misc] + """ + Wrapper for standard library class TarFile and allow the returned file-like + handle to accept byte strings via `write` method. + + BytesIO provides attributes of file-like object and TarFile.addfile writes + bytes strings into a member of the archive. 
+ """ + + # GH 17778 + def __init__( + self, + name: str | bytes | os.PathLike[str] | os.PathLike[bytes], + mode: Literal["r", "a", "w", "x"], + fileobj: FileIO, + archive_name: str | None = None, + **kwargs, + ): + self.archive_name = archive_name + self.multiple_write_buffer: BytesIO | None = None + self._closing = False + + super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs) + + @classmethod + def open(cls, name=None, mode="r", **kwargs): + mode = mode.replace("b", "") + return super().open(name=name, mode=cls.extend_mode(name, mode), **kwargs) + + @classmethod + def extend_mode( + cls, name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], mode: str + ) -> str: + if mode != "w": + return mode + if isinstance(name, (os.PathLike, str)): + filename = Path(name) + if filename.suffix == ".gz": + return mode + ":gz" + elif filename.suffix == ".xz": + return mode + ":xz" + elif filename.suffix == ".bz2": + return mode + ":bz2" + return mode + + def infer_filename(self): + """ + If an explicit archive_name is not given, we still want the file inside the tar + file not to be named something.tar, because that causes confusion (GH39465). 
+ """ + if isinstance(self.name, (os.PathLike, str)): + # error: Argument 1 to "Path" has + # incompatible type "Union[str, PathLike[str], PathLike[bytes]]"; + # expected "Union[str, PathLike[str]]" [arg-type] + filename = Path(self.name) # type: ignore[arg-type] + if filename.suffix == ".tar": + return filename.with_suffix("").name + if filename.suffix in [".tar.gz", ".tar.bz2", ".tar.xz"]: + return filename.with_suffix("").with_suffix("").name + return filename.name + return None + + def write(self, data): + # buffer multiple write calls, write on flush + if self.multiple_write_buffer is None: + self.multiple_write_buffer = BytesIO() + self.multiple_write_buffer.write(data) + + def flush(self) -> None: + # write to actual handle and close write buffer + if self.multiple_write_buffer is None or self.multiple_write_buffer.closed: + return + + # TarFile needs a non-empty string + archive_name = self.archive_name or self.infer_filename() or "tar" + with self.multiple_write_buffer: + value = self.multiple_write_buffer.getvalue() + tarinfo = tarfile.TarInfo(name=archive_name) + tarinfo.size = len(value) + self.addfile(tarinfo, BytesIO(value)) + + def close(self): + self.flush() + super().close() + + @property + def closed(self): + if self.multiple_write_buffer is None: + return False + return self.multiple_write_buffer.closed and super().closed + + @closed.setter + def closed(self, value): + if not self._closing and value: + self._closing = True + self.close() + + # error: Definition of "__exit__" in base class "ZipFile" is incompatible with # definition in base class "BytesIO" [misc] # error: Definition of "__enter__" in base class "ZipFile" is incompatible with diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index d3f8e27c47e98..b5096934af4cb 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -13,8 +13,7 @@ compat, ) import pandas._testing as tm - -import pandas.io.common as 
icom +from pandas.tests.io.test_compression import _compression_to_extension class TestToCSV: @@ -555,7 +554,7 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): # We'll complete file extension subsequently. filename = "test." - filename += icom._compression_to_extension[compression] + filename += _compression_to_extension[compression] df = DataFrame({"A": [1]}) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 42e7b6cd03f55..ab97fb1740496 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -6,8 +6,7 @@ import pandas as pd import pandas._testing as tm - -import pandas.io.common as icom +from pandas.tests.io.test_compression import _compression_to_extension def test_compression_roundtrip(compression): @@ -100,7 +99,7 @@ def test_to_json_compression(compression_only, read_infer, to_infer): # We'll complete file extension subsequently. filename = "test." - filename += icom._compression_to_extension[compression] + filename += _compression_to_extension[compression] df = pd.DataFrame({"A": [1]}) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 576542306c164..121784d5a45ed 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -5,14 +5,14 @@ import os from pathlib import Path +import tarfile import zipfile import pytest from pandas import DataFrame import pandas._testing as tm - -import pandas.io.common as icom +from pandas.tests.io.test_compression import _compression_to_extension skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -95,7 +95,7 @@ def test_compression(request, parser_and_data, compression_only, buffer, filenam parser, data, expected = parser_and_data compress_type = compression_only - ext = icom._compression_to_extension[compress_type] + ext = _compression_to_extension[compress_type] filename = filename if 
filename is None else filename.format(ext=ext) if filename and buffer: @@ -168,6 +168,14 @@ def test_invalid_compression(all_parsers, invalid_compression): parser.read_csv("test_file.zip", **compress_kwargs) +@skip_pyarrow +def test_compression_tar_archive(all_parsers, csv_dir_path): + parser = all_parsers + path = os.path.join(csv_dir_path, "tar_csv.tar.gz") + df = parser.read_csv(path) + assert list(df.columns) == ["a"] + + def test_ignore_compression_extension(all_parsers): parser = all_parsers df = DataFrame({"a": [0, 1]}) @@ -178,3 +186,26 @@ def test_ignore_compression_extension(all_parsers): Path(path_zip).write_text(Path(path_csv).read_text()) tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) + + +@skip_pyarrow +def test_writes_tar_gz(all_parsers): + parser = all_parsers + data = DataFrame( + { + "Country": ["Venezuela", "Venezuela"], + "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], + } + ) + with tm.ensure_clean("test.tar.gz") as tar_path: + data.to_csv(tar_path, index=False) + + # test that read_csv infers .tar.gz to gzip: + tm.assert_frame_equal(parser.read_csv(tar_path), data) + + # test that file is indeed gzipped: + with tarfile.open(tar_path, "r:gz") as tar: + result = parser.read_csv( + tar.extractfile(tar.getnames()[0]), compression="infer" + ) + tm.assert_frame_equal(result, data) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 93924c9b670c2..0b16d1d9ec6b0 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -16,8 +16,8 @@ from pandas import DataFrame import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.feather_format import read_feather from pandas.io.parsers import read_csv @@ -35,7 +35,7 @@ def test_compressed_urls(salaries_table, mode, engine, compression_only): # test reading compressed urls with various engines and # 
extension inference - extension = icom._compression_to_extension[compression_only] + extension = _compression_to_extension[compression_only] base_url = ( "https://github.com/pandas-dev/pandas/raw/main/" "pandas/tests/io/parser/data/salaries.csv" diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index f3d41332502af..d6d787df39dfa 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -21,8 +21,8 @@ DatetimeIndex, ) import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.parsers import ( read_csv, read_fwf, @@ -656,7 +656,7 @@ def test_fwf_compression(compression_only, infer): 3333333333""".strip() compression = compression_only - extension = icom._compression_to_extension[compression] + extension = _compression_to_extension[compression] kwargs = {"widths": [5, 5], "names": ["one", "two"]} expected = read_fwf(StringIO(data), **kwargs) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 3c278cb48e20f..35749aabdc39f 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,10 +1,13 @@ +import gzip import io import os from pathlib import Path import subprocess import sys +import tarfile import textwrap import time +import zipfile import pytest @@ -13,6 +16,10 @@ import pandas.io.common as icom +_compression_to_extension = { + value: key for key, value in icom._extension_to_compression.items() +} + @pytest.mark.parametrize( "obj", @@ -26,6 +33,9 @@ ) @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_compression_size(obj, method, compression_only): + if compression_only == "tar": + compression_only = {"method": "tar", "mode": "w:gz"} + with tm.ensure_clean() as path: getattr(obj, method)(path, compression=compression_only) compressed_size = os.path.getsize(path) @@ -47,7 +57,11 @@ 
def test_compression_size(obj, method, compression_only): @pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - with icom.get_handle(path, "w", compression=compression_only) as handles: + with icom.get_handle( + path, + "w:gz" if compression_only == "tar" else "w", + compression=compression_only, + ) as handles: getattr(obj, method)(handles.handle) assert not handles.handle.closed compressed_size = os.path.getsize(path) @@ -72,7 +86,7 @@ def test_dataframe_compression_defaults_to_infer( ): # GH22004 input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) - extension = icom._compression_to_extension[compression_only] + extension = _compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) @@ -92,7 +106,7 @@ def test_series_compression_defaults_to_infer( ): # GH22004 input = pd.Series([0, 5, -2, 10], name="X") - extension = icom._compression_to_extension[compression_only] + extension = _compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) if "squeeze" in read_kwargs: @@ -255,3 +269,61 @@ def test_bzip_compression_level(obj, method): """ with tm.ensure_clean() as path: getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1}) + + +@pytest.mark.parametrize( + "suffix,archive", + [ + (".zip", zipfile.ZipFile), + (".tar", tarfile.TarFile), + ], +) +def test_empty_archive_zip(suffix, archive): + with tm.ensure_clean(filename=suffix) as path: + file = archive(path, "w") + file.close() + with pytest.raises(ValueError, match="Zero files found"): + pd.read_csv(path) + + +def test_ambiguous_archive_zip(): + with tm.ensure_clean(filename=".zip") as path: + file = zipfile.ZipFile(path, "w") + 
file.writestr("a.csv", "foo,bar") + file.writestr("b.csv", "foo,bar") + file.close() + with pytest.raises(ValueError, match="Multiple files found in ZIP file"): + pd.read_csv(path) + + +def test_ambiguous_archive_tar(): + with tm.ensure_clean_dir() as dir: + csvAPath = os.path.join(dir, "a.csv") + with open(csvAPath, "w") as a: + a.write("foo,bar\n") + csvBPath = os.path.join(dir, "b.csv") + with open(csvBPath, "w") as b: + b.write("foo,bar\n") + + tarpath = os.path.join(dir, "archive.tar") + with tarfile.TarFile(tarpath, "w") as tar: + tar.add(csvAPath, "a.csv") + tar.add(csvBPath, "b.csv") + + with pytest.raises(ValueError, match="Multiple files found in TAR archive"): + pd.read_csv(tarpath) + + +def test_tar_gz_to_different_filename(): + with tm.ensure_clean(filename=".foo") as file: + pd.DataFrame( + [["1", "2"]], + columns=["foo", "bar"], + ).to_csv(file, compression={"method": "tar", "mode": "w:gz"}, index=False) + with gzip.open(file) as uncompressed: + with tarfile.TarFile(fileobj=uncompressed) as archive: + members = archive.getmembers() + assert len(members) == 1 + content = archive.extractfile(members[0]).read().decode("utf8") + content = content.replace("\r\n", "\n") # windows + assert content == "foo,bar\n1,2\n" diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 5e0da6f3ab3bb..6907d8978e603 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -1,5 +1,6 @@ from io import BytesIO import os +import tarfile import zipfile import numpy as np @@ -14,10 +15,9 @@ read_parquet, ) import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension from pandas.util import _test_decorators as td -import pandas.io.common as icom - @pytest.fixture def gcs_buffer(monkeypatch): @@ -106,6 +106,18 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): ) as res: for res_info, exp_info in zip(res.infolist(), exp.infolist()): assert res_info.CRC == exp_info.CRC + 
elif compression == "tar": + with tarfile.open(fileobj=BytesIO(result)) as tar_exp, tarfile.open( + fileobj=BytesIO(expected) + ) as tar_res: + for tar_res_info, tar_exp_info in zip( + tar_res.getmembers(), tar_exp.getmembers() + ): + actual_file = tar_res.extractfile(tar_res_info) + expected_file = tar_exp.extractfile(tar_exp_info) + assert (actual_file is None) == (expected_file is None) + if actual_file is not None and expected_file is not None: + assert actual_file.read() == expected_file.read() else: assert result == expected @@ -144,7 +156,7 @@ def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding) tm.assert_frame_equal(df, read_df) # write compressed file with implicit compression - file_ext = icom._compression_to_extension[compression_only] + file_ext = _compression_to_extension[compression_only] compression["method"] = "infer" path_gcs += f".{file_ext}" df.to_csv(path_gcs, compression=compression, encoding=encoding) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 8f19a54a5eedf..98f02e14f4f13 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -21,6 +21,7 @@ from pathlib import Path import pickle import shutil +import tarfile import uuid from warnings import ( catch_warnings, @@ -254,9 +255,7 @@ def get_random_path(): class TestCompression: - _extension_to_compression = { - ext: compression for compression, ext in icom._compression_to_extension.items() - } + _extension_to_compression = icom._extension_to_compression def compress_file(self, src_path, dest_path, compression): if compression is None: @@ -270,6 +269,11 @@ def compress_file(self, src_path, dest_path, compression): elif compression == "zip": with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, os.path.basename(src_path)) + elif compression == "tar": + with open(src_path, "rb") as fh: + with tarfile.open(dest_path, mode="w") as tar: + tarinfo = 
tar.gettarinfo(src_path, os.path.basename(src_path)) + tar.addfile(tarinfo, fh) elif compression == "xz": f = get_lzma_file()(dest_path, "w") elif compression == "zstd": @@ -278,7 +282,7 @@ def compress_file(self, src_path, dest_path, compression): msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) - if compression != "zip": + if compression not in ["zip", "tar"]: with open(src_path, "rb") as fh, f: f.write(fh.read()) @@ -510,7 +514,7 @@ def test_pickle_binary_object_compression(compression): buffer.seek(0) # gzip and zip safe the filename: cannot compare the compressed content - assert buffer.getvalue() == reference or compression in ("gzip", "zip") + assert buffer.getvalue() == reference or compression in ("gzip", "zip", "tar") # read read_df = pd.read_pickle(buffer, compression=compression) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 42f2f64a57ff0..c21673af2d979 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -5,6 +5,7 @@ import io import os import struct +import tarfile import warnings import zipfile @@ -20,8 +21,8 @@ Series, ) from pandas.core.indexes.api import ensure_index +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.parsers import read_csv from pandas.io.stata import ( CategoricalConversionWarning, @@ -1849,7 +1850,7 @@ def test_compression(compression, version, use_dict, infer): if use_dict: file_ext = compression else: - file_ext = icom._compression_to_extension[compression] + file_ext = _compression_to_extension[compression] file_name += f".{file_ext}" compression_arg = compression if infer: @@ -1867,6 +1868,9 @@ def test_compression(compression, version, use_dict, infer): elif compression == "zip": with zipfile.ZipFile(path, "r") as comp: fp = io.BytesIO(comp.read(comp.filelist[0])) + elif compression == "tar": + with tarfile.open(path) as tar: + fp = 
io.BytesIO(tar.extractfile(tar.getnames()[0]).read()) elif compression == "bz2": with bz2.open(path, "rb") as comp: fp = io.BytesIO(comp.read()) @@ -2001,7 +2005,7 @@ def test_compression_roundtrip(compression): def test_stata_compression(compression_only, read_infer, to_infer): compression = compression_only - ext = icom._compression_to_extension[compression] + ext = _compression_to_extension[compression] filename = f"test.{ext}" df = DataFrame( diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 2b9735f64761c..d3247eb9dd47e 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -17,8 +17,8 @@ Index, ) import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -1292,7 +1292,7 @@ def test_compression_output(parser, compression_only): def test_filename_and_suffix_comp(parser, compression_only): - compfile = "xml." + icom._compression_to_extension[compression_only] + compfile = "xml." 
+ _compression_to_extension[compression_only] with tm.ensure_clean(filename=compfile) as path: geom_df.to_xml(path, parser=parser, compression=compression_only) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index bfb6bb19452bd..277b6442a0a8c 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -6,6 +6,7 @@ ) from lzma import LZMAError import os +from tarfile import ReadError from urllib.error import HTTPError from zipfile import BadZipFile @@ -1370,6 +1371,7 @@ def test_wrong_compression(parser, compression, compression_only): "bz2": (OSError, "Invalid data stream"), "gzip": (OSError, "Not a gzipped file"), "zip": (BadZipFile, "File is not a zip file"), + "tar": (ReadError, "file could not be opened successfully"), } zstd = import_optional_dependency("zstandard", errors="ignore") if zstd is not None: