From c1823ef5eef28b32e860e94dcd7b1b7373113697 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 6 Dec 2021 09:21:18 +0000 Subject: [PATCH 01/33] Add reproduction test for .tar.gz archives co-authored-by: Margarete Dippel --- pandas/tests/io/parser/test_compression.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 5aa0edfd8b46a..e0857eb8f6ce8 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -162,6 +162,14 @@ def test_invalid_compression(all_parsers, invalid_compression): parser.read_csv("test_file.zip", **compress_kwargs) +@skip_pyarrow +def test_compression_tar_archive(all_parsers, csv_dir_path): + parser = all_parsers + path = os.path.join(csv_dir_path, "tar_csv.tar.gz") + df = parser.read_csv(path) + assert list(df.columns) == ["a"] + + def test_ignore_compression_extension(all_parsers): parser = all_parsers df = DataFrame({"a": [0, 1]}) From 9a85cbad219bfd39100c1ea24f63703ba797ca0f Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 6 Dec 2021 12:37:54 +0000 Subject: [PATCH 02/33] add support for .tar archives python's `tarfile` supports gzip, xz and bz2 encoding, so we don't need to make any special cases for that. 
co-authored-by: Margarete Dippel --- pandas/io/common.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index 844304396a23f..bca4bb5478f89 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,6 +18,7 @@ import mmap import os from pathlib import Path +import tarfile import tempfile from typing import ( IO, @@ -520,6 +521,9 @@ def infer_compression( # Cannot infer compression of a buffer, assume no compression return None + if ".tar" in filepath_or_buffer: + return "tar" + # Infer compression from the filename/URL extension for compression, extension in _compression_to_extension.items(): if filepath_or_buffer.lower().endswith(extension): @@ -747,6 +751,21 @@ def get_handle( f"Only one file per ZIP: {zip_names}" ) + # TAR Encoding + elif compression == "tar": + tar = tarfile.open(handle, "r:*") + handles.append(tar) + files = tar.getnames() + if len(files) == 1: + handle = tar.extractfile(files[0]) + elif len(files) == 0: + raise ValueError(f"Zero files found in TAR archive {path_or_buf}") + else: + raise ValueError( + "Multiple files found in TAR archive. " + f"Only one file per TAR archive: {files}" + ) + # XZ Compression elif compression == "xz": handle = get_lzma_file()(handle, ioargs.mode) From e6730613e279b86d4a673668421e6a947548c280 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 6 Dec 2021 12:49:50 +0000 Subject: [PATCH 03/33] update doc comments --- pandas/core/frame.py | 8 ++++---- pandas/io/common.py | 6 +++--- pandas/io/json/_json.py | 8 ++++---- pandas/io/parsers/readers.py | 6 +++--- pandas/io/pickle.py | 4 ++-- pandas/io/xml.py | 12 ++++++------ 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 01f817300a01a..1f6336015dc43 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3020,11 +3020,11 @@ def to_xml( layout of elements and attributes from original output. 
This argument requires ``lxml`` to be installed. Only XSLT 1.0 scripts and not later versions is currently supported. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', 'zip', 'tar', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, zip or xz if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. If using 'zip', the ZIP file must contain only one data + gzip, bz2, zip, xz or tar if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, and no decompression + otherwise. If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. {storage_options} diff --git a/pandas/io/common.py b/pandas/io/common.py index bca4bb5478f89..0cc344bb7be07 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -263,7 +263,7 @@ def _get_filepath_or_buffer( ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer - compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional + compression : {{'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional @@ -497,9 +497,9 @@ def infer_compression( ---------- filepath_or_buffer : str or file handle File path or object. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None} If 'infer' and `filepath_or_buffer` is path-like, then detect - compression from the following extensions: '.gz', '.bz2', '.zip', + compression from the following extensions: '.gz', '.bz2', '.zip', '.tar', or '.xz' (otherwise no compression). 
Returns diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 62f542de3437f..25002e31f28e0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -475,11 +475,11 @@ def read_json( ``JsonReader`` is a context manager. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, zip or xz if path_or_buf is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. If using 'zip', the ZIP file must contain only one data + gzip, bz2, zip, xz or tar if path_or_buf is a string ending in + '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, and no decompression + otherwise. If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. nrows : int, optional diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 82f8ee553df8e..127ea6378ab82 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -279,11 +279,11 @@ .. versionchanged:: 1.2 ``TextFileReader`` is a context manager. -compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' +compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the - following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - decompression). If using 'zip', the ZIP file must contain only one data + following extensions: '.gz', '.bz2', '.zip', '.tar', '.xz' (otherwise no + decompression). If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. thousands : str, optional Thousands separator. 
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 5e0a3e1646883..29cacc12b1da0 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -134,9 +134,9 @@ def read_pickle( .. versionchanged:: 1.0.0 Accept URL. URL is not limited to S3 and GCS. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' If 'infer' and 'path_or_url' is path-like, then detect compression from - the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + the following extensions: '.gz', '.bz2', '.zip', '.tar', or '.xz' (otherwise no compression) If 'infer' and 'path_or_url' is not path-like, then use None (= no decompression). diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 3c3b4afa2c57d..5aae9cb3a380a 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -68,9 +68,9 @@ class _XMLFrameParser: URL, file, file-like object, or a raw string containing XSLT, `etree` does not support XSLT but retained for consistency. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'zip', 'tar', 'xz', None}, default 'infer' Compression type for on-the-fly decompression of on-disk data. - If 'infer', then use extension for gzip, bz2, zip or xz. + If 'infer', then use extension for gzip, bz2, zip, tar or xz. storage_options : dict, optional Extra options that make sense for a particular storage connection, @@ -801,11 +801,11 @@ def read_xml( transformation and not the original XML document. Only XSLT 1.0 scripts and not later versions is currently supported. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. 
If 'infer', then use - gzip, bz2, zip or xz if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. If using 'zip', the ZIP file must contain only one data + gzip, bz2, zip, xz, or tar if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', '.xz', or containing '.tar' respectively, and no decompression + otherwise. If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. {storage_options} From a0d63865ef2c903ec233fcddba71003d4ecc94c0 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 6 Dec 2021 13:11:54 +0000 Subject: [PATCH 04/33] fix: pep8 errors --- pandas/core/frame.py | 8 +++++--- pandas/io/json/_json.py | 5 +++-- pandas/io/xml.py | 5 +++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1f6336015dc43..f13ec10b12391 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3020,11 +3020,13 @@ def to_xml( layout of elements and attributes from original output. This argument requires ``lxml`` to be installed. Only XSLT 1.0 scripts and not later versions is currently supported. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'tar', 'xz', None}}, default 'infer' + compression : {{'infer', 'gzip', 'bz2', + 'zip', 'tar', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip, xz or tar if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, and no decompression - otherwise. If using 'zip' or 'tar', the archive must contain only one data + '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, + and no decompression otherwise. + If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. 
{storage_options} diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 25002e31f28e0..83cbac48e6a2b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -478,8 +478,9 @@ def read_json( compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip, xz or tar if path_or_buf is a string ending in - '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, and no decompression - otherwise. If using 'zip' or 'tar', the archive must contain only one data + '.gz', '.bz2', '.zip', '.xz' or containing '.tar' respectively, + and no decompression otherwise. + If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. nrows : int, optional diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 5aae9cb3a380a..9bf78c84377ed 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -804,8 +804,9 @@ def read_xml( compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', 'tar', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip, xz, or tar if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', '.xz', or containing '.tar' respectively, and no decompression - otherwise. If using 'zip' or 'tar', the archive must contain only one data + '.gz', '.bz2', '.zip', '.xz', or containing '.tar' respectively, + and no decompression otherwise. + If using 'zip' or 'tar', the archive must contain only one data file to be read in. Set to None for no decompression. 
{storage_options} From 6a8edef38437769491c645e284ec2696e13b8182 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 7 Dec 2021 10:14:44 +0000 Subject: [PATCH 05/33] refactor: flip _compression_to_extension around to support multiple extensions on same compression co-authored-by: Margarete Dippel --- pandas/io/common.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 0cc344bb7be07..837f0dd4903b9 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -446,7 +446,8 @@ def file_path_to_url(path: str) -> str: return urljoin("file:", pathname2url(path)) -_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} +_extension_to_compression = {".gz": "gzip", ".bz2": "bz2", ".zip": "zip", ".xz": "xz"} +_supported_compressions = set(_extension_to_compression.values()) def get_compression_method( @@ -525,20 +526,18 @@ def infer_compression( return "tar" # Infer compression from the filename/URL extension - for compression, extension in _compression_to_extension.items(): + for extension, compression in _extension_to_compression.items(): if filepath_or_buffer.lower().endswith(extension): return compression return None # Compression has been specified. 
Check that it's valid - if compression in _compression_to_extension: + if compression in _supported_compressions: return compression # https://github.com/python/mypy/issues/5492 # Unsupported operand types for + ("List[Optional[str]]" and "List[str]") - valid = ["infer", None] + sorted( - _compression_to_extension - ) # type: ignore[operator] + valid = ["infer", None] + sorted(_supported_compressions) # type: ignore[operator] msg = ( f"Unrecognized compression type: {compression}\n" f"Valid compression types are {valid}" @@ -683,7 +682,7 @@ def get_handle( ioargs.encoding, ioargs.mode, errors, - ioargs.compression["method"] not in _compression_to_extension, + ioargs.compression["method"] not in _supported_compressions, ) is_path = isinstance(handle, str) From d4e40c97b67ab0ad98e5435b0ef0710310fb032e Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 7 Dec 2021 10:18:03 +0000 Subject: [PATCH 06/33] refactor: detect tar files using existing extension mapping co-authored-by: Margarete Dippel --- pandas/io/common.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 837f0dd4903b9..fbbeda9cc5d64 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -446,7 +446,16 @@ def file_path_to_url(path: str) -> str: return urljoin("file:", pathname2url(path)) -_extension_to_compression = {".gz": "gzip", ".bz2": "bz2", ".zip": "zip", ".xz": "xz"} +_extension_to_compression = { + ".tar": "tar", + ".tar.gz": "tar", + ".tar.bz2": "tar", + ".tar.xz": "tar", + ".gz": "gzip", + ".bz2": "bz2", + ".zip": "zip", + ".xz": "xz", +} _supported_compressions = set(_extension_to_compression.values()) @@ -522,9 +531,6 @@ def infer_compression( # Cannot infer compression of a buffer, assume no compression return None - if ".tar" in filepath_or_buffer: - return "tar" - # Infer compression from the filename/URL extension for extension, compression in _extension_to_compression.items(): if 
filepath_or_buffer.lower().endswith(extension): From 5f22df77accc57cfcd9864052db5b58e5dcae2e3 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 7 Dec 2021 13:51:59 +0000 Subject: [PATCH 07/33] feat: add support for writing tar files co-authored-by: Margarete Dippel --- pandas/_testing/_io.py | 10 +++ pandas/conftest.py | 4 +- pandas/core/generic.py | 4 ++ pandas/io/common.py | 101 +++++++++++++++++++++++++--- pandas/io/json/_json.py | 3 +- pandas/io/pickle.py | 3 +- pandas/tests/io/test_compression.py | 15 ++++- pandas/tests/io/test_gcs.py | 9 +++ pandas/tests/io/test_pickle.py | 8 ++- pandas/tests/io/test_stata.py | 4 ++ 10 files changed, 142 insertions(+), 19 deletions(-) diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 2c8e1b0daaeaa..5775129736339 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -3,6 +3,8 @@ import bz2 from functools import wraps import gzip +import io +import tarfile from typing import ( TYPE_CHECKING, Any, @@ -387,6 +389,14 @@ def write_to_compressed(compression, path, data, dest="test"): mode = "w" args = (dest, data) method = "writestr" + elif compression == "tar": + compress_method = tarfile.TarFile + mode = "w" + file = tarfile.TarInfo(name=dest) + bytes = io.BytesIO(data) + file.size = len(data) + args = (file, bytes) + method = "addfile" elif compression == "gzip": compress_method = gzip.GzipFile elif compression == "bz2": diff --git a/pandas/conftest.py b/pandas/conftest.py index eb9a952250f36..c2e41bb5693d0 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -267,7 +267,7 @@ def other_closed(request): return request.param -@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) +@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz", "tar"]) def compression(request): """ Fixture for trying common compression types in compression tests. 
@@ -275,7 +275,7 @@ def compression(request): return request.param -@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) +@pytest.fixture(params=["gzip", "bz2", "zip", "xz", "tar"]) def compression_only(request): """ Fixture for trying common compression types in compression tests excluding diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 57f151feeae80..a56bc43d14455 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2341,6 +2341,7 @@ def to_json( default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool_t = False, compression: CompressionOptions = "infer", + mode: str = "w", index: bool_t = True, indent: int | None = None, storage_options: StorageOptions = None, @@ -2604,6 +2605,7 @@ def to_json( default_handler=default_handler, lines=lines, compression=compression, + mode=mode, index=index, indent=indent, storage_options=storage_options, @@ -2923,6 +2925,7 @@ def to_pickle( self, path, compression: CompressionOptions = "infer", + mode: str = "wb", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: @@ -2990,6 +2993,7 @@ def to_pickle( self, path, compression=compression, + mode=mode, protocol=protocol, storage_options=storage_options, ) diff --git a/pandas/io/common.py b/pandas/io/common.py index fbbeda9cc5d64..27605233c5f49 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -10,6 +10,7 @@ from io import ( BufferedIOBase, BytesIO, + FileIO, RawIOBase, StringIO, TextIOBase, @@ -758,18 +759,22 @@ def get_handle( # TAR Encoding elif compression == "tar": - tar = tarfile.open(handle, "r:*") - handles.append(tar) - files = tar.getnames() - if len(files) == 1: - handle = tar.extractfile(files[0]) - elif len(files) == 0: - raise ValueError(f"Zero files found in TAR archive {path_or_buf}") + if is_path: + handle = _BytesTarFile.open(name=handle, mode=ioargs.mode) else: - raise ValueError( - "Multiple files found in TAR archive. 
" - f"Only one file per TAR archive: {files}" - ) + handle = _BytesTarFile.open(fileobj=handle, mode=ioargs.mode) + if handle.mode == "r": + handles.append(handle) + files = handle.getnames() + if len(files) == 1: + handle = handle.extractfile(files[0]) + elif len(files) == 0: + raise ValueError(f"Zero files found in TAR archive {path_or_buf}") + else: + raise ValueError( + "Multiple files found in TAR archive. " + f"Only one file per TAR archive: {files}" + ) # XZ Compression elif compression == "xz": @@ -852,6 +857,80 @@ def get_handle( ) +class _BytesTarFile(tarfile.TarFile, BytesIO): + + # GH 17778 + def __init__( + self, + name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], + mode: str, + fileobj: FileIO, + archive_name: str | None = None, + **kwargs, + ): + self.archive_name = archive_name + self.multiple_write_buffer: StringIO | BytesIO | None = None + self._closing = False + + super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs) + + @classmethod + def open(cls, mode="r", **kwargs): + mode = mode.replace("b", "") + return super().open(mode=mode, **kwargs) + + def infer_filename(self): + """ + If an explicit archive_name is not given, we still want the file inside the zip + file not to be named something.tar, because that causes confusion (GH39465). 
+ """ + if isinstance(self.name, (os.PathLike, str)): + filename = Path(self.name) + if filename.suffix == ".tar": + return filename.with_suffix("").name + if filename.suffix in [".tar.gz", ".tar.bz2", ".tar.xz"]: + return filename.with_suffix("").with_suffix("").name + return filename.name + return None + + def write(self, data): + # buffer multiple write calls, write on flush + if self.multiple_write_buffer is None: + self.multiple_write_buffer = ( + BytesIO() if isinstance(data, bytes) else StringIO() + ) + self.multiple_write_buffer.write(data) + + def flush(self) -> None: + # write to actual handle and close write buffer + if self.multiple_write_buffer is None or self.multiple_write_buffer.closed: + return + + # TarFile needs a non-empty string + archive_name = self.archive_name or self.infer_filename() or "tar" + with self.multiple_write_buffer: + value = self.multiple_write_buffer.getvalue() + tarinfo = tarfile.TarInfo(name=archive_name) + tarinfo.size = len(value) + self.addfile(tarinfo, io.BytesIO(value)) + + def close(self): + self.flush() + super().close() + + @property + def closed(self): + if self.multiple_write_buffer is None: + return False + return self.multiple_write_buffer.closed and super().closed + + @closed.setter + def closed(self, value): + if not self._closing and value: + self._closing = True + self.close() + + # error: Definition of "__exit__" in base class "ZipFile" is incompatible with # definition in base class "BytesIO" [misc] # error: Definition of "__enter__" in base class "ZipFile" is incompatible with diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 83cbac48e6a2b..6f6ee951f392a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -83,6 +83,7 @@ def to_json( default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool = False, compression: CompressionOptions = "infer", + mode: str = "w", index: bool = True, indent: int = 0, storage_options: StorageOptions = None, @@ -127,7 +128,7 
@@ def to_json( if path_or_buf is not None: # apply compression and byte/text conversion with get_handle( - path_or_buf, "w", compression=compression, storage_options=storage_options + path_or_buf, mode, compression=compression, storage_options=storage_options ) as handles: handles.handle.write(s) else: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 29cacc12b1da0..d2837c49b3c63 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -25,6 +25,7 @@ def to_pickle( obj: Any, filepath_or_buffer: FilePath | WriteBuffer[bytes], compression: CompressionOptions = "infer", + mode: str = "wb", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ): @@ -95,7 +96,7 @@ def to_pickle( with get_handle( filepath_or_buffer, - "wb", + mode, compression=compression, is_text=False, storage_options=storage_options, diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 3c278cb48e20f..056bc85241215 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -14,6 +14,10 @@ import pandas.io.common as icom +def flip(my_dict: dict): + return {value: key for key, value in my_dict.items()} + + @pytest.mark.parametrize( "obj", [ @@ -26,8 +30,13 @@ ) @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_compression_size(obj, method, compression_only): + kwargs = {} + + if compression_only == "tar": + kwargs["mode"] = "w:gz" + with tm.ensure_clean() as path: - getattr(obj, method)(path, compression=compression_only) + getattr(obj, method)(path, compression=compression_only, **kwargs) compressed_size = os.path.getsize(path) getattr(obj, method)(path, compression=None) uncompressed_size = os.path.getsize(path) @@ -72,7 +81,7 @@ def test_dataframe_compression_defaults_to_infer( ): # GH22004 input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) - extension = icom._compression_to_extension[compression_only] + extension = 
flip(icom._extension_to_compression)[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) @@ -92,7 +101,7 @@ def test_series_compression_defaults_to_infer( ): # GH22004 input = pd.Series([0, 5, -2, 10], name="X") - extension = icom._compression_to_extension[compression_only] + extension = flip(icom._extension_to_compression)[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) if "squeeze" in read_kwargs: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 2e8e4a9017dbc..30ddf0e9e9b75 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -1,5 +1,6 @@ from io import BytesIO import os +import tarfile import zipfile import numpy as np @@ -104,6 +105,14 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): ) as res: for res_info, exp_info in zip(res.infolist(), exp.infolist()): assert res_info.CRC == exp_info.CRC + elif compression == "tar": + with tarfile.open(fileobj=BytesIO(result)) as exp, tarfile.open( + fileobj=BytesIO(expected) + ) as res: + for res_info, exp_info in zip(res.getmembers(), exp.getmembers()): + assert ( + res.extractfile(res_info).read() == exp.extractfile(exp_info).read() + ) else: assert result == expected diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index aa80df1bcbd38..41c2734fdf149 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -21,6 +21,7 @@ from pathlib import Path import pickle import shutil +import tarfile from warnings import ( catch_warnings, filterwarnings, @@ -306,13 +307,18 @@ def compress_file(self, src_path, dest_path, compression): elif compression == "zip": with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, os.path.basename(src_path)) + elif 
compression == "tar": + with open(src_path, "rb") as fh: + with tarfile.open(dest_path, mode="w") as tar: + tarinfo = tar.gettarinfo(src_path, os.path.basename(src_path)) + tar.addfile(tarinfo, fh) elif compression == "xz": f = get_lzma_file()(dest_path, "w") else: msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) - if compression != "zip": + if compression not in ["zip", "tar"]: with open(src_path, "rb") as fh, f: f.write(fh.read()) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index eb457d74c6a01..8f396ff78c047 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -5,6 +5,7 @@ import io import os import struct +import tarfile import warnings import zipfile @@ -1899,6 +1900,9 @@ def test_compression(compression, version, use_dict, infer): elif compression == "zip": with zipfile.ZipFile(path, "r") as comp: fp = io.BytesIO(comp.read(comp.filelist[0])) + elif compression == "tar": + with tarfile.open(path) as tar: + fp = io.BytesIO(tar.extractfile(tar.getnames()[0]).read()) elif compression == "bz2": with bz2.open(path, "rb") as comp: fp = io.BytesIO(comp.read()) From c6573efdabb07ab5074a6b3b06ab63d9ac29a04e Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 15 Dec 2021 10:21:16 +0000 Subject: [PATCH 08/33] feat: assure it respects .gz endings --- pandas/io/common.py | 20 ++++++++++++++++-- pandas/tests/io/parser/test_compression.py | 24 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 27605233c5f49..58735f4b5d0dd 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -875,9 +875,25 @@ def __init__( super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs) @classmethod - def open(cls, mode="r", **kwargs): + def open(cls, name=None, mode="r", **kwargs): mode = mode.replace("b", "") - return super().open(mode=mode, **kwargs) + return super().open(name=name, mode=cls.extend_mode(name, 
mode), **kwargs) + + @classmethod + def extend_mode( + cls, name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], mode: str + ) -> str: + if mode != "w": + return mode + if isinstance(name, (os.PathLike, str)): + filename = Path(name) + if filename.suffix == ".gz": + return mode + ":gz" + elif filename.suffix == ".xz": + return mode + ":xz" + elif filename.suffix == ".bz2": + return mode + ":bz2" + return mode def infer_filename(self): """ diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index e0857eb8f6ce8..a411fca91ecf0 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -5,6 +5,7 @@ import os from pathlib import Path +import tarfile import zipfile import pytest @@ -180,3 +181,26 @@ def test_ignore_compression_extension(all_parsers): Path(path_zip).write_text(Path(path_csv).read_text()) tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) + + +@skip_pyarrow +def test_writes_tar_gz(all_parsers): + parser = all_parsers + data = DataFrame( + { + "Country": ["Venezuela", "Venezuela"], + "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], + } + ) + with tm.ensure_clean("test.tar.gz") as tar_path: + data.to_csv(tar_path, index=False) + + # test that read_csv infers .tar.gz to gzip: + tm.assert_frame_equal(parser.read_csv(tar_path), data) + + # test that file is indeed gzipped: + with tarfile.open(tar_path, "r:gz") as tar: + result = parser.read_csv( + tar.extractfile(tar.getnames()[0]), compression="infer" + ) + tm.assert_frame_equal(result, data) From a4ac382898e07e406b9ab4e2b0c147a9d32f09b7 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 15 Dec 2021 11:12:25 +0000 Subject: [PATCH 09/33] feat: add "tar" entry to compressionoptions --- pandas/_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 95277e97eae98..d35cfe50a3ae8 100644 --- a/pandas/_typing.py +++ 
b/pandas/_typing.py @@ -243,7 +243,7 @@ def closed(self) -> bool: # compression keywords and compression CompressionDict = Dict[str, Any] CompressionOptions = Optional[ - Union[Literal["infer", "gzip", "bz2", "zip", "xz"], CompressionDict] + Union[Literal["infer", "gzip", "bz2", "zip", "xz", "tar"], CompressionDict] ] From e66826b9e1bb430f3df704634e6006af3133516a Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 15 Dec 2021 11:29:30 +0000 Subject: [PATCH 10/33] chore: add whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 372f991d96a22..79703d1ea5c44 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -196,6 +196,29 @@ representation of :class:`DataFrame` objects (:issue:`4889`). .. _whatsnew_140.enhancements.other: +Reading directly from TAR archives +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing +directly on TAR archives (:issue:`44787`). + +.. code-block:: python + + df = pd.read_csv("./movement.tar.gz") + # ... + df.to_csv("./out.tar.gz") + +This supports ``.tar``, ``.tar.gz``, ``.tar.bz2`` and ``.tar.xz`` archives. +The used compression method is inferred from the filename. +If the compression method cannot be inferred, use the ``compression`` argument: + +.. code-block:: python + + df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"}) # noqa F821 + +(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open) + + Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`concat` will preserve the ``attrs`` when it is the same for all objects and discard the ``attrs`` when they are different.
(:issue:`41828`) From 941be377d28f61dcfdc7afdd87bb84c04034194f Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 15 Dec 2021 11:35:33 +0000 Subject: [PATCH 11/33] fix: test_compression_size_fh --- pandas/tests/io/test_compression.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 056bc85241215..29f7c35b90395 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -56,7 +56,11 @@ def test_compression_size(obj, method, compression_only): @pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - with icom.get_handle(path, "w", compression=compression_only) as handles: + with icom.get_handle( + path, + "w:gz" if compression_only == "tar" else "w", + compression=compression_only, + ) as handles: getattr(obj, method)(handles.handle) assert not handles.handle.closed compressed_size = os.path.getsize(path) From 0468e5f38286c8644e4d91a1b6e91ea1e42a8d07 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 08:38:20 +0100 Subject: [PATCH 12/33] add tarfile to shared compression docs --- pandas/core/shared_docs.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index f79fd3ed09f8d..d54e62fa34f19 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -407,11 +407,11 @@ ] = """compression : str or dict, default 'infer' For on-the-fly compression of the output data. If 'infer' and '%s' path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). Set to + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). Set to ``None`` for no compression. 
Can also be a dict with key ``'method'`` set - to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or ``tarfile.TarFile``, respectively. As an example, the following could be passed for faster compression and to create a reproducible gzip archive: ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.""" @@ -421,12 +421,12 @@ ] = """compression : str or dict, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using - 'zip', the ZIP file must contain only one data file to be read in. Set to + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). If using + 'zip' or 'tar', the ZIP file must contain only one data file to be read in. Set to ``None`` for no decompression. Can also be a dict with key ``'method'`` set - to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or ``tarfile.TarFile``, respectively. 
As an example, the following could be passed for Zstandard decompression using a custom compression dictionary: ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.""" From 2531ee0b704c514f448eb934c38cb02da30dc86b Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 09:00:41 +0100 Subject: [PATCH 13/33] fix formatting --- pandas/core/shared_docs.py | 30 +++++++++++++++++++----------- pandas/io/common.py | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index d54e62fa34f19..af881f4d4bbaf 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -407,12 +407,16 @@ ] = """compression : str or dict, default 'infer' For on-the-fly compression of the output data. If 'infer' and '%s' path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). Set to - ``None`` for no compression. Can also be a dict with key ``'method'`` set + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' + (otherwise no compression). + Set to ``None`` for no compression. + Can also be a dict with key ``'method'`` set to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other - key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or ``tarfile.TarFile``, respectively. As an - example, the following could be passed for faster compression and to create + key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or + ``tarfile.TarFile``, respectively. 
+ As an example, the following could be passed for faster compression and to create a reproducible gzip archive: ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.""" @@ -421,13 +425,17 @@ ] = """compression : str or dict, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and '%s' is path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). If using - 'zip' or 'tar', the ZIP file must contain only one data file to be read in. Set to - ``None`` for no decompression. Can also be a dict with key ``'method'`` set + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' + (otherwise no compression). + If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in. + Set to ``None`` for no decompression. + Can also be a dict with key ``'method'`` set to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other - key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or ``tarfile.TarFile``, respectively. As an - example, the following could be passed for Zstandard decompression using a + key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or + ``tarfile.TarFile``, respectively. 
+ As an example, the following could be passed for Zstandard decompression using a custom compression dictionary: ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.""" diff --git a/pandas/io/common.py b/pandas/io/common.py index 04779a84cf613..c14872d8d5c0b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -19,8 +19,8 @@ import mmap import os from pathlib import Path -import tarfile import re +import tarfile from typing import ( IO, Any, From 57eba0ad7b63c984fb69c8c2a7e1a40ae3721bd6 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 08:27:49 +0000 Subject: [PATCH 14/33] pass through "mode" via compression args --- pandas/core/generic.py | 4 ---- pandas/io/common.py | 6 ++++-- pandas/tests/io/test_compression.py | 6 ++---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 68d2b38264030..1e25b0f4eb176 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2345,7 +2345,6 @@ def to_json( default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool_t = False, compression: CompressionOptions = "infer", - mode: str = "w", index: bool_t = True, indent: int | None = None, storage_options: StorageOptions = None, @@ -2607,7 +2606,6 @@ def to_json( default_handler=default_handler, lines=lines, compression=compression, - mode=mode, index=index, indent=indent, storage_options=storage_options, @@ -2949,7 +2947,6 @@ def to_pickle( self, path, compression: CompressionOptions = "infer", - mode: str = "wb", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: @@ -3007,7 +3004,6 @@ def to_pickle( self, path, compression=compression, - mode=mode, protocol=protocol, storage_options=storage_options, ) diff --git a/pandas/io/common.py b/pandas/io/common.py index c14872d8d5c0b..dfe01fb3467a7 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -763,10 +763,12 @@ def get_handle( # TAR Encoding elif 
compression == "tar": + if "mode" not in compression_args: + compression_args["mode"] = ioargs.mode if is_path: - handle = _BytesTarFile.open(name=handle, mode=ioargs.mode) + handle = _BytesTarFile.open(name=handle, **compression_args) else: - handle = _BytesTarFile.open(fileobj=handle, mode=ioargs.mode) + handle = _BytesTarFile.open(fileobj=handle, **compression_args) if handle.mode == "r": handles.append(handle) files = handle.getnames() diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 29f7c35b90395..c14807e5cf96f 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -30,13 +30,11 @@ def flip(my_dict: dict): ) @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_compression_size(obj, method, compression_only): - kwargs = {} - if compression_only == "tar": - kwargs["mode"] = "w:gz" + compression_only = {"method": "tar", "mode": "w:gz"} with tm.ensure_clean() as path: - getattr(obj, method)(path, compression=compression_only, **kwargs) + getattr(obj, method)(path, compression=compression_only) compressed_size = os.path.getsize(path) getattr(obj, method)(path, compression=None) uncompressed_size = os.path.getsize(path) From 38f7d541ae7bde7ac08339f6adf9d88947706206 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 08:36:41 +0000 Subject: [PATCH 15/33] fix pickle test --- pandas/tests/io/test_pickle.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 8d139a1a29bd6..3342aba90b76c 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -289,9 +289,7 @@ def get_random_path(): class TestCompression: - _extension_to_compression = { - ext: compression for compression, ext in icom._compression_to_extension.items() - } + _extension_to_compression = icom._extension_to_compression def compress_file(self, src_path, dest_path, 
compression): if compression is None: @@ -550,7 +548,7 @@ def test_pickle_binary_object_compression(compression): buffer.seek(0) # gzip and zip safe the filename: cannot compare the compressed content - assert buffer.getvalue() == reference or compression in ("gzip", "zip") + assert buffer.getvalue() == reference or compression in ("gzip", "zip", "tar") # read read_df = pd.read_pickle(buffer, compression=compression) From 887fd10f80e380d7bd907894927a8a368556dce1 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Tue, 4 Jan 2022 08:49:58 +0000 Subject: [PATCH 16/33] add class comment --- pandas/io/common.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index dfe01fb3467a7..f3e82d6ad1fb0 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -874,6 +874,13 @@ def get_handle( class _BytesTarFile(tarfile.TarFile, BytesIO): + """ + Wrapper for standard library class TarFile and allow the returned file-like + handle to accept byte strings via `write` method. + + BytesIO provides attributes of file-like object and TarFile.addfile writes + bytes strings into a member of the archive. 
+ """ # GH 17778 def __init__( From 669d942d7eb4c54a1f83f874999197b5c2d09fe0 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 11:22:14 +0200 Subject: [PATCH 17/33] sort imports --- pandas/_testing/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 826efb18bafd1..1ef65f761c3f6 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -4,8 +4,8 @@ from functools import wraps import gzip import io -import tarfile import socket +import tarfile from typing import ( TYPE_CHECKING, Any, From 7d7d3c6d4d11c13ebc5dcc3212a32dead8f7f007 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 11:51:34 +0200 Subject: [PATCH 18/33] add _compression_to_extension back for backwards compatibility --- pandas/io/common.py | 1 + pandas/tests/io/test_compression.py | 8 ++------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index d44441b0586d7..f9cdc3f25c371 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -464,6 +464,7 @@ def file_path_to_url(path: str) -> str: ".zst": "zstd", } _supported_compressions = set(_extension_to_compression.values()) +_compression_to_extension = {value: key for key, value in _extension_to_compression.items()} def get_compression_method( diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index c14807e5cf96f..2d7daafb192a4 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -14,10 +14,6 @@ import pandas.io.common as icom -def flip(my_dict: dict): - return {value: key for key, value in my_dict.items()} - - @pytest.mark.parametrize( "obj", [ @@ -83,7 +79,7 @@ def test_dataframe_compression_defaults_to_infer( ): # GH22004 input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) - extension = flip(icom._extension_to_compression)[compression_only] + extension = 
icom._extension_to_compression[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) @@ -103,7 +99,7 @@ def test_series_compression_defaults_to_infer( ): # GH22004 input = pd.Series([0, 5, -2, 10], name="X") - extension = flip(icom._extension_to_compression)[compression_only] + extension = icom._extension_to_compression[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) if "squeeze" in read_kwargs: From 8b8b8ac1a39df3e9de1f7c389223548a7f1c27ba Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 11:51:57 +0200 Subject: [PATCH 19/33] fix some type warnings --- pandas/io/common.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index f9cdc3f25c371..38605b54d1e53 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -767,7 +767,7 @@ def get_handle( handle = _BytesTarFile.open(name=handle, **compression_args) else: handle = _BytesTarFile.open(fileobj=handle, **compression_args) - if handle.mode == "r": + if handle.mode == "r": # type: ignore[arg-type] handles.append(handle) files = handle.getnames() if len(files) == 1: @@ -871,6 +871,18 @@ def get_handle( ) +# error: Definition of "__exit__" in base class "TarFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "TarFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "TarFile" is incompatible with +# definition in base class "BinaryIO" [misc] +# error: Definition of "__enter__" in base class "TarFile" is incompatible with +# definition in base class "IO" [misc] +# error: Definition of "read" in base class "TarFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: 
Definition of "read" in base class "TarFile" is incompatible with +# definition in base class "IO" [misc] class _BytesTarFile(tarfile.TarFile, BytesIO): """ Wrapper for standard library class TarFile and allow the returned file-like From dd356f674ea6a135c2c1889682ce895bbe075e78 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 12:20:01 +0200 Subject: [PATCH 20/33] fix: formatting --- pandas/io/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 38605b54d1e53..c462b4f7f0d86 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -464,7 +464,9 @@ def file_path_to_url(path: str) -> str: ".zst": "zstd", } _supported_compressions = set(_extension_to_compression.values()) -_compression_to_extension = {value: key for key, value in _extension_to_compression.items()} +_compression_to_extension = { + value: key for key, value in _extension_to_compression.items() +} def get_compression_method( @@ -767,7 +769,7 @@ def get_handle( handle = _BytesTarFile.open(name=handle, **compression_args) else: handle = _BytesTarFile.open(fileobj=handle, **compression_args) - if handle.mode == "r": # type: ignore[arg-type] + if handle.mode == "r": # type: ignore[arg-type] handles.append(handle) files = handle.getnames() if len(files) == 1: From 514014ae7129fbbe36b5e181c708b477a81f1731 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 12:48:24 +0200 Subject: [PATCH 21/33] fix: mypy complaints --- pandas/io/common.py | 24 ++++++++++++++---------- pandas/tests/io/test_gcs.py | 16 ++++++++++------ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c462b4f7f0d86..015196f898a38 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -769,11 +769,14 @@ def get_handle( handle = _BytesTarFile.open(name=handle, **compression_args) else: handle = _BytesTarFile.open(fileobj=handle, **compression_args) - if handle.mode == "r": 
# type: ignore[arg-type] + assert isinstance(handle, _BytesTarFile) + if handle.mode == "r": handles.append(handle) files = handle.getnames() if len(files) == 1: - handle = handle.extractfile(files[0]) + file = handle.extractfile(files[0]) + assert file is not None + handle = file elif len(files) == 0: raise ValueError(f"Zero files found in TAR archive {path_or_buf}") else: @@ -885,7 +888,7 @@ def get_handle( # definition in base class "BytesIO" [misc] # error: Definition of "read" in base class "TarFile" is incompatible with # definition in base class "IO" [misc] -class _BytesTarFile(tarfile.TarFile, BytesIO): +class _BytesTarFile(tarfile.TarFile, BytesIO): # type: ignore[misc] """ Wrapper for standard library class TarFile and allow the returned file-like handle to accept byte strings via `write` method. @@ -897,14 +900,14 @@ class _BytesTarFile(tarfile.TarFile, BytesIO): # GH 17778 def __init__( self, - name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], - mode: str, + name: str | bytes | os.PathLike[str] | os.PathLike[bytes], + mode: Literal["r", "a", "w", "x"], fileobj: FileIO, archive_name: str | None = None, **kwargs, ): self.archive_name = archive_name - self.multiple_write_buffer: StringIO | BytesIO | None = None + self.multiple_write_buffer: BytesIO | None = None self._closing = False super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs) @@ -936,7 +939,10 @@ def infer_filename(self): file not to be named something.tar, because that causes confusion (GH39465). 
""" if isinstance(self.name, (os.PathLike, str)): - filename = Path(self.name) + # error: Argument 1 to "Path" has + # incompatible type "Union[str, PathLike[str], PathLike[bytes]]"; + # expected "Union[str, PathLike[str]]" [arg-type] + filename = Path(self.name) # type: ignore[arg-type] if filename.suffix == ".tar": return filename.with_suffix("").name if filename.suffix in [".tar.gz", ".tar.bz2", ".tar.xz"]: @@ -947,9 +953,7 @@ def infer_filename(self): def write(self, data): # buffer multiple write calls, write on flush if self.multiple_write_buffer is None: - self.multiple_write_buffer = ( - BytesIO() if isinstance(data, bytes) else StringIO() - ) + self.multiple_write_buffer = BytesIO() self.multiple_write_buffer.write(data) def flush(self) -> None: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index b41fb4c5e64cc..ef590b2cf48d3 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -108,13 +108,17 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): for res_info, exp_info in zip(res.infolist(), exp.infolist()): assert res_info.CRC == exp_info.CRC elif compression == "tar": - with tarfile.open(fileobj=BytesIO(result)) as exp, tarfile.open( + with tarfile.open(fileobj=BytesIO(result)) as tar_exp, tarfile.open( fileobj=BytesIO(expected) - ) as res: - for res_info, exp_info in zip(res.getmembers(), exp.getmembers()): - assert ( - res.extractfile(res_info).read() == exp.extractfile(exp_info).read() - ) + ) as tar_res: + for tar_res_info, tar_exp_info in zip( + tar_res.getmembers(), tar_exp.getmembers() + ): + actual_file = tar_res.extractfile(tar_res_info) + expected_file = tar_exp.extractfile(tar_exp_info) + assert (actual_file is None) == (expected_file is None) + if actual_file is not None and expected_file is not None: + assert actual_file.read() == expected_file.read() else: assert result == expected From 38971c7163f1fe829900f1870aca754e28ed98a3 Mon Sep 17 00:00:00 2001 From: Simon 
Knott Date: Sat, 9 Apr 2022 12:58:45 +0200 Subject: [PATCH 22/33] fix: more tests --- pandas/tests/io/test_compression.py | 4 ++-- pandas/tests/io/xml/test_xml.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 2d7daafb192a4..66e29a552f04a 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -79,7 +79,7 @@ def test_dataframe_compression_defaults_to_infer( ): # GH22004 input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) - extension = icom._extension_to_compression[compression_only] + extension = icom._compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) @@ -99,7 +99,7 @@ def test_series_compression_defaults_to_infer( ): # GH22004 input = pd.Series([0, 5, -2, 10], name="X") - extension = icom._extension_to_compression[compression_only] + extension = icom._compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) if "squeeze" in read_kwargs: diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index bfb6bb19452bd..6b47d81608621 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1370,6 +1370,7 @@ def test_wrong_compression(parser, compression, compression_only): "bz2": (OSError, "Invalid data stream"), "gzip": (OSError, "Not a gzipped file"), "zip": (BadZipFile, "File is not a zip file"), + "tar": (BadZipFile, "File is not a zip file"), } zstd = import_optional_dependency("zstandard", errors="ignore") if zstd is not None: From e35d361e1fa8ccaf2ef0667ac7c7a67a3ad8569d Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 13:13:30 +0200 Subject: [PATCH 23/33] fix: some error with xml --- 
pandas/tests/io/xml/test_xml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 6b47d81608621..277b6442a0a8c 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -6,6 +6,7 @@ ) from lzma import LZMAError import os +from tarfile import ReadError from urllib.error import HTTPError from zipfile import BadZipFile @@ -1370,7 +1371,7 @@ def test_wrong_compression(parser, compression, compression_only): "bz2": (OSError, "Invalid data stream"), "gzip": (OSError, "Not a gzipped file"), "zip": (BadZipFile, "File is not a zip file"), - "tar": (BadZipFile, "File is not a zip file"), + "tar": (ReadError, "file could not be opened successfully"), } zstd = import_optional_dependency("zstandard", errors="ignore") if zstd is not None: From c5088fc60a94bd74fe0becece986551a2a09188c Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 13:51:03 +0200 Subject: [PATCH 24/33] fix: interpreted text role --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index b102fa6696a65..9a8c0fc96f646 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -202,7 +202,7 @@ library to produce a tight representation of :class:`DataFrame` objects Reading directly from TAR archives ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -I/O methods like :function:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing +I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing directly on TAR archives (:issue:`44787`). .. 
code-block:: python From f6c51738704feb9cb5766d85531d1373aa95fa3f Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Sat, 9 Apr 2022 13:52:34 +0200 Subject: [PATCH 25/33] move to v1.5 whatsnw --- doc/source/whatsnew/v1.4.0.rst | 23 ----------------------- doc/source/whatsnew/v1.5.0.rst | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9a8c0fc96f646..52aa9312d4c14 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -199,29 +199,6 @@ library to produce a tight representation of :class:`DataFrame` objects .. _whatsnew_140.enhancements.other: -Reading directly from TAR archives -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing -directly on TAR archives (:issue:`44787`). - -.. code-block:: python - - df = pd.read_csv("./movement.tar.gz") - # ... - df.to_csv("./out.tar.gz") - -This supports ``.tar``, ``.tar.gz``, ``.tar.bz`` and ``.tar.xz2`` archives. -The used compression method is inferred from the filename. -If the compression method cannot be inferred, use the ``compression`` argument: - -.. code-block:: python - - df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"}) # noqa F821 - -(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open) - - Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`concat` will preserve the ``attrs`` when it is the same for all objects and discard the ``attrs`` when they are different (:issue:`41828`) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4920622a15f3f..9923a0eb3b770 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -75,6 +75,31 @@ as seen in the following example. 1 2021-01-02 08:00:00 4 2 2021-01-02 16:00:00 5 +.. 
_whatsnew_150.enhancements.tar: + +Reading directly from TAR archives +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing +directly on TAR archives (:issue:`44787`). + +.. code-block:: python + + df = pd.read_csv("./movement.tar.gz") + # ... + df.to_csv("./out.tar.gz") + +This supports ``.tar``, ``.tar.gz``, ``.tar.bz`` and ``.tar.xz2`` archives. +The used compression method is inferred from the filename. +If the compression method cannot be inferred, use the ``compression`` argument: + +.. code-block:: python + + df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"}) # noqa F821 + +(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open) + + .. _whatsnew_150.enhancements.other: Other enhancements From 9a4fa074f32916bbba4f2eb76be1ed6e2835feeb Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 11 Apr 2022 10:24:46 +0200 Subject: [PATCH 26/33] add versionadded note --- pandas/core/shared_docs.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index f4afc14dbddc6..69b4bb4c35585 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -432,7 +432,11 @@ ``tarfile.TarFile``, respectively. As an example, the following could be passed for faster compression and to create a reproducible gzip archive: - ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.""" + ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files. + """ _shared_docs[ "decompression_options" @@ -451,7 +455,11 @@ ``tarfile.TarFile``, respectively. 
As an example, the following could be passed for Zstandard decompression using a custom compression dictionary: - ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.""" + ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files. + """ _shared_docs[ "replace" From 0c31aa8def7d50d418d2ef3750e2ee0017cfa5fc Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Mon, 11 Apr 2022 11:10:43 +0200 Subject: [PATCH 27/33] don't leave blank lines --- pandas/core/shared_docs.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 69b4bb4c35585..33b6d74e83fdc 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -435,8 +435,7 @@ ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. .. versionadded:: 1.5.0 - Added support for `.tar` files. - """ + Added support for `.tar` files.""" _shared_docs[ "decompression_options" @@ -458,8 +457,7 @@ ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. .. versionadded:: 1.5.0 - Added support for `.tar` files. 
- """ + Added support for `.tar` files.""" _shared_docs[ "replace" From 086c59883e169fe021873f25304e7bb09f2823fd Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 13 Apr 2022 12:03:01 +0200 Subject: [PATCH 28/33] add tests for zero files / multiple files --- pandas/tests/io/test_compression.py | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 66e29a552f04a..5ac0197d275dd 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -3,8 +3,10 @@ from pathlib import Path import subprocess import sys +import tarfile import textwrap import time +import zipfile import pytest @@ -262,3 +264,46 @@ def test_bzip_compression_level(obj, method): """ with tm.ensure_clean() as path: getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1}) + + +@pytest.mark.parametrize( + "suffix,archive", + [ + (".zip", zipfile.ZipFile), + (".tar", tarfile.TarFile), + ], +) +def test_empty_archive_zip(suffix, archive): + with tm.ensure_clean(filename=suffix) as path: + file = archive(path, "w") + file.close() + with pytest.raises(ValueError, match="Zero files found"): + pd.read_csv(path) + + +def test_ambiguous_archive_zip(): + with tm.ensure_clean(filename=".zip") as path: + file = zipfile.ZipFile(path, "w") + file.writestr("a.csv", "foo,bar") + file.writestr("b.csv", "foo,bar") + file.close() + with pytest.raises(ValueError, match="Multiple files found in ZIP file"): + pd.read_csv(path) + + +def test_ambiguous_archive_tar(): + with tm.ensure_clean_dir() as dir: + csvAPath = os.path.join(dir, "a.csv") + with open(csvAPath, "w") as a: + a.write("foo,bar\n") + csvBPath = os.path.join(dir, "b.csv") + with open(csvBPath, "w") as b: + b.write("foo,bar\n") + + tarpath = os.path.join(dir, "archive.tar") + with tarfile.TarFile(tarpath, "w") as tar: + tar.add(csvAPath, "a.csv") + tar.add(csvBPath, "b.csv") + + with 
pytest.raises(ValueError, match="Multiple files found in TAR archive"): + pd.read_csv(tarpath) From 861faf0e3cf86374946b56767f43067da0fde349 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 13 Apr 2022 14:00:55 +0200 Subject: [PATCH 29/33] move _compression_to_extension to tests --- pandas/io/common.py | 3 --- pandas/tests/io/formats/test_to_csv.py | 5 ++--- pandas/tests/io/json/test_compression.py | 5 ++--- pandas/tests/io/parser/test_compression.py | 5 ++--- pandas/tests/io/parser/test_network.py | 4 ++-- pandas/tests/io/parser/test_read_fwf.py | 4 ++-- pandas/tests/io/test_compression.py | 8 ++++++-- pandas/tests/io/test_gcs.py | 5 ++--- pandas/tests/io/test_stata.py | 6 +++--- pandas/tests/io/xml/test_to_xml.py | 4 ++-- 10 files changed, 23 insertions(+), 26 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 015196f898a38..15a8f2e114041 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -464,9 +464,6 @@ def file_path_to_url(path: str) -> str: ".zst": "zstd", } _supported_compressions = set(_extension_to_compression.values()) -_compression_to_extension = { - value: key for key, value in _extension_to_compression.items() -} def get_compression_method( diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index d3f8e27c47e98..b5096934af4cb 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -13,8 +13,7 @@ compat, ) import pandas._testing as tm - -import pandas.io.common as icom +from pandas.tests.io.test_compression import _compression_to_extension class TestToCSV: @@ -555,7 +554,7 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): # We'll complete file extension subsequently. filename = "test." 
- filename += icom._compression_to_extension[compression] + filename += _compression_to_extension[compression] df = DataFrame({"A": [1]}) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 42e7b6cd03f55..ab97fb1740496 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -6,8 +6,7 @@ import pandas as pd import pandas._testing as tm - -import pandas.io.common as icom +from pandas.tests.io.test_compression import _compression_to_extension def test_compression_roundtrip(compression): @@ -100,7 +99,7 @@ def test_to_json_compression(compression_only, read_infer, to_infer): # We'll complete file extension subsequently. filename = "test." - filename += icom._compression_to_extension[compression] + filename += _compression_to_extension[compression] df = pd.DataFrame({"A": [1]}) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index dd48663887d25..121784d5a45ed 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -12,8 +12,7 @@ from pandas import DataFrame import pandas._testing as tm - -import pandas.io.common as icom +from pandas.tests.io.test_compression import _compression_to_extension skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -96,7 +95,7 @@ def test_compression(request, parser_and_data, compression_only, buffer, filenam parser, data, expected = parser_and_data compress_type = compression_only - ext = icom._compression_to_extension[compress_type] + ext = _compression_to_extension[compress_type] filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 93924c9b670c2..0b16d1d9ec6b0 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -16,8 +16,8 @@ from pandas import 
DataFrame import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.feather_format import read_feather from pandas.io.parsers import read_csv @@ -35,7 +35,7 @@ def test_compressed_urls(salaries_table, mode, engine, compression_only): # test reading compressed urls with various engines and # extension inference - extension = icom._compression_to_extension[compression_only] + extension = _compression_to_extension[compression_only] base_url = ( "https://github.com/pandas-dev/pandas/raw/main/" "pandas/tests/io/parser/data/salaries.csv" diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index f3d41332502af..d6d787df39dfa 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -21,8 +21,8 @@ DatetimeIndex, ) import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.parsers import ( read_csv, read_fwf, @@ -656,7 +656,7 @@ def test_fwf_compression(compression_only, infer): 3333333333""".strip() compression = compression_only - extension = icom._compression_to_extension[compression] + extension = _compression_to_extension[compression] kwargs = {"widths": [5, 5], "names": ["one", "two"]} expected = read_fwf(StringIO(data), **kwargs) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 5ac0197d275dd..a9c48d07fc986 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -15,6 +15,10 @@ import pandas.io.common as icom +_compression_to_extension = { + value: key for key, value in icom._extension_to_compression.items() +} + @pytest.mark.parametrize( "obj", @@ -81,7 +85,7 @@ def test_dataframe_compression_defaults_to_infer( ): # GH22004 input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) - extension = 
icom._compression_to_extension[compression_only] + extension = _compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) @@ -101,7 +105,7 @@ def test_series_compression_defaults_to_infer( ): # GH22004 input = pd.Series([0, 5, -2, 10], name="X") - extension = icom._compression_to_extension[compression_only] + extension = _compression_to_extension[compression_only] with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) if "squeeze" in read_kwargs: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index ef590b2cf48d3..6907d8978e603 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -15,10 +15,9 @@ read_parquet, ) import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension from pandas.util import _test_decorators as td -import pandas.io.common as icom - @pytest.fixture def gcs_buffer(monkeypatch): @@ -157,7 +156,7 @@ def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding) tm.assert_frame_equal(df, read_df) # write compressed file with implicit compression - file_ext = icom._compression_to_extension[compression_only] + file_ext = _compression_to_extension[compression_only] compression["method"] = "infer" path_gcs += f".{file_ext}" df.to_csv(path_gcs, compression=compression, encoding=encoding) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 2f5da142676c3..c21673af2d979 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -21,8 +21,8 @@ Series, ) from pandas.core.indexes.api import ensure_index +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.parsers import read_csv from pandas.io.stata import ( CategoricalConversionWarning, @@ 
-1850,7 +1850,7 @@ def test_compression(compression, version, use_dict, infer): if use_dict: file_ext = compression else: - file_ext = icom._compression_to_extension[compression] + file_ext = _compression_to_extension[compression] file_name += f".{file_ext}" compression_arg = compression if infer: @@ -2005,7 +2005,7 @@ def test_compression_roundtrip(compression): def test_stata_compression(compression_only, read_infer, to_infer): compression = compression_only - ext = icom._compression_to_extension[compression] + ext = _compression_to_extension[compression] filename = f"test.{ext}" df = DataFrame( diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 2b9735f64761c..d3247eb9dd47e 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -17,8 +17,8 @@ Index, ) import pandas._testing as tm +from pandas.tests.io.test_compression import _compression_to_extension -import pandas.io.common as icom from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -1292,7 +1292,7 @@ def test_compression_output(parser, compression_only): def test_filename_and_suffix_comp(parser, compression_only): - compfile = "xml." + icom._compression_to_extension[compression_only] + compfile = "xml." 
+ _compression_to_extension[compression_only] with tm.ensure_clean(filename=compfile) as path: geom_df.to_xml(path, parser=parser, compression=compression_only) From 9458ecbe116c83e04ff2bd111467caa45a16f650 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 13 Apr 2022 14:14:31 +0200 Subject: [PATCH 30/33] revert added "mode" argument --- pandas/io/json/_json.py | 3 +-- pandas/io/pickle.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 1c9de2b53b275..2a9ed9f15cd11 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -81,7 +81,6 @@ def to_json( default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool = False, compression: CompressionOptions = "infer", - mode: str = "w", index: bool = True, indent: int = 0, storage_options: StorageOptions = None, @@ -126,7 +125,7 @@ def to_json( if path_or_buf is not None: # apply compression and byte/text conversion with get_handle( - path_or_buf, mode, compression=compression, storage_options=storage_options + path_or_buf, "w", compression=compression, storage_options=storage_options ) as handles: handles.handle.write(s) else: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 3636ce661fa2e..2928d8c6520b0 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -28,7 +28,6 @@ def to_pickle( obj: Any, filepath_or_buffer: FilePath | WriteBuffer[bytes], compression: CompressionOptions = "infer", - mode: str = "wb", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: @@ -97,7 +96,7 @@ def to_pickle( with get_handle( filepath_or_buffer, - mode, + "wb", compression=compression, is_text=False, storage_options=storage_options, From d20f31505960515bc43fe0074e52ecd141f1940d Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Wed, 13 Apr 2022 14:29:44 +0200 Subject: [PATCH 31/33] add test to ensure that `compression.mode` works --- pandas/tests/io/test_compression.py | 15 
+++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index a9c48d07fc986..9daa9dfd94641 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,3 +1,4 @@ +import gzip import io import os from pathlib import Path @@ -311,3 +312,17 @@ def test_ambiguous_archive_tar(): with pytest.raises(ValueError, match="Multiple files found in TAR archive"): pd.read_csv(tarpath) + + +def test_tar_gz_to_different_filename(): + with tm.ensure_clean(filename=".foo") as file: + pd.DataFrame( + [["1", "2"]], + columns=["foo", "bar"], + ).to_csv(file, compression={"method": "tar", "mode": "w:gz"}, index=False) + with gzip.open(file) as uncompressed: + with tarfile.TarFile(fileobj=uncompressed) as archive: + members = archive.getmembers() + assert len(members) == 1 + content = archive.extractfile(members[0]).read() + assert content == b"foo,bar\n1,2\n" From 0d9ed18d1b9703be8f5545f1462709da0ac71f07 Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Thu, 5 May 2022 15:24:16 +0200 Subject: [PATCH 32/33] compare strings, not bytes --- pandas/tests/io/test_compression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 9daa9dfd94641..a2da8d45171e2 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -324,5 +324,5 @@ def test_tar_gz_to_different_filename(): with tarfile.TarFile(fileobj=uncompressed) as archive: members = archive.getmembers() assert len(members) == 1 - content = archive.extractfile(members[0]).read() - assert content == b"foo,bar\n1,2\n" + content = archive.extractfile(members[0]).read().decode("utf8") + assert content == "foo,bar\n1,2\n" From 37370c2f7802fa52534abbda8a0a92e017d6bb2f Mon Sep 17 00:00:00 2001 From: Simon Knott Date: Fri, 6 May 2022 11:25:34 +0200 Subject: [PATCH 33/33] replace carriage returns --- 
pandas/tests/io/test_compression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index a2da8d45171e2..35749aabdc39f 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -325,4 +325,5 @@ def test_tar_gz_to_different_filename(): members = archive.getmembers() assert len(members) == 1 content = archive.extractfile(members[0]).read().decode("utf8") + content = content.replace("\r\n", "\n") # windows assert content == "foo,bar\n1,2\n"