Skip to content

Commit d3fe3fb

Browse files
committed
ENH: Support passing compression args to gzip and bz2
This commit closes GH33196 but takes a more generic approach than the suggested solution. Instead of providing a 'fast' kwarg or a global compression level setting, this commit extends the ability to pass compression settings as a dict to the gzip and bz2 compression methods. In this way, if the user wants faster compression, they can pass compression={'method': 'gzip', 'compresslevel': 1} rather than just compression='gzip'. Note: For the API to be consistent when passing paths vs. file-like objects, gzip.GzipFile and gzip.open() must accept the same kwargs.
1 parent 06f4c90 commit d3fe3fb

File tree

4 files changed

+44
-8
lines changed

4 files changed

+44
-8
lines changed

doc/source/whatsnew/v1.1.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ Other enhancements
8888
- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`).
8989
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
9090
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable; pass `sort=False` to suppress this warning (:issue:`33015`)
91-
-
91+
- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`, and :meth:`DataFrame.to_json` now support passing a dict of compression arguments when using the ``gzip`` and ``bz2`` protocols. This can be used to set a custom compression level, e.g., ``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})`` (:issue:`33196`)
9292

9393
.. ---------------------------------------------------------------------------
9494

pandas/core/generic.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3096,7 +3096,8 @@ def to_csv(
30963096
compression mode is 'infer' and `path_or_buf` is path-like, then
30973097
detect compression mode from the following extensions: '.gz',
30983098
'.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
3099-
and mode is 'zip' or inferred as 'zip', other entries passed as
3099+
and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as
3100+
one of the above, other entries passed as
31003101
additional compression options.
31013102
31023103
.. versionchanged:: 1.0.0
@@ -3105,6 +3106,12 @@ def to_csv(
31053106
and other entries as additional compression options if
31063107
compression mode is 'zip'.
31073108
3109+
.. versionchanged:: 1.1.0
3110+
3111+
Passing compression options as keys in dict is
3112+
supported for compression modes 'gzip' and 'bz2'
3113+
as well as 'zip'.
3114+
31083115
quoting : optional constant from csv module
31093116
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
31103117
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC

pandas/io/common.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -351,15 +351,21 @@ def get_handle(
351351
'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
352352
and `filepath_or_buffer` is path-like, then detect compression from
353353
the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
354-
no compression). If dict and compression mode is 'zip' or inferred as
355-
'zip', other entries passed as additional compression options.
354+
no compression). If dict and compression mode is one of
355+
{'zip', 'gzip', 'bz2'}, or inferred as one of the above,
356+
other entries passed as additional compression options.
356357
357358
.. versionchanged:: 1.0.0
358359
359360
May now be a dict with key 'method' as compression mode
360361
and other keys as compression options if compression
361362
mode is 'zip'.
362363
364+
.. versionchanged:: 1.1.0
365+
366+
Passing compression options as keys in dict is now
367+
supported for compression modes 'gzip' and 'bz2' as well as 'zip'.
368+
363369
memory_map : boolean, default False
364370
See parsers._parser_params for more information.
365371
is_text : boolean, default True
@@ -397,16 +403,16 @@ def get_handle(
397403
# GZ Compression
398404
if compression == "gzip":
399405
if is_path:
400-
f = gzip.open(path_or_buf, mode)
406+
f = gzip.open(path_or_buf, mode, **compression_args)
401407
else:
402-
f = gzip.GzipFile(fileobj=path_or_buf)
408+
f = gzip.GzipFile(fileobj=path_or_buf, **compression_args)
403409

404410
# BZ Compression
405411
elif compression == "bz2":
406412
if is_path:
407-
f = bz2.BZ2File(path_or_buf, mode)
413+
f = bz2.BZ2File(path_or_buf, mode, **compression_args)
408414
else:
409-
f = bz2.BZ2File(path_or_buf)
415+
f = bz2.BZ2File(path_or_buf, **compression_args)
410416

411417
# ZIP Compression
412418
elif compression == "zip":

pandas/tests/io/test_compression.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,26 @@ def test_with_missing_lzma_runtime():
143143
"""
144144
)
145145
subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
146+
147+
148+
@pytest.mark.parametrize(
149+
"obj",
150+
[
151+
pd.DataFrame(
152+
100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
153+
columns=["X", "Y", "Z"],
154+
),
155+
pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
156+
],
157+
)
158+
@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
159+
def test_gzip_compression_level_path(obj, method):
160+
"""GH#33398 Ideally this test should be repeated for bz2 as well,
161+
but that is not practical because a file size of >100k is needed to see any
162+
size difference between bz2 compression settings."""
163+
with tm.ensure_clean() as path:
164+
getattr(obj, method)(path, compression="gzip")
165+
compressed_size_default = os.path.getsize(path)
166+
getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
167+
compressed_size_fast = os.path.getsize(path)
168+
assert compressed_size_default < compressed_size_fast

0 commit comments

Comments
 (0)