Skip to content

Commit b41be54

Browse files
committed
to_csv compression may now be dict with possible keys 'method' and 'arcname'
1 parent d238878 commit b41be54

File tree

5 files changed

+95
-52
lines changed

5 files changed

+95
-52
lines changed

doc/source/whatsnew/v0.25.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ Other Enhancements
3636
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
3737
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
3838
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now support quoting column names with backticks to refer to names with spaces (:issue:`6508`)
39-
- :meth:`NDFrame.to_csv` now supports the ``arcname`` argument to specify the written CSV file name when inside a ZIP archive. Default ``arcname=None`` maintains previous behavior where the CSV name matches given ZIP path ``path_or_buf`` (:issue:`26023`)
39+
- :meth:`NDFrame.to_csv` now supports dicts as the ``compression`` argument, with key ``'method'`` being the compression method and optional key ``'arcname'`` specifying the archived CSV file name when the compression method is ``'zip'``. If key ``'arcname'`` is unspecified, or if ``compression='zip'`` is passed as a string, the previous behavior is maintained. (:issue:`26023`)
4040

4141
.. _whatsnew_0250.api_breaking:
4242

pandas/core/generic.py

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2921,7 +2921,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
29212921
mode='w', encoding=None, compression='infer', quoting=None,
29222922
quotechar='"', line_terminator=None, chunksize=None,
29232923
tupleize_cols=None, date_format=None, doublequote=True,
2924-
escapechar=None, decimal='.', arcname=None):
2924+
escapechar=None, decimal='.'):
29252925
r"""
29262926
Write object to a comma-separated values (csv) file.
29272927
@@ -2968,16 +2968,21 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
29682968
encoding : str, optional
29692969
A string representing the encoding to use in the output file,
29702970
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
2971-
compression : str, default 'infer'
2972-
Compression mode among the following possible values: {'infer',
2973-
'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`
2974-
is path-like, then detect compression from the following
2975-
extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no
2976-
compression).
2977-
2978-
.. versionchanged:: 0.24.0
2979-
2980-
'infer' option added and set to default.
2971+
compression : str or dict, default 'infer'
2972+
If str, represents compression mode. If dict, value at 'method' is
2973+
the compression mode. Compression mode may be any of the following
2974+
possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If
2975+
compression mode is 'infer' and `path_or_buf` is path-like, then
2976+
detect compression mode from the following extensions: '.gz',
2977+
'.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
2978+
and mode is 'zip' or inferred as 'zip', optional value at 'arcname'
2979+
specifies the file name within the ZIP archive, defaulting to
2980+
`path_or_buf` if not specified or None.
2981+
2982+
.. versionchanged:: 0.25.0
2983+
2984+
May now be a dict with key 'method' as compression mode
2985+
and 'arcname' as CSV file name if mode is 'zip'
29812986
29822987
quoting : optional constant from csv module
29832988
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
@@ -3011,12 +3016,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
30113016
decimal : str, default '.'
30123017
Character recognized as decimal separator. E.g. use ',' for
30133018
European data.
3014-
arcname : str, default None
3015-
Name of CSV-formatted file within a ZIP archive. Only used when
3016-
`path_or_buf` is a path and `compression` is set to or inferred
3017-
as 'zip'. Uses `path_or_buf` if None.
3018-
3019-
.. versionadded:: 0.25.0
30203019
30213020
Returns
30223021
-------
@@ -3059,8 +3058,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
30593058
tupleize_cols=tupleize_cols,
30603059
date_format=date_format,
30613060
doublequote=doublequote,
3062-
escapechar=escapechar, decimal=decimal,
3063-
arcname=arcname)
3061+
escapechar=escapechar, decimal=decimal)
30643062
formatter.save()
30653063

30663064
if path_or_buf is None:

pandas/io/common.py

Lines changed: 48 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -235,19 +235,26 @@ def file_path_to_url(path):
235235

236236
def _infer_compression(filepath_or_buffer, compression):
237237
"""
238-
Get the compression method for filepath_or_buffer. If compression='infer',
239-
the inferred compression method is returned. Otherwise, the input
238+
Get the compression method for filepath_or_buffer. If compression mode is
239+
'infer', the inferred compression method is returned. Otherwise, the input
240240
compression method is returned unchanged, unless it's invalid, in which
241241
case an error is raised.
242242
243243
Parameters
244244
----------
245245
filepath_or_buffer :
246246
a path (str) or buffer
247-
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
248-
If 'infer' and `filepath_or_buffer` is path-like, then detect
249-
compression from the following extensions: '.gz', '.bz2', '.zip',
250-
or '.xz' (otherwise no compression).
247+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} or dict
248+
If string, specifies compression mode. If dict, value at key 'method'
249+
specifies compression mode. If compression mode is 'infer' and
250+
`filepath_or_buffer` is path-like, then detect compression from the
251+
following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
252+
compression).
253+
254+
.. versionchanged:: 0.25.0
255+
256+
May now be a dict with required key 'method' specifying compression
257+
mode
251258
252259
Returns
253260
-------
@@ -259,6 +266,14 @@ def _infer_compression(filepath_or_buffer, compression):
259266
ValueError on invalid compression specified
260267
"""
261268

269+
# Handle compression method as dict
270+
if isinstance(compression, dict):
271+
try:
272+
compression = compression['method']
273+
except KeyError:
274+
raise ValueError("Compression dict must have key "
275+
"'method'")
276+
262277
# No compression has been explicitly specified
263278
if compression is None:
264279
return None
@@ -288,7 +303,7 @@ def _infer_compression(filepath_or_buffer, compression):
288303

289304

290305
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
291-
memory_map=False, is_text=True, arcname=None):
306+
memory_map=False, is_text=True):
292307
"""
293308
Get file handle for given path/buffer and mode.
294309
@@ -299,10 +314,21 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
299314
mode : str
300315
mode to open path_or_buf with
301316
encoding : str or None
302-
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
303-
If 'infer' and `filepath_or_buffer` is path-like, then detect
304-
compression from the following extensions: '.gz', '.bz2', '.zip',
305-
or '.xz' (otherwise no compression).
317+
compression : str or dict, default None
318+
If string, specifies compression mode. If dict, value at key 'method'
319+
specifies compression mode. Compression mode must be one of {'infer',
320+
'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
321+
and `path_or_buf` is path-like, then detect compression from
322+
the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
323+
no compression). If dict and compression mode is 'zip' or inferred as
324+
'zip', optional value at key 'arcname' specifies the name of the file
325+
within ZIP archive at `path_or_buf`.
326+
327+
.. versionchanged:: 0.25.0
328+
329+
May now be a dict with key 'method' as compression mode
330+
and 'arcname' as CSV file name if mode is 'zip'
331+
306332
memory_map : boolean, default False
307333
See parsers._parser_params for more information.
308334
is_text : boolean, default True
@@ -329,27 +355,31 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
329355
path_or_buf = _stringify_path(path_or_buf)
330356
is_path = isinstance(path_or_buf, str)
331357

358+
compression_method = None
332359
if is_path:
333-
compression = _infer_compression(path_or_buf, compression)
360+
compression_method = _infer_compression(path_or_buf, compression)
334361

335-
if compression:
362+
if compression_method:
336363

337364
# GZ Compression
338-
if compression == 'gzip':
365+
if compression_method == 'gzip':
339366
if is_path:
340367
f = gzip.open(path_or_buf, mode)
341368
else:
342369
f = gzip.GzipFile(fileobj=path_or_buf)
343370

344371
# BZ Compression
345-
elif compression == 'bz2':
372+
elif compression_method == 'bz2':
346373
if is_path:
347374
f = bz2.BZ2File(path_or_buf, mode)
348375
else:
349376
f = bz2.BZ2File(path_or_buf)
350377

351378
# ZIP Compression
352-
elif compression == 'zip':
379+
elif compression_method == 'zip':
380+
arcname = None
381+
if isinstance(compression, dict) and 'arcname' in compression:
382+
arcname = compression['arcname']
353383
zf = BytesZipFile(path_or_buf, mode, arcname=arcname)
354384
# Ensure the container is closed as well.
355385
handles.append(zf)
@@ -368,14 +398,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
368398
.format(zip_names))
369399

370400
# XZ Compression
371-
elif compression == 'xz':
401+
elif compression_method == 'xz':
372402
f = lzma.LZMAFile(path_or_buf, mode)
373403

374-
# Unrecognized Compression
375-
else:
376-
msg = 'Unrecognized compression type: {}'.format(compression)
377-
raise ValueError(msg)
378-
379404
handles.append(f)
380405

381406
elif is_path:
@@ -391,7 +416,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
391416
handles.append(f)
392417

393418
# Convert BytesIO or file objects passed with an encoding
394-
if is_text and (compression or isinstance(f, need_text_wrapping)):
419+
if is_text and (compression_method or isinstance(f, need_text_wrapping)):
395420
from io import TextIOWrapper
396421
f = TextIOWrapper(f, encoding=encoding, newline='')
397422
handles.append(f)

pandas/io/formats/csvs.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,27 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
2929
compression='infer', quoting=None, line_terminator='\n',
3030
chunksize=None, tupleize_cols=False, quotechar='"',
3131
date_format=None, doublequote=True, escapechar=None,
32-
decimal='.', arcname=None):
32+
decimal='.'):
3333

3434
self.obj = obj
3535

3636
if path_or_buf is None:
3737
path_or_buf = StringIO()
3838

39+
self._compression_arg = compression
40+
compression_mode = compression
41+
42+
# Extract compression mode as given, if dict
43+
if isinstance(compression, dict):
44+
try:
45+
compression_mode = compression['method']
46+
except KeyError:
47+
raise ValueError("If dict, compression must have key "
48+
"'method'")
49+
3950
self.path_or_buf, _, _, _ = get_filepath_or_buffer(
40-
path_or_buf, encoding=encoding, compression=compression, mode=mode
51+
path_or_buf, encoding=encoding,
52+
compression=compression_mode, mode=mode
4153
)
4254
self.sep = sep
4355
self.na_rep = na_rep
@@ -123,8 +135,6 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
123135
if not index:
124136
self.nlevels = 0
125137

126-
self.arcname = arcname
127-
128138
def save(self):
129139
"""
130140
Create the writer & save
@@ -152,7 +162,7 @@ def save(self):
152162
else:
153163
f, handles = _get_handle(self.path_or_buf, self.mode,
154164
encoding=self.encoding,
155-
compression=self.compression)
165+
compression=self._compression_arg)
156166
close = True
157167

158168
try:
@@ -178,8 +188,7 @@ def save(self):
178188
else:
179189
f, handles = _get_handle(self.path_or_buf, self.mode,
180190
encoding=self.encoding,
181-
compression=self.compression,
182-
arcname=self.arcname)
191+
compression=self._compression_arg)
183192
f.write(buf)
184193
close = True
185194
if close:

pandas/tests/io/formats/test_to_csv.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,17 @@ def test_to_csv_compression(self, compression_only,
538538
compression=read_compression)
539539
tm.assert_frame_equal(result, df)
540540

541+
@pytest.mark.parametrize("method", ["gzip", "bz2", "zip", "xz"])
542+
def test_to_csv_compression_dict(self, method):
543+
# GH 26023
544+
df = DataFrame({"ABC": [1]})
545+
filename = "to_csv_compress_as_dict."
546+
filename += "gz" if method == "gzip" else method
547+
with tm.ensure_clean(filename) as path:
548+
df.to_csv(path, compression={"method": method})
549+
read_df = pd.read_csv(path, index_col=0)
550+
tm.assert_frame_equal(read_df, df)
551+
541552
@pytest.mark.parametrize("compression", ["zip", "infer"])
542553
@pytest.mark.parametrize("arcname", [None, "test_to_csv.csv",
543554
"test_to_csv.zip"])
@@ -547,8 +558,8 @@ def test_to_csv_zip_arcname(self, compression, arcname):
547558

548559
df = DataFrame({"ABC": [1]})
549560
with tm.ensure_clean("to_csv_arcname.zip") as path:
550-
df.to_csv(path, compression=compression,
551-
arcname=arcname)
561+
df.to_csv(path, compression={"method": compression,
562+
"arcname": arcname})
552563
zp = ZipFile(path)
553564
expected_arcname = path if arcname is None else arcname
554565
expected_arcname = os.path.basename(expected_arcname)

0 commit comments

Comments
 (0)