to_csv compression may now be dict with possible keys 'method' and 'arcname'

drew-heenan · drew-heenan · commit b41be549e093 · 2019-04-09T03:07:03.000-04:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -36,7 +36,7 @@ Other Enhancements
 - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
 - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
 - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)
-- :meth:`NDFrame.to_csv` now supports the ``arcname`` argument to specify the written CSV file name when inside a ZIP archive. Default ``arcname=None`` maintains previous behavior where the CSV name matches given ZIP path ``path_or_buf`` (:issue:`26023`)
+- :meth:`NDFrame.to_csv` now accepts a dict as the ``compression`` argument, with key ``'method'`` giving the compression method and optional key ``'arcname'`` specifying the archived CSV file name when the compression method is ``'zip'``. If key ``'arcname'`` is unspecified, or ``compression`` is passed as the string ``'zip'``, the previous behavior is maintained (:issue:`26023`)
 
 .. _whatsnew_0250.api_breaking:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2921,7 +2921,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
                mode='w', encoding=None, compression='infer', quoting=None,
                quotechar='"', line_terminator=None, chunksize=None,
                tupleize_cols=None, date_format=None, doublequote=True,
-               escapechar=None, decimal='.', arcname=None):
+               escapechar=None, decimal='.'):
         r"""
         Write object to a comma-separated values (csv) file.
 
@@ -2968,16 +2968,21 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
         encoding : str, optional
             A string representing the encoding to use in the output file,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
-        compression : str, default 'infer'
-            Compression mode among the following possible values: {'infer',
-            'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`
-            is path-like, then detect compression from the following
-            extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no
-            compression).
-
-            .. versionchanged:: 0.24.0
-
-               'infer' option added and set to default.
+        compression : str or dict, default 'infer'
+            If str, represents compression mode. If dict, value at 'method' is
+            the compression mode. Compression mode may be any of the following
+            possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If
+            compression mode is 'infer' and `path_or_buf` is path-like, then
+            detect compression mode from the following extensions: '.gz',
+            '.bz2', '.zip' or '.xz' (otherwise no compression). If a dict is
+            given and the mode is 'zip' or inferred as 'zip', the optional
+            value at 'arcname' specifies the name of the file within the ZIP
+            archive; it defaults to `path_or_buf` if not specified or None.
+
+            .. versionchanged:: 0.25.0
+
+               May now be a dict with key 'method' as compression mode
+               and 'arcname' as CSV file name if mode is 'zip'
 
         quoting : optional constant from csv module
             Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
@@ -3011,12 +3016,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
         decimal : str, default '.'
             Character recognized as decimal separator. E.g. use ',' for
             European data.
-        arcname : str, default None
-            Name of CSV-formatted file within a ZIP archive. Only used when
-            `path_or_buf` is a path and `compression` is set to or inferred
-            as 'zip'. Uses `path_or_buf` if None.
-
-            .. versionadded:: 0.25.0
 
         Returns
         -------
@@ -3059,8 +3058,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
                                  tupleize_cols=tupleize_cols,
                                  date_format=date_format,
                                  doublequote=doublequote,
-                                 escapechar=escapechar, decimal=decimal,
-                                 arcname=arcname)
+                                 escapechar=escapechar, decimal=decimal)
         formatter.save()
 
         if path_or_buf is None:
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -235,19 +235,26 @@ def file_path_to_url(path):
 
 def _infer_compression(filepath_or_buffer, compression):
     """
-    Get the compression method for filepath_or_buffer. If compression='infer',
-    the inferred compression method is returned. Otherwise, the input
+    Get the compression method for filepath_or_buffer. If compression mode is
+    'infer', the inferred compression method is returned. Otherwise, the input
     compression method is returned unchanged, unless it's invalid, in which
     case an error is raised.
 
     Parameters
     ----------
     filepath_or_buffer :
         a path (str) or buffer
-    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
-        If 'infer' and `filepath_or_buffer` is path-like, then detect
-        compression from the following extensions: '.gz', '.bz2', '.zip',
-        or '.xz' (otherwise no compression).
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} or dict
+        If string, specifies compression mode. If dict, value at key 'method'
+        specifies compression mode. If compression mode is 'infer' and
+        `filepath_or_buffer` is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
+        compression).
+
+        .. versionchanged:: 0.25.0
+
+           May now be a dict with required key 'method' specifying
+           compression mode.
 
     Returns
     -------
@@ -259,6 +266,14 @@ def _infer_compression(filepath_or_buffer, compression):
     ValueError on invalid compression specified
     """
 
+    # Handle compression method as dict
+    if isinstance(compression, dict):
+        try:
+            compression = compression['method']
+        except KeyError:
+            raise ValueError("Compression dict must have key "
+                             "'method'")
+
     # No compression has been explicitly specified
     if compression is None:
         return None
@@ -288,7 +303,7 @@ def _infer_compression(filepath_or_buffer, compression):
 
 
 def _get_handle(path_or_buf, mode, encoding=None, compression=None,
-                memory_map=False, is_text=True, arcname=None):
+                memory_map=False, is_text=True):
     """
     Get file handle for given path/buffer and mode.
 
@@ -299,10 +314,21 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     mode : str
         mode to open path_or_buf with
     encoding : str or None
-    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
-        If 'infer' and `filepath_or_buffer` is path-like, then detect
-        compression from the following extensions: '.gz', '.bz2', '.zip',
-        or '.xz' (otherwise no compression).
+    compression : str or dict, default None
+        If string, specifies compression mode. If dict, value at key 'method'
+        specifies compression mode. Compression mode must be one of {'infer',
+        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
+        and `path_or_buf` is path-like, then detect compression from
+        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
+        no compression). If dict and compression mode is 'zip' or inferred as
+        'zip', optional value at key 'arcname' specifies the name of the file
+        within ZIP archive at `path_or_buf`.
+
+        .. versionchanged:: 0.25.0
+
+           May now be a dict with key 'method' as compression mode
+           and 'arcname' as CSV file name if mode is 'zip'
+
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
@@ -329,27 +355,31 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     path_or_buf = _stringify_path(path_or_buf)
     is_path = isinstance(path_or_buf, str)
 
+    compression_method = None
     if is_path:
-        compression = _infer_compression(path_or_buf, compression)
+        compression_method = _infer_compression(path_or_buf, compression)
 
-    if compression:
+    if compression_method:
 
         # GZ Compression
-        if compression == 'gzip':
+        if compression_method == 'gzip':
             if is_path:
                 f = gzip.open(path_or_buf, mode)
             else:
                 f = gzip.GzipFile(fileobj=path_or_buf)
 
         # BZ Compression
-        elif compression == 'bz2':
+        elif compression_method == 'bz2':
             if is_path:
                 f = bz2.BZ2File(path_or_buf, mode)
             else:
                 f = bz2.BZ2File(path_or_buf)
 
         # ZIP Compression
-        elif compression == 'zip':
+        elif compression_method == 'zip':
+            arcname = None
+            if isinstance(compression, dict) and 'arcname' in compression:
+                arcname = compression['arcname']
             zf = BytesZipFile(path_or_buf, mode, arcname=arcname)
             # Ensure the container is closed as well.
             handles.append(zf)
@@ -368,14 +398,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                                      .format(zip_names))
 
         # XZ Compression
-        elif compression == 'xz':
+        elif compression_method == 'xz':
             f = lzma.LZMAFile(path_or_buf, mode)
 
-        # Unrecognized Compression
-        else:
-            msg = 'Unrecognized compression type: {}'.format(compression)
-            raise ValueError(msg)
-
         handles.append(f)
 
     elif is_path:
@@ -391,7 +416,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
         handles.append(f)
 
     # Convert BytesIO or file objects passed with an encoding
-    if is_text and (compression or isinstance(f, need_text_wrapping)):
+    if is_text and (compression_method or isinstance(f, need_text_wrapping)):
         from io import TextIOWrapper
         f = TextIOWrapper(f, encoding=encoding, newline='')
         handles.append(f)
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -29,15 +29,27 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                  compression='infer', quoting=None, line_terminator='\n',
                  chunksize=None, tupleize_cols=False, quotechar='"',
                  date_format=None, doublequote=True, escapechar=None,
-                 decimal='.', arcname=None):
+                 decimal='.'):
 
         self.obj = obj
 
         if path_or_buf is None:
             path_or_buf = StringIO()
 
+        self._compression_arg = compression
+        compression_mode = compression
+
+        # Extract compression mode as given, if dict
+        if isinstance(compression, dict):
+            try:
+                compression_mode = compression['method']
+            except KeyError:
+                raise ValueError("If dict, compression must have key "
+                                 "'method'")
+
         self.path_or_buf, _, _, _ = get_filepath_or_buffer(
-            path_or_buf, encoding=encoding, compression=compression, mode=mode
+            path_or_buf, encoding=encoding,
+            compression=compression_mode, mode=mode
         )
         self.sep = sep
         self.na_rep = na_rep
@@ -123,8 +135,6 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
         if not index:
             self.nlevels = 0
 
-        self.arcname = arcname
-
     def save(self):
         """
         Create the writer & save
@@ -152,7 +162,7 @@ def save(self):
         else:
             f, handles = _get_handle(self.path_or_buf, self.mode,
                                      encoding=self.encoding,
-                                     compression=self.compression)
+                                     compression=self._compression_arg)
             close = True
 
         try:
@@ -178,8 +188,7 @@ def save(self):
                 else:
                     f, handles = _get_handle(self.path_or_buf, self.mode,
                                              encoding=self.encoding,
-                                             compression=self.compression,
-                                             arcname=self.arcname)
+                                             compression=self._compression_arg)
                     f.write(buf)
                     close = True
             if close:
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
@@ -538,6 +538,17 @@ def test_to_csv_compression(self, compression_only,
                                  compression=read_compression)
             tm.assert_frame_equal(result, df)
 
+    @pytest.mark.parametrize("method", ["gzip", "bz2", "zip", "xz"])
+    def test_to_csv_compression_dict(self, method):
+        # GH 26023
+        df = DataFrame({"ABC": [1]})
+        filename = "to_csv_compress_as_dict."
+        filename += "gz" if method == "gzip" else method
+        with tm.ensure_clean(filename) as path:
+            df.to_csv(path, compression={"method": method})
+            read_df = pd.read_csv(path, index_col=0)
+            tm.assert_frame_equal(read_df, df)
+
     @pytest.mark.parametrize("compression", ["zip", "infer"])
     @pytest.mark.parametrize("arcname", [None, "test_to_csv.csv",
                                          "test_to_csv.zip"])
@@ -547,8 +558,8 @@ def test_to_csv_zip_arcname(self, compression, arcname):
 
         df = DataFrame({"ABC": [1]})
         with tm.ensure_clean("to_csv_arcname.zip") as path:
-            df.to_csv(path, compression=compression,
-                      arcname=arcname)
+            df.to_csv(path, compression={"method": compression,
+                                         "arcname": arcname})
             zp = ZipFile(path)
             expected_arcname = path if arcname is None else arcname
             expected_arcname = os.path.basename(expected_arcname)