diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 8e25857e5ad69..2bfc09e52c68b 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -206,6 +206,7 @@ ExtensionArray
 Other
 ^^^^^
 - Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`)
+- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`)
 
 .. _whatsnew_1000.contributors:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index ba1c516b9b444..f785caa392936 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7,7 +7,17 @@
 import pickle
 import re
 from textwrap import dedent
-from typing import Callable, Dict, FrozenSet, List, Optional, Set
+from typing import (
+    Callable,
+    Dict,
+    FrozenSet,
+    Hashable,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Union,
+)
 import warnings
 import weakref
 
@@ -50,7 +60,7 @@
 from pandas.core.dtypes.missing import isna, notna
 
 import pandas as pd
-from pandas._typing import Dtype
+from pandas._typing import Dtype, FilePathOrBuffer
 from pandas.core import missing, nanops
 import pandas.core.algorithms as algos
 from pandas.core.base import PandasObject, SelectionMixin
@@ -122,6 +132,9 @@ def _single_replace(self, to_replace, method, inplace, limit):
     return result
 
 
+bool_t = bool  # Need alias because NDFrame has def bool:
+
+
 class NDFrame(PandasObject, SelectionMixin):
     """
     N-dimensional analogue of DataFrame. Store multi-dimensional in a
@@ -3051,26 +3064,26 @@ def to_latex(
 
     def to_csv(
         self,
-        path_or_buf=None,
-        sep=",",
-        na_rep="",
-        float_format=None,
-        columns=None,
-        header=True,
-        index=True,
-        index_label=None,
-        mode="w",
-        encoding=None,
-        compression="infer",
-        quoting=None,
-        quotechar='"',
-        line_terminator=None,
-        chunksize=None,
-        date_format=None,
-        doublequote=True,
-        escapechar=None,
-        decimal=".",
-    ):
+        path_or_buf: Optional[FilePathOrBuffer] = None,
+        sep: str = ",",
+        na_rep: str = "",
+        float_format: Optional[str] = None,
+        columns: Optional[Sequence[Hashable]] = None,
+        header: Union[bool_t, List[str]] = True,
+        index: bool_t = True,
+        index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None,
+        mode: str = "w",
+        encoding: Optional[str] = None,
+        compression: Optional[Union[str, Dict[str, str]]] = "infer",
+        quoting: Optional[int] = None,
+        quotechar: str = '"',
+        line_terminator: Optional[str] = None,
+        chunksize: Optional[int] = None,
+        date_format: Optional[str] = None,
+        doublequote: bool_t = True,
+        escapechar: Optional[str] = None,
+        decimal: Optional[str] = ".",
+    ) -> Optional[str]:
         r"""
         Write object to a comma-separated values (csv) file.
 
@@ -3117,16 +3130,21 @@ def to_csv(
         encoding : str, optional
            A string representing the encoding to use in the output file,
            defaults to 'utf-8'.
-        compression : str, default 'infer'
-            Compression mode among the following possible values: {'infer',
-            'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`
-            is path-like, then detect compression from the following
-            extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no
-            compression).
-
-            .. versionchanged:: 0.24.0
-
-               'infer' option added and set to default.
+        compression : str or dict, default 'infer'
+            If str, represents compression mode. If dict, value at 'method' is
+            the compression mode. Compression mode may be any of the following
+            possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If
+            compression mode is 'infer' and `path_or_buf` is path-like, then
+            detect compression mode from the following extensions: '.gz',
+            '.bz2', '.zip' or '.xz' (otherwise no compression). If dict given
+            and mode is 'zip' or inferred as 'zip', other entries are passed as
+            additional compression options.
+
+            .. versionchanged:: 1.0.0
+
+               May now be a dict with key 'method' as compression mode
+               and other entries as additional compression options if
+               compression mode is 'zip'.
 
         quoting : optional constant from csv module
             Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
@@ -3171,6 +3189,13 @@
         ...                    'weapon': ['sai', 'bo staff']})
         >>> df.to_csv(index=False)
         'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
+
+        # create 'out.zip' containing 'out.csv'
+        >>> compression_opts = dict(method='zip',
+        ...                         archive_name='out.csv')  # doctest: +SKIP
+
+        >>> df.to_csv('out.zip', index=False,
+        ...           compression=compression_opts)  # doctest: +SKIP
         """
 
         df = self if isinstance(self, ABCDataFrame) else self.to_frame()
@@ -3204,6 +3229,8 @@
         if path_or_buf is None:
             return formatter.path_or_buf.getvalue()
 
+        return None
+
     # ----------------------------------------------------------------------
     # Fancy Indexing
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 26b68dda7b464..290022167e520 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -9,7 +9,19 @@
 import mmap
 import os
 import pathlib
-from typing import IO, AnyStr, BinaryIO, Optional, TextIO, Type
+from typing import (
+    IO,
+    Any,
+    AnyStr,
+    BinaryIO,
+    Dict,
+    List,
+    Optional,
+    TextIO,
+    Tuple,
+    Type,
+    Union,
+)
 from urllib.error import URLError  # noqa
 from urllib.parse import (  # noqa
     urlencode,
@@ -255,6 +267,40 @@ def file_path_to_url(path: str) -> str:
 _compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}
 
 
+def _get_compression_method(
+    compression: Optional[Union[str, Dict[str, str]]]
+) -> Tuple[Optional[str], Dict[str, str]]:
+    """
+    Simplifies a compression argument to a compression method string and
+    a dict containing additional arguments.
+
+    Parameters
+    ----------
+    compression : str or dict
+        If string, specifies the compression method. If dict, value at key
+        'method' specifies compression method.
+
+    Returns
+    -------
+    compression_method : Optional[str]
+    compression_args : Dict[str, str]
+
+    Raises
+    ------
+    ValueError on dict missing the key 'method'.
+    """
+    # Handle dict
+    if isinstance(compression, dict):
+        compression_args = compression.copy()
+        try:
+            compression = compression_args.pop("method")
+        except KeyError:
+            raise ValueError("If dict, compression must have key 'method'")
+    else:
+        compression_args = {}
+    return compression, compression_args
+
+
 def _infer_compression(
     filepath_or_buffer: FilePathOrBuffer, compression: Optional[str]
 ) -> Optional[str]:
@@ -266,8 +312,8 @@ def _infer_compression(
 
     Parameters
     ----------
-    filepath_or_buffer :
-        a path (str) or buffer
+    filepath_or_buffer : str or file handle
+        File path or object.
     compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
         If 'infer' and `filepath_or_buffer` is path-like, then detect
         compression from the following extensions: '.gz', '.bz2', '.zip',
@@ -275,12 +321,11 @@
 
     Returns
     -------
-    string or None :
-        compression method
+    string or None
 
     Raises
     ------
-    ValueError on invalid compression specified
+    ValueError on invalid compression specified.
     """
 
     # No compression has been explicitly specified
@@ -312,32 +357,49 @@
 
 
 def _get_handle(
-    path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True
+    path_or_buf,
+    mode: str,
+    encoding=None,
+    compression: Optional[Union[str, Dict[str, Any]]] = None,
+    memory_map: bool = False,
+    is_text: bool = True,
 ):
     """
     Get file handle for given path/buffer and mode.
 
     Parameters
     ----------
-    path_or_buf :
-        a path (str) or buffer
+    path_or_buf : str or file handle
+        File path or object.
     mode : str
-        mode to open path_or_buf with
+        Mode to open path_or_buf with.
     encoding : str or None
-    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
-        If 'infer' and `filepath_or_buffer` is path-like, then detect
-        compression from the following extensions: '.gz', '.bz2', '.zip',
-        or '.xz' (otherwise no compression).
+        Encoding to use.
+    compression : str or dict, default None
+        If string, specifies compression mode. If dict, value at key 'method'
+        specifies compression mode. Compression mode must be one of {'infer',
+        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
+        and `filepath_or_buffer` is path-like, then detect compression from
+        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
+        no compression). If dict and compression mode is 'zip' or inferred as
+        'zip', other entries are passed as additional compression options.
+
+        .. versionchanged:: 1.0.0
+
+           May now be a dict with key 'method' as compression mode
+           and other keys as compression options if compression
+           mode is 'zip'.
+
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
         whether file/buffer is in text format (csv, json, etc.), or in binary
-        mode (pickle, etc.)
+        mode (pickle, etc.).
 
     Returns
     -------
     f : file-like
-        A file-like object
+        A file-like object.
     handles : list of file-like objects
         A list of file-like object that were opened in this function.
     """
@@ -346,15 +408,16 @@
 
         need_text_wrapping = (BufferedIOBase, S3File)
     except ImportError:
-        need_text_wrapping = BufferedIOBase
+        need_text_wrapping = BufferedIOBase  # type: ignore
 
-    handles = list()
+    handles = list()  # type: List[IO]
     f = path_or_buf
 
     # Convert pathlib.Path/py.path.local or string
     path_or_buf = _stringify_path(path_or_buf)
     is_path = isinstance(path_or_buf, str)
 
+    compression, compression_args = _get_compression_method(compression)
     if is_path:
         compression = _infer_compression(path_or_buf, compression)
 
@@ -376,7 +439,7 @@
 
         # ZIP Compression
         elif compression == "zip":
-            zf = BytesZipFile(path_or_buf, mode)
+            zf = BytesZipFile(path_or_buf, mode, **compression_args)
             # Ensure the container is closed as well.
             handles.append(zf)
             if zf.mode == "w":
@@ -429,9 +492,9 @@
 
     if memory_map and hasattr(f, "fileno"):
         try:
-            g = MMapWrapper(f)
+            wrapped = MMapWrapper(f)
             f.close()
-            f = g
+            f = wrapped
         except Exception:
             # we catch any errors that may have occurred
             # because that is consistent with the lower-level
@@ -456,15 +519,19 @@ def __init__(
         self,
         file: FilePathOrBuffer,
         mode: str,
-        compression: int = zipfile.ZIP_DEFLATED,
+        archive_name: Optional[str] = None,
         **kwargs
     ):
         if mode in ["wb", "rb"]:
             mode = mode.replace("b", "")
-        super().__init__(file, mode, compression, **kwargs)
+        self.archive_name = archive_name
+        super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs)
 
     def write(self, data):
-        super().writestr(self.filename, data)
+        archive_name = self.filename
+        if self.archive_name is not None:
+            archive_name = self.archive_name
+        super().writestr(archive_name, data)
 
     @property
     def closed(self):
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 60daf311397e8..e25862537cbfc 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -22,6 +22,7 @@
 
 from pandas.io.common import (
     UnicodeWriter,
+    _get_compression_method,
     _get_handle,
     _infer_compression,
     get_filepath_or_buffer,
@@ -58,6 +59,9 @@ def __init__(
         if path_or_buf is None:
             path_or_buf = StringIO()
 
+        # Extract compression mode as given, if dict
+        compression, self.compression_args = _get_compression_method(compression)
+
         self.path_or_buf, _, _, _ = get_filepath_or_buffer(
             path_or_buf, encoding=encoding, compression=compression, mode=mode
         )
@@ -178,7 +182,7 @@ def save(self):
                 self.path_or_buf,
                 self.mode,
                 encoding=self.encoding,
-                compression=self.compression,
+                compression=dict(self.compression_args, method=self.compression),
             )
             close = True
 
@@ -206,11 +210,13 @@ def save(self):
             if hasattr(self.path_or_buf, "write"):
                 self.path_or_buf.write(buf)
             else:
+                compression = dict(self.compression_args, method=self.compression)
+
                 f, handles = _get_handle(
                     self.path_or_buf,
                     self.mode,
                     encoding=self.encoding,
-                    compression=self.compression,
+                    compression=compression,
                 )
                 f.write(buf)
                 close = True
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index ee236a8253b01..ab44b8b8059eb 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -514,3 +514,44 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer):
             df.to_csv(path, compression=to_compression)
             result = pd.read_csv(path, index_col=0, compression=read_compression)
             tm.assert_frame_equal(result, df)
+
+    def test_to_csv_compression_dict(self, compression_only):
+        # GH 26023
+        method = compression_only
+        df = DataFrame({"ABC": [1]})
+        filename = "to_csv_compress_as_dict."
+        filename += "gz" if method == "gzip" else method
+        with tm.ensure_clean(filename) as path:
+            df.to_csv(path, compression={"method": method})
+            read_df = pd.read_csv(path, index_col=0)
+            tm.assert_frame_equal(read_df, df)
+
+    def test_to_csv_compression_dict_no_method_raises(self):
+        # GH 26023
+        df = DataFrame({"ABC": [1]})
+        compression = {"some_option": True}
+        msg = "must have key 'method'"
+
+        with tm.ensure_clean("out.zip") as path:
+            with pytest.raises(ValueError, match=msg):
+                df.to_csv(path, compression=compression)
+
+    @pytest.mark.parametrize("compression", ["zip", "infer"])
+    @pytest.mark.parametrize(
+        "archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"]
+    )
+    def test_to_csv_zip_arguments(self, compression, archive_name):
+        # GH 26023
+        from zipfile import ZipFile
+
+        df = DataFrame({"ABC": [1]})
+        with tm.ensure_clean("to_csv_archive_name.zip") as path:
+            df.to_csv(
+                path, compression={"method": compression, "archive_name": archive_name}
+            )
+            zp = ZipFile(path)
+            expected_arcname = path if archive_name is None else archive_name
+            expected_arcname = os.path.basename(expected_arcname)
+            assert len(zp.filelist) == 1
+            archived_file = os.path.basename(zp.filelist[0].filename)
+            assert archived_file == expected_arcname
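Reviewer note (not part of the patch): below is a minimal usage sketch of the behaviour this change adds, mirroring the new docstring example and the ``test_to_csv_zip_arguments`` test. It assumes a pandas build that contains this patch; the file and column names are made up for illustration.

```python
import os
import tempfile
import zipfile

import pandas as pd

df = pd.DataFrame({"name": ["Raphael", "Donatello"], "mask": ["red", "purple"]})

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "out.zip")

    # A dict 'compression' argument: the 'method' key selects the compression
    # mode; the remaining keys are forwarded as compression options. Only the
    # 'zip' method accepts any today, namely 'archive_name', which names the
    # member written inside the archive.
    df.to_csv(
        path,
        index=False,
        compression={"method": "zip", "archive_name": "out.csv"},
    )

    # The archive holds a single member called 'out.csv' rather than 'out.zip'.
    with zipfile.ZipFile(path) as zf:
        assert zf.namelist() == ["out.csv"]

    # A dict without the 'method' key is rejected before anything is written.
    try:
        df.to_csv(path, compression={"archive_name": "out.csv"})
    except ValueError as err:
        print(err)  # If dict, compression must have key 'method'
```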