From 6f627c3ba265b00494ecc98cb33abb5fc46ccde8 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Fri, 12 Jan 2018 23:58:57 +0000 Subject: [PATCH 1/6] API: Add compression argument to Series.to_csv --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/series.py | 13 ++++--- pandas/tests/series/test_io.py | 66 ++++++++++++++++++++++++++++++++- 3 files changed, 74 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a0205a8d64cb7..61d79c943c0e4 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -271,6 +271,7 @@ Other API Changes - :class:`IntervalIndex` and ``IntervalDtype`` no longer support categorical, object, and string subtypes (:issue:`19016`) - The default ``Timedelta`` constructor now accepts an ``ISO 8601 Duration`` string as an argument (:issue:`19040`) - ``IntervalDtype`` now returns ``True`` when compared against ``'interval'`` regardless of subtype, and ``IntervalDtype.name`` now returns ``'interval'`` regardless of subtype (:issue:`18980`) +- :func:`Series.to_csv` now accepts a ``compression`` argument that works in the same way as the ``compression`` argument in :func:`DataFrame.to_csv` (:issue:`18958`) .. _whatsnew_0230.deprecations: diff --git a/pandas/core/series.py b/pandas/core/series.py index 71cded4f9c888..71f32703d48fe 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2881,7 +2881,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, def to_csv(self, path=None, index=True, sep=",", na_rep='', float_format=None, header=False, index_label=None, - mode='w', encoding=None, date_format=None, decimal='.'): + mode='w', encoding=None, compression=None, date_format=None, + decimal='.'): """ Write Series to a comma-separated values (csv) file @@ -2908,6 +2909,10 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', encoding : string, optional a string representing the encoding to use if the contents are non-ascii, for python versions prior to 3 + compression : string, optional + a string representing the compression to use in the output file, + allowed values are 'gzip', 'bz2', 'xz', only used when the first + argument is a filename date_format: string, default None Format string for datetime objects. decimal: string, default '.' @@ -2920,10 +2925,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', result = df.to_csv(path, index=index, sep=sep, na_rep=na_rep, float_format=float_format, header=header, index_label=index_label, mode=mode, - encoding=encoding, date_format=date_format, - decimal=decimal) - if path is None: - return result + encoding=encoding, compression=compression, + date_format=date_format, decimal=decimal) @Appender(generic._shared_docs['to_excel'] % _shared_doc_kwargs) def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index ad51261a47c5c..12f3741881c51 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -8,12 +8,13 @@ import numpy as np import pandas as pd -from pandas import Series, DataFrame +from pandas import Series, DataFrame, compat from pandas.compat import StringIO, u from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm +import pandas.util._test_decorators as td from .common import TestData @@ -138,6 +139,69 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) assert isinstance(csv_str, str) + def test_to_csv_compression_gzip(self): + + s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], + name='X') + + with ensure_clean() as filename: + + s.to_csv(filename, compression="gzip", header=True) + + # test the round trip - to_csv -> read_csv + rs = pd.read_csv(filename, compression="gzip", index_col=0, + squeeze=True) + assert_series_equal(s, rs) + + # explicitly make sure file is gziped + import gzip + f = gzip.open(filename, 'rb') + text = f.read().decode('utf8') + f.close() + assert s.name in text + + def test_to_csv_compression_bz2(self): + + s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], + name='X') + + with ensure_clean() as filename: + + s.to_csv(filename, compression="bz2", header=True) + + # test the round trip - to_csv -> read_csv + rs = pd.read_csv(filename, compression="bz2", index_col=0, + squeeze=True) + assert_series_equal(s, rs) + + # explicitly make sure file is bz2ed + import bz2 + f = bz2.BZ2File(filename, 'rb') + text = f.read().decode('utf8') + f.close() + assert s.name in text + + @td.skip_if_no_lzma + def test_to_csv_compression_xz(self): + + s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], + name='X') + + with ensure_clean() as filename: + + s.to_csv(filename, compression="xz", header=True) + + # test the round trip - to_csv -> read_csv + rs = pd.read_csv(filename, compression="xz", index_col=0, + squeeze=True) + assert_series_equal(s, rs) + + # explicitly make sure file is xzipped + lzma = compat.import_lzma() + f = lzma.open(filename, 'rb') + assert_series_equal(s, pd.read_csv(f, index_col=0, squeeze=True)) + f.close() + class TestSeriesIO(TestData): From 5f181fa64c5649644f900a5ab10495ca3fb40639 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sat, 13 Jan 2018 00:10:14 +0000 Subject: [PATCH 2/6] pep8 --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 71f32703d48fe..a12c36ff0e0fa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2911,7 +2911,7 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', non-ascii, for python versions prior to 3 compression : string, optional a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', 'xz', only used when the first + allowed values are 'gzip', 'bz2', 'xz', only used when the first argument is a filename date_format: string, default None Format string for datetime objects. From 62841731aa9b4600a0ead1f4b5ddc42c9c0a11e0 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sat, 13 Jan 2018 12:27:58 +0000 Subject: [PATCH 3/6] add back in check for path=None --- pandas/core/series.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index a12c36ff0e0fa..4b6e6690eac0a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2927,6 +2927,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', index_label=index_label, mode=mode, encoding=encoding, compression=compression, date_format=date_format, decimal=decimal) + if path is None: + return result @Appender(generic._shared_docs['to_excel'] % _shared_doc_kwargs) def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', From daaeec23a94661b05160dbb0b9b5818cd25fc898 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sat, 13 Jan 2018 23:18:24 +0000 Subject: [PATCH 4/6] try to parametrize tests --- pandas/tests/series/test_io.py | 64 +++++++++------------------------- 1 file changed, 17 insertions(+), 47 deletions(-) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 12f3741881c51..1aba7684b0405 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -4,11 +4,14 @@ from datetime import datetime import collections import pytest +import gzip +import bz2 +import lzma import numpy as np import pandas as pd -from pandas import Series, DataFrame, compat +from pandas import Series, DataFrame from pandas.compat import StringIO, u from pandas.util.testing import (assert_series_equal, assert_almost_equal, @@ -139,67 +142,34 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) assert isinstance(csv_str, str) - def test_to_csv_compression_gzip(self): + @pytest.mark.parametrize('compression, open_func', [ + ('gzip', gzip.open), + ('bz2', bz2.BZ2File), + (pytest.param('xz', lzma.open, marks=td.skip_if_no_lzma)), + ]) + def test_to_csv_compression(self, compression, open_func): s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X') with ensure_clean() as filename: - s.to_csv(filename, compression="gzip", header=True) + s.to_csv(filename, compression=compression, header=True) # test the round trip - to_csv -> read_csv - rs = pd.read_csv(filename, compression="gzip", index_col=0, + rs = pd.read_csv(filename, compression=compression, index_col=0, squeeze=True) assert_series_equal(s, rs) - # explicitly make sure file is gziped - import gzip - f = gzip.open(filename, 'rb') + # explicitly make sure file is compressed + f = open_func(filename, 'rb') text = f.read().decode('utf8') - f.close() - assert s.name in text - - def test_to_csv_compression_bz2(self): - - s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], - name='X') - - with ensure_clean() as filename: - - s.to_csv(filename, compression="bz2", header=True) - - # test the round trip - to_csv -> read_csv - rs = pd.read_csv(filename, compression="bz2", index_col=0, - squeeze=True) - assert_series_equal(s, rs) - - # explicitly make sure file is bz2ed - import bz2 - f = bz2.BZ2File(filename, 'rb') - text = f.read().decode('utf8') - f.close() assert s.name in text - @td.skip_if_no_lzma - def test_to_csv_compression_xz(self): - - s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], - name='X') - - with ensure_clean() as filename: - - s.to_csv(filename, compression="xz", header=True) - - # test the round trip - to_csv -> read_csv - rs = pd.read_csv(filename, compression="xz", index_col=0, - squeeze=True) - assert_series_equal(s, rs) + f = open_func(filename, 'rb') + assert_series_equal(s, pd.read_csv(f, index_col=0, + squeeze=True)) - # explicitly make sure file is xzipped - lzma = compat.import_lzma() - f = lzma.open(filename, 'rb') - assert_series_equal(s, pd.read_csv(f, index_col=0, squeeze=True)) f.close() From 42ceff8fd9db2e0be1f9bb94b1eccd2f6770306c Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sun, 14 Jan 2018 14:19:17 +0000 Subject: [PATCH 5/6] improve parametrized tests --- pandas/tests/series/test_io.py | 45 ++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 1aba7684b0405..b331cd595fd07 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -4,14 +4,11 @@ from datetime import datetime import collections import pytest -import gzip -import bz2 -import lzma import numpy as np import pandas as pd -from pandas import Series, DataFrame +from pandas import Series, DataFrame, compat from pandas.compat import StringIO, u from pandas.util.testing import (assert_series_equal, assert_almost_equal, @@ -142,12 +139,31 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) assert isinstance(csv_str, str) - @pytest.mark.parametrize('compression, open_func', [ - ('gzip', gzip.open), - ('bz2', bz2.BZ2File), - (pytest.param('xz', lzma.open, marks=td.skip_if_no_lzma)), + def decompress_file(self, src_path, compression): + if compression is None: + f = open(src_path, 'rb') + elif compression == 'gzip': + import gzip + f = gzip.open(src_path, 'rb') + elif compression == 'bz2': + import bz2 + f = bz2.BZ2File(src_path, 'rb') + elif compression == 'xz': + lzma = compat.import_lzma() + f = lzma.LZMAFile(src_path, 'rb') + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + + return f + + @pytest.mark.parametrize('compression', [ + None, + 'gzip', + 'bz2', + pytest.param('xz', marks=td.skip_if_no_lzma), ]) - def test_to_csv_compression(self, compression, open_func): + def test_to_csv_compression(self, compression): s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X') @@ -161,15 +177,14 @@ def test_to_csv_compression(self, compression, open_func): squeeze=True) assert_series_equal(s, rs) - # explicitly make sure file is compressed - f = open_func(filename, 'rb') + # explicitly ensure file was compressed + f = self.decompress_file(filename, compression=compression) text = f.read().decode('utf8') assert s.name in text + f.close() - f = open_func(filename, 'rb') - assert_series_equal(s, pd.read_csv(f, index_col=0, - squeeze=True)) - + f = self.decompress_file(filename, compression=compression) + assert_series_equal(s, pd.read_csv(f, index_col=0, squeeze=True)) f.close() From 33a90dcc4c693e5e92e2fb2445196c86d6241e83 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sun, 14 Jan 2018 17:44:51 +0000 Subject: [PATCH 6/6] move decompress_file to util/testing.py --- pandas/tests/series/test_io.py | 24 +++-------------------- pandas/util/testing.py | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index b331cd595fd07..99dcc9272bf11 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from pandas import Series, DataFrame, compat +from pandas import Series, DataFrame from pandas.compat import StringIO, u from pandas.util.testing import (assert_series_equal, assert_almost_equal, @@ -139,24 +139,6 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) assert isinstance(csv_str, str) - def decompress_file(self, src_path, compression): - if compression is None: - f = open(src_path, 'rb') - elif compression == 'gzip': - import gzip - f = gzip.open(src_path, 'rb') - elif compression == 'bz2': - import bz2 - f = bz2.BZ2File(src_path, 'rb') - elif compression == 'xz': - lzma = compat.import_lzma() - f = lzma.LZMAFile(src_path, 'rb') - else: - msg = 'Unrecognized compression type: {}'.format(compression) - raise ValueError(msg) - - return f - @pytest.mark.parametrize('compression', [ None, 'gzip', @@ -178,12 +160,12 @@ def test_to_csv_compression(self, compression): assert_series_equal(s, rs) # explicitly ensure file was compressed - f = self.decompress_file(filename, compression=compression) + f = tm.decompress_file(filename, compression=compression) text = f.read().decode('utf8') assert s.name in text f.close() - f = self.decompress_file(filename, compression=compression) + f = tm.decompress_file(filename, compression=compression) assert_series_equal(s, pd.read_csv(f, index_col=0, squeeze=True)) f.close() diff --git a/pandas/util/testing.py b/pandas/util/testing.py index cd9ebd3017256..1bea25a16ca1e 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -162,6 +162,41 @@ def round_trip_localpath(writer, reader, path=None): return obj +def decompress_file(path, compression): + """ + Open a compressed file and return a file object + + Parameters + ---------- + path : str + The path where the file is read from + + compression : {'gzip', 'bz2', 'xz', None} + Name of the decompression to use + + Returns + ------- + f : file object + """ + + if compression is None: + f = open(path, 'rb') + elif compression == 'gzip': + import gzip + f = gzip.open(path, 'rb') + elif compression == 'bz2': + import bz2 + f = bz2.BZ2File(path, 'rb') + elif compression == 'xz': + lzma = compat.import_lzma() + f = lzma.LZMAFile(path, 'rb') + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + + return f + + def assert_almost_equal(left, right, check_exact=False, check_dtype='equiv', check_less_precise=False, **kwargs):