From 86d2f72b43e67db2fadc0391fb9b064e79446116 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 20 May 2018 17:26:55 +0100 Subject: [PATCH 01/12] set keyword argument so zipfile actually compresses --- pandas/io/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 0827216975f15..a492b7c0b8e8e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,7 +5,7 @@ import codecs import mmap from contextlib import contextmanager, closing -from zipfile import ZipFile +import zipfile from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat @@ -428,7 +428,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, return f, handles -class BytesZipFile(ZipFile, BytesIO): +class BytesZipFile(zipfile.ZipFile, BytesIO): """ Wrapper for standard library class ZipFile and allow the returned file-like handle to accept byte strings via `write` method. @@ -437,10 +437,10 @@ class BytesZipFile(ZipFile, BytesIO): bytes strings into a member of the archive. """ # GH 17778 - def __init__(self, file, mode='r', **kwargs): + def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') - super(BytesZipFile, self).__init__(file, mode, **kwargs) + super(BytesZipFile, self).__init__(file, mode, compression, **kwargs) def write(self, data): super(BytesZipFile, self).writestr(self.filename, data) From 498451d5c479c066a50b9ff45e0685cf24b254f7 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 25 May 2018 23:58:39 +0100 Subject: [PATCH 02/12] add compression size test case --- pandas/tests/frame/test_to_csv.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index e4829ebf48561..cfcb9d1257a86 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -943,6 +943,22 @@ def test_to_csv_compression(self, compression): with tm.decompress_file(filename, compression) as fh: assert_frame_equal(df, read_csv(fh, index_col=0)) + def test_to_csv_compression_size(self, compression): + + df = pd.concat(100 * [DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z'])]) + + with ensure_clean() as filename: + import os + df.to_csv(filename, compression=compression) + file_size = os.path.getsize(filename) + + if compression: + df.to_csv(filename, compression=None) + uncompressed_file_size = os.path.getsize(filename) + assert uncompressed_file_size > file_size + def test_to_csv_date_format(self): with ensure_clean('__tmp_to_csv_date_format__') as path: dt_index = self.tsframe.index From 012383f7c443ee710e7bccf9d5a109f7b86743c4 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 25 May 2018 23:58:47 +0100 Subject: [PATCH 03/12] add compression size test case --- pandas/tests/io/json/test_compression.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c9074ca49e5be..c482d7272d29d 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -21,6 +21,24 @@ def test_compression_roundtrip(compression): assert_frame_equal(df, pd.read_json(result)) +def test_to_csv_compression_size(compression): + + df = pd.concat(100 * [pd.DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z'])], + ignore_index=True) + + with tm.ensure_clean() as filename: + import os + df.to_json(filename, compression=compression) + file_size = os.path.getsize(filename) + + if compression: + df.to_json(filename, compression=None) + uncompressed_file_size = os.path.getsize(filename) + assert uncompressed_file_size > file_size + + def test_read_zipped_json(): uncompressed_path = tm.get_data_path("tsframe_v012.json") uncompressed_df = pd.read_json(uncompressed_path) From a79014845e41751407db94cbf47676342b5b0829 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 26 May 2018 00:07:08 +0100 Subject: [PATCH 04/12] add compression size test case --- pandas/tests/io/test_pickle.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index fbe2174e603e2..dcb3494136a36 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -457,6 +457,21 @@ def test_read_infer(self, ext, get_random_path): tm.assert_frame_equal(df, df2) + def test_compression_size(self, compression): + + df = pd.concat(100 * [pd.DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z'])]) + + with tm.ensure_clean() as filename: + import os + df.to_pickle(filename, compression=compression) + file_size = os.path.getsize(filename) + + if compression: + df.to_pickle(filename, compression=None) + uncompressed_file_size = os.path.getsize(filename) + assert uncompressed_file_size > file_size # --------------------- # test pickle compression From 42f5c3200fac4e1d0420eb04be25b2a870606b72 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 26 May 2018 00:13:56 +0100 Subject: [PATCH 05/12] add compression size test case --- pandas/tests/series/test_io.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0b0d4334c86a3..1bf2477537c7a 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -162,6 +162,20 @@ def test_to_csv_compression(self, compression): index_col=0, squeeze=True)) + def test_to_csv_compression_size(self, compression): + + s = Series(100 * [0.123456, 0.234567, 0.567567], name='X') + + with ensure_clean() as filename: + import os + s.to_csv(filename, compression=compression) + file_size = os.path.getsize(filename) + + if compression: + s.to_csv(filename, compression=None) + uncompressed_file_size = os.path.getsize(filename) + assert uncompressed_file_size > file_size + class TestSeriesIO(TestData): From 74b8c3454d8cecd4903a10d220b5566b396ccd12 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 26 May 2018 00:36:07 +0100 Subject: [PATCH 06/12] update whatsnew --- doc/source/whatsnew/v0.23.1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 5a553264e828b..af1f34aa6bca9 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -75,7 +75,7 @@ Indexing I/O ^^^ -- +- Bug in :class:`pandas.io.common.BytesZipFile` where zip compression produces uncompressed zip archive (:issue:`17778`) - Plotting From f31fc3d8ec1b5c8c3f7acc5dafc9d72f0fd852f4 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 26 May 2018 13:49:32 +0100 Subject: [PATCH 07/12] E302 expect 2 blank lines --- pandas/tests/io/test_pickle.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index dcb3494136a36..c17a02c7fd7c2 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -473,6 +473,7 @@ def test_compression_size(self, compression): uncompressed_file_size = os.path.getsize(filename) assert uncompressed_file_size > file_size + # --------------------- # test pickle compression # --------------------- From 3a29ab3a01084ff0fa72d53a156179c62e4a84de Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 26 May 2018 15:59:12 +0100 Subject: [PATCH 08/12] minor refactor of tests --- pandas/tests/io/json/test_compression.py | 2 +- pandas/tests/io/test_pickle.py | 1 - pandas/tests/series/test_io.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c482d7272d29d..c425425c21ecc 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -21,7 +21,7 @@ def test_compression_roundtrip(compression): assert_frame_equal(df, pd.read_json(result)) -def test_to_csv_compression_size(compression): +def test_to_json_compression_size(compression): df = pd.concat(100 * [pd.DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index c17a02c7fd7c2..05bdb3f5d2e7d 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -464,7 +464,6 @@ def test_compression_size(self, compression): columns=['X', 'Y', 'Z'])]) with tm.ensure_clean() as filename: - import os df.to_pickle(filename, compression=compression) file_size = os.path.getsize(filename) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 471876be82538..f3c9803ffb79e 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -167,11 +167,11 @@ def test_to_csv_compression_size(self, compression): with ensure_clean() as filename: import os - s.to_csv(filename, compression=compression) + s.to_csv(filename, compression=compression, header=True) file_size = os.path.getsize(filename) if compression: - s.to_csv(filename, compression=None) + s.to_csv(filename, compression=None, header=True) uncompressed_file_size = os.path.getsize(filename) assert uncompressed_file_size > file_size From 4775cac0181101c1890d56438436efc1430d1a43 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 27 May 2018 14:23:26 +0100 Subject: [PATCH 09/12] refactor tests --- pandas/tests/frame/test_to_csv.py | 16 ---------------- pandas/tests/io/json/test_compression.py | 18 ------------------ pandas/tests/io/test_pickle.py | 15 --------------- pandas/tests/series/test_io.py | 14 -------------- pandas/tests/test_common.py | 23 ++++++++++++++++++++++- 5 files changed, 22 insertions(+), 64 deletions(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index cfcb9d1257a86..e4829ebf48561 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -943,22 +943,6 @@ def test_to_csv_compression(self, compression): with tm.decompress_file(filename, compression) as fh: assert_frame_equal(df, read_csv(fh, index_col=0)) - def test_to_csv_compression_size(self, compression): - - df = pd.concat(100 * [DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z'])]) - - with ensure_clean() as filename: - import os - df.to_csv(filename, compression=compression) - file_size = os.path.getsize(filename) - - if compression: - df.to_csv(filename, compression=None) - uncompressed_file_size = os.path.getsize(filename) - assert uncompressed_file_size > file_size - def test_to_csv_date_format(self): with ensure_clean('__tmp_to_csv_date_format__') as path: dt_index = self.tsframe.index diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c425425c21ecc..c9074ca49e5be 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -21,24 +21,6 @@ def test_compression_roundtrip(compression): assert_frame_equal(df, pd.read_json(result)) -def test_to_json_compression_size(compression): - - df = pd.concat(100 * [pd.DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z'])], - ignore_index=True) - - with tm.ensure_clean() as filename: - import os - df.to_json(filename, compression=compression) - file_size = os.path.getsize(filename) - - if compression: - df.to_json(filename, compression=None) - uncompressed_file_size = os.path.getsize(filename) - assert uncompressed_file_size > file_size - - def test_read_zipped_json(): uncompressed_path = tm.get_data_path("tsframe_v012.json") uncompressed_df = pd.read_json(uncompressed_path) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 05bdb3f5d2e7d..fbe2174e603e2 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -457,21 +457,6 @@ def test_read_infer(self, ext, get_random_path): tm.assert_frame_equal(df, df2) - def test_compression_size(self, compression): - - df = pd.concat(100 * [pd.DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z'])]) - - with tm.ensure_clean() as filename: - df.to_pickle(filename, compression=compression) - file_size = os.path.getsize(filename) - - if compression: - df.to_pickle(filename, compression=None) - uncompressed_file_size = os.path.getsize(filename) - assert uncompressed_file_size > file_size - # --------------------- # test pickle compression diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index f3c9803ffb79e..e369dfda6deac 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -161,20 +161,6 @@ def test_to_csv_compression(self, compression): index_col=0, squeeze=True)) - def test_to_csv_compression_size(self, compression): - - s = Series(100 * [0.123456, 0.234567, 0.567567], name='X') - - with ensure_clean() as filename: - import os - s.to_csv(filename, compression=compression, header=True) - file_size = os.path.getsize(filename) - - if compression: - s.to_csv(filename, compression=None, header=True) - uncompressed_file_size = os.path.getsize(filename) - assert uncompressed_file_size > file_size - class TestSeriesIO(TestData): diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 0b329f64dafa3..28ddf9e0204c3 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- import pytest +import os import collections from functools import partial import numpy as np -from pandas import Series, Timestamp +import pandas as pd +from pandas import Series, DataFrame, Timestamp from pandas.compat import range, lmap import pandas.core.common as com from pandas.core import ops @@ -222,3 +224,22 @@ def test_standardize_mapping(): dd = collections.defaultdict(list) assert isinstance(com.standardize_mapping(dd), partial) + + +@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) +def test_compression_size(method, compression): + + df = pd.concat(100 * [DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z'])], + ignore_index=True) + s = df.iloc[:, 0] + + with tm.ensure_clean() as filename: + for obj in [df, s]: + getattr(obj, method)(filename, compression=compression) + file_size = os.path.getsize(filename) + getattr(obj, method)(filename, compression=None) + uncompressed_file_size = os.path.getsize(filename) + if compression: + assert uncompressed_file_size > file_size From fa6c433597d18c64de8d8f740c7ad86e709f590a Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 27 May 2018 20:35:14 +0100 Subject: [PATCH 10/12] parameterize objects --- pandas/tests/test_common.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 28ddf9e0204c3..b101b43b79098 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -226,20 +226,20 @@ def test_standardize_mapping(): assert isinstance(com.standardize_mapping(dd), partial) +@pytest.mark.parametrize('frame', [ + pd.concat(100 * [DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z'])], ignore_index=True), + pd.concat(100 * [Series([0.123456, 0.234567, 0.567567], name='X')], + ignore_index=True)]) @pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) -def test_compression_size(method, compression): - - df = pd.concat(100 * [DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z'])], - ignore_index=True) - s = df.iloc[:, 0] +def test_compression_size(frame, method, compression): + if not compression: + pytest.skip("only test compression case.") with tm.ensure_clean() as filename: - for obj in [df, s]: - getattr(obj, method)(filename, compression=compression) - file_size = os.path.getsize(filename) - getattr(obj, method)(filename, compression=None) - uncompressed_file_size = os.path.getsize(filename) - if compression: - assert uncompressed_file_size > file_size + getattr(frame, method)(filename, compression=compression) + compressed = os.path.getsize(filename) + getattr(frame, method)(filename, compression=None) + uncompressed = os.path.getsize(filename) + assert uncompressed > compressed From adb6fd665addbde004a483c99128aa42ad9c23a8 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 27 May 2018 20:58:47 +0100 Subject: [PATCH 11/12] simplify construction --- pandas/tests/test_common.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index b101b43b79098..bb7ee1b911fee 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -7,7 +7,6 @@ import numpy as np -import pandas as pd from pandas import Series, DataFrame, Timestamp from pandas.compat import range, lmap import pandas.core.common as com @@ -226,20 +225,19 @@ def test_standardize_mapping(): assert isinstance(com.standardize_mapping(dd), partial) -@pytest.mark.parametrize('frame', [ - pd.concat(100 * [DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z'])], ignore_index=True), - pd.concat(100 * [Series([0.123456, 0.234567, 0.567567], name='X')], - ignore_index=True)]) +@pytest.mark.parametrize('obj', [ + DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']), + Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) @pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) -def test_compression_size(frame, method, compression): +def test_compression_size(obj, method, compression): if not compression: pytest.skip("only test compression case.") with tm.ensure_clean() as filename: - getattr(frame, method)(filename, compression=compression) + getattr(obj, method)(filename, compression=compression) compressed = os.path.getsize(filename) - getattr(frame, method)(filename, compression=None) + getattr(obj, method)(filename, compression=None) uncompressed = os.path.getsize(filename) assert uncompressed > compressed From 974b063568b8cdebd20290dfb5ddbc0348ac3e68 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 29 May 2018 06:40:09 -0400 Subject: [PATCH 12/12] update whatsnew --- doc/source/whatsnew/v0.23.1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index aa94497feb692..c02d988a7bc63 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -83,7 +83,7 @@ Indexing I/O ^^^ -- Bug in :class:`pandas.io.common.BytesZipFile` where zip compression produces uncompressed zip archive (:issue:`17778`) +- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) -