From 8e419d3ef5a29e44309f8297038587cf66c04824 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan Date: Sat, 1 Dec 2018 15:27:03 +0530 Subject: [PATCH 1/5] ENH: Add columns argument to read_feather() (#24025) --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/io/feather_format.py | 13 ++++++++++--- pandas/tests/io/test_feather.py | 19 +++++++++++++++++-- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 7617ad5b428a2..3fb7b925ceb6b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -25,6 +25,7 @@ New features dataframe's indexes from the resulting Parquet file. (:issue:`20768`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) - :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`) +- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`) .. _whatsnew_0240.values_api: diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 96ebca16d1892..52eca6397b820 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -84,7 +84,7 @@ def to_feather(df, path): @deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads') -def read_feather(path, use_threads=True): +def read_feather(path, columns=None, use_threads=True): """ Load a feather-format object from the file path @@ -93,6 +93,11 @@ def read_feather(path, use_threads=True): Parameters ---------- path : string file path, or file-like object + columns : sequence, default None + Only read a specific set of columns. If not provided, all columns are + read + + ..
versionadded:: 0.24.0 nthreads : int, default 1 Number of CPU threads to use when reading to pandas.DataFrame @@ -116,6 +121,8 @@ def read_feather(path, use_threads=True): int_use_threads = int(use_threads) if int_use_threads < 1: int_use_threads = 1 - return feather.read_feather(path, nthreads=int_use_threads) + return feather.read_feather(path, columns=columns, + nthreads=int_use_threads) - return feather.read_feather(path, use_threads=bool(use_threads)) + return feather.read_feather(path, columns=columns, + use_threads=bool(use_threads)) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 16b59526c8233..3b3a4777a4271 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -26,13 +26,16 @@ def check_error_on_write(self, df, exc): with ensure_clean() as path: to_feather(df, path) - def check_round_trip(self, df, **kwargs): + def check_round_trip(self, df, expected=None, **kwargs): + + if expected is None: + expected = df with ensure_clean() as path: to_feather(df, path) result = read_feather(path, **kwargs) - assert_frame_equal(result, df) + assert_frame_equal(result, expected) def test_error(self): @@ -74,6 +77,18 @@ def test_stringify_columns(self): df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy() self.check_error_on_write(df, ValueError) + def test_read_columns(self): + + df = pd.DataFrame({'col1': list('abc'), + 'col2': list(range(1, 4)), + 'col3': list('xyz'), + 'col4': list(range(4, 7))}) + self.check_round_trip(df, columns=None) + self.check_round_trip(df, columns=df.columns) + random_cols = np.random.choice(df.columns, 2) + self.check_round_trip(df, expected=df[random_cols], + columns=random_cols) + def test_unsupported_other(self): # period From 12a42ea9f845b8b3296071c5f6f9bceb231e3300 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan Date: Sat, 1 Dec 2018 23:11:23 +0530 Subject: [PATCH 2/5] Fix test case --- pandas/tests/io/test_feather.py | 6 +++--- 1 file changed, 3 insertions(+), 3
deletions(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 3b3a4777a4271..cc42a1a5a83db 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -85,9 +85,9 @@ def test_read_columns(self): 'col4': list(range(4, 7))}) self.check_round_trip(df, columns=None) self.check_round_trip(df, columns=df.columns) - random_cols = np.random.choice(df.columns, 2) - self.check_round_trip(df, expected=df[random_cols], - columns=random_cols) + cols = ['col3', 'col1'] + self.check_round_trip(df, expected=df[cols], + columns=cols) def test_unsupported_other(self): From 3f5382a8109557fab50e536a8c038458da93dd64 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan Date: Sun, 2 Dec 2018 19:33:28 +0530 Subject: [PATCH 3/5] Add Github issue number --- pandas/tests/io/test_feather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index cc42a1a5a83db..d328337cb614e 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -78,7 +78,7 @@ def test_stringify_columns(self): self.check_error_on_write(df, ValueError) def test_read_columns(self): - + # GH 24025 df = pd.DataFrame({'col1': list('abc'), 'col2': list(range(1, 4)), 'col3': list('xyz'), From 99d4aee87aeab5cea7c4d4cc31864b555c35ae50 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan Date: Mon, 3 Dec 2018 00:54:55 +0530 Subject: [PATCH 4/5] Parameterize test case and shorten doc string --- pandas/io/feather_format.py | 3 +-- pandas/tests/io/test_feather.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 52eca6397b820..5c8ab37c7c917 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -94,8 +94,7 @@ def read_feather(path, columns=None, use_threads=True): ---------- path : string file path, or file-like object columns : sequence, default
None - Only read a specific set of columns. If not provided, all columns are - read + If not provided, all columns are read .. versionadded:: 0.24.0 nthreads : int, default 1 diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index d328337cb614e..4935514d10899 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -77,17 +77,20 @@ def test_stringify_columns(self): df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy() self.check_error_on_write(df, ValueError) - def test_read_columns(self): + @pytest.mark.parametrize("columns", [ + None, + ['col1', 'col3'], + ['col1', 'col2', 'col3', 'col4'] + ]) + def test_read_columns(self, columns): # GH 24025 df = pd.DataFrame({'col1': list('abc'), 'col2': list(range(1, 4)), 'col3': list('xyz'), 'col4': list(range(4, 7))}) - self.check_round_trip(df, columns=None) - self.check_round_trip(df, columns=df.columns) - cols = ['col3', 'col1'] - self.check_round_trip(df, expected=df[cols], - columns=cols) + expected = df[columns] if columns else columns + self.check_round_trip(df, expected=expected, + columns=columns) def test_unsupported_other(self): From 8a997fc408fed2633f2fc1f5e1e9639de611e323 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan Date: Mon, 3 Dec 2018 23:58:04 +0530 Subject: [PATCH 5/5] Remove unnecessary test cases --- pandas/tests/io/test_feather.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 4935514d10899..19ecb378b6378 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -77,19 +77,14 @@ def test_stringify_columns(self): df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy() self.check_error_on_write(df, ValueError) - @pytest.mark.parametrize("columns", [ - None, - ['col1', 'col3'], - ['col1', 'col2', 'col3', 'col4'] - ]) - def test_read_columns(self, columns): + def test_read_columns(self): # GH 24025 df =
pd.DataFrame({'col1': list('abc'), 'col2': list(range(1, 4)), 'col3': list('xyz'), 'col4': list(range(4, 7))}) - expected = df[columns] if columns else columns - self.check_round_trip(df, expected=expected, + columns = ['col1', 'col3'] + self.check_round_trip(df, expected=df[columns], columns=columns) def test_unsupported_other(self):