From 8c247c2bb5544ab53bc884391eaff7b2b1b20520 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Tue, 7 Nov 2017 21:00:47 +0100
Subject: [PATCH 01/13] implement to read only columns from parquet file

---
 pandas/io/parquet.py            | 14 ++++++++------
 pandas/tests/io/test_parquet.py | 11 +++++++++++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 4b507b7f5df6f..caeebad3def7a 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -76,9 +76,9 @@ def write(self, df, path, compression='snappy',
                 table, path, compression=compression,
                 coerce_timestamps=coerce_timestamps, **kwargs)
 
-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path).to_pandas()
+        return self.api.parquet.read_table(path, columns).to_pandas()
 
 
 class FastParquetImpl(object):
@@ -115,9 +115,9 @@ def write(self, df, path, compression='snappy', **kwargs):
             self.api.write(path, df,
                            compression=compression, **kwargs)
 
-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas()
+        return self.api.ParquetFile(path).to_pandas(columns)
 
 
 def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
@@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     return impl.write(df, path, compression=compression)
 
 
-def read_parquet(path, engine='auto', **kwargs):
+def read_parquet(path, engine='auto', columns=None,  **kwargs):
     """
     Load a parquet object from the file path, returning a DataFrame.
 
@@ -188,6 +188,8 @@ def read_parquet(path, engine='auto', **kwargs):
     ----------
     path : string
         File path
+    columns: list
+        If not None, only these columns will be read from the file.
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet reader library to use. If 'auto', then the option
         'io.parquet.engine' is used. If 'auto', then the first
@@ -201,4 +203,4 @@ def read_parquet(path, engine='auto', **kwargs):
     """
 
     impl = get_engine(engine)
-    return impl.read(path)
+    return impl.read(path, columns)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index ecd4e8f719014..27511379d9046 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -282,6 +282,17 @@ def test_compression(self, engine, compression):
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, compression=compression)
 
+    def test_read_columns(self, engine, fp):
+        df = pd.DataFrame({'string': list('abc'),
+                           'int': list(range(1, 4))})
+
+        with tm.ensure_clean() as path:
+            df.to_parquet(path, engine, compression=None)
+            result = read_parquet(path, engine, columns=["string"])
+
+            expected = pd.DataFrame({'string': list('abc')})
+            tm.assert_frame_equal(result, expected)
+
 
 class TestParquetPyArrow(Base):
 

From d00d2222250f18cfe86441c06dac0a8f90d2d237 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Tue, 7 Nov 2017 21:03:03 +0100
Subject: [PATCH 02/13] fix flake8

---
 pandas/io/parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index caeebad3def7a..df784e9aa4fe3 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     return impl.write(df, path, compression=compression)
 
 
-def read_parquet(path, engine='auto', columns=None,  **kwargs):
+def read_parquet(path, engine='auto', columns=None, **kwargs):
     """
     Load a parquet object from the file path, returning a DataFrame.
 

From c1449f5c768c464826a5c946e5d3d84857c09b81 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Tue, 7 Nov 2017 21:21:44 +0100
Subject: [PATCH 03/13] reference issue in tests, clarify default in docstring

---
 pandas/io/parquet.py            | 2 +-
 pandas/tests/io/test_parquet.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index df784e9aa4fe3..a420f4b36beb1 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -188,7 +188,7 @@ def read_parquet(path, engine='auto', columns=None, **kwargs):
     ----------
     path : string
         File path
-    columns: list
+    columns: list, default=None
         If not None, only these columns will be read from the file.
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet reader library to use. If 'auto', then the option
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 27511379d9046..fbd1c5ed9d51a 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -283,6 +283,7 @@ def test_compression(self, engine, compression):
         self.check_round_trip(df, engine, compression=compression)
 
     def test_read_columns(self, engine, fp):
+        #GH18154
         df = pd.DataFrame({'string': list('abc'),
                            'int': list(range(1, 4))})
 

From 22663e8c2de92625b1c2c2f00fed5143ec3596c4 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Tue, 7 Nov 2017 21:23:16 +0100
Subject: [PATCH 04/13] fix pep8

---
 pandas/tests/io/test_parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index fbd1c5ed9d51a..7868979252970 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -283,7 +283,7 @@ def test_compression(self, engine, compression):
         self.check_round_trip(df, engine, compression=compression)
 
     def test_read_columns(self, engine, fp):
-        #GH18154
+        # GH18154
         df = pd.DataFrame({'string': list('abc'),
                            'int': list(range(1, 4))})
 

From f31e6a288b821737bdfb58d7c4aa62a7947341a2 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Tue, 7 Nov 2017 21:27:11 +0100
Subject: [PATCH 05/13] add whatsnew entry

---
 doc/source/whatsnew/v0.22.0.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 943b6bb84fb47..a111e49f7d6d7 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -109,7 +109,7 @@ I/O
 ^^^
 
 - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`)
--
+- :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`)
 -
 
 Plotting

From f91f5f8250bba2bdec97a38cde27cd742a0ffacd Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Tue, 7 Nov 2017 22:08:19 +0100
Subject: [PATCH 06/13] add feature to version v0.21.1

---
 doc/source/whatsnew/v0.21.1.txt | 1 +
 doc/source/whatsnew/v0.22.0.txt | 2 +-
 pandas/io/parquet.py            | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 6044f25ca5147..be40894aea656 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -81,6 +81,7 @@ I/O
 - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
 - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
 - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
+- :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`)
 
 Plotting
 ^^^^^^^^
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index a111e49f7d6d7..943b6bb84fb47 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -109,7 +109,7 @@ I/O
 ^^^
 
 - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`)
-- :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`)
+-
 -
 
 Plotting
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index a420f4b36beb1..0ef6aca3668bf 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -190,6 +190,7 @@ def read_parquet(path, engine='auto', columns=None, **kwargs):
         File path
     columns: list, default=None
         If not None, only these columns will be read from the file.
+        .. versionadded 0.21.1
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet reader library to use. If 'auto', then the option
         'io.parquet.engine' is used. If 'auto', then the first

From 21c5f5ea3528e451ec2653e19de094d25cdc0405 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Tue, 7 Nov 2017 22:16:48 +0100
Subject: [PATCH 07/13] add documentation how to read columns from parquet file

---
 doc/source/io.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 8656e617b8173..5d6b00a4db72e 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4538,6 +4538,16 @@ Read from a parquet file.
 
    result.dtypes
 
+Read only certain columns of a parquet file. 
+
+.. ipython:: python
+
+   result = pd.read_parquet('example_pa.parquet', engine='pyarrow', columns=['a', 'b'])
+   result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b'])
+
+   result.dtypes
+
+
 .. ipython:: python
    :suppress:
 

From ef30f39f58fa77e8aa5c7592899627cfc0b04b88 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Wed, 8 Nov 2017 08:49:28 +0100
Subject: [PATCH 08/13] use keyword argument to pass columns

---
 pandas/io/parquet.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 0ef6aca3668bf..c54167fe52fe3 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -78,7 +78,7 @@ def write(self, df, path, compression='snappy',
 
     def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path, columns).to_pandas()
+        return self.api.parquet.read_table(path, columns=columns).to_pandas()
 
 
 class FastParquetImpl(object):
@@ -117,7 +117,7 @@ def write(self, df, path, compression='snappy', **kwargs):
 
     def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas(columns)
+        return self.api.ParquetFile(path).to_pandas(columns=columns)
 
 
 def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
@@ -204,4 +204,4 @@ def read_parquet(path, engine='auto', columns=None, **kwargs):
     """
 
     impl = get_engine(engine)
-    return impl.read(path, columns)
+    return impl.read(path, columns=columns)

From 54fc1c9b5110bdefb6c545107205cc8d4a08b652 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Wed, 8 Nov 2017 08:52:27 +0100
Subject: [PATCH 09/13] use check_round_tip in tests

---
 pandas/tests/io/test_parquet.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 7868979252970..9bfc8214e21d2 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -192,7 +192,7 @@ def check_round_trip(self, df, engine, expected=None, **kwargs):
 
         with tm.ensure_clean() as path:
             df.to_parquet(path, engine, **kwargs)
-            result = read_parquet(path, engine)
+            result = read_parquet(path, engine, **kwargs)
 
             if expected is None:
                 expected = df
@@ -200,7 +200,7 @@ def check_round_trip(self, df, engine, expected=None, **kwargs):
 
             # repeat
             to_parquet(df, path, engine, **kwargs)
-            result = pd.read_parquet(path, engine)
+            result = pd.read_parquet(path, engine, **kwargs)
 
             if expected is None:
                 expected = df
@@ -282,17 +282,14 @@ def test_compression(self, engine, compression):
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, compression=compression)
 
-    def test_read_columns(self, engine, fp):
+    def test_read_columns(self, engine):
         # GH18154
         df = pd.DataFrame({'string': list('abc'),
                            'int': list(range(1, 4))})
 
-        with tm.ensure_clean() as path:
-            df.to_parquet(path, engine, compression=None)
-            result = read_parquet(path, engine, columns=["string"])
-
-            expected = pd.DataFrame({'string': list('abc')})
-            tm.assert_frame_equal(result, expected)
+        expected = pd.DataFrame({'string': list('abc')})
+        self.check_round_trip(df, engine, expected=expected, compression=None, columns=["string"])
+        
 
 
 class TestParquetPyArrow(Base):

From e5336b65bc437fcd3c18c9da2687578db35a476a Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Wed, 8 Nov 2017 08:53:33 +0100
Subject: [PATCH 10/13] pep8

---
 pandas/tests/io/test_parquet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 9bfc8214e21d2..4e9f4699e14f4 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -288,8 +288,8 @@ def test_read_columns(self, engine):
                            'int': list(range(1, 4))})
 
         expected = pd.DataFrame({'string': list('abc')})
-        self.check_round_trip(df, engine, expected=expected, compression=None, columns=["string"])
-        
+        self.check_round_trip(
+            df, engine, expected=expected, compression=None, columns=["string"])
 
 
 class TestParquetPyArrow(Base):

From d6baa9d916e03bbb1f15b6269b1220b2be10801c Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Wed, 8 Nov 2017 08:54:56 +0100
Subject: [PATCH 11/13] add newline before versionadded

---
 pandas/io/parquet.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index c54167fe52fe3..ef95e32cc241e 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -190,6 +190,7 @@ def read_parquet(path, engine='auto', columns=None, **kwargs):
         File path
     columns: list, default=None
         If not None, only these columns will be read from the file.
+
         .. versionadded 0.21.1
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet reader library to use. If 'auto', then the option

From 7f6e7f67b83d07ec1e6e81fcd7ed75e7c36ff783 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Wed, 8 Nov 2017 12:51:30 +0100
Subject: [PATCH 12/13] fix lint problem

---
 pandas/tests/io/test_parquet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 4e9f4699e14f4..068b28258d3d8 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -288,8 +288,8 @@ def test_read_columns(self, engine):
                            'int': list(range(1, 4))})
 
         expected = pd.DataFrame({'string': list('abc')})
-        self.check_round_trip(
-            df, engine, expected=expected, compression=None, columns=["string"])
+        self.check_round_trip(df, engine,
+                expected=expected, compression=None, columns=["string"])
 
 
 class TestParquetPyArrow(Base):

From 4b22c889ca4463d7c5ebcd3acbcf3e6067d0a1c2 Mon Sep 17 00:00:00 2001
From: Peter Hoffmann <ph@peter-hoffmann.com>
Date: Wed, 8 Nov 2017 14:04:18 +0100
Subject: [PATCH 13/13] fix lint error

---
 pandas/tests/io/test_parquet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 068b28258d3d8..9a4edf38e2ef4 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -288,8 +288,8 @@ def test_read_columns(self, engine):
                            'int': list(range(1, 4))})
 
         expected = pd.DataFrame({'string': list('abc')})
-        self.check_round_trip(df, engine,
-                expected=expected, compression=None, columns=["string"])
+        self.check_round_trip(df, engine, expected=expected,
+                              compression=None, columns=["string"])
 
 
 class TestParquetPyArrow(Base):