pandas-dev · lithomas1 · Feb 9, 2020 · Feb 9, 2020 · Feb 9, 2020 · Feb 9, 2020
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
@@ -1,4 +1,4 @@
-from io import StringIO
+from io import BytesIO, StringIO
 import random
 import string
 
@@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
 class ReadCSVSkipRows(BaseIO):
 
     fname = "__test__.csv"
-    params = [None, 10000]
-    param_names = ["skiprows"]
+    params = ([None, 10000], ["c", "pyarrow"])
+    param_names = ["skiprows", "engine"]
 
-    def setup(self, skiprows):
+    def setup(self, skiprows, engine):
         N = 20000
         index = tm.makeStringIndex(N)
         df = DataFrame(
@@ -164,8 +164,8 @@ def setup(self, skiprows):
         )
         df.to_csv(self.fname)
 
-    def time_skipprows(self, skiprows):
-        read_csv(self.fname, skiprows=skiprows)
+    def time_skipprows(self, skiprows, engine):
+        read_csv(self.fname, skiprows=skiprows, engine=engine)
 
 
 class ReadUint64Integers(StringIORewind):
@@ -254,9 +254,33 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
             names=list("abc"),
         )
 
+    def time_read_csv_arrow(self, sep, decimal, float_precision):
+        read_csv(
+            self.data(self.StringIO_input),
+            sep=sep,
+            header=None,
+            names=list("abc"),
+        )
 
-class ReadCSVCategorical(BaseIO):
 
+class ReadCSVEngine(StringIORewind):
+    params = ["c", "python", "pyarrow"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
+        data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000)
+        self.StringIO_input = StringIO("\n".join(data))
+        # simulate reading from file
+        self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))
+
+    def time_read_stringcsv(self, engine):
+        read_csv(self.data(self.StringIO_input), engine=engine)
+
+    def time_read_bytescsv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
+
+class ReadCSVCategorical(BaseIO):
     fname = "__test__.csv"
 
     def setup(self):
@@ -273,7 +297,10 @@ def time_convert_direct(self):
 
 
 class ReadCSVParseDates(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
         data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
                   {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
                   {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
@@ -284,18 +311,20 @@ def setup(self):
         data = data.format(*two_cols)
         self.StringIO_input = StringIO(data)
 
-    def time_multiple_date(self):
+    def time_multiple_date(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=list(string.digits[:9]),
             parse_dates=[[1, 2], [1, 3]],
         )
 
-    def time_baseline(self):
+    def time_baseline(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             parse_dates=[1],
@@ -304,17 +333,18 @@ def time_baseline(self):
 
 
 class ReadCSVCachedParseDates(StringIORewind):
-    params = ([True, False],)
-    param_names = ["do_cache"]
+    params = ([True, False], ["c", "pyarrow", "python"])
+    param_names = ["do_cache", "engine"]
 
-    def setup(self, do_cache):
+    def setup(self, do_cache, engine):
         data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
         self.StringIO_input = StringIO(data)
 
-    def time_read_csv_cached(self, do_cache):
+    def time_read_csv_cached(self, do_cache, engine):
         try:
             read_csv(
                 self.data(self.StringIO_input),
+                engine=engine,
                 header=None,
                 parse_dates=[0],
                 cache_dates=do_cache,
@@ -344,22 +374,23 @@ def mem_parser_chunks(self):
 
 
 class ReadCSVParseSpecialDate(StringIORewind):
-    params = (["mY", "mdY", "hm"],)
-    param_names = ["value"]
+    params = (["mY", "mdY", "hm"], ["c", "pyarrow", "python"])
+    param_names = ["value", "engine"]
     objects = {
         "mY": "01-2019\n10-2019\n02/2000\n",
         "mdY": "12/02/2010\n",
         "hm": "21:34\n",
     }
 
-    def setup(self, value):
+    def setup(self, value, engine):
         count_elem = 10000
         data = self.objects[value] * count_elem
         self.StringIO_input = StringIO(data)
 
-    def time_read_special_date(self, value):
+    def time_read_special_date(self, value, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=["Date"],

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -158,9 +158,11 @@ dtype : Type name or dict of column -> type, default ``None``
   (unsupported with ``engine='python'``). Use ``str`` or ``object`` together
   with suitable ``na_values`` settings to preserve and
   not interpret dtype.
-engine : {``'c'``, ``'python'``}
-  Parser engine to use. The C engine is faster while the Python engine is
-  currently more feature-complete.
+engine : {``'c'``, ``'pyarrow'``, ``'python'``}
+  Parser engine to use. In terms of performance, the pyarrow engine,
+  which requires ``pyarrow`` >= 0.15.0, is faster than the C engine, which
+  is faster than the python engine. However, the pyarrow and C engines
+  are currently less feature complete than their Python counterpart.
 converters : dict, default ``None``
   Dict of functions for converting values in certain columns. Keys can either be
   integers or column labels.
@@ -1600,11 +1602,18 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object:
 Specifying the parser engine
 ''''''''''''''''''''''''''''
 
-Under the hood pandas uses a fast and efficient parser implemented in C as well
-as a Python implementation which is currently more feature-complete. Where
-possible pandas uses the C parser (specified as ``engine='c'``), but may fall
-back to Python if C-unsupported options are specified. Currently, C-unsupported
-options include:
+Currently, pandas supports using three engines, the C engine, the python engine,
+and an optional pyarrow engine(requires ``pyarrow`` >= 0.15). In terms of performance
+the pyarrow engine is fastest, followed by the C and Python engines. However,
+the pyarrow engine is much less robust than the C engine, which in turn lacks a
+couple of features present in the Python parser.
+
+Where possible pandas uses the C parser (specified as ``engine='c'``), but may fall
+back to Python if C-unsupported options are specified. If pyarrow unsupported options are
+specified while using ``engine='pyarrow'``, the parser will error out
+(a full list of unsupported options is available at ``pandas.io.parsers._pyarrow_unsupported``).
+
+Currently, C-unsupported options include:
 
 * ``sep`` other than a single character (e.g. regex separators)
 * ``skipfooter``

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -252,6 +252,7 @@ If needed you can adjust the bins with the argument ``offset`` (a :class:`Timede
 
 For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`.
 
+
 fsspec now used for filesystem handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -204,6 +204,12 @@ example where the index name is preserved:
 The same is true for :class:`MultiIndex`, but the logic is applied separately on a
 level-by-level basis.
 
+read_csv() now accepts pyarrow as an engine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`pandas.read_csv` now accepts engine="pyarrow" as an argument, allowing for faster csv parsing on multicore machines
+with pyarrow>=0.15 installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
+
 .. _whatsnew_120.enhancements.other:
 
 Other enhancements

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -1,6 +1,8 @@
 import distutils.version
 import importlib
+import sys
 import types
+from typing import Optional
 import warnings
 
 # Update install.rst when updating versions!
@@ -43,6 +45,7 @@
     "pandas_gbq": "pandas-gbq",
     "sqlalchemy": "SQLAlchemy",
     "jinja2": "Jinja2",
+    "pyarrow.csv": "pyarrow",
 }
 
 
@@ -58,7 +61,11 @@ def _get_version(module: types.ModuleType) -> str:
 
 
 def import_optional_dependency(
-    name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise"
+    name: str,
+    extra: str = "",
+    raise_on_missing: bool = True,
+    on_version: str = "raise",
+    min_version: Optional[str] = None,
 ):
     """
     Import an optional dependency.
@@ -70,8 +77,7 @@ def import_optional_dependency(
     Parameters
     ----------
     name : str
-        The module name. This should be top-level only, so that the
-        version may be checked.
+        The module name.
     extra : str
         Additional text to include in the ImportError message.
     raise_on_missing : bool, default True
@@ -85,6 +91,8 @@ def import_optional_dependency(
         * ignore: Return the module, even if the version is too old.
           It's expected that users validate the version locally when
           using ``on_version="ignore"`` (see. ``io/html.py``)
+    min_version: Optional[str]
+        Specify the minimum version
 
     Returns
     -------
@@ -109,10 +117,16 @@ def import_optional_dependency(
             raise ImportError(msg) from None
         else:
             return None
-
-    minimum_version = VERSIONS.get(name)
+    # Handle submodules: if we have submodule, grab parent module from sys.modules
+    parent = name.split(".")[0]
+    if parent != name:
+        install_name = parent
+        module_to_get = sys.modules[install_name]
+    else:
+        module_to_get = module
+    minimum_version = min_version if min_version is not None else VERSIONS.get(name)
     if minimum_version:
-        version = _get_version(module)
+        version = _get_version(module_to_get)
         if distutils.version.LooseVersion(version) < minimum_version:
             assert on_version in {"warn", "raise", "ignore"}
             msg = (
Original file line number	Diff line number	Diff line change
Expand Up		@@ -252,6 +252,7 @@ If needed you can adjust the bins with the argument ``offset`` (a :class:`Timede

		For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`.


		fsspec now used for filesystem handling
		^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down