ENH: add arrow engine to read_csv #31817
@@ -20,6 +20,7 @@
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas._libs.tslibs import parsing
 from pandas._typing import FilePathOrBuffer
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
     AbstractMethodError,
     EmptyDataError,
@@ -165,9 +166,10 @@
     to preserve and not interpret dtype.
     If converters are specified, they will be applied INSTEAD
     of dtype conversion.
-engine : {{'c', 'python'}}, optional
-    Parser engine to use. The C engine is faster while the python engine is
-    currently more feature-complete.
+engine : {{'c', 'python', 'arrow'}}, optional
+    Parser engine to use. The C and arrow engines are faster, while the python engine is
+    currently more feature-complete. The arrow engine requires ``pyarrow``
+    as a dependency however.
 converters : dict, optional
     Dict of functions for converting values in certain columns. Keys can either
     be integers or column labels.
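For context, a minimal usage sketch of the option documented in this hunk. This assumes a development build of pandas that includes this PR (the arrow engine is not available in released pandas at the time of this change) plus a working ``pyarrow`` installation:

```python
from io import StringIO

import pandas as pd

data = StringIO("a,b,c\n1,2,3\n4,5,6\n")

# With this PR applied, engine="arrow" routes parsing through pyarrow.csv
# instead of the C or python parsers.
df = pd.read_csv(data, engine="arrow")
print(df)
```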
@@ -506,7 +508,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
     "skip_blank_lines": True,
 }

-
 _c_parser_defaults = {
     "delim_whitespace": False,
     "na_filter": True,
@@ -520,6 +521,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
 _fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

 _c_unsupported = {"skipfooter"}
+_arrow_unsupported = {"skipfooter", "low_memory", "float_precision", "chunksize"}
 _python_unsupported = {"low_memory", "float_precision"}

 _deprecated_defaults: Dict[str, Any] = {}
@@ -705,7 +707,6 @@ def read_fwf(
     infer_nrows=100,
     **kwds,
 ):
-
     r"""
     Read a table of fixed-width formatted lines into DataFrame.

@@ -944,17 +945,22 @@ def _clean_options(self, options, engine):
         sep = options["delimiter"]
         delim_whitespace = options["delim_whitespace"]

-        # C engine not supported yet
-        if engine == "c":
+        # arrow engine not supported yet
+        if engine == "arrow":
+            if options["chunksize"] is not None:
+                fallback_reason = f"the arrow engine does not support chunksize"
+                engine = "python"
+        # C and arrow engine not supported yet
+        if engine == "c" or engine == "arrow":
             if options["skipfooter"] > 0:
-                fallback_reason = "the 'c' engine does not support skipfooter"
+                fallback_reason = f"the {engine} engine does not support skipfooter"
                 engine = "python"

         encoding = sys.getfilesystemencoding() or "utf-8"
         if sep is None and not delim_whitespace:
-            if engine == "c":
+            if engine == "c" or engine == "arrow":
                 fallback_reason = (
-                    "the 'c' engine does not support "
+                    f"the {engine} engine does not support "
                     "sep=None with delim_whitespace=False"
                 )
                 engine = "python"
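A short illustration of the fallback logic above. Because "arrow" can only be reached by passing ``engine="arrow"`` explicitly, the recorded ``fallback_reason`` surfaces as an error in the surrounding ``_clean_options`` code (not shown in this hunk) rather than as a silent engine switch; this assumes a build with the PR applied:

```python
from io import StringIO

import pandas as pd

data = StringIO("a,b,c\n1,2,3\n4,5,6\ntrailer line\n")

# skipfooter is one of the options the arrow engine cannot honour.
try:
    pd.read_csv(data, engine="arrow", skipfooter=1)
except ValueError as err:
    print(err)  # e.g. "the arrow engine does not support skipfooter"
```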
@@ -1081,14 +1087,20 @@ def _clean_options(self, options, engine):
         na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

         # handle skiprows; this is internally handled by the
-        # c-engine, so only need for python parsers
+        # c-engine, so only need for python parser
         if engine != "c":
-            if is_integer(skiprows):
-                skiprows = list(range(skiprows))
-            if skiprows is None:
-                skiprows = set()
-            elif not callable(skiprows):
-                skiprows = set(skiprows)
+            if engine == "arrow":
+                if not is_integer(skiprows) and skiprows is not None:
+                    raise ValueError(
+                        "skiprows argument must be an integer when using engine='arrow'"
+                    )
+            else:
+                if is_integer(skiprows):
+                    skiprows = list(range(skiprows))
+                if skiprows is None:
+                    skiprows = set()
+                elif not callable(skiprows):
+                    skiprows = set(skiprows)

         # put stuff back
         result["names"] = names
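The added branch restricts ``skiprows`` to a plain integer when the arrow engine is selected, since pyarrow only exposes a "skip the first N rows" option. A quick illustration, again assuming a build with this PR applied:

```python
from io import StringIO

import pandas as pd

csv = "junk line\na,b\n1,2\n3,4\n"

# An integer skiprows is passed through to pyarrow's skip_rows.
df = pd.read_csv(StringIO(csv), engine="arrow", skiprows=1)

# A list of row numbers works for the c/python engines but is rejected here.
try:
    pd.read_csv(StringIO(csv), engine="arrow", skiprows=[0, 2])
except ValueError as err:
    print(err)  # "skiprows argument must be an integer when using engine='arrow'"
```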
@@ -1109,6 +1121,8 @@ def __next__(self):
     def _make_engine(self, engine="c"):
         if engine == "c":
             self._engine = CParserWrapper(self.f, **self.options)
+        elif engine == "arrow":
+            self._engine = ArrowParserWrapper(self.f, **self.options)
         else:
             if engine == "python":
                 klass = PythonParser
@@ -1125,29 +1139,32 @@ def _failover_to_python(self):
         raise AbstractMethodError(self)

     def read(self, nrows=None):
-        nrows = _validate_integer("nrows", nrows)
-        ret = self._engine.read(nrows)
+        if isinstance(self._engine, ArrowParserWrapper):
+            return self._engine.read(nrows)
+        else:
+            nrows = _validate_integer("nrows", nrows)
+            ret = self._engine.read(nrows)

-        # May alter columns / col_dict
-        index, columns, col_dict = self._create_index(ret)
+            # May alter columns / col_dict
+            index, columns, col_dict = self._create_index(ret)

-        if index is None:
-            if col_dict:
-                # Any column is actually fine:
-                new_rows = len(next(iter(col_dict.values())))
-                index = RangeIndex(self._currow, self._currow + new_rows)
-            else:
-                new_rows = 0
-        else:
-            new_rows = len(index)
+            if index is None:
+                if col_dict:
+                    # Any column is actually fine:
+                    new_rows = len(next(iter(col_dict.values())))
+                    index = RangeIndex(self._currow, self._currow + new_rows)
+                else:
+                    new_rows = 0
+            else:
+                new_rows = len(index)

-        df = DataFrame(col_dict, columns=columns, index=index)
+            df = DataFrame(col_dict, columns=columns, index=index)

-        self._currow += new_rows
+            self._currow += new_rows

-        if self.squeeze and len(df.columns) == 1:
-            return df[df.columns[0]].copy()
-        return df
+            if self.squeeze and len(df.columns) == 1:
+                return df[df.columns[0]].copy()
+            return df

     def _create_index(self, ret):
         index, columns, col_dict = ret

Review thread on this change (resolved): a reviewer suggested the nrows validation could still be done at the top of the function. Reply: "Yeah, I guess I could do that, but I think my way is cleaner, since all the pyarrow code would be in the if, and the other parser code would be in the else."
@@ -2139,6 +2156,53 @@ def _maybe_parse_dates(self, values, index, try_parse_dates=True):
         return values


+class ArrowParserWrapper(ParserBase):
+    """
+
+    """
+
+    def __init__(self, src, **kwds):
+        self.kwds = kwds
+        self.src = src
+        kwds = kwds.copy()
+
+        ParserBase.__init__(self, kwds)
+
+        # #2442
+        kwds["allow_leading_cols"] = self.index_col is not False
+
+        # GH20529, validate usecol arg before TextReader
+        self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
+
+    def read(self, nrows=None):
+        pyarrow = import_optional_dependency(
+            "pyarrow.csv", extra="pyarrow is required to use arrow engine"
+        )
+        nrows = _validate_integer("nrows", nrows)
+        table = pyarrow.read_csv(
+            self.src,
+            read_options=pyarrow.ReadOptions(
+                skip_rows=self.kwds.get("skiprows"), column_names=self.names
+            ),
+            parse_options=pyarrow.ParseOptions(
+                delimiter=self.kwds.get("delimiter"),
+                quote_char=self.kwds.get("quotechar"),
+            ),
+            convert_options=pyarrow.ConvertOptions(
+                include_columns=self.usecols, column_types=self.kwds.get("dtype")
+            ),
+        )
+        if nrows:
+            table = table[:nrows]
+        table_width = len(table.column_names)
+        if self.names is None:
+            if self.prefix:
+                self.names = [f"{self.prefix}{i}" for i in range(table_width)]
+        if self.names:
+            table = table.rename_columns(self.names)
+        return table.to_pandas()
+
+
 def TextParser(*args, **kwds):
     """
     Converts lists of lists/tuples into DataFrames with proper type inference

Review threads on the new class (from the inline discussion): "you will need to refactor this as the current code is very different from this. Also I really don't like doing all of this validation in a single function." Reply: "Can you clarify a bit more what you mean? Or point to recent changes related to this? For example also on master, the C parser is using a very similar mechanism with the CParserWrapper class." On the pyarrow.read_csv call: "pls add line breaks between section and comments."
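For readers unfamiliar with ``pyarrow.csv``, the wrapper above is essentially a thin translation layer from ``read_csv`` keywords to pyarrow option objects (``skiprows`` to ``ReadOptions.skip_rows``, ``delimiter``/``quotechar`` to ``ParseOptions``, ``usecols``/``dtype`` to ``ConvertOptions``). A standalone sketch of that mapping using pyarrow directly, outside pandas, with illustrative data and option values:

```python
import io

from pyarrow import csv as pa_csv

data = io.BytesIO(b"a,b,c\n1,2,3\n4,5,6\n")

table = pa_csv.read_csv(
    data,
    read_options=pa_csv.ReadOptions(skip_rows=0),
    parse_options=pa_csv.ParseOptions(delimiter=",", quote_char='"'),
    convert_options=pa_csv.ConvertOptions(include_columns=["a", "b"]),
)

# pyarrow reads the whole input eagerly; the wrapper applies nrows afterwards
# by slicing the Table (table[:nrows]), then converts to a DataFrame.
df = table.slice(0, 10).to_pandas()
print(df)
```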
@@ -3340,7 +3404,6 @@ def _try_convert_dates(parser, colspec, data_dict, columns):


 def _clean_na_values(na_values, keep_default_na=True):
-
     if na_values is None:
         if keep_default_na:
             na_values = STR_NA_VALUES