WIP: Make Python engine support ExtensionArray (EA) types when reading CSVs

kprestel · kprestel · commit 9d5a6c782b83 · 2018-12-08T09:56:29.000-05:00
The C engine is the real WIP.
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1231,7 +1231,11 @@ cdef class TextReader:
 
             if result is not None and dtype != 'int64':
                 if is_extension_array_dtype(dtype):
-                    result = result.astype(dtype.numpy_dtype)
+                    try:
+                        result = dtype.construct_array_type()._from_sequence(
+                                result, dtype=dtype)
+                    except Exception as e:
+                        raise
                 else:
                     result = result.astype(dtype)
 
@@ -1243,7 +1247,11 @@ cdef class TextReader:
 
             if result is not None and dtype != 'float64':
                 if is_extension_array_dtype(dtype):
-                    result = result.astype(dtype.numpy_dtype)
+                    try:
+                        result = dtype.construct_array_type()._from_sequence(
+                                result)
+                    except Exception as e:
+                        raise
                 else:
                     result = result.astype(dtype)
             return result, na_count
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -123,6 +123,27 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         """
         raise AbstractMethodError(cls)
 
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        """Construct a new ExtensionArray from a sequence of strings.
+
+        Parameters
+        ----------
+        strings : Sequence
+            Each element will be a string to be converted to an instance of
+            the scalar type for this array, ``cls.dtype.type``.
+        dtype : dtype, optional
+            Construct for this particular dtype. This should be a Dtype
+            compatible with the ExtensionArray.
+        copy : boolean, default False
+            If True, copy the underlying data.
+
+        Returns
+        -------
+        ExtensionArray
+        """
+        raise AbstractMethodError(cls)
+
     @classmethod
     def _from_factorized(cls, values, original):
         """
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -154,7 +154,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
             dtype = dtype.lower()
         if not issubclass(type(dtype), _IntegerDtype):
             try:
-                dtype = _dtypes[str(np.dtype(dtype))]
+                dtype = _dtypes[str(np.dtype(dtype.name.lower()))]
             except KeyError:
                 raise ValueError("invalid dtype specified {}".format(dtype))
 
@@ -261,6 +261,10 @@ def __init__(self, values, mask, copy=False):
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         return integer_array(scalars, dtype=dtype, copy=copy)
 
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        return cls._from_sequence(list(map(int, strings)), dtype, copy)
+
     @classmethod
     def _from_factorized(cls, values, original):
         return integer_array(values, dtype=original.dtype)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -615,8 +615,22 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):
 
     # dispatch on extension dtype if needed
     if is_extension_array_dtype(dtype):
-        return dtype.construct_array_type()._from_sequence(
-            arr, dtype=dtype, copy=copy)
+        if is_object_dtype(arr):
+            try:
+                return dtype.construct_array_type()._from_sequence_of_strings(
+                    arr, dtype=dtype, copy=copy)
+            except AttributeError:
+                dtype = pandas_dtype(dtype)
+                return dtype.construct_array_type()._from_sequence_of_strings(
+                    arr, dtype=dtype, copy=copy)
+        else:
+            try:
+                return dtype.construct_array_type()._from_sequence(
+                    arr, dtype=dtype, copy=copy)
+            except AttributeError:
+                dtype = pandas_dtype(dtype)
+                return dtype.construct_array_type()._from_sequence(
+                    arr, dtype=dtype, copy=copy)
 
     if not isinstance(dtype, np.dtype):
         dtype = pandas_dtype(dtype)
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -1795,7 +1795,10 @@ def _get_dtype(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         return arr_or_dtype
     elif isinstance(arr_or_dtype, type):
-        return np.dtype(arr_or_dtype)
+        try:
+            return pandas_dtype(arr_or_dtype)
+        except TypeError:
+            return np.dtype(arr_or_dtype)
     elif isinstance(arr_or_dtype, ExtensionDtype):
         return arr_or_dtype
     elif isinstance(arr_or_dtype, DatetimeTZDtype):
@@ -1813,6 +1816,11 @@ def _get_dtype(arr_or_dtype):
             return PeriodDtype.construct_from_string(arr_or_dtype)
         elif is_interval_dtype(arr_or_dtype):
             return IntervalDtype.construct_from_string(arr_or_dtype)
+        else:
+            try:
+                return pandas_dtype(arr_or_dtype)
+            except TypeError:
+                pass
     elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex,
                                    ABCSparseArray, ABCSparseSeries)):
         return arr_or_dtype.dtype
@@ -1843,7 +1851,15 @@ def _get_dtype_type(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         return arr_or_dtype.type
     elif isinstance(arr_or_dtype, type):
-        return np.dtype(arr_or_dtype).type
+        try:
+            dtype = pandas_dtype(arr_or_dtype)
+            try:
+                return dtype.type
+            except AttributeError:
+                raise TypeError
+        except TypeError:
+            return np.dtype(arr_or_dtype).type
+
     elif isinstance(arr_or_dtype, CategoricalDtype):
         return CategoricalDtypeType
     elif isinstance(arr_or_dtype, DatetimeTZDtype):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -29,7 +29,7 @@
 from pandas.core.dtypes.common import (
     ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
     is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype,
-    is_scalar, is_string_dtype)
+    is_scalar, is_string_dtype, is_extension_array_dtype)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
 
@@ -1660,15 +1660,17 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                     try_num_bool=False)
             else:
                 # skip inference if specified dtype is object
-                try_num_bool = not (cast_type and is_string_dtype(cast_type))
+                try_num_bool = not (cast_type and (is_string_dtype(cast_type)
+                                                   or is_extension_array_dtype(cast_type)))
 
                 # general type inference and conversion
                 cvals, na_count = self._infer_types(
                     values, set(col_na_values) | col_na_fvalues,
                     try_num_bool)
 
                 # type specified in dtype param
-                if cast_type and not is_dtype_equal(cvals, cast_type):
+                if cast_type and not (is_dtype_equal(cvals, cast_type)
+                        or is_extension_array_dtype(cast_type)):
                     try:
                         if (is_bool_dtype(cast_type) and
                                 not is_categorical_dtype(cast_type)
diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py
@@ -0,0 +1,19 @@
+import pandas as pd
+from pandas.compat import StringIO
+from pandas.core.arrays.integer import Int64Dtype
+from .base import BaseExtensionTests
+
+
+class ExtensionParsingTests(BaseExtensionTests):
+    def test_EA_types(self):
+        """Smoke-test ``read_csv`` with the ``Int64``/``Int8`` dtypes."""
+        # (dtype of the written column, dtype requested from read_csv):
+        # one case passes the dtype class, the other its string alias.
+        cases = [('Int64', Int64Dtype), ('Int8', 'Int8')]
+
+        for written, requested in cases:
+            ints = pd.Series([1, 2, 3], dtype=written)
+            frame = pd.DataFrame({'Int': ints, 'A': [1, 2, 1]})
+            csv_text = frame.to_csv(index=False)
+            parsed = pd.read_csv(StringIO(csv_text), dtype={'Int': requested})
+            assert parsed is not None
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
@@ -73,6 +73,11 @@ def dtype(self):
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         return cls(scalars)
 
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        parsed = [decimal.Decimal(text) for text in strings]
+        return cls._from_sequence(parsed, dtype, copy)
+
     @classmethod
     def _from_factorized(cls, values, original):
         return cls(values)