pandas-dev · TomAugspurger · Oct 13, 2018 · Jul 12, 2018 · Jul 13, 2018 · Jul 13, 2018
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -320,6 +320,22 @@ is the case with :attr:`Period.end_time`, for example
 
    p.end_time
 
+.. _whatsnew_0240.api_breaking.sparse_values:
+
+``SparseArray`` is now an ``ExtensionArray``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This has some notable changes:
+
+- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`
+- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of ``SparseDtype``, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subdtype``.
+- ``numpy.asarray(sparse_array)`` now returns a dense array with all the values, not just the non-fill-value values (:issue:`todo`)
+- Providing a ``sparse_index`` to the ``SparseArray`` constructor no longer defaults the NA value to ``np.nan`` for all dtypes. The correct NA value for ``data.dtype`` is now used.
+- Passing ``fill_value`` to ``SparseArray.take`` no longer implies ``allow_fill=True``.
+- ``SparseArray.astype(np.dtype)`` will create a dense NumPy array. To convert to a ``SparseArray`` with a different subdtype, use ``.astype(sparse_dtype)`` or a string like ``.astype('Sparse[float32]')``.
+- Setting ``SparseArray.fill_value`` to a fill value with a different dtype is now allowed.
+- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index.
+
 .. _whatsnew_0240.api.datetimelike.normalize:
 
 Tick DateOffset Normalize Restrictions
@@ -418,7 +434,7 @@ ExtensionType Changes
 - Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
 - :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
 - :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
--
+- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric.
 
 .. _whatsnew_0240.api.incompatibilities:
 

diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
@@ -71,6 +71,10 @@ cdef class IntIndex(SparseIndex):
         output += 'Indices: %s\n' % repr(self.indices)
         return output
 
+    @property
+    def nbytes(self):
+        return self.indices.nbytes
+
     def check_integrity(self):
         """
         Checks the following:
@@ -362,6 +366,10 @@ cdef class BlockIndex(SparseIndex):
 
         return output
 
+    @property
+    def nbytes(self):
+        return self.blocs.nbytes + self.blengths.nbytes
+
     @property
     def ngaps(self):
         return self.length - self.npoints

diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py
@@ -5,4 +5,4 @@
 from pandas.core.algorithms import take  # noqa
 from pandas.core.arrays.base import (ExtensionArray,    # noqa
                                      ExtensionScalarOpsMixin)
-from pandas.core.dtypes.dtypes import ExtensionDtype  # noqa
+from pandas.core.dtypes.dtypes import registry, ExtensionDtype  # noqa
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -15,7 +15,7 @@
 from pandas import compat
 from pandas.compat import iteritems, PY36, OrderedDict
 from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass
-from pandas.core.dtypes.common import is_integer
+from pandas.core.dtypes.common import is_integer, is_bool_dtype
 from pandas.core.dtypes.inference import _iterable_not_string
 from pandas.core.dtypes.missing import isna, isnull, notnull  # noqa
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
@@ -100,7 +100,12 @@ def maybe_box_datetimelike(value):
 
 
 def is_bool_indexer(key):
-    if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)):
+    # TODO: This does not yet work for ExtensionArrays in general.
+    # SparseArray is special-cased for now; the check should *maybe*
+    # be widened to any ExtensionArray.
+    from pandas.core.sparse.api import SparseArray
+
+    if isinstance(key, (ABCSeries, np.ndarray, ABCIndex, SparseArray)):
         if key.dtype == np.object_:
             key = np.asarray(values_from_object(key))
 
@@ -110,7 +115,7 @@ def is_bool_indexer(key):
                                      'NA / NaN values')
                 return False
             return True
-        elif key.dtype == np.bool_:
+        elif is_bool_dtype(key.dtype):
             return True
     elif isinstance(key, list):
         try:

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -94,6 +94,17 @@ def is_dtype(cls, dtype):
         except TypeError:
             return False
 
+    @property
+    def _is_numeric(self):
+        """
+        Whether columns with this dtype should be considered numeric.
+
+        By default ExtensionDtypes are assumed to be non-numeric.
+        They'll be excluded from operations that exclude non-numeric
+        columns, like groupby reductions.
+        """
+        return False
+
 
 class ExtensionDtype(_DtypeOpsMixin):
     """A custom data type, to be paired with an ExtensionArray.
@@ -109,6 +120,11 @@ class ExtensionDtype(_DtypeOpsMixin):
     * name
     * construct_from_string
 
+    The following properties affect the behavior of extension arrays
+    in operations:
+
+    * _is_numeric
+
     Optionally one can override construct_array_type for construction
     with the name of this dtype via the Registry
 

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -11,6 +11,7 @@
     DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, IntervalDtype,
     IntervalDtypeType, PandasExtensionDtype, ExtensionDtype,
     _pandas_registry)
+from pandas.core.sparse.dtype import SparseDtype
 from pandas.core.dtypes.generic import (
     ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries,
     ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass,
@@ -152,8 +153,22 @@ def is_sparse(arr):
     >>> is_sparse(bsr_matrix([1, 2, 3]))
     False
     """
+    from pandas.core.sparse.array import SparseArray
+    from pandas.core.sparse.dtype import SparseDtype
+    from pandas.core.generic import ABCSeries
+    from pandas.core.internals import BlockManager, Block
 
-    return isinstance(arr, (ABCSparseArray, ABCSparseSeries))
+    if isinstance(arr, BlockManager):
+        # SparseArrays are only 1d
+        if arr.ndim == 1:
+            arr = arr.blocks[0]
+        else:
+            return False
+
+    if isinstance(arr, (ABCSeries, Block)):
+        arr = arr.values
+
+    return isinstance(arr, (SparseArray, ABCSparseSeries, SparseDtype))
 
 
 def is_scipy_sparse(arr):
@@ -1608,8 +1623,9 @@ def is_bool_dtype(arr_or_dtype):
     False
     >>> is_bool_dtype(np.array([True, False]))
     True
+    >>> is_bool_dtype(pd.SparseArray([True, False]))
+    True
     """
-
     if arr_or_dtype is None:
         return False
     try:
@@ -1626,7 +1642,8 @@ def is_bool_dtype(arr_or_dtype):
         # guess this
         return (arr_or_dtype.is_object and
                 arr_or_dtype.inferred_type == 'boolean')
-
+    elif isinstance(arr_or_dtype, SparseDtype):
+        return issubclass(arr_or_dtype.subdtype.type, np.bool_)
     return issubclass(tipo, np.bool_)
 
 
@@ -1706,6 +1723,8 @@ def is_extension_array_dtype(arr_or_dtype):
     array interface. In pandas, this includes:
 
     * Categorical
+    * Sparse
+    * Interval
 
     Third-party libraries may implement arrays or types satisfying
     this interface as well.
@@ -1828,7 +1847,8 @@ def _get_dtype(arr_or_dtype):
             return PeriodDtype.construct_from_string(arr_or_dtype)
         elif is_interval_dtype(arr_or_dtype):
             return IntervalDtype.construct_from_string(arr_or_dtype)
-    elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)):
+    elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex,
+                                   ABCSparseArray, ABCSparseSeries)):
         return arr_or_dtype.dtype
 
     if hasattr(arr_or_dtype, 'dtype'):
@@ -1876,6 +1896,10 @@ def _get_dtype_type(arr_or_dtype):
         elif is_interval_dtype(arr_or_dtype):
             return IntervalDtypeType
         return _get_dtype_type(np.dtype(arr_or_dtype))
+    elif isinstance(arr_or_dtype, (ABCSparseSeries, ABCSparseArray,
+                                   SparseDtype)):
+        dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
+        return dtype.type
     try:
         return arr_or_dtype.dtype.type
     except AttributeError:

diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
@@ -97,7 +97,9 @@ def _get_frame_result_type(result, objs):
     otherwise, return 1st obj
     """
 
-    if result.blocks and all(b.is_sparse for b in result.blocks):
+    if (result.blocks and (
+            all(is_sparse(b) for b in result.blocks) or
+            all(isinstance(obj, ABCSparseDataFrame) for obj in objs))):
         from pandas.core.sparse.api import SparseDataFrame
         return SparseDataFrame
     else:
@@ -554,61 +556,23 @@ def _concat_sparse(to_concat, axis=0, typs=None):
     a single array, preserving the combined dtypes
     """
 
-    from pandas.core.sparse.array import SparseArray, _make_index
+    from pandas.core.sparse.array import SparseArray
 
-    def convert_sparse(x, axis):
-        # coerce to native type
-        if isinstance(x, SparseArray):
-            x = x.get_values()
-        else:
-            x = np.asarray(x)
-        x = x.ravel()
-        if axis > 0:
-            x = np.atleast_2d(x)
-        return x
+    fill_values = [x.fill_value for x in to_concat
+                   if isinstance(x, SparseArray)]
 
-    if typs is None:
-        typs = get_dtype_kinds(to_concat)
+    if len(set(fill_values)) > 1:
+        raise ValueError("Cannot concatenate SparseArrays with different "
+                         "fill values")
 
-    if len(typs) == 1:
-        # concat input as it is if all inputs are sparse
-        # and have the same fill_value
-        fill_values = {c.fill_value for c in to_concat}
-        if len(fill_values) == 1:
-            sp_values = [c.sp_values for c in to_concat]
-            indexes = [c.sp_index.to_int_index() for c in to_concat]
-
-            indices = []
-            loc = 0
-            for idx in indexes:
-                indices.append(idx.indices + loc)
-                loc += idx.length
-            sp_values = np.concatenate(sp_values)
-            indices = np.concatenate(indices)
-            sp_index = _make_index(loc, indices, kind=to_concat[0].sp_index)
-
-            return SparseArray(sp_values, sparse_index=sp_index,
-                               fill_value=to_concat[0].fill_value)
-
-    # input may be sparse / dense mixed and may have different fill_value
-    # input must contain sparse at least 1
-    sparses = [c for c in to_concat if is_sparse(c)]
-    fill_values = [c.fill_value for c in sparses]
-    sp_indexes = [c.sp_index for c in sparses]
-
-    # densify and regular concat
-    to_concat = [convert_sparse(x, axis) for x in to_concat]
-    result = np.concatenate(to_concat, axis=axis)
-
-    if not len(typs - {'sparse', 'f', 'i'}):
-        # sparsify if inputs are sparse and dense numerics
-        # first sparse input's fill_value and SparseIndex is used
-        result = SparseArray(result.ravel(), fill_value=fill_values[0],
-                             kind=sp_indexes[0])
-    else:
-        # coerce to object if needed
-        result = result.astype('object')
-    return result
+    fill_value = list(fill_values)[0]
+
+    # TODO: Fix join unit generation so we aren't passed this.
+    to_concat = [x if isinstance(x, SparseArray)
+                 else SparseArray(x.squeeze(), fill_value=fill_value)
+                 for x in to_concat]
+
+    return SparseArray._concat_same_type(to_concat)
 
 
 def _concat_rangeindex_same_dtype(indexes):

diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py
@@ -5,7 +5,7 @@
     make_block,     # io.pytables, io.packers
     FloatBlock, IntBlock, ComplexBlock, BoolBlock, ObjectBlock,
     TimeDeltaBlock, DatetimeBlock, DatetimeTZBlock,
-    CategoricalBlock, ExtensionBlock, SparseBlock, ScalarBlock,
+    CategoricalBlock, ExtensionBlock, ScalarBlock,
     Block)
 from .managers import (  # noqa:F401
     BlockManager, SingleBlockManager,