ENH: Allow SparseDataFrame/SparseSeries values assignment

kernc · kernc · commit 7b36ac756c03 · 2017-10-05T19:52:54.000+02:00
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -530,7 +530,10 @@ def setter(item, v):
                     # set the item, possibly having a dtype change
                     s._consolidate_inplace()
                     s = s.copy()
-                    s._data = s._data.setitem(indexer=pi, value=v)
+                    if is_sparse(s):
+                        s.set_value(pi, v, takeable=is_list_like(pi))
+                    else:
+                        s._data = s._data.setitem(indexer=pi, value=v)
                     s._maybe_update_cacher(clear=True)
 
                 # reset the sliced object if unique
@@ -635,8 +638,13 @@ def can_do_equal_len():
 
             # actually do the set
             self.obj._consolidate_inplace()
-            self.obj._data = self.obj._data.setitem(indexer=indexer,
-                                                    value=value)
+            if is_sparse(self.obj):
+                # SparseSeries has an underlying SparseArray, which doesn't
+                # support resizing
+                self.obj[indexer] = value
+            else:
+                self.obj._data = self.obj._data.setitem(indexer=indexer,
+                                                        value=value)
             self.obj._maybe_update_cacher(clear=True)
 
     def _align_series(self, indexer, ser, multiindex_indexer=False):
@@ -1933,6 +1941,11 @@ def _has_valid_setitem_indexer(self, indexer):
 
     def _convert_key(self, key, is_setter=False):
         """ require  integer args (and convert to label arguments) """
+
+        # when setting, skip the integer check below and accept any key
+        if is_setter:
+            return list(key)
+
         for a, i in zip(self.obj.axes, key):
             if not is_integer(i):
                 raise ValueError("iAt based indexing can only have integer "
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1733,12 +1733,13 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
         # use block's copy logic.
         # .values may be an Index which does shallow copy by default
         new_values = self.values if inplace else self.copy().values
+        new_values = new_values.to_dense()
         new_values, _, new, _ = self._try_coerce_args(new_values, new)
 
         if isinstance(new, np.ndarray) and len(new) == len(mask):
             new = new[mask]
 
-        mask = _safe_reshape(mask, new_values.shape)
+        mask = _safe_reshape(np.asarray(mask), new_values.shape)
 
         new_values[mask] = new
         new_values = self._try_coerce_result(new_values)
@@ -2753,6 +2754,12 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
         return self.make_block_same_class(values=values,
                                           placement=self.mgr_locs)
 
+    def _try_coerce_result(self, result):
+        if not is_sparse(result):
+            result = SparseArray(result, kind=self.kind,
+                                 fill_value=self.fill_value, dtype=self.dtype)
+        return result
+
     def __len__(self):
         try:
             return self.sp_index.length
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -948,8 +948,7 @@ def set_value(self, label, value, takeable=False):
         Returns
         -------
         series : Series
-            If label is contained, will be reference to calling Series,
-            otherwise a new object
+            self
         """
         warnings.warn("set_value is deprecated and will be removed "
                       "in a future release. Please use "
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
@@ -322,8 +322,8 @@ def _apply_columns(self, func):
             default_fill_value=self.default_fill_value,
             default_kind=self.default_kind).__finalize__(self)
 
-    def astype(self, dtype):
-        return self._apply_columns(lambda x: x.astype(dtype))
+    def astype(self, dtype, **kwargs):
+        return self._apply_columns(lambda x: x.astype(dtype, **kwargs))
 
     def copy(self, deep=True):
         """
@@ -465,44 +465,6 @@ def _get_value(self, index, col, takeable=False):
         return series._get_value(index, takeable=takeable)
     _get_value.__doc__ = get_value.__doc__
 
-    def set_value(self, index, col, value, takeable=False):
-        """
-        Put single value at passed column and index
-
-        .. deprecated:: 0.21.0
-
-        Please use .at[] or .iat[] accessors.
-
-        Parameters
-        ----------
-        index : row label
-        col : column label
-        value : scalar value
-        takeable : interpret the index/col as indexers, default False
-
-        Notes
-        -----
-        This method *always* returns a new object. It is currently not
-        particularly efficient (and potentially very expensive) but is provided
-        for API compatibility with DataFrame
-
-        Returns
-        -------
-        frame : DataFrame
-        """
-        warnings.warn("set_value is deprecated and will be removed "
-                      "in a future release. Please use "
-                      ".at[] or .iat[] accessors instead", FutureWarning,
-                      stacklevel=2)
-        return self._set_value(index, col, value, takeable=takeable)
-
-    def _set_value(self, index, col, value, takeable=False):
-        dense = self.to_dense()._set_value(
-            index, col, value, takeable=takeable)
-        return dense.to_sparse(kind=self._default_kind,
-                               fill_value=self._default_fill_value)
-    _set_value.__doc__ = set_value.__doc__
-
     def _slice(self, slobj, axis=0, kind=None):
         if axis == 0:
             new_index = self.index[slobj]
diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
@@ -9,18 +9,20 @@
 import warnings
 
 from pandas.core.dtypes.missing import isna, notna
-from pandas.core.dtypes.common import is_scalar
+from pandas.core.dtypes.common import is_scalar, is_sparse
 from pandas.core.common import _values_from_object, _maybe_match_name
 
 from pandas.compat.numpy import function as nv
 from pandas.core.index import Index, _ensure_index, InvalidIndexError
+from pandas.core.indexing import check_bool_indexer
 from pandas.core.series import Series
 from pandas.core.frame import DataFrame
 from pandas.core.internals import SingleBlockManager
 from pandas.core import generic
 import pandas.core.common as com
 import pandas.core.ops as ops
 import pandas._libs.index as _index
+from pandas.errors import PerformanceWarning
 from pandas.util._decorators import Appender
 
 from pandas.core.sparse.array import (
@@ -314,8 +316,13 @@ def __array_wrap__(self, result, context=None):
         else:
             fill_value = self.fill_value
 
+        # Size equals sp_index.npoints: assume the old sparse index is valid
+        if np.size(result) == self.sp_index.npoints:
+            sp_index = self.sp_index
+        else:
+            sp_index = None
         return self._constructor(result, index=self.index,
-                                 sparse_index=self.sp_index,
+                                 sparse_index=sp_index,
                                  fill_value=fill_value,
                                  copy=False).__finalize__(self)
 
@@ -426,7 +433,22 @@ def _get_values(self, indexer):
             return self[indexer]
 
     def _set_with_engine(self, key, value):
+<<<<<<< HEAD
         return self._set_value(key, value)
+||||||| parent of e95270f1e... ENH: Allow SparseDataFrame/SparseSeries values assignment
+        return self.set_value(key, value)
+=======
+        takeable = False
+
+        # Sparse doesn't support reshaping so the standard .where() does
+        # not apply. We short-circuit bool indexers here by treating them as
+        # a regular list of indexes and setting each array/value separately
+        if com.is_bool_indexer(key):
+            key = check_bool_indexer(self.index, key).nonzero()[0]
+            takeable = True
+
+        return self.set_value(key, value, takeable=takeable)
+>>>>>>> e95270f1e... ENH: Allow SparseDataFrame/SparseSeries values assignment
 
     def abs(self):
         """
@@ -525,18 +547,62 @@ def set_value(self, label, value, takeable=False):
         return self._set_value(label, value, takeable=takeable)
 
     def _set_value(self, label, value, takeable=False):
-        values = self.to_dense()
-
-        # if the label doesn't exist, we will create a new object here
-        # and possibily change the index
-        new_values = values._set_value(label, value, takeable=takeable)
-        if new_values is not None:
-            values = new_values
-        new_index = values.index
-        values = SparseArray(values, fill_value=self.fill_value,
-                             kind=self.kind)
-        self._data = SingleBlockManager(values, new_index)
-        self._index = new_index
+        try:
+            loc = self.index.get_loc(label)
+        except (KeyError, TypeError):
+            loc = None
+
+        warnings.warn(
+            'Setting SparseSeries values is inefficient '
+            '(a copy of data is made)', PerformanceWarning, stacklevel=2)
+
+        # If label is not unique in index, or it is takeable,
+        # amend the series by amending its dense copy
+        if not isinstance(loc, int) or takeable:
+            warnings.warn(
+                'Setting SparseSeries values is particularly inefficient when '
+                'indexing with a non-unique label because the whole series '
+                'is made dense interim.', PerformanceWarning, stacklevel=2)
+            values = self.to_dense()
+            values.set_value(label, value, takeable=takeable)
+
+            index = values.index
+            sp_index = None
+            values = values.to_sparse(kind=self.kind,
+                                      fill_value=self.fill_value)
+
+        # If label is a unique key and not takeable, then it is more space-
+        # efficient to not make the whole series dense, rather just its
+        # defined part
+        else:
+            values = self._to_dense(sparse_only=True)
+            old_index = values.index
+            values.set_value(label, value, takeable=takeable)
+            index = self.index
+
+            # label was already in sparse index, we can just reuse old index
+            if label in old_index:
+                sp_index = self.sp_index
+
+            # label might have been at least in .index
+            else:
+                # and if not, just add it, then construct both indexes anew
+                if loc is None:
+                    index = self.index.append(Index((label,)))
+                    loc = len(index) - 1
+
+                indices = np.append(
+                    self.sp_index.to_int_index().indices,
+                    np.array(loc, dtype=np.int32))
+                order = indices.argsort()
+                values = values.values.take(order)
+                indices = indices.take(order)
+                sp_index = _make_index(len(index), indices, self.kind)
+
+        values = SparseArray(values, sparse_index=sp_index, kind=self.kind,
+                             fill_value=self.fill_value)
+        self._data = SingleBlockManager(values, index)
+        self._index = index
     _set_value.__doc__ = set_value.__doc__
 
     def _set_values(self, key, value):
@@ -561,7 +627,8 @@ def to_dense(self, sparse_only=False):
         Parameters
         ----------
         sparse_only: bool, default False
-            DEPRECATED: this argument will be removed in a future version.
+            .. deprecated:: 0.19.2
+                This argument will be removed in a future version.
 
             If True, return just the non-sparse values, or the dense version
             of `self.values` if False.
@@ -574,12 +641,31 @@ def to_dense(self, sparse_only=False):
             warnings.warn(("The 'sparse_only' parameter has been deprecated "
                            "and will be removed in a future version."),
                           FutureWarning, stacklevel=2)
+        return self._to_dense(sparse_only=sparse_only)
+
+    def _to_dense(self, sparse_only=False):
+        """
+        Convert SparseSeries to a Series.
+
+        Parameters
+        ----------
+        sparse_only : bool, default False
+            If True, return only the stored `sp_values`, or the dense version
+            of `self.values` if False.
+
+        Returns
+        -------
+        s : Series
+        """
+        if sparse_only:
             int_index = self.sp_index.to_int_index()
             index = self.index.take(int_index.indices)
             return Series(self.sp_values, index=index, name=self.name)
         else:
-            return Series(self.values.to_dense(), index=self.index,
-                          name=self.name)
+            values = self.values
+            if is_sparse(values):
+                values = values.to_dense()
+            return Series(values, index=self.index, name=self.name)
 
     @property
     def density(self):
diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py
@@ -1398,3 +1398,33 @@ def test_numpy_func_call(self):
                  'std', 'min', 'max']
         for func in funcs:
             getattr(np, func)(self.frame)
+
+
+@pytest.mark.parametrize('kind', ['integer', 'block'])
+@pytest.mark.parametrize('indexer', [None, 'loc', 'iloc', 'at', 'iat'])
+@pytest.mark.parametrize('key', [0, [0, 1], [True, False], None])
+def test_frame_assignment(kind, indexer, key):
+    try_multiple = 'at' not in (indexer or '')
+    is_multi_key = np.asarray(key).ndim > 0
+    if is_multi_key and not try_multiple:
+        return
+    if not indexer and not is_multi_key and key is not None:  # skip non-multikey with setitem
+        return
+    if indexer and key is None:  # skip df indexer with non-setitem
+        return
+
+    arr = np.array([[1, nan],
+                    [nan, 1]])
+    sdf = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
+
+    if key is None:
+        key = pd.isnull(sdf).to_sparse()
+
+    arr = arr.copy()
+    arr[np.asarray(key)] = 2
+    res = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
+
+    sdf_setitem = getattr(sdf, indexer) if indexer else sdf
+    sdf_setitem[key] = 2
+
+    tm.assert_sp_frame_equal(sdf, res)
diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py
@@ -1428,3 +1428,28 @@ def test_constructor_dict_datetime64_index(datetime_type):
     expected = SparseSeries(values, map(pd.Timestamp, dates))
 
     tm.assert_sp_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('kind', ['integer', 'block'])
+@pytest.mark.parametrize('indexer', [None, 'loc', 'iloc', 'at', 'iat'])
+@pytest.mark.parametrize('key', [0, [0, 1], 2, [2, 3],
+                                 [True, False, False, False],
+                                 [False, False, False, True],])
+def test_series_assignment(kind, indexer, key):
+    try_multiple = 'at' not in (indexer or '')
+    is_multi_key = np.asarray(key).ndim > 0
+    if is_multi_key and not try_multiple:
+        return
+
+    arr = np.array([0., 0., nan, nan])
+    ss = SparseSeries(arr, kind=kind)
+    assert len(ss.sp_index.to_int_index().indices) == 2
+
+    res = arr.copy()
+    res[key] = 1
+    res = SparseSeries(res, kind=kind)
+
+    ss_setitem = getattr(ss, indexer) if indexer else ss
+    ss_setitem[key] = 1
+
+    tm.assert_sp_series_equal(ss, res)