Skip to content

Commit 7b36ac7

Browse files
committed
ENH: Allow SparseDataFrame/SparseSeries values assignment
1 parent e833725 commit 7b36ac7

File tree

7 files changed

+185
-63
lines changed

7 files changed

+185
-63
lines changed

pandas/core/indexing.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,10 @@ def setter(item, v):
530530
# set the item, possibly having a dtype change
531531
s._consolidate_inplace()
532532
s = s.copy()
533-
s._data = s._data.setitem(indexer=pi, value=v)
533+
if is_sparse(s):
534+
s.set_value(pi, v, takeable=is_list_like(pi))
535+
else:
536+
s._data = s._data.setitem(indexer=pi, value=v)
534537
s._maybe_update_cacher(clear=True)
535538

536539
# reset the sliced object if unique
@@ -635,8 +638,13 @@ def can_do_equal_len():
635638

636639
# actually do the set
637640
self.obj._consolidate_inplace()
638-
self.obj._data = self.obj._data.setitem(indexer=indexer,
639-
value=value)
641+
if is_sparse(self.obj):
642+
# SparseSeries has underlying SparseArray, which doesn't
643+
# support resizing
644+
self.obj[indexer] = value
645+
else:
646+
self.obj._data = self.obj._data.setitem(indexer=indexer,
647+
value=value)
640648
self.obj._maybe_update_cacher(clear=True)
641649

642650
def _align_series(self, indexer, ser, multiindex_indexer=False):
@@ -1933,6 +1941,11 @@ def _has_valid_setitem_indexer(self, indexer):
19331941

19341942
def _convert_key(self, key, is_setter=False):
19351943
""" require integer args (and convert to label arguments) """
1944+
1945+
# allow arbitrary setting
1946+
if is_setter:
1947+
return list(key)
1948+
19361949
for a, i in zip(self.obj.axes, key):
19371950
if not is_integer(i):
19381951
raise ValueError("iAt based indexing can only have integer "

pandas/core/internals.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1733,12 +1733,13 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
17331733
# use block's copy logic.
17341734
# .values may be an Index which does shallow copy by default
17351735
new_values = self.values if inplace else self.copy().values
1736+
new_values = new_values.to_dense()
17361737
new_values, _, new, _ = self._try_coerce_args(new_values, new)
17371738

17381739
if isinstance(new, np.ndarray) and len(new) == len(mask):
17391740
new = new[mask]
17401741

1741-
mask = _safe_reshape(mask, new_values.shape)
1742+
mask = _safe_reshape(np.asarray(mask), new_values.shape)
17421743

17431744
new_values[mask] = new
17441745
new_values = self._try_coerce_result(new_values)
@@ -2753,6 +2754,12 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
27532754
return self.make_block_same_class(values=values,
27542755
placement=self.mgr_locs)
27552756

2757+
def _try_coerce_result(self, result):
2758+
if not is_sparse(result):
2759+
result = SparseArray(result, kind=self.kind,
2760+
fill_value=self.fill_value, dtype=self.dtype)
2761+
return result
2762+
27562763
def __len__(self):
27572764
try:
27582765
return self.sp_index.length

pandas/core/series.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -948,8 +948,7 @@ def set_value(self, label, value, takeable=False):
948948
Returns
949949
-------
950950
series : Series
951-
If label is contained, will be reference to calling Series,
952-
otherwise a new object
951+
self
953952
"""
954953
warnings.warn("set_value is deprecated and will be removed "
955954
"in a future release. Please use "

pandas/core/sparse/frame.py

Lines changed: 2 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -322,8 +322,8 @@ def _apply_columns(self, func):
322322
default_fill_value=self.default_fill_value,
323323
default_kind=self.default_kind).__finalize__(self)
324324

325-
def astype(self, dtype):
326-
return self._apply_columns(lambda x: x.astype(dtype))
325+
def astype(self, dtype, **kwargs):
326+
return self._apply_columns(lambda x: x.astype(dtype, **kwargs))
327327

328328
def copy(self, deep=True):
329329
"""
@@ -465,44 +465,6 @@ def _get_value(self, index, col, takeable=False):
465465
return series._get_value(index, takeable=takeable)
466466
_get_value.__doc__ = get_value.__doc__
467467

468-
def set_value(self, index, col, value, takeable=False):
469-
"""
470-
Put single value at passed column and index
471-
472-
.. deprecated:: 0.21.0
473-
474-
Please use .at[] or .iat[] accessors.
475-
476-
Parameters
477-
----------
478-
index : row label
479-
col : column label
480-
value : scalar value
481-
takeable : interpret the index/col as indexers, default False
482-
483-
Notes
484-
-----
485-
This method *always* returns a new object. It is currently not
486-
particularly efficient (and potentially very expensive) but is provided
487-
for API compatibility with DataFrame
488-
489-
Returns
490-
-------
491-
frame : DataFrame
492-
"""
493-
warnings.warn("set_value is deprecated and will be removed "
494-
"in a future release. Please use "
495-
".at[] or .iat[] accessors instead", FutureWarning,
496-
stacklevel=2)
497-
return self._set_value(index, col, value, takeable=takeable)
498-
499-
def _set_value(self, index, col, value, takeable=False):
500-
dense = self.to_dense()._set_value(
501-
index, col, value, takeable=takeable)
502-
return dense.to_sparse(kind=self._default_kind,
503-
fill_value=self._default_fill_value)
504-
_set_value.__doc__ = set_value.__doc__
505-
506468
def _slice(self, slobj, axis=0, kind=None):
507469
if axis == 0:
508470
new_index = self.index[slobj]

pandas/core/sparse/series.py

Lines changed: 103 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,20 @@
99
import warnings
1010

1111
from pandas.core.dtypes.missing import isna, notna
12-
from pandas.core.dtypes.common import is_scalar
12+
from pandas.core.dtypes.common import is_scalar, is_sparse
1313
from pandas.core.common import _values_from_object, _maybe_match_name
1414

1515
from pandas.compat.numpy import function as nv
1616
from pandas.core.index import Index, _ensure_index, InvalidIndexError
17+
from pandas.core.indexing import check_bool_indexer
1718
from pandas.core.series import Series
1819
from pandas.core.frame import DataFrame
1920
from pandas.core.internals import SingleBlockManager
2021
from pandas.core import generic
2122
import pandas.core.common as com
2223
import pandas.core.ops as ops
2324
import pandas._libs.index as _index
25+
from pandas.errors import PerformanceWarning
2426
from pandas.util._decorators import Appender
2527

2628
from pandas.core.sparse.array import (
@@ -314,8 +316,13 @@ def __array_wrap__(self, result, context=None):
314316
else:
315317
fill_value = self.fill_value
316318

319+
# Result size equals the number of stored points, so the old sparse index
# is assumed to still be valid (NOTE(review): unverified assumption)
320+
if np.size(result) == self.sp_index.npoints:
321+
sp_index = self.sp_index
322+
else:
323+
sp_index = None
317324
return self._constructor(result, index=self.index,
318-
sparse_index=self.sp_index,
325+
sparse_index=sp_index,
319326
fill_value=fill_value,
320327
copy=False).__finalize__(self)
321328

@@ -426,7 +433,22 @@ def _get_values(self, indexer):
426433
return self[indexer]
427434

428435
def _set_with_engine(self, key, value):
436+
<<<<<<< HEAD
429437
return self._set_value(key, value)
438+
||||||| parent of e95270f1e... ENH: Allow SparseDataFrame/SparseSeries values assignment
439+
return self.set_value(key, value)
440+
=======
441+
takeable = False
442+
443+
# Sparse doesn't support reshaping so the standard .where() does
444+
# not apply. We short-circuit bool indexers here by treating them as
445+
# a regular list of integer positions and setting each array/value separately
446+
if com.is_bool_indexer(key):
447+
key = check_bool_indexer(self.index, key).nonzero()[0]
448+
takeable = True
449+
450+
return self.set_value(key, value, takeable=takeable)
451+
>>>>>>> e95270f1e... ENH: Allow SparseDataFrame/SparseSeries values assignment
430452

431453
def abs(self):
432454
"""
@@ -525,18 +547,62 @@ def set_value(self, label, value, takeable=False):
525547
return self._set_value(label, value, takeable=takeable)
526548

527549
def _set_value(self, label, value, takeable=False):
528-
values = self.to_dense()
529-
530-
# if the label doesn't exist, we will create a new object here
531-
# and possibly change the index
532-
new_values = values._set_value(label, value, takeable=takeable)
533-
if new_values is not None:
534-
values = new_values
535-
new_index = values.index
536-
values = SparseArray(values, fill_value=self.fill_value,
537-
kind=self.kind)
538-
self._data = SingleBlockManager(values, new_index)
539-
self._index = new_index
550+
try:
551+
loc = self.index.get_loc(label)
552+
except (KeyError, TypeError):
553+
loc = None
554+
555+
warnings.warn(
556+
'Setting SparseSeries values is inefficient '
557+
'(a copy of data is made)', PerformanceWarning, stacklevel=2)
558+
559+
# If label does not resolve to a single integer location (it is missing
560+
# or not unique in index), or it is takeable, amend the series by amending its dense copy
561+
if not isinstance(loc, int) or takeable:
562+
warnings.warn(
563+
'Setting SparseSeries values is particularly inefficient when '
564+
'indexing with a non-unique label because the whole series '
565+
'is made dense interim.', PerformanceWarning, stacklevel=2)
566+
values = self.to_dense()
567+
values.set_value(label, value, takeable=takeable)
568+
569+
index = values.index
570+
sp_index = None
571+
values = values.to_sparse(kind=self.kind,
572+
fill_value=self.fill_value)
573+
574+
# If label is unique key and not takeable, then it is more space-
575+
# efficient to not make the whole series dense, rather just its
576+
# defined part
577+
else:
578+
values = self._to_dense(sparse_only=True)
579+
old_index = values.index
580+
values.set_value(label, value, takeable=takeable)
581+
index = self.index
582+
583+
# label was already in sparse index, we can just reuse old index
584+
if label in old_index:
585+
sp_index = self.sp_index
586+
587+
# label is not among the stored (sparse) points, but it may still be in .index
588+
else:
589+
# and if not, just add it, then construct both indexes anew
590+
if loc is None:
591+
index = self.index.append(Index((label,)))
592+
loc = len(index) - 1
593+
594+
indices = np.append(
595+
self.sp_index.to_int_index().indices,
596+
np.array(loc, dtype=np.int32))
597+
order = indices.argsort()
598+
values = values.values.take(order)
599+
indices = indices.take(order)
600+
sp_index = _make_index(len(index), indices, self.kind)
601+
602+
values = SparseArray(values, sparse_index=sp_index, kind=self.kind,
603+
fill_value=self.fill_value)
604+
self._data = SingleBlockManager(values, index)
605+
self._index = index
540606
_set_value.__doc__ = set_value.__doc__
541607

542608
def _set_values(self, key, value):
@@ -561,7 +627,8 @@ def to_dense(self, sparse_only=False):
561627
Parameters
562628
----------
563629
sparse_only: bool, default False
564-
DEPRECATED: this argument will be removed in a future version.
630+
.. deprecated:: 0.19.2
631+
This argument will be removed in a future version.
565632
566633
If True, return just the non-sparse values, or the dense version
567634
of `self.values` if False.
@@ -574,12 +641,31 @@ def to_dense(self, sparse_only=False):
574641
warnings.warn(("The 'sparse_only' parameter has been deprecated "
575642
"and will be removed in a future version."),
576643
FutureWarning, stacklevel=2)
644+
return self._to_dense(sparse_only=sparse_only)
645+
646+
def _to_dense(self, sparse_only=False):
647+
"""
648+
Convert SparseSeries to a Series.
649+
650+
Parameters
651+
----------
652+
sparse_only: bool, default False
653+
If True, return just the non-sparse values, or the dense version
654+
of `self.values` if False.
655+
656+
Returns
657+
-------
658+
s : Series
659+
"""
660+
if sparse_only:
577661
int_index = self.sp_index.to_int_index()
578662
index = self.index.take(int_index.indices)
579663
return Series(self.sp_values, index=index, name=self.name)
580664
else:
581-
return Series(self.values.to_dense(), index=self.index,
582-
name=self.name)
665+
values = self.values
666+
if is_sparse(values):
667+
values = values.to_dense()
668+
return Series(values, index=self.index, name=self.name)
583669

584670
@property
585671
def density(self):

pandas/tests/sparse/test_frame.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,3 +1398,33 @@ def test_numpy_func_call(self):
13981398
'std', 'min', 'max']
13991399
for func in funcs:
14001400
getattr(np, func)(self.frame)
1401+
1402+
1403+
@pytest.mark.parametrize('kind', ['integer', 'block'])
1404+
@pytest.mark.parametrize('indexer', [None, 'loc', 'iloc', 'at', 'iat'])
1405+
@pytest.mark.parametrize('key', [0, [0, 1], [True, False], None])
1406+
def test_frame_assignment(kind, indexer, key):
1407+
try_multiple = 'at' not in (indexer or '')
1408+
is_multi_key = np.asarray(key).ndim > 0
1409+
if is_multi_key and not try_multiple:
1410+
return
1411+
if not indexer and not is_multi_key and key is not None: # skip non-multikey with setitem
1412+
return
1413+
if indexer and key is None: # skip df indexer with non-setitem
1414+
return
1415+
1416+
arr = np.array([[1, nan],
1417+
[nan, 1]])
1418+
sdf = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
1419+
1420+
if key is None:
1421+
key = pd.isnull(sdf).to_sparse()
1422+
1423+
arr = arr.copy()
1424+
arr[np.asarray(key)] = 2
1425+
res = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
1426+
1427+
sdf_setitem = getattr(sdf, indexer) if indexer else sdf
1428+
sdf_setitem[key] = 2
1429+
1430+
tm.assert_sp_frame_equal(sdf, res)

pandas/tests/sparse/test_series.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1428,3 +1428,28 @@ def test_constructor_dict_datetime64_index(datetime_type):
14281428
expected = SparseSeries(values, map(pd.Timestamp, dates))
14291429

14301430
tm.assert_sp_series_equal(result, expected)
1431+
1432+
1433+
@pytest.mark.parametrize('kind', ['integer', 'block'])
1434+
@pytest.mark.parametrize('indexer', [None, 'loc', 'iloc', 'at', 'iat'])
1435+
@pytest.mark.parametrize('key', [0, [0, 1], 2, [2, 3],
1436+
[True, False, False, False],
1437+
[False, False, False, True],])
1438+
def test_series_assignment(kind, indexer, key):
1439+
try_multiple = 'at' not in (indexer or '')
1440+
is_multi_key = np.asarray(key).ndim > 0
1441+
if is_multi_key and not try_multiple:
1442+
return
1443+
1444+
arr = np.array([0., 0., nan, nan])
1445+
ss = SparseSeries(arr, kind=kind)
1446+
assert len(ss.sp_index.to_int_index().indices) == 2
1447+
1448+
res = arr.copy()
1449+
res[key] = 1
1450+
res = SparseSeries(res, kind=kind)
1451+
1452+
ss_setitem = getattr(ss, indexer) if indexer else ss
1453+
ss_setitem[key] = 1
1454+
1455+
tm.assert_sp_series_equal(ss, res)

0 commit comments

Comments
 (0)