Skip to content

Commit b5b57c9

Browse files
committed
ENH: Allow SparseDataFrame/SparseSeries values assignment
1 parent 8453c73 commit b5b57c9

File tree

8 files changed: +173 additions, -83 deletions

pandas/core/frame.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -770,9 +770,9 @@ def iterrows(self):
770770
iteritems : Iterate over (column name, Series) pairs.
771771
772772
"""
773-
iloc = self.iloc
773+
row_at = self.iloc.__getitem__
774774
for i, k in enumerate(self.index):
775-
yield k, iloc[i]
775+
yield k, row_at(i)
776776

777777
def itertuples(self, index=True, name="Pandas"):
778778
"""
@@ -2567,9 +2567,7 @@ def set_value(self, index, col, value, takeable=False):
25672567
25682568
Returns
25692569
-------
2570-
frame : DataFrame
2571-
If label pair is contained, will be reference to calling DataFrame,
2572-
otherwise a new object
2570+
self : DataFrame
25732571
"""
25742572
warnings.warn("set_value is deprecated and will be removed "
25752573
"in a future release. Please use "

pandas/core/internals.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -925,6 +925,9 @@ def _is_empty_indexer(indexer):
925925
if _is_empty_indexer(indexer):
926926
pass
927927

928+
elif is_sparse(values):
929+
values = values.set_values(indexer, value)
930+
928931
# setting a single element for each dim and with a rhs that could
929932
# be say a list
930933
# GH 6043
@@ -1809,6 +1812,11 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
18091812
new_values = self.values if inplace else self.copy().values
18101813
new_values, _, new, _ = self._try_coerce_args(new_values, new)
18111814

1815+
if is_sparse(new_values):
1816+
indexer = mask.to_dense().values.ravel().nonzero()[0]
1817+
block = self.setitem(indexer, new)
1818+
return [block]
1819+
18121820
if isinstance(new, np.ndarray) and len(new) == len(mask):
18131821
new = new[mask]
18141822

@@ -3061,6 +3069,17 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
30613069
return self.make_block_same_class(values=values,
30623070
placement=self.mgr_locs)
30633071

3072+
def _can_hold_element(self, element):
3073+
return np.can_cast(np.asarray(element).dtype, self.sp_values.dtype)
3074+
3075+
def _try_coerce_result(self, result):
3076+
if (isinstance(result, np.ndarray) and
3077+
np.ndim(result) > 0
3078+
and not is_sparse(result)):
3079+
result = SparseArray(result, kind=self.kind,
3080+
fill_value=self.fill_value, dtype=self.dtype)
3081+
return result
3082+
30643083
def __len__(self):
30653084
try:
30663085
return self.sp_index.length

pandas/core/series.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1063,9 +1063,7 @@ def set_value(self, label, value, takeable=False):
10631063
10641064
Returns
10651065
-------
1066-
series : Series
1067-
If label is contained, will be reference to calling Series,
1068-
otherwise a new object
1066+
self : Series
10691067
"""
10701068
warnings.warn("set_value is deprecated and will be removed "
10711069
"in a future release. Please use "

pandas/core/sparse/array.py

Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@
3737
import pandas.core.algorithms as algos
3838
import pandas.core.ops as ops
3939
import pandas.io.formats.printing as printing
40+
from pandas.errors import PerformanceWarning
4041
from pandas.util._decorators import Appender
4142
from pandas.core.indexes.base import _index_shared_docs
4243

@@ -369,6 +370,53 @@ def get_values(self, fill=None):
369370
""" return a dense representation """
370371
return self.to_dense(fill=fill)
371372

373+
def set_values(self, indexer, value):
374+
"""
375+
Return new SparseArray with indexed values set to `value`.
376+
377+
Returns
378+
-------
379+
SparseArray
380+
A new sparse array with indexer positions filled with value.
381+
"""
382+
# If indexer is not a single int position, easiest to handle via dense
383+
if not is_scalar(indexer):
384+
warnings.warn(
385+
'Setting SparseSeries/Array values is particularly '
386+
'inefficient when indexing with multiple keys because the '
387+
'whole series is made dense interim.',
388+
PerformanceWarning, stacklevel=2)
389+
390+
values = self.to_dense()
391+
values[indexer] = value
392+
return SparseArray(values, kind=self.kind,
393+
fill_value=self.fill_value)
394+
395+
warnings.warn(
396+
'Setting SparseSeries/Array values is inefficient '
397+
'(a copy of data is made).', PerformanceWarning, stacklevel=2)
398+
399+
# If label already in sparse index, just switch the value on a copy
400+
idx = self.sp_index.lookup(indexer)
401+
if idx != -1:
402+
obj = self.copy()
403+
obj.sp_values[idx] = value
404+
return obj
405+
406+
# Otherwise, construct a new array, and insert the new value in the
407+
# correct position
408+
indices = self.sp_index.to_int_index().indices
409+
pos = np.searchsorted(indices, indexer)
410+
411+
indices = np.insert(indices, pos, indexer)
412+
sp_values = np.insert(self.sp_values, pos, value)
413+
# Length can be increased when adding a new value into index
414+
length = max(self.sp_index.length, indexer + 1)
415+
sp_index = _make_index(length, indices, self.kind)
416+
417+
return SparseArray(sp_values, sparse_index=sp_index,
418+
fill_value=self.fill_value)
419+
372420
def to_dense(self, fill=None):
373421
"""
374422
Convert SparseArray to a NumPy array.

pandas/core/sparse/frame.py

Lines changed: 2 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -332,8 +332,8 @@ def _apply_columns(self, func):
332332
default_fill_value=self.default_fill_value,
333333
default_kind=self.default_kind).__finalize__(self)
334334

335-
def astype(self, dtype):
336-
return self._apply_columns(lambda x: x.astype(dtype))
335+
def astype(self, dtype, **kwargs):
336+
return self._apply_columns(lambda x: x.astype(dtype, **kwargs))
337337

338338
def copy(self, deep=True):
339339
"""
@@ -464,44 +464,6 @@ def _get_value(self, index, col, takeable=False):
464464
return series._get_value(index, takeable=takeable)
465465
_get_value.__doc__ = get_value.__doc__
466466

467-
def set_value(self, index, col, value, takeable=False):
468-
"""
469-
Put single value at passed column and index
470-
471-
.. deprecated:: 0.21.0
472-
473-
Please use .at[] or .iat[] accessors.
474-
475-
Parameters
476-
----------
477-
index : row label
478-
col : column label
479-
value : scalar value
480-
takeable : interpret the index/col as indexers, default False
481-
482-
Notes
483-
-----
484-
This method *always* returns a new object. It is currently not
485-
particularly efficient (and potentially very expensive) but is provided
486-
for API compatibility with DataFrame
487-
488-
Returns
489-
-------
490-
frame : DataFrame
491-
"""
492-
warnings.warn("set_value is deprecated and will be removed "
493-
"in a future release. Please use "
494-
".at[] or .iat[] accessors instead", FutureWarning,
495-
stacklevel=2)
496-
return self._set_value(index, col, value, takeable=takeable)
497-
498-
def _set_value(self, index, col, value, takeable=False):
499-
dense = self.to_dense()._set_value(
500-
index, col, value, takeable=takeable)
501-
return dense.to_sparse(kind=self._default_kind,
502-
fill_value=self._default_fill_value)
503-
_set_value.__doc__ = set_value.__doc__
504-
505467
def _slice(self, slobj, axis=0, kind=None):
506468
if axis == 0:
507469
new_index = self.index[slobj]

pandas/core/sparse/series.py

Lines changed: 20 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import warnings
1010

1111
from pandas.core.dtypes.missing import isna, notna
12+
from pandas.core.dtypes.common import is_sparse
1213

1314
from pandas.compat.numpy import function as nv
1415
from pandas.core.index import Index, _ensure_index, InvalidIndexError
@@ -17,7 +18,6 @@
1718
from pandas.core import generic
1819
import pandas.core.common as com
1920
import pandas.core.ops as ops
20-
import pandas._libs.index as libindex
2121
from pandas.util._decorators import Appender
2222

2323
from pandas.core.sparse.array import (
@@ -277,8 +277,13 @@ def __array_wrap__(self, result, context=None):
277277
else:
278278
fill_value = self.fill_value
279279

280+
# Assume: If result size matches, old sparse index is valid (ok???)
281+
if np.size(result) == self.sp_index.npoints:
282+
sp_index = self.sp_index
283+
else:
284+
sp_index = None
280285
return self._constructor(result, index=self.index,
281-
sparse_index=self.sp_index,
286+
sparse_index=sp_index,
282287
fill_value=fill_value,
283288
copy=False).__finalize__(self)
284289

@@ -479,7 +484,7 @@ def set_value(self, label, value, takeable=False):
479484
480485
Returns
481486
-------
482-
series : SparseSeries
487+
self : SparseSeries
483488
"""
484489
warnings.warn("set_value is deprecated and will be removed "
485490
"in a future release. Please use "
@@ -488,35 +493,16 @@ def set_value(self, label, value, takeable=False):
488493
return self._set_value(label, value, takeable=takeable)
489494

490495
def _set_value(self, label, value, takeable=False):
491-
values = self.to_dense()
492-
493-
# if the label doesn't exist, we will create a new object here
494-
# and possibly change the index
495-
new_values = values._set_value(label, value, takeable=takeable)
496-
if new_values is not None:
497-
values = new_values
498-
new_index = values.index
499-
values = SparseArray(values, fill_value=self.fill_value,
500-
kind=self.kind)
501-
self._data = SingleBlockManager(values, new_index)
502-
self._index = new_index
496+
self._data = self._data.copy()
497+
try:
498+
idx = self.index.get_loc(label)
499+
except KeyError:
500+
idx = len(self)
501+
self._data.axes[0] = self._data.index.append(Index([label]))
502+
self._data = self._data.setitem(indexer=idx, value=value)
503+
return self
503504
_set_value.__doc__ = set_value.__doc__
504505

505-
def _set_values(self, key, value):
506-
507-
# this might be inefficient as we have to recreate the sparse array
508-
# rather than setting individual elements, but have to convert
509-
# the passed slice/boolean that's in dense space into a sparse indexer
510-
# not sure how to do that!
511-
if isinstance(key, Series):
512-
key = key.values
513-
514-
values = self.values.to_dense()
515-
values[key] = libindex.convert_scalar(values, value)
516-
values = SparseArray(values, fill_value=self.fill_value,
517-
kind=self.kind)
518-
self._data = SingleBlockManager(values, self.index)
519-
520506
def to_dense(self, sparse_only=False):
521507
"""
522508
Convert SparseSeries to a Series.
@@ -542,8 +528,10 @@ def to_dense(self, sparse_only=False):
542528
index = self.index.take(int_index.indices)
543529
return Series(self.sp_values, index=index, name=self.name)
544530
else:
545-
return Series(self.values.to_dense(), index=self.index,
546-
name=self.name)
531+
values = self.values
532+
if is_sparse(values):
533+
values = values.to_dense()
534+
return Series(values, index=self.index, name=self.name)
547535

548536
@property
549537
def density(self):

pandas/tests/sparse/frame/test_frame.py

Lines changed: 53 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010

1111
from pandas import Series, DataFrame, bdate_range, Panel
1212
from pandas.core.indexes.datetimes import DatetimeIndex
13+
from pandas.errors import PerformanceWarning
1314
from pandas.tseries.offsets import BDay
1415
from pandas.util import testing as tm
1516
from pandas.compat import lrange
@@ -471,7 +472,6 @@ def test_set_value(self):
471472
with tm.assert_produces_warning(FutureWarning,
472473
check_stacklevel=False):
473474
res = self.frame.set_value('foobar', 'B', 1.5)
474-
assert res is not self.frame
475475
assert res.index[-1] == 'foobar'
476476
with tm.assert_produces_warning(FutureWarning,
477477
check_stacklevel=False):
@@ -480,9 +480,8 @@ def test_set_value(self):
480480
with tm.assert_produces_warning(FutureWarning,
481481
check_stacklevel=False):
482482
res2 = res.set_value('foobar', 'qux', 1.5)
483-
assert res2 is not res
484483
tm.assert_index_equal(res2.columns,
485-
pd.Index(list(self.frame.columns) + ['qux']))
484+
pd.Index(list(self.frame.columns)))
486485
with tm.assert_produces_warning(FutureWarning,
487486
check_stacklevel=False):
488487
assert res2.get_value('foobar', 'qux') == 1.5
@@ -1301,3 +1300,54 @@ def test_assign_with_sparse_frame(self):
13011300

13021301
for column in res.columns:
13031302
assert type(res[column]) is SparseSeries
1303+
1304+
1305+
def _test_assignment(kind, indexer, key=None):
1306+
arr = np.array([[1, nan],
1307+
[nan, 1]])
1308+
df = DataFrame(arr, copy=True)
1309+
sdf = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
1310+
1311+
def get_indexer(df):
1312+
return getattr(df, indexer) if indexer else df
1313+
1314+
if key is None:
1315+
key = pd.isnull(sdf).to_sparse()
1316+
1317+
get_indexer(sdf)[key] = 2
1318+
1319+
get_indexer(df)[key] = 2
1320+
res = df.to_sparse(kind=kind)
1321+
1322+
tm.assert_sp_frame_equal(sdf, res)
1323+
1324+
1325+
@pytest.fixture(params=['integer', 'block'])
1326+
def spindex_kind(request):
1327+
return request.param
1328+
1329+
1330+
@pytest.mark.parametrize('indexer', ['iat'])
1331+
@pytest.mark.parametrize('key', [(0, 0)])
1332+
def test_frame_assignment_at(spindex_kind, indexer, key):
1333+
_test_assignment(spindex_kind, indexer, key)
1334+
1335+
1336+
@pytest.mark.parametrize('indexer', ['at', 'loc', 'iloc'])
1337+
@pytest.mark.parametrize('key', [0,
1338+
[0, 1],
1339+
[True, False]])
1340+
def test_frame_assignment_loc(spindex_kind, indexer, key):
1341+
_test_assignment(spindex_kind, indexer, key)
1342+
1343+
1344+
@pytest.mark.parametrize('key', [None,
1345+
[True, False]])
1346+
def test_frame_assignment_setitem(spindex_kind, key):
1347+
_test_assignment(spindex_kind, None, key)
1348+
1349+
1350+
@pytest.mark.parametrize('indexer', ['loc', 'at'])
1351+
@pytest.mark.parametrize('key', [3])
1352+
def test_frame_assignment_extend_index(spindex_kind, indexer, key):
1353+
_test_assignment(spindex_kind, indexer, key)

0 commit comments

Comments
 (0)