Skip to content

API: DataFrame.sparse accessor #25682

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
May 14, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,29 @@ specific plotting methods of the form ``DataFrame.plot.<kind>``.
DataFrame.boxplot
DataFrame.hist


.. _api.frame.sparse:

Sparse Accessor
~~~~~~~~~~~~~~~

Sparse-dtype specific methods and attributes are provided under the
``DataFrame.sparse`` accessor.

.. autosummary::
:toctree: api/
:template: autosummary/accessor_attribute.rst

DataFrame.sparse.density

.. autosummary::
:toctree: api/

DataFrame.sparse.from_spmatrix
DataFrame.sparse.to_coo
DataFrame.sparse.to_dense


Serialization / IO / Conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Other Enhancements
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
- Added :ref:`api.frame.sparse` for working with a ``DataFrame`` whose values are sparse (:issue:`25681`)

.. _whatsnew_0250.api_breaking:

Expand Down
164 changes: 159 additions & 5 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,36 @@ def _simple_new(cls, sparse_array, sparse_index, dtype):
new._dtype = dtype
return new

@classmethod
def from_spmatrix(cls, data):
    """
    Create a SparseArray from a scipy.sparse matrix.

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        This should be a 2-D SciPy sparse matrix where the size
        of the second dimension is 1. In other words, a
        sparse matrix with a single column.

    Returns
    -------
    SparseArray

    Raises
    ------
    ValueError
        If `data` is not 2-D or does not have exactly one column.
    """
    # Explicit checks rather than ``assert`` so that the validation is
    # still performed when Python runs with assertions disabled (-O).
    if data.ndim != 2:
        raise ValueError("'data' must be 2-dimensional, not '{ndim}'"
                         .format(ndim=data.ndim))

    length, ncol = data.shape

    if ncol != 1:
        raise ValueError("'data' must have a single column, not '{ncol}'"
                         .format(ncol=ncol))

    # Take the stored values and their row positions from the same CSC
    # structure so the two always line up one-to-one.  Pairing
    # ``data.data`` with ``data.nonzero()`` does not: ``nonzero`` skips
    # explicitly stored zeros and follows COO ordering.
    data = data.tocsc()
    if not data.has_sorted_indices:
        # Work on a sorted copy so the caller's matrix is left untouched.
        data = data.sorted_indices()

    arr = data.data
    idx = data.indices

    # The fill value is the zero of the matrix's own dtype.
    zero = np.array(0, dtype=arr.dtype).item()
    dtype = SparseDtype(arr.dtype, zero)
    index = IntIndex(length, idx)

    return cls._simple_new(arr, index, dtype)

def __array__(self, dtype=None, copy=True):
fill_value = self.fill_value

Expand Down Expand Up @@ -1891,6 +1921,9 @@ def _make_index(length, indices, kind):
# ----------------------------------------------------------------------------
# Accessor

# Error message raised (as AttributeError) by the '.sparse' accessors'
# ``_validate`` methods when the wrapped data does not have a sparse dtype.
_validation_msg = "Can only use the '.sparse' accessor with Sparse data."


@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
'sp_values'],
typ='property')
Expand All @@ -1900,15 +1933,13 @@ class SparseAccessor(PandasDelegate):
"""

def __init__(self, data=None):
self._validate(data)
# Store the Series since we need that for to_coo
self._parent = data
self._validate(data)

@staticmethod
def _validate(data):
def _validate(self, data):
if not isinstance(data.dtype, SparseDtype):
msg = "Can only use the '.sparse' accessor with Sparse data."
raise AttributeError(msg)
raise AttributeError(_validation_msg)

def _delegate_property_get(self, name, *args, **kwargs):
return getattr(self._parent.values, name)
Expand Down Expand Up @@ -2025,3 +2056,126 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
column_levels,
sort_labels=sort_labels)
return A, rows, columns

def to_dense(self):
    """
    Convert the parent Series from sparse values to dense values.

    Returns
    -------
    Series
        A new Series built from the densified array, carrying the
        parent's index and name.
    """
    from pandas import Series

    parent = self._parent
    dense_values = parent.array.to_dense()
    return Series(dense_values, index=parent.index, name=parent.name)


class SparseFrameAccessor(PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Provides sparse-specific helpers for a DataFrame whose columns all
    have a SparseDtype, and is the replacement for constructing a
    ``SparseDataFrame`` directly from a scipy sparse matrix.
    """

    def __init__(self, data=None):
        # Store the DataFrame since the conversion methods operate on it
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """
        Check that every column of `data` has a sparse dtype.

        Raises
        ------
        AttributeError
            If any column dtype is not a SparseDtype.
        """
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(_validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None):
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a SparseArray.

        Raises
        ------
        ValueError
            If the length of `index` or `columns` does not match the
            shape of `data`.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        sparrays = [
            SparseArray.from_spmatrix(data[:, i])
            for i in range(data.shape[1])
        ]
        # Key the arrays by position and assign the labels afterwards.
        # Using the labels themselves as dict keys would collapse columns
        # with duplicated labels, and passing ``columns=`` to the
        # constructor is avoided here for performance reasons.
        result = DataFrame(dict(enumerate(sparrays)), index=index)
        result.columns = columns
        return result

    def to_dense(self):
        """
        Convert to dense DataFrame

        Returns
        -------
        df : DataFrame
            A DataFrame with the same index and columns whose values
            are the densified column arrays.
        """
        from pandas import DataFrame

        # Keyed by position (not label) so duplicated column labels are
        # each converted; the labels are restored afterwards.
        data = {pos: column.array.to_dense()
                for pos, (_, column) in enumerate(self._parent.items())}
        result = DataFrame(data, index=self._parent.index)
        result.columns = self._parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a scipy COO matrix.

        Returns
        -------
        scipy.sparse.coo_matrix
            Matrix with the same shape as the DataFrame, built from the
            stored (sparse) values of every column cast to a common dtype.

        Raises
        ------
        ImportError
            If scipy is not installed.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        # Materialise the dtypes as a plain list before looking for the
        # common type; only the underlying subtype is used for the values.
        dtype = find_common_type(list(self._parent.dtypes))
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype

        cols, rows, datas = [], [], []
        # Iterate positionally so that a duplicated column label still
        # yields one Series per column.
        for col, (_, s) in enumerate(self._parent.items()):
            row = s.array.sp_index.to_int_index().indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            datas.append(s.array.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        datas = np.concatenate(datas)
        return coo_matrix((datas, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the DataFrame.

        Computed as the mean of the per-column densities.
        """
        return np.mean([column.array.density
                        for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Fill in default row/column labels and check their lengths.

        Parameters
        ----------
        data : object with a 2-tuple ``shape``
        index, columns : Index or None
            ``None`` is replaced by a default index of the matching length.

        Returns
        -------
        index, columns

        Raises
        ------
        ValueError
            If a supplied label sequence does not match the data shape.
        """
        import pandas.core.indexes.base as ibase

        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns
2 changes: 2 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
PY36, raise_with_traceback, Iterator,
string_and_binary_types)
from pandas.compat.numpy import function as nv
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.dtypes.cast import (
maybe_upcast,
cast_scalar_to_array,
Expand Down Expand Up @@ -8009,6 +8010,7 @@ def isin(self, values):
plot = CachedAccessor("plot", gfx.FramePlotMethods)
hist = gfx.hist_frame
boxplot = gfx.boxplot_frame
sparse = CachedAccessor("sparse", SparseFrameAccessor)


DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
Expand Down
27 changes: 4 additions & 23 deletions pandas/core/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender

from pandas.core.dtypes.cast import find_common_type, maybe_upcast
from pandas.core.dtypes.cast import maybe_upcast
from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse
from pandas.core.dtypes.missing import isna, notna

import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseArray, SparseDtype
from pandas.core.arrays.sparse import SparseArray
import pandas.core.common as com
from pandas.core.frame import DataFrame
import pandas.core.generic as generic
Expand Down Expand Up @@ -271,27 +271,8 @@ def to_coo(self):
float32. By numpy.find_common_type convention, mixing int64 and
uint64 will result in a float64 dtype.
"""
try:
from scipy.sparse import coo_matrix
except ImportError:
raise ImportError('Scipy is not installed')

dtype = find_common_type(self.dtypes)
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype

cols, rows, datas = [], [], []
for col, name in enumerate(self):
s = self[name]
row = s.sp_index.to_int_index().indices
cols.append(np.repeat(col, len(row)))
rows.append(row)
datas.append(s.sp_values.astype(dtype, copy=False))

cols = np.concatenate(cols)
rows = np.concatenate(rows)
datas = np.concatenate(datas)
return coo_matrix((datas, (rows, cols)), shape=self.shape)
from pandas.core.arrays.sparse import SparseFrameAccessor
return SparseFrameAccessor(self).to_coo()

def __array_wrap__(self, result):
return self._constructor(
Expand Down
76 changes: 76 additions & 0 deletions pandas/tests/arrays/sparse/test_accessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import string

import numpy as np
import pytest

import pandas as pd
import pandas.util.testing as tm


class TestSeriesAccessor(object):
    """Tests for the ``Series.sparse`` accessor."""

    # TODO: collect other accessor tests
    def test_to_dense(self):
        """``to_dense`` gives back an ordinary Series with the same values."""
        values = [0, 1, 0, 10]
        sparse_ser = pd.Series(values, dtype='Sparse[int]')
        tm.assert_series_equal(sparse_ser.sparse.to_dense(),
                               pd.Series(values))


class TestFrameAccessor(object):
    """Tests for the ``DataFrame.sparse`` accessor."""

    # The scipy-dependent tests are kept in this class alongside the other
    # accessor tests; each of them skips itself when scipy is unavailable.
    @pytest.mark.parametrize('format', ['csc', 'csr', 'coo'])
    @pytest.mark.parametrize("labels", [
        None,
        list(string.ascii_letters[:10]),
    ])
    @pytest.mark.parametrize('dtype', ['float64', 'int64'])
    def test_from_spmatrix(self, format, labels, dtype):
        """An identity matrix converts to the equivalent sparse frame."""
        pytest.importorskip("scipy")
        from scipy import sparse as sp_sparse

        size = 10
        fill = np.array(0, dtype=dtype).item()
        dense = pd.DataFrame(np.eye(size, dtype=dtype),
                             index=labels, columns=labels)
        expected = dense.astype(pd.SparseDtype(dtype, fill))

        identity = sp_sparse.eye(size, format=format, dtype=dtype)
        result = pd.DataFrame.sparse.from_spmatrix(identity,
                                                   index=labels,
                                                   columns=labels)
        tm.assert_frame_equal(result, expected)

    def test_to_coo(self):
        """``to_coo`` matches a COO matrix built from the dense values."""
        pytest.importorskip("scipy")
        from scipy import sparse as sp_sparse

        columns = {"A": [0, 1, 0], "B": [1, 0, 0]}
        frame = pd.DataFrame(columns, dtype='Sparse[int64, 0]')

        expected = sp_sparse.coo_matrix(np.asarray(frame))
        result = frame.sparse.to_coo()

        # Element-wise inequality yields a sparse matrix; no stored
        # entries means the two matrices agree everywhere.
        mismatches = result != expected
        assert mismatches.nnz == 0

    def test_to_dense(self):
        """Columns with differing sparse dtypes all densify correctly."""
        row_labels = ['b', 'a']
        sparse_columns = {
            "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 0)),
            "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 1)),
            "C": pd.SparseArray([1., 0.],
                                dtype=pd.SparseDtype('float64', 0.0)),
        }
        dense_columns = {'A': [1, 0], 'B': [1, 0], 'C': [1.0, 0.0]}

        result = pd.DataFrame(sparse_columns,
                              index=row_labels).sparse.to_dense()
        expected = pd.DataFrame(dense_columns, index=row_labels)
        tm.assert_frame_equal(result, expected)

    def test_density(self):
        """Frame density is the mean of the per-column densities."""
        frame = pd.DataFrame({
            'A': pd.SparseArray([1, 0, 2, 1], fill_value=0),
            'B': pd.SparseArray([0, 1, 1, 1], fill_value=0),
        })
        assert frame.sparse.density == 0.75