From cd922b6b81f1377f7a7ba6019694e78da5aa9bc1 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 20 Jun 2024 19:13:30 +0100 Subject: [PATCH 01/15] BUG: :bug: :sparkles: Add fill_value param to from_spmatrix method. --- pandas/core/arrays/sparse/accessor.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 6a1c25711acb0..6213f2a5392f2 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -265,7 +265,9 @@ def _validate(self, data) -> None: raise AttributeError(self._validation_msg) @classmethod - def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: + def from_spmatrix( + cls, data, index=None, columns=None, fill_value=None + ) -> DataFrame: """ Create a new DataFrame from a scipy sparse matrix. @@ -276,6 +278,21 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: index, columns : Index, optional Row and column labels to use for the resulting DataFrame. Defaults to a RangeIndex. + fill_value : scalar, optional + The scalar value not stored in the columns. By default, this + depends on the dtype of `data`. + + =========== ========== + dtype na_value + =========== ========== + float ``np.nan`` + int ``0`` + bool ``False`` + datetime64 ``pd.NaT`` + timedelta64 ``pd.NaT`` + =========== ========== + + The default value may be overridden by specifying a `fill_value`. Returns ------- @@ -313,7 +330,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: indices = data.indices indptr = data.indptr array_data = data.data - dtype = SparseDtype(array_data.dtype, 0) + dtype = SparseDtype(array_data.dtype, fill_value) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) From 9323e4378ed04d73820830ad17f76b75d06bc2c7 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 20 Jun 2024 19:45:44 +0100 Subject: [PATCH 02/15] ENH: :sparkles: Set explicit fill_value of NaN for complex floats. --- pandas/core/arrays/sparse/accessor.py | 1 + pandas/core/dtypes/dtypes.py | 1 + pandas/core/dtypes/missing.py | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 6213f2a5392f2..8127b757ca63b 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -286,6 +286,7 @@ def from_spmatrix( dtype na_value =========== ========== float ``np.nan`` + complex ``np.nan`` int ``0`` bool ``False`` datetime64 ``pd.NaT`` diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 5213be8b69016..d38254d98553e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1683,6 +1683,7 @@ class SparseDtype(ExtensionDtype): dtype na_value =========== ========== float ``np.nan`` + complex ``np.nan`` int ``0`` bool ``False`` datetime64 ``pd.NaT`` diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f0e21136f8a97..b9cd6ae2f13e8 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -618,6 +618,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): nan >>> na_value_for_dtype(np.dtype("float64")) nan + >>> na_value_for_dtype(np.dtype("complex128")) + nan >>> na_value_for_dtype(np.dtype("bool")) False >>> na_value_for_dtype(np.dtype("datetime64[ns]")) @@ -629,7 +631,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): elif dtype.kind in "mM": unit = np.datetime_data(dtype)[0] return dtype.type("NaT", unit) - elif dtype.kind == "f": + elif dtype.kind in "fc": return np.nan elif dtype.kind in "iu": if compat: From 212a66447cc0e56065b3686dffa0250892430477 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 20 Jun 2024 20:06:28 +0100 Subject: [PATCH 03/15] TST: :white_check_mark: Fix failing tests. --- pandas/tests/arrays/sparse/test_accessor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 87eb7bcfa9cee..b8b35044f18fb 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -112,7 +112,9 @@ def test_from_spmatrix(self, format, labels, dtype): sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) mat = sp_sparse.eye(10, format=format, dtype=dtype) - result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) + result = pd.DataFrame.sparse.from_spmatrix( + mat, index=labels, columns=labels, fill_value=0 + ) expected = pd.DataFrame( np.eye(10, dtype=dtype), index=labels, columns=labels ).astype(sp_dtype) @@ -124,7 +126,7 @@ def test_from_spmatrix_including_explicit_zero(self, format): mat = sp_sparse.random(10, 2, density=0.5, format=format) mat.data[0] = 0 - result = pd.DataFrame.sparse.from_spmatrix(mat) + result = pd.DataFrame.sparse.from_spmatrix(mat, fill_value=0) dtype = SparseDtype("float64", 0.0) expected = pd.DataFrame(mat.todense()).astype(dtype) tm.assert_frame_equal(result, expected) @@ -139,7 +141,7 @@ def test_from_spmatrix_columns(self, columns): dtype = SparseDtype("float64", 0.0) mat = sp_sparse.random(10, 2, density=0.5) - result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) + result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns, fill_value=0) expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) tm.assert_frame_equal(result, expected) From 81b33f5f502da89057c9e178d7a2e5a089fc6ad0 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 20 Jun 2024 22:24:57 +0100 Subject: [PATCH 04/15] TST: :white_check_mark: Add tests for from_spmatrix method. --- pandas/tests/arrays/sparse/test_accessor.py | 20 +++++++++++++++++++- pandas/tests/dtypes/test_missing.py | 4 ++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index b8b35044f18fb..163161f15a952 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -105,7 +105,7 @@ def test_accessor_raises(self): @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) - @pytest.mark.parametrize("dtype", ["float64", "int64"]) + @pytest.mark.parametrize("dtype", ["complex128", "float64", "int64"]) def test_from_spmatrix(self, format, labels, dtype): sp_sparse = pytest.importorskip("scipy.sparse") @@ -145,6 +145,24 @@ def test_from_spmatrix_columns(self, columns): expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "dtype, fill_value", + [("bool", False), ("float64", np.nan), ("complex128", np.nan)], + ) + @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) + def test_from_spmatrix_fill_value(self, format, dtype, fill_value): + sp_sparse = pytest.importorskip("scipy.sparse") + + sp_dtype = SparseDtype(dtype, fill_value) + + sp_mat = sp_sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, fill_value=fill_value) + mat = np.eye(10, dtype=dtype) + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(fill_value) + ).astype(sp_dtype) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] ) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 2109c794ad44f..54d618f7b0766 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -697,6 +697,10 @@ def test_array_equivalent_index_with_tuples(): ("f2", np.nan), ("f4", np.nan), ("f8", np.nan), + # Complex + ("c8", np.nan), + ("c16", np.nan), + ("c32", np.nan), # Object ("O", np.nan), # Interval From f5f1479aa2aaaaec0a2979cc8f6480bdb8e6a42d Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Fri, 21 Jun 2024 11:33:25 +0100 Subject: [PATCH 05/15] DOC: :memo: Add what's new entry. --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b952ffd7661a7..18ad7e3ccedaf 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -584,7 +584,7 @@ Reshaping Sparse ^^^^^^ - Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) -- +- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard coded an invalid ``fill_value`` for certain subtypes. (:issue:`59063`) ExtensionArray ^^^^^^^^^^^^^^ From 57367aaaae290875e793274aac14a1cff8979d87 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Fri, 21 Jun 2024 12:15:46 +0100 Subject: [PATCH 06/15] TST: :white_check_mark: Fix failing tests for sparse getitem. --- pandas/tests/indexing/test_loc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 16f3e0fd0c229..a2a34314e2801 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1292,7 +1292,7 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): # diagonal cells are ones, meaning the last two columns are purely sparse. rows, cols = 5, 7 spmatrix = spmatrix_t(np.eye(rows, cols, dtype=dtype), dtype=dtype) - df = DataFrame.sparse.from_spmatrix(spmatrix) + df = DataFrame.sparse.from_spmatrix(spmatrix, fill_value=0) # regression test for GH#34526 itr_idx = range(2, rows) @@ -1314,7 +1314,7 @@ def test_loc_getitem_sparse_frame(self): # GH34687 sp_sparse = pytest.importorskip("scipy.sparse") - df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5)) + df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5), fill_value=0) result = df.loc[range(2)] expected = DataFrame( [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]], From e90963af397ef1d7c32c4c6d03f29079ec1b471a Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Fri, 21 Jun 2024 12:25:01 +0100 Subject: [PATCH 07/15] TST: :white_check_mark: Remove test for 256-bit complex float. --- pandas/tests/dtypes/test_missing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 54d618f7b0766..f86ed6f49759f 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -700,7 +700,6 @@ def test_array_equivalent_index_with_tuples(): # Complex ("c8", np.nan), ("c16", np.nan), - ("c32", np.nan), # Object ("O", np.nan), # Interval From eb222a6bb151b8d262bc3df9a5235f958c9274e3 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Fri, 21 Jun 2024 13:30:15 +0100 Subject: [PATCH 08/15] DOC: :memo: Update example in docstring for from_spmatrix method. --- pandas/core/arrays/sparse/accessor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8127b757ca63b..d13c2f471f1af 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -310,11 +310,11 @@ def from_spmatrix( -------- >>> import scipy.sparse >>> mat = scipy.sparse.eye(3, dtype=float) - >>> pd.DataFrame.sparse.from_spmatrix(mat) + >>> pd.DataFrame.sparse.from_spmatrix(mat, fill_value=0.0) 0 1 2 - 0 1.0 0 0 - 1 0 1.0 0 - 2 0 0 1.0 + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 """ from pandas._libs.sparse import IntIndex From 221c4aabce136e27ebe85d59f9008daf1edd6153 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Fri, 21 Jun 2024 14:26:27 +0100 Subject: [PATCH 09/15] DOC: :memo: Update some docstrings and sparse user guide. --- doc/source/user_guide/sparse.rst | 2 +- pandas/core/arrays/sparse/accessor.py | 4 ++-- pandas/core/dtypes/dtypes.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 25bcb8bcc0c93..03105d62821a0 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -188,7 +188,7 @@ Use :meth:`DataFrame.sparse.from_spmatrix` to create a :class:`DataFrame` with s sp_arr = csr_matrix(arr) sp_arr - sdf = pd.DataFrame.sparse.from_spmatrix(sp_arr) + sdf = pd.DataFrame.sparse.from_spmatrix(sp_arr, fill_value=0) sdf.head() sdf.dtypes diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index d13c2f471f1af..3af394872f387 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -280,7 +280,7 @@ def from_spmatrix( Defaults to a RangeIndex. fill_value : scalar, optional The scalar value not stored in the columns. By default, this - depends on the dtype of `data`. + depends on the dtype of ``data``. =========== ========== dtype na_value @@ -293,7 +293,7 @@ def from_spmatrix( timedelta64 ``pd.NaT`` =========== ========== - The default value may be overridden by specifying a `fill_value`. + The default value may be overridden by specifying a ``fill_value``. Returns ------- diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d38254d98553e..a483260c2c3bd 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1666,7 +1666,7 @@ class SparseDtype(ExtensionDtype): """ Dtype for data stored in :class:`SparseArray`. - `SparseDtype` is used as the data type for :class:`SparseArray`, enabling + SparseDtype is used as the data type for :class:`SparseArray`, enabling more efficient storage of data that contains a significant number of repetitive values typically represented by a fill value. It supports any scalar dtype as the underlying data type of the non-fill values. @@ -1677,7 +1677,7 @@ class SparseDtype(ExtensionDtype): The dtype of the underlying array storing the non-fill value values. fill_value : scalar, optional The scalar value not stored in the SparseArray. By default, this - depends on `dtype`. + depends on ``dtype``. =========== ========== dtype na_value @@ -1690,7 +1690,7 @@ class SparseDtype(ExtensionDtype): timedelta64 ``pd.NaT`` =========== ========== - The default value may be overridden by specifying a `fill_value`. + The default value may be overridden by specifying a ``fill_value``. Attributes ---------- From 0d07c30277276f10bdb187738822b4566bcc8be6 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Sat, 22 Jun 2024 13:43:44 +0100 Subject: [PATCH 10/15] DOC: :pencil2: Update dtype docstring. Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/dtypes/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a483260c2c3bd..3aeab96e03163 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1666,7 +1666,7 @@ class SparseDtype(ExtensionDtype): """ Dtype for data stored in :class:`SparseArray`. - SparseDtype is used as the data type for :class:`SparseArray`, enabling + ``SparseDtype`` is used as the data type for :class:`SparseArray`, enabling more efficient storage of data that contains a significant number of repetitive values typically represented by a fill value. It supports any scalar dtype as the underlying data type of the non-fill values. From ccba29e4618fa25d810a574f40d146eb47d40e85 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:35:22 +0100 Subject: [PATCH 11/15] BUG: :rewind: :bug: Revert fill_value change and fix to_coo method. --- pandas/core/arrays/sparse/accessor.py | 34 ++++++--------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 3af394872f387..d104e0a126bc2 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -265,9 +265,7 @@ def _validate(self, data) -> None: raise AttributeError(self._validation_msg) @classmethod - def from_spmatrix( - cls, data, index=None, columns=None, fill_value=None - ) -> DataFrame: + def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: """ Create a new DataFrame from a scipy sparse matrix. @@ -278,22 +276,6 @@ def from_spmatrix( index, columns : Index, optional Row and column labels to use for the resulting DataFrame. Defaults to a RangeIndex. - fill_value : scalar, optional - The scalar value not stored in the columns. By default, this - depends on the dtype of ``data``. - - =========== ========== - dtype na_value - =========== ========== - float ``np.nan`` - complex ``np.nan`` - int ``0`` - bool ``False`` - datetime64 ``pd.NaT`` - timedelta64 ``pd.NaT`` - =========== ========== - - The default value may be overridden by specifying a ``fill_value``. Returns ------- @@ -309,12 +291,12 @@ def from_spmatrix( Examples -------- >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3, dtype=float) - >>> pd.DataFrame.sparse.from_spmatrix(mat, fill_value=0.0) + >>> mat = scipy.sparse.eye(3, dtype=int) + >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 - 0 1.0 0.0 0.0 - 1 0.0 1.0 0.0 - 2 0.0 0.0 1.0 + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 """ from pandas._libs.sparse import IntIndex @@ -331,7 +313,7 @@ def from_spmatrix( indices = data.indices indptr = data.indptr array_data = data.data - dtype = SparseDtype(array_data.dtype, fill_value) + dtype = SparseDtype(array_data.dtype) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) @@ -411,8 +393,6 @@ def to_coo(self) -> spmatrix: cols, rows, data = [], [], [] for col, (_, ser) in enumerate(self._parent.items()): sp_arr = ser.array - if sp_arr.fill_value != 0: - raise ValueError("fill value must be 0 when converting to COO matrix") row = sp_arr.sp_index.indices cols.append(np.repeat(col, len(row))) From d09171e609e341823b61f484b96bf6d8298dc330 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:38:17 +0100 Subject: [PATCH 12/15] TST: :rewind: :white_check_mark: Fix and add sparse accessor tests. --- pandas/tests/arrays/sparse/test_accessor.py | 93 ++++++++------------- 1 file changed, 37 insertions(+), 56 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 163161f15a952..6579fadf7b395 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -105,30 +105,36 @@ def test_accessor_raises(self): @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) - @pytest.mark.parametrize("dtype", ["complex128", "float64", "int64"]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_from_spmatrix(self, format, labels, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) + sp_dtype = SparseDtype(dtype) - mat = sp_sparse.eye(10, format=format, dtype=dtype) - result = pd.DataFrame.sparse.from_spmatrix( - mat, index=labels, columns=labels, fill_value=0 - ) + sp_mat = sp_sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels) + mat = np.eye(10, dtype=dtype) expected = pd.DataFrame( - np.eye(10, dtype=dtype), index=labels, columns=labels + np.ma.masked_array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + index=labels, + columns=labels, ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) - def test_from_spmatrix_including_explicit_zero(self, format): + @pytest.mark.parametrize("dtype", [np.int64, bool]) + def test_from_spmatrix_including_explicit_zero(self, format, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - mat = sp_sparse.random(10, 2, density=0.5, format=format) - mat.data[0] = 0 - result = pd.DataFrame.sparse.from_spmatrix(mat, fill_value=0) - dtype = SparseDtype("float64", 0.0) - expected = pd.DataFrame(mat.todense()).astype(dtype) + sp_dtype = SparseDtype(dtype) + + sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype) + sp_mat.data[0] = 0 + result = pd.DataFrame.sparse.from_spmatrix(sp_mat) + mat = sp_mat.toarray() + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value) + ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -138,59 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format): def test_from_spmatrix_columns(self, columns): sp_sparse = pytest.importorskip("scipy.sparse") - dtype = SparseDtype("float64", 0.0) - - mat = sp_sparse.random(10, 2, density=0.5) - result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns, fill_value=0) - expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "dtype, fill_value", - [("bool", False), ("float64", np.nan), ("complex128", np.nan)], - ) - @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) - def test_from_spmatrix_fill_value(self, format, dtype, fill_value): - sp_sparse = pytest.importorskip("scipy.sparse") - - sp_dtype = SparseDtype(dtype, fill_value) + sp_dtype = SparseDtype(np.float64) - sp_mat = sp_sparse.eye(10, format=format, dtype=dtype) - result = pd.DataFrame.sparse.from_spmatrix(sp_mat, fill_value=fill_value) - mat = np.eye(10, dtype=dtype) + sp_mat = sp_sparse.random(10, 2, density=0.5) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns) + mat = sp_mat.toarray() expected = pd.DataFrame( - np.ma.array(mat, mask=(mat == 0)).filled(fill_value) + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] + "columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] ) - def test_to_coo(self, colnames): + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) + def test_to_coo(self, columns, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - df = pd.DataFrame( - {colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]" - ) - result = df.sparse.to_coo() - expected = sp_sparse.coo_matrix(np.asarray(df)) - assert (result != expected).nnz == 0 + sp_dtype = SparseDtype(dtype) - @pytest.mark.parametrize("fill_value", [1, np.nan]) - def test_to_coo_nonzero_fill_val_raises(self, fill_value): - pytest.importorskip("scipy") - df = pd.DataFrame( - { - "A": SparseArray( - [fill_value, fill_value, fill_value, 2], fill_value=fill_value - ), - "B": SparseArray( - [fill_value, 2, fill_value, fill_value], fill_value=fill_value - ), - } - ) - with pytest.raises(ValueError, match="fill value must be 0"): - df.sparse.to_coo() + expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype) + mat = expected.toarray() + result = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, + dtype=sp_dtype, + ).sparse.to_coo() + assert (result != expected).nnz == 0 def test_to_coo_midx_categorical(self): # GH#50996 From b8134536fec0bbc5f530e58a7b3015fa4cfea66d Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:39:31 +0100 Subject: [PATCH 13/15] TST: :rewind: :white_check_mark: Fix and add sparse getitem tests. --- pandas/tests/indexing/test_loc.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index a2a34314e2801..903ad24ce53b3 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1281,7 +1281,7 @@ def test_loc_getitem_time_object(self, frame_or_series): tm.assert_equal(result, expected) @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) - @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): sp_sparse = pytest.importorskip("scipy.sparse") @@ -1292,17 +1292,17 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): # diagonal cells are ones, meaning the last two columns are purely sparse. rows, cols = 5, 7 spmatrix = spmatrix_t(np.eye(rows, cols, dtype=dtype), dtype=dtype) - df = DataFrame.sparse.from_spmatrix(spmatrix, fill_value=0) + df = DataFrame.sparse.from_spmatrix(spmatrix) # regression test for GH#34526 itr_idx = range(2, rows) - result = df.loc[itr_idx].values + result = np.nan_to_num(df.loc[itr_idx].values) expected = spmatrix.toarray()[itr_idx] tm.assert_numpy_array_equal(result, expected) # regression test for GH#34540 result = df.loc[itr_idx].dtypes.values - expected = np.full(cols, SparseDtype(dtype, fill_value=0)) + expected = np.full(cols, SparseDtype(dtype)) tm.assert_numpy_array_equal(result, expected) def test_loc_getitem_listlike_all_retains_sparse(self): @@ -1314,18 +1314,16 @@ def test_loc_getitem_sparse_frame(self): # GH34687 sp_sparse = pytest.importorskip("scipy.sparse") - df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5), fill_value=0) + df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5, dtype=np.int64)) result = df.loc[range(2)] expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]], - dtype=SparseDtype("float64", 0.0), + [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]], + dtype=SparseDtype(np.int64), ) tm.assert_frame_equal(result, expected) result = df.loc[range(2)].loc[range(1)] - expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=SparseDtype("float64", 0.0) - ) + expected = DataFrame([[1, 0, 0, 0, 0]], dtype=SparseDtype(np.int64)) tm.assert_frame_equal(result, expected) def test_loc_getitem_sparse_series(self): From 499db2f9f4a8eb54cffcece574346b901421e5cc Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:40:29 +0100 Subject: [PATCH 14/15] DOC: :rewind: :memo: Revert fill_value change to sparse user guide. --- doc/source/user_guide/sparse.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 03105d62821a0..25bcb8bcc0c93 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -188,7 +188,7 @@ Use :meth:`DataFrame.sparse.from_spmatrix` to create a :class:`DataFrame` with s sp_arr = csr_matrix(arr) sp_arr - sdf = pd.DataFrame.sparse.from_spmatrix(sp_arr, fill_value=0) + sdf = pd.DataFrame.sparse.from_spmatrix(sp_arr) sdf.head() sdf.dtypes From 9eb3dac4ed9d3dcbfb8e2bd130159e095d5f9eb5 Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 27 Jun 2024 14:58:35 +0100 Subject: [PATCH 15/15] CLN: :pencil2: Fix instantiation of np.ma.array in test. --- pandas/tests/arrays/sparse/test_accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 6579fadf7b395..bd3298940ae3a 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -115,7 +115,7 @@ def test_from_spmatrix(self, format, labels, dtype): result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels) mat = np.eye(10, dtype=dtype) expected = pd.DataFrame( - np.ma.masked_array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), index=labels, columns=labels, ).astype(sp_dtype)