From 71f92a97c9327b8163087f1fc252544f01fe4f36 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Wed, 12 Oct 2022 14:09:14 -0700 Subject: [PATCH 1/5] PERF: sparse to_csv Improves to_csv performance for sparse matrices by casting to dense before initializing DataFrameFormatter. Results in many fewer calls to `to_native_types` which saves time. --- asv_bench/benchmarks/io/csv.py | 20 ++++++++++++++++++++ doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/generic.py | 2 ++ 3 files changed, 23 insertions(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 10aef954a3475..69534c1a7acce 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -172,6 +172,26 @@ def time_head_of_multiindex(self): self.df_custom_index_then_head.to_csv(self.fname) +class ToCSVSparse(BaseIO): + + fname = "__test__.csv" + + def setup(self): + from scipy import sparse as sc + + vals = np.random.randint(0, 10, size=(10000, 1000)) + keep = vals > 3 + vals[keep] = 0 + sparse_mtx = sc.coo_matrix(vals) + self.data = DataFrame.sparse.from_spmatrix(sparse_mtx) + + def time_sparse_to_csv(self): + self.data.to_csv("sparse_pd.csv") + + def time_sparse_to_dense_to_csv(self): + self.data.sparse.to_dense().to_csv("sparse_pd.csv") + + class StringIORewind: def data(self, stringio_object): stringio_object.seek(0) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9d1e0c7485092..260081e7adc07 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -158,6 +158,7 @@ Performance improvements - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) +- Performance improvement for :meth:`NDFrame.to_csv` when data frame is sparse 
(:issue:`41023`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ce828ba4a0af4..50cdc23f0fadc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3717,6 +3717,8 @@ def to_csv( >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() + if hasattr(df, "sparse"): + df = df.sparse.to_dense() # fixes 41023 formatter = DataFrameFormatter( frame=df, From 4e3e1fa7306d555b829d8c86c8f1ed8afa2f68d9 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Thu, 13 Oct 2022 18:20:53 -0700 Subject: [PATCH 2/5] Move optimization deeper in the call stack --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/generic.py | 2 -- pandas/io/formats/csvs.py | 5 ++++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 260081e7adc07..04c1b2bd447cb 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -158,7 +158,7 @@ Performance improvements - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) -- Performance improvement for :meth:`NDFrame.to_csv` when data frame is sparse (:issue:`41023`) +- Performance improvement for saving to CSV with :class:`CSVFormatter` when data frame is sparse (:issue:`41023`) .. --------------------------------------------------------------------------- .. 
_whatsnew_200.bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 50cdc23f0fadc..ce828ba4a0af4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3717,8 +3717,6 @@ def to_csv( >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() - if hasattr(df, "sparse"): - df = df.sparse.to_dense() # fixes 41023 formatter = DataFrameFormatter( frame=df, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6ab57b0cce2a4..9d10a4f3092c6 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -69,7 +69,10 @@ def __init__( ) -> None: self.fmt = formatter - self.obj = self.fmt.frame + if hasattr(self.fmt.frame, "sparse"): + self.obj = self.fmt.frame.sparse.to_dense() + else: + self.obj = self.fmt.frame self.filepath_or_buffer = path_or_buf self.encoding = encoding From c0b794fdf035c441224c3134b8ab3cad14716af3 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 22 Oct 2022 20:49:45 -0700 Subject: [PATCH 3/5] Update whatsnew to reference user-facing `to_csv` instead of `CSVFormatter` --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 2ddbbfe8c7fae..bcec46c240b62 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -163,7 +163,7 @@ Performance improvements - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) -- Performance improvement for saving to CSV with :class:`CSVFormatter` when data frame is sparse (:issue:`41023`) +- Performance improvement for saving to CSV with :meth:`DataFrame.to_csv` when data frame 
is sparse (:issue:`41023`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: From 6fa18f642e3a242d3d716a322d6c51d11bb008c4 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 22 Oct 2022 21:11:01 -0700 Subject: [PATCH 4/5] Reduce test frame size and use self.fname for asv test case --- asv_bench/benchmarks/io/csv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 69534c1a7acce..8acd22050f89e 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -179,17 +179,17 @@ class ToCSVSparse(BaseIO): def setup(self): from scipy import sparse as sc - vals = np.random.randint(0, 10, size=(10000, 1000)) + vals = np.random.randint(0, 10, size=(500, 1000)) keep = vals > 3 vals[keep] = 0 sparse_mtx = sc.coo_matrix(vals) self.data = DataFrame.sparse.from_spmatrix(sparse_mtx) def time_sparse_to_csv(self): - self.data.to_csv("sparse_pd.csv") + self.data.to_csv(self.fname) def time_sparse_to_dense_to_csv(self): - self.data.sparse.to_dense().to_csv("sparse_pd.csv") + self.data.sparse.to_dense().to_csv(self.fname) class StringIORewind: From c6c50d1efcd4561215ecfc2e4ab5d310727f5cb5 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Wed, 26 Oct 2022 21:00:27 -0700 Subject: [PATCH 5/5] Move sparse conversion deeper in the call stack This should improve memory consumption by only materializing one chunk at a time --- pandas/io/formats/csvs.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 9d10a4f3092c6..979d0eaef29b4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -69,10 +69,7 @@ def __init__( ) -> None: self.fmt = formatter - if hasattr(self.fmt.frame, "sparse"): - self.obj = self.fmt.frame.sparse.to_dense() - else: - self.obj = self.fmt.frame + self.obj = self.fmt.frame self.filepath_or_buffer = 
path_or_buf self.encoding = encoding @@ -311,6 +308,11 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] + # cast sparse columns to dense to reduce calls + # to df._mgr.to_native_types #41023 + if hasattr(df, "sparse"): + df = df.sparse.to_dense() + res = df._mgr.to_native_types(**self._number_format) data = [res.iget_values(i) for i in range(len(res.items))]