From 71f92a97c9327b8163087f1fc252544f01fe4f36 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryantlee9@gmail.com>
Date: Wed, 12 Oct 2022 14:09:14 -0700
Subject: [PATCH 1/5] PERF: sparse to_csv

Improves to_csv performance for sparse matrices by casting to dense
before initializing DataFrameFormatter. Results in many fewer calls to
`to_native_types` which saves time.
---
 asv_bench/benchmarks/io/csv.py | 20 ++++++++++++++++++++
 doc/source/whatsnew/v2.0.0.rst |  1 +
 pandas/core/generic.py         |  2 ++
 3 files changed, 23 insertions(+)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 10aef954a3475..69534c1a7acce 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -172,6 +172,26 @@ def time_head_of_multiindex(self):
         self.df_custom_index_then_head.to_csv(self.fname)
 
 
+class ToCSVSparse(BaseIO):
+
+    fname = "__test__.csv"
+
+    def setup(self):
+        from scipy import sparse as sc
+
+        vals = np.random.randint(0, 10, size=(10000, 1000))
+        keep = vals > 3
+        vals[keep] = 0
+        sparse_mtx = sc.coo_matrix(vals)
+        self.data = DataFrame.sparse.from_spmatrix(sparse_mtx)
+
+    def time_sparse_to_csv(self):
+        self.data.to_csv("sparse_pd.csv")
+
+    def time_sparse_to_dense_to_csv(self):
+        self.data.sparse.to_dense().to_csv("sparse_pd.csv")
+
+
 class StringIORewind:
     def data(self, stringio_object):
         stringio_object.seek(0)
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 9d1e0c7485092..260081e7adc07 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -158,6 +158,7 @@ Performance improvements
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
+- Performance improvement for :meth:`NDFrame.to_csv` when data frame is sparse (:issue:`41023`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index ce828ba4a0af4..50cdc23f0fadc 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3717,6 +3717,8 @@ def to_csv(
         >>> df.to_csv('folder/subfolder/out.csv')  # doctest: +SKIP
         """
         df = self if isinstance(self, ABCDataFrame) else self.to_frame()
+        if hasattr(df, "sparse"):
+            df = df.sparse.to_dense()  # fixes 41023
 
         formatter = DataFrameFormatter(
             frame=df,

From 4e3e1fa7306d555b829d8c86c8f1ed8afa2f68d9 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryantlee9@gmail.com>
Date: Thu, 13 Oct 2022 18:20:53 -0700
Subject: [PATCH 2/5] Move optimization deeper in the call stack

---
 doc/source/whatsnew/v2.0.0.rst | 2 +-
 pandas/core/generic.py         | 2 --
 pandas/io/formats/csvs.py      | 5 ++++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 260081e7adc07..04c1b2bd447cb 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -158,7 +158,7 @@ Performance improvements
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
-- Performance improvement for :meth:`NDFrame.to_csv` when data frame is sparse (:issue:`41023`)
+- Performance improvement for saving to CSV with :class:`CSVFormatter` when data frame is sparse (:issue:`41023`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 50cdc23f0fadc..ce828ba4a0af4 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3717,8 +3717,6 @@ def to_csv(
         >>> df.to_csv('folder/subfolder/out.csv')  # doctest: +SKIP
         """
         df = self if isinstance(self, ABCDataFrame) else self.to_frame()
-        if hasattr(df, "sparse"):
-            df = df.sparse.to_dense()  # fixes 41023
 
         formatter = DataFrameFormatter(
             frame=df,
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 6ab57b0cce2a4..9d10a4f3092c6 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -69,7 +69,10 @@ def __init__(
     ) -> None:
         self.fmt = formatter
 
-        self.obj = self.fmt.frame
+        if hasattr(self.fmt.frame, "sparse"):
+            self.obj = self.fmt.frame.sparse.to_dense()
+        else:
+            self.obj = self.fmt.frame
 
         self.filepath_or_buffer = path_or_buf
         self.encoding = encoding

From c0b794fdf035c441224c3134b8ab3cad14716af3 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryantlee9@gmail.com>
Date: Sat, 22 Oct 2022 20:49:45 -0700
Subject: [PATCH 3/5] Update whatsnew to reference user-facing `to_csv` instead
 of `CSVFormatter`

---
 doc/source/whatsnew/v2.0.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 2ddbbfe8c7fae..bcec46c240b62 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -163,7 +163,7 @@ Performance improvements
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
-- Performance improvement for saving to CSV with :class:`CSVFormatter` when data frame is sparse (:issue:`41023`)
+- Performance improvement for saving to CSV with :meth:`DataFrame.to_csv` when data frame is sparse (:issue:`41023`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:

From 6fa18f642e3a242d3d716a322d6c51d11bb008c4 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryantlee9@gmail.com>
Date: Sat, 22 Oct 2022 21:11:01 -0700
Subject: [PATCH 4/5] Reduce test frame size and use self.fname for asv test
 case

---
 asv_bench/benchmarks/io/csv.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 69534c1a7acce..8acd22050f89e 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -179,17 +179,17 @@ class ToCSVSparse(BaseIO):
     def setup(self):
         from scipy import sparse as sc
 
-        vals = np.random.randint(0, 10, size=(10000, 1000))
+        vals = np.random.randint(0, 10, size=(500, 1000))
         keep = vals > 3
         vals[keep] = 0
         sparse_mtx = sc.coo_matrix(vals)
         self.data = DataFrame.sparse.from_spmatrix(sparse_mtx)
 
     def time_sparse_to_csv(self):
-        self.data.to_csv("sparse_pd.csv")
+        self.data.to_csv(self.fname)
 
     def time_sparse_to_dense_to_csv(self):
-        self.data.sparse.to_dense().to_csv("sparse_pd.csv")
+        self.data.sparse.to_dense().to_csv(self.fname)
 
 
 class StringIORewind:

From c6c50d1efcd4561215ecfc2e4ab5d310727f5cb5 Mon Sep 17 00:00:00 2001
From: Ryan Lee <ryantlee9@gmail.com>
Date: Wed, 26 Oct 2022 21:00:27 -0700
Subject: [PATCH 5/5] Move sparse conversion deeper in the call stack

This should reduce memory consumption by only materializing one chunk
at a time
---
 pandas/io/formats/csvs.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 9d10a4f3092c6..979d0eaef29b4 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -69,10 +69,7 @@ def __init__(
     ) -> None:
         self.fmt = formatter
 
-        if hasattr(self.fmt.frame, "sparse"):
-            self.obj = self.fmt.frame.sparse.to_dense()
-        else:
-            self.obj = self.fmt.frame
+        self.obj = self.fmt.frame
 
         self.filepath_or_buffer = path_or_buf
         self.encoding = encoding
@@ -311,6 +308,11 @@ def _save_chunk(self, start_i: int, end_i: int) -> None:
         slicer = slice(start_i, end_i)
         df = self.obj.iloc[slicer]
 
+        # cast sparse columns to dense to reduce calls
+        # to df._mgr.to_native_types #41023
+        if hasattr(df, "sparse"):
+            df = df.sparse.to_dense()
+
         res = df._mgr.to_native_types(**self._number_format)
         data = [res.iget_values(i) for i in range(len(res.items))]