From 8d93fecb0175c2fa4fa89f6ad2e65ea29ef3e1c6 Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Fri, 22 Apr 2022 10:32:42 +0100
Subject: [PATCH 01/13] PERF: Slow performance of to_dict (#46470)

---
 doc/source/whatsnew/v1.5.0.rst |   1 +
 pandas/core/frame.py           | 133 +++++++++++++++++++++++++++++++++
 pandas/core/series.py          |   8 +-
 3 files changed, 141 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 922ef28b855b9..fdce9e2158078 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -453,6 +453,7 @@ Performance improvements
 - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
+- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` especially when using non-mixed dtypes (:issue:`46470`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.bug_fixes:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 74d061cbb9b7f..5141c5f20d0bb 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1771,6 +1771,139 @@ def to_numpy(
 
         return result
 
+    def _to_dict_helper(self, orient, into_c, into):
+        """Helper function to do main work to convert frame into dict based on
+        `orient` and `into`
+
+        As part of GH46470 also takes care in when to use maybe_box_native as this
+        function can perform badly and is not necessary for non object cols
+        """
+        object_dtype_cols = {
+            col for col, dtype in self.dtypes.items() if is_object_dtype(dtype)
+        }
+        are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes)
+        if orient == "dict":
+            return into_c((k, v.to_dict(into)) for k, v in self.items())
+        elif orient == "list":
+            return into_c(
+                (
+                    k,
+                    list(map(maybe_box_native, v.tolist()))
+                    if k in object_dtype_cols
+                    else v.tolist(),
+                )
+                for k, v in self.items()
+            )
+        elif orient == "split":
+            if are_all_object_dtype_cols:
+                data = [
+                    list(map(maybe_box_native, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            elif object_dtype_cols:
+                is_object_dtype_by_index = [
+                    col in object_dtype_cols for col in self.columns
+                ]
+                data = [
+                    [
+                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
+                        for i, v in enumerate(t)
+                    ]
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            else:
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
+            return into_c(
+                (
+                    ("index", self.index.tolist()),
+                    ("columns", self.columns.tolist()),
+                    ("data", data),
+                )
+            )
+        elif orient == "series":
+            return into_c((k, v) for k, v in self.items())
+        elif orient == "records":
+            columns = self.columns.tolist()
+            if object_dtype_cols:
+                is_object_dtype_by_index = [col in object_dtype_cols for col in columns]
+                return [
+                    into_c(
+                        zip(
+                            columns,
+                            [
+                                maybe_box_native(v)
+                                if is_object_dtype_by_index[i]
+                                else v
+                                for i, v in enumerate(t)
+                            ],
+                        )
+                    )
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            else:
+                return [
+                    into_c(zip(columns, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+        elif orient == "index":
+            if not self.index.is_unique:
+                raise ValueError("DataFrame index must be unique for orient='index'.")
+            columns = self.columns.tolist()
+            if object_dtype_cols:
+                is_object_dtype_by_index = [
+                    col in object_dtype_cols for col in self.columns
+                ]
+                return into_c(
+                    (
+                        t[0],
+                        {
+                            columns[i]: maybe_box_native(v)
+                            if is_object_dtype_by_index[i]
+                            else v
+                            for i, v in enumerate(t[1:])
+                        },
+                    )
+                    for t in self.itertuples(name=None)
+                )
+            else:
+                return into_c(
+                    (
+                        t[0],
+                        {columns[i]: v for i, v in enumerate(t[1:])},
+                    )
+                    for t in self.itertuples(name=None)
+                )
+        elif orient == "tight":
+            if are_all_object_dtype_cols:
+                data = [
+                    list(map(maybe_box_native, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            elif object_dtype_cols:
+                is_object_dtype_by_index = [
+                    col in object_dtype_cols for col in self.columns
+                ]
+                data = [
+                    [
+                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
+                        for i, v in enumerate(t)
+                    ]
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            else:
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
+            return into_c(
+                (
+                    ("index", self.index.tolist()),
+                    ("columns", self.columns.tolist()),
+                    ("data", data),
+                    ("index_names", list(self.index.names)),
+                    ("column_names", list(self.columns.names)),
+                )
+            )
+        else:
+            raise ValueError(f"orient '{orient}' not understood")
+
     def to_dict(self, orient: str = "dict", into=dict):
         """
         Convert the DataFrame to a dictionary.
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 1d3509cac0edd..aba5282e9e04e 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1771,7 +1771,13 @@ def to_dict(self, into=dict):
         """
         # GH16122
         into_c = com.standardize_mapping(into)
-        return into_c((k, maybe_box_native(v)) for k, v in self.items())
+
+        if is_object_dtype(self):
+            return into_c((k, maybe_box_native(v)) for k, v in self.items())
+        else:
+            # Not an object dtype => all types will be the same so let the default
+            # indexer return native python type
+            return into_c((k, v) for k, v in self.items())
 
     def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
         """

From d9f9786ca6d7746c9745438a7230bf730d3e1044 Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Fri, 22 Apr 2022 10:34:53 +0100
Subject: [PATCH 02/13] Inline _to_dict_helper into DataFrame.to_dict

---
 pandas/core/frame.py | 273 ++++++++++++++++---------------------------
 1 file changed, 101 insertions(+), 172 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 5141c5f20d0bb..ad3b64e32e07e 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1771,139 +1771,6 @@ def to_numpy(
 
         return result
 
-    def _to_dict_helper(self, orient, into_c, into):
-        """Helper function to do main work to convert frame into dict based on
-        `orient` and `into`
-
-        As part of GH46470 also takes care in when to use maybe_box_native as this
-        function can perform badly and is not necessary for non object cols
-        """
-        object_dtype_cols = {
-            col for col, dtype in self.dtypes.items() if is_object_dtype(dtype)
-        }
-        are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes)
-        if orient == "dict":
-            return into_c((k, v.to_dict(into)) for k, v in self.items())
-        elif orient == "list":
-            return into_c(
-                (
-                    k,
-                    list(map(maybe_box_native, v.tolist()))
-                    if k in object_dtype_cols
-                    else v.tolist(),
-                )
-                for k, v in self.items()
-            )
-        elif orient == "split":
-            if are_all_object_dtype_cols:
-                data = [
-                    list(map(maybe_box_native, t))
-                    for t in self.itertuples(index=False, name=None)
-                ]
-            elif object_dtype_cols:
-                is_object_dtype_by_index = [
-                    col in object_dtype_cols for col in self.columns
-                ]
-                data = [
-                    [
-                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
-                        for i, v in enumerate(t)
-                    ]
-                    for t in self.itertuples(index=False, name=None)
-                ]
-            else:
-                data = [list(t) for t in self.itertuples(index=False, name=None)]
-            return into_c(
-                (
-                    ("index", self.index.tolist()),
-                    ("columns", self.columns.tolist()),
-                    ("data", data),
-                )
-            )
-        elif orient == "series":
-            return into_c((k, v) for k, v in self.items())
-        elif orient == "records":
-            columns = self.columns.tolist()
-            if object_dtype_cols:
-                is_object_dtype_by_index = [col in object_dtype_cols for col in columns]
-                return [
-                    into_c(
-                        zip(
-                            columns,
-                            [
-                                maybe_box_native(v)
-                                if is_object_dtype_by_index[i]
-                                else v
-                                for i, v in enumerate(t)
-                            ],
-                        )
-                    )
-                    for t in self.itertuples(index=False, name=None)
-                ]
-            else:
-                return [
-                    into_c(zip(columns, t))
-                    for t in self.itertuples(index=False, name=None)
-                ]
-        elif orient == "index":
-            if not self.index.is_unique:
-                raise ValueError("DataFrame index must be unique for orient='index'.")
-            columns = self.columns.tolist()
-            if object_dtype_cols:
-                is_object_dtype_by_index = [
-                    col in object_dtype_cols for col in self.columns
-                ]
-                return into_c(
-                    (
-                        t[0],
-                        {
-                            columns[i]: maybe_box_native(v)
-                            if is_object_dtype_by_index[i]
-                            else v
-                            for i, v in enumerate(t[1:])
-                        },
-                    )
-                    for t in self.itertuples(name=None)
-                )
-            else:
-                return into_c(
-                    (
-                        t[0],
-                        {columns[i]: v for i, v in enumerate(t[1:])},
-                    )
-                    for t in self.itertuples(name=None)
-                )
-        elif orient == "tight":
-            if are_all_object_dtype_cols:
-                data = [
-                    list(map(maybe_box_native, t))
-                    for t in self.itertuples(index=False, name=None)
-                ]
-            elif object_dtype_cols:
-                is_object_dtype_by_index = [
-                    col in object_dtype_cols for col in self.columns
-                ]
-                data = [
-                    [
-                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
-                        for i, v in enumerate(t)
-                    ]
-                    for t in self.itertuples(index=False, name=None)
-                ]
-            else:
-                data = [list(t) for t in self.itertuples(index=False, name=None)]
-            return into_c(
-                (
-                    ("index", self.index.tolist()),
-                    ("columns", self.columns.tolist()),
-                    ("data", data),
-                    ("index_names", list(self.index.names)),
-                    ("column_names", list(self.columns.names)),
-                )
-            )
-        else:
-            raise ValueError(f"orient '{orient}' not understood")
-
     def to_dict(self, orient: str = "dict", into=dict):
         """
         Convert the DataFrame to a dictionary.
@@ -2042,67 +1909,129 @@ def to_dict(self, orient: str = "dict", into=dict):
             elif orient.startswith("i"):
                 orient = "index"
 
+        object_dtype_cols = {
+            col for col, dtype in self.dtypes.items() if is_object_dtype(dtype)
+        }
+        are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes)
         if orient == "dict":
             return into_c((k, v.to_dict(into)) for k, v in self.items())
-
         elif orient == "list":
-            return into_c(
-                (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items()
-            )
-
-        elif orient == "split":
             return into_c(
                 (
-                    ("index", self.index.tolist()),
-                    ("columns", self.columns.tolist()),
-                    (
-                        "data",
-                        [
-                            list(map(maybe_box_native, t))
-                            for t in self.itertuples(index=False, name=None)
-                        ],
-                    ),
+                    k,
+                    list(map(maybe_box_native, v.tolist()))
+                    if k in object_dtype_cols
+                    else v.tolist(),
                 )
+                for k, v in self.items()
             )
-
-        elif orient == "tight":
+        elif orient == "split":
+            if are_all_object_dtype_cols:
+                data = [
+                    list(map(maybe_box_native, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            elif object_dtype_cols:
+                is_object_dtype_by_index = [
+                    col in object_dtype_cols for col in self.columns
+                ]
+                data = [
+                    [
+                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
+                        for i, v in enumerate(t)
+                    ]
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            else:
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
             return into_c(
                 (
                     ("index", self.index.tolist()),
                     ("columns", self.columns.tolist()),
-                    (
-                        "data",
-                        [
-                            list(map(maybe_box_native, t))
-                            for t in self.itertuples(index=False, name=None)
-                        ],
-                    ),
-                    ("index_names", list(self.index.names)),
-                    ("column_names", list(self.columns.names)),
+                    ("data", data),
                 )
             )
-
         elif orient == "series":
             return into_c((k, v) for k, v in self.items())
-
         elif orient == "records":
             columns = self.columns.tolist()
-            rows = (
-                dict(zip(columns, row))
-                for row in self.itertuples(index=False, name=None)
-            )
-            return [
-                into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
-            ]
-
+            if object_dtype_cols:
+                is_object_dtype_by_index = [col in object_dtype_cols for col in columns]
+                return [
+                    into_c(
+                        zip(
+                            columns,
+                            [
+                                maybe_box_native(v)
+                                if is_object_dtype_by_index[i]
+                                else v
+                                for i, v in enumerate(t)
+                            ],
+                        )
+                    )
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            else:
+                return [
+                    into_c(zip(columns, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
         elif orient == "index":
             if not self.index.is_unique:
                 raise ValueError("DataFrame index must be unique for orient='index'.")
+            columns = self.columns.tolist()
+            if object_dtype_cols:
+                is_object_dtype_by_index = [
+                    col in object_dtype_cols for col in self.columns
+                ]
+                return into_c(
+                    (
+                        t[0],
+                        {
+                            columns[i]: maybe_box_native(v)
+                            if is_object_dtype_by_index[i]
+                            else v
+                            for i, v in enumerate(t[1:])
+                        },
+                    )
+                    for t in self.itertuples(name=None)
+                )
+            else:
+                return into_c(
+                    (
+                        t[0],
+                        {columns[i]: v for i, v in enumerate(t[1:])},
+                    )
+                    for t in self.itertuples(name=None)
+                )
+        elif orient == "tight":
+            if are_all_object_dtype_cols:
+                data = [
+                    list(map(maybe_box_native, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            elif object_dtype_cols:
+                is_object_dtype_by_index = [
+                    col in object_dtype_cols for col in self.columns
+                ]
+                data = [
+                    [
+                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
+                        for i, v in enumerate(t)
+                    ]
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            else:
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
             return into_c(
-                (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
-                for t in self.itertuples(name=None)
+                (
+                    ("index", self.index.tolist()),
+                    ("columns", self.columns.tolist()),
+                    ("data", data),
+                    ("index_names", list(self.index.names)),
+                    ("column_names", list(self.columns.names)),
+                )
             )
-
         else:
             raise ValueError(f"orient '{orient}' not understood")
 

From 96ac6fa342c6cd5ff3c59d5197ec3602b21c7265 Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Fri, 22 Apr 2022 12:58:59 +0100
Subject: [PATCH 03/13] Rework to_dict object-dtype branches; add all-object test case

---
 pandas/core/frame.py                       | 57 +++++++++++++---------
 pandas/tests/frame/methods/test_to_dict.py | 10 ++++
 2 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ad3b64e32e07e..b4781acb899fb 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1932,16 +1932,15 @@ def to_dict(self, orient: str = "dict", into=dict):
                     for t in self.itertuples(index=False, name=None)
                 ]
             elif object_dtype_cols:
-                is_object_dtype_by_index = [
-                    col in object_dtype_cols for col in self.columns
-                ]
-                data = [
-                    [
-                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
-                        for i, v in enumerate(t)
-                    ]
-                    for t in self.itertuples(index=False, name=None)
+                # A number of ways were tried here, this solution proved to be the
+                # most optimal in general
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
+                object_type_indices = [
+                    i for i, col in enumerate(self.columns) if col in object_dtype_cols
                 ]
+                for row in data:
+                    for i in object_type_indices:
+                        row[i] = maybe_box_native(row[i])
             else:
                 data = [list(t) for t in self.itertuples(index=False, name=None)]
             return into_c(
@@ -1955,7 +1954,16 @@ def to_dict(self, orient: str = "dict", into=dict):
             return into_c((k, v) for k, v in self.items())
         elif orient == "records":
             columns = self.columns.tolist()
-            if object_dtype_cols:
+            if are_all_object_dtype_cols:
+                rows = (
+                    dict(zip(columns, row))
+                    for row in self.itertuples(index=False, name=None)
+                )
+                return [
+                    into_c((k, maybe_box_native(v)) for k, v in row.items())
+                    for row in rows
+                ]
+            elif object_dtype_cols:
                 is_object_dtype_by_index = [col in object_dtype_cols for col in columns]
                 return [
                     into_c(
@@ -1980,7 +1988,12 @@ def to_dict(self, orient: str = "dict", into=dict):
             if not self.index.is_unique:
                 raise ValueError("DataFrame index must be unique for orient='index'.")
             columns = self.columns.tolist()
-            if object_dtype_cols:
+            if are_all_object_dtype_cols:
+                return into_c(
+                    (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
+                    for t in self.itertuples(name=None)
+                )
+            elif object_dtype_cols:
                 is_object_dtype_by_index = [
                     col in object_dtype_cols for col in self.columns
                 ]
@@ -1998,10 +2011,7 @@ def to_dict(self, orient: str = "dict", into=dict):
                 )
             else:
                 return into_c(
-                    (
-                        t[0],
-                        {columns[i]: v for i, v in enumerate(t[1:])},
-                    )
+                    (t[0], dict(zip(self.columns, t[1:])))
                     for t in self.itertuples(name=None)
                 )
         elif orient == "tight":
@@ -2011,16 +2021,15 @@ def to_dict(self, orient: str = "dict", into=dict):
                     for t in self.itertuples(index=False, name=None)
                 ]
             elif object_dtype_cols:
-                is_object_dtype_by_index = [
-                    col in object_dtype_cols for col in self.columns
-                ]
-                data = [
-                    [
-                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
-                        for i, v in enumerate(t)
-                    ]
-                    for t in self.itertuples(index=False, name=None)
+                # A number of ways were tried here, this solution proved to be the
+                # most optimal in general
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
+                object_type_indices = [
+                    i for i, col in enumerate(self.columns) if col in object_dtype_cols
                 ]
+                for row in data:
+                    for i in object_type_indices:
+                        row[i] = maybe_box_native(row[i])
             else:
                 data = [list(t) for t in self.itertuples(index=False, name=None)]
             return into_c(
diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py
index 6d5c32cae7368..69e64088e4870 100644
--- a/pandas/tests/frame/methods/test_to_dict.py
+++ b/pandas/tests/frame/methods/test_to_dict.py
@@ -380,6 +380,16 @@ def test_to_dict_orient_tight(self, index, columns):
                     "b": [float, float, float],
                 },
             ),
+            (  # Make sure we have one df which is all object type cols
+                {
+                    "a": [1, "hello", 3],
+                    "b": [1.1, "world", 3.3],
+                },
+                {
+                    "a": [int, str, int],
+                    "b": [float, str, float],
+                },
+            ),
         ),
     )
     def test_to_dict_returns_native_types(self, orient, data, expected_types):

From 3c596f72eb183b17efd4dd4e676a51c502931a70 Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Fri, 22 Apr 2022 13:09:32 +0100
Subject: [PATCH 04/13] Move tight branch after split and restore blank lines in to_dict

---
 pandas/core/frame.py | 61 ++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 27 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b4781acb899fb..1aaa70008abaf 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1915,6 +1915,7 @@ def to_dict(self, orient: str = "dict", into=dict):
         are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes)
         if orient == "dict":
             return into_c((k, v.to_dict(into)) for k, v in self.items())
+
         elif orient == "list":
             return into_c(
                 (
@@ -1925,6 +1926,7 @@ def to_dict(self, orient: str = "dict", into=dict):
                 )
                 for k, v in self.items()
             )
+
         elif orient == "split":
             if are_all_object_dtype_cols:
                 data = [
@@ -1950,8 +1952,38 @@ def to_dict(self, orient: str = "dict", into=dict):
                     ("data", data),
                 )
             )
+
+        elif orient == "tight":
+            if are_all_object_dtype_cols:
+                data = [
+                    list(map(maybe_box_native, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            elif object_dtype_cols:
+                # A number of ways were tried here, this solution proved to be the
+                # most optimal in general
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
+                object_type_indices = [
+                    i for i, col in enumerate(self.columns) if col in object_dtype_cols
+                ]
+                for row in data:
+                    for i in object_type_indices:
+                        row[i] = maybe_box_native(row[i])
+            else:
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
+            return into_c(
+                (
+                    ("index", self.index.tolist()),
+                    ("columns", self.columns.tolist()),
+                    ("data", data),
+                    ("index_names", list(self.index.names)),
+                    ("column_names", list(self.columns.names)),
+                )
+            )
+
         elif orient == "series":
             return into_c((k, v) for k, v in self.items())
+
         elif orient == "records":
             columns = self.columns.tolist()
             if are_all_object_dtype_cols:
@@ -1984,6 +2016,7 @@ def to_dict(self, orient: str = "dict", into=dict):
                     into_c(zip(columns, t))
                     for t in self.itertuples(index=False, name=None)
                 ]
+
         elif orient == "index":
             if not self.index.is_unique:
                 raise ValueError("DataFrame index must be unique for orient='index'.")
@@ -2014,33 +2047,7 @@ def to_dict(self, orient: str = "dict", into=dict):
                     (t[0], dict(zip(self.columns, t[1:])))
                     for t in self.itertuples(name=None)
                 )
-        elif orient == "tight":
-            if are_all_object_dtype_cols:
-                data = [
-                    list(map(maybe_box_native, t))
-                    for t in self.itertuples(index=False, name=None)
-                ]
-            elif object_dtype_cols:
-                # A number of ways were tried here, this solution proved to be the
-                # most optimal in general
-                data = [list(t) for t in self.itertuples(index=False, name=None)]
-                object_type_indices = [
-                    i for i, col in enumerate(self.columns) if col in object_dtype_cols
-                ]
-                for row in data:
-                    for i in object_type_indices:
-                        row[i] = maybe_box_native(row[i])
-            else:
-                data = [list(t) for t in self.itertuples(index=False, name=None)]
-            return into_c(
-                (
-                    ("index", self.index.tolist()),
-                    ("columns", self.columns.tolist()),
-                    ("data", data),
-                    ("index_names", list(self.index.names)),
-                    ("column_names", list(self.columns.names)),
-                )
-            )
+
         else:
             raise ValueError(f"orient '{orient}' not understood")
 

From 57c95ebf0b3f6de909c2e648f4ad520638a31171 Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Sun, 24 Apr 2022 12:02:37 +0100
Subject: [PATCH 05/13] Address PR comments

---
 doc/source/whatsnew/v1.5.0.rst |  2 +-
 pandas/core/frame.py           | 61 +++++++++++++---------------------
 2 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index fdce9e2158078..de4f109e555d1 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -453,7 +453,7 @@ Performance improvements
 - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
-- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` especially when using non-mixed dtypes (:issue:`46470`)
+- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.bug_fixes:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 1aaa70008abaf..2fc138f4d59ff 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1933,18 +1933,17 @@ def to_dict(self, orient: str = "dict", into=dict):
                     list(map(maybe_box_native, t))
                     for t in self.itertuples(index=False, name=None)
                 ]
-            elif object_dtype_cols:
-                # A number of ways were tried here, this solution proved to be the
-                # most optimal in general
-                data = [list(t) for t in self.itertuples(index=False, name=None)]
-                object_type_indices = [
-                    i for i, col in enumerate(self.columns) if col in object_dtype_cols
-                ]
-                for row in data:
-                    for i in object_type_indices:
-                        row[i] = maybe_box_native(row[i])
             else:
                 data = [list(t) for t in self.itertuples(index=False, name=None)]
+                if object_dtype_cols:
+                    object_dtype_indices = [
+                        i
+                        for i, col in enumerate(self.columns)
+                        if col in object_dtype_cols
+                    ]
+                    for row in data:
+                        for i in object_dtype_indices:
+                            row[i] = maybe_box_native(row[i])
             return into_c(
                 (
                     ("index", self.index.tolist()),
@@ -1959,18 +1958,17 @@ def to_dict(self, orient: str = "dict", into=dict):
                     list(map(maybe_box_native, t))
                     for t in self.itertuples(index=False, name=None)
                 ]
-            elif object_dtype_cols:
-                # A number of ways were tried here, this solution proved to be the
-                # most optimal in general
-                data = [list(t) for t in self.itertuples(index=False, name=None)]
-                object_type_indices = [
-                    i for i, col in enumerate(self.columns) if col in object_dtype_cols
-                ]
-                for row in data:
-                    for i in object_type_indices:
-                        row[i] = maybe_box_native(row[i])
             else:
                 data = [list(t) for t in self.itertuples(index=False, name=None)]
+                if object_dtype_cols:
+                    object_dtype_indices = [
+                        i
+                        for i, col in enumerate(self.columns)
+                        if col in object_dtype_cols
+                    ]
+                    for row in data:
+                        for i in object_dtype_indices:
+                            row[i] = maybe_box_native(row[i])
             return into_c(
                 (
                     ("index", self.index.tolist()),
@@ -1995,27 +1993,16 @@ def to_dict(self, orient: str = "dict", into=dict):
                     into_c((k, maybe_box_native(v)) for k, v in row.items())
                     for row in rows
                 ]
-            elif object_dtype_cols:
-                is_object_dtype_by_index = [col in object_dtype_cols for col in columns]
-                return [
-                    into_c(
-                        zip(
-                            columns,
-                            [
-                                maybe_box_native(v)
-                                if is_object_dtype_by_index[i]
-                                else v
-                                for i, v in enumerate(t)
-                            ],
-                        )
-                    )
-                    for t in self.itertuples(index=False, name=None)
-                ]
             else:
-                return [
+                data = [
                     into_c(zip(columns, t))
                     for t in self.itertuples(index=False, name=None)
                 ]
+                if object_dtype_cols:
+                    for row in data:
+                        for col in object_dtype_cols:
+                            row[col] = maybe_box_native(row[col])
+                return data
 
         elif orient == "index":
             if not self.index.is_unique:

From e07f02c3a4ad0e9ceadcbd7ded9bf7c53eb5025c Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Fri, 29 Apr 2022 16:37:50 +0100
Subject: [PATCH 06/13] Address PR comments

---
 pandas/core/frame.py | 47 ++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 2fc138f4d59ff..25655acf78006 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1909,22 +1909,26 @@ def to_dict(self, orient: str = "dict", into=dict):
             elif orient.startswith("i"):
                 orient = "index"
 
-        object_dtype_cols = {
-            col for col, dtype in self.dtypes.items() if is_object_dtype(dtype)
-        }
-        are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes)
+        object_dtype_indices = [
+            i
+            for i, col_dtype in enumerate(self.dtypes.values)
+            if is_object_dtype(col_dtype)
+        ]
+        are_all_object_dtype_cols = len(object_dtype_indices) == len(self.dtypes)
+
         if orient == "dict":
             return into_c((k, v.to_dict(into)) for k, v in self.items())
 
         elif orient == "list":
+            object_dtype_indices = set(object_dtype_indices)
             return into_c(
                 (
                     k,
                     list(map(maybe_box_native, v.tolist()))
-                    if k in object_dtype_cols
+                    if i in object_dtype_indices
                     else v.tolist(),
                 )
-                for k, v in self.items()
+                for i, (k, v) in enumerate(self.items())
             )
 
         elif orient == "split":
@@ -1935,12 +1939,9 @@ def to_dict(self, orient: str = "dict", into=dict):
                 ]
             else:
                 data = [list(t) for t in self.itertuples(index=False, name=None)]
-                if object_dtype_cols:
-                    object_dtype_indices = [
-                        i
-                        for i, col in enumerate(self.columns)
-                        if col in object_dtype_cols
-                    ]
+                if object_dtype_indices:
+                    # If we have object_dtype_cols, apply maybe_box_naive after list
+                    # comprehension for perf
                     for row in data:
                         for i in object_dtype_indices:
                             row[i] = maybe_box_native(row[i])
@@ -1960,12 +1961,9 @@ def to_dict(self, orient: str = "dict", into=dict):
                 ]
             else:
                 data = [list(t) for t in self.itertuples(index=False, name=None)]
-                if object_dtype_cols:
-                    object_dtype_indices = [
-                        i
-                        for i, col in enumerate(self.columns)
-                        if col in object_dtype_cols
-                    ]
+                if object_dtype_indices:
+                    # If we have object_dtype_cols, apply maybe_box_naive after list
+                    # comprehension for perf
                     for row in data:
                         for i in object_dtype_indices:
                             row[i] = maybe_box_native(row[i])
@@ -1998,7 +1996,13 @@ def to_dict(self, orient: str = "dict", into=dict):
                     into_c(zip(columns, t))
                     for t in self.itertuples(index=False, name=None)
                 ]
-                if object_dtype_cols:
+                if object_dtype_indices:
+                    object_dtype_indices = set(object_dtype_indices)
+                    object_dtype_cols = {
+                        col
+                        for i, col in enumerate(self.columns)
+                        if i in object_dtype_indices
+                    }
                     for row in data:
                         for col in object_dtype_cols:
                             row[col] = maybe_box_native(row[col])
@@ -2013,9 +2017,10 @@ def to_dict(self, orient: str = "dict", into=dict):
                     (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
                     for t in self.itertuples(name=None)
                 )
-            elif object_dtype_cols:
+            elif object_dtype_indices:
+                object_dtype_indices = set(object_dtype_indices)
                 is_object_dtype_by_index = [
-                    col in object_dtype_cols for col in self.columns
+                    i in object_dtype_indices for i in range(len(self.columns))
                 ]
                 return into_c(
                     (

From d2da86b46e3b6be7dad9e0415f42ef11e475ad9b Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Fri, 29 Apr 2022 17:27:47 +0100
Subject: [PATCH 07/13] Use as set

---
 pandas/core/frame.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 25655acf78006..587819876ed54 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1920,12 +1920,12 @@ def to_dict(self, orient: str = "dict", into=dict):
             return into_c((k, v.to_dict(into)) for k, v in self.items())
 
         elif orient == "list":
-            object_dtype_indices = set(object_dtype_indices)
+            object_dtype_indices_as_set = set(object_dtype_indices)
             return into_c(
                 (
                     k,
                     list(map(maybe_box_native, v.tolist()))
-                    if i in object_dtype_indices
+                    if i in object_dtype_indices_as_set
                     else v.tolist(),
                 )
                 for i, (k, v) in enumerate(self.items())
@@ -1997,11 +1997,11 @@ def to_dict(self, orient: str = "dict", into=dict):
                     for t in self.itertuples(index=False, name=None)
                 ]
                 if object_dtype_indices:
-                    object_dtype_indices = set(object_dtype_indices)
+                    object_dtype_indices_as_set = set(object_dtype_indices)
                     object_dtype_cols = {
                         col
                         for i, col in enumerate(self.columns)
-                        if i in object_dtype_indices
+                        if i in object_dtype_indices_as_set
                     }
                     for row in data:
                         for col in object_dtype_cols:
@@ -2018,9 +2018,9 @@ def to_dict(self, orient: str = "dict", into=dict):
                     for t in self.itertuples(name=None)
                 )
             elif object_dtype_indices:
-                object_dtype_indices = set(object_dtype_indices)
+                object_dtype_indices_as_set = set(object_dtype_indices)
                 is_object_dtype_by_index = [
-                    i in object_dtype_indices for i in range(len(self.columns))
+                    i in object_dtype_indices_as_set for i in range(len(self.columns))
                 ]
                 return into_c(
                     (

From 0c0481d25dbd00882154954e95d145f7193df08a Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Tue, 10 May 2022 17:40:09 +0100
Subject: [PATCH 08/13] Add helper function

---
 pandas/core/frame.py | 52 ++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 587819876ed54..f8381a9485d16 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1771,6 +1771,24 @@ def to_numpy(
 
         return result
 
+    def _create_data_for_split_and_tight_to_dict(
+        self, are_all_object_dtype_cols, object_dtype_indices
+    ):
+        if are_all_object_dtype_cols:
+            data = [
+                list(map(maybe_box_native, t))
+                for t in self.itertuples(index=False, name=None)
+            ]
+        else:
+            data = [list(t) for t in self.itertuples(index=False, name=None)]
+            if object_dtype_indices:
+                # If we have object_dtype_cols, apply maybe_box_native after list
+                # comprehension for perf
+                for row in data:
+                    for i in object_dtype_indices:
+                        row[i] = maybe_box_native(row[i])
+        return data
+
     def to_dict(self, orient: str = "dict", into=dict):
         """
         Convert the DataFrame to a dictionary.
@@ -1932,19 +1950,10 @@ def to_dict(self, orient: str = "dict", into=dict):
             )
 
         elif orient == "split":
-            if are_all_object_dtype_cols:
-                data = [
-                    list(map(maybe_box_native, t))
-                    for t in self.itertuples(index=False, name=None)
-                ]
-            else:
-                data = [list(t) for t in self.itertuples(index=False, name=None)]
-                if object_dtype_indices:
-                    # If we have object_dtype_cols, apply maybe_box_naive after list
-                    # comprehension for perf
-                    for row in data:
-                        for i in object_dtype_indices:
-                            row[i] = maybe_box_native(row[i])
+            data = self._create_data_for_split_and_tight_to_dict(
+                are_all_object_dtype_cols, object_dtype_indices
+            )
+
             return into_c(
                 (
                     ("index", self.index.tolist()),
@@ -1954,19 +1963,10 @@ def to_dict(self, orient: str = "dict", into=dict):
             )
 
         elif orient == "tight":
-            if are_all_object_dtype_cols:
-                data = [
-                    list(map(maybe_box_native, t))
-                    for t in self.itertuples(index=False, name=None)
-                ]
-            else:
-                data = [list(t) for t in self.itertuples(index=False, name=None)]
-                if object_dtype_indices:
-                    # If we have object_dtype_cols, apply maybe_box_naive after list
-                    # comprehension for perf
-                    for row in data:
-                        for i in object_dtype_indices:
-                            row[i] = maybe_box_native(row[i])
+            data = self._create_data_for_split_and_tight_to_dict(
+                are_all_object_dtype_cols, object_dtype_indices
+            )
+
             return into_c(
                 (
                     ("index", self.index.tolist()),

From 1d68f83e496ddd71787885d12255d4fc3bc01ef9 Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Wed, 11 May 2022 11:09:55 +0100
Subject: [PATCH 09/13] Add comment and types

---
 pandas/core/frame.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f8381a9485d16..bff9be938f136 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1772,8 +1772,12 @@ def to_numpy(
         return result
 
     def _create_data_for_split_and_tight_to_dict(
-        self, are_all_object_dtype_cols, object_dtype_indices
+        self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
     ):
+        """
+        Simple helper method to create data for to ``to_dict(orient="split")`` and
+        ``to_dict(orient="tight")`` to create the main output data
+        """
         if are_all_object_dtype_cols:
             data = [
                 list(map(maybe_box_native, t))

From f66340d450c3ddda65e63f7b57de6ec998c8e85b Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Mon, 27 Jun 2022 11:40:03 +0100
Subject: [PATCH 10/13] Return quickly if series

---
 pandas/core/frame.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 44e082a1abe98..15cc20f453efd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1971,6 +1971,10 @@ def to_dict(self, orient: str = "dict", into=dict):
             elif orient.startswith("i"):
                 orient = "index"
 
+        if orient == "series":
+            # GH46470 Return quickly if orient series to avoid creating dtype objects
+            return into_c((k, v) for k, v in self.items())
+
         object_dtype_indices = [
             i
             for i, col_dtype in enumerate(self.dtypes.values)
@@ -2021,9 +2025,6 @@ def to_dict(self, orient: str = "dict", into=dict):
                 )
             )
 
-        elif orient == "series":
-            return into_c((k, v) for k, v in self.items())
-
         elif orient == "records":
             columns = self.columns.tolist()
             if are_all_object_dtype_cols:

From 03978aa9fe07602e3b2621c0b0d8773cc2e90922 Mon Sep 17 00:00:00 2001
From: Roger Thomas <roger.thomas@oliverwyman.com>
Date: Tue, 28 Jun 2022 09:53:05 +0100
Subject: [PATCH 11/13] Add return value

---
 pandas/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 15cc20f453efd..9e22403c4ba27 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1813,7 +1813,7 @@ def to_numpy(
 
     def _create_data_for_split_and_tight_to_dict(
         self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
-    ):
+    ) -> list:
         """
         Simple helper method to create data for to ``to_dict(orient="split")`` and
         ``to_dict(orient="tight")`` to create the main output data

From ecfffa6fdf0c8b15925ea9a85ef85644dbc11d70 Mon Sep 17 00:00:00 2001
From: RogerThomas <roger.thomas@gmail.com>
Date: Tue, 22 Nov 2022 09:25:30 +0000
Subject: [PATCH 12/13] Move doc to 2.0.0

---
 doc/source/whatsnew/v1.5.0.rst | 1 -
 doc/source/whatsnew/v2.0.0.rst | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 3ecb4bdba0e42..08dbb357c8053 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -960,7 +960,6 @@ Performance improvements
 - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
-- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
 - Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`)
 - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`)
 - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing`  (:issue:`47458`)
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 04e5154ca1a0b..238e19dd21307 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -326,6 +326,7 @@ Performance improvements
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
+- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:

From 36e22c05f7bbedd318aa5fb0888cad22687e6f8f Mon Sep 17 00:00:00 2001
From: RogerThomas <roger.thomas@gmail.com>
Date: Tue, 22 Nov 2022 09:32:11 +0000
Subject: [PATCH 13/13] Revert

---
 doc/source/whatsnew/v2.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 97e975e24a5a1..8188916a06008 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -603,6 +603,7 @@ Performance improvements
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
+- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`)
 - Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`)
 - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
 - Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)