From 8d93fecb0175c2fa4fa89f6ad2e65ea29ef3e1c6 Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Fri, 22 Apr 2022 10:32:42 +0100 Subject: [PATCH 01/13] PERF: Slow performance of to_dict (#46470) --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/frame.py | 133 +++++++++++++++++++++++++++++++++ pandas/core/series.py | 8 +- 3 files changed, 141 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 922ef28b855b9..fdce9e2158078 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -453,6 +453,7 @@ Performance improvements - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`) - Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) +- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` especially when using non-mixed dtypes (:issue:`46470`) .. --------------------------------------------------------------------------- .. _whatsnew_150.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 74d061cbb9b7f..5141c5f20d0bb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1771,6 +1771,139 @@ def to_numpy( return result + def _to_dict_helper(self, orient, into_c, into): + """Helper function to do main work to convert frame into dict based on + `orient` and `into` + + As part of GH46470 also takes care in when to use maybe_box_native as this + function can perform badly and is not necessary for non object cols + """ + object_dtype_cols = { + col for col, dtype in self.dtypes.items() if is_object_dtype(dtype) + } + are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes) + if orient == "dict": + return into_c((k, v.to_dict(into)) for k, v in self.items()) + elif orient == "list": + return into_c( + ( + k, + list(map(maybe_box_native, v.tolist())) + if k in object_dtype_cols + else v.tolist(), + ) + for k, v in self.items() + ) + elif orient == "split": + if are_all_object_dtype_cols: + data = [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ] + elif object_dtype_cols: + is_object_dtype_by_index = [ + col in object_dtype_cols for col in self.columns + ] + data = [ + [ + maybe_box_native(v) if is_object_dtype_by_index[i] else v + for i, v in enumerate(t) + ] + for t in self.itertuples(index=False, name=None) + ] + else: + data = [list(t) for t in self.itertuples(index=False, name=None)] + return into_c( + ( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), + ("data", data), + ) + ) + elif orient == "series": + return into_c((k, v) for k, v in self.items()) + elif orient == "records": + columns = self.columns.tolist() + if object_dtype_cols: + is_object_dtype_by_index = [col in object_dtype_cols for col in columns] + return [ + into_c( + zip( + columns, + [ + maybe_box_native(v) + if is_object_dtype_by_index[i] + else v + for i, v in enumerate(t) + ], + ) + ) + for t in self.itertuples(index=False, name=None) + ] + else: + return [ + into_c(zip(columns, t)) + for t in self.itertuples(index=False, name=None) + ] + elif orient == "index": + if not self.index.is_unique: + raise ValueError("DataFrame index must be unique for orient='index'.") + columns = self.columns.tolist() + if object_dtype_cols: + is_object_dtype_by_index = [ + col in object_dtype_cols for col in self.columns + ] + return into_c( + ( + t[0], + { + columns[i]: maybe_box_native(v) + if is_object_dtype_by_index[i] + else v + for i, v in enumerate(t[1:]) + }, + ) + for t in self.itertuples(name=None) + ) + else: + return into_c( + ( + t[0], + {columns[i]: v for i, v in enumerate(t[1:])}, + ) + for t in self.itertuples(name=None) + ) + elif orient == "tight": + if are_all_object_dtype_cols: + data = [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ] + elif object_dtype_cols: + is_object_dtype_by_index = [ + col in object_dtype_cols for col in self.columns + ] + data = [ + [ + maybe_box_native(v) if is_object_dtype_by_index[i] else v + for i, v in enumerate(t) + ] + for t in self.itertuples(index=False, name=None) + ] + else: + data = [list(t) for t in self.itertuples(index=False, name=None)] + return into_c( + ( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), + ("data", data), + ("index_names", list(self.index.names)), + ("column_names", list(self.columns.names)), + ) + ) + else: + raise ValueError(f"orient '{orient}' not understood") + def to_dict(self, orient: str = "dict", into=dict): """ Convert the DataFrame to a dictionary. diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d3509cac0edd..aba5282e9e04e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1771,7 +1771,13 @@ def to_dict(self, into=dict): """ # GH16122 into_c = com.standardize_mapping(into) - return into_c((k, maybe_box_native(v)) for k, v in self.items()) + + if is_object_dtype(self): + return into_c((k, maybe_box_native(v)) for k, v in self.items()) + else: + # Not an object dtype => all types will be the same so let the default + # indexer return native python type + return into_c((k, v) for k, v in self.items()) def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: """ From d9f9786ca6d7746c9745438a7230bf730d3e1044 Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Fri, 22 Apr 2022 10:34:53 +0100 Subject: [PATCH 02/13] Update --- pandas/core/frame.py | 273 ++++++++++++++++--------------------------- 1 file changed, 101 insertions(+), 172 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5141c5f20d0bb..ad3b64e32e07e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1771,139 +1771,6 @@ def to_numpy( return result - def _to_dict_helper(self, orient, into_c, into): - """Helper function to do main work to convert frame into dict based on - `orient` and `into` - - As part of GH46470 also takes care in when to use maybe_box_native as this - function can perform badly and is not necessary for non object cols - """ - object_dtype_cols = { - col for col, dtype in self.dtypes.items() if is_object_dtype(dtype) - } - are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes) - if orient == "dict": - return into_c((k, v.to_dict(into)) for k, v in self.items()) - elif orient == "list": - return into_c( - ( - k, - list(map(maybe_box_native, v.tolist())) - if k in object_dtype_cols - else v.tolist(), - ) - for k, v in self.items() - ) - elif orient == "split": - if are_all_object_dtype_cols: - data = [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ] - elif object_dtype_cols: - is_object_dtype_by_index = [ - col in object_dtype_cols for col in self.columns - ] - data = [ - [ - maybe_box_native(v) if is_object_dtype_by_index[i] else v - for i, v in enumerate(t) - ] - for t in self.itertuples(index=False, name=None) - ] - else: - data = [list(t) for t in self.itertuples(index=False, name=None)] - return into_c( - ( - ("index", self.index.tolist()), - ("columns", self.columns.tolist()), - ("data", data), - ) - ) - elif orient == "series": - return into_c((k, v) for k, v in self.items()) - elif orient == "records": - columns = self.columns.tolist() - if object_dtype_cols: - is_object_dtype_by_index = [col in object_dtype_cols for col in columns] - return [ - into_c( - zip( - columns, - [ - maybe_box_native(v) - if is_object_dtype_by_index[i] - else v - for i, v in enumerate(t) - ], - ) - ) - for t in self.itertuples(index=False, name=None) - ] - else: - return [ - into_c(zip(columns, t)) - for t in self.itertuples(index=False, name=None) - ] - elif orient == "index": - if not self.index.is_unique: - raise ValueError("DataFrame index must be unique for orient='index'.") - columns = self.columns.tolist() - if object_dtype_cols: - is_object_dtype_by_index = [ - col in object_dtype_cols for col in self.columns - ] - return into_c( - ( - t[0], - { - columns[i]: maybe_box_native(v) - if is_object_dtype_by_index[i] - else v - for i, v in enumerate(t[1:]) - }, - ) - for t in self.itertuples(name=None) - ) - else: - return into_c( - ( - t[0], - {columns[i]: v for i, v in enumerate(t[1:])}, - ) - for t in self.itertuples(name=None) - ) - elif orient == "tight": - if are_all_object_dtype_cols: - data = [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ] - elif object_dtype_cols: - is_object_dtype_by_index = [ - col in object_dtype_cols for col in self.columns - ] - data = [ - [ - maybe_box_native(v) if is_object_dtype_by_index[i] else v - for i, v in enumerate(t) - ] - for t in self.itertuples(index=False, name=None) - ] - else: - data = [list(t) for t in self.itertuples(index=False, name=None)] - return into_c( - ( - ("index", self.index.tolist()), - ("columns", self.columns.tolist()), - ("data", data), - ("index_names", list(self.index.names)), - ("column_names", list(self.columns.names)), - ) - ) - else: - raise ValueError(f"orient '{orient}' not understood") - def to_dict(self, orient: str = "dict", into=dict): """ Convert the DataFrame to a dictionary. @@ -2042,67 +1909,129 @@ def to_dict(self, orient: str = "dict", into=dict): elif orient.startswith("i"): orient = "index" + object_dtype_cols = { + col for col, dtype in self.dtypes.items() if is_object_dtype(dtype) + } + are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes) if orient == "dict": return into_c((k, v.to_dict(into)) for k, v in self.items()) - elif orient == "list": - return into_c( - (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items() - ) - - elif orient == "split": return into_c( ( - ("index", self.index.tolist()), - ("columns", self.columns.tolist()), - ( - "data", - [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ], - ), + k, + list(map(maybe_box_native, v.tolist())) + if k in object_dtype_cols + else v.tolist(), ) + for k, v in self.items() ) - - elif orient == "tight": + elif orient == "split": + if are_all_object_dtype_cols: + data = [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ] + elif object_dtype_cols: + is_object_dtype_by_index = [ + col in object_dtype_cols for col in self.columns + ] + data = [ + [ + maybe_box_native(v) if is_object_dtype_by_index[i] else v + for i, v in enumerate(t) + ] + for t in self.itertuples(index=False, name=None) + ] + else: + data = [list(t) for t in self.itertuples(index=False, name=None)] return into_c( ( ("index", self.index.tolist()), ("columns", self.columns.tolist()), - ( - "data", - [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ], - ), - ("index_names", list(self.index.names)), - ("column_names", list(self.columns.names)), + ("data", data), ) ) - elif orient == "series": return into_c((k, v) for k, v in self.items()) - elif orient == "records": columns = self.columns.tolist() - rows = ( - dict(zip(columns, row)) - for row in self.itertuples(index=False, name=None) - ) - return [ - into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows - ] - + if object_dtype_cols: + is_object_dtype_by_index = [col in object_dtype_cols for col in columns] + return [ + into_c( + zip( + columns, + [ + maybe_box_native(v) + if is_object_dtype_by_index[i] + else v + for i, v in enumerate(t) + ], + ) + ) + for t in self.itertuples(index=False, name=None) + ] + else: + return [ + into_c(zip(columns, t)) + for t in self.itertuples(index=False, name=None) + ] elif orient == "index": if not self.index.is_unique: raise ValueError("DataFrame index must be unique for orient='index'.") + columns = self.columns.tolist() + if object_dtype_cols: + is_object_dtype_by_index = [ + col in object_dtype_cols for col in self.columns + ] + return into_c( + ( + t[0], + { + columns[i]: maybe_box_native(v) + if is_object_dtype_by_index[i] + else v + for i, v in enumerate(t[1:]) + }, + ) + for t in self.itertuples(name=None) + ) + else: + return into_c( + ( + t[0], + {columns[i]: v for i, v in enumerate(t[1:])}, + ) + for t in self.itertuples(name=None) + ) + elif orient == "tight": + if are_all_object_dtype_cols: + data = [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ] + elif object_dtype_cols: + is_object_dtype_by_index = [ + col in object_dtype_cols for col in self.columns + ] + data = [ + [ + maybe_box_native(v) if is_object_dtype_by_index[i] else v + for i, v in enumerate(t) + ] + for t in self.itertuples(index=False, name=None) + ] + else: + data = [list(t) for t in self.itertuples(index=False, name=None)] return into_c( - (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:])))) - for t in self.itertuples(name=None) + ( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), + ("data", data), + ("index_names", list(self.index.names)), + ("column_names", list(self.columns.names)), + ) ) - else: raise ValueError(f"orient '{orient}' not understood") From 96ac6fa342c6cd5ff3c59d5197ec3602b21c7265 Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Fri, 22 Apr 2022 12:58:59 +0100 Subject: [PATCH 03/13] Update --- pandas/core/frame.py | 57 +++++++++++++--------- pandas/tests/frame/methods/test_to_dict.py | 10 ++++ 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ad3b64e32e07e..b4781acb899fb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1932,16 +1932,15 @@ def to_dict(self, orient: str = "dict", into=dict): for t in self.itertuples(index=False, name=None) ] elif object_dtype_cols: - is_object_dtype_by_index = [ - col in object_dtype_cols for col in self.columns - ] - data = [ - [ - maybe_box_native(v) if is_object_dtype_by_index[i] else v - for i, v in enumerate(t) - ] - for t in self.itertuples(index=False, name=None) + # A number of ways were tried here, this solution proved to be the + # most optimal in general + data = [list(t) for t in self.itertuples(index=False, name=None)] + object_type_indices = [ + i for i, col in enumerate(self.columns) if col in object_dtype_cols ] + for row in data: + for i in object_type_indices: + row[i] = maybe_box_native(row[i]) else: data = [list(t) for t in self.itertuples(index=False, name=None)] return into_c( @@ -1955,7 +1954,16 @@ def to_dict(self, orient: str = "dict", into=dict): return into_c((k, v) for k, v in self.items()) elif orient == "records": columns = self.columns.tolist() - if object_dtype_cols: + if are_all_object_dtype_cols: + rows = ( + dict(zip(columns, row)) + for row in self.itertuples(index=False, name=None) + ) + return [ + into_c((k, maybe_box_native(v)) for k, v in row.items()) + for row in rows + ] + elif object_dtype_cols: is_object_dtype_by_index = [col in object_dtype_cols for col in columns] return [ into_c( @@ -1980,7 +1988,12 @@ def to_dict(self, orient: str = "dict", into=dict): if not self.index.is_unique: raise ValueError("DataFrame index must be unique for orient='index'.") columns = self.columns.tolist() - if object_dtype_cols: + if are_all_object_dtype_cols: + return into_c( + (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:])))) + for t in self.itertuples(name=None) + ) + elif object_dtype_cols: is_object_dtype_by_index = [ col in object_dtype_cols for col in self.columns ] @@ -1998,10 +2011,7 @@ def to_dict(self, orient: str = "dict", into=dict): ) else: return into_c( - ( - t[0], - {columns[i]: v for i, v in enumerate(t[1:])}, - ) + (t[0], dict(zip(self.columns, t[1:]))) for t in self.itertuples(name=None) ) elif orient == "tight": @@ -2011,16 +2021,15 @@ def to_dict(self, orient: str = "dict", into=dict): for t in self.itertuples(index=False, name=None) ] elif object_dtype_cols: - is_object_dtype_by_index = [ - col in object_dtype_cols for col in self.columns - ] - data = [ - [ - maybe_box_native(v) if is_object_dtype_by_index[i] else v - for i, v in enumerate(t) - ] - for t in self.itertuples(index=False, name=None) + # A number of ways were tried here, this solution proved to be the + # most optimal in general + data = [list(t) for t in self.itertuples(index=False, name=None)] + object_type_indices = [ + i for i, col in enumerate(self.columns) if col in object_dtype_cols ] + for row in data: + for i in object_type_indices: + row[i] = maybe_box_native(row[i]) else: data = [list(t) for t in self.itertuples(index=False, name=None)] return into_c( diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 6d5c32cae7368..69e64088e4870 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -380,6 +380,16 @@ def test_to_dict_orient_tight(self, index, columns): "b": [float, float, float], }, ), + ( # Make sure we have one df which is all object type cols + { + "a": [1, "hello", 3], + "b": [1.1, "world", 3.3], + }, + { + "a": [int, str, int], + "b": [float, str, float], + }, + ), ), ) def test_to_dict_returns_native_types(self, orient, data, expected_types): From 3c596f72eb183b17efd4dd4e676a51c502931a70 Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Fri, 22 Apr 2022 13:09:32 +0100 Subject: [PATCH 04/13] Clean up --- pandas/core/frame.py | 61 ++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b4781acb899fb..1aaa70008abaf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1915,6 +1915,7 @@ def to_dict(self, orient: str = "dict", into=dict): are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes) if orient == "dict": return into_c((k, v.to_dict(into)) for k, v in self.items()) + elif orient == "list": return into_c( ( @@ -1925,6 +1926,7 @@ def to_dict(self, orient: str = "dict", into=dict): ) for k, v in self.items() ) + elif orient == "split": if are_all_object_dtype_cols: data = [ @@ -1950,8 +1952,38 @@ def to_dict(self, orient: str = "dict", into=dict): ("data", data), ) ) + + elif orient == "tight": + if are_all_object_dtype_cols: + data = [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ] + elif object_dtype_cols: + # A number of ways were tried here, this solution proved to be the + # most optimal in general + data = [list(t) for t in self.itertuples(index=False, name=None)] + object_type_indices = [ + i for i, col in enumerate(self.columns) if col in object_dtype_cols + ] + for row in data: + for i in object_type_indices: + row[i] = maybe_box_native(row[i]) + else: + data = [list(t) for t in self.itertuples(index=False, name=None)] + return into_c( + ( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), + ("data", data), + ("index_names", list(self.index.names)), + ("column_names", list(self.columns.names)), + ) + ) + elif orient == "series": return into_c((k, v) for k, v in self.items()) + elif orient == "records": columns = self.columns.tolist() if are_all_object_dtype_cols: @@ -1984,6 +2016,7 @@ def to_dict(self, orient: str = "dict", into=dict): into_c(zip(columns, t)) for t in self.itertuples(index=False, name=None) ] + elif orient == "index": if not self.index.is_unique: raise ValueError("DataFrame index must be unique for orient='index'.") @@ -2014,33 +2047,7 @@ def to_dict(self, orient: str = "dict", into=dict): (t[0], dict(zip(self.columns, t[1:]))) for t in self.itertuples(name=None) ) - elif orient == "tight": - if are_all_object_dtype_cols: - data = [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ] - elif object_dtype_cols: - # A number of ways were tried here, this solution proved to be the - # most optimal in general - data = [list(t) for t in self.itertuples(index=False, name=None)] - object_type_indices = [ - i for i, col in enumerate(self.columns) if col in object_dtype_cols - ] - for row in data: - for i in object_type_indices: - row[i] = maybe_box_native(row[i]) - else: - data = [list(t) for t in self.itertuples(index=False, name=None)] - return into_c( - ( - ("index", self.index.tolist()), - ("columns", self.columns.tolist()), - ("data", data), - ("index_names", list(self.index.names)), - ("column_names", list(self.columns.names)), - ) - ) + else: raise ValueError(f"orient '{orient}' not understood") From 57c95ebf0b3f6de909c2e648f4ad520638a31171 Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Sun, 24 Apr 2022 12:02:37 +0100 Subject: [PATCH 05/13] Address PR comments --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/frame.py | 61 +++++++++++++--------------------- 2 files changed, 25 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index fdce9e2158078..de4f109e555d1 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -453,7 +453,7 @@ Performance improvements - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`) - Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) -- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` especially when using non-mixed dtypes (:issue:`46470`) +- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) .. --------------------------------------------------------------------------- .. _whatsnew_150.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1aaa70008abaf..2fc138f4d59ff 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1933,18 +1933,17 @@ def to_dict(self, orient: str = "dict", into=dict): list(map(maybe_box_native, t)) for t in self.itertuples(index=False, name=None) ] - elif object_dtype_cols: - # A number of ways were tried here, this solution proved to be the - # most optimal in general - data = [list(t) for t in self.itertuples(index=False, name=None)] - object_type_indices = [ - i for i, col in enumerate(self.columns) if col in object_dtype_cols - ] - for row in data: - for i in object_type_indices: - row[i] = maybe_box_native(row[i]) else: data = [list(t) for t in self.itertuples(index=False, name=None)] + if object_dtype_cols: + object_dtype_indices = [ + i + for i, col in enumerate(self.columns) + if col in object_dtype_cols + ] + for row in data: + for i in object_dtype_indices: + row[i] = maybe_box_native(row[i]) return into_c( ( ("index", self.index.tolist()), @@ -1959,18 +1958,17 @@ def to_dict(self, orient: str = "dict", into=dict): list(map(maybe_box_native, t)) for t in self.itertuples(index=False, name=None) ] - elif object_dtype_cols: - # A number of ways were tried here, this solution proved to be the - # most optimal in general - data = [list(t) for t in self.itertuples(index=False, name=None)] - object_type_indices = [ - i for i, col in enumerate(self.columns) if col in object_dtype_cols - ] - for row in data: - for i in object_type_indices: - row[i] = maybe_box_native(row[i]) else: data = [list(t) for t in self.itertuples(index=False, name=None)] + if object_dtype_cols: + object_dtype_indices = [ + i + for i, col in enumerate(self.columns) + if col in object_dtype_cols + ] + for row in data: + for i in object_dtype_indices: + row[i] = maybe_box_native(row[i]) return into_c( ( ("index", self.index.tolist()), @@ -1995,27 +1993,16 @@ def to_dict(self, orient: str = "dict", into=dict): into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows ] - elif object_dtype_cols: - is_object_dtype_by_index = [col in object_dtype_cols for col in columns] - return [ - into_c( - zip( - columns, - [ - maybe_box_native(v) - if is_object_dtype_by_index[i] - else v - for i, v in enumerate(t) - ], - ) - ) - for t in self.itertuples(index=False, name=None) - ] else: - return [ + data = [ into_c(zip(columns, t)) for t in self.itertuples(index=False, name=None) ] + if object_dtype_cols: + for row in data: + for col in object_dtype_cols: + row[col] = maybe_box_native(row[col]) + return data elif orient == "index": if not self.index.is_unique: From e07f02c3a4ad0e9ceadcbd7ded9bf7c53eb5025c Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Fri, 29 Apr 2022 16:37:50 +0100 Subject: [PATCH 06/13] Address PR comments --- pandas/core/frame.py | 47 ++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2fc138f4d59ff..25655acf78006 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1909,22 +1909,26 @@ def to_dict(self, orient: str = "dict", into=dict): elif orient.startswith("i"): orient = "index" - object_dtype_cols = { - col for col, dtype in self.dtypes.items() if is_object_dtype(dtype) - } - are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes) + object_dtype_indices = [ + i + for i, col_dtype in enumerate(self.dtypes.values) + if is_object_dtype(col_dtype) + ] + are_all_object_dtype_cols = len(object_dtype_indices) == len(self.dtypes) + if orient == "dict": return into_c((k, v.to_dict(into)) for k, v in self.items()) elif orient == "list": + object_dtype_indices = set(object_dtype_indices) return into_c( ( k, list(map(maybe_box_native, v.tolist())) - if k in object_dtype_cols + if i in object_dtype_indices else v.tolist(), ) - for k, v in self.items() + for i, (k, v) in enumerate(self.items()) ) elif orient == "split": @@ -1935,12 +1939,9 @@ def to_dict(self, orient: str = "dict", into=dict): ] else: data = [list(t) for t in self.itertuples(index=False, name=None)] - if object_dtype_cols: - object_dtype_indices = [ - i - for i, col in enumerate(self.columns) - if col in object_dtype_cols - ] + if object_dtype_indices: + # If we have object_dtype_cols, apply maybe_box_naive after list + # comprehension for perf for row in data: for i in object_dtype_indices: row[i] = maybe_box_native(row[i]) @@ -1960,12 +1961,9 @@ def to_dict(self, orient: str = "dict", into=dict): ] else: data = [list(t) for t in self.itertuples(index=False, name=None)] - if object_dtype_cols: - object_dtype_indices = [ - i - for i, col in enumerate(self.columns) - if col in object_dtype_cols - ] + if object_dtype_indices: + # If we have object_dtype_cols, apply maybe_box_naive after list + # comprehension for perf for row in data: for i in object_dtype_indices: row[i] = maybe_box_native(row[i]) @@ -1998,7 +1996,13 @@ def to_dict(self, orient: str = "dict", into=dict): into_c(zip(columns, t)) for t in self.itertuples(index=False, name=None) ] - if object_dtype_cols: + if object_dtype_indices: + object_dtype_indices = set(object_dtype_indices) + object_dtype_cols = { + col + for i, col in enumerate(self.columns) + if i in object_dtype_indices + } for row in data: for col in object_dtype_cols: row[col] = maybe_box_native(row[col]) @@ -2013,9 +2017,10 @@ def to_dict(self, orient: str = "dict", into=dict): (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:])))) for t in self.itertuples(name=None) ) - elif object_dtype_cols: + elif object_dtype_indices: + object_dtype_indices = set(object_dtype_indices) is_object_dtype_by_index = [ - col in object_dtype_cols for col in self.columns + i in object_dtype_indices for i in range(len(self.columns)) ] return into_c( ( From d2da86b46e3b6be7dad9e0415f42ef11e475ad9b Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Fri, 29 Apr 2022 17:27:47 +0100 Subject: [PATCH 07/13] Use as set --- pandas/core/frame.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 25655acf78006..587819876ed54 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1920,12 +1920,12 @@ def to_dict(self, orient: str = "dict", into=dict): return into_c((k, v.to_dict(into)) for k, v in self.items()) elif orient == "list": - object_dtype_indices = set(object_dtype_indices) + object_dtype_indices_as_set = set(object_dtype_indices) return into_c( ( k, list(map(maybe_box_native, v.tolist())) - if i in object_dtype_indices + if i in object_dtype_indices_as_set else v.tolist(), ) for i, (k, v) in enumerate(self.items()) @@ -1997,11 +1997,11 @@ def to_dict(self, orient: str = "dict", into=dict): for t in self.itertuples(index=False, name=None) ] if object_dtype_indices: - object_dtype_indices = set(object_dtype_indices) + object_dtype_indices_as_set = set(object_dtype_indices) object_dtype_cols = { col for i, col in enumerate(self.columns) - if i in object_dtype_indices + if i in object_dtype_indices_as_set } for row in data: for col in object_dtype_cols: @@ -2018,9 +2018,9 @@ def to_dict(self, orient: str = "dict", into=dict): for t in self.itertuples(name=None) ) elif object_dtype_indices: - object_dtype_indices = set(object_dtype_indices) + object_dtype_indices_as_set = set(object_dtype_indices) is_object_dtype_by_index = [ - i in object_dtype_indices for i in range(len(self.columns)) + i in object_dtype_indices_as_set for i in range(len(self.columns)) ] return into_c( ( From 0c0481d25dbd00882154954e95d145f7193df08a Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Tue, 10 May 2022 17:40:09 +0100 Subject: [PATCH 08/13] Add helper function --- pandas/core/frame.py | 52 ++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 587819876ed54..f8381a9485d16 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1771,6 +1771,24 @@ def to_numpy( return result + def _create_data_for_split_and_tight_to_dict( + self, are_all_object_dtype_cols, object_dtype_indices + ): + if are_all_object_dtype_cols: + data = [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ] + else: + data = [list(t) for t in self.itertuples(index=False, name=None)] + if object_dtype_indices: + # If we have object_dtype_cols, apply maybe_box_naive after list + # comprehension for perf + for row in data: + for i in object_dtype_indices: + row[i] = maybe_box_native(row[i]) + return data + def to_dict(self, orient: str = "dict", into=dict): """ Convert the DataFrame to a dictionary. @@ -1932,19 +1950,10 @@ def to_dict(self, orient: str = "dict", into=dict): ) elif orient == "split": - if are_all_object_dtype_cols: - data = [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ] - else: - data = [list(t) for t in self.itertuples(index=False, name=None)] - if object_dtype_indices: - # If we have object_dtype_cols, apply maybe_box_naive after list - # comprehension for perf - for row in data: - for i in object_dtype_indices: - row[i] = maybe_box_native(row[i]) + data = self._create_data_for_split_and_tight_to_dict( + are_all_object_dtype_cols, object_dtype_indices + ) + return into_c( ( ("index", self.index.tolist()), @@ -1954,19 +1963,10 @@ def to_dict(self, orient: str = "dict", into=dict): ) elif orient == "tight": - if are_all_object_dtype_cols: - data = [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ] - else: - data = [list(t) for t in self.itertuples(index=False, name=None)] - if object_dtype_indices: - # If we have object_dtype_cols, apply maybe_box_naive after list - # comprehension for perf - for row in data: - for i in object_dtype_indices: - row[i] = maybe_box_native(row[i]) + data = self._create_data_for_split_and_tight_to_dict( + are_all_object_dtype_cols, object_dtype_indices + ) + return into_c( ( ("index", self.index.tolist()), From 1d68f83e496ddd71787885d12255d4fc3bc01ef9 Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Wed, 11 May 2022 11:09:55 +0100 Subject: [PATCH 09/13] Add comment and types --- pandas/core/frame.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f8381a9485d16..bff9be938f136 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1772,8 +1772,12 @@ def to_numpy( return result def _create_data_for_split_and_tight_to_dict( - self, are_all_object_dtype_cols, object_dtype_indices + self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] ): + """ + Simple helper method to create data for to ``to_dict(orient="split")`` and + ``to_dict(orient="tight")`` to create the main output data + """ if are_all_object_dtype_cols: data = [ list(map(maybe_box_native, t)) From f66340d450c3ddda65e63f7b57de6ec998c8e85b Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Mon, 27 Jun 2022 11:40:03 +0100 Subject: [PATCH 10/13] Return quickly if series --- pandas/core/frame.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 44e082a1abe98..15cc20f453efd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1971,6 +1971,10 @@ def to_dict(self, orient: str = "dict", into=dict): elif orient.startswith("i"): orient = "index" + if orient == "series": + # GH46470 Return quickly if orient series to avoid creating dtype objects + return into_c((k, v) for k, v in self.items()) + object_dtype_indices = [ i for i, col_dtype in enumerate(self.dtypes.values) @@ -2021,9 +2025,6 @@ def to_dict(self, orient: str = "dict", into=dict): ) ) - elif orient == "series": - return into_c((k, v) for k, v in self.items()) - elif orient == "records": columns = self.columns.tolist() if are_all_object_dtype_cols: From 03978aa9fe07602e3b2621c0b0d8773cc2e90922 Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Tue, 28 Jun 2022 09:53:05 +0100 Subject: [PATCH 11/13] Add return value --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 15cc20f453efd..9e22403c4ba27 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1813,7 +1813,7 @@ def to_numpy( def _create_data_for_split_and_tight_to_dict( self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] - ): + ) -> list: """ Simple helper method to create data for to ``to_dict(orient="split")`` and ``to_dict(orient="tight")`` to create the main output data From ecfffa6fdf0c8b15925ea9a85ef85644dbc11d70 Mon Sep 17 00:00:00 2001 From: RogerThomas Date: Tue, 22 Nov 2022 09:25:30 +0000 Subject: [PATCH 12/13] Move doc to 2.0.0 --- doc/source/whatsnew/v1.5.0.rst | 1 - doc/source/whatsnew/v2.0.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 3ecb4bdba0e42..08dbb357c8053 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -960,7 +960,6 @@ Performance improvements - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`) - Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) -- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) - Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`) - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`) - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 04e5154ca1a0b..238e19dd21307 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -326,6 +326,7 @@ Performance improvements - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) +- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: From 36e22c05f7bbedd318aa5fb0888cad22687e6f8f Mon Sep 17 00:00:00 2001 From: RogerThomas Date: Tue, 22 Nov 2022 09:32:11 +0000 Subject: [PATCH 13/13] Revert --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 97e975e24a5a1..8188916a06008 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -603,6 +603,7 @@ Performance improvements - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) +- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`) - Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`) - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) - Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)