From b2fb9b1a75cf6f57f3b971f7b906f71ad2811370 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 20 Mar 2024 16:51:13 -0700 Subject: [PATCH 1/6] PERF: DataFrame(dict) returns RangeIndex columns when possible --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/api.py | 13 ++----------- pandas/core/internals/construction.py | 3 ++- pandas/tests/frame/test_constructors.py | 5 +++++ 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 10d5a518f686d..263470d40c546 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -263,6 +263,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - :attr:`Categorical.categories` returns a :class:`RangeIndex` columns instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`) - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`) +- :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`?`) - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index a8887a21afa34..9b05eb42c6d6e 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,6 +1,5 @@ from __future__ import annotations -import textwrap from typing import ( TYPE_CHECKING, cast, @@ -23,6 +22,7 @@ ensure_index, ensure_index_from_sequences, get_unanimous_names, + maybe_sequence_to_range, ) from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.datetimes import DatetimeIndex @@ -34,16 +34,6 @@ if TYPE_CHECKING: from pandas._typing import Axis -_sort_msg = textwrap.dedent( - """\ -Sorting because non-concatenation axis is not aligned. A future version -of pandas will change to not sort by default. - -To accept the future behavior, pass 'sort=False'. - -To retain the current behavior and silence the warning, pass 'sort=True'. -""" -) __all__ = [ @@ -66,6 +56,7 @@ "all_indexes_same", "default_index", "safe_sort_index", + "maybe_sequence_to_range", ] diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 93f1674fbd328..73b93110c9018 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -60,6 +60,7 @@ default_index, ensure_index, get_objs_combined_axis, + maybe_sequence_to_range, union_indexes, ) from pandas.core.internals.blocks import ( @@ -403,7 +404,7 @@ def dict_to_mgr( arrays[i] = arr else: - keys = list(data.keys()) + keys = maybe_sequence_to_range(list(data.keys())) columns = Index(keys) if keys else default_index(0) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 7d1a5b4492740..12d8269b640fc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2709,6 +2709,11 @@ def test_inference_on_pandas_objects(self): result = DataFrame({"a": ser}) assert result.dtypes.iloc[0] == np.object_ + def test_dict_keys_returns_rangeindex(self): + result = DataFrame({0: [1], 1: [2]}).columns + expected = RangeIndex(2) + tm.assert_index_equal(result, expected, exact=True) + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): From 35c68b906ce178b900ab676c4f62a76905c41f82 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 20 Mar 2024 16:53:05 -0700 Subject: [PATCH 2/6] add whatsnew note --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 263470d40c546..8059b3eb21c73 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -262,8 +262,8 @@ Removal of prior version deprecations/changes Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - :attr:`Categorical.categories` returns a :class:`RangeIndex` columns instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`) +- :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`) - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`) -- :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`?`) - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`) From 340fedf14af19be15db3e917a9c61685bc397a80 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Mar 2024 09:40:19 -0700 Subject: [PATCH 3/6] Fix test failures --- pandas/core/indexes/base.py | 6 +++++- pandas/tests/reshape/test_pivot.py | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 62facb89a2f16..9857e5ff0cbdc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7171,7 +7171,11 @@ def maybe_sequence_to_range(sequence) -> Any | range: """ if isinstance(sequence, (ABCSeries, Index)): return sequence - np_sequence = np.asarray(sequence) + try: + np_sequence = np.asarray(sequence) + except ValueError: + # sequence has nested values + return sequence if np_sequence.dtype.kind != "i" or len(np_sequence) == 1: return sequence elif len(np_sequence) == 0: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 99250dc929997..f750d5e7fa919 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1738,6 +1738,7 @@ def test_daily(self): mask = ts.index.year == y expected[y] = Series(ts.values[mask], index=doy[mask]) expected = DataFrame(expected, dtype=float).T + expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(result, expected) def test_monthly(self): @@ -1753,6 +1754,7 @@ def test_monthly(self): mask = ts.index.year == y expected[y] = Series(ts.values[mask], index=month[mask]) expected = DataFrame(expected, dtype=float).T + expected.index = expected.index.astype(np.int32) tm.assert_frame_equal(result, expected) def test_pivot_table_with_iterator_values(self, data): From 9dda9c5c2f711f6f92e1d69b0cd508bebd772a9f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Mar 2024 10:18:16 -0700 Subject: [PATCH 4/6] Only 1 ndim --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3f36edf7924a1..08e6840ce4335 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7176,7 +7176,7 @@ def maybe_sequence_to_range(sequence) -> Any | range: except ValueError: # sequence has nested values return sequence - if np_sequence.dtype.kind != "i" or len(np_sequence) == 1: + if np_sequence.dtype.kind != "i" or len(np_sequence) == 1 or np_sequence.ndim != 1: return sequence elif len(np_sequence) == 0: return range(0) From ac7bdf0927ffb7a440318411108f89ed9f379ba5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:27:05 -0700 Subject: [PATCH 5/6] Use infer_dtype --- pandas/core/indexes/base.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 08e6840ce4335..4d4dbedace50d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7171,20 +7171,15 @@ def maybe_sequence_to_range(sequence) -> Any | range: """ if isinstance(sequence, (ABCSeries, Index, range)): return sequence - try: - np_sequence = np.asarray(sequence) - except ValueError: - # sequence has nested values - return sequence - if np_sequence.dtype.kind != "i" or len(np_sequence) == 1 or np_sequence.ndim != 1: + elif len(sequence) == 1 or lib.infer_dtype(sequence) != "integer": return sequence - elif len(np_sequence) == 0: + elif len(sequence) == 0: return range(0) - diff = np_sequence[1] - np_sequence[0] + diff = sequence[1] - sequence[0] if diff == 0: return sequence - elif len(np_sequence) == 2 or lib.is_sequence_range(np_sequence, diff): - return range(np_sequence[0], np_sequence[-1] + diff, diff) + elif len(sequence) == 2 or lib.is_sequence_range(np.asarray(sequence), diff): + return range(sequence[0], sequence[-1] + diff, diff) else: return sequence From 61b9db4d8175aefd9b0a37994b14f13d30081045 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:58:49 -0700 Subject: [PATCH 6/6] Skip EA, skipna=False --- pandas/core/indexes/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4d4dbedace50d..e59c0542ee6da 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7169,9 +7169,9 @@ def maybe_sequence_to_range(sequence) -> Any | range: ------- Any : input or range """ - if isinstance(sequence, (ABCSeries, Index, range)): + if isinstance(sequence, (ABCSeries, Index, range, ExtensionArray)): return sequence - elif len(sequence) == 1 or lib.infer_dtype(sequence) != "integer": + elif len(sequence) == 1 or lib.infer_dtype(sequence, skipna=False) != "integer": return sequence elif len(sequence) == 0: return range(0)