From 18de335d5318d278d36d894772c57b463c285df5 Mon Sep 17 00:00:00 2001 From: GrimmXoXo Date: Mon, 10 Mar 2025 05:47:51 +0530 Subject: [PATCH 1/5] Updated infer_dtype docstring --- pandas/_libs/lib.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3c509a3eae11a..7fec5d9d5abd3 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1518,7 +1518,7 @@ cdef object _try_infer_map(object dtype): def infer_dtype(value: object, skipna: bool = True) -> str: """ - Return a string label of the type of a scalar or list-like of values. + Return a string label of the type of the elements in a list-like input. This method inspects the elements of the provided input and determines classification of its data type. It is particularly useful for @@ -1527,8 +1527,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str: Parameters ---------- - value : scalar, list, ndarray, or pandas type - The input data to infer the dtype. + value : list, ndarray, or pandas type + The input data to infer the dtype. skipna : bool, default True Ignore NaN values when inferring the type. @@ -1573,6 +1573,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: Notes ----- + - The value parameter must be an iterable; scalar inputs are not supported. 
- 'mixed' is the catchall for anything that is not otherwise specialized - 'mixed-integer-float' are floats and integers From e1c149e976a11792b7d79948993b5903d4b45585 Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Fri, 7 Mar 2025 18:30:55 -0500 Subject: [PATCH 2/5] BUG: Fix OverflowError in lib.maybe_indices_to_slice() (#61080) * BUG: Fix OverflowError in lib.maybe_indices_to_slice() This fixes this error when slicing massive dataframes: Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/opt/anaconda3/lib/python3.12/site-packages/pandas/core/frame.py", line 4093, in __getitem__ return self._getitem_bool_array(key) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/anaconda3/lib/python3.12/site-packages/pandas/core/frame.py", line 4155, in _getitem_bool_array return self._take_with_is_copy(indexer, axis=0) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/anaconda3/lib/python3.12/site-packages/pandas/core/generic.py", line 4153, in _take_with_is_copy result = self.take(indices=indices, axis=axis) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/anaconda3/lib/python3.12/site-packages/pandas/core/generic.py", line 4133, in take new_data = self._mgr.take( ^^^^^^^^^^^^^^^ File "/opt/anaconda3/lib/python3.12/site-packages/pandas/core/internals/managers.py", line 893, in take new_labels = self.axes[axis].take(indexer) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/anaconda3/lib/python3.12/site-packages/pandas/core/indexes/datetimelike.py", line 839, in take maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "lib.pyx", line 522, in pandas._libs.lib.maybe_indices_to_slice OverflowError: value too large to convert to int * Sort whatsnew entries * Set type hint back to int --------- Co-authored-by: benjamindonnachie <83379521+benjamindonnachie@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/lib.pyx | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9fab1d12fc6a5..e2e4144f3323d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -694,6 +694,7 @@ Interval Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) +- Bug in :meth:`DataFrame.__getitem__` when slicing a :class:`DataFrame` with many rows raised an ``OverflowError`` (:issue:`59531`) - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`) - Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7fec5d9d5abd3..2bd4b574d1174 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -502,7 +502,7 @@ def has_only_ints_or_nan(const floating[:] arr) -> bool: return True -def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len): +def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, intp_t max_len): cdef: Py_ssize_t i, n = len(indices) intp_t k, vstart, vlast, v From ab49f79e2d46e368171a11b71bef1a93391c4422 Mon Sep 17 00:00:00 2001 From: Anurag Varma Date: Fri, 7 Mar 2025 18:51:45 -0500 Subject: [PATCH 3/5] BUG: Fix MultiIndex from_tuples on tuples with NaNs (#60944) --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/algorithms.py | 6 +++ pandas/core/indexes/multi.py | 3 +- .../tests/indexes/multi/test_constructors.py | 13 +++++ pandas/tests/series/test_constructors.py | 52 ++++++++++++------- 5 files changed, 55 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e2e4144f3323d..ad086c4d636d5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ 
-714,7 +714,7 @@ MultiIndex - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`) - Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`) - Bug in :class:`DataFrame` arithmetic operations with :class:`Series` in case of unaligned MultiIndex (:issue:`61009`) -- +- Bug in :meth:`MultiIndex.from_tuples` causing wrong output with input of type tuples having NaN values (:issue:`60695`, :issue:`60988`) I/O ^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index aafd802b827a5..0c0232bdc6d4c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1647,6 +1647,8 @@ def map_array( If the function returns a tuple with more than one element a MultiIndex will be returned. """ + from pandas import Index + if na_action not in (None, "ignore"): msg = f"na_action must either be 'ignore' or None, {na_action} was passed" raise ValueError(msg) @@ -1676,6 +1678,10 @@ def map_array( if len(mapper) == 0: mapper = Series(mapper, dtype=np.float64) + elif isinstance(mapper, dict): + mapper = Series( + mapper.values(), index=Index(mapper.keys(), tupleize_cols=False) + ) else: mapper = Series(mapper) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 79eb1b693d866..29b34f560ab2e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -9,6 +9,7 @@ Sequence, ) from functools import wraps +from itertools import zip_longest from sys import getsizeof from typing import ( TYPE_CHECKING, @@ -588,7 +589,7 @@ def from_tuples( elif isinstance(tuples, list): arrays = list(lib.to_object_array_tuples(tuples).T) else: - arrs = zip(*tuples) + arrs = zip_longest(*tuples, fillvalue=np.nan) arrays = cast(list[Sequence[Hashable]], arrs) return cls.from_arrays(arrays, sortorder=sortorder, names=names) diff --git a/pandas/tests/indexes/multi/test_constructors.py 
b/pandas/tests/indexes/multi/test_constructors.py index b2867d4ac8e68..92827cf154394 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -410,6 +410,19 @@ def test_from_tuples_with_tuple_label(): tm.assert_frame_equal(expected, result) +@pytest.mark.parametrize( + "keys, expected", + [ + ((("l1",), ("l1", "l2")), (("l1", np.nan), ("l1", "l2"))), + ((("l1", "l2"), ("l1",)), (("l1", "l2"), ("l1", np.nan))), + ], +) +def test_from_tuples_with_various_tuple_lengths(keys, expected): + # GH 60695 + idx = MultiIndex.from_tuples(keys) + assert tuple(idx) == expected + + # ---------------------------------------------------------------------------- # from_product # ---------------------------------------------------------------------------- diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a2be698c0ec28..5f4a100e7ccc7 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1441,10 +1441,17 @@ def test_constructor_tuple_of_tuples(self): s = Series(data) assert tuple(s) == data - def test_constructor_dict_of_tuples(self): - data = {(1, 2): 3, (None, 5): 6} + @pytest.mark.parametrize( + "data, expected_values, expected_index", + [ + ({(1, 2): 3, (None, 5): 6}, [3, 6], [(1, 2), (None, 5)]), + ({(1,): 3, (4, 5): 6}, [3, 6], [(1, None), (4, 5)]), + ], + ) + def test_constructor_dict_of_tuples(self, data, expected_values, expected_index): + # GH 60695 result = Series(data).sort_values() - expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) + expected = Series(expected_values, index=MultiIndex.from_tuples(expected_index)) tm.assert_series_equal(result, expected) # https://github.com/pandas-dev/pandas/issues/22698 @@ -1860,23 +1867,30 @@ class A(OrderedDict): series = Series(A(data)) tm.assert_series_equal(series, expected) - def test_constructor_dict_multiindex(self): - d = {("a", "a"): 0.0, 
("b", "a"): 1.0, ("b", "c"): 2.0} - _d = sorted(d.items()) - result = Series(d) - expected = Series( - [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d]) - ) - tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data, expected_index_multi", + [ + ({("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}, True), + ({("a",): 0.0, ("a", "b"): 1.0}, True), + ({"z": 111.0, ("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}, False), + ], + ) + def test_constructor_dict_multiindex(self, data, expected_index_multi): + # GH#60695 + result = Series(data) - d["z"] = 111.0 - _d.insert(0, ("z", d["z"])) - result = Series(d) - expected = Series( - [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False) - ) - result = result.reindex(index=expected.index) - tm.assert_series_equal(result, expected) + if expected_index_multi: + expected = Series( + list(data.values()), + index=MultiIndex.from_tuples(list(data.keys())), + ) + tm.assert_series_equal(result, expected) + else: + expected = Series( + list(data.values()), + index=Index(list(data.keys())), + ) + tm.assert_series_equal(result, expected) def test_constructor_dict_multiindex_reindex_flat(self): # construction involves reindexing with a MultiIndex corner case From 539199a04bf11f09213b368f08d80398d87da568 Mon Sep 17 00:00:00 2001 From: GrimmXoXo Date: Mon, 10 Mar 2025 05:47:51 +0530 Subject: [PATCH 4/5] Updated infer_dtype docstring --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2bd4b574d1174..8ba6098029895 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1528,7 +1528,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: Parameters ---------- value : list, ndarray, or pandas type - The input data to infer the dtype. + The input data to infer the dtype. skipna : bool, default True Ignore NaN values when inferring the type. 
From 5b8fbb55182f3f0883494ebf9fb4e3ca58fc21b7 Mon Sep 17 00:00:00 2001 From: GrimmXoXo Date: Mon, 10 Mar 2025 07:47:09 +0530 Subject: [PATCH 5/5] Fix misplaced whitespace in strings --- pandas/_libs/tslibs/offsets.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index a16964435ef50..5ffa363ea3ea8 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -5108,8 +5108,8 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\'" - f" instead.", + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " + f"instead.", FutureWarning, stacklevel=find_stack_level(), ) @@ -5122,8 +5122,8 @@ def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{_name}\'" - f" instead.", + f"\'{_name}\' " + f"instead.", FutureWarning, stacklevel=find_stack_level(), )