From 0d92965d60233c1793fb306ab9aa80fea1a3c87f Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 18 May 2023 23:06:25 +0100 Subject: [PATCH 1/5] BUG: dtype of DataFrame.idxmax/idxmin incorrect for empty frames --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/frame.py | 47 ++++++++++++++------------- pandas/tests/frame/test_reductions.py | 24 ++++++++++++++ 3 files changed, 49 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 3425d0c523dab..f53de2b60ee6d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -427,6 +427,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`crosstab` when ``dropna=False`` would not keep ``np.nan`` in the result (:issue:`10772`) - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when dist-like argument passed in (:issue:`51099`) +- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`) - Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`) - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8f6698cab000c..fa289f3d762e9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11161,40 +11161,41 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: def idxmin( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: - axis = self._get_axis_number(axis) - if numeric_only: - data = self._get_numeric_data() - else: - data = self - - res = data._reduce( - nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False + return self._idxminmax( + op_name="idxmin", 
axis=axis, skipna=skipna, numeric_only=numeric_only ) - indices = res._values - - # indices will always be np.ndarray since axis is not None and - # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy - - index = data._get_axis(axis) - result = [index[i] if i >= 0 else np.nan for i in indices] - final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) - return final_result.__finalize__(self, method="idxmin") @doc(_shared_docs["idxmax"], numeric_only_default="False") def idxmax( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False + ) -> Series: + return self._idxminmax( + op_name="idxmax", axis=axis, skipna=skipna, numeric_only=numeric_only + ) + + def _idxminmax( + self, + op_name: str, + axis: Axis = 0, + skipna: bool = True, + numeric_only: bool = False, ) -> Series: axis = self._get_axis_number(axis) + if self.empty: + axis_dtype = self.axes[axis].dtype + return self._constructor_sliced(dtype=axis_dtype) + + arg, arg_name = { + "idxmin": (nanops.nanargmin, "argmin"), + "idxmax": (nanops.nanargmax, "argmax"), + }[op_name] + if numeric_only: data = self._get_numeric_data() else: data = self - res = data._reduce( - nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False - ) + res = data._reduce(arg, arg_name, axis=axis, skipna=skipna, numeric_only=False) indices = res._values # indices will always be np.ndarray since axis is not None and @@ -11205,7 +11206,7 @@ def idxmax( index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) - return final_result.__finalize__(self, method="idxmax") + return final_result.__finalize__(self, method=op_name) def _get_agg_axis(self, axis_num: int) -> Index: """ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 
096f6fe83ea88..78576ef777ba8 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -964,6 +964,18 @@ def test_idxmin(self, float_frame, int_frame, skipna, axis): expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("axis", [0, 1]) + def test_idxmin_empty(self, index, skipna, axis): + # GH53265 + if axis == 0: + frame = DataFrame(index=index[:0]) + else: + frame = DataFrame(columns=index[:0]) + + result = frame.idxmin(axis=axis, skipna=skipna) + expected = Series(dtype=frame.axes[axis].dtype) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmin_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -992,6 +1004,18 @@ def test_idxmax(self, float_frame, int_frame, skipna, axis): expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("axis", [0, 1]) + def test_idxmax_empty(self, index, skipna, axis): + # GH53265 + if axis == 0: + frame = DataFrame(index=index[:0]) + else: + frame = DataFrame(columns=index[:0]) + + result = frame.idxmax(axis=axis, skipna=skipna) + expected = Series(dtype=frame.axes[axis].dtype) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmax_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) From 0df0574376594866797872dae0f618cded254845 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 19 May 2023 06:52:33 +0100 Subject: [PATCH 2/5] simplify --- pandas/core/frame.py | 49 ++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fa289f3d762e9..3defbe31ecc1c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11161,41 
+11161,50 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: def idxmin( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: - return self._idxminmax( - op_name="idxmin", axis=axis, skipna=skipna, numeric_only=numeric_only + axis = self._get_axis_number(axis) + + if self.empty: + axis_dtype = self.axes[axis].dtype + return self._constructor_sliced(dtype=axis_dtype) + + if numeric_only: + data = self._get_numeric_data() + else: + data = self + + res = data._reduce( + nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False ) + indices = res._values + + # indices will always be np.ndarray since axis is not None and + # values is a 2d array for DataFrame + # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" + assert isinstance(indices, np.ndarray) # for mypy + + index = data._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) + return final_result.__finalize__(self, method="idxmin") @doc(_shared_docs["idxmax"], numeric_only_default="False") def idxmax( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False - ) -> Series: - return self._idxminmax( - op_name="idxmax", axis=axis, skipna=skipna, numeric_only=numeric_only - ) - - def _idxminmax( - self, - op_name: str, - axis: Axis = 0, - skipna: bool = True, - numeric_only: bool = False, ) -> Series: axis = self._get_axis_number(axis) + if self.empty: axis_dtype = self.axes[axis].dtype return self._constructor_sliced(dtype=axis_dtype) - arg, arg_name = { - "idxmin": (nanops.nanargmin, "argmin"), - "idxmax": (nanops.nanargmax, "argmax"), - }[op_name] - if numeric_only: data = self._get_numeric_data() else: data = self - res = data._reduce(arg, arg_name, axis=axis, skipna=skipna, numeric_only=False) + res = data._reduce( + nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False + ) indices = res._values # 
indices will always be np.ndarray since axis is not None and @@ -11206,7 +11215,7 @@ def _idxminmax( index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) - return final_result.__finalize__(self, method=op_name) + return final_result.__finalize__(self, method="idxmax") def _get_agg_axis(self, axis_num: int) -> Index: """ From c0530d257546c6da27999c7235827e230e51619b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 19 May 2023 09:35:17 +0100 Subject: [PATCH 3/5] fix groupby test failures --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3defbe31ecc1c..7d478dc55ce08 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11163,7 +11163,7 @@ def idxmin( ) -> Series: axis = self._get_axis_number(axis) - if self.empty: + if self.empty and len(self.axes[axis]): axis_dtype = self.axes[axis].dtype return self._constructor_sliced(dtype=axis_dtype) @@ -11193,7 +11193,7 @@ def idxmax( ) -> Series: axis = self._get_axis_number(axis) - if self.empty: + if self.empty and len(self.axes[axis]): axis_dtype = self.axes[axis].dtype return self._constructor_sliced(dtype=axis_dtype) From cde37fbf3dc9d5aa8007fcbe0983c9460c5df5b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Fri, 19 May 2023 17:18:09 +0200 Subject: [PATCH 4/5] DOC: Fixing EX01 - Added examples (#53292) * Added examples for Interval and IntervalArray * changed code_checks.sh * forgot these --- ci/code_checks.sh | 12 ---- pandas/_libs/interval.pyx | 32 +++++++++++ pandas/core/arrays/interval.py | 100 +++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 12 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 433118e648827..a82ba18d6798e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -172,16 +172,6 @@ if [[ -z "$CHECK" || 
"$CHECK" == "docstrings" ]]; then pandas.Period.asfreq \ pandas.Period.now \ pandas.arrays.PeriodArray \ - pandas.Interval.closed \ - pandas.Interval.left \ - pandas.Interval.length \ - pandas.Interval.right \ - pandas.arrays.IntervalArray.left \ - pandas.arrays.IntervalArray.right \ - pandas.arrays.IntervalArray.closed \ - pandas.arrays.IntervalArray.mid \ - pandas.arrays.IntervalArray.length \ - pandas.arrays.IntervalArray.is_non_overlapping_monotonic \ pandas.arrays.IntervalArray.from_arrays \ pandas.arrays.IntervalArray.to_tuples \ pandas.Int8Dtype \ @@ -310,9 +300,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.CategoricalIndex.as_ordered \ pandas.CategoricalIndex.as_unordered \ pandas.CategoricalIndex.equals \ - pandas.IntervalIndex.closed \ pandas.IntervalIndex.values \ - pandas.IntervalIndex.is_non_overlapping_monotonic \ pandas.IntervalIndex.to_tuples \ pandas.MultiIndex.dtypes \ pandas.MultiIndex.drop \ diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index fe405b98f218c..074e9b19eaf72 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -188,6 +188,14 @@ cdef class IntervalMixin: See Also -------- Interval.is_empty : Indicates if an interval contains no points. + + Examples + -------- + >>> interval = pd.Interval(left=1, right=2, closed='left') + >>> interval + Interval(1, 2, closed='left') + >>> interval.length + 1 """ return self.right - self.left @@ -369,11 +377,27 @@ cdef class Interval(IntervalMixin): cdef readonly object left """ Left bound for the interval. + + Examples + -------- + >>> interval = pd.Interval(left=1, right=2, closed='left') + >>> interval + Interval(1, 2, closed='left') + >>> interval.left + 1 """ cdef readonly object right """ Right bound for the interval. 
+ + Examples + -------- + >>> interval = pd.Interval(left=1, right=2, closed='left') + >>> interval + Interval(1, 2, closed='left') + >>> interval.right + 2 """ cdef readonly str closed @@ -381,6 +405,14 @@ cdef class Interval(IntervalMixin): String describing the inclusive side the intervals. Either ``left``, ``right``, ``both`` or ``neither``. + + Examples + -------- + >>> interval = pd.Interval(left=1, right=2, closed='left') + >>> interval + Interval(1, 2, closed='left') + >>> interval.closed + 'left' """ def __init__(self, left, right, str closed="right"): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 48540113e8766..2303f8334c07c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1266,6 +1266,17 @@ def _format_space(self) -> str: def left(self): """ Return the left endpoints of each Interval in the IntervalArray as an Index. + + Examples + -------- + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(2, 5)]) + >>> interv_arr + <IntervalArray> + [(0, 1], (2, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.left + Index([0, 2], dtype='int64') """ from pandas import Index @@ -1275,6 +1286,17 @@ def left(self): def right(self): """ Return the right endpoints of each Interval in the IntervalArray as an Index. + + Examples + -------- + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(2, 5)]) + >>> interv_arr + <IntervalArray> + [(0, 1], (2, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.right + Index([1, 5], dtype='int64') """ from pandas import Index @@ -1284,6 +1306,17 @@ def right(self): def length(self) -> Index: """ Return an Index with entries denoting the length of each Interval. 
+ + Examples + -------- + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + <IntervalArray> + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.length + Index([1, 4], dtype='int64') """ return self.right - self.left @@ -1291,6 +1324,17 @@ def length(self) -> Index: def mid(self) -> Index: """ Return the midpoint of each Interval in the IntervalArray as an Index. + + Examples + -------- + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + <IntervalArray> + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.mid + Index([0.5, 3.0], dtype='float64') """ try: return 0.5 * (self.left + self.right) @@ -1378,6 +1422,27 @@ def closed(self) -> IntervalClosedType: String describing the inclusive side the intervals. Either ``left``, ``right``, ``both`` or ``neither``. + + Examples + -------- + + For arrays: + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + <IntervalArray> + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.closed + 'right' + + For Interval Index: + + >>> interv_idx = pd.interval_range(start=0, end=2) + >>> interv_idx + IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') + >>> interv_idx.closed + 'right' """ return self.dtype.closed @@ -1436,6 +1501,41 @@ def set_closed(self, closed: IntervalClosedType) -> Self: Non-overlapping means (no Intervals share points), and monotonic means either monotonic increasing or monotonic decreasing. + + Examples + -------- + For arrays: + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + <IntervalArray> + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.is_non_overlapping_monotonic + True + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), + ... 
pd.Interval(-1, 0.1)]) + >>> interv_arr + <IntervalArray> + [(0.0, 1.0], (-1.0, 0.1]] + Length: 2, dtype: interval[float64, right] + >>> interv_arr.is_non_overlapping_monotonic + False + + For Interval Index: + + >>> interv_idx = pd.interval_range(start=0, end=2) + >>> interv_idx + IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') + >>> interv_idx.is_non_overlapping_monotonic + True + + >>> interv_idx = pd.interval_range(start=0, end=2, closed='both') + >>> interv_idx + IntervalIndex([[0, 1], [1, 2]], dtype='interval[int64, both]') + >>> interv_idx.is_non_overlapping_monotonic + False """ @property From 855673fdc5d92bdaa9fd096f48ba23223cde47c5 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 19 May 2023 16:11:45 +0100 Subject: [PATCH 5/5] fix DataFrame.[idxmax|idxmin] --- pandas/tests/frame/test_reductions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 78576ef777ba8..83e7f5c5c103d 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -968,12 +968,12 @@ def test_idxmin(self, float_frame, int_frame, skipna, axis): def test_idxmin_empty(self, index, skipna, axis): # GH53265 if axis == 0: - frame = DataFrame(index=index[:0]) + frame = DataFrame(index=index) else: - frame = DataFrame(columns=index[:0]) + frame = DataFrame(columns=index) result = frame.idxmin(axis=axis, skipna=skipna) - expected = Series(dtype=frame.axes[axis].dtype) + expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numeric_only", [True, False]) @@ -1008,12 +1008,12 @@ def test_idxmax(self, float_frame, int_frame, skipna, axis): def test_idxmax_empty(self, index, skipna, axis): # GH53265 if axis == 0: - frame = DataFrame(index=index[:0]) + frame = DataFrame(index=index) else: - frame = DataFrame(columns=index[:0]) + frame = DataFrame(columns=index) result = frame.idxmax(axis=axis, 
skipna=skipna) - expected = Series(dtype=frame.axes[axis].dtype) + expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numeric_only", [True, False])