From e3b0b5d7d250f46d2e7be928e44b98f48bde72c5 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Thu, 20 Mar 2025 14:12:05 +0700 Subject: [PATCH 1/6] Fix bug in `~Series.describe` where median percentile is included when the `percentiles` argument is passed --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/methods/describe.py | 11 ++++---- pandas/io/formats/format.py | 3 +++ pandas/tests/frame/methods/test_describe.py | 30 +++++++++++++++++++++ 4 files changed, 40 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b4aa6447c0a1b..cd0eb5183ebd0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -838,6 +838,7 @@ Other - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`MultiIndex.fillna` error message was referring to ``isna`` instead of ``fillna`` (:issue:`60974`) +- Bug in :meth:`Series.describe` where median percentile is included when the ``percentiles`` argument is passed (:issue:`60550`). - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. 
(:issue:`57355`) - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`) diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index 17d4d38c97f33..944e28a9b0238 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -229,10 +229,15 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: formatted_percentiles = format_percentiles(percentiles) + if len(percentiles) == 0: + quantiles = [] + else: + quantiles = series.quantile(percentiles).tolist() + stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] d = ( [series.count(), series.mean(), series.std(), series.min()] - + series.quantile(percentiles).tolist() + + quantiles + [series.max()] ) # GH#48340 - always return float on non-complex numeric data @@ -354,10 +359,6 @@ def _refine_percentiles( # get them all to be in [0, 1] validate_percentile(percentiles) - # median should always be included - if 0.5 not in percentiles: - percentiles.append(0.5) - percentiles = np.asarray(percentiles) # sort and check for duplicates diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b7fbc4e5e22b7..fb799361fea67 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1565,6 +1565,9 @@ def format_percentiles( >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] """ + if len(percentiles) == 0: + return [] + percentiles = np.asarray(percentiles) # It checks for np.nan as well diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index e9206e86b7b08..28c3fd8d89a43 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -413,3 +413,33 @@ def test_describe_exclude_pa_dtype(self): dtype=pd.ArrowDtype(pa.float64()), ) tm.assert_frame_equal(result, 
expected) + + @pytest.mark.parametrize("percentiles", [None, [], [0.2]]) + def test_refine_percentiles(self, percentiles): + """ + Test that the percentiles are returned correctly depending on the `percentiles` + argument. + - The default behavior is to return the 25th, 50th, and 75th percentiles + - If `percentiles` is an empty list, no percentiles are returned + - If `percentiles` is a non-empty list, only those percentiles are returned + """ + # GH#60550 + df = DataFrame({"a": np.arange(0, 10, 1)}) + + result = df.describe(percentiles=percentiles) + + if percentiles is None: + percentiles = [0.25, 0.5, 0.75] + + expected = Series( + { + "count": len(df.a), + "mean": df.a.mean(), + "std": df.a.std(), + "min": df.a.min(), + **{f"{p:.0%}": df.a.quantile(p) for p in percentiles}, + "max": df.a.max(), + }, + ).to_frame(name="a") + + tm.assert_frame_equal(result, expected) From 8bb3cf36d8e1ceb3c7751e387bec4a0a86912495 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Fri, 21 Mar 2025 09:41:55 +0700 Subject: [PATCH 2/6] Refine docstrings --- pandas/core/generic.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0c3f535df9ce2..e85402a46d6ec 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10818,9 +10818,12 @@ def describe( ---------- percentiles : list-like of numbers, optional The percentiles to include in the output. All should - fall between 0 and 1. The default is - ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles. + fall between 0 and 1. Here are the options: + + - A list-like of numbers : To include the percentiles listed. If + that list is empty, no percentiles will be returned. + - None (default) : To include the default percentiles, which are the + 25th, 50th, and 75th ones. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. Ignored for ``Series``. 
Here are the options: From a0a0c630adc74f19dabd1f9d8454ac4b81b84a19 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Fri, 21 Mar 2025 10:28:32 +0700 Subject: [PATCH 3/6] Update test_describe in groupby --- pandas/tests/groupby/methods/test_describe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index 6c4b913574d9e..72bdfebc5eeb7 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -202,15 +202,15 @@ def test_describe_duplicate_columns(): gb = df.groupby(df[1]) result = gb.describe(percentiles=[]) - columns = ["count", "mean", "std", "min", "50%", "max"] + columns = ["count", "mean", "std", "min", "max"] frames = [ - DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + DataFrame([[1.0, val, np.nan, val, val]], index=[1], columns=columns) for val in (0.0, 2.0, 3.0) ] expected = pd.concat(frames, axis=1) expected.columns = MultiIndex( levels=[[0, 2], columns], - codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + codes=[5 * [0] + 5 * [1] + 5 * [0], 3 * list(range(5))], ) expected.index.names = [1] tm.assert_frame_equal(result, expected) From bf1effa1ad3f3f88b42ccfec2fdc305eabc51e93 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Fri, 21 Mar 2025 23:14:55 +0700 Subject: [PATCH 4/6] Minor fixes --- pandas/core/generic.py | 8 ++------ pandas/tests/frame/methods/test_describe.py | 5 +++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e85402a46d6ec..a7a287de0241e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10818,12 +10818,8 @@ def describe( ---------- percentiles : list-like of numbers, optional The percentiles to include in the output. All should - fall between 0 and 1. Here are the options: - - - A list-like of numbers : To include the percentiles listed. 
If - that list is empty, no percentiles will be returned. - - None (default) : To include the default percentiles, which are the - 25th, 50th, and 75th ones. + fall between 0 and 1. The default, ``None``, will automatically + return the 25th, 50th, and 75th percentiles. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. Ignored for ``Series``. Here are the options: diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 28c3fd8d89a43..401b65ed61a25 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -431,7 +431,7 @@ def test_refine_percentiles(self, percentiles): if percentiles is None: percentiles = [0.25, 0.5, 0.75] - expected = Series( + expected = DataFrame( { "count": len(df.a), "mean": df.a.mean(), @@ -440,6 +440,7 @@ def test_refine_percentiles(self, percentiles): **{f"{p:.0%}": df.a.quantile(p) for p in percentiles}, "max": df.a.max(), }, - ).to_frame(name="a") + index=["a"], + ).T tm.assert_frame_equal(result, expected) From 28756ad16298aa8d3f6ab468bc9b7b6849182da8 Mon Sep 17 00:00:00 2001 From: Martin Braquet Date: Fri, 21 Mar 2025 23:17:24 +0700 Subject: [PATCH 5/6] Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cd0eb5183ebd0..bad06329c4bfa 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -838,7 +838,7 @@ Other - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. 
``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`MultiIndex.fillna` error message was referring to ``isna`` instead of ``fillna`` (:issue:`60974`) -- Bug in :meth:`Series.describe` where median percentile is included when the ``percentiles`` argument is passed (:issue:`60550`). +- Bug in :meth:`Series.describe` where median percentile was always included when the ``percentiles`` argument was passed (:issue:`60550`). - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`) From 5ed786c67458e7840aff4e1b3e3a9a82dc294328 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Sat, 22 Mar 2025 02:01:52 +0700 Subject: [PATCH 6/6] Refactor expected df to avoid transpose --- pandas/tests/frame/methods/test_describe.py | 30 ++++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 401b65ed61a25..50656ca85e90a 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -432,15 +432,25 @@ def test_refine_percentiles(self, percentiles): percentiles = [0.25, 0.5, 0.75] expected = DataFrame( - { - "count": len(df.a), - "mean": df.a.mean(), - "std": df.a.std(), - "min": df.a.min(), - **{f"{p:.0%}": df.a.quantile(p) for p in percentiles}, - "max": df.a.max(), - }, - index=["a"], - ).T + [ + len(df.a), + df.a.mean(), + df.a.std(), + df.a.min(), + *[df.a.quantile(p) for p in percentiles], + df.a.max(), + ], + index=pd.Index( + [ + "count", + "mean", + "std", + "min", + *[f"{p:.0%}" for p in percentiles], + "max", + ] + ), + columns=["a"], + ) tm.assert_frame_equal(result, expected)