From d9077948a3b3e66ac5f07269dd7267643e565dc7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 8 Sep 2022 16:32:26 -0400 Subject: [PATCH 1/6] REGR: .describe on unsigned dtypes results in object --- pandas/core/describe.py | 7 +++++- pandas/tests/series/methods/test_describe.py | 23 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index d265a307078b9..fc2792a5f6855 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -240,7 +240,12 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: + series.quantile(percentiles).tolist() + [series.max()] ) - return Series(d, index=stat_index, name=series.name) + + result = Series(d, index=stat_index, name=series.name) + if isinstance(d[1], float): + # GH#48340 - don't rely on inference, always return float on numeric data + result = result.astype(float) + return result def describe_categorical_1d( diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index e6c6016d2b3a1..b44700be3b760 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( Period, @@ -149,3 +150,25 @@ def test_datetime_is_numeric_includes_datetime(self): index=["count", "mean", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"] + ) + def test_numeric_result_is_float(self, dtype): + # GH#48340 - describe should always return dtype float on numeric input + ser = Series([0, 1], dtype=dtype) + result = ser.describe() + expected = Series( + [ + 2.0, + 0.5, + ser.std(), + 0, + 0.25, + 0.5, + 0.75, + 1.0, + ], + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) From 97e6cdd983094bf79b09f3b1397fbb3397a96916 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 8 Sep 2022 22:23:21 -0400 Subject: [PATCH 2/6] Improve implementation --- pandas/core/describe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index fc2792a5f6855..6dcbda8de7e5d 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -242,7 +242,7 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: ) result = Series(d, index=stat_index, name=series.name) - if isinstance(d[1], float): + if is_numeric_dtype(series): # GH#48340 - don't rely on inference, always return float on numeric data result = result.astype(float) return result From ff4a95fe86e14938747fc1d96ec44f49878d8536 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 10 Sep 2022 07:05:29 -0400 Subject: [PATCH 3/6] Set via dtype --- pandas/core/describe.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 6dcbda8de7e5d..faf9a13d9a7b0 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -241,11 +241,9 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: + [series.max()] ) - result = Series(d, index=stat_index, name=series.name) - if is_numeric_dtype(series): - # GH#48340 - don't rely on inference, always return float on numeric data - result = result.astype(float) - return result + # GH#48340 - don't rely on inference, always return float on numeric data + dtype = float if is_numeric_dtype(series) else None + return Series(d, index=stat_index, name=series.name, dtype=dtype) def describe_categorical_1d( From 359cfd781bc48a15822c0609976600ee8b134205 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 10 Sep 2022 07:06:30 -0400 Subject: [PATCH 4/6] fixup --- pandas/core/describe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index faf9a13d9a7b0..c70e54c5def0a 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -240,7 +240,6 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: + series.quantile(percentiles).tolist() + [series.max()] ) - # GH#48340 - don't rely on inference, always return float on numeric data dtype = float if is_numeric_dtype(series) else None return Series(d, index=stat_index, name=series.name, dtype=dtype) From 21b839de1fc7e5772946564b091776b932887a33 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 10 Sep 2022 08:06:34 -0400 Subject: [PATCH 5/6] More tests, fix complex --- pandas/conftest.py | 39 ++++++++++++++++++++ pandas/core/describe.py | 3 +- pandas/tests/series/methods/test_describe.py | 13 +++---- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 6f31e2a11486a..446d7f7263cbe 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1606,6 +1606,45 @@ def any_numpy_dtype(request): return request.param +@pytest.fixture( + params=tm.ALL_REAL_NUMPY_DTYPES + + tm.COMPLEX_DTYPES + + tm.ALL_INT_EA_DTYPES + + tm.FLOAT_EA_DTYPES +) +def any_numeric_dtype(request): + """ + Parameterized fixture for all numeric dtypes. + + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' + * complex + * 'complex64' + * 'complex128' + * 'UInt8' + * 'Int8' + * 'UInt16' + * 'Int16' + * 'UInt32' + * 'Int32' + * 'UInt64' + * 'Int64' + * 'Float32' + * 'Float64' + """ + return request.param + + # categoricals are handled separately _any_skipna_inferred_dtype = [ ("string", ["a", np.nan, "c"]), diff --git a/pandas/core/describe.py b/pandas/core/describe.py index c70e54c5def0a..ec8ad455a69a6 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -32,6 +32,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, + is_complex_dtype, is_datetime64_any_dtype, is_numeric_dtype, is_timedelta64_dtype, @@ -241,7 +242,7 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: + [series.max()] ) # GH#48340 - don't rely on inference, always return float on numeric data - dtype = float if is_numeric_dtype(series) else None + dtype = float if is_numeric_dtype(series) and not is_complex_dtype(series) else None return Series(d, index=stat_index, name=series.name, dtype=dtype) diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index b44700be3b760..d7650e2768781 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -1,5 +1,6 @@ import numpy as np -import pytest + +from pandas.core.dtypes.common import is_complex_dtype from pandas import ( Period, @@ -151,12 +152,9 @@ def test_datetime_is_numeric_includes_datetime(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"] - ) - def test_numeric_result_is_float(self, dtype): - # GH#48340 - describe should always return dtype float on numeric input - ser = Series([0, 1], dtype=dtype) + def test_numeric_result_dtype(self, any_numeric_dtype): + # GH#48340 - describe should always return float on non-complex numeric input + ser = Series([0, 1], dtype=any_numeric_dtype) result = ser.describe() expected = Series( [ @@ -170,5 +168,6 @@ def test_numeric_result_is_float(self, dtype): 1.0, ], index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + dtype="complex128" if is_complex_dtype(ser) else None, ) tm.assert_series_equal(result, expected) From 3e3aa9fb2f0b1d14839c4fcb0e576d60339c66ea Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 10 Sep 2022 08:08:42 -0400 Subject: [PATCH 6/6] Fix comment --- pandas/core/describe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index ec8ad455a69a6..d6546b06ec711 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -241,7 +241,7 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: + series.quantile(percentiles).tolist() + [series.max()] ) - # GH#48340 - don't rely on inference, always return float on numeric data + # GH#48340 - always return float on non-complex numeric data dtype = float if is_numeric_dtype(series) and not is_complex_dtype(series) else None return Series(d, index=stat_index, name=series.name, dtype=dtype)