From 800fd17ff9b81cda97d3026ee3eebd331dbe0171 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 20 Jan 2021 11:13:25 +0700 Subject: [PATCH 1/4] REF: split describe_categorical_1d --- pandas/core/describe.py | 157 ++++++++++++++++++++-------------------- 1 file changed, 80 insertions(+), 77 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 09862b72c4a4f..78b1603d98198 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -6,7 +6,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast +from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Union, cast import warnings import numpy as np @@ -113,12 +113,11 @@ class SeriesDescriber(NDFrameDescriberAbstract): obj: "Series" def describe(self, percentiles: Sequence[float]) -> Series: - return describe_1d( + describe_func = select_describe_func( self.obj, - percentiles=percentiles, - datetime_is_numeric=self.datetime_is_numeric, - is_series=True, + self.datetime_is_numeric, ) + return describe_func(self.obj, percentiles) class DataFrameDescriber(NDFrameDescriberAbstract): @@ -155,15 +154,10 @@ def __init__( def describe(self, percentiles: Sequence[float]) -> DataFrame: data = self._select_data() - ldesc = [ - describe_1d( - series, - percentiles=percentiles, - datetime_is_numeric=self.datetime_is_numeric, - is_series=False, - ) - for _, series in data.items() - ] + ldesc: List["Series"] = [] + for _, series in data.items(): + describe_func = select_describe_func(series, self.datetime_is_numeric) + ldesc.append(describe_func(series, percentiles)) col_names = reorder_columns(ldesc) d = concat( @@ -231,55 +225,73 @@ def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> Serie return Series(d, index=stat_index, name=series.name) -def describe_categorical_1d(data: "Series", is_series: bool) -> Series: +def describe_categorical_1d( + data: "Series", + percentiles_ignored: Sequence[float], +) -> Series: """Describe series containing categorical data. Parameters ---------- data : Series Series to be described. - is_series : bool - True if the original object is a Series. - False if the one column of the DataFrame is described. + percentiles : list-like of numbers + Ignored, but in place to unify interface. + """ + names = ["count", "unique", "top", "freq"] + objcounts = data.value_counts() + count_unique = len(objcounts[objcounts != 0]) + if count_unique > 0: + top, freq = objcounts.index[0], objcounts.iloc[0] + dtype = None + else: + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + top, freq = np.nan, np.nan + dtype = "object" + + result = [data.count(), count_unique, top, freq] + + from pandas import Series + + return Series(result, index=names, name=data.name, dtype=dtype) + + +def describe_timestamp_as_categorical_1d( + data: "Series", + percentiles_ignored: Sequence[float], +) -> Series: + """Describe series containing timestamp data treated as categorical. + + Parameters + ---------- + data : Series + Series to be described. + percentiles : list-like of numbers + Ignored, but in place to unify interface. """ names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) result = [data.count(), count_unique] dtype = None - if result[1] > 0: + if count_unique > 0: top, freq = objcounts.index[0], objcounts.iloc[0] - if is_datetime64_any_dtype(data.dtype): - if is_series: - stacklevel = 6 - else: - stacklevel = 7 - warnings.warn( - "Treating datetime data as categorical rather than numeric in " - "`.describe` is deprecated and will be removed in a future " - "version of pandas. Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - stacklevel=stacklevel, - ) - tz = data.dt.tz - asint = data.dropna().values.view("i8") - top = Timestamp(top) - if top.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - top = top.tz_convert(tz) - else: - top = top.tz_localize(tz) - names += ["top", "freq", "first", "last"] - result += [ - top, - freq, - Timestamp(asint.min(), tz=tz), - Timestamp(asint.max(), tz=tz), - ] + tz = data.dt.tz + asint = data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) else: - names += ["top", "freq"] - result += [top, freq] + top = top.tz_localize(tz) + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency @@ -317,41 +329,32 @@ def describe_timestamp_1d(data: "Series", percentiles: Sequence[float]) -> Serie return Series(d, index=stat_index, name=data.name) -def describe_1d( +def select_describe_func( data: "Series", - percentiles: Sequence[float], datetime_is_numeric: bool, - *, - is_series: bool, -) -> Series: - """Describe series. - - Parameters - ---------- - data : Series - Series to be described. - percentiles : list-like of numbers - The percentiles to include in the output. - datetime_is_numeric : bool, default False - Whether to treat datetime dtypes as numeric. - is_series : bool - True if the original object is a Series. - False if the one column of the DataFrame is described. - - Returns - ------- - Series - """ +) -> Callable: if is_bool_dtype(data.dtype): - return describe_categorical_1d(data, is_series) + describe_func = describe_categorical_1d elif is_numeric_dtype(data): - return describe_numeric_1d(data, percentiles) + describe_func = describe_numeric_1d elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: - return describe_timestamp_1d(data, percentiles) + describe_func = describe_timestamp_1d elif is_timedelta64_dtype(data.dtype): - return describe_numeric_1d(data, percentiles) + describe_func = describe_numeric_1d else: - return describe_categorical_1d(data, is_series) + describe_func = describe_categorical_1d + + if describe_func == describe_categorical_1d and is_datetime64_any_dtype(data.dtype): + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=5, + ) + describe_func = describe_timestamp_as_categorical_1d + return describe_func def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]: From 48270c5c820c31ffaa8a443b16790cbf4711fc87 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 20 Jan 2021 13:47:51 +0700 Subject: [PATCH 2/4] DOC: add docstring to select_describe_func --- pandas/core/describe.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 78b1603d98198..2e14c7b7ed0ae 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -333,6 +333,17 @@ def select_describe_func( data: "Series", datetime_is_numeric: bool, ) -> Callable: + """Select proper function for describing series based on data type. + + Parameters + ---------- + data : Series + Series to be described. + datetime_is_numeric : bool + Whether to treat datetime dtypes as numeric. + """ + describe_func: Callable + if is_bool_dtype(data.dtype): describe_func = describe_categorical_1d elif is_numeric_dtype(data): From 83fa0b53cf7671c7144a56ba031c691d2fbc5e10 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 20 Jan 2021 13:49:10 +0700 Subject: [PATCH 3/4] REF: simplify logic in select_describe_func --- pandas/core/describe.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 2e14c7b7ed0ae..a31858f1a7afc 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -348,23 +348,24 @@ def select_describe_func( describe_func = describe_categorical_1d elif is_numeric_dtype(data): describe_func = describe_numeric_1d - elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: - describe_func = describe_timestamp_1d + elif is_datetime64_any_dtype(data.dtype): + if datetime_is_numeric: + describe_func = describe_timestamp_1d + else: + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=5, + ) + describe_func = describe_timestamp_as_categorical_1d elif is_timedelta64_dtype(data.dtype): describe_func = describe_numeric_1d else: describe_func = describe_categorical_1d - if describe_func == describe_categorical_1d and is_datetime64_any_dtype(data.dtype): - warnings.warn( - "Treating datetime data as categorical rather than numeric in " - "`.describe` is deprecated and will be removed in a future " - "version of pandas. Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - stacklevel=5, - ) - describe_func = describe_timestamp_as_categorical_1d return describe_func From 18c80d31815b2094c907dc4af49893db8b011ede Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 20 Jan 2021 13:53:15 +0700 Subject: [PATCH 4/4] DOC: update param names in categorical docstring --- pandas/core/describe.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index a31858f1a7afc..3eafdafa99518 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -235,7 +235,7 @@ def describe_categorical_1d( ---------- data : Series Series to be described. - percentiles : list-like of numbers + percentiles_ignored : list-like of numbers Ignored, but in place to unify interface. """ names = ["count", "unique", "top", "freq"] @@ -267,7 +267,7 @@ def describe_timestamp_as_categorical_1d( ---------- data : Series Series to be described. - percentiles : list-like of numbers + percentiles_ignored : list-like of numbers Ignored, but in place to unify interface. """ names = ["count", "unique"] @@ -342,15 +342,13 @@ def select_describe_func( datetime_is_numeric : bool Whether to treat datetime dtypes as numeric. """ - describe_func: Callable - if is_bool_dtype(data.dtype): - describe_func = describe_categorical_1d + return describe_categorical_1d elif is_numeric_dtype(data): - describe_func = describe_numeric_1d + return describe_numeric_1d elif is_datetime64_any_dtype(data.dtype): if datetime_is_numeric: - describe_func = describe_timestamp_1d + return describe_timestamp_1d else: warnings.warn( "Treating datetime data as categorical rather than numeric in " @@ -360,13 +358,11 @@ def select_describe_func( FutureWarning, stacklevel=5, ) - describe_func = describe_timestamp_as_categorical_1d + return describe_timestamp_as_categorical_1d elif is_timedelta64_dtype(data.dtype): - describe_func = describe_numeric_1d + return describe_numeric_1d else: - describe_func = describe_categorical_1d - - return describe_func + return describe_categorical_1d def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]: