diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 09862b72c4a4f..3eafdafa99518 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -6,7 +6,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast +from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Union, cast import warnings import numpy as np @@ -113,12 +113,11 @@ class SeriesDescriber(NDFrameDescriberAbstract): obj: "Series" def describe(self, percentiles: Sequence[float]) -> Series: - return describe_1d( + describe_func = select_describe_func( self.obj, - percentiles=percentiles, - datetime_is_numeric=self.datetime_is_numeric, - is_series=True, + self.datetime_is_numeric, ) + return describe_func(self.obj, percentiles) class DataFrameDescriber(NDFrameDescriberAbstract): @@ -155,15 +154,10 @@ def __init__( def describe(self, percentiles: Sequence[float]) -> DataFrame: data = self._select_data() - ldesc = [ - describe_1d( - series, - percentiles=percentiles, - datetime_is_numeric=self.datetime_is_numeric, - is_series=False, - ) - for _, series in data.items() - ] + ldesc: List["Series"] = [] + for _, series in data.items(): + describe_func = select_describe_func(series, self.datetime_is_numeric) + ldesc.append(describe_func(series, percentiles)) col_names = reorder_columns(ldesc) d = concat( @@ -231,55 +225,73 @@ def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> Serie return Series(d, index=stat_index, name=series.name) -def describe_categorical_1d(data: "Series", is_series: bool) -> Series: +def describe_categorical_1d( + data: "Series", + percentiles_ignored: Sequence[float], +) -> Series: """Describe series containing categorical data. Parameters ---------- data : Series Series to be described. - is_series : bool - True if the original object is a Series. - False if the one column of the DataFrame is described. + percentiles_ignored : list-like of numbers + Ignored, but in place to unify interface. + """ + names = ["count", "unique", "top", "freq"] + objcounts = data.value_counts() + count_unique = len(objcounts[objcounts != 0]) + if count_unique > 0: + top, freq = objcounts.index[0], objcounts.iloc[0] + dtype = None + else: + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + top, freq = np.nan, np.nan + dtype = "object" + + result = [data.count(), count_unique, top, freq] + + from pandas import Series + + return Series(result, index=names, name=data.name, dtype=dtype) + + +def describe_timestamp_as_categorical_1d( + data: "Series", + percentiles_ignored: Sequence[float], +) -> Series: + """Describe series containing timestamp data treated as categorical. + + Parameters + ---------- + data : Series + Series to be described. + percentiles_ignored : list-like of numbers + Ignored, but in place to unify interface. """ names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) result = [data.count(), count_unique] dtype = None - if result[1] > 0: + if count_unique > 0: top, freq = objcounts.index[0], objcounts.iloc[0] - if is_datetime64_any_dtype(data.dtype): - if is_series: - stacklevel = 6 - else: - stacklevel = 7 - warnings.warn( - "Treating datetime data as categorical rather than numeric in " - "`.describe` is deprecated and will be removed in a future " - "version of pandas. Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - stacklevel=stacklevel, - ) - tz = data.dt.tz - asint = data.dropna().values.view("i8") - top = Timestamp(top) - if top.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - top = top.tz_convert(tz) - else: - top = top.tz_localize(tz) - names += ["top", "freq", "first", "last"] - result += [ - top, - freq, - Timestamp(asint.min(), tz=tz), - Timestamp(asint.max(), tz=tz), - ] + tz = data.dt.tz + asint = data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) else: - names += ["top", "freq"] - result += [top, freq] + top = top.tz_localize(tz) + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency @@ -317,41 +329,40 @@ def describe_timestamp_1d(data: "Series", percentiles: Sequence[float]) -> Serie return Series(d, index=stat_index, name=data.name) -def describe_1d( +def select_describe_func( data: "Series", - percentiles: Sequence[float], datetime_is_numeric: bool, - *, - is_series: bool, -) -> Series: - """Describe series. +) -> Callable: + """Select proper function for describing series based on data type. Parameters ---------- data : Series Series to be described. - percentiles : list-like of numbers - The percentiles to include in the output. - datetime_is_numeric : bool, default False + datetime_is_numeric : bool Whether to treat datetime dtypes as numeric. - is_series : bool - True if the original object is a Series. - False if the one column of the DataFrame is described. - - Returns - ------- - Series """ if is_bool_dtype(data.dtype): - return describe_categorical_1d(data, is_series) + return describe_categorical_1d elif is_numeric_dtype(data): - return describe_numeric_1d(data, percentiles) - elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: - return describe_timestamp_1d(data, percentiles) + return describe_numeric_1d + elif is_datetime64_any_dtype(data.dtype): + if datetime_is_numeric: + return describe_timestamp_1d + else: + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=5, + ) + return describe_timestamp_as_categorical_1d elif is_timedelta64_dtype(data.dtype): - return describe_numeric_1d(data, percentiles) + return describe_numeric_1d else: - return describe_categorical_1d(data, is_series) + return describe_categorical_1d def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]: