diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 24c1ae971686e..b0b60fea2bf27 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -55,11 +55,7 @@
 from pandas.compat.numpy import function as nv
 from pandas.errors import AbstractMethodError, InvalidIndexError
 from pandas.util._decorators import doc, rewrite_axis_style_signature
-from pandas.util._validators import (
-    validate_bool_kwarg,
-    validate_fillna_kwargs,
-    validate_percentile,
-)
+from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
 
 from pandas.core.dtypes.common import (
     ensure_int64,
@@ -109,11 +105,8 @@
 from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window
 
 from pandas.io.formats import format as fmt
-from pandas.io.formats.format import (
-    DataFrameFormatter,
-    DataFrameRenderer,
-    format_percentiles,
-)
+from pandas.io.formats.describe import describe_ndframe
+from pandas.io.formats.format import DataFrameFormatter, DataFrameRenderer
 from pandas.io.formats.printing import pprint_thing
 
 if TYPE_CHECKING:
@@ -10237,145 +10230,13 @@ def describe(
         75%            NaN      2.5
         max            NaN      3.0
         """
-        if self.ndim == 2 and self.columns.size == 0:
-            raise ValueError("Cannot describe a DataFrame without columns")
-
-        if percentiles is not None:
-            # explicit conversion of `percentiles` to list
-            percentiles = list(percentiles)
-
-            # get them all to be in [0, 1]
-            validate_percentile(percentiles)
-
-            # median should always be included
-            if 0.5 not in percentiles:
-                percentiles.append(0.5)
-            percentiles = np.asarray(percentiles)
-        else:
-            percentiles = np.array([0.25, 0.5, 0.75])
-
-        # sort and check for duplicates
-        unique_pcts = np.unique(percentiles)
-        if len(unique_pcts) < len(percentiles):
-            raise ValueError("percentiles cannot contain duplicates")
-        percentiles = unique_pcts
-
-        formatted_percentiles = format_percentiles(percentiles)
-
-        def describe_numeric_1d(series) -> "Series":
-            stat_index = (
-                ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
-            )
-            d = (
-                [series.count(), series.mean(), series.std(), series.min()]
-                + series.quantile(percentiles).tolist()
-                + [series.max()]
-            )
-            return pd.Series(d, index=stat_index, name=series.name)
-
-        def describe_categorical_1d(data) -> "Series":
-            names = ["count", "unique"]
-            objcounts = data.value_counts()
-            count_unique = len(objcounts[objcounts != 0])
-            result = [data.count(), count_unique]
-            dtype = None
-            if result[1] > 0:
-                top, freq = objcounts.index[0], objcounts.iloc[0]
-                if is_datetime64_any_dtype(data.dtype):
-                    if self.ndim == 1:
-                        stacklevel = 4
-                    else:
-                        stacklevel = 5
-                    warnings.warn(
-                        "Treating datetime data as categorical rather than numeric in "
-                        "`.describe` is deprecated and will be removed in a future "
-                        "version of pandas. Specify `datetime_is_numeric=True` to "
-                        "silence this warning and adopt the future behavior now.",
-                        FutureWarning,
-                        stacklevel=stacklevel,
-                    )
-                    tz = data.dt.tz
-                    asint = data.dropna().values.view("i8")
-                    top = Timestamp(top)
-                    if top.tzinfo is not None and tz is not None:
-                        # Don't tz_localize(None) if key is already tz-aware
-                        top = top.tz_convert(tz)
-                    else:
-                        top = top.tz_localize(tz)
-                    names += ["top", "freq", "first", "last"]
-                    result += [
-                        top,
-                        freq,
-                        Timestamp(asint.min(), tz=tz),
-                        Timestamp(asint.max(), tz=tz),
-                    ]
-                else:
-                    names += ["top", "freq"]
-                    result += [top, freq]
-
-            # If the DataFrame is empty, set 'top' and 'freq' to None
-            # to maintain output shape consistency
-            else:
-                names += ["top", "freq"]
-                result += [np.nan, np.nan]
-                dtype = "object"
-
-            return pd.Series(result, index=names, name=data.name, dtype=dtype)
-
-        def describe_timestamp_1d(data) -> "Series":
-            # GH-30164
-            stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
-            d = (
-                [data.count(), data.mean(), data.min()]
-                + data.quantile(percentiles).tolist()
-                + [data.max()]
-            )
-            return pd.Series(d, index=stat_index, name=data.name)
-
-        def describe_1d(data) -> "Series":
-            if is_bool_dtype(data.dtype):
-                return describe_categorical_1d(data)
-            elif is_numeric_dtype(data):
-                return describe_numeric_1d(data)
-            elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
-                return describe_timestamp_1d(data)
-            elif is_timedelta64_dtype(data.dtype):
-                return describe_numeric_1d(data)
-            else:
-                return describe_categorical_1d(data)
-
-        if self.ndim == 1:
-            # Incompatible return value type
-            # (got "Series", expected "FrameOrSeries")  [return-value]
-            return describe_1d(self)  # type:ignore[return-value]
-        elif (include is None) and (exclude is None):
-            # when some numerics are found, keep only numerics
-            default_include = [np.number]
-            if datetime_is_numeric:
-                default_include.append("datetime")
-            data = self.select_dtypes(include=default_include)
-            if len(data.columns) == 0:
-                data = self
-        elif include == "all":
-            if exclude is not None:
-                msg = "exclude must be None when include is 'all'"
-                raise ValueError(msg)
-            data = self
-        else:
-            data = self.select_dtypes(include=include, exclude=exclude)
-
-        ldesc = [describe_1d(s) for _, s in data.items()]
-        # set a convenient order for rows
-        names: List[Label] = []
-        ldesc_indexes = sorted((x.index for x in ldesc), key=len)
-        for idxnames in ldesc_indexes:
-            for name in idxnames:
-                if name not in names:
-                    names.append(name)
-
-        d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
-        d.columns = data.columns.copy()
-        return d
+        return describe_ndframe(
+            data=self,
+            include=include,
+            exclude=exclude,
+            datetime_is_numeric=datetime_is_numeric,
+            percentiles=percentiles,
+        )
 
     @final
     def pct_change(
diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py
new file mode 100644
index 0000000000000..d9fa236c54899
--- /dev/null
+++ b/pandas/io/formats/describe.py
@@ -0,0 +1,468 @@
+"""Module responsible for execution of NDFrame.describe() method.
+
+Method NDFrame.describe() delegates actual execution to function describe_ndframe().
+
+Strategy pattern is utilized.
+    - The appropriate strategy is selected based on the series datatype.
+    - The strategy is responsible for running proper description.
+"""
+
+from abc import ABC, abstractmethod
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Type,
+    Union,
+    cast,
+)
+import warnings
+
+import numpy as np
+
+from pandas._libs.tslibs import Timestamp
+from pandas._typing import Dtype, FrameOrSeries, FrameOrSeriesUnion, Label
+from pandas.util._validators import validate_percentile
+
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_datetime64_any_dtype,
+    is_numeric_dtype,
+    is_timedelta64_dtype,
+)
+
+from pandas.core.reshape.concat import concat
+
+from pandas.io.formats.format import format_percentiles
+
+if TYPE_CHECKING:
+    from pandas import DataFrame, Series
+
+
+def describe_ndframe(
+    *,
+    data: FrameOrSeries,
+    include: Optional[Union[str, Sequence[str]]],
+    exclude: Optional[Union[str, Sequence[str]]],
+    datetime_is_numeric: bool,
+    percentiles: Optional[Sequence[float]],
+) -> FrameOrSeries:
+    """Describe series or dataframe.
+
+    Called from pandas.core.generic.NDFrame.describe()
+
+    Parameters
+    ----------
+    data : FrameOrSeries
+        Either dataframe or series.
+    include : 'all', list-like of dtypes or None (default), optional
+        A white list of data types to include in the result. Ignored for ``Series``.
+    exclude : list-like of dtypes or None (default), optional
+        A black list of data types to omit from the result. Ignored for ``Series``.
+    datetime_is_numeric : bool, default False
+        Whether to treat datetime dtypes as numeric.
+    percentiles : list-like of numbers, optional
+        The percentiles to include in the output. All should fall between 0 and 1.
+        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
+        75th percentiles.
+
+    Returns
+    -------
+    FrameOrSeries
+        Dataframe or series description.
+    """
+    describer: "NDFrameDescriber"
+    if data.ndim == 1:
+        series = cast("Series", data)
+        describer = SeriesDescriber(
+            data=series,
+            datetime_is_numeric=datetime_is_numeric,
+        )
+    else:
+        dataframe = cast("DataFrame", data)
+        describer = DataFrameDescriber(
+            data=dataframe,
+            include=include,
+            exclude=exclude,
+            datetime_is_numeric=datetime_is_numeric,
+        )
+    result = describer.describe(percentiles)
+    return cast(FrameOrSeries, result)
+
+
+class StrategyCreatorMixin:
+    """Mixin for creating instance of appropriate strategy for describing series."""
+
+    datetime_is_numeric: bool
+
+    def create_strategy(
+        self,
+        series: "Series",
+        percentiles: Optional[Sequence[float]],
+    ) -> "StrategyAbstract":
+        """Create strategy instance for description."""
+        klass = self._select_strategy(series)
+        return klass(series, percentiles)
+
+    def _select_strategy(self, series: "Series") -> Type["StrategyAbstract"]:
+        """Select strategy for description."""
+        strategy: Type[StrategyAbstract] = CategoricalStrategy
+        if is_bool_dtype(series.dtype):
+            strategy = CategoricalStrategy
+        elif is_numeric_dtype(series):
+            strategy = NumericStrategy
+        elif is_datetime64_any_dtype(series.dtype) and self.datetime_is_numeric:
+            strategy = TimestampStrategy
+        elif is_timedelta64_dtype(series.dtype):
+            strategy = NumericStrategy
+
+        if strategy is CategoricalStrategy and is_datetime64_any_dtype(series.dtype):
+            strategy = TimestampAsCategoricalStrategy
+            warnings.warn(
+                "Treating datetime data as categorical rather than numeric in "
+                "`.describe` is deprecated and will be removed in a future "
+                "version of pandas. Specify `datetime_is_numeric=True` to "
+                "silence this warning and adopt the future behavior now.",
+                FutureWarning,
+                stacklevel=6,
+            )
+        return strategy
+
+
+class NDFrameDescriber(ABC):
+    """Abstract class for describing dataframe or series."""
+
+    @abstractmethod
+    def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeriesUnion:
+        """Do describe either series or dataframe.
+
+        Parameters
+        ----------
+        percentiles : list-like of numbers, optional
+            The percentiles to include in the output. All should fall between 0 and 1.
+            The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
+            75th percentiles.
+        """
+
+
+class SeriesDescriber(NDFrameDescriber, StrategyCreatorMixin):
+    """Class responsible for creating series description.
+
+    Parameters
+    ----------
+    data : Series
+        Series to be described.
+    datetime_is_numeric : bool, default False
+        Whether to treat datetime dtypes as numeric.
+    """
+
+    def __init__(
+        self,
+        *,
+        data: "Series",
+        datetime_is_numeric: bool,
+    ):
+        self.data = data
+        self.datetime_is_numeric = datetime_is_numeric
+
+    def describe(self, percentiles: Optional[Sequence[float]]) -> "Series":
+        """Do describe series."""
+        strategy = self.create_strategy(self.data, percentiles)
+        result = strategy.describe()
+        return result
+
+
+class DataFrameDescriber(NDFrameDescriber, StrategyCreatorMixin):
+    """Class responsible for creating dataframe description.
+
+    Parameters
+    ----------
+    data : DataFrame
+        Dataframe to be described.
+    include : 'all', list-like of dtypes or None (default), optional
+        A white list of data types to include in the result.
+    exclude : list-like of dtypes or None (default), optional
+        A black list of data types to omit from the result.
+    datetime_is_numeric : bool, default False
+        Whether to treat datetime dtypes as numeric.
+    """
+
+    def __init__(
+        self,
+        *,
+        data: "DataFrame",
+        include: Optional[Union[str, Sequence[str]]],
+        exclude: Optional[Union[str, Sequence[str]]],
+        datetime_is_numeric: bool,
+    ):
+        self.include = include
+        self.exclude = exclude
+        self.datetime_is_numeric = datetime_is_numeric
+        self.data: "DataFrame" = self._initialize_data(data)
+
+    def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame":
+        """Do describe dataframe."""
+        ldesc: List["Series"] = []
+        for _, series in self.data.items():
+            strategy = self.create_strategy(series, percentiles)
+            ldesc.append(strategy.describe())
+
+        df = concat(
+            self._reindex_columns(ldesc),
+            axis=1,
+            sort=False,
+        )
+        df.columns = self.data.columns.copy()
+        return cast("DataFrame", df)
+
+    def _reindex_columns(self, column_data) -> List["Series"]:
+        """Set a convenient order for rows."""
+        names: List[Label] = []
+        ldesc_indexes = sorted((x.index for x in column_data), key=len)
+        for idxnames in ldesc_indexes:
+            for name in idxnames:
+                if name not in names:
+                    names.append(name)
+        return [x.reindex(names, copy=False) for x in column_data]
+
+    def _initialize_data(self, data: "DataFrame") -> "DataFrame":
+        _validate_dframe_size(data)
+
+        if self.include is None and self.exclude is None:
+            return self._extract_numeric_data(data)
+
+        if self.include == "all":
+            if self.exclude is not None:
+                msg = "exclude must be None when include is 'all'"
+                raise ValueError(msg)
+            return data
+
+        return data.select_dtypes(include=self.include, exclude=self.exclude)
+
+    def _extract_numeric_data(self, data: "DataFrame") -> "DataFrame":
+        """When some numerics are found, keep only numerics."""
+        include = [np.number]
+        if self.datetime_is_numeric:
+            include.append("datetime")
+        numeric_only = data.select_dtypes(include=include)
+        if len(numeric_only.columns) == 0:
+            return data
+        else:
+            return numeric_only
+
+
+class StrategyAbstract(ABC):
+    """Abstract strategy for describing series."""
+
+    def __init__(
+        self,
+        data: "Series",
+        percentiles: Optional[Sequence[float]],
+    ):
+        self.data = data
+        self.percentiles = self._initialize_percentiles(percentiles)
+
+    def describe(self) -> "Series":
+        """Describe series."""
+        from pandas.core.series import Series
+
+        return Series(
+            self.array,
+            index=self.names,
+            name=self.data.name,
+            dtype=self.dtype,
+        )
+
+    @property
+    @abstractmethod
+    def array(self) -> List[object]:
+        """Series data."""
+
+    @property
+    @abstractmethod
+    def names(self) -> List[str]:
+        """Series index names."""
+
+    @property
+    @abstractmethod
+    def dtype(self) -> Optional[Dtype]:
+        """Series dtype."""
+
+    @property
+    def formatted_percentiles(self) -> List[str]:
+        """Percentiles formatted as strings, rounded."""
+        return format_percentiles(self.percentiles)
+
+    @staticmethod
+    def _initialize_percentiles(
+        percentiles: Optional[Sequence[float]],
+    ) -> Sequence[float]:
+        if percentiles is None:
+            return np.array([0.25, 0.5, 0.75])
+
+        # explicit conversion of `percentiles` to list
+        percentiles = list(percentiles)
+
+        # get them all to be in [0, 1]
+        validate_percentile(percentiles)
+
+        # median should always be included
+        if 0.5 not in percentiles:
+            percentiles.append(0.5)
+        percentiles = np.asarray(percentiles)
+
+        # sort and check for duplicates
+        unique_pcts = np.unique(percentiles)
+        assert percentiles is not None
+        if len(unique_pcts) < len(percentiles):
+            raise ValueError("percentiles cannot contain duplicates")
+        return unique_pcts
+
+
+class CategoricalStrategy(StrategyAbstract):
+    """Strategy for series with categorical values."""
+
+    def __init__(self, data, percentiles):
+        self.data = data
+        super().__init__(data, percentiles)
+        self.objcounts = self.data.value_counts()
+
+    @property
+    def array(self) -> List[object]:
+        top, freq = self._get_top_and_freq()
+        return [
+            self.count,
+            self.count_unique,
+            top,
+            freq,
+        ]
+
+    @property
+    def names(self) -> List[str]:
+        return ["count", "unique", "top", "freq"]
+
+    @property
+    def dtype(self) -> Optional[Dtype]:
+        if self.count_unique == 0:
+            return "object"
+        return None
+
+    @property
+    def count(self) -> int:
+        return self.data.count()
+
+    @property
+    def count_unique(self) -> int:
+        return len(self.objcounts[self.objcounts != 0])
+
+    def _get_top_and_freq(self) -> Tuple[Any, Any]:
+        if self.count_unique > 0:
+            return self.objcounts.index[0], self.objcounts.iloc[0]
+        return np.nan, np.nan
+
+
+class TimestampAsCategoricalStrategy(CategoricalStrategy):
+    """Strategy for series with timestamp values treated as categorical values."""
+
+    @property
+    def array(self) -> List[object]:
+        result = [self.count, self.count_unique]
+        if self.count_unique > 0:
+            top, freq = self.objcounts.index[0], self.objcounts.iloc[0]
+            tz = self.data.dt.tz
+            asint = self.data.dropna().values.view("i8")
+            top = Timestamp(top)
+            if top.tzinfo is not None and tz is not None:
+                # Don't tz_localize(None) if key is already tz-aware
+                top = top.tz_convert(tz)
+            else:
+                top = top.tz_localize(tz)
+
+            result += [
+                top,
+                freq,
+                Timestamp(asint.min(), tz=tz),
+                Timestamp(asint.max(), tz=tz),
+            ]
+
+        # If the series is empty, set 'top' and 'freq' to NaN
+        # to maintain output shape consistency
+        else:
+            result += [np.nan, np.nan]
+        return result
+
+    @property
+    def names(self) -> List[str]:
+        # "top" and "freq" are always present: ``array`` fills them with NaN
+        # for an empty series, so labels and values must stay the same length.
+        names = ["count", "unique", "top", "freq"]
+        if self.count_unique > 0:
+            names += ["first", "last"]
+        return names
+
+
+class NumericStrategy(StrategyAbstract):
+    """Strategy for series with numeric values."""
+
+    @property
+    def array(self) -> List[object]:
+        return [
+            self.data.count(),
+            self.data.mean(),
+            self.data.std(),
+            self.data.min(),
+            *self.data.quantile(self.percentiles).tolist(),
+            self.data.max(),
+        ]
+
+    @property
+    def names(self) -> List[str]:
+        return [
+            "count",
+            "mean",
+            "std",
+            "min",
+            *self.formatted_percentiles,
+            "max",
+        ]
+
+    @property
+    def dtype(self) -> Optional[Dtype]:
+        return None
+
+
+class TimestampStrategy(StrategyAbstract):
+    """Strategy for series with timestamp values."""
+
+    @property
+    def array(self) -> List[object]:
+        return [
+            self.data.count(),
+            self.data.mean(),
+            self.data.min(),
+            *self.data.quantile(self.percentiles).tolist(),
+            self.data.max(),
+        ]
+
+    @property
+    def names(self) -> List[str]:
+        return [
+            "count",
+            "mean",
+            "min",
+            *self.formatted_percentiles,
+            "max",
+        ]
+
+    @property
+    def dtype(self) -> Optional[Dtype]:
+        return None
+
+
+def _validate_dframe_size(df: FrameOrSeriesUnion) -> None:
+    """Validate correct size of dataframe."""
+    if df.ndim == 2 and df.columns.size == 0:
+        raise ValueError("Cannot describe a DataFrame without columns")
diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
index f77b7cd4a6c3b..0b7da5e862fd4 100644
--- a/pandas/tests/frame/methods/test_describe.py
+++ b/pandas/tests/frame/methods/test_describe.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 
 import pandas as pd
 from pandas import Categorical, DataFrame, Series, Timestamp, date_range
@@ -332,6 +333,16 @@ def test_describe_tz_values2(self):
         result = df.describe(include="all")
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
+    def test_describe_when_include_all_exclude_not_allowed(self, exclude):
+        """
+        When include is 'all', then setting exclude != None is not allowed.
+        """
+        df = DataFrame({"x": [1], "y": [2], "z": [3]})
+        msg = "exclude must be None when include is 'all'"
+        with pytest.raises(ValueError, match=msg):
+            df.describe(include="all", exclude=exclude)
+
     def test_describe_percentiles_integer_idx(self):
         # GH#26660
         df = DataFrame({"x": [1]})