Skip to content

Commit 97f136b

Browse files
committed
REF: extract classes in pandas/core/describe.py
1 parent ee92c2f commit 97f136b

File tree

1 file changed

+117
-103
lines changed

1 file changed

+117
-103
lines changed

pandas/core/describe.py

Lines changed: 117 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@
44
Method NDFrame.describe() delegates actual execution to function describe_ndframe().
55
"""
66

7+
from abc import ABC, abstractmethod
78
from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast
89
import warnings
910

1011
import numpy as np
1112

1213
from pandas._libs.tslibs import Timestamp
13-
from pandas._typing import FrameOrSeries, Hashable
14+
from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Hashable
1415
from pandas.util._validators import validate_percentile
1516

1617
from pandas.core.dtypes.common import (
@@ -61,106 +62,140 @@ def describe_ndframe(
6162
"""
6263
percentiles = refine_percentiles(percentiles)
6364

65+
describer: NDFrameDescriberAbstract
66+
6467
if obj.ndim == 1:
65-
result_series = describe_series(
66-
cast("Series", obj),
67-
percentiles,
68-
datetime_is_numeric,
68+
describer = SeriesDescriber(
69+
series=cast("Series", obj),
70+
datetime_is_numeric=datetime_is_numeric,
71+
)
72+
else:
73+
describer = DataFrameDescriber(
74+
frame=cast("DataFrame", obj),
75+
include=include,
76+
exclude=exclude,
77+
datetime_is_numeric=datetime_is_numeric,
6978
)
70-
return cast(FrameOrSeries, result_series)
7179

72-
frame = cast("DataFrame", obj)
80+
result = describer.describe(percentiles=percentiles)
81+
return cast(FrameOrSeries, result)
7382

74-
if frame.ndim == 2 and frame.columns.size == 0:
75-
raise ValueError("Cannot describe a DataFrame without columns")
7683

77-
result_frame = describe_frame(
78-
frame=frame,
79-
include=include,
80-
exclude=exclude,
81-
percentiles=percentiles,
82-
datetime_is_numeric=datetime_is_numeric,
83-
)
84-
return cast(FrameOrSeries, result_frame)
84+
class NDFrameDescriberAbstract(ABC):
85+
"""Abstract class for describing dataframe or series."""
8586

87+
@abstractmethod
88+
def describe(self, percentiles: Sequence[float]) -> "FrameOrSeriesUnion":
89+
"""Do describe either series or dataframe.
90+
91+
Parameters
92+
----------
93+
percentiles : list-like of numbers
94+
The percentiles to include in the output.
95+
"""
8696

87-
def describe_series(
88-
series: "Series",
89-
percentiles: Sequence[float],
90-
datetime_is_numeric: bool,
91-
) -> "Series":
92-
"""Describe series.
9397

94-
The reason for the delegation to ``describe_1d`` only:
95-
to allow for a proper stacklevel of the FutureWarning.
98+
class SeriesDescriber(NDFrameDescriberAbstract):
99+
"""Class responsible for creating series description.
96100
97101
Parameters
98102
----------
99-
series : Series
103+
series : Series
100104
Series to be described.
101-
percentiles : list-like of numbers
102-
The percentiles to include in the output.
103-
datetime_is_numeric : bool, default False
105+
datetime_is_numeric : bool
104106
Whether to treat datetime dtypes as numeric.
105-
106-
Returns
107-
-------
108-
Series
109107
"""
110-
return describe_1d(
111-
series,
112-
percentiles,
113-
datetime_is_numeric,
114-
is_series=True,
115-
)
116108

109+
def __init__(
110+
self,
111+
series: "Series",
112+
*,
113+
datetime_is_numeric: bool,
114+
):
115+
self.series = series
116+
self.datetime_is_numeric = datetime_is_numeric
117+
118+
def describe(self, percentiles: Sequence[float]) -> "Series":
119+
return describe_1d(
120+
self.series,
121+
percentiles=percentiles,
122+
datetime_is_numeric=self.datetime_is_numeric,
123+
is_series=True,
124+
)
117125

118-
def describe_frame(
119-
frame: "DataFrame",
120-
include: Optional[Union[str, Sequence[str]]],
121-
exclude: Optional[Union[str, Sequence[str]]],
122-
percentiles: Sequence[float],
123-
datetime_is_numeric: bool,
124-
) -> "DataFrame":
125-
"""Describe DataFrame.
126+
127+
class DataFrameDescriber(NDFrameDescriberAbstract):
128+
"""Class responsible for creating dataframe description.
126129
127130
Parameters
128131
----------
129-
frame : DataFrame
130-
DataFrame to be described.
131-
include : 'all', list-like of dtypes or None (default), optional
132+
frame : DataFrame
133+
DataFrame to be described.
134+
include : 'all', list-like of dtypes or None
132135
A white list of data types to include in the result.
133-
exclude : list-like of dtypes or None (default), optional,
136+
exclude : list-like of dtypes or None
134137
A black list of data types to omit from the result.
135-
percentiles : list-like of numbers
136-
The percentiles to include in the output.
137-
datetime_is_numeric : bool, default False
138+
datetime_is_numeric : bool
138139
Whether to treat datetime dtypes as numeric.
139-
140-
Returns
141-
-------
142-
DataFrame
143140
"""
144-
data = select_columns(
145-
frame=frame,
146-
include=include,
147-
exclude=exclude,
148-
datetime_is_numeric=datetime_is_numeric,
149-
)
150-
151-
ldesc = [
152-
describe_1d(s, percentiles, datetime_is_numeric, is_series=False)
153-
for _, s in data.items()
154-
]
155141

156-
col_names = reorder_columns(ldesc)
157-
d = concat(
158-
[x.reindex(col_names, copy=False) for x in ldesc],
159-
axis=1,
160-
sort=False,
161-
)
162-
d.columns = data.columns.copy()
163-
return d
142+
def __init__(
143+
self,
144+
frame: "DataFrame",
145+
*,
146+
include: Optional[Union[str, Sequence[str]]],
147+
exclude: Optional[Union[str, Sequence[str]]],
148+
datetime_is_numeric: bool,
149+
):
150+
validate_frame(frame)
151+
self.frame = frame
152+
self.include = include
153+
self.exclude = exclude
154+
self.datetime_is_numeric = datetime_is_numeric
155+
156+
def describe(self, percentiles: Sequence[float]) -> "DataFrame":
157+
data = self._select_data()
158+
159+
ldesc = [
160+
describe_1d(
161+
series,
162+
percentiles=percentiles,
163+
datetime_is_numeric=self.datetime_is_numeric,
164+
is_series=False,
165+
)
166+
for _, series in data.items()
167+
]
168+
169+
col_names = reorder_columns(ldesc)
170+
d = concat(
171+
[x.reindex(col_names, copy=False) for x in ldesc],
172+
axis=1,
173+
sort=False,
174+
)
175+
d.columns = data.columns.copy()
176+
return d
177+
178+
def _select_data(self):
179+
"""Select columns to be described."""
180+
if (self.include is None) and (self.exclude is None):
181+
# when some numerics are found, keep only numerics
182+
default_include = [np.number]
183+
if self.datetime_is_numeric:
184+
default_include.append("datetime")
185+
data = self.frame.select_dtypes(include=default_include)
186+
if len(data.columns) == 0:
187+
data = self.frame
188+
elif self.include == "all":
189+
if self.exclude is not None:
190+
msg = "exclude must be None when include is 'all'"
191+
raise ValueError(msg)
192+
data = self.frame
193+
else:
194+
data = self.frame.select_dtypes(
195+
include=self.include,
196+
exclude=self.exclude,
197+
)
198+
return data
164199

165200

166201
def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]:
@@ -174,32 +209,6 @@ def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]:
174209
return names
175210

176211

177-
def select_columns(
178-
frame: "DataFrame",
179-
include: Optional[Union[str, Sequence[str]]],
180-
exclude: Optional[Union[str, Sequence[str]]],
181-
datetime_is_numeric: bool,
182-
) -> "DataFrame":
183-
"""Select columns to be described."""
184-
if (include is None) and (exclude is None):
185-
# when some numerics are found, keep only numerics
186-
default_include = [np.number]
187-
if datetime_is_numeric:
188-
default_include.append("datetime")
189-
data = frame.select_dtypes(include=default_include)
190-
if len(data.columns) == 0:
191-
data = frame
192-
elif include == "all":
193-
if exclude is not None:
194-
msg = "exclude must be None when include is 'all'"
195-
raise ValueError(msg)
196-
data = frame
197-
else:
198-
data = frame.select_dtypes(include=include, exclude=exclude)
199-
200-
return data
201-
202-
203212
def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> "Series":
204213
"""Describe series containing numerical data.
205214
@@ -376,3 +385,8 @@ def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float
376385
raise ValueError("percentiles cannot contain duplicates")
377386

378387
return unique_pcts
388+
389+
390+
def validate_frame(frame: "DataFrame"):
391+
if frame.ndim == 2 and frame.columns.size == 0:
392+
raise ValueError("Cannot describe a DataFrame without columns")

0 commit comments

Comments
 (0)