Skip to content

Commit 008b10b

Browse files
authored
REF: extract more functions in pandas/core/describe.py (#39170)
1 parent a2a32e5 commit 008b10b

File tree

1 file changed

+127
-34
lines changed

1 file changed

+127
-34
lines changed

pandas/core/describe.py

Lines changed: 127 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from pandas.io.formats.format import format_percentiles
2626

2727
if TYPE_CHECKING:
28-
from pandas import Series
28+
from pandas import DataFrame, Series
2929

3030

3131
def describe_ndframe(
@@ -59,52 +59,145 @@ def describe_ndframe(
5959
-------
6060
Dataframe or series description.
6161
"""
62-
if obj.ndim == 2 and obj.columns.size == 0:
63-
raise ValueError("Cannot describe a DataFrame without columns")
64-
65-
percentiles = _refine_percentiles(percentiles)
62+
percentiles = refine_percentiles(percentiles)
6663

6764
if obj.ndim == 1:
68-
series = cast("Series", obj)
69-
# Incompatible return value type
70-
# (got "Series", expected "FrameOrSeries") [return-value]
71-
return describe_1d(
72-
series,
65+
result_series = describe_series(
66+
cast("Series", obj),
7367
percentiles,
7468
datetime_is_numeric,
75-
is_series=True,
76-
) # type:ignore[return-value]
77-
elif (include is None) and (exclude is None):
78-
# when some numerics are found, keep only numerics
79-
default_include = [np.number]
80-
if datetime_is_numeric:
81-
default_include.append("datetime")
82-
data = obj.select_dtypes(include=default_include)
83-
if len(data.columns) == 0:
84-
data = obj
85-
elif include == "all":
86-
if exclude is not None:
87-
msg = "exclude must be None when include is 'all'"
88-
raise ValueError(msg)
89-
data = obj
90-
else:
91-
data = obj.select_dtypes(include=include, exclude=exclude)
69+
)
70+
return cast(FrameOrSeries, result_series)
71+
72+
frame = cast("DataFrame", obj)
73+
74+
if frame.ndim == 2 and frame.columns.size == 0:
75+
raise ValueError("Cannot describe a DataFrame without columns")
76+
77+
result_frame = describe_frame(
78+
frame=frame,
79+
include=include,
80+
exclude=exclude,
81+
percentiles=percentiles,
82+
datetime_is_numeric=datetime_is_numeric,
83+
)
84+
return cast(FrameOrSeries, result_frame)
85+
86+
87+
def describe_series(
    series: "Series",
    percentiles: Sequence[float],
    datetime_is_numeric: bool,
) -> "Series":
    """Describe a Series.

    This function does nothing but delegate to ``describe_1d`` with
    ``is_series=True``. The extra call layer exists on purpose: it gives
    the FutureWarning emitted further down the call chain a proper
    ``stacklevel``.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.

    Returns
    -------
    Series
        Whatever ``describe_1d`` returns for ``series``.
    """
    # NOTE(review): do not inline this wrapper or add call layers around it
    # without revisiting the hard-coded ``stacklevel`` values used when the
    # datetime-as-categorical FutureWarning is raised.
    return describe_1d(
        series,
        percentiles,
        datetime_is_numeric,
        is_series=True,
    )
116+
117+
118+
def describe_frame(
    frame: "DataFrame",
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    percentiles: Sequence[float],
    datetime_is_numeric: bool,
) -> "DataFrame":
    """Describe a DataFrame column by column.

    Parameters
    ----------
    frame : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        Data types to include in the result. Passed on to
        ``select_columns``.
    exclude : list-like of dtypes or None
        Data types to omit from the result. Passed on to
        ``select_columns``.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.

    Returns
    -------
    DataFrame
        One column per selected input column, holding the per-column
        description returned by ``describe_1d``.
    """
    # Narrow the frame down to the columns that should be described.
    data = select_columns(
        frame=frame,
        include=include,
        exclude=exclude,
        datetime_is_numeric=datetime_is_numeric,
    )

    # NOTE(review): this call chain feeds the hard-coded ``stacklevel`` of the
    # FutureWarning raised for datetime data; the list comprehension may count
    # as a frame of its own (it does before Python 3.12 -- confirm), so do not
    # restructure it without revisiting those values.
    ldesc = [
        describe_1d(s, percentiles, datetime_is_numeric, is_series=False)
        for _, s in data.items()
    ]

    # Align every per-column description on one shared, ordered set of row
    # labels before gluing them together side by side.
    col_names = reorder_columns(ldesc)
    d = concat(
        [x.reindex(col_names, copy=False) for x in ldesc],
        axis=1,
        sort=False,
    )
    # Restore the original column labels of the described data.
    d.columns = data.columns.copy()
    return d
164+
165+
166+
def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]:
    """Return the row labels of all descriptions in a convenient order.

    Despite the function name, the labels returned are the *index* labels
    of the per-column descriptions, i.e. the rows of the final result.

    Parameters
    ----------
    ldesc : sequence of Series
        Per-column descriptions whose index labels are to be merged.

    Returns
    -------
    list of Hashable
        Every distinct index label exactly once. Labels of shorter indexes
        come first; within one index, and between indexes of equal length,
        the original order is kept.
    """
    # ``sorted`` is stable, so descriptions of equal length keep input order.
    indexes_by_length = sorted((x.index for x in ldesc), key=len)
    # A dict keeps first-insertion order and de-duplicates with O(1)
    # lookups, instead of scanning a growing list for every label.
    ordered = dict.fromkeys(
        name for idxnames in indexes_by_length for name in idxnames
    )
    return list(ordered)
104175

105-
d = concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
106-
d.columns = data.columns.copy()
107-
return d
176+
177+
def select_columns(
    frame: "DataFrame",
    include: Optional[Union[str, Sequence[str]]],
    exclude: Optional[Union[str, Sequence[str]]],
    datetime_is_numeric: bool,
) -> "DataFrame":
    """Select the columns of ``frame`` that are to be described.

    Parameters
    ----------
    frame : DataFrame
        DataFrame whose columns are filtered.
    include : 'all', list-like of dtypes or None
        ``'all'`` keeps every column; any other non-None value is passed to
        ``DataFrame.select_dtypes``.
    exclude : list-like of dtypes or None
        Passed to ``DataFrame.select_dtypes``. Must be None when
        ``include`` is ``'all'``.
    datetime_is_numeric : bool
        When neither ``include`` nor ``exclude`` is given, also keep
        datetime columns alongside the numeric ones.

    Returns
    -------
    DataFrame
        ``frame`` itself when nothing is filtered out, otherwise the
        result of ``frame.select_dtypes``.

    Raises
    ------
    ValueError
        If ``include`` is ``'all'`` and ``exclude`` is not None.
    """
    if include is None and exclude is None:
        # When some numerics are found, keep only numerics.
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = frame.select_dtypes(include=default_include)
        # No numeric column at all: describe every column instead.
        if len(data.columns) == 0:
            data = frame
        return data

    # Check the type first: comparing an array-like ``include`` against a
    # string is element-wise and its truth value is ambiguous.
    if isinstance(include, str) and include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        return frame

    return frame.select_dtypes(include=include, exclude=exclude)
108201

109202

110203
def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> "Series":
@@ -150,9 +243,9 @@ def describe_categorical_1d(data: "Series", is_series: bool) -> "Series":
150243
top, freq = objcounts.index[0], objcounts.iloc[0]
151244
if is_datetime64_any_dtype(data.dtype):
152245
if is_series:
153-
stacklevel = 5
154-
else:
155246
stacklevel = 6
247+
else:
248+
stacklevel = 7
156249
warnings.warn(
157250
"Treating datetime data as categorical rather than numeric in "
158251
"`.describe` is deprecated and will be removed in a future "
@@ -253,7 +346,7 @@ def describe_1d(
253346
return describe_categorical_1d(data, is_series)
254347

255348

256-
def _refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]:
349+
def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]:
257350
"""Ensure that percentiles are unique and sorted.
258351
259352
Parameters

0 commit comments

Comments
 (0)