4
4
Method NDFrame.describe() delegates actual execution to function describe_ndframe().
5
5
"""
6
6
7
+ from abc import ABC , abstractmethod
7
8
from typing import TYPE_CHECKING , List , Optional , Sequence , Union , cast
8
9
import warnings
9
10
10
11
import numpy as np
11
12
12
13
from pandas ._libs .tslibs import Timestamp
13
- from pandas ._typing import FrameOrSeries , Hashable
14
+ from pandas ._typing import FrameOrSeries , FrameOrSeriesUnion , Hashable
14
15
from pandas .util ._validators import validate_percentile
15
16
16
17
from pandas .core .dtypes .common import (
@@ -61,106 +62,140 @@ def describe_ndframe(
61
62
"""
62
63
percentiles = refine_percentiles (percentiles )
63
64
65
+ describer : NDFrameDescriberAbstract
66
+
64
67
if obj .ndim == 1 :
65
- result_series = describe_series (
66
- cast ("Series" , obj ),
67
- percentiles ,
68
- datetime_is_numeric ,
68
+ describer = SeriesDescriber (
69
+ series = cast ("Series" , obj ),
70
+ datetime_is_numeric = datetime_is_numeric ,
71
+ )
72
+ else :
73
+ describer = DataFrameDescriber (
74
+ frame = cast ("DataFrame" , obj ),
75
+ include = include ,
76
+ exclude = exclude ,
77
+ datetime_is_numeric = datetime_is_numeric ,
69
78
)
70
- return cast (FrameOrSeries , result_series )
71
79
72
- frame = cast ("DataFrame" , obj )
80
+ result = describer .describe (percentiles = percentiles )
81
+ return cast (FrameOrSeries , result )
73
82
74
- if frame .ndim == 2 and frame .columns .size == 0 :
75
- raise ValueError ("Cannot describe a DataFrame without columns" )
76
83
77
- result_frame = describe_frame (
78
- frame = frame ,
79
- include = include ,
80
- exclude = exclude ,
81
- percentiles = percentiles ,
82
- datetime_is_numeric = datetime_is_numeric ,
83
- )
84
- return cast (FrameOrSeries , result_frame )
84
+ class NDFrameDescriberAbstract (ABC ):
85
+ """Abstract class for describing dataframe or series."""
85
86
87
+ @abstractmethod
88
+ def describe (self , percentiles : Sequence [float ]) -> "FrameOrSeriesUnion" :
89
+ """Describe either a series or a dataframe.
90
+
91
+ Parameters
92
+ ----------
93
+ percentiles : list-like of numbers
94
+ The percentiles to include in the output.
95
+ """
86
96
87
- def describe_series (
88
- series : "Series" ,
89
- percentiles : Sequence [float ],
90
- datetime_is_numeric : bool ,
91
- ) -> "Series" :
92
- """Describe series.
93
97
94
- The reason for the delegation to ``describe_1d`` only :
95
- to allow for a proper stacklevel of the FutureWarning .
98
+ class SeriesDescriber ( NDFrameDescriberAbstract ) :
99
+ """Class responsible for creating series description .
96
100
97
101
Parameters
98
102
----------
99
- series : Series
103
+ series : Series
100
104
Series to be described.
101
- percentiles : list-like of numbers
102
- The percentiles to include in the output.
103
- datetime_is_numeric : bool, default False
105
+ datetime_is_numeric : bool
104
106
Whether to treat datetime dtypes as numeric.
105
-
106
- Returns
107
- -------
108
- Series
109
107
"""
110
- return describe_1d (
111
- series ,
112
- percentiles ,
113
- datetime_is_numeric ,
114
- is_series = True ,
115
- )
116
108
109
+ def __init__ (
110
+ self ,
111
+ series : "Series" ,
112
+ * ,
113
+ datetime_is_numeric : bool ,
114
+ ):
115
+ self .series = series
116
+ self .datetime_is_numeric = datetime_is_numeric
117
+
118
+ def describe (self , percentiles : Sequence [float ]) -> "Series" :
119
+ return describe_1d (
120
+ self .series ,
121
+ percentiles = percentiles ,
122
+ datetime_is_numeric = self .datetime_is_numeric ,
123
+ is_series = True ,
124
+ )
117
125
118
- def describe_frame (
119
- frame : "DataFrame" ,
120
- include : Optional [Union [str , Sequence [str ]]],
121
- exclude : Optional [Union [str , Sequence [str ]]],
122
- percentiles : Sequence [float ],
123
- datetime_is_numeric : bool ,
124
- ) -> "DataFrame" :
125
- """Describe DataFrame.
126
+
127
+ class DataFrameDescriber (NDFrameDescriberAbstract ):
128
+ """Class responsible for creating dataframe description.
126
129
127
130
Parameters
128
131
----------
129
- frame : DataFrame
130
- DataFrame to be described.
131
- include : 'all', list-like of dtypes or None (default), optional
132
+ frame : DataFrame
133
+ Dataframe to be described.
134
+ include : 'all', list-like of dtypes or None
132
135
A white list of data types to include in the result.
133
- exclude : list-like of dtypes or None (default), optional,
136
+ exclude : list-like of dtypes or None
134
137
A black list of data types to omit from the result.
135
- percentiles : list-like of numbers
136
- The percentiles to include in the output.
137
- datetime_is_numeric : bool, default False
138
+ datetime_is_numeric : bool
138
139
Whether to treat datetime dtypes as numeric.
139
-
140
- Returns
141
- -------
142
- DataFrame
143
140
"""
144
- data = select_columns (
145
- frame = frame ,
146
- include = include ,
147
- exclude = exclude ,
148
- datetime_is_numeric = datetime_is_numeric ,
149
- )
150
-
151
- ldesc = [
152
- describe_1d (s , percentiles , datetime_is_numeric , is_series = False )
153
- for _ , s in data .items ()
154
- ]
155
141
156
- col_names = reorder_columns (ldesc )
157
- d = concat (
158
- [x .reindex (col_names , copy = False ) for x in ldesc ],
159
- axis = 1 ,
160
- sort = False ,
161
- )
162
- d .columns = data .columns .copy ()
163
- return d
142
+ def __init__ (
143
+ self ,
144
+ frame : "DataFrame" ,
145
+ * ,
146
+ include : Optional [Union [str , Sequence [str ]]],
147
+ exclude : Optional [Union [str , Sequence [str ]]],
148
+ datetime_is_numeric : bool ,
149
+ ):
150
+ validate_frame (frame )
151
+ self .frame = frame
152
+ self .include = include
153
+ self .exclude = exclude
154
+ self .datetime_is_numeric = datetime_is_numeric
155
+
156
+ def describe (self , percentiles : Sequence [float ]) -> "DataFrame" :
157
+ data = self ._select_data ()
158
+
159
+ ldesc = [
160
+ describe_1d (
161
+ series ,
162
+ percentiles = percentiles ,
163
+ datetime_is_numeric = self .datetime_is_numeric ,
164
+ is_series = False ,
165
+ )
166
+ for _ , series in data .items ()
167
+ ]
168
+
169
+ col_names = reorder_columns (ldesc )
170
+ d = concat (
171
+ [x .reindex (col_names , copy = False ) for x in ldesc ],
172
+ axis = 1 ,
173
+ sort = False ,
174
+ )
175
+ d .columns = data .columns .copy ()
176
+ return d
177
+
178
+ def _select_data (self ):
179
+ """Select columns to be described."""
180
+ if (self .include is None ) and (self .exclude is None ):
181
+ # when some numerics are found, keep only numerics
182
+ default_include = [np .number ]
183
+ if self .datetime_is_numeric :
184
+ default_include .append ("datetime" )
185
+ data = self .frame .select_dtypes (include = default_include )
186
+ if len (data .columns ) == 0 :
187
+ data = self .frame
188
+ elif self .include == "all" :
189
+ if self .exclude is not None :
190
+ msg = "exclude must be None when include is 'all'"
191
+ raise ValueError (msg )
192
+ data = self .frame
193
+ else :
194
+ data = self .frame .select_dtypes (
195
+ include = self .include ,
196
+ exclude = self .exclude ,
197
+ )
198
+ return data
164
199
165
200
166
201
def reorder_columns (ldesc : Sequence ["Series" ]) -> List [Hashable ]:
@@ -174,32 +209,6 @@ def reorder_columns(ldesc: Sequence["Series"]) -> List[Hashable]:
174
209
return names
175
210
176
211
177
- def select_columns (
178
- frame : "DataFrame" ,
179
- include : Optional [Union [str , Sequence [str ]]],
180
- exclude : Optional [Union [str , Sequence [str ]]],
181
- datetime_is_numeric : bool ,
182
- ) -> "DataFrame" :
183
- """Select columns to be described."""
184
- if (include is None ) and (exclude is None ):
185
- # when some numerics are found, keep only numerics
186
- default_include = [np .number ]
187
- if datetime_is_numeric :
188
- default_include .append ("datetime" )
189
- data = frame .select_dtypes (include = default_include )
190
- if len (data .columns ) == 0 :
191
- data = frame
192
- elif include == "all" :
193
- if exclude is not None :
194
- msg = "exclude must be None when include is 'all'"
195
- raise ValueError (msg )
196
- data = frame
197
- else :
198
- data = frame .select_dtypes (include = include , exclude = exclude )
199
-
200
- return data
201
-
202
-
203
212
def describe_numeric_1d (series : "Series" , percentiles : Sequence [float ]) -> "Series" :
204
213
"""Describe series containing numerical data.
205
214
@@ -376,3 +385,8 @@ def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float
376
385
raise ValueError ("percentiles cannot contain duplicates" )
377
386
378
387
return unique_pcts
388
+
389
+
390
+ def validate_frame (frame : "DataFrame" ):
391
+ if frame .ndim == 2 and frame .columns .size == 0 :
392
+ raise ValueError ("Cannot describe a DataFrame without columns" )
0 commit comments