@@ -35,12 +35,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
35
35
----------
36
36
x : array-like
37
37
The input array to be binned. Must be 1-dimensional.
38
- bins : int, sequence of scalars, or pandas.IntervalIndex
38
+ bins : int, str, sequence of scalars, or pandas.IntervalIndex
39
39
The criteria to bin by.
40
40
41
41
* int : Defines the number of equal-width bins in the range of `x`. The
42
42
range of `x` is extended by .1% on each side to include the minimum
43
43
and maximum values of `x`.
44
+ * str : Bin calculation dispatched to `np.histogram_bin_edges`. See that
45
+ documentation for details.
44
46
* sequence of scalars : Defines the bin edges allowing for non-uniform
45
47
width. No extension of the range of `x` is done.
46
48
* IntervalIndex : Defines the exact bins to be used.
@@ -83,7 +85,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
83
85
84
86
* False : returns an ndarray of integers.
85
87
86
- bins : numpy.ndarray or IntervalIndex.
88
+ bins : numpy.ndarray or IntervalIndex
87
89
The computed or specified bins. Only returned when `retbins=True`.
88
90
For scalar or sequence `bins`, this is an ndarray with the computed
89
91
bins. If `duplicates=drop` is set, `bins` will drop non-unique bins. For
@@ -181,14 +183,37 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
181
183
>>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
182
184
[NaN, (0, 1], NaN, (2, 3], (4, 5]]
183
185
Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
186
+
187
+ Passing a string for `bins` dispatches the bin calculation to numpy's
188
+ `histogram_bin_edges`.
189
+ >>> pd.cut(np.array([0.1, 0.1, 0.2, 0.5, 0.5, 0.9, 1.0]),
190
+ ... bins="auto")
191
+ ... # doctest: +ELLIPSIS
192
+ [(0.0991, 0.325], (0.0991, 0.325], (0.0991, 0.325], (0.325, 0.55],
193
+ (0.325, 0.55], (0.775, 1.0], (0.775, 1.0]]
194
+ Categories (4, interval[float64]): [(0.0991, 0.325] < (0.325, 0.55] <
195
+ (0.55, 0.775] < (0.775, 1.0]]
184
196
"""
185
197
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
186
198
187
199
# for handling the cut for datetime and timedelta objects
188
200
x_is_series , series_index , name , x = _preprocess_for_cut (x )
189
201
x , dtype = _coerce_to_type (x )
190
202
191
- if not np .iterable (bins ):
203
+ if isinstance (bins , str ):
204
+ bins = np .histogram_bin_edges (x , bins )
205
+ mn , mx = bins [0 ], bins [- 1 ]
206
+ adj = (mx - mn )
207
+ if adj :
208
+ adj *= 0.001 # 0.1% of the range
209
+ else :
210
+ adj = 0.001
211
+ if right :
212
+ bins [0 ] -= adj
213
+ else :
214
+ bins [- 1 ] += adj
215
+
216
+ elif not np .iterable (bins ):
192
217
if is_scalar (bins ) and bins < 1 :
193
218
raise ValueError ("`bins` should be a positive integer." )
194
219
0 commit comments