Skip to content

Commit d5e8b8c

Browse files
committed
ENH: add include_lowest cut parameter, use it in qcut, make 4 not the qcut default, integer array tests, close #1411, #1462, #1469
1 parent f5001a1 commit d5e8b8c

File tree

2 files changed: +29 additions, -15 deletions

pandas/tools/tests/test_tile.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -96,10 +96,9 @@ def test_qcut(self):
9696

9797
labels, bins = qcut(arr, 4, retbins=True)
9898
ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
99-
ex_bins[0] -= (arr.max() - arr.min()) * 0.001
10099
assert_almost_equal(bins, ex_bins)
101100

102-
ex_levels = cut(arr, ex_bins)
101+
ex_levels = cut(arr, ex_bins, include_lowest=True)
103102
self.assert_(np.array_equal(labels, ex_levels))
104103

105104
def test_qcut_bounds(self):
@@ -139,6 +138,14 @@ def test_cut_pass_labels(self):
139138

140139
self.assert_(result.equals(exp))
141140

141+
def test_qcut_include_lowest(self):
142+
values = np.arange(10)
143+
144+
cats = qcut(values, 4)
145+
146+
ex_levels = ['[0, 2.25]', '(2.25, 4.5]', '(4.5, 6.75]', '(6.75, 9]']
147+
self.assert_((cats.levels == ex_levels).all())
148+
142149
if __name__ == '__main__':
143150
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
144151
exit=False)

pandas/tools/tile.py

Lines changed: 20 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,8 @@
1111
import numpy as np
1212

1313

14-
def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
14+
def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
15+
include_lowest=False):
1516
"""
1617
Return indices of half-open bins to which each value of `x` belongs.
1718
@@ -38,9 +39,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
3839
3940
Returns
4041
-------
41-
out : ndarray of labels
42-
Same shape as `x`. Array of strings by default, integers if
43-
labels=False
42+
out : Categorical or array of integers if labels is False
4443
bins : ndarray of floats
4544
Returned only if `retbins` is True.
4645
@@ -50,7 +49,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
5049
a categorical variable. For example, `cut` could convert ages to groups
5150
of age ranges.
5251
53-
Any NA values will be NA in the result
52+
Any NA values will be NA in the result. Out of bounds values will be NA in
53+
the resulting Categorical object
54+
5455
5556
Examples
5657
--------
@@ -95,11 +96,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
9596
raise ValueError('bins must increase monotonically.')
9697

9798
return _bins_to_cuts(x, bins, right=right, labels=labels,
98-
retbins=retbins, precision=precision)
99+
retbins=retbins, precision=precision,
100+
include_lowest=include_lowest)
99101

100102

101103

102-
def qcut(x, q=4, labels=None, retbins=False, precision=3):
104+
def qcut(x, q, labels=None, retbins=False, precision=3):
103105
"""
104106
Quantile-based discretization function. Discretize variable into
105107
equal-sized buckets based on rank or based on sample quantiles. For example
@@ -111,8 +113,7 @@ def qcut(x, q=4, labels=None, retbins=False, precision=3):
111113
x : ndarray or Series
112114
q : integer or array of quantiles
113115
Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
114-
array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. Array of
115-
quantiles must span [0, 1]
116+
array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
116117
labels : array or boolean, default None
117118
Labels to use for bin edges, or False to return integer bin labels
118119
retbins : bool, optional
@@ -121,9 +122,11 @@ def qcut(x, q=4, labels=None, retbins=False, precision=3):
121122
122123
Returns
123124
-------
125+
cat : Categorical
124126
125127
Notes
126128
-----
129+
Out of bounds values will be NA in the resulting Categorical object
127130
128131
Examples
129132
--------
@@ -133,21 +136,22 @@ def qcut(x, q=4, labels=None, retbins=False, precision=3):
133136
else:
134137
quantiles = q
135138
bins = algos.quantile(x, quantiles)
136-
bins[0] -= 0.001 * (x.max() - x.min())
137-
138139
return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
139-
precision=precision)
140+
precision=precision, include_lowest=True)
140141

141142

142143
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
143-
precision=3, name=None):
144+
precision=3, name=None, include_lowest=False):
144145
if name is None and isinstance(x, Series):
145146
name = x.name
146147
x = np.asarray(x)
147148

148149
side = 'left' if right else 'right'
149150
ids = bins.searchsorted(x, side=side)
150151

152+
if include_lowest:
153+
ids[x == bins[0]] = 1
154+
151155
na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
152156
has_nas = na_mask.any()
153157

@@ -157,9 +161,12 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
157161
if right:
158162
levels = ['(%s, %s]' % (fmt(a), fmt(b))
159163
for a, b in zip(bins, bins[1:])]
164+
if include_lowest:
165+
levels[0] = '[' + levels[0][1:]
160166
else:
161167
levels = ['[%s, %s)' % (fmt(a), fmt(b))
162168
for a, b in zip(bins, bins[1:])]
169+
163170
else:
164171
if len(labels) != len(bins) - 1:
165172
raise ValueError('Bin labels must be one fewer than '

0 commit comments

Comments
 (0)