Skip to content

Commit b9928e7

Browse files
committed
switch to labels is True and add more tests
1 parent 5d8c8d9 commit b9928e7

File tree

3 files changed

+280
-4
lines changed

3 files changed

+280
-4
lines changed

pandas/core/reshape/tile.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ def _bins_to_cuts(
397397
labels = _format_labels(
398398
bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
399399
)
400-
elif labels:
400+
elif labels is True:
401401
raise ValueError(
402402
"User desired bin labels must be passed in as an argument, "
403403
"not just `True`"
@@ -407,8 +407,6 @@ def _bins_to_cuts(
407407
raise ValueError(
408408
"Bin labels must be one fewer than the number of bin edges"
409409
)
410-
else:
411-
labels = Categorical(labels, categories=labels, ordered=True)
412410
if not is_categorical_dtype(labels):
413411
labels = Categorical(labels, categories=labels, ordered=True)
414412

pandas/tests/reshape/test_qcut 2.py

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
import os
2+
3+
import numpy as np
4+
import pytest
5+
6+
from pandas import (
7+
Categorical,
8+
DatetimeIndex,
9+
Interval,
10+
IntervalIndex,
11+
NaT,
12+
Series,
13+
TimedeltaIndex,
14+
Timestamp,
15+
cut,
16+
date_range,
17+
isna,
18+
qcut,
19+
timedelta_range,
20+
)
21+
from pandas.api.types import CategoricalDtype as CDT
22+
from pandas.core.algorithms import quantile
23+
import pandas.util.testing as tm
24+
25+
from pandas.tseries.offsets import Day, Nano
26+
27+
28+
def test_qcut():
    """Check quartile binning from ``qcut`` against ``cut`` on the same edges.

    The quartile edges are recomputed independently with ``np.percentile``
    and compared with the interval categories that ``qcut`` returns.
    """
    # Seeded generator so that a failing run can be reproduced exactly.
    arr = np.random.RandomState(2).randn(1000)

    # We store the bins as Index that have been
    # rounded, so comparisons are a bit tricky.
    labels, _ = qcut(arr, 4, retbins=True)
    ex_bins = np.percentile(arr, [0, 25, 50, 75, 100])

    result = labels.categories.left.values
    assert np.allclose(result, ex_bins[:-1], atol=1e-2)

    result = labels.categories.right.values
    assert np.allclose(result, ex_bins[1:], atol=1e-2)

    ex_levels = cut(arr, ex_bins, include_lowest=True)
    tm.assert_categorical_equal(labels, ex_levels)
44+
45+
46+
def test_qcut_bounds():
    """Binning into ten quantiles with ``labels=False`` yields ten codes."""
    # Seeded generator so that a failing run can be reproduced exactly.
    arr = np.random.RandomState(2).randn(1000)

    factor = qcut(arr, 10, labels=False)
    assert len(np.unique(factor)) == 10
51+
52+
53+
def test_qcut_specify_quantiles():
    """Explicit quartile fractions must bin the same way as ``q=4``."""
    # Seeded generator so that a failing run can be reproduced exactly.
    arr = np.random.RandomState(2).randn(100)
    factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0])

    expected = qcut(arr, 4)
    tm.assert_categorical_equal(factor, expected)
59+
60+
61+
def test_qcut_all_bins_same():
    """A constant input must make ``qcut`` raise about non-unique edges."""
    constant = [0] * 10

    with pytest.raises(ValueError, match="edges.*unique"):
        qcut(constant, 3)
64+
65+
66+
def test_qcut_include_lowest():
    """Quartile categories of ``0..9`` start just below the minimum value."""
    # Consecutive pairs of these edges form the four expected intervals.
    edges = [-0.001, 2.25, 4.5, 6.75, 9]
    ex_levels = IntervalIndex(
        [Interval(left, right) for left, right in zip(edges[:-1], edges[1:])]
    )

    ii = qcut(np.arange(10), 4)
    tm.assert_index_equal(ii.categories, ex_levels)
79+
80+
81+
def test_qcut_nas():
    """Missing inputs must stay missing in the ``qcut`` result."""
    # Seeded generator so that a failing run can be reproduced exactly.
    arr = np.random.RandomState(2).randn(100)
    arr[:20] = np.nan

    result = qcut(arr, 4)
    assert isna(result[:20]).all()
87+
88+
89+
def test_qcut_index():
    """Two values cut into two bins give an ordered interval Categorical."""
    expected = Categorical(
        [Interval(-0.001, 1), Interval(1, 2)],
        ordered=True,
    )

    result = qcut([0, 2], 2)
    tm.assert_categorical_equal(result, expected)
95+
96+
97+
def test_qcut_binning_issues(datapath):
    """Bins built from the ``cut_data.csv`` fixture are non-empty and ordered."""
    # see gh-1978, gh-1979
    csv_path = datapath(os.path.join("reshape", "data", "cut_data.csv"))
    result = qcut(np.loadtxt(csv_path), 20)

    starts, ends = [], []
    for level in np.unique(result):
        left, right = level.left, level.right
        # No bin may collapse to a single point.
        assert left != right

        starts.append(float(left))
        ends.append(float(right))

    # Walk neighbouring bins: ``zip`` stops at the shorter ``[1:]`` slices,
    # so each step sees one bin together with its successor.
    for prev_start, next_start, prev_end, next_end in zip(
        starts, starts[1:], ends, ends[1:]
    ):
        assert prev_start < next_start
        assert prev_end < next_end
        assert prev_end <= next_start
120+
121+
122+
def test_qcut_return_intervals():
    """``qcut`` on a Series returns an ordered categorical Series of intervals."""
    ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
    res = qcut(ser, [0, 0.333, 0.666, 1])

    low = Interval(-0.001, 2.664)
    mid = Interval(2.664, 5.328)
    high = Interval(5.328, 8)
    exp_levels = np.array([low, mid, high])

    # Three consecutive values land in each of the three bins.
    positions = [0, 0, 0, 1, 1, 1, 2, 2, 2]
    exp = Series(exp_levels.take(positions)).astype(CDT(ordered=True))
    tm.assert_series_equal(res, exp)
131+
132+
133+
@pytest.mark.parametrize(
    "kwargs,msg",
    [
        ({"duplicates": "drop"}, None),
        ({}, "Bin edges must be unique"),
        ({"duplicates": "raise"}, "Bin edges must be unique"),
        ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
    ],
)
def test_qcut_duplicates_bin(kwargs, msg):
    """Exercise the ``duplicates`` option of ``qcut`` on repeated bin edges.

    ``msg`` is the expected ``ValueError`` pattern, or ``None`` when the call
    is expected to succeed.
    """
    # see gh-7751
    values = [0, 0, 0, 0, 1, 2, 3]

    if msg is None:
        result = qcut(values, 3, **kwargs)
        expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
        tm.assert_index_equal(result.categories, expected)
        return

    with pytest.raises(ValueError, match=msg):
        qcut(values, 3, **kwargs)
153+
154+
155+
@pytest.mark.parametrize(
    "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)]
)
@pytest.mark.parametrize("length", [1, 2])
@pytest.mark.parametrize("labels", [None, False])
def test_single_quantile(data, start, end, length, labels):
    """A single quantile over repeated values yields one bin.

    With ``labels=None`` the expected bin is the interval ``(start, end]``;
    with ``labels=False`` every element is expected to get code ``0``.
    """
    # see gh-15431
    result = qcut(Series([data] * length), 1, labels=labels)

    if labels is not None:
        expected = Series([0] * length)
    else:
        single_bin = IntervalIndex([Interval(start, end)] * length, closed="right")
        expected = Series(single_bin).astype(CDT(ordered=True))

    tm.assert_series_equal(result, expected)
172+
173+
174+
@pytest.mark.parametrize(
    "ser",
    [
        Series(DatetimeIndex(["20180101", NaT, "20180103"])),
        Series(TimedeltaIndex(["0 days", NaT, "2 days"])),
    ],
    ids=lambda x: str(x.dtype),
)
def test_qcut_nat(ser):
    """``NaT`` entries must come back as missing bins from ``qcut``."""
    # see gh-19768
    first, last = ser[0], ser[2]
    boundary = last - Day()

    # The lowest edge is expected one nanosecond below the first value.
    intervals = IntervalIndex.from_tuples(
        [(first - Nano(), boundary), np.nan, (boundary, last)]
    )
    expected = Series(Categorical(intervals, ordered=True))

    result = qcut(ser, 2)
    tm.assert_series_equal(result, expected)
191+
192+
193+
@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
def test_datetime_tz_qcut(bins):
    """``qcut`` on tz-aware datetimes returns tz-aware interval categories."""
    # see gh-19872
    tz = "US/Eastern"
    ser = Series(date_range("20130101", periods=3, tz=tz))

    # Consecutive pairs of these edges form the three expected intervals.
    edges = [
        Timestamp(stamp, tz=tz)
        for stamp in (
            "2012-12-31 23:59:59.999999999",
            "2013-01-01 16:00:00",
            "2013-01-02 08:00:00",
            "2013-01-03 00:00:00",
        )
    ]
    intervals = [Interval(left, right) for left, right in zip(edges[:-1], edges[1:])]
    expected = Series(IntervalIndex(intervals)).astype(CDT(ordered=True))

    result = qcut(ser, bins)
    tm.assert_series_equal(result, expected)
219+
220+
221+
@pytest.mark.parametrize(
    "arg,expected_bins",
    [
        (
            timedelta_range("1day", periods=3),
            TimedeltaIndex(["1 days", "2 days", "3 days"]),
        ),
        (
            date_range("20180101", periods=3),
            DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]),
        ),
    ],
)
def test_date_like_qcut_bins(arg, expected_bins):
    """Bins returned with ``retbins=True`` must match ``expected_bins``."""
    # see gh-19891
    _, result_bins = qcut(Series(arg), 2, retbins=True)
    tm.assert_index_equal(result_bins, expected_bins)
239+
240+
241+
@pytest.mark.parametrize("bins", [6, 7])
@pytest.mark.parametrize(
    "box, compare",
    [
        (Series, tm.assert_series_equal),
        (np.array, tm.assert_categorical_equal),
        (list, tm.assert_equal),
    ],
)
def test_qcut_bool_coercion_to_int(bins, box, compare):
    """Boolean input must be binned exactly like the matching 0/1 integers."""
    # issue 20303
    int_pattern = [0, 1, 1, 0, 1]
    bool_pattern = [False, True, True, False, True]

    expected = qcut(box(int_pattern * 10), bins, duplicates="drop")
    result = qcut(box(bool_pattern * 10), bins, duplicates="drop")
    compare(result, expected)

pandas/tests/reshape/test_qcut.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,13 +131,35 @@ def test_qcut_return_intervals():
131131

132132

133133
def test_qcut_labels_true():
134-
# issue 13318
134+
# GH 13318
135135
values = range(5)
136136
msg = "User desired bin labels must be passed in as an argument, not just `True`"
137137
with pytest.raises(ValueError, match=msg):
138138
qcut(values, 4, labels=True)
139139

140140

141+
@pytest.mark.parametrize("kwargs", [["a", "b", "c"], [0, 1, 2]])
def test_qcut_wrong_length_labels(kwargs):
    """Three labels for four quantile bins must raise a ``ValueError``.

    ``kwargs`` is the list of labels handed to ``qcut(..., labels=...)``.
    """
    # GH 13318
    msg = "Bin labels must be one fewer than the number of bin edges"

    with pytest.raises(ValueError, match=msg):
        qcut(range(10), 4, labels=kwargs)
148+
149+
150+
@pytest.mark.parametrize(
    "kwargs, expected",
    [
        (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)),
        (list(range(3)), Categorical([0, 1, 2], ordered=True)),
    ],
)
def test_qcut_list_like_labels(kwargs, expected):
    """List-like ``labels`` must become the categories of the result.

    ``kwargs`` is the list of labels handed to ``qcut(..., labels=...)`` and
    ``expected`` is the ordered Categorical the call should return.
    """
    # GH 13318
    # Three values in three bins: one value per bin, so the result is
    # compared element-for-element with ``expected``.
    values = range(3)
    result = qcut(values, 3, labels=kwargs)
    tm.assert_categorical_equal(result, expected)
161+
162+
141163
@pytest.mark.parametrize(
142164
"kwargs,msg",
143165
[

0 commit comments

Comments
 (0)