Skip to content

Commit c5e3e25

Browse files
authored
implement test_select_dtypes (#32250)
1 parent 786bfd9 commit c5e3e25

File tree

2 files changed

+329
-319
lines changed

2 files changed

+329
-319
lines changed
Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
from collections import OrderedDict
2+
3+
import numpy as np
4+
import pytest
5+
6+
import pandas as pd
7+
from pandas import DataFrame, Timestamp
8+
import pandas._testing as tm
9+
10+
11+
class TestSelectDtypes:
    """Tests for ``DataFrame.select_dtypes``.

    Each test builds its own small frame with one column per dtype family and
    checks which columns survive a given ``include``/``exclude`` combination.
    """

    def test_select_dtypes_include_using_list_like(self):
        """List-like ``include`` (optionally with ``exclude``) picks columns."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        # np.number is expected to match the timedelta column "k" as well
        result = df.select_dtypes(include=[np.number])
        expected = df[["b", "c", "d", "k"]]
        tm.assert_frame_equal(result, expected)

        result = df.select_dtypes(include=[np.number], exclude=["timedelta"])
        expected = df[["b", "c", "d"]]
        tm.assert_frame_equal(result, expected)

        result = df.select_dtypes(
            include=[np.number, "category"], exclude=["timedelta"]
        )
        expected = df[["b", "c", "d", "f"]]
        tm.assert_frame_equal(result, expected)

        # "datetime" and "datetime64" only match the tz-naive column
        result = df.select_dtypes(include=["datetime"])
        expected = df[["g"]]
        tm.assert_frame_equal(result, expected)

        result = df.select_dtypes(include=["datetime64"])
        expected = df[["g"]]
        tm.assert_frame_equal(result, expected)

        # tz-aware columns are selected through "datetimetz"
        result = df.select_dtypes(include=["datetimetz"])
        expected = df[["h", "i"]]
        tm.assert_frame_equal(result, expected)

        # selecting on "period" raises NotImplementedError with an empty message
        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include=["period"])

    def test_select_dtypes_exclude_using_list_like(self):
        """List-like ``exclude`` drops every numeric column."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
            }
        )
        result = df.select_dtypes(exclude=[np.number])
        expected = df[["a", "e"]]
        tm.assert_frame_equal(result, expected)

    def test_select_dtypes_exclude_include_using_list_like(self):
        """Tuple ``include`` and ``exclude`` can be combined in one call."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        exclude = (np.datetime64,)
        include = np.bool_, "integer"
        result = df.select_dtypes(include=include, exclude=exclude)
        expected = df[["b", "c", "e"]]
        tm.assert_frame_equal(result, expected)

        # explicit integer widths do not match the uint8 column "c"
        exclude = ("datetime",)
        include = "bool", "int64", "int32"
        result = df.select_dtypes(include=include, exclude=exclude)
        expected = df[["b", "e"]]
        tm.assert_frame_equal(result, expected)

    def test_select_dtypes_include_using_scalars(self):
        """A scalar ``include`` behaves like a one-element list."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        result = df.select_dtypes(include=np.number)
        expected = df[["b", "c", "d", "k"]]
        tm.assert_frame_equal(result, expected)

        result = df.select_dtypes(include="datetime")
        expected = df[["g"]]
        tm.assert_frame_equal(result, expected)

        result = df.select_dtypes(include="datetime64")
        expected = df[["g"]]
        tm.assert_frame_equal(result, expected)

        result = df.select_dtypes(include="category")
        expected = df[["f"]]
        tm.assert_frame_equal(result, expected)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include="period")

    def test_select_dtypes_exclude_using_scalars(self):
        """A scalar ``exclude`` behaves like a one-element list."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        result = df.select_dtypes(exclude=np.number)
        expected = df[["a", "e", "f", "g", "h", "i", "j"]]
        tm.assert_frame_equal(result, expected)

        result = df.select_dtypes(exclude="category")
        expected = df[["a", "b", "c", "d", "e", "g", "h", "i", "j", "k"]]
        tm.assert_frame_equal(result, expected)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(exclude="period")

    def test_select_dtypes_include_exclude_using_scalars(self):
        """Scalar ``include`` and scalar ``exclude`` can be combined."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        result = df.select_dtypes(include=np.number, exclude="floating")
        expected = df[["b", "c", "k"]]
        tm.assert_frame_equal(result, expected)

    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        """A scalar on one side may be paired with a list on the other."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        result = df.select_dtypes(
            include=np.number, exclude=["floating", "timedelta"]
        )
        expected = df[["b", "c"]]
        tm.assert_frame_equal(result, expected)

        result = df.select_dtypes(
            include=[np.number, "category"], exclude="floating"
        )
        expected = df[["b", "c", "f", "k"]]
        tm.assert_frame_equal(result, expected)

    def test_select_dtypes_duplicate_columns(self):
        """Selection works when column labels are duplicated."""
        # GH20839
        df = DataFrame(
            OrderedDict(
                [
                    ("a", list("abc")),
                    ("b", list(range(1, 4))),
                    ("c", np.arange(3, 6).astype("u1")),
                    ("d", np.arange(4.0, 7.0, dtype="float64")),
                    ("e", [True, False, True]),
                    ("f", pd.date_range("now", periods=3).values),
                ]
            )
        )
        # relabel so that several columns share the same name
        df.columns = ["a", "a", "b", "b", "b", "c"]

        # only the integer columns (second "a", first "b") should remain
        expected = DataFrame(
            {"a": list(range(1, 4)), "b": np.arange(3, 6).astype("u1")}
        )

        result = df.select_dtypes(include=[np.number], exclude=["floating"])
        tm.assert_frame_equal(result, expected)

    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        """Dtype strings that are not numpy attributes are still accepted."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        # differencing the datetime column yields a timedelta column
        df["g"] = df.f.diff()
        # sanity check: short typecodes such as "u8" are not attributes of numpy
        assert not hasattr(np, "u8")
        result = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"])
        expected = df[["a", "b"]]
        tm.assert_frame_equal(result, expected)

        result = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
        expected = df[["a", "b", "g"]]
        tm.assert_frame_equal(result, expected)

    def test_select_dtypes_empty(self):
        """Calling without ``include`` or ``exclude`` raises ValueError."""
        df = DataFrame({"a": list("abc"), "b": list(range(1, 4))})
        msg = "at least one of include or exclude must be nonempty"
        with pytest.raises(ValueError, match=msg):
            df.select_dtypes()

    def test_select_dtypes_bad_datetime64(self):
        """datetime64 dtypes with an explicit unit are rejected."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        with pytest.raises(ValueError, match=".+ is too specific"):
            df.select_dtypes(include=["datetime64[D]"])

        with pytest.raises(ValueError, match=".+ is too specific"):
            df.select_dtypes(exclude=["datetime64[as]"])

    def test_select_dtypes_datetime_with_tz(self):
        """``datetime64[ns]`` must not match tz-aware datetime columns."""
        df2 = DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        )
        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        result = df3.select_dtypes(include=["datetime64[ns]"])
        # same index as df3 but with no columns at all
        expected = df3.reindex(columns=[])
        tm.assert_frame_equal(result, expected)

    # np.bytes_ / np.str_ are the canonical names of the legacy
    # np.string_ / np.unicode_ aliases, which newer NumPy no longer provides.
    @pytest.mark.parametrize(
        "dtype", [str, "str", np.bytes_, "S1", "unicode", np.str_, "U1"]
    )
    @pytest.mark.parametrize("arg", ["include", "exclude"])
    def test_select_dtypes_str_raises(self, dtype, arg):
        """String dtypes passed as ``include`` or ``exclude`` raise TypeError."""
        df = DataFrame(
            {
                "a": list("abc"),
                "g": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        msg = "string dtypes are not allowed"
        # route the dtype to whichever keyword is being parametrized
        kwargs = {arg: [dtype]}

        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(**kwargs)

    def test_select_dtypes_bad_arg_raises(self):
        """An unparseable dtype string raises TypeError."""
        df = DataFrame(
            {
                "a": list("abc"),
                "g": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )

        msg = "data type.*not understood"
        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(["blargy, blarg, blarg"])

    def test_select_dtypes_typecodes(self):
        """Numpy float typecodes are accepted as ``include`` values."""
        # GH 11990
        # Seeded generator so that any failure is reproducible; only the
        # float dtype of the columns matters here, not the values or labels.
        rng = np.random.RandomState(2)
        df = DataFrame(rng.random_sample((30, 3)))
        expected = df
        float_typecodes = list(np.typecodes["AllFloat"])
        tm.assert_frame_equal(df.select_dtypes(float_typecodes), expected)

0 commit comments

Comments
 (0)