Skip to content

Commit 8f87907

Browse files
authored
BENCH: collect low-dependency asvs (#39917)
1 parent 497553d commit 8f87907

File tree

10 files changed

+291
-265
lines changed

10 files changed

+291
-265
lines changed

asv_bench/benchmarks/algorithms.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22

33
import numpy as np
44

5-
from pandas._libs import lib
6-
75
import pandas as pd
86

97
from .pandas_vb_common import tm
@@ -16,19 +14,6 @@
1614
pass
1715

1816

19-
class MaybeConvertObjects:
20-
def setup(self):
21-
N = 10 ** 5
22-
23-
data = list(range(N))
24-
data[0] = pd.NaT
25-
data = np.array(data)
26-
self.data = data
27-
28-
def time_maybe_convert_objects(self):
29-
lib.maybe_convert_objects(self.data)
30-
31-
3217
class Factorize:
3318

3419
params = [

asv_bench/benchmarks/attrs_caching.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,6 @@
33
import pandas as pd
44
from pandas import DataFrame
55

6-
try:
7-
from pandas.util import cache_readonly
8-
except ImportError:
9-
from pandas.util.decorators import cache_readonly
10-
116
try:
127
from pandas.core.construction import extract_array
138
except ImportError:
@@ -53,17 +48,4 @@ def time_extract_array_numpy(self, dtype):
5348
extract_array(self.series, extract_numpy=True)
5449

5550

56-
class CacheReadonly:
57-
def setup(self):
58-
class Foo:
59-
@cache_readonly
60-
def prop(self):
61-
return 5
62-
63-
self.obj = Foo()
64-
65-
def time_cache_readonly(self):
66-
self.obj.prop
67-
68-
6951
from .pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/dtypes.py

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from .pandas_vb_common import (
1414
datetime_dtypes,
1515
extension_dtypes,
16-
lib,
1716
numeric_dtypes,
1817
string_dtypes,
1918
)
@@ -49,27 +48,6 @@ def time_pandas_dtype_invalid(self, dtype):
4948
pass
5049

5150

52-
class InferDtypes:
53-
param_names = ["dtype"]
54-
data_dict = {
55-
"np-object": np.array([1] * 100000, dtype="O"),
56-
"py-object": [1] * 100000,
57-
"np-null": np.array([1] * 50000 + [np.nan] * 50000),
58-
"py-null": [1] * 50000 + [None] * 50000,
59-
"np-int": np.array([1] * 100000, dtype=int),
60-
"np-floating": np.array([1.0] * 100000, dtype=float),
61-
"empty": [],
62-
"bytes": [b"a"] * 100000,
63-
}
64-
params = list(data_dict.keys())
65-
66-
def time_infer_skipna(self, dtype):
67-
lib.infer_dtype(self.data_dict[dtype], skipna=True)
68-
69-
def time_infer(self, dtype):
70-
lib.infer_dtype(self.data_dict[dtype], skipna=False)
71-
72-
7351
class SelectDtypes:
7452

7553
params = [

asv_bench/benchmarks/gil.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ def time_take1d(self, dtype):
125125

126126

127127
class ParallelKth:
128+
# This depends exclusively on code in _libs/, could go in libs.py
128129

129130
number = 1
130131
repeat = 5

asv_bench/benchmarks/indexing_engines.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
"""
2+
Benchmarks in this file depend exclusively on code in _libs/
3+
4+
If a PR does not edit anything in _libs, it is very unlikely that benchmarks
5+
in this file will be affected.
6+
"""
7+
18
import numpy as np
29

310
from pandas._libs import index as libindex

asv_bench/benchmarks/inference.py

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,20 @@
1+
"""
2+
The functions benchmarked in this file depend _almost_ exclusively on
3+
_libs, but not in a way that is easy to formalize.
4+
5+
If a PR does not change anything in pandas/_libs/ or pandas/core/tools/, then
6+
it is likely that these benchmarks will be unaffected.
7+
"""
8+
19
import numpy as np
210

311
from pandas import (
12+
NaT,
413
Series,
14+
date_range,
15+
to_datetime,
516
to_numeric,
17+
to_timedelta,
618
)
719

820
from .pandas_vb_common import (
@@ -69,6 +81,9 @@ def time_downcast(self, dtype, downcast):
6981

7082

7183
class MaybeConvertNumeric:
84+
# maybe_convert_numeric depends _exclusively_ on _libs, could
85+
# go in benchmarks/libs.py
86+
7287
def setup_cache(self):
7388
N = 10 ** 6
7489
arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64")
@@ -81,4 +96,205 @@ def time_convert(self, data):
8196
lib.maybe_convert_numeric(data, set(), coerce_numeric=False)
8297

8398

99+
class MaybeConvertObjects:
100+
# maybe_convert_objects depends _almost_ exclusively on _libs, but
101+
# does have some run-time imports from outside of _libs
102+
103+
def setup(self):
104+
N = 10 ** 5
105+
106+
data = list(range(N))
107+
data[0] = NaT
108+
data = np.array(data)
109+
self.data = data
110+
111+
def time_maybe_convert_objects(self):
112+
lib.maybe_convert_objects(self.data)
113+
114+
115+
class ToDatetimeFromIntsFloats:
116+
def setup(self):
117+
self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
118+
self.ts_sec_float = self.ts_sec.astype("float64")
119+
120+
self.ts_nanosec = 1_000_000 * self.ts_sec
121+
self.ts_nanosec_float = self.ts_nanosec.astype("float64")
122+
123+
# speed of int64 and float64 paths should be comparable
124+
125+
def time_nanosec_int64(self):
126+
to_datetime(self.ts_nanosec, unit="ns")
127+
128+
def time_nanosec_float64(self):
129+
to_datetime(self.ts_nanosec_float, unit="ns")
130+
131+
def time_sec_int64(self):
132+
to_datetime(self.ts_sec, unit="s")
133+
134+
def time_sec_float64(self):
135+
to_datetime(self.ts_sec_float, unit="s")
136+
137+
138+
class ToDatetimeYYYYMMDD:
139+
def setup(self):
140+
rng = date_range(start="1/1/2000", periods=10000, freq="D")
141+
self.stringsD = Series(rng.strftime("%Y%m%d"))
142+
143+
def time_format_YYYYMMDD(self):
144+
to_datetime(self.stringsD, format="%Y%m%d")
145+
146+
147+
class ToDatetimeCacheSmallCount:
148+
149+
params = ([True, False], [50, 500, 5000, 100000])
150+
param_names = ["cache", "count"]
151+
152+
def setup(self, cache, count):
153+
rng = date_range(start="1/1/1971", periods=count)
154+
self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist()
155+
156+
def time_unique_date_strings(self, cache, count):
157+
to_datetime(self.unique_date_strings, cache=cache)
158+
159+
160+
class ToDatetimeISO8601:
161+
def setup(self):
162+
rng = date_range(start="1/1/2000", periods=20000, freq="H")
163+
self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
164+
self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist()
165+
self.strings_tz_space = [
166+
x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng
167+
]
168+
169+
def time_iso8601(self):
170+
to_datetime(self.strings)
171+
172+
def time_iso8601_nosep(self):
173+
to_datetime(self.strings_nosep)
174+
175+
def time_iso8601_format(self):
176+
to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S")
177+
178+
def time_iso8601_format_no_sep(self):
179+
to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S")
180+
181+
def time_iso8601_tz_spaceformat(self):
182+
to_datetime(self.strings_tz_space)
183+
184+
185+
class ToDatetimeNONISO8601:
186+
def setup(self):
187+
N = 10000
188+
half = N // 2
189+
ts_string_1 = "March 1, 2018 12:00:00+0400"
190+
ts_string_2 = "March 1, 2018 12:00:00+0500"
191+
self.same_offset = [ts_string_1] * N
192+
self.diff_offset = [ts_string_1] * half + [ts_string_2] * half
193+
194+
def time_same_offset(self):
195+
to_datetime(self.same_offset)
196+
197+
def time_different_offset(self):
198+
to_datetime(self.diff_offset)
199+
200+
201+
class ToDatetimeFormatQuarters:
202+
def setup(self):
203+
self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000)
204+
205+
def time_infer_quarter(self):
206+
to_datetime(self.s)
207+
208+
209+
class ToDatetimeFormat:
210+
def setup(self):
211+
N = 100000
212+
self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N)
213+
self.s2 = self.s.str.replace(":\\S+$", "")
214+
215+
self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N
216+
self.diff_offset = [
217+
f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10)
218+
] * (N // 10)
219+
220+
def time_exact(self):
221+
to_datetime(self.s2, format="%d%b%y")
222+
223+
def time_no_exact(self):
224+
to_datetime(self.s, format="%d%b%y", exact=False)
225+
226+
def time_same_offset(self):
227+
to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z")
228+
229+
def time_different_offset(self):
230+
to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z")
231+
232+
def time_same_offset_to_utc(self):
233+
to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)
234+
235+
def time_different_offset_to_utc(self):
236+
to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)
237+
238+
239+
class ToDatetimeCache:
240+
241+
params = [True, False]
242+
param_names = ["cache"]
243+
244+
def setup(self, cache):
245+
N = 10000
246+
self.unique_numeric_seconds = list(range(N))
247+
self.dup_numeric_seconds = [1000] * N
248+
self.dup_string_dates = ["2000-02-11"] * N
249+
self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N
250+
251+
def time_unique_seconds_and_unit(self, cache):
252+
to_datetime(self.unique_numeric_seconds, unit="s", cache=cache)
253+
254+
def time_dup_seconds_and_unit(self, cache):
255+
to_datetime(self.dup_numeric_seconds, unit="s", cache=cache)
256+
257+
def time_dup_string_dates(self, cache):
258+
to_datetime(self.dup_string_dates, cache=cache)
259+
260+
def time_dup_string_dates_and_format(self, cache):
261+
to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache)
262+
263+
def time_dup_string_tzoffset_dates(self, cache):
264+
to_datetime(self.dup_string_with_tz, cache=cache)
265+
266+
267+
class ToTimedelta:
268+
def setup(self):
269+
self.ints = np.random.randint(0, 60, size=10000)
270+
self.str_days = []
271+
self.str_seconds = []
272+
for i in self.ints:
273+
self.str_days.append(f"{i} days")
274+
self.str_seconds.append(f"00:00:{i:02d}")
275+
276+
def time_convert_int(self):
277+
to_timedelta(self.ints, unit="s")
278+
279+
def time_convert_string_days(self):
280+
to_timedelta(self.str_days)
281+
282+
def time_convert_string_seconds(self):
283+
to_timedelta(self.str_seconds)
284+
285+
286+
class ToTimedeltaErrors:
287+
288+
params = ["coerce", "ignore"]
289+
param_names = ["errors"]
290+
291+
def setup(self, errors):
292+
ints = np.random.randint(0, 60, size=10000)
293+
self.arr = [f"{i} days" for i in ints]
294+
self.arr[-1] = "apple"
295+
296+
def time_convert(self, errors):
297+
to_timedelta(self.arr, errors=errors)
298+
299+
84300
from .pandas_vb_common import setup # noqa: F401 isort:skip

0 commit comments

Comments
 (0)