From 8d5aedaf90ae519852780aa9801fa5f45517dc4b Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 19 Feb 2021 11:58:54 -0800
Subject: [PATCH] BENCH: collect low-dependency asvs

---
 asv_bench/benchmarks/algorithms.py       |  15 --
 asv_bench/benchmarks/attrs_caching.py    |  18 --
 asv_bench/benchmarks/dtypes.py           |  22 ---
 asv_bench/benchmarks/gil.py              |   1 +
 asv_bench/benchmarks/indexing_engines.py |   7 +
 asv_bench/benchmarks/inference.py        | 216 +++++++++++++++++++++++
 asv_bench/benchmarks/libs.py             |  66 ++++++-
 asv_bench/benchmarks/reindex.py          |  20 +--
 asv_bench/benchmarks/timedelta.py        |  36 ----
 asv_bench/benchmarks/timeseries.py       | 155 +---------------
 10 files changed, 291 insertions(+), 265 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 65e52e03c43c7..823daa2e31529 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -2,8 +2,6 @@
 
 import numpy as np
 
-from pandas._libs import lib
-
 import pandas as pd
 
 from .pandas_vb_common import tm
@@ -16,19 +14,6 @@
     pass
 
 
-class MaybeConvertObjects:
-    def setup(self):
-        N = 10 ** 5
-
-        data = list(range(N))
-        data[0] = pd.NaT
-        data = np.array(data)
-        self.data = data
-
-    def time_maybe_convert_objects(self):
-        lib.maybe_convert_objects(self.data)
-
-
 class Factorize:
 
     params = [
diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py
index 9c7b107b478d4..d4366c42f96aa 100644
--- a/asv_bench/benchmarks/attrs_caching.py
+++ b/asv_bench/benchmarks/attrs_caching.py
@@ -3,11 +3,6 @@
 import pandas as pd
 from pandas import DataFrame
 
-try:
-    from pandas.util import cache_readonly
-except ImportError:
-    from pandas.util.decorators import cache_readonly
-
 try:
     from pandas.core.construction import extract_array
 except ImportError:
@@ -53,17 +48,4 @@
         extract_array(self.series, extract_numpy=True)
 
 
-class CacheReadonly:
-    def setup(self):
-        class Foo:
-            @cache_readonly
-            def prop(self):
-                return 5
-
-        self.obj = Foo()
-
-    def time_cache_readonly(self):
-        self.obj.prop
-
-
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py
index 9209e851289bb..c561b80ed1ca6 100644
--- a/asv_bench/benchmarks/dtypes.py
+++ b/asv_bench/benchmarks/dtypes.py
@@ -13,7 +13,6 @@
 from .pandas_vb_common import (
     datetime_dtypes,
     extension_dtypes,
-    lib,
     numeric_dtypes,
     string_dtypes,
 )
@@ -49,27 +48,6 @@
         pass
 
 
-class InferDtypes:
-    param_names = ["dtype"]
-    data_dict = {
-        "np-object": np.array([1] * 100000, dtype="O"),
-        "py-object": [1] * 100000,
-        "np-null": np.array([1] * 50000 + [np.nan] * 50000),
-        "py-null": [1] * 50000 + [None] * 50000,
-        "np-int": np.array([1] * 100000, dtype=int),
-        "np-floating": np.array([1.0] * 100000, dtype=float),
-        "empty": [],
-        "bytes": [b"a"] * 100000,
-    }
-    params = list(data_dict.keys())
-
-    def time_infer_skipna(self, dtype):
-        lib.infer_dtype(self.data_dict[dtype], skipna=True)
-
-    def time_infer(self, dtype):
-        lib.infer_dtype(self.data_dict[dtype], skipna=False)
-
-
 class SelectDtypes:
 
     params = [
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
index 410668ca3c7cf..459046d2decfb 100644
--- a/asv_bench/benchmarks/gil.py
+++ b/asv_bench/benchmarks/gil.py
@@ -125,6 +125,7 @@ def time_take1d(self, dtype):
 
 
 class ParallelKth:
+    # This depends exclusively on code in _libs/, could go in libs.py
     number = 1
     repeat = 5
 
diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
index 44a22dfa77791..30ef7f63dc0dc 100644
--- a/asv_bench/benchmarks/indexing_engines.py
+++ b/asv_bench/benchmarks/indexing_engines.py
@@ -1,3 +1,10 @@
+"""
+Benchmarks in this file depend exclusively on code in _libs/.
+
+If a PR does not edit anything in _libs, it is very unlikely that benchmarks
+in this file will be affected.
+"""
+
 import numpy as np
 
 from pandas._libs import index as libindex
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
index b6808ace629db..0aa924dabd469 100644
--- a/asv_bench/benchmarks/inference.py
+++ b/asv_bench/benchmarks/inference.py
@@ -1,8 +1,20 @@
+"""
+The functions benchmarked in this file depend _almost_ exclusively on
+_libs, but not in a way that is easy to formalize.
+
+If a PR does not change anything in pandas/_libs/ or pandas/core/tools/, then
+it is likely that these benchmarks will be unaffected.
+"""
+
 import numpy as np
 
 from pandas import (
+    NaT,
     Series,
+    date_range,
+    to_datetime,
     to_numeric,
+    to_timedelta,
 )
 
 from .pandas_vb_common import (
@@ -69,6 +81,9 @@ def time_downcast(self, dtype, downcast):
 
 
 class MaybeConvertNumeric:
+    # maybe_convert_numeric depends _exclusively_ on _libs, could
+    # go in benchmarks/libs.py
+
     def setup_cache(self):
         N = 10 ** 6
         arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64")
@@ -81,4 +96,205 @@ def time_convert(self, data):
         lib.maybe_convert_numeric(data, set(), coerce_numeric=False)
 
 
+class MaybeConvertObjects:
+    # maybe_convert_objects depends _almost_ exclusively on _libs, but
+    # does have some run-time imports from outside of _libs
+
+    def setup(self):
+        N = 10 ** 5
+
+        data = list(range(N))
+        data[0] = NaT
+        data = np.array(data)
+        self.data = data
+
+    def time_maybe_convert_objects(self):
+        lib.maybe_convert_objects(self.data)
+
+
+class ToDatetimeFromIntsFloats:
+    def setup(self):
+        self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
+        self.ts_sec_float = self.ts_sec.astype("float64")
+
+        self.ts_nanosec = 1_000_000 * self.ts_sec
+        self.ts_nanosec_float = self.ts_nanosec.astype("float64")
+
+    # speed of int64 and float64 paths should be comparable
+
+    def time_nanosec_int64(self):
+        to_datetime(self.ts_nanosec, unit="ns")
+
+    def time_nanosec_float64(self):
+        to_datetime(self.ts_nanosec_float, unit="ns")
+
+    def time_sec_int64(self):
+        to_datetime(self.ts_sec, unit="s")
+
+    def time_sec_float64(self):
+        to_datetime(self.ts_sec_float, unit="s")
+
+
+class ToDatetimeYYYYMMDD:
+    def setup(self):
+        rng = date_range(start="1/1/2000", periods=10000, freq="D")
+        self.stringsD = Series(rng.strftime("%Y%m%d"))
+
+    def time_format_YYYYMMDD(self):
+        to_datetime(self.stringsD, format="%Y%m%d")
+
+
+class ToDatetimeCacheSmallCount:
+
+    params = ([True, False], [50, 500, 5000, 100000])
+    param_names = ["cache", "count"]
+
+    def setup(self, cache, count):
+        rng = date_range(start="1/1/1971", periods=count)
+        self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist()
+
+    def time_unique_date_strings(self, cache, count):
+        to_datetime(self.unique_date_strings, cache=cache)
+
+
+class ToDatetimeISO8601:
+    def setup(self):
+        rng = date_range(start="1/1/2000", periods=20000, freq="H")
+        self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
+        self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist()
+        self.strings_tz_space = [
+            x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng
+        ]
+
+    def time_iso8601(self):
+        to_datetime(self.strings)
+
+    def time_iso8601_nosep(self):
+        to_datetime(self.strings_nosep)
+
+    def time_iso8601_format(self):
+        to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S")
+
+    def time_iso8601_format_no_sep(self):
+        to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S")
+
+    def time_iso8601_tz_spaceformat(self):
+        to_datetime(self.strings_tz_space)
+
+
+class ToDatetimeNONISO8601:
+    def setup(self):
+        N = 10000
+        half = N // 2
+        ts_string_1 = "March 1, 2018 12:00:00+0400"
+        ts_string_2 = "March 1, 2018 12:00:00+0500"
+        self.same_offset = [ts_string_1] * N
+        self.diff_offset = [ts_string_1] * half + [ts_string_2] * half
+
+    def time_same_offset(self):
+        to_datetime(self.same_offset)
+
+    def time_different_offset(self):
+        to_datetime(self.diff_offset)
+
+
+class ToDatetimeFormatQuarters:
+    def setup(self):
+        self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000)
+
+    def time_infer_quarter(self):
+        to_datetime(self.s)
+
+
+class ToDatetimeFormat:
+    def setup(self):
+        N = 100000
+        self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N)
+        self.s2 = self.s.str.replace(":\\S+$", "")
+
+        self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N
+        self.diff_offset = [
+            f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10)
+        ] * (N // 10)
+
+    def time_exact(self):
+        to_datetime(self.s2, format="%d%b%y")
+
+    def time_no_exact(self):
+        to_datetime(self.s, format="%d%b%y", exact=False)
+
+    def time_same_offset(self):
+        to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z")
+
+    def time_different_offset(self):
+        to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z")
+
+    def time_same_offset_to_utc(self):
+        to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)
+
+    def time_different_offset_to_utc(self):
+        to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)
+
+
+class ToDatetimeCache:
+
+    params = [True, False]
+    param_names = ["cache"]
+
+    def setup(self, cache):
+        N = 10000
+        self.unique_numeric_seconds = list(range(N))
+        self.dup_numeric_seconds = [1000] * N
+        self.dup_string_dates = ["2000-02-11"] * N
+        self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N
+
+    def time_unique_seconds_and_unit(self, cache):
+        to_datetime(self.unique_numeric_seconds, unit="s", cache=cache)
+
+    def time_dup_seconds_and_unit(self, cache):
+        to_datetime(self.dup_numeric_seconds, unit="s", cache=cache)
+
+    def time_dup_string_dates(self, cache):
+        to_datetime(self.dup_string_dates, cache=cache)
+
+    def time_dup_string_dates_and_format(self, cache):
+        to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache)
+
+    def time_dup_string_tzoffset_dates(self, cache):
+        to_datetime(self.dup_string_with_tz, cache=cache)
+
+
+class ToTimedelta:
+    def setup(self):
+        self.ints = np.random.randint(0, 60, size=10000)
+        self.str_days = []
+        self.str_seconds = []
+        for i in self.ints:
+            self.str_days.append(f"{i} days")
+            self.str_seconds.append(f"00:00:{i:02d}")
+
+    def time_convert_int(self):
+        to_timedelta(self.ints, unit="s")
+
+    def time_convert_string_days(self):
+        to_timedelta(self.str_days)
+
+    def time_convert_string_seconds(self):
+        to_timedelta(self.str_seconds)
+
+
+class ToTimedeltaErrors:
+
+    params = ["coerce", "ignore"]
+    param_names = ["errors"]
+
+    def setup(self, errors):
+        ints = np.random.randint(0, 60, size=10000)
+        self.arr = [f"{i} days" for i in ints]
+        self.arr[-1] = "apple"
+
+    def time_convert(self, errors):
+        to_timedelta(self.arr, errors=errors)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py
index f5c2397945cea..4e3f938a33eb1 100644
--- a/asv_bench/benchmarks/libs.py
+++ b/asv_bench/benchmarks/libs.py
@@ -1,10 +1,14 @@
 """
 Benchmarks for code in pandas/_libs, excluding pandas/_libs/tslibs,
-which has its own directory
+which has its own directory.
+
+If a PR does not edit anything in _libs/, then it is unlikely that these
+benchmarks will be affected.
 """
 import numpy as np
 
 from pandas._libs.lib import (
+    infer_dtype,
     is_list_like,
     is_scalar,
 )
@@ -14,6 +18,17 @@
     NaT,
 )
 
+from .pandas_vb_common import (
+    lib,
+    tm,
+)
+
+try:
+    from pandas.util import cache_readonly
+except ImportError:
+    from pandas.util.decorators import cache_readonly
+
+
 # TODO: share with something in pd._testing?
 scalars = [
     0,
@@ -40,3 +55,52 @@ def time_is_list_like(self, param):
 
     def time_is_scalar(self, param):
         is_scalar(param)
+
+
+class FastZip:
+    def setup(self):
+        N = 10000
+        K = 10
+        key1 = tm.makeStringIndex(N).values.repeat(K)
+        key2 = tm.makeStringIndex(N).values.repeat(K)
+        col_array = np.vstack([key1, key2, np.random.randn(N * K)])
+        col_array2 = col_array.copy()
+        col_array2[:, :10000] = np.nan
+        self.col_array_list = list(col_array)
+
+    def time_lib_fast_zip(self):
+        lib.fast_zip(self.col_array_list)
+
+
+class InferDtype:
+    param_names = ["dtype"]
+    data_dict = {
+        "np-object": np.array([1] * 100000, dtype="O"),
+        "py-object": [1] * 100000,
+        "np-null": np.array([1] * 50000 + [np.nan] * 50000),
+        "py-null": [1] * 50000 + [None] * 50000,
+        "np-int": np.array([1] * 100000, dtype=int),
+        "np-floating": np.array([1.0] * 100000, dtype=float),
+        "empty": [],
+        "bytes": [b"a"] * 100000,
+    }
+    params = list(data_dict.keys())
+
+    def time_infer_dtype_skipna(self, dtype):
+        infer_dtype(self.data_dict[dtype], skipna=True)
+
+    def time_infer_dtype(self, dtype):
+        infer_dtype(self.data_dict[dtype], skipna=False)
+
+
+class CacheReadonly:
+    def setup(self):
+        class Foo:
+            @cache_readonly
+            def prop(self):
+                return 5
+
+        self.obj = Foo()
+
+    def time_cache_readonly(self):
+        self.obj.prop
diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py
index 65392f2cea65b..5181b983c9f7a 100644
--- a/asv_bench/benchmarks/reindex.py
+++ b/asv_bench/benchmarks/reindex.py
@@ -9,10 +9,7 @@
     period_range,
 )
 
-from .pandas_vb_common import (
-    lib,
-    tm,
-)
+from .pandas_vb_common import tm
 
 
 class Reindex:
@@ -155,19 +152,4 @@
         self.x + self.y
 
 
-class LibFastZip:
-    def setup(self):
-        N = 10000
-        K = 10
-        key1 = tm.makeStringIndex(N).values.repeat(K)
-        key2 = tm.makeStringIndex(N).values.repeat(K)
-        col_array = np.vstack([key1, key2, np.random.randn(N * K)])
-        col_array2 = col_array.copy()
-        col_array2[:, :10000] = np.nan
-        self.col_array_list = list(col_array)
-
-    def time_lib_fast_zip(self):
-        lib.fast_zip(self.col_array_list)
-
-
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py
index 9e221ee030e6d..cb0e4455e1a56 100644
--- a/asv_bench/benchmarks/timedelta.py
+++ b/asv_bench/benchmarks/timedelta.py
@@ -3,49 +3,13 @@
 benchmarks.tslibs.timedelta for benchmarks that rely only on tslibs.
 """
 
-import numpy as np
-
 from pandas import (
     DataFrame,
     Series,
     timedelta_range,
-    to_timedelta,
 )
 
 
-class ToTimedelta:
-    def setup(self):
-        self.ints = np.random.randint(0, 60, size=10000)
-        self.str_days = []
-        self.str_seconds = []
-        for i in self.ints:
-            self.str_days.append(f"{i} days")
-            self.str_seconds.append(f"00:00:{i:02d}")
-
-    def time_convert_int(self):
-        to_timedelta(self.ints, unit="s")
-
-    def time_convert_string_days(self):
-        to_timedelta(self.str_days)
-
-    def time_convert_string_seconds(self):
-        to_timedelta(self.str_seconds)
-
-
-class ToTimedeltaErrors:
-
-    params = ["coerce", "ignore"]
-    param_names = ["errors"]
-
-    def setup(self, errors):
-        ints = np.random.randint(0, 60, size=10000)
-        self.arr = [f"{i} days" for i in ints]
-        self.arr[-1] = "apple"
-
-    def time_convert(self, errors):
-        to_timedelta(self.arr, errors=errors)
-
-
 class DatetimeAccessor:
     def setup_cache(self):
         N = 100000
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index 94498e54f0f06..51081db86b76e 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -9,7 +9,6 @@
     date_range,
     period_range,
     timedelta_range,
-    to_datetime,
 )
 
 from pandas.tseries.frequencies import infer_freq
@@ -102,7 +101,7 @@ def time_reest_datetimeindex(self, tz):
 
 
 class InferFreq:
-
+    # This depends mostly on code in _libs/, tseries/, and core.algos.unique
     params = [None, "D", "B"]
    param_names = ["freq"]
 
@@ -273,158 +272,6 @@ def time_lookup_and_cleanup(self):
         self.ts.index._cleanup()
 
 
-class ToDatetimeFromIntsFloats:
-    def setup(self):
-        self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
-        self.ts_sec_float = self.ts_sec.astype("float64")
-
-        self.ts_nanosec = 1_000_000 * self.ts_sec
-        self.ts_nanosec_float = self.ts_nanosec.astype("float64")
-
-    # speed of int64 and float64 paths should be comparable
-
-    def time_nanosec_int64(self):
-        to_datetime(self.ts_nanosec, unit="ns")
-
-    def time_nanosec_float64(self):
-        to_datetime(self.ts_nanosec_float, unit="ns")
-
-    def time_sec_int64(self):
-        to_datetime(self.ts_sec, unit="s")
-
-    def time_sec_float64(self):
-        to_datetime(self.ts_sec_float, unit="s")
-
-
-class ToDatetimeYYYYMMDD:
-    def setup(self):
-        rng = date_range(start="1/1/2000", periods=10000, freq="D")
-        self.stringsD = Series(rng.strftime("%Y%m%d"))
-
-    def time_format_YYYYMMDD(self):
-        to_datetime(self.stringsD, format="%Y%m%d")
-
-
-class ToDatetimeCacheSmallCount:
-
-    params = ([True, False], [50, 500, 5000, 100000])
-    param_names = ["cache", "count"]
-
-    def setup(self, cache, count):
-        rng = date_range(start="1/1/1971", periods=count)
-        self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist()
-
-    def time_unique_date_strings(self, cache, count):
-        to_datetime(self.unique_date_strings, cache=cache)
-
-
-class ToDatetimeISO8601:
-    def setup(self):
-        rng = date_range(start="1/1/2000", periods=20000, freq="H")
-        self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
-        self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist()
-        self.strings_tz_space = [
-            x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng
-        ]
-
-    def time_iso8601(self):
-        to_datetime(self.strings)
-
-    def time_iso8601_nosep(self):
-        to_datetime(self.strings_nosep)
-
-    def time_iso8601_format(self):
-        to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S")
-
-    def time_iso8601_format_no_sep(self):
-        to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S")
-
-    def time_iso8601_tz_spaceformat(self):
-        to_datetime(self.strings_tz_space)
-
-
-class ToDatetimeNONISO8601:
-    def setup(self):
-        N = 10000
-        half = N // 2
-        ts_string_1 = "March 1, 2018 12:00:00+0400"
-        ts_string_2 = "March 1, 2018 12:00:00+0500"
-        self.same_offset = [ts_string_1] * N
-        self.diff_offset = [ts_string_1] * half + [ts_string_2] * half
-
-    def time_same_offset(self):
-        to_datetime(self.same_offset)
-
-    def time_different_offset(self):
-        to_datetime(self.diff_offset)
-
-
-class ToDatetimeFormatQuarters:
-    def setup(self):
-        self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000)
-
-    def time_infer_quarter(self):
-        to_datetime(self.s)
-
-
-class ToDatetimeFormat:
-    def setup(self):
-        N = 100000
-        self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N)
-        self.s2 = self.s.str.replace(":\\S+$", "")
-
-        self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N
-        self.diff_offset = [
-            f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10)
-        ] * (N // 10)
-
-    def time_exact(self):
-        to_datetime(self.s2, format="%d%b%y")
-
-    def time_no_exact(self):
-        to_datetime(self.s, format="%d%b%y", exact=False)
-
-    def time_same_offset(self):
-        to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z")
-
-    def time_different_offset(self):
-        to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z")
-
-    def time_same_offset_to_utc(self):
-        to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)
-
-    def time_different_offset_to_utc(self):
-        to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)
-
-
-class ToDatetimeCache:
-
-    params = [True, False]
-    param_names = ["cache"]
-
-    def setup(self, cache):
-        N = 10000
-        self.unique_numeric_seconds = list(range(N))
-        self.dup_numeric_seconds = [1000] * N
-        self.dup_string_dates = ["2000-02-11"] * N
-        self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N
-
-    def time_unique_seconds_and_unit(self, cache):
-        to_datetime(self.unique_numeric_seconds, unit="s", cache=cache)
-
-    def time_dup_seconds_and_unit(self, cache):
-        to_datetime(self.dup_numeric_seconds, unit="s", cache=cache)
-
-    def time_dup_string_dates(self, cache):
-        to_datetime(self.dup_string_dates, cache=cache)
-
-    def time_dup_string_dates_and_format(self, cache):
-        to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache)
-
-    def time_dup_string_tzoffset_dates(self, cache):
-        to_datetime(self.dup_string_with_tz, cache=cache)
-
-
 class DatetimeAccessor:
 
     params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()]