
BENCH: collect low-dependency asvs #39917


Merged: 2 commits, Mar 1, 2021
15 changes: 0 additions & 15 deletions asv_bench/benchmarks/algorithms.py
@@ -2,8 +2,6 @@

import numpy as np

from pandas._libs import lib

import pandas as pd

from .pandas_vb_common import tm
@@ -16,19 +14,6 @@
pass


class MaybeConvertObjects:
def setup(self):
N = 10 ** 5

data = list(range(N))
data[0] = pd.NaT
data = np.array(data)
self.data = data

def time_maybe_convert_objects(self):
lib.maybe_convert_objects(self.data)


class Factorize:

params = [
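For orientation, every class in this diff follows asv's discovery conventions: methods prefixed with `time_` are collected and timed, and `setup` runs before each measurement. A minimal sketch of the pattern (not code from this PR):

```python
import numpy as np


class ExampleBenchmark:
    # asv calls setup() before timing each method in this class.
    def setup(self):
        self.arr = np.arange(10**5)

    # Any method whose name starts with "time_" is collected and timed.
    def time_sum(self):
        self.arr.sum()
```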
18 changes: 0 additions & 18 deletions asv_bench/benchmarks/attrs_caching.py
@@ -3,11 +3,6 @@
import pandas as pd
from pandas import DataFrame

try:
from pandas.util import cache_readonly
except ImportError:
from pandas.util.decorators import cache_readonly

try:
from pandas.core.construction import extract_array
except ImportError:
@@ -53,17 +48,4 @@ def time_extract_array_numpy(self, dtype):
extract_array(self.series, extract_numpy=True)


class CacheReadonly:
def setup(self):
class Foo:
@cache_readonly
def prop(self):
return 5

self.obj = Foo()

def time_cache_readonly(self):
self.obj.prop


from .pandas_vb_common import setup # noqa: F401 isort:skip
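The try/except guards in this file exist because asv runs the suite against historical commits, where modules may live in older locations. The removed `cache_readonly` block above shows the pattern; here it is again, annotated for clarity:

```python
try:
    # Location in newer pandas versions.
    from pandas.util import cache_readonly
except ImportError:
    # Fallback location on older commits that asv may still benchmark.
    from pandas.util.decorators import cache_readonly
```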
22 changes: 0 additions & 22 deletions asv_bench/benchmarks/dtypes.py
@@ -13,7 +13,6 @@
from .pandas_vb_common import (
datetime_dtypes,
extension_dtypes,
lib,
numeric_dtypes,
string_dtypes,
)
@@ -49,27 +48,6 @@ def time_pandas_dtype_invalid(self, dtype):
pass


class InferDtypes:
param_names = ["dtype"]
data_dict = {
"np-object": np.array([1] * 100000, dtype="O"),
"py-object": [1] * 100000,
"np-null": np.array([1] * 50000 + [np.nan] * 50000),
"py-null": [1] * 50000 + [None] * 50000,
"np-int": np.array([1] * 100000, dtype=int),
"np-floating": np.array([1.0] * 100000, dtype=float),
"empty": [],
"bytes": [b"a"] * 100000,
}
params = list(data_dict.keys())

def time_infer_skipna(self, dtype):
lib.infer_dtype(self.data_dict[dtype], skipna=True)

def time_infer(self, dtype):
lib.infer_dtype(self.data_dict[dtype], skipna=False)


class SelectDtypes:

params = [
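Classes such as InferDtypes and SelectDtypes rely on asv's parametrization: each entry in `params` is passed positionally to `setup` and to every timed method, and `param_names` labels that axis in the results. A minimal illustrative sketch (not from this PR):

```python
import numpy as np


class ParametrizedExample:
    # asv runs each time_* method once per entry in params.
    params = ["int64", "float64"]
    param_names = ["dtype"]

    def setup(self, dtype):
        self.arr = np.ones(10**5, dtype=dtype)

    def time_astype_object(self, dtype):
        self.arr.astype(object)
```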
1 change: 1 addition & 0 deletions asv_bench/benchmarks/gil.py
@@ -125,6 +125,7 @@ def time_take1d(self, dtype):


class ParallelKth:
# This depends exclusively on code in _libs/, could go in libs.py

number = 1
repeat = 5
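The `number` and `repeat` attributes on ParallelKth are asv timing controls: `number` is how many times the benchmark body runs per sample, and `repeat` is how many samples are collected. Setting `number = 1` keeps asv from autotuning the call count, which matters for expensive operations. A brief sketch:

```python
class ExpensiveExample:
    # One call per sample, five samples, instead of asv's autotuned defaults.
    number = 1
    repeat = 5

    def time_slow_operation(self):
        sum(range(10**7))
```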
7 changes: 7 additions & 0 deletions asv_bench/benchmarks/indexing_engines.py
@@ -1,3 +1,10 @@
"""
Benchmarks in this file depend exclusively on code in _libs/.

If a PR does not edit anything in _libs, it is very unlikely that benchmarks
in this file will be affected.
"""

import numpy as np

from pandas._libs import index as libindex
216 changes: 216 additions & 0 deletions asv_bench/benchmarks/inference.py
@@ -1,8 +1,20 @@
"""
The functions benchmarked in this file depend _almost_ exclusively on
_libs, but not in a way that is easy to formalize.

If a PR does not change anything in pandas/_libs/ or pandas/core/tools/, then
it is likely that these benchmarks will be unaffected.
"""

import numpy as np

from pandas import (
NaT,
Series,
date_range,
to_datetime,
to_numeric,
to_timedelta,
)

from .pandas_vb_common import (
@@ -69,6 +81,9 @@ def time_downcast(self, dtype, downcast):


class MaybeConvertNumeric:
# maybe_convert_numeric depends _exclusively_ on _libs, could
# go in benchmarks/libs.py

def setup_cache(self):
N = 10 ** 6
arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64")
@@ -81,4 +96,205 @@ def time_convert(self, data):
lib.maybe_convert_numeric(data, set(), coerce_numeric=False)


class MaybeConvertObjects:
# maybe_convert_objects depends _almost_ exclusively on _libs, but
# does have some run-time imports from outside of _libs

def setup(self):
N = 10 ** 5

data = list(range(N))
data[0] = NaT
data = np.array(data)
self.data = data

def time_maybe_convert_objects(self):
lib.maybe_convert_objects(self.data)


class ToDatetimeFromIntsFloats:
def setup(self):
self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
self.ts_sec_float = self.ts_sec.astype("float64")

self.ts_nanosec = 1_000_000 * self.ts_sec
self.ts_nanosec_float = self.ts_nanosec.astype("float64")

# speed of int64 and float64 paths should be comparable

def time_nanosec_int64(self):
to_datetime(self.ts_nanosec, unit="ns")

def time_nanosec_float64(self):
to_datetime(self.ts_nanosec_float, unit="ns")

def time_sec_int64(self):
to_datetime(self.ts_sec, unit="s")

def time_sec_float64(self):
to_datetime(self.ts_sec_float, unit="s")


class ToDatetimeYYYYMMDD:
def setup(self):
rng = date_range(start="1/1/2000", periods=10000, freq="D")
self.stringsD = Series(rng.strftime("%Y%m%d"))

def time_format_YYYYMMDD(self):
to_datetime(self.stringsD, format="%Y%m%d")


class ToDatetimeCacheSmallCount:

params = ([True, False], [50, 500, 5000, 100000])
param_names = ["cache", "count"]

def setup(self, cache, count):
rng = date_range(start="1/1/1971", periods=count)
self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist()

def time_unique_date_strings(self, cache, count):
to_datetime(self.unique_date_strings, cache=cache)


class ToDatetimeISO8601:
def setup(self):
rng = date_range(start="1/1/2000", periods=20000, freq="H")
self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist()
self.strings_tz_space = [
x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng
]

def time_iso8601(self):
to_datetime(self.strings)

def time_iso8601_nosep(self):
to_datetime(self.strings_nosep)

def time_iso8601_format(self):
to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S")

def time_iso8601_format_no_sep(self):
to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S")

def time_iso8601_tz_spaceformat(self):
to_datetime(self.strings_tz_space)


class ToDatetimeNONISO8601:
def setup(self):
N = 10000
half = N // 2
ts_string_1 = "March 1, 2018 12:00:00+0400"
ts_string_2 = "March 1, 2018 12:00:00+0500"
self.same_offset = [ts_string_1] * N
self.diff_offset = [ts_string_1] * half + [ts_string_2] * half

def time_same_offset(self):
to_datetime(self.same_offset)

def time_different_offset(self):
to_datetime(self.diff_offset)


class ToDatetimeFormatQuarters:
def setup(self):
self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000)

def time_infer_quarter(self):
to_datetime(self.s)


class ToDatetimeFormat:
def setup(self):
N = 100000
self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N)
self.s2 = self.s.str.replace(":\\S+$", "")

self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N
self.diff_offset = [
f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10)
] * (N // 10)

def time_exact(self):
to_datetime(self.s2, format="%d%b%y")

def time_no_exact(self):
to_datetime(self.s, format="%d%b%y", exact=False)

def time_same_offset(self):
to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z")

def time_different_offset(self):
to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z")

def time_same_offset_to_utc(self):
to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)

def time_different_offset_to_utc(self):
to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)


class ToDatetimeCache:

params = [True, False]
param_names = ["cache"]

def setup(self, cache):
N = 10000
self.unique_numeric_seconds = list(range(N))
self.dup_numeric_seconds = [1000] * N
self.dup_string_dates = ["2000-02-11"] * N
self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N

def time_unique_seconds_and_unit(self, cache):
to_datetime(self.unique_numeric_seconds, unit="s", cache=cache)

def time_dup_seconds_and_unit(self, cache):
to_datetime(self.dup_numeric_seconds, unit="s", cache=cache)

def time_dup_string_dates(self, cache):
to_datetime(self.dup_string_dates, cache=cache)

def time_dup_string_dates_and_format(self, cache):
to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache)

def time_dup_string_tzoffset_dates(self, cache):
to_datetime(self.dup_string_with_tz, cache=cache)


class ToTimedelta:
def setup(self):
self.ints = np.random.randint(0, 60, size=10000)
self.str_days = []
self.str_seconds = []
for i in self.ints:
self.str_days.append(f"{i} days")
self.str_seconds.append(f"00:00:{i:02d}")

def time_convert_int(self):
to_timedelta(self.ints, unit="s")

def time_convert_string_days(self):
to_timedelta(self.str_days)

def time_convert_string_seconds(self):
to_timedelta(self.str_seconds)


class ToTimedeltaErrors:

params = ["coerce", "ignore"]
param_names = ["errors"]

def setup(self, errors):
ints = np.random.randint(0, 60, size=10000)
self.arr = [f"{i} days" for i in ints]
self.arr[-1] = "apple"

def time_convert(self, errors):
to_timedelta(self.arr, errors=errors)


from .pandas_vb_common import setup # noqa: F401 isort:skip
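One nuance worth noting in MaybeConvertNumeric above: `setup_cache` is not `setup`. asv runs `setup_cache` once per environment and passes its return value as the first argument to each timed method, which is why `time_convert` receives `data` instead of reading from `self`. A minimal sketch of the mechanism (not from this PR):

```python
class CachedSetupExample:
    def setup_cache(self):
        # Runs once; the return value is cached by asv and passed to
        # the methods below as their first argument.
        return list(range(10**6))

    def time_sum(self, data):
        sum(data)
```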