Skip to content

Commit 8ef4803

Browse files
committed
added heuristic to decrease slowdowns for unique arrays
1 parent f3d8365 commit 8ef4803

File tree

1 file changed

+29
-0
lines changed

1 file changed

+29
-0
lines changed

pandas/core/tools/datetimes.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,31 @@ def _guess_datetime_format_for_array(arr, **kwargs):
4242
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
4343

4444

45+
def do_cache(arg, check_count: int, unique_share: float) -> bool:
    """
    Decide whether caching should be performed.

    Only the first `check_count` elements of `arg` are inspected. If the
    number of unique values among them is at most
    `check_count * unique_share`, caching is allowed.

    Parameters
    ----------
    arg : list, tuple, 1-d array, Series
        Values to sample from.
    check_count : int
        Number of leading elements of `arg` to inspect.
    unique_share : float
        Largest share of unique values in the sample for which caching
        is still allowed.

    Returns
    -------
    bool
        False if the sample holds more than `check_count * unique_share`
        unique values, True otherwise. An empty sample (for example
        `check_count == 0`) yields True.
    """
    from pandas.core.algorithms import unique

    # Keep the imported function and its result under separate names so
    # `unique` does not change meaning halfway through the function.
    unique_elements = unique(arg[:check_count])
    return len(unique_elements) <= check_count * unique_share
68+
69+
4570
def _maybe_cache(arg, format, cache, convert_listlike):
4671
"""
4772
Create a cache of unique dates from an array of dates
@@ -66,6 +91,10 @@ def _maybe_cache(arg, format, cache, convert_listlike):
6691
if cache:
6792
# Perform a quicker unique check
6893
from pandas import Index
94+
95+
if not do_cache(arg, int(len(arg) * 0.1), 0.7):
96+
return cache_array
97+
6998
unique_dates = Index(arg).unique()
7099
if len(unique_dates) < len(arg):
71100
cache_dates = convert_listlike(unique_dates.to_numpy(),

0 commit comments

Comments
 (0)