Skip to content

Commit 62c87d1

Browse files
committed
attempt to improve performance
1 parent 76d8058 commit 62c87d1

File tree

2 files changed

+26
-15
lines changed

2 files changed

+26
-15
lines changed

pandas/core/tools/datetimes.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
4343
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
4444

4545

46-
def should_cache(arg, check_count: int, unique_share: float):
46+
def should_cache(arg, unique_share=0.7, check_count=None):
4747
"""
4848
Decides whether to do caching.
4949
@@ -53,23 +53,34 @@ def should_cache(arg, check_count: int, unique_share: float):
5353
Parameters
5454
----------
5555
arg: listlike, tuple, 1-d array, Series
56-
check_count: int
57-
0 <= check_count <= len(arg)
58-
unique_share: float
56+
unique_share: float or None
5957
0 < unique_share < 1
58+
check_count: int or None
59+
0 <= check_count <= len(arg)
6060
6161
Returns
6262
-------
6363
do_caching: bool
6464
"""
65-
assert 0 <= check_count <= len(arg), ('check_count must be in next bounds:'
66-
' [0; len(arg)]')
67-
assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
65+
do_caching = True
6866

69-
if check_count == 0:
70-
return False
67+
# default behaviour when check_count is not given
68+
if check_count is None:
69+
# in this case, the gain from caching is negligible
70+
if len(arg) <= 50:
71+
return False
7172

72-
do_caching = True
73+
if len(arg) <= 5000:
74+
check_count = int(len(arg) * 0.1)
75+
else:
76+
check_count = 500
77+
else:
78+
assert 0 <= check_count <= len(arg), \
79+
'check_count must be in next bounds: [0; len(arg)]'
80+
assert 0 < unique_share < 1, \
81+
'unique_share must be in next bounds: (0; 1)'
82+
if check_count == 0:
83+
return False
7384

7485
unique_elements = unique(arg[:check_count])
7586
if len(unique_elements) > check_count * unique_share:
@@ -102,7 +113,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):
102113
# Perform a quicker unique check
103114
from pandas import Index
104115

105-
if not should_cache(arg, int(len(arg) * 0.1), 0.7):
116+
if not should_cache(arg):
106117
return cache_array
107118

108119
unique_dates = Index(arg).unique()

pandas/tests/indexes/datetimes/test_tools.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2043,12 +2043,12 @@ def test_should_cache(listlike, do_caching):
20432043
unique_share=0.7) == do_caching
20442044

20452045

2046-
@pytest.mark.parametrize('check_count,unique_share, err_message', [
2047-
(11, 0.5, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
2046+
@pytest.mark.parametrize('unique_share,check_count, err_message', [
2047+
(0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
20482048
(10, 2, r'unique_share must be in next bounds: \(0; 1\)')
20492049
])
2050-
def test_should_cache_errors(check_count, unique_share, err_message):
2050+
def test_should_cache_errors(unique_share, check_count, err_message):
20512051
arg = [5] * 10
20522052

20532053
with pytest.raises(AssertionError, match=err_message):
2054-
tools.should_cache(arg, check_count, unique_share)
2054+
tools.should_cache(arg, unique_share, check_count)

0 commit comments

Comments
 (0)