Skip to content

Commit c213523

Browse files
committed
Merge branch 'vbench_for_groupby' of https://github.com/dlovell/pandas into dlovell-vbench_for_groupby
2 parents e871245 + c494c03 commit c213523

File tree

2 files changed

+76
-0
lines changed

2 files changed

+76
-0
lines changed

doc/source/v0.15.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,7 @@ Performance
831831
- Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
832832
- Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
833833
- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
834+
- Performance benchmarking of groupby for large values of ``ngroups`` (:issue:`6787`)
834835

835836

836837

vb_suite/groupby.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,3 +484,78 @@ def f(g):
484484

485485
groupby_agg_builtins1 = Benchmark("df.groupby('jim').agg([sum, min, max])", setup)
486486
groupby_agg_builtins2 = Benchmark("df.groupby(['jim', 'joe']).agg([sum, min, max])", setup)
487+
488+
#----------------------------------------------------------------------
489+
# groupby with a variable value for ngroups
490+
491+
492+
# Values substituted for ``ngroups`` in the generated benchmark setup code.
ngroups_list = [100, 10000]

# Names of the groupby methods to benchmark; each is called with an empty
# argument list when the benchmarks are generated below.
no_arg_func_list = [
    'all', 'any', 'count', 'cumcount', 'cummax', 'cummin', 'cumprod',
    'cumsum', 'describe', 'diff', 'first', 'head', 'last', 'mad', 'max',
    'mean', 'median', 'min', 'nunique', 'pct_change', 'prod', 'rank',
    'sem', 'size', 'skew', 'std', 'sum', 'tail', 'unique', 'var',
    'value_counts',
]
526+
527+
528+
# Benchmarked statement: group the frame by its 'value' column, select the
# 'timestamp' column, then apply the method call substituted for %s.
_stmt_template = "df.groupby('value')['timestamp'].%s"
# Setup source handed to Benchmark together with the statement; the single
# %s is filled with the ngroups value. It seeds NumPy's RNG and builds a
# two-column DataFrame with ``2 * ngroups`` rows.
# NOTE(review): the grouping key 'value' is drawn from [0, size) while
# 'timestamp' is drawn from [0, ngroups), so the number of groups actually
# formed is presumably not exactly ``ngroups`` -- confirm this is intended.
_setup_template = common_setup + """
np.random.seed(1234)
ngroups = %s
size = ngroups * 2
rng = np.arange(ngroups)
df = DataFrame(dict(
timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
value=np.random.randint(0, size, size=size)
))
"""
# Passed as ``start_date`` to every benchmark built by
# make_large_ngroups_bmark.
START_DATE = datetime(2011, 7, 1)
540+
541+
542+
def make_large_ngroups_bmark(ngroups, func_name, func_args=''):
    """Build a named groupby benchmark for one (ngroups, method) pair.

    Parameters
    ----------
    ngroups : value substituted into the setup template as ``ngroups``
    func_name : name of the groupby method to call in the statement
    func_args : source text placed between the call parentheses
        (default: empty, i.e. a no-argument call)

    Returns
    -------
    The ``Benchmark`` instance, with ``name`` set to
    ``groupby_ngroups_<ngroups>_<func_name>``.
    """
    call_expr = '%s(%s)' % (func_name, func_args)
    benchmark = Benchmark(_stmt_template % call_expr,
                          _setup_template % ngroups,
                          start_date=START_DATE)
    # The name is mandatory: inject_bmark_into_globals rejects a benchmark
    # without one and uses it as the module-level variable name.
    benchmark.name = 'groupby_ngroups_%s_%s' % (ngroups, func_name)
    return benchmark
550+
551+
552+
def inject_bmark_into_globals(bmark):
    """Bind ``bmark`` as a module-level global under its own name.

    Parameters
    ----------
    bmark : object exposing a non-empty ``name`` attribute; the name becomes
        the key in this module's ``globals()``.

    Raises
    ------
    AssertionError
        If ``bmark`` has no ``name`` attribute, or the name is empty or None.
    """
    # Read the name once with a default so that an object lacking the
    # attribute fails with the same AssertionError as an empty name,
    # rather than with an unrelated AttributeError.
    name = getattr(bmark, 'name', None)
    if not name:
        raise AssertionError('benchmark must have a name')
    globals()[name] = bmark
556+
557+
558+
# Generate one benchmark for every (ngroups, method) combination and bind
# each as a module-level global under its generated name.
# The loop variables (ngroups, func_name, bmark) are themselves module-level
# names and stay bound after the loop ends.
# NOTE(review): ``bmark`` therefore remains a second module-level reference
# to the last benchmark created -- confirm whatever collects benchmarks from
# this module tolerates that alias.
for ngroups in ngroups_list:
    for func_name in no_arg_func_list:
        bmark = make_large_ngroups_bmark(ngroups, func_name)
        inject_bmark_into_globals(bmark)

0 commit comments

Comments
 (0)