
PERF: faster grouping #14294

Closed · wants to merge 1 commit
173 changes: 54 additions & 119 deletions asv_bench/benchmarks/gil.py
@@ -22,7 +22,7 @@ def wrapper(fname):
return wrapper


class nogil_groupby_count_2(object):
class nogil_groupby_base(object):
goal_time = 0.2

def setup(self):
@@ -33,6 +33,9 @@ def setup(self):
if (not have_real_test_parallel):
raise NotImplementedError


class nogil_groupby_count_2(nogil_groupby_base):

def time_nogil_groupby_count_2(self):
self.pg2()

@@ -41,16 +44,7 @@ def pg2(self):
self.df.groupby('key')['data'].count()


class nogil_groupby_last_2(object):
goal_time = 0.2

def setup(self):
self.N = 1000000
self.ngroups = 1000
np.random.seed(1234)
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
if (not have_real_test_parallel):
raise NotImplementedError
class nogil_groupby_last_2(nogil_groupby_base):

def time_nogil_groupby_last_2(self):
self.pg2()
@@ -60,16 +54,7 @@ def pg2(self):
self.df.groupby('key')['data'].last()


class nogil_groupby_max_2(object):
goal_time = 0.2

def setup(self):
self.N = 1000000
self.ngroups = 1000
np.random.seed(1234)
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
if (not have_real_test_parallel):
raise NotImplementedError
class nogil_groupby_max_2(nogil_groupby_base):

def time_nogil_groupby_max_2(self):
self.pg2()
@@ -79,16 +64,7 @@ def pg2(self):
self.df.groupby('key')['data'].max()


class nogil_groupby_mean_2(object):
goal_time = 0.2

def setup(self):
self.N = 1000000
self.ngroups = 1000
np.random.seed(1234)
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
if (not have_real_test_parallel):
raise NotImplementedError
class nogil_groupby_mean_2(nogil_groupby_base):

def time_nogil_groupby_mean_2(self):
self.pg2()
@@ -98,16 +74,7 @@ def pg2(self):
self.df.groupby('key')['data'].mean()


class nogil_groupby_min_2(object):
goal_time = 0.2

def setup(self):
self.N = 1000000
self.ngroups = 1000
np.random.seed(1234)
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
if (not have_real_test_parallel):
raise NotImplementedError
class nogil_groupby_min_2(nogil_groupby_base):

def time_nogil_groupby_min_2(self):
self.pg2()
@@ -117,16 +84,7 @@ def pg2(self):
self.df.groupby('key')['data'].min()


class nogil_groupby_prod_2(object):
goal_time = 0.2

def setup(self):
self.N = 1000000
self.ngroups = 1000
np.random.seed(1234)
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
if (not have_real_test_parallel):
raise NotImplementedError
class nogil_groupby_prod_2(nogil_groupby_base):

def time_nogil_groupby_prod_2(self):
self.pg2()
@@ -136,16 +94,7 @@ def pg2(self):
self.df.groupby('key')['data'].prod()


class nogil_groupby_sum_2(object):
goal_time = 0.2

def setup(self):
self.N = 1000000
self.ngroups = 1000
np.random.seed(1234)
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
if (not have_real_test_parallel):
raise NotImplementedError
class nogil_groupby_sum_2(nogil_groupby_base):

def time_nogil_groupby_sum_2(self):
self.pg2()
@@ -155,107 +104,93 @@ def pg2(self):
self.df.groupby('key')['data'].sum()


class nogil_groupby_sum_4(object):
goal_time = 0.2

def setup(self):
self.N = 1000000
self.ngroups = 1000
np.random.seed(1234)
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
if (not have_real_test_parallel):
raise NotImplementedError
class nogil_groupby_sum_4(nogil_groupby_base):

def time_nogil_groupby_sum_4(self):
self.pg4()

def f(self):
self.df.groupby('key')['data'].sum()

def g2(self):
for i in range(2):
self.f()

def g4(self):
for i in range(4):
self.f()

def g8(self):
for i in range(8):
self.f()

@test_parallel(num_threads=2)
def pg2(self):
self.f()

@test_parallel(num_threads=4)
def pg4(self):
self.f()

@test_parallel(num_threads=8)
def pg8(self):
self.f()


class nogil_groupby_sum_8(object):
goal_time = 0.2

def setup(self):
self.N = 1000000
self.ngroups = 1000
np.random.seed(1234)
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
if (not have_real_test_parallel):
raise NotImplementedError
class nogil_groupby_sum_8(nogil_groupby_base):

def time_nogil_groupby_sum_8(self):
self.pg8()

def f(self):
self.df.groupby('key')['data'].sum()

def g2(self):
for i in range(2):
self.f()

def g4(self):
for i in range(4):
self.f()

def g8(self):
for i in range(8):
self.f()

@test_parallel(num_threads=2)
def pg2(self):
self.f()

@test_parallel(num_threads=4)
def pg4(self):
self.f()

@test_parallel(num_threads=8)
def pg8(self):
self.f()


class nogil_groupby_var_2(object):
class nogil_groupby_var_2(nogil_groupby_base):

def time_nogil_groupby_var_2(self):
self.pg2()

@test_parallel(num_threads=2)
def pg2(self):
self.df.groupby('key')['data'].var()


class nogil_groupby_groups(object):
goal_time = 0.2

def setup(self):
self.N = 1000000
self.ngroups = 1000
np.random.seed(1234)
self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
self.size = 2**22
self.ngroups = 100
self.data = Series(np.random.randint(0, self.ngroups, size=self.size))
if (not have_real_test_parallel):
raise NotImplementedError

def time_nogil_groupby_var_2(self):
def f(self):
self.data.groupby(self.data).groups


class nogil_groupby_groups_2(nogil_groupby_groups):

def time_nogil_groupby_groups(self):
self.pg2()

@test_parallel(num_threads=2)
def pg2(self):
self.df.groupby('key')['data'].var()
self.f()


class nogil_groupby_groups_4(nogil_groupby_groups):

def time_nogil_groupby_groups(self):
self.pg4()

@test_parallel(num_threads=4)
def pg4(self):
self.f()


class nogil_groupby_groups_8(nogil_groupby_groups):

def time_nogil_groupby_groups(self):
self.pg8()

@test_parallel(num_threads=8)
def pg8(self):
self.f()


class nogil_take1d_float64(object):
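To make the collapsed hunks above easier to follow, here is a minimal sketch of the consolidation pattern this commit applies in asv_bench/benchmarks/gil.py, assuming the test_parallel import fallback already defined at the top of that file: the shared setup moves into a single nogil_groupby_base class, and each aggregation benchmark subclasses it, keeping only its timed method and parallel wrapper.

import numpy as np
from pandas import DataFrame

try:
    from pandas.util.testing import test_parallel
    have_real_test_parallel = True
except ImportError:
    have_real_test_parallel = False

    def test_parallel(num_threads=1):
        # fallback when pandas lacks the parallel test helper
        def wrapper(fname):
            return fname
        return wrapper


class nogil_groupby_base(object):
    # shared setup: one frame with 1e6 rows spread over 1000 integer keys
    goal_time = 0.2

    def setup(self):
        np.random.seed(1234)
        self.N = 1000000
        self.ngroups = 1000
        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N),
                             'data': np.random.randn(self.N)})
        if not have_real_test_parallel:
            raise NotImplementedError


class nogil_groupby_sum_2(nogil_groupby_base):
    # each concrete benchmark only adds its timed method and thread count

    def time_nogil_groupby_sum_2(self):
        self.pg2()

    @test_parallel(num_threads=2)
    def pg2(self):
        self.df.groupby('key')['data'].sum()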
26 changes: 26 additions & 0 deletions asv_bench/benchmarks/groupby.py
@@ -32,6 +32,32 @@ def time_groupby_apply_dict_return(self):
self.data.groupby(self.labels).apply(self.f)


#----------------------------------------------------------------------
# groups

class groupby_groups(object):
goal_time = 0.1

def setup(self):
size = 2**22
self.data = Series(np.random.randint(0, 100, size=size))
self.data2 = Series(np.random.randint(0, 10000, size=size))
self.data3 = Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size)))
self.data4 = Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size)))

def time_groupby_groups_int64_small(self):
self.data.groupby(self.data).groups

def time_groupby_groups_int64_large(self):
self.data2.groupby(self.data2).groups

def time_groupby_groups_object_small(self):
self.data3.groupby(self.data3).groups

def time_groupby_groups_object_large(self):
self.data4.groupby(self.data4).groups


#----------------------------------------------------------------------
# First / last functions

2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
@@ -1335,6 +1335,7 @@ Other API Changes
- ``Series`` and ``Index`` now support ``divmod`` which will return a tuple of
series or indices. This behaves like a standard binary operator with regards
to broadcasting rules (:issue:`14208`).
- ``.groupby.groups`` will now return a dictionary of ``Index`` objects, rather than a dictionary of ``np.ndarray`` or ``lists`` (:issue:`14293`)

.. _whatsnew_0190.deprecations:
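As a hedged illustration of the ``.groupby.groups`` change noted above (not part of the diff itself), the mapping now yields ``Index`` objects as values; the exact repr depends on the key dtype and pandas version.

from pandas import DataFrame

df = DataFrame({'key': [1, 1, 2], 'data': [0.5, 1.5, 2.5]})
groups = df.groupby('key').groups
# e.g. {1: Int64Index([0, 1], dtype='int64'), 2: Int64Index([2], dtype='int64')}
print(groups)
# values are Index objects rather than np.ndarray or list
print(type(groups[1]))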

@@ -1407,6 +1408,7 @@ Performance Improvements
- Improved performance of hashing ``Period`` (:issue:`12817`)
- Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`)
- Improved performance by lazily creating indexing hashtables on larger Indexes (:issue:`14266`)
- Improved performance of ``groupby.groups`` (:issue:`14293`)


.. _whatsnew_0190.bug_fixes:
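The benchmark added in asv_bench/benchmarks/groupby.py exercises exactly the ``groupby.groups`` path listed in the performance entry above; a quick, hedged way to spot-check it outside asv (timings will vary by machine and pandas build) is:

import timeit
import numpy as np
from pandas import Series

data = Series(np.random.randint(0, 100, size=2 ** 22))
# time the .groups construction a few times; compare before/after this change
print(timeit.timeit(lambda: data.groupby(data).groups, number=3))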