From 82d19dd50755b66c6e7cefd57b0a693e61619a4d Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 24 Sep 2016 09:07:47 -0400
Subject: [PATCH] PERF: faster grouping

remove pandas.core.groupby._groupby_indices to use algos.groupsort_indexer
add Categorical._reverse_indexer to facilitate this

closes #14293
---
 asv_bench/benchmarks/gil.py           | 173 ++++++++-----------------
 asv_bench/benchmarks/groupby.py       |  26 ++++
 doc/source/whatsnew/v0.19.0.txt       |   2 +
 pandas/algos.pyx                      | 132 ++++---------------
 pandas/core/categorical.py            |  40 ++++++
 pandas/core/groupby.py                |  26 +---
 pandas/indexes/base.py                |  23 +++-
 pandas/indexes/numeric.py             |   2 -
 pandas/src/algos_common_helper.pxi    | 175 --------------------------
 pandas/src/algos_common_helper.pxi.in |  30 -----
 pandas/tests/indexes/test_base.py     |   2 +-
 pandas/tests/indexes/test_numeric.py  |  13 +-
 pandas/tests/test_groupby.py          |  13 +-
 pandas/tests/types/test_inference.py  |  15 ++-
 pandas/tseries/base.py                |   4 -
 pandas/types/common.py                |   7 ++
 16 files changed, 201 insertions(+), 482 deletions(-)

diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
index 2eb6786356511..1c82560c7e630 100644
--- a/asv_bench/benchmarks/gil.py
+++ b/asv_bench/benchmarks/gil.py
@@ -22,7 +22,7 @@ def wrapper(fname):
     return wrapper
 
 
-class nogil_groupby_count_2(object):
+class nogil_groupby_base(object):
     goal_time = 0.2
 
     def setup(self):
@@ -33,6 +33,9 @@ def setup(self):
         if (not have_real_test_parallel):
             raise NotImplementedError
 
+
+class nogil_groupby_count_2(nogil_groupby_base):
+
     def time_nogil_groupby_count_2(self):
         self.pg2()
 
@@ -41,16 +44,7 @@ def pg2(self):
         self.df.groupby('key')['data'].count()
 
 
-class nogil_groupby_last_2(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 1000000
-        self.ngroups = 1000
-        np.random.seed(1234)
-        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
-        if (not have_real_test_parallel):
-            raise NotImplementedError
+class nogil_groupby_last_2(nogil_groupby_base):
 
     def time_nogil_groupby_last_2(self):
         self.pg2()
@@ -60,16 +54,7 @@ def pg2(self):
         self.df.groupby('key')['data'].last()
 
 
-class nogil_groupby_max_2(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 1000000
-        self.ngroups = 1000
-        np.random.seed(1234)
-        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
-        if (not have_real_test_parallel):
-            raise NotImplementedError
+class nogil_groupby_max_2(nogil_groupby_base):
 
     def time_nogil_groupby_max_2(self):
         self.pg2()
@@ -79,16 +64,7 @@ def pg2(self):
         self.df.groupby('key')['data'].max()
 
 
-class nogil_groupby_mean_2(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 1000000
-        self.ngroups = 1000
-        np.random.seed(1234)
-        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
-        if (not have_real_test_parallel):
-            raise NotImplementedError
+class nogil_groupby_mean_2(nogil_groupby_base):
 
     def time_nogil_groupby_mean_2(self):
         self.pg2()
@@ -98,16 +74,7 @@ def pg2(self):
         self.df.groupby('key')['data'].mean()
 
 
-class nogil_groupby_min_2(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 1000000
-        self.ngroups = 1000
-        np.random.seed(1234)
-        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
-        if (not have_real_test_parallel):
-            raise NotImplementedError
+class nogil_groupby_min_2(nogil_groupby_base):
 
     def time_nogil_groupby_min_2(self):
         self.pg2()
 
@@ -117,16 +84,7 @@ def pg2(self):
         self.df.groupby('key')['data'].min()
 
 
-class nogil_groupby_prod_2(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 1000000
-        self.ngroups = 1000
-        np.random.seed(1234)
-        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
-        if (not have_real_test_parallel):
-            raise NotImplementedError
+class nogil_groupby_prod_2(nogil_groupby_base):
 
     def time_nogil_groupby_prod_2(self):
         self.pg2()
@@ -136,16 +94,7 @@ def pg2(self):
         self.df.groupby('key')['data'].prod()
 
 
-class nogil_groupby_sum_2(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 1000000
-        self.ngroups = 1000
-        np.random.seed(1234)
-        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
-        if (not have_real_test_parallel):
-            raise NotImplementedError
+class nogil_groupby_sum_2(nogil_groupby_base):
 
     def time_nogil_groupby_sum_2(self):
         self.pg2()
@@ -155,16 +104,7 @@ def pg2(self):
         self.df.groupby('key')['data'].sum()
 
 
-class nogil_groupby_sum_4(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 1000000
-        self.ngroups = 1000
-        np.random.seed(1234)
-        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
-        if (not have_real_test_parallel):
-            raise NotImplementedError
+class nogil_groupby_sum_4(nogil_groupby_base):
 
     def time_nogil_groupby_sum_4(self):
         self.pg4()
@@ -172,41 +112,16 @@ def time_nogil_groupby_sum_4(self):
     def f(self):
         self.df.groupby('key')['data'].sum()
 
-    def g2(self):
-        for i in range(2):
-            self.f()
-
     def g4(self):
         for i in range(4):
             self.f()
 
-    def g8(self):
-        for i in range(8):
-            self.f()
-
-    @test_parallel(num_threads=2)
-    def pg2(self):
-        self.f()
-
     @test_parallel(num_threads=4)
     def pg4(self):
         self.f()
 
-    @test_parallel(num_threads=8)
-    def pg8(self):
-        self.f()
-
 
-class nogil_groupby_sum_8(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 1000000
-        self.ngroups = 1000
-        np.random.seed(1234)
-        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
-        if (not have_real_test_parallel):
-            raise NotImplementedError
+class nogil_groupby_sum_8(nogil_groupby_base):
 
     def time_nogil_groupby_sum_8(self):
         self.pg8()
@@ -214,48 +129,68 @@ def time_nogil_groupby_sum_8(self):
     def f(self):
         self.df.groupby('key')['data'].sum()
 
-    def g2(self):
-        for i in range(2):
-            self.f()
-
-    def g4(self):
-        for i in range(4):
-            self.f()
-
     def g8(self):
         for i in range(8):
             self.f()
 
-    @test_parallel(num_threads=2)
-    def pg2(self):
-        self.f()
-
-    @test_parallel(num_threads=4)
-    def pg4(self):
-        self.f()
-
     @test_parallel(num_threads=8)
     def pg8(self):
         self.f()
 
 
-class nogil_groupby_var_2(object):
+class nogil_groupby_var_2(nogil_groupby_base):
+
+    def time_nogil_groupby_var_2(self):
+        self.pg2()
+
+    @test_parallel(num_threads=2)
+    def pg2(self):
+        self.df.groupby('key')['data'].var()
+
+
+class nogil_groupby_groups(object):
     goal_time = 0.2
 
     def setup(self):
-        self.N = 1000000
-        self.ngroups = 1000
         np.random.seed(1234)
-        self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), })
+        self.size = 2**22
+        self.ngroups = 100
+        self.data = Series(np.random.randint(0, self.ngroups, size=self.size))
         if (not have_real_test_parallel):
             raise NotImplementedError
 
-    def time_nogil_groupby_var_2(self):
+    def f(self):
+        self.data.groupby(self.data).groups
+
+
+class nogil_groupby_groups_2(nogil_groupby_groups):
+
+    def time_nogil_groupby_groups(self):
         self.pg2()
 
     @test_parallel(num_threads=2)
     def pg2(self):
-        self.df.groupby('key')['data'].var()
+        self.f()
+
+
+class nogil_groupby_groups_4(nogil_groupby_groups):
+
+    def time_nogil_groupby_groups(self):
+        self.pg4()
+
+    @test_parallel(num_threads=4)
+    def pg4(self):
+        self.f()
+
+
+class nogil_groupby_groups_8(nogil_groupby_groups):
+
+    def time_nogil_groupby_groups(self):
+        self.pg8()
+
+    @test_parallel(num_threads=8)
+    def pg8(self):
+        self.f()
 
 
 class nogil_take1d_float64(object):
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 0611a3564ff7a..e12b00dd06b39 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -32,6 +32,32 @@ def time_groupby_apply_dict_return(self):
         self.data.groupby(self.labels).apply(self.f)
 
 
+#----------------------------------------------------------------------
+# groups
+
+class groupby_groups(object):
+    goal_time = 0.1
+
+    def setup(self):
+        size = 2**22
+        self.data = Series(np.random.randint(0, 100, size=size))
+        self.data2 = Series(np.random.randint(0, 10000, size=size))
+        self.data3 = Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size)))
+        self.data4 = Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size)))
+
+    def time_groupby_groups_int64_small(self):
+        self.data.groupby(self.data).groups
+
+    def time_groupby_groups_int64_large(self):
+        self.data2.groupby(self.data2).groups
+
+    def time_groupby_groups_object_small(self):
+        self.data3.groupby(self.data3).groups
+
+    def time_groupby_groups_object_large(self):
+        self.data4.groupby(self.data4).groups
+
+
 #----------------------------------------------------------------------
 # First / last functions
 
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 67beb468dce8a..355d12e113398 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -1335,6 +1335,7 @@ Other API Changes
 
 - ``Series`` and ``Index`` now support ``divmod`` which will return a tuple of
   series or indices. This behaves like a standard binary operator with regards
   to broadcasting rules (:issue:`14208`).
+- ``.groupby.groups`` will now return a dictionary of ``Index`` objects, rather than a dictionary of ``np.ndarray`` or ``lists`` (:issue:`14293`)
 
 .. _whatsnew_0190.deprecations:
 
@@ -1407,6 +1408,7 @@ Performance Improvements
 - Improved performance of hashing ``Period`` (:issue:`12817`)
 - Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`)
 - Improved performance by lazily creating indexing hashtables on larger Indexes (:issue:`14266`)
+- Improved performance of ``groupby.groups`` (:issue:`14293`)
 
 .. _whatsnew_0190.bug_fixes:
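Note: to make the ``.groups`` API change above concrete, here is a minimal
sketch (a tiny hand-built Series; the ``Out`` reprs are from a Python 2
build and may differ slightly on Python 3):

    In [1]: s = pd.Series([10, 20, 30], index=['x', 'y', 'z'])

    In [2]: s.groupby(['a', 'b', 'a']).groups
    Out[2]: {'a': Index([u'x', u'z'], dtype='object'),
             'b': Index([u'y'], dtype='object')}

Previously the values of this dict were plain lists of labels, e.g.
``{'a': ['x', 'z'], 'b': ['y']}``.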
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index 8710ef34504d1..04f3ac70bdf5c 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -989,129 +989,47 @@ def is_lexsorted(list list_of_arrays):
 
 
 @cython.boundscheck(False)
-def groupby_indices(dict ids, ndarray[int64_t] labels,
-                    ndarray[int64_t] counts):
-    """
-    turn group_labels output into a combined indexer mapping the labels to
-    indexers
-
-    Parameters
-    ----------
-    ids: dict
-        mapping of label -> group indexer
-    labels: ndarray
-        labels for positions
-    counts: ndarray
-        group counts
-
-    Returns
-    -------
-    list of ndarrays of indices
-
-    """
-    cdef:
-        Py_ssize_t i, n = len(labels)
-        ndarray[int64_t] arr, seen
-        int64_t loc
-        int64_t k
-        dict result = {}
-
-    seen = np.zeros_like(counts)
-
-    cdef int64_t **vecs = <int64_t **> malloc(len(ids) * sizeof(int64_t*))
-    for i from 0 <= i < len(counts):
-        arr = np.empty(counts[i], dtype=np.int64)
-        result[ids[i]] = arr
-        vecs[i] = <int64_t *> arr.data
-
-    for i from 0 <= i < n:
-        k = labels[i]
-
-        # was NaN
-        if k == -1:
-            continue
-
-        loc = seen[k]
-        vecs[k][loc] = i
-        seen[k] = loc + 1
-
-    free(vecs)
-    return result
-
-
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def group_labels(ndarray[object] values):
+def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
     """
-    Compute label vector from input values and associated useful data
+    compute a 1-d indexer that is an ordering of the passed index,
+    ordered by the groups. This is a reverse of the label
+    factorization process.
 
     Parameters
     ----------
-    values: object ndarray
+    index: int64 ndarray
+        mappings from group -> position
+    ngroups: int64
+        number of groups
 
-    Returns
-    -------
-    tuple of (reverse mappings of label -> group indexer,
-              factorized labels ndarray,
-              group counts ndarray)
+    return a tuple of (1-d indexer ordered by groups, group counts)
     """
-    cdef:
-        Py_ssize_t i, n = len(values)
-        ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
-        ndarray[int64_t] counts = np.empty(n, dtype=np.int64)
-        dict ids = {}, reverse = {}
-        int64_t idx
-        object val
-        int64_t count = 0
-
-    for i from 0 <= i < n:
-        val = values[i]
-
-        # is NaN
-        if val != val:
-            labels[i] = -1
-            continue
-
-        # for large number of groups, not doing try: except: makes a big
-        # difference
-        if val in ids:
-            idx = ids[val]
-            labels[i] = idx
-            counts[idx] = counts[idx] + 1
-        else:
-            ids[val] = count
-            reverse[count] = val
-            labels[i] = count
-            counts[count] = 1
-            count += 1
-
-    return reverse, labels, counts[:count].copy()
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
     cdef:
         Py_ssize_t i, loc, label, n
         ndarray[int64_t] counts, where, result
 
-    # count group sizes, location 0 for NA
     counts = np.zeros(ngroups + 1, dtype=np.int64)
     n = len(index)
-    for i from 0 <= i < n:
-        counts[index[i] + 1] += 1
-
-    # mark the start of each contiguous group of like-indexed data
+    result = np.zeros(n, dtype=np.int64)
     where = np.zeros(ngroups + 1, dtype=np.int64)
-    for i from 1 <= i < ngroups + 1:
-        where[i] = where[i - 1] + counts[i - 1]
 
-    # this is our indexer
-    result = np.zeros(n, dtype=np.int64)
-    for i from 0 <= i < n:
-        label = index[i] + 1
-        result[where[label]] = i
-        where[label] += 1
+    with nogil:
+
+        # count group sizes, location 0 for NA
+        for i from 0 <= i < n:
+            counts[index[i] + 1] += 1
+
+        # mark the start of each contiguous group of like-indexed data
+        for i from 1 <= i < ngroups + 1:
+            where[i] = where[i - 1] + counts[i - 1]
+
+        # this is our indexer
+        for i from 0 <= i < n:
+            label = index[i] + 1
+            result[where[label]] = i
+            where[label] += 1
 
     return result, counts
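Note: the rewritten ``groupsort_indexer`` is a counting sort over the label
codes. A pure-NumPy sketch of the same idea (the name ``groupsort_indexer_py``
and its loop are illustrative only; the Cython version above is the real
implementation):

    import numpy as np

    def groupsort_indexer_py(labels, ngroups):
        # slot 0 counts the NA group (label -1); slot k + 1 counts group k
        counts = np.bincount(labels + 1, minlength=ngroups + 1)
        # starting offset of each group's block in the output
        where = np.zeros(ngroups + 1, dtype=np.int64)
        where[1:] = counts.cumsum()[:-1]
        # place each position into its group's block, preserving input order
        result = np.zeros(len(labels), dtype=np.int64)
        for i, label in enumerate(labels + 1):
            result[where[label]] = i
            where[label] += 1
        return result, counts

    labels = np.array([1, 0, 1, -1, 0], dtype=np.int64)
    indexer, counts = groupsort_indexer_py(labels, 2)
    # indexer -> [3, 1, 4, 0, 2]: the NA row first, then group 0, then group 1
    # counts  -> [1, 2, 2]

Because the loops only touch typed int64 buffers, the Cython version can also
release the GIL (the new ``with nogil:`` block), which is what the parallel
``nogil_groupby_groups_*`` benchmarks exercise.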
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 6b37a5e2cd202..db48f2a46eaf3 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -6,6 +6,7 @@
 
 from pandas import compat, lib
 from pandas.compat import u, lzip
+import pandas.algos as _algos
 
 from pandas.types.generic import ABCSeries, ABCIndexClass, ABCCategoricalIndex
 from pandas.types.missing import isnull, notnull
@@ -1699,6 +1700,45 @@ def __setitem__(self, key, value):
         lindexer = self._maybe_coerce_indexer(lindexer)
         self._codes[key] = lindexer
 
+    def _reverse_indexer(self):
+        """
+        Compute the inverse of a categorical, returning
+        a dict of categories -> indexers.
+
+        *This is an internal function*
+
+        Returns
+        -------
+        dict of categories -> indexers
+
+        Examples
+        --------
+        In [1]: c = pd.Categorical(list('aabca'))
+
+        In [2]: c
+        Out[2]:
+        [a, a, b, c, a]
+        Categories (3, object): [a, b, c]
+
+        In [3]: c.categories
+        Out[3]: Index([u'a', u'b', u'c'], dtype='object')
+
+        In [4]: c.codes
+        Out[4]: array([0, 0, 1, 2, 0], dtype=int8)
+
+        In [5]: c._reverse_indexer()
+        Out[5]: {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}
+
+        """
+        categories = self.categories
+        r, counts = _algos.groupsort_indexer(self.codes.astype('int64'),
+                                             categories.size)
+        counts = counts.cumsum()
+        result = [r[counts[indexer]:counts[indexer + 1]]
+                  for indexer in range(len(counts) - 1)]
+        result = dict(zip(categories, result))
+        return result
+
     # reduction ops #
     def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
                 filter_type=None, **kwds):
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 733fae0c34729..3c376e3188eac 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -28,6 +28,7 @@
     _ensure_platform_int,
     _ensure_int64,
     _ensure_object,
+    _ensure_categorical,
     _ensure_float)
 from pandas.types.cast import _possibly_downcast_to_dtype
 from pandas.types.missing import isnull, notnull, _maybe_fill
@@ -1657,7 +1658,7 @@ def groups(self):
         else:
             to_groupby = lzip(*(ping.grouper for ping in self.groupings))
             to_groupby = Index(to_groupby)
-            return self.axis.groupby(to_groupby.values)
+            return self.axis.groupby(to_groupby)
 
     @cache_readonly
     def is_monotonic(self):
@@ -2319,7 +2320,8 @@ def ngroups(self):
 
     @cache_readonly
     def indices(self):
-        return _groupby_indices(self.grouper)
+        values = _ensure_categorical(self.grouper)
+        return values._reverse_indexer()
 
     @property
     def labels(self):
@@ -2342,7 +2344,8 @@ def _make_labels(self):
 
     @cache_readonly
     def groups(self):
-        return self.index.groupby(self.grouper)
+        return self.index.groupby(Categorical.from_codes(self.labels,
+                                                         self.group_index))
 
 
 def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
@@ -4436,23 +4439,6 @@ def _reorder_by_uniques(uniques, labels):
     return uniques, labels
 
 
-def _groupby_indices(values):
-
-    if is_categorical_dtype(values):
-        # we have a categorical, so we can do quite a bit
-        # bit better than factorizing again
-        reverse = dict(enumerate(values.categories))
-        codes = values.codes.astype('int64')
-
-        mask = 0 <= codes
-        counts = np.bincount(codes[mask], minlength=values.categories.size)
-    else:
-        reverse, codes, counts = _algos.group_labels(
-            _values_from_object(_ensure_object(values)))
-
-    return _algos.groupby_indices(reverse, codes, counts)
-
-
 def numpy_groupby(data, labels, axis=0):
     s = np.argsort(labels)
     keys, inv = np.unique(labels, return_inverse=True)
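Note: ``_reverse_indexer`` works by ordering positions by category code and
then slicing that ordering at the cumulative group counts. The same steps in
plain NumPy (a sketch only -- a stable argsort stands in for the Cython
``groupsort_indexer``, whose counts reserve a leading slot for NA codes):

    import numpy as np
    import pandas as pd

    c = pd.Categorical(list('aabca'))
    codes = c.codes.astype('int64')                       # [0, 0, 1, 2, 0]

    order = np.argsort(codes, kind='mergesort')           # stable sort by code
    counts = np.bincount(codes + 1, minlength=len(c.categories) + 1)
    bounds = counts.cumsum()                              # block edges, NA block first

    result = {cat: order[bounds[i]:bounds[i + 1]]
              for i, cat in enumerate(c.categories)}
    # {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}

This is why ``Grouping.indices`` above can simply coerce the grouper to a
``Categorical`` and call ``_reverse_indexer``, replacing the old
``group_labels``/``groupby_indices`` pair.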
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
index f430305f5cb91..5138ca5a6b21e 100644
--- a/pandas/indexes/base.py
+++ b/pandas/indexes/base.py
@@ -17,7 +17,9 @@
 from pandas.types.generic import ABCSeries, ABCMultiIndex, ABCPeriodIndex
 from pandas.types.missing import isnull, array_equivalent
-from pandas.types.common import (_ensure_int64, _ensure_object,
+from pandas.types.common import (_ensure_int64,
+                                 _ensure_object,
+                                 _ensure_categorical,
                                  _ensure_platform_int,
                                  is_integer,
                                  is_float,
@@ -111,7 +113,6 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject):
     _join_precedence = 1
 
     # Cython methods
-    _groupby = _algos.groupby_object
     _arrmap = _algos.arrmap_object
     _left_indexer_unique = _join.left_join_indexer_unique_object
     _left_indexer = _join.left_join_indexer_object
@@ -2352,13 +2353,13 @@ def _possibly_promote(self, other):
             return self.astype('object'), other.astype('object')
         return self, other
 
-    def groupby(self, to_groupby):
+    def groupby(self, values):
         """
         Group the index labels by a given array of values.
 
         Parameters
         ----------
-        to_groupby : array
+        values : array
             Values used to determine the groups.
 
         Returns
@@ -2366,7 +2367,19 @@ def groupby(self, to_groupby):
         -------
         groups : dict
             {group name -> group labels}
         """
-        return self._groupby(self.values, _values_from_object(to_groupby))
+
+        # TODO: if we are a MultiIndex, we can do better
+        # than converting to tuples
+        from .multi import MultiIndex
+        if isinstance(values, MultiIndex):
+            values = values.values
+        values = _ensure_categorical(values)
+        result = values._reverse_indexer()
+
+        # map to the label
+        result = {k: self.take(v) for k, v in compat.iteritems(result)}
+
+        return result
 
     def map(self, mapper):
         """
diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py
index b9625f3aaff92..97f7093e99064 100644
--- a/pandas/indexes/numeric.py
+++ b/pandas/indexes/numeric.py
@@ -113,7 +113,6 @@ class Int64Index(NumericIndex):
     """
 
     _typ = 'int64index'
-    _groupby = _algos.groupby_int64
     _arrmap = _algos.arrmap_int64
     _left_indexer_unique = _join.left_join_indexer_unique_int64
     _left_indexer = _join.left_join_indexer_int64
@@ -200,7 +199,6 @@ class Float64Index(NumericIndex):
     _typ = 'float64index'
     _engine_type = _index.Float64Engine
 
-    _groupby = _algos.groupby_float64
    _arrmap = _algos.arrmap_float64
     _left_indexer_unique = _join.left_join_indexer_unique_float64
     _left_indexer = _join.left_join_indexer_float64
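Note: with the per-dtype ``_groupby`` class attributes removed, every index
type now goes through the single categorical-based ``Index.groupby`` above. A
small sketch of the resulting behavior (reprs abbreviated, Python 2 build):

    In [1]: idx = pd.Index(['w', 'x', 'y', 'z'])

    In [2]: idx.groupby(np.array([1, 2, 1, 2]))
    Out[2]: {1: Index([u'w', u'y'], dtype='object'),
             2: Index([u'x', u'z'], dtype='object')}

The keys are the (categorized) grouping values, and each value is
``self.take`` of the positions in that group, hence an ``Index`` rather than
a list.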
diff --git a/pandas/src/algos_common_helper.pxi b/pandas/src/algos_common_helper.pxi
index be587fbc9a019..9dede87e0c15b 100644
--- a/pandas/src/algos_common_helper.pxi
+++ b/pandas/src/algos_common_helper.pxi
@@ -10,7 +10,6 @@ Template for each `dtype` helper function using 1-d template
 - backfill_1d
 - backfill_2d
 - is_monotonic
-- groupby
 - arrmap
 
 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
@@ -391,35 +390,6 @@ def is_monotonic_float64(ndarray[float64_t] arr, bint timelike):
            is_unique and (is_monotonic_inc or is_monotonic_dec)
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_float64(ndarray[float64_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def arrmap_float64(ndarray[float64_t] index, object func):
@@ -806,35 +776,6 @@ def is_monotonic_float32(ndarray[float32_t] arr, bint timelike):
            is_unique and (is_monotonic_inc or is_monotonic_dec)
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_float32(ndarray[float32_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def arrmap_float32(ndarray[float32_t] index, object func):
@@ -1221,35 +1162,6 @@ def is_monotonic_object(ndarray[object] arr, bint timelike):
            is_unique and (is_monotonic_inc or is_monotonic_dec)
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_object(ndarray[object] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def arrmap_object(ndarray[object] index, object func):
@@ -1636,35 +1548,6 @@ def is_monotonic_int32(ndarray[int32_t] arr, bint timelike):
            is_unique and (is_monotonic_inc or is_monotonic_dec)
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_int32(ndarray[int32_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def arrmap_int32(ndarray[int32_t] index, object func):
@@ -2051,35 +1934,6 @@ def is_monotonic_int64(ndarray[int64_t] arr, bint timelike):
            is_unique and (is_monotonic_inc or is_monotonic_dec)
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_int64(ndarray[int64_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def arrmap_int64(ndarray[int64_t] index, object func):
@@ -2466,35 +2320,6 @@ def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike):
            is_unique and (is_monotonic_inc or is_monotonic_dec)
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_bool(ndarray[uint8_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def arrmap_bool(ndarray[uint8_t] index, object func):
diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in
index cec5712c0b7f4..c52c734f727e9 100644
--- a/pandas/src/algos_common_helper.pxi.in
+++ b/pandas/src/algos_common_helper.pxi.in
@@ -10,7 +10,6 @@ Template for each `dtype` helper function using 1-d template
 - backfill_1d
 - backfill_2d
 - is_monotonic
-- groupby
 - arrmap
 
 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
@@ -413,35 +412,6 @@ def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike):
            is_unique and (is_monotonic_inc or is_monotonic_dec)
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_{{name}}(ndarray[{{c_type}}] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def arrmap_{{name}}(ndarray[{{c_type}}] index, object func):
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 7f68318d4d7d3..421174ded57d5 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -1541,7 +1541,7 @@ def get_reindex_type(target):
     def test_groupby(self):
         idx = Index(range(5))
         groups = idx.groupby(np.array([1, 1, 2, 2, 2]))
-        exp = {1: [0, 1], 2: [2, 3, 4]}
+        exp = {1: pd.Index([0, 1]), 2: pd.Index([2, 3, 4])}
         tm.assert_dict_equal(groups, exp)
 
     def test_equals_op_multiindex(self):
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index b04d7f128e133..b362c9716b672 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -144,8 +144,8 @@ def test_index_groupby(self):
         for idx in [int_idx, float_idx, obj_idx, dt_idx]:
             to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1])
-            self.assertEqual(idx.groupby(to_groupby),
-                             {1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]})
+            tm.assert_dict_equal(idx.groupby(to_groupby),
+                                 {1.0: idx[[0, 5]], 2.0: idx[[1, 4]]})
 
             to_groupby = Index([datetime(2011, 11, 1),
                                 datetime(2011, 12, 1),
@@ -155,11 +155,10 @@ def test_index_groupby(self):
                                 datetime(2011, 11, 1)],
                                tz='UTC').values
 
-            ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp(
-                '2011-11-01'), Timestamp('2011-12-01')]))
-            expected = {ex_keys[0][0]: [idx[0], idx[5]],
-                        ex_keys[0][1]: [idx[1], idx[4]]}
-            self.assertEqual(idx.groupby(to_groupby), expected)
+            ex_keys = [Timestamp('2011-11-01'), Timestamp('2011-12-01')]
+            expected = {ex_keys[0]: idx[[0, 5]],
+                        ex_keys[1]: idx[[1, 4]]}
+            tm.assert_dict_equal(idx.groupby(to_groupby), expected)
 
     def test_modulo(self):
         # GH 9244
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index a21295e1a9823..01c1d48c6d5c0 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -521,13 +521,6 @@ def test_groupby_dict_mapping(self):
         assert_series_equal(result, result2)
         assert_series_equal(result, expected2)
 
-    def test_groupby_bounds_check(self):
-        # groupby_X is code-generated, so if one variant
-        # does, the rest probably do to
-        a = np.array([1, 2], dtype='object')
-        b = np.array([1, 2, 3], dtype='object')
-        self.assertRaises(AssertionError, pd.algos.groupby_object, a, b)
-
     def test_groupby_grouper_f_sanity_checked(self):
         dates = date_range('01-Jan-2013', periods=12, freq='MS')
         ts = Series(np.random.randn(12), index=dates)
@@ -3478,13 +3471,13 @@ def test_groupby_nat_exclude(self):
                            'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
         grouped = df.groupby('dt')
 
-        expected = [[1, 7], [3, 5]]
+        expected = [pd.Index([1, 7]), pd.Index([3, 5])]
         keys = sorted(grouped.groups.keys())
         self.assertEqual(len(keys), 2)
         for k, e in zip(keys, expected):
             # grouped.groups keys are np.datetime64 with system tz
             # not to be affected by tz, only compare values
-            self.assertEqual(grouped.groups[k], e)
+            tm.assert_index_equal(grouped.groups[k], e)
 
         # confirm obj is not filtered
         tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
@@ -4447,7 +4440,7 @@ def test_multiindex_columns_empty_level(self):
 
         expected = df.groupby('to filter').groups
         result = df.groupby([('to filter', '')]).groups
-        self.assertEqual(result, expected)
+        tm.assert_dict_equal(result, expected)
 
     def test_cython_median(self):
         df = DataFrame(np.random.randn(1000))
diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py
index 9a12220f5b41d..a63ae5f7cf74e 100644
--- a/pandas/tests/types/test_inference.py
+++ b/pandas/tests/types/test_inference.py
@@ -16,7 +16,7 @@
 from pandas import lib, tslib
 from pandas import (Series, Index, DataFrame, Timedelta,
                     DatetimeIndex, TimedeltaIndex, Timestamp,
-                    Panel, Period)
+                    Panel, Period, Categorical)
 from pandas.compat import u, PY2, lrange
 from pandas.types import inference
 from pandas.types.common import (is_timedelta64_dtype,
@@ -26,7 +26,8 @@
     is_float,
     is_bool,
     is_scalar,
-    _ensure_int32)
+    _ensure_int32,
+    _ensure_categorical)
 from pandas.types.missing import isnull
 from pandas.util import testing as tm
 
@@ -842,6 +843,16 @@ def test_ensure_int32():
     assert (result.dtype == np.int32)
 
 
+def test_ensure_categorical():
+    values = np.arange(10, dtype=np.int32)
+    result = _ensure_categorical(values)
+    assert (result.dtype == 'category')
+
+    values = Categorical(values)
+    result = _ensure_categorical(values)
+    tm.assert_categorical_equal(result, values)
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index 3b676b894d355..96213a4aec34d 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -226,10 +226,6 @@ def _box_values(self, values):
         """
         return lib.map_infer(values, self._box_func)
 
-    def groupby(self, f):
-        objs = self.asobject.values
-        return _algos.groupby_object(objs, f)
-
     def _format_with_header(self, header, **kwargs):
         return header + list(self._format_native_types(**kwargs))
 
diff --git a/pandas/types/common.py b/pandas/types/common.py
index 2e7a67112e6db..e0e4501738745 100644
--- a/pandas/types/common.py
+++ b/pandas/types/common.py
@@ -42,6 +42,13 @@ def _ensure_float(arr):
 _ensure_object = algos.ensure_object
 
 
+def _ensure_categorical(arr):
+    if not is_categorical(arr):
+        from pandas import Categorical
+        arr = Categorical(arr)
+    return arr
+
+
 def is_object_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return issubclass(tipo, np.object_)
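Note: ``_ensure_categorical`` is the small piece of glue that lets the
non-categorical code paths reuse ``Categorical._reverse_indexer``. A usage
sketch against the 0.19-era internal layout (these are private APIs):

    import numpy as np
    from pandas.types.common import _ensure_categorical

    arr = np.array([3, 1, 3, 2])
    cat = _ensure_categorical(arr)    # ndarray -> Categorical
    cat._reverse_indexer()            # {1: array([1]), 2: array([3]), 3: array([0, 2])}

    cat2 = _ensure_categorical(cat)   # already categorical: passed through
    assert cat2 is cat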