diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cb237b93c70ba..e81aaebe77807 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4562,7 +4562,7 @@ def _count_level(self, level, axis=0, numeric_only=False): level_index = count_axis.levels[level] labels = com._ensure_int64(count_axis.labels[level]) - counts = lib.count_level_2d(mask, labels, len(level_index)) + counts = lib.count_level_2d(mask, labels, len(level_index), axis=0) result = DataFrame(counts, index=level_index, columns=agg_axis) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 43110494d675b..1f5855e63dee8 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -69,7 +69,7 @@ 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount', 'resample', 'describe', - 'rank', 'quantile', 'count', + 'rank', 'quantile', 'fillna', 'mad', 'any', 'all', @@ -149,9 +149,6 @@ def _last(x): return _last(x) -def _count_compat(x, axis=0): - return x.count() # .size != .count(); count excludes nan - class Grouper(object): """ A Grouper allows the user to specify a groupby instruction for a target object @@ -801,11 +798,6 @@ def size(self): numeric_only=False, _convert=True) last = _groupby_function('last', 'last', _last_compat, numeric_only=False, _convert=True) - _count = _groupby_function('_count', 'count', _count_compat, - numeric_only=False) - - def count(self, axis=0): - return self._count().astype('int64') def ohlc(self): """ @@ -1463,7 +1455,6 @@ def get_group_levels(self): 'f': lambda func, a, b, c, d: func(a, b, c, d, 1) }, 'last': 'group_last', - 'count': 'group_count', } _cython_arity = { @@ -3468,6 +3459,24 @@ def _apply_to_column_groupbys(self, func): in self._iterate_column_groupbys()), keys=self._selected_obj.columns, axis=1) + def count(self): + from functools import partial + from pandas.lib import count_level_2d + from pandas.core.common import _isnull_ndarraylike as isnull + + data, _ = self._get_data_to_aggregate() + ids, _, ngroups = 
self.grouper.group_info + mask = ids != -1 + + val = ((mask & ~isnull(blk.get_values())) for blk in data.blocks) + loc = (blk.mgr_locs for blk in data.blocks) + + counter = partial(count_level_2d, labels=ids, max_bin=ngroups, axis=1) + blk = map(make_block, map(counter, val), loc) + + return self._wrap_agged_blocks(data.items, list(blk)) + + from pandas.tools.plotting import boxplot_frame_groupby DataFrameGroupBy.boxplot = boxplot_frame_groupby diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 7b2d849695c98..2b4974155d44c 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1253,19 +1253,32 @@ def lookup_values(ndarray[object] values, dict mapping): return maybe_convert_objects(result) +@cython.boundscheck(False) +@cython.wraparound(False) def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, - ndarray[int64_t] labels, Py_ssize_t max_bin): + ndarray[int64_t, ndim=1] labels, + Py_ssize_t max_bin, + int axis): cdef: Py_ssize_t i, j, k, n ndarray[int64_t, ndim=2] counts + assert(axis == 0 or axis == 1) n, k = (<object> mask).shape - counts = np.zeros((max_bin, k), dtype='i8') - for i from 0 <= i < n: - for j from 0 <= j < k: - if mask[i, j]: - counts[labels[i], j] += 1 + if axis == 0: + counts = np.zeros((max_bin, k), dtype='i8') + with nogil: + for i from 0 <= i < n: + for j from 0 <= j < k: + if mask[i, j]: counts[labels[i], j] += 1 + + else: # axis == 1 + counts = np.zeros((n, max_bin), dtype='i8') + with nogil: + for i from 0 <= i < n: + for j from 0 <= j < k: + if mask[i, j]: counts[i, labels[j]] += 1 return counts diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index c086919d94644..b055d75df4cf4 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -971,44 +971,6 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, """ -group_count_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(c_type)s, ndim=2] 
values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, lab, ncounts = len(counts) - Py_ssize_t N = values.shape[0], K = values.shape[1] - %(c_type)s val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - - %(nogil)s - %(tab)sfor i in range(N): - %(tab)s lab = labels[i] - %(tab)s if lab < 0: - %(tab)s continue - - %(tab)s counts[lab] += 1 - %(tab)s for j in range(K): - %(tab)s val = values[i, j] - - %(tab)s # not nan - %(tab)s nobs[lab, j] += val == val and val != iNaT - - %(tab)sfor i in range(ncounts): - %(tab)s for j in range(K): - %(tab)s out[i, j] = nobs[i, j] -""" - # add passing bin edges, instead of labels @@ -1995,8 +1957,6 @@ def generate_from_template(template, exclude=None): groupby_min_max = [group_min_template, group_max_template] -groupby_count = [group_count_template] - templates_1d = [map_indices_template, pad_template, backfill_template, @@ -2051,12 +2011,6 @@ def generate_take_cython_file(): print(generate_put_min_max_template(template, use_ints=True), file=f) - for template in groupby_count: - print(generate_put_selection_template(template, use_ints=True, - use_datelikes=True, - use_objects=True), - file=f) - for template in nobool_1d_templates: print(generate_from_template(template, exclude=['bool']), file=f) diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index c0ecd04749e58..2f2fd528999d6 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -7930,192 +7930,6 @@ def group_max_int64(ndarray[int64_t, ndim=2] out, out[i, j] = maxx[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, lab, ncounts = len(counts) - 
Py_ssize_t N = values.shape[0], K = values.shape[1] - float64_t val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[lab, j] += val == val and val != iNaT - - for i in range(ncounts): - for j in range(K): - out[i, j] = nobs[i, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, lab, ncounts = len(counts) - Py_ssize_t N = values.shape[0], K = values.shape[1] - float32_t val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[lab, j] += val == val and val != iNaT - - for i in range(ncounts): - for j in range(K): - out[i, j] = nobs[i, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, lab, ncounts = len(counts) - Py_ssize_t N = values.shape[0], K = values.shape[1] - int64_t val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - 
for j in range(K): - val = values[i, j] - - # not nan - nobs[lab, j] += val == val and val != iNaT - - for i in range(ncounts): - for j in range(K): - out[i, j] = nobs[i, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, lab, ncounts = len(counts) - Py_ssize_t N = values.shape[0], K = values.shape[1] - object val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[lab, j] += val == val and val != iNaT - - for i in range(ncounts): - for j in range(K): - out[i, j] = nobs[i, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_count_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, lab, ncounts = len(counts) - Py_ssize_t N = values.shape[0], K = values.shape[1] - int64_t val - ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), - dtype=np.int64) - - if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - nobs[lab, j] += val == val and val != iNaT - - for i in range(ncounts): - for j in range(K): - out[i, j] = nobs[i, j] - - @cython.wraparound(False) @cython.boundscheck(False) def left_join_indexer_unique_float64(ndarray[float64_t] left, diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 
f5693983f1cc1..a85e68602493b 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2481,6 +2481,30 @@ def test_size(self): self.assertEqual(result[key], len(group)) def test_count(self): + from string import ascii_lowercase + n = 1 << 15 + dr = date_range('2015-08-30', periods=n // 10, freq='T') + + df = DataFrame({ + '1st':np.random.choice(list(ascii_lowercase), n), + '2nd':np.random.randint(0, 5, n), + '3rd':np.random.randn(n).round(3), + '4th':np.random.randint(-10, 10, n), + '5th':np.random.choice(dr, n), + '6th':np.random.randn(n).round(3), + '7th':np.random.randn(n).round(3), + '8th':np.random.choice(dr, n) - np.random.choice(dr, 1), + '9th':np.random.choice(list(ascii_lowercase), n)}) + + for col in df.columns.drop(['1st', '2nd', '4th']): + df.loc[np.random.choice(n, n // 10), col] = np.nan + + df['9th'] = df['9th'].astype('category') + + for key in '1st', '2nd', ['1st', '2nd']: + left = df.groupby(key).count() + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + assert_frame_equal(left, right) # GH5610 # count counts non-nulls @@ -4966,7 +4990,7 @@ def test_groupby_whitelist(self): 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount', 'resample', 'describe', - 'rank', 'quantile', 'count', + 'rank', 'quantile', 'fillna', 'mad', 'any', 'all', @@ -4987,7 +5011,7 @@ def test_groupby_whitelist(self): 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount', 'resample', 'describe', - 'rank', 'quantile', 'count', + 'rank', 'quantile', 'fillna', 'mad', 'any', 'all', @@ -5253,7 +5277,6 @@ def test__cython_agg_general(self): ('max', np.max), ('first', lambda x: x.iloc[0]), ('last', lambda x: x.iloc[-1]), - ('count', np.size), ] df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) @@ -5439,26 +5462,26 @@ def test_first_last_max_min_on_time_data(self): def test_groupby_preserves_sort(self): # Test to ensure that groupby always preserves sort order of original # object. 
Issue #8588 and #9651 - - df = DataFrame({'int_groups':[3,1,0,1,0,3,3,3], - 'string_groups':['z','a','z','a','a','g','g','g'], + + df = DataFrame({'int_groups':[3,1,0,1,0,3,3,3], + 'string_groups':['z','a','z','a','a','g','g','g'], 'ints':[8,7,4,5,2,9,1,1], 'floats':[2.3,5.3,6.2,-2.4,2.2,1.1,1.1,5], 'strings':['z','d','a','e','word','word2','42','47']}) # Try sorting on different types and with different group types - for sort_column in ['ints', 'floats', 'strings', ['ints','floats'], + for sort_column in ['ints', 'floats', 'strings', ['ints','floats'], ['ints','strings']]: - for group_column in ['int_groups', 'string_groups', + for group_column in ['int_groups', 'string_groups', ['int_groups','string_groups']]: df = df.sort_values(by=sort_column) g = df.groupby(group_column) - + def test_sort(x): assert_frame_equal(x, x.sort_values(by=sort_column)) - + g.apply(test_sort) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 0f55f79b8b9b9..df61387734cb3 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -829,7 +829,7 @@ def _check_counts(frame, axis=0): index = frame._get_axis(axis) for i in range(index.nlevels): result = frame.count(axis=axis, level=i) - expected = frame.groupby(axis=axis, level=i).count(axis=axis) + expected = frame.groupby(axis=axis, level=i).count() expected = expected.reindex_like(result).astype('i8') assert_frame_equal(result, expected)