
CLN: removes cython implementation of groupby count #11013


Merged: 1 commit, merged on Sep 7, 2015
2 changes: 1 addition & 1 deletion pandas/core/frame.py
@@ -4562,7 +4562,7 @@ def _count_level(self, level, axis=0, numeric_only=False):

level_index = count_axis.levels[level]
labels = com._ensure_int64(count_axis.labels[level])
counts = lib.count_level_2d(mask, labels, len(level_index))
counts = lib.count_level_2d(mask, labels, len(level_index), axis=0)

result = DataFrame(counts, index=level_index,
columns=agg_axis)
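
DataFrame._count_level backs df.count(level=...), and this hunk makes it pass axis=0 explicitly now that count_level_2d takes an axis argument. A minimal usage sketch of that code path (the frame, index names, and column names below are hypothetical, not from the PR):

import numpy as np
import pandas as pd

# A small MultiIndex frame; df.count(level=...) goes through
# DataFrame._count_level, which calls lib.count_level_2d(..., axis=0).
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)],
                                names=['key', 'sub'])
df = pd.DataFrame({'x': [1.0, np.nan, 3.0],
                   'y': [np.nan, 2.0, 3.0]}, index=idx)

# Non-null counts per value of the 'key' level, one column per frame column.
df.count(level='key')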
29 changes: 19 additions & 10 deletions pandas/core/groupby.py
@@ -69,7 +69,7 @@
'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
'resample',
'describe',
'rank', 'quantile', 'count',
'rank', 'quantile',
'fillna',
'mad',
'any', 'all',
@@ -149,9 +149,6 @@ def _last(x):
return _last(x)


def _count_compat(x, axis=0):
return x.count() # .size != .count(); count excludes nan

class Grouper(object):
"""
A Grouper allows the user to specify a groupby instruction for a target object
@@ -801,11 +798,6 @@ def size(self):
numeric_only=False, _convert=True)
last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
_convert=True)
_count = _groupby_function('_count', 'count', _count_compat,
numeric_only=False)

def count(self, axis=0):
return self._count().astype('int64')

def ohlc(self):
"""
@@ -1463,7 +1455,6 @@ def get_group_levels(self):
'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
},
'last': 'group_last',
'count': 'group_count',
}

_cython_arity = {
@@ -3468,6 +3459,24 @@ def _apply_to_column_groupbys(self, func):
in self._iterate_column_groupbys()),
keys=self._selected_obj.columns, axis=1)

def count(self):
Contributor:

This needs to take the axis keyword to be backwards compatible. (I see that you changed a test to work around this; please change it back.)

Contributor Author:

Having an axis argument there is a bug: current master simply ignores the axis argument (groupby.py:L807).

Also, when doing a groupby, counting along axis=1 has no meaning because you may get different values for each row regardless of whether the keys are the same or different.

from functools import partial
from pandas.lib import count_level_2d
from pandas.core.common import _isnull_ndarraylike as isnull

data, _ = self._get_data_to_aggregate()
ids, _, ngroups = self.grouper.group_info
mask = ids != -1

val = ((mask & ~isnull(blk.get_values())) for blk in data.blocks)
loc = (blk.mgr_locs for blk in data.blocks)

counter = partial(count_level_2d, labels=ids, max_bin=ngroups, axis=1)
blk = map(make_block, map(counter, val), loc)

return self._wrap_agged_blocks(data.items, list(blk))
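
For reference, a hedged sketch of the behaviour the new DataFrameGroupBy.count is expected to preserve: non-null counts per group and per column, unlike .size(), which counts rows including NaN. It also illustrates the review point above that a groupby count is only meaningful along axis=0. The frame and column names below are hypothetical:

import numpy as np
import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'],
                   'x': [1.0, np.nan, 3.0],
                   'y': [np.nan, 2.0, 3.0]})
g = df.groupby('key')

g.count()  # per-group, per-column count of non-null values
#   group 'a': x -> 1, y -> 1;  group 'b': x -> 1, y -> 1

g.size()   # per-group row count; NaN values are still counted
#   group 'a': 2;  group 'b': 1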


from pandas.tools.plotting import boxplot_frame_groupby
DataFrameGroupBy.boxplot = boxplot_frame_groupby

25 changes: 19 additions & 6 deletions pandas/lib.pyx
@@ -1253,19 +1253,32 @@ def lookup_values(ndarray[object] values, dict mapping):
return maybe_convert_objects(result)


@cython.boundscheck(False)
@cython.wraparound(False)
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
ndarray[int64_t] labels, Py_ssize_t max_bin):
ndarray[int64_t, ndim=1] labels,
Py_ssize_t max_bin,
int axis):
cdef:
Py_ssize_t i, j, k, n
ndarray[int64_t, ndim=2] counts

assert(axis == 0 or axis == 1)
n, k = (<object> mask).shape
counts = np.zeros((max_bin, k), dtype='i8')

for i from 0 <= i < n:
for j from 0 <= j < k:
if mask[i, j]:
counts[labels[i], j] += 1
if axis == 0:
counts = np.zeros((max_bin, k), dtype='i8')
with nogil:
for i from 0 <= i < n:
for j from 0 <= j < k:
counts[labels[i], j] += mask[i, j]

else: # axis == 1
counts = np.zeros((n, max_bin), dtype='i8')
with nogil:
for i from 0 <= i < n:
for j from 0 <= j < k:
counts[i, labels[j]] += mask[i, j]

return counts
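
As a cross-check on the two branches above, here is a pure-NumPy reference of what count_level_2d computes (a sketch under the same semantics, not the pandas implementation; the helper name is made up). np.add.at performs the same unbuffered accumulation as the Cython loops:

import numpy as np

def count_level_2d_ref(mask, labels, max_bin, axis=0):
    """Sum a boolean mask into bins given by ``labels``.

    mask    : (n, k) boolean array, True where a value should be counted
    labels  : int64 bin labels; they index rows for axis=0, columns for axis=1
    max_bin : number of bins
    """
    mask = np.asarray(mask, dtype=np.int64)
    n, k = mask.shape
    if axis == 0:
        # counts[labels[i], j] += mask[i, j]
        counts = np.zeros((max_bin, k), dtype=np.int64)
        np.add.at(counts, labels, mask)
    else:
        # counts[i, labels[j]] += mask[i, j]
        counts = np.zeros((n, max_bin), dtype=np.int64)
        np.add.at(counts.T, labels, mask.T)  # counts.T is a view, so counts is updated
    return counts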

46 changes: 0 additions & 46 deletions pandas/src/generate_code.py
@@ -971,44 +971,6 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,

"""

group_count_template = """@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
ndarray[int64_t] counts,
ndarray[%(c_type)s, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab, ncounts = len(counts)
Py_ssize_t N = values.shape[0], K = values.shape[1]
%(c_type)s val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

if len(values) != len(labels):
raise AssertionError("len(index) != len(labels)")


%(nogil)s
%(tab)sfor i in range(N):
%(tab)s lab = labels[i]
%(tab)s if lab < 0:
%(tab)s continue

%(tab)s counts[lab] += 1
%(tab)s for j in range(K):
%(tab)s val = values[i, j]

%(tab)s # not nan
%(tab)s nobs[lab, j] += val == val and val != iNaT

%(tab)sfor i in range(ncounts):
%(tab)s for j in range(K):
%(tab)s out[i, j] = nobs[i, j]
"""

# add passing bin edges, instead of labels


@@ -1995,8 +1957,6 @@ def generate_from_template(template, exclude=None):
groupby_min_max = [group_min_template,
group_max_template]

groupby_count = [group_count_template]

templates_1d = [map_indices_template,
pad_template,
backfill_template,
@@ -2051,12 +2011,6 @@ def generate_take_cython_file():
print(generate_put_min_max_template(template, use_ints=True),
file=f)

for template in groupby_count:
print(generate_put_selection_template(template, use_ints=True,
use_datelikes=True,
use_objects=True),
file=f)

for template in nobool_1d_templates:
print(generate_from_template(template, exclude=['bool']), file=f)

186 changes: 0 additions & 186 deletions pandas/src/generated.pyx
@@ -7930,192 +7930,6 @@ def group_max_int64(ndarray[int64_t, ndim=2] out,
out[i, j] = maxx[i, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_float64(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[float64_t, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab, ncounts = len(counts)
Py_ssize_t N = values.shape[0], K = values.shape[1]
float64_t val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

if len(values) != len(labels):
raise AssertionError("len(index) != len(labels)")


with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[lab, j] += val == val and val != iNaT

for i in range(ncounts):
for j in range(K):
out[i, j] = nobs[i, j]

@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_float32(ndarray[float32_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[float32_t, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab, ncounts = len(counts)
Py_ssize_t N = values.shape[0], K = values.shape[1]
float32_t val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

if len(values) != len(labels):
raise AssertionError("len(index) != len(labels)")


with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[lab, j] += val == val and val != iNaT

for i in range(ncounts):
for j in range(K):
out[i, j] = nobs[i, j]

@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_int64(ndarray[int64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[int64_t, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab, ncounts = len(counts)
Py_ssize_t N = values.shape[0], K = values.shape[1]
int64_t val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

if len(values) != len(labels):
raise AssertionError("len(index) != len(labels)")


with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[lab, j] += val == val and val != iNaT

for i in range(ncounts):
for j in range(K):
out[i, j] = nobs[i, j]

@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_object(ndarray[object, ndim=2] out,
ndarray[int64_t] counts,
ndarray[object, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab, ncounts = len(counts)
Py_ssize_t N = values.shape[0], K = values.shape[1]
object val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

if len(values) != len(labels):
raise AssertionError("len(index) != len(labels)")



for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[lab, j] += val == val and val != iNaT

for i in range(ncounts):
for j in range(K):
out[i, j] = nobs[i, j]

@cython.boundscheck(False)
@cython.wraparound(False)
def group_count_int64(ndarray[int64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[int64_t, ndim=2] values,
ndarray[int64_t] labels):
'''
Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, lab, ncounts = len(counts)
Py_ssize_t N = values.shape[0], K = values.shape[1]
int64_t val
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
dtype=np.int64)

if len(values) != len(labels):
raise AssertionError("len(index) != len(labels)")


with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

# not nan
nobs[lab, j] += val == val and val != iNaT

for i in range(ncounts):
for j in range(K):
out[i, j] = nobs[i, j]


@cython.wraparound(False)
@cython.boundscheck(False)
def left_join_indexer_unique_float64(ndarray[float64_t] left,