From 9869155b5dbda1d691124c67b522ae23cac45bf3 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 27 Aug 2019 13:11:00 -0700
Subject: [PATCH 1/4] reduction -> libreduction for grepability

---
 pandas/core/apply.py                     |  6 +++---
 pandas/core/groupby/ops.py               | 10 +++++-----
 pandas/tests/groupby/test_bin_groupby.py | 16 ++++++++--------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 5c8599dbb054b..b96b3c7572031 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from pandas._libs import reduction
+from pandas._libs import reduction as libreduction
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.common import (
@@ -221,7 +221,7 @@ def apply_raw(self):
         """ apply to the values as a numpy array """
 
         try:
-            result = reduction.compute_reduction(self.values, self.f, axis=self.axis)
+            result = libreduction.compute_reduction(self.values, self.f, axis=self.axis)
         except Exception:
             result = np.apply_along_axis(self.f, self.axis, self.values)
 
@@ -281,7 +281,7 @@ def apply_standard(self):
             dummy = Series(empty_arr, index=index, dtype=values.dtype)
 
             try:
-                result = reduction.compute_reduction(
+                result = libreduction.compute_reduction(
                     values, self.f, axis=self.axis, dummy=dummy, labels=labels
                 )
                 return self.obj._constructor_sliced(result, index=labels)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index b0c629f017dd3..56ba1901c4137 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -12,7 +12,7 @@
 
 from pandas._libs import NaT, iNaT, lib
 import pandas._libs.groupby as libgroupby
-import pandas._libs.reduction as reduction
+import pandas._libs.reduction as libreduction
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
@@ -207,7 +207,7 @@ def apply(self, f, data, axis=0):
 
                 if len(result_values) == len(group_keys):
                     return group_keys, result_values, mutated
-            except reduction.InvalidApply:
+            except libreduction.InvalidApply:
                 # Cannot fast apply on MultiIndex (_has_complex_internals).
                 # This Exception is also raised if `f` triggers an exception
                 # but it is preferable to raise the exception in Python.
@@ -678,7 +678,7 @@ def _aggregate_series_fast(self, obj, func):
         indexer = get_group_index_sorter(group_index, ngroups)
         obj = obj.take(indexer)
         group_index = algorithms.take_nd(group_index, indexer, allow_fill=False)
-        grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, dummy)
+        grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy)
         result, counts = grouper.get_result()
         return result, counts
 
@@ -852,7 +852,7 @@ def groupings(self):
 
     def agg_series(self, obj, func):
         dummy = obj[:0]
-        grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy)
+        grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy)
         return grouper.get_result()
 
@@ -940,7 +940,7 @@ def fast_apply(self, f, names):
             return [], True
 
         sdata = self._get_sorted_data()
-        return reduction.apply_frame_axis0(sdata, f, names, starts, ends)
+        return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
 
     def _chop(self, sdata, slice_obj):
         if self.axis == 0:
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
index 2195686ee9c7f..b8f9ecd42bae3 100644
--- a/pandas/tests/groupby/test_bin_groupby.py
+++ b/pandas/tests/groupby/test_bin_groupby.py
@@ -2,7 +2,7 @@
 from numpy import nan
 import pytest
 
-from pandas._libs import groupby, lib, reduction
+from pandas._libs import groupby, lib, reduction as libreduction
 
 from pandas.core.dtypes.common import ensure_int64
 
@@ -18,7 +18,7 @@ def test_series_grouper():
 
     labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)
 
-    grouper = reduction.SeriesGrouper(obj, np.mean, labels, 2, dummy)
+    grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2, dummy)
     result, counts = grouper.get_result()
 
     expected = np.array([obj[3:6].mean(), obj[6:].mean()])
@@ -34,7 +34,7 @@ def test_series_bin_grouper():
 
     bins = np.array([3, 6])
 
-    grouper = reduction.SeriesBinGrouper(obj, np.mean, bins, dummy)
+    grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins, dummy)
     result, counts = grouper.get_result()
 
     expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()])
@@ -120,31 +120,31 @@ class TestMoments:
 class TestReducer:
     def test_int_index(self):
         arr = np.random.randn(100, 4)
-        result = reduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4)))
+        result = libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4)))
         expected = arr.sum(0)
         assert_almost_equal(result, expected)
 
-        result = reduction.compute_reduction(
+        result = libreduction.compute_reduction(
             arr, np.sum, axis=1, labels=Index(np.arange(100))
         )
         expected = arr.sum(1)
         assert_almost_equal(result, expected)
 
         dummy = Series(0.0, index=np.arange(100))
-        result = reduction.compute_reduction(
+        result = libreduction.compute_reduction(
             arr, np.sum, dummy=dummy, labels=Index(np.arange(4))
         )
         expected = arr.sum(0)
         assert_almost_equal(result, expected)
 
         dummy = Series(0.0, index=np.arange(4))
-        result = reduction.compute_reduction(
+        result = libreduction.compute_reduction(
             arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
         )
         expected = arr.sum(1)
         assert_almost_equal(result, expected)
 
-        result = reduction.compute_reduction(
+        result = libreduction.compute_reduction(
             arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
         )
         assert_almost_equal(result, expected)
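Note on patch 1: the rename is mechanical, but it is what delivers the grepability named in the subject -- a search for `libreduction.` now matches only these call sites, whereas `reduction` alone also matches docstrings and local variables. As a minimal sketch of the renamed entry point (mirroring the calls in apply.py and test_bin_groupby above; `compute_reduction` is an internal pandas._libs API of this vintage, not a public one):

    import numpy as np
    from pandas._libs import reduction as libreduction

    arr = np.random.randn(100, 4)
    # the same call apply_raw makes after this patch: reduce along axis 1
    result = libreduction.compute_reduction(arr, np.sum, axis=1)
    assert result.shape == (100,)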
From a870082c1f5c57bbbd00cb00012bff767137880b Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 27 Aug 2019 18:13:38 -0700
Subject: [PATCH 2/4] fix passing DataFrame to make_block

---
 pandas/core/groupby/generic.py        | 41 ++++++++++++++++++++++-----
 pandas/tests/groupby/test_function.py |  7 +++--
 2 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 7d6690a0dfa5a..7598debbd52b9 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -30,6 +30,7 @@
     ensure_int64,
     ensure_platform_int,
     is_bool,
+    is_categorical_dtype,
     is_datetimelike,
     is_dict_like,
     is_integer_dtype,
@@ -161,10 +162,15 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1):
                 result, _ = self.grouper.aggregate(
                     block.values, how, axis=agg_axis, min_count=min_count
                 )
-            except NotImplementedError:
+            except NotImplementedError as err:
                 # generally if we have numeric_only=False
                 # and non-applicable functions
                 # try to python agg
+                if "type does not support" in str(err):
+                    # exception raised by NumPy, not pandas
+                    # e.g. "timedelta64 type does not support prod operations"
+                    deleted_items.append(locs)
+                    continue
 
                 if alt is None:
                     # we cannot perform the operation
@@ -182,10 +188,30 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1):
                     # continue and exclude the block
                     deleted_items.append(locs)
                     continue
+
+                if is_categorical_dtype(block.dtype):
+                    # restore e.g. Categorical
+                    # not all dtypes are conserved by agg
+                    result = result.astype(block.dtype)
+
+                assert len(result._data.blocks) == 1
+                result = result._data.blocks[0].values
+
+                # Check that we didn't mess up some corner case
+                # TODO: this isn't a reliable way of doing this
+                grp = obj.loc[s.groups[1]]
+                try:
+                    alt(grp.values, axis=self.axis)
+                except TypeError:
+                    result = no_result
+                    deleted_items.append(locs)
+                    continue
+
             finally:
                 if result is not no_result:
                     # see if we can cast the block back to the original dtype
                     result = maybe_downcast_numeric(result, block.dtype)
+                    assert not isinstance(result, DataFrame)
                     newb = block.make_block(result)
 
             new_items.append(locs)
@@ -242,11 +268,12 @@ def aggregate(self, func, *args, **kwargs):
         # grouper specific aggregations
         if self.grouper.nkeys > 1:
             return self._python_agg_general(func, *args, **kwargs)
+        elif args or kwargs:
+            result = self._aggregate_generic(func, *args, **kwargs)
         else:
 
             # try to treat as if we are passing a list
             try:
-                assert not args and not kwargs
                 result = self._aggregate_multiple_funcs(
                     [func], _level=_level, _axis=self.axis
                 )
@@ -261,7 +288,7 @@ def aggregate(self, func, *args, **kwargs):
                     # to SparseDataFrame, so we do it here.
                     result = SparseDataFrame(result._data)
             except Exception:
-                result = self._aggregate_generic(func, *args, **kwargs)
+                result = self._aggregate_generic(func)
 
         if not self.as_index:
             self._insert_inaxis_grouper_inplace(result)
@@ -311,10 +338,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs):
         cannot_agg = []
         errors = None
         for item in obj:
-            try:
-                data = obj[item]
-                colg = SeriesGroupBy(data, selection=item, grouper=self.grouper)
+            data = obj[item]
+            colg = SeriesGroupBy(data, selection=item, grouper=self.grouper)
 
+            try:
                 cast = self._transform_should_cast(func)
                 result[item] = colg.aggregate(func, *args, **kwargs)
@@ -682,7 +709,7 @@ def _transform_item_by_item(self, obj, wrapper):
 
         return DataFrame(output, index=obj.index, columns=columns)
 
-    def filter(self, func, dropna=True, *args, **kwargs):  # noqa
+    def filter(self, func, dropna=True, *args, **kwargs):
         """
         Return a copy of a DataFrame excluding elements from groups
         that do not satisfy the boolean criterion specified by func.
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index d89233f2fd603..c27af160c9898 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -171,13 +171,12 @@ def test_arg_passthru():
         result = f(numeric_only=False)
         tm.assert_frame_equal(result.reindex_like(expected), expected)
 
-    # TODO: min, max *should* handle
-    # categorical (ordered) dtype
     expected_columns = Index(
         [
             "int",
             "float",
             "string",
+            "category_string",
             "category_int",
             "datetime",
             "datetimetz",
@@ -212,7 +211,9 @@ def test_arg_passthru():
         result = f(numeric_only=False)
         tm.assert_index_equal(result.columns, expected_columns)
 
-    expected_columns = Index(["int", "float", "string", "category_int", "timedelta"])
+    expected_columns = Index(
+        ["int", "float", "string", "category_string", "category_int", "timedelta"]
+    )
     for attr in ["sum"]:
         f = getattr(df.groupby("group"), attr)
         result = f()
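Note on patch 2: the `"type does not support"` string match routes NumPy-level dtype refusals (e.g. `prod` on timedelta64) straight to block exclusion, rather than retrying through the python aggregator, which could only fail again. A runnable sketch of that control flow, with hypothetical helper names (`agg_block_values` and `agg_or_drop` are illustrations, not pandas API):

    import numpy as np


    def agg_block_values(values, how):
        # stand-in for the cython aggregator: refuse timedelta64 prod with
        # the same NotImplementedError message the patch matches on
        if how == "prod" and values.dtype.kind == "m":
            raise NotImplementedError(
                "timedelta64 type does not support prod operations"
            )
        return getattr(values, how)(axis=0)


    def agg_or_drop(values, how):
        try:
            return agg_block_values(values, how)
        except NotImplementedError as err:
            if "type does not support" in str(err):
                # the patch does deleted_items.append(locs); continue --
                # here None just signals "drop this block"
                return None
            raise


    print(agg_or_drop(np.arange(6.0).reshape(3, 2), "prod"))      # [ 0. 15.]
    print(agg_or_drop(np.array([1, 2], dtype="m8[ns]"), "prod"))  # None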
From 9b054208d808b62484354ad2f6ff137cacce6713 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 27 Aug 2019 20:00:02 -0700
Subject: [PATCH 3/4] raise for object dtype

---
 pandas/core/groupby/generic.py | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 7598debbd52b9..a0c60b1e1c8db 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -53,7 +53,6 @@
     GroupBy,
     _apply_docs,
     _transform_template,
-    groupby,
 )
 from pandas.core.index import Index, MultiIndex, _all_indexes_same
 import pandas.core.indexes.base as ibase
@@ -180,7 +179,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1):
 
                 # call our grouper again with only this block
                 obj = self.obj[data.items[locs]]
-                s = groupby(obj, self.grouper)
+                s = obj.groupby(self.grouper)
                 try:
                     result = s.aggregate(lambda x: alt(x, axis=self.axis))
                 except TypeError:
@@ -189,24 +188,19 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1):
                     deleted_items.append(locs)
                     continue
 
+                if is_object_dtype(block.dtype) and how in["prod", "cumprod", "sum"]:
+                    # s.aggregate is not reliable for e.g. `prod` with strings
+                    result = no_result
+                    raise
+
                 if is_categorical_dtype(block.dtype):
-                    # restore e.g. Categorical
-                    # not all dtypes are conserved by agg
+                    # restore Categorical; not all dtypes are conserved by agg
+                    # TODO: will this be right for e.g. sum?
                     result = result.astype(block.dtype)
 
                 assert len(result._data.blocks) == 1
                 result = result._data.blocks[0].values
 
-                # Check that we didn't mess up some corner case
-                # TODO: this isn't a reliable way of doing this
-                grp = obj.loc[s.groups[1]]
-                try:
-                    alt(grp.values, axis=self.axis)
-                except TypeError:
-                    result = no_result
-                    deleted_items.append(locs)
-                    continue
-
             finally:
                 if result is not no_result:
                     # see if we can cast the block back to the original dtype
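Note on patch 3: the object-dtype guard exists because a python-level reduction over strings can "succeed" for the wrong reason -- `sum` concatenates, while `prod` only fails once two strings are actually multiplied. A small plain-NumPy demonstration of the failure mode the comment alludes to:

    import numpy as np

    values = np.array(["a", "b", "c"], dtype=object)
    print(values.sum())  # 'abc' -- reduces fine, but is not a meaningful sum
    try:
        values.prod()  # reduces with *, and str * str is undefined
    except TypeError as err:
        print("prod raised:", err)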
From 3a83d2a51b6b0250faa5e701d9aa13b0696eea6f Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 27 Aug 2019 20:04:45 -0700
Subject: [PATCH 4/4] comment, black

---
 pandas/core/groupby/generic.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index a0c60b1e1c8db..28316fa85c066 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -49,11 +49,7 @@
 from pandas.core.frame import DataFrame
 from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs
 from pandas.core.groupby import base
-from pandas.core.groupby.groupby import (
-    GroupBy,
-    _apply_docs,
-    _transform_template,
-)
+from pandas.core.groupby.groupby import GroupBy, _apply_docs, _transform_template
 from pandas.core.index import Index, MultiIndex, _all_indexes_same
 import pandas.core.indexes.base as ibase
 from pandas.core.internals import BlockManager, make_block
@@ -188,9 +188,12 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1):
                     deleted_items.append(locs)
                     continue
 
-                if is_object_dtype(block.dtype) and how in["prod", "cumprod", "sum"]:
+                if is_object_dtype(block.dtype) and how in ["prod", "cumprod", "sum"]:
                     # s.aggregate is not reliable for e.g. `prod` with strings
                     result = no_result
+                    # TODO: why are we raising here and continuing elsewhere?
+                    # (tests.groupby.test_function.test_arg_passthru breaks
+                    # if we continue here)
                     raise
 
                 if is_categorical_dtype(block.dtype):
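Note: end to end, the user-visible behavior the test_arg_passthru changes in patch 2 pin down is presumably along these lines (a sketch inferred from the removed TODO and the new expected_columns, not taken from the diff): min/max over an ordered categorical column should keep that column in the groupby result instead of silently dropping it.

    import pandas as pd

    df = pd.DataFrame(
        {
            "group": [1, 1, 2],
            "category_string": pd.Categorical(list("abc"), ordered=True),
        }
    )
    # per the updated expected_columns, 'category_string' now survives
    result = df.groupby("group").min(numeric_only=False)
    print(result)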