From 768f93cc925071621e4bdee3ea5e10c52dfd91a7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 18 Feb 2020 12:52:45 -0800 Subject: [PATCH] REF: remove SeriesBinGrouper, SeriesGrouper --- pandas/_libs/reduction.pyx | 247 ----------------------- pandas/core/groupby/groupby.py | 11 +- pandas/core/groupby/ops.py | 50 +---- pandas/tests/groupby/test_bin_groupby.py | 42 ---- 4 files changed, 3 insertions(+), 347 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index b27072aa66708..382a5c6692b55 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -147,253 +147,6 @@ cdef class Reducer: return result -cdef class _BaseGrouper: - cdef _check_dummy(self, dummy): - # both values and index must be an ndarray! - - values = dummy.values - # GH 23683: datetimetz types are equivalent to datetime types here - if (dummy.dtype != self.arr.dtype - and values.dtype != self.arr.dtype): - raise ValueError('Dummy array must be same dtype') - if util.is_array(values) and not values.flags.contiguous: - # e.g. Categorical has no `flags` attribute - values = values.copy() - index = dummy.index.values - if not index.flags.contiguous: - index = index.copy() - - return values, index - - cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp, - Slider islider, Slider vslider): - if cached_typ is None: - cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, name=self.name) - else: - # See the comment in indexes/base.py about _index_data. - # We need this for EA-backed indexes that have a reference - # to a 1-d ndarray like datetime / timedelta / period. - object.__setattr__(cached_ityp, '_index_data', islider.buf) - cached_ityp._engine.clear_mapping() - object.__setattr__(cached_typ._data._block, 'values', vslider.buf) - object.__setattr__(cached_typ, '_index', cached_ityp) - object.__setattr__(cached_typ, 'name', self.name) - - return cached_typ, cached_ityp - - cdef inline object _apply_to_group(self, - object cached_typ, object cached_ityp, - Slider islider, Slider vslider, - Py_ssize_t group_size, bint initialized): - """ - Call self.f on our new group, then update to the next group. - """ - cached_ityp._engine.clear_mapping() - res = self.f(cached_typ) - res = _extract_result(res) - if not initialized: - # On the first pass, we check the output shape to see - # if this looks like a reduction. - initialized = 1 - _check_result_array(res, len(self.dummy_arr)) - - islider.advance(group_size) - vslider.advance(group_size) - - return res, initialized - - -cdef class SeriesBinGrouper(_BaseGrouper): - """ - Performs grouping operation according to bin edges, rather than labels - """ - cdef: - Py_ssize_t nresults, ngroups - - cdef public: - ndarray arr, index, dummy_arr, dummy_index - object values, f, bins, typ, ityp, name - - def __init__(self, object series, object f, object bins, object dummy): - - assert dummy is not None # always obj[:0] - assert len(bins) > 0 # otherwise we get IndexError in get_result - - self.bins = bins - self.f = f - - values = series.values - if util.is_array(values) and not values.flags.c_contiguous: - # e.g. Categorical has no `flags` attribute - values = values.copy('C') - self.arr = values - self.typ = series._constructor - self.ityp = series.index._constructor - self.index = series.index.values - self.name = series.name - - self.dummy_arr, self.dummy_index = self._check_dummy(dummy) - - # kludge for #1688 - if len(bins) > 0 and bins[-1] == len(series): - self.ngroups = len(bins) - else: - self.ngroups = len(bins) + 1 - - def get_result(self): - cdef: - ndarray arr, result - ndarray[int64_t] counts - Py_ssize_t i, n, group_size - object res - bint initialized = 0 - Slider vslider, islider - object cached_typ = None, cached_ityp = None - - counts = np.zeros(self.ngroups, dtype=np.int64) - - if self.ngroups > 0: - counts[0] = self.bins[0] - for i in range(1, self.ngroups): - if i == self.ngroups - 1: - counts[i] = len(self.arr) - self.bins[i - 1] - else: - counts[i] = self.bins[i] - self.bins[i - 1] - - group_size = 0 - n = len(self.arr) - - vslider = Slider(self.arr, self.dummy_arr) - islider = Slider(self.index, self.dummy_index) - - result = np.empty(self.ngroups, dtype='O') - - try: - for i in range(self.ngroups): - group_size = counts[i] - - islider.set_length(group_size) - vslider.set_length(group_size) - - cached_typ, cached_ityp = self._update_cached_objs( - cached_typ, cached_ityp, islider, vslider) - - res, initialized = self._apply_to_group(cached_typ, cached_ityp, - islider, vslider, - group_size, initialized) - - result[i] = res - - finally: - # so we don't free the wrong memory - islider.reset() - vslider.reset() - - result = maybe_convert_objects(result) - return result, counts - - -cdef class SeriesGrouper(_BaseGrouper): - """ - Performs generic grouping operation while avoiding ndarray construction - overhead - """ - cdef: - Py_ssize_t nresults, ngroups - - cdef public: - ndarray arr, index, dummy_arr, dummy_index - object f, labels, values, typ, ityp, name - - def __init__(self, object series, object f, object labels, - Py_ssize_t ngroups, object dummy): - - # in practice we always pass obj.iloc[:0] or equivalent - assert dummy is not None - - if len(series) == 0: - # get_result would never assign `result` - raise ValueError("SeriesGrouper requires non-empty `series`") - - self.labels = labels - self.f = f - - values = series.values - if util.is_array(values) and not values.flags.c_contiguous: - # e.g. Categorical has no `flags` attribute - values = values.copy('C') - self.arr = values - self.typ = series._constructor - self.ityp = series.index._constructor - self.index = series.index.values - self.name = series.name - - self.dummy_arr, self.dummy_index = self._check_dummy(dummy) - self.ngroups = ngroups - - def get_result(self): - cdef: - # Define result to avoid UnboundLocalError - ndarray arr, result = None - ndarray[int64_t] labels, counts - Py_ssize_t i, n, group_size, lab - object res - bint initialized = 0 - Slider vslider, islider - object cached_typ = None, cached_ityp = None - - labels = self.labels - counts = np.zeros(self.ngroups, dtype=np.int64) - group_size = 0 - n = len(self.arr) - - vslider = Slider(self.arr, self.dummy_arr) - islider = Slider(self.index, self.dummy_index) - - result = np.empty(self.ngroups, dtype='O') - - try: - for i in range(n): - group_size += 1 - - lab = labels[i] - - if i == n - 1 or lab != labels[i + 1]: - if lab == -1: - islider.advance(group_size) - vslider.advance(group_size) - group_size = 0 - continue - - islider.set_length(group_size) - vslider.set_length(group_size) - - cached_typ, cached_ityp = self._update_cached_objs( - cached_typ, cached_ityp, islider, vslider) - - res, initialized = self._apply_to_group(cached_typ, cached_ityp, - islider, vslider, - group_size, initialized) - - result[lab] = res - counts[lab] = group_size - group_size = 0 - - finally: - # so we don't free the wrong memory - islider.reset() - vslider.reset() - - # We check for empty series in the constructor, so should always - # have result initialized by this point. - assert initialized, "`result` has not been initialized." - - result = maybe_convert_objects(result) - - return result, counts - - cdef inline _extract_result(object res, bint squeeze=True): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cc46485b4a2e8..f946f0e63a583 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -923,17 +923,10 @@ def _python_agg_general(self, func, *args, **kwargs): try: # if this function is invalid for this dtype, we will ignore it. - func(obj[:0]) + result, counts = self.grouper.agg_series(obj, f) except TypeError: continue - except AssertionError: - raise - except Exception: - # Our function depends on having a non-empty argument - # See test_groupby_agg_err_catching - pass - - result, counts = self.grouper.agg_series(obj, f) + assert result is not None key = base.OutputKey(label=name, position=idx) output[key] = self._try_cast(result, obj, numeric_only=True) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7259268ac3f2b..2b89a51eb635c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -621,50 +621,8 @@ def agg_series(self, obj: Series, func): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 - if len(obj) == 0: - # SeriesGrouper would raise if we were to call _aggregate_series_fast - return self._aggregate_series_pure_python(obj, func) - - elif is_extension_array_dtype(obj.dtype): - # _aggregate_series_fast would raise TypeError when - # calling libreduction.Slider - # In the datetime64tz case it would incorrectly cast to tz-naive - # TODO: can we get a performant workaround for EAs backed by ndarray? - return self._aggregate_series_pure_python(obj, func) - - elif obj.index._has_complex_internals: - # Pre-empt TypeError in _aggregate_series_fast - return self._aggregate_series_pure_python(obj, func) - - try: - return self._aggregate_series_fast(obj, func) - except ValueError as err: - if "Function does not reduce" in str(err): - # raised in libreduction - pass - else: - raise return self._aggregate_series_pure_python(obj, func) - def _aggregate_series_fast(self, obj: Series, func): - # At this point we have already checked that - # - obj.index is not a MultiIndex - # - obj is backed by an ndarray, not ExtensionArray - # - len(obj) > 0 - # - ngroups != 0 - func = self._is_builtin_func(func) - - group_index, _, ngroups = self.group_info - - # avoids object / Series creation overhead - dummy = obj.iloc[:0] - indexer = get_group_index_sorter(group_index, ngroups) - obj = obj.take(indexer) - group_index = algorithms.take_nd(group_index, indexer, allow_fill=False) - grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) - result, counts = grouper.get_result() - return result, counts - def _aggregate_series_pure_python(self, obj: Series, func): group_index, _, ngroups = self.group_info @@ -856,13 +814,7 @@ def agg_series(self, obj: Series, func): assert self.ngroups != 0 assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result - if is_extension_array_dtype(obj.dtype): - # pre-empt SeriesBinGrouper from raising TypeError - return self._aggregate_series_pure_python(obj, func) - - dummy = obj[:0] - grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy) - return grouper.get_result() + return self._aggregate_series_pure_python(obj, func) def _is_indexed_like(obj, axes) -> bool: diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index ff74d374e5e3f..5f20155fc46e4 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -9,48 +9,6 @@ import pandas._testing as tm -def test_series_grouper(): - obj = Series(np.random.randn(10)) - dummy = obj.iloc[:0] - - labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) - - grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) - result, counts = grouper.get_result() - - expected = np.array([obj[3:6].mean(), obj[6:].mean()]) - tm.assert_almost_equal(result, expected) - - exp_counts = np.array([3, 4], dtype=np.int64) - tm.assert_almost_equal(counts, exp_counts) - - -def test_series_grouper_requires_nonempty_raises(): - # GH#29500 - obj = Series(np.random.randn(10)) - dummy = obj.iloc[:0] - labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) - - with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"): - libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy) - - -def test_series_bin_grouper(): - obj = Series(np.random.randn(10)) - dummy = obj[:0] - - bins = np.array([3, 6]) - - grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins, dummy) - result, counts = grouper.get_result() - - expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) - tm.assert_almost_equal(result, expected) - - exp_counts = np.array([3, 3, 4], dtype=np.int64) - tm.assert_almost_equal(counts, exp_counts) - - @pytest.mark.parametrize( "binner,closed,expected", [