API: Sum / Prod of all-NA and empty

TomAugspurger · TomAugspurger · commit a87f1f94c9b2 · 2017-12-20T07:06:06.000-06:00
Changes the sum of empty and all-NA input to be 0 (previously NA).
Changes the prod of empty and all-NA input to be 1 (previously NA).
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -7310,7 +7310,8 @@ def _add_numeric_operations(cls):
         @Substitution(outname='mad',
                       desc="Return the mean absolute deviation of the values "
                            "for the requested axis",
-                      name1=name, name2=name2, axis_descr=axis_descr)
+                      name1=name, name2=name2, axis_descr=axis_descr,
+                      empty_is_na='')
         @Appender(_num_doc)
         def mad(self, axis=None, skipna=None, level=None):
             if skipna is None:
@@ -7351,7 +7352,7 @@ def mad(self, axis=None, skipna=None, level=None):
         @Substitution(outname='compounded',
                       desc="Return the compound percentage of the values for "
                       "the requested axis", name1=name, name2=name2,
-                      axis_descr=axis_descr)
+                      axis_descr=axis_descr, empty_is_na='')
         @Appender(_num_doc)
         def compound(self, axis=None, skipna=None, level=None):
             if skipna is None:
@@ -7375,10 +7376,11 @@ def compound(self, axis=None, skipna=None, level=None):
             lambda y, axis: np.maximum.accumulate(y, axis), "max",
             -np.inf, np.nan)
 
-        cls.sum = _make_stat_function(
+        cls.sum = _make_empty_stat_function(
             cls, 'sum', name, name2, axis_descr,
             'Return the sum of the values for the requested axis',
-            nanops.nansum)
+            nanops.nansum,
+            empty_is_na=False)
         cls.mean = _make_stat_function(
             cls, 'mean', name, name2, axis_descr,
             'Return the mean of the values for the requested axis',
@@ -7394,10 +7396,11 @@ def compound(self, axis=None, skipna=None, level=None):
             "by N-1\n",
             nanops.nankurt)
         cls.kurtosis = cls.kurt
-        cls.prod = _make_stat_function(
+        cls.prod = _make_empty_stat_function(
             cls, 'prod', name, name2, axis_descr,
             'Return the product of the values for the requested axis',
-            nanops.nanprod)
+            nanops.nanprod,
+            empty_is_na=False)
         cls.product = cls.prod
         cls.median = _make_stat_function(
             cls, 'median', name, name2, axis_descr,
@@ -7520,14 +7523,14 @@ def _doc_parms(cls):
 ----------
 axis : %(axis_descr)s
 skipna : boolean, default True
-    Exclude NA/null values. If an entire row/column is NA or empty, the result
-    will be NA
+    Exclude NA/null values before computing the result.
 level : int or level name, default None
     If the axis is a MultiIndex (hierarchical), count along a
     particular level, collapsing into a %(name1)s
 numeric_only : boolean, default None
     Include only float, int, boolean columns. If None, will attempt to use
-    everything, then use only numeric data. Not implemented for Series.
+    everything, then use only numeric data. Not implemented for
+    Series.%(empty_is_na)s
 
 Returns
 -------
@@ -7584,7 +7587,7 @@ def _doc_parms(cls):
 axis : %(axis_descr)s
 skipna : boolean, default True
     Exclude NA/null values. If an entire row/column is NA, the result
-    will be NA
+    will be NA.
 
 Returns
 -------
@@ -7598,16 +7601,45 @@ def _doc_parms(cls):
 
 """
 
+_empty_is_na_doc = """
+empty_is_na : bool, default False
+    Whether the result of operating on an empty array should be NA. The
+    default behavior is for the sum of an empty array to be 0, and the
+    product of an empty array to be 1.
+
+    When ``skipna=True``, "empty" refers to whether or not the array
+    is empty after removing NAs. So operating on an all-NA array with
+    ``skipna=True`` will be NA when ``empty_is_na`` is True.
+    """
+
+
+def _make_empty_stat_function(cls, name, name1, name2, axis_descr, desc, f,
+                              empty_is_na=False):
+    @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
+                  axis_descr=axis_descr, empty_is_na=_empty_is_na_doc)
+    @Appender(_num_doc)
+    def stat_func(self, axis=None, skipna=True, level=None, numeric_only=None,
+                  empty_is_na=empty_is_na, **kwargs):
+        nv.validate_stat_func(tuple(), kwargs, fname=name)
+        if axis is None:
+            axis = self._stat_axis_number
+        if level is not None:
+            return self._agg_by_level(name, axis=axis, level=level,
+                                      skipna=skipna, empty_is_na=empty_is_na)
+        return self._reduce(f, name, axis=axis, skipna=skipna,
+                            numeric_only=numeric_only,
+                            empty_is_na=empty_is_na)
+
+    return set_function_name(stat_func, name, cls)
+
 
 def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f):
     @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
-                  axis_descr=axis_descr)
+                  axis_descr=axis_descr, empty_is_na='')
     @Appender(_num_doc)
-    def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
+    def stat_func(self, axis=None, skipna=True, level=None, numeric_only=None,
                   **kwargs):
         nv.validate_stat_func(tuple(), kwargs, fname=name)
-        if skipna is None:
-            skipna = True
         if axis is None:
             axis = self._stat_axis_number
         if level is not None:
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -107,7 +107,8 @@ def f(values, axis=None, skipna=True, **kwds):
                     if k not in kwds:
                         kwds[k] = v
             try:
-                if values.size == 0:
+                # TODO: NaT
+                if values.size == 0 and kwds.get('empty_is_na'):
 
                     # we either return np.nan or pd.NaT
                     if is_numeric_dtype(values):
@@ -155,6 +156,7 @@ def _bn_ok_dtype(dt, name):
     # Bottleneck chokes on datetime64
     if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):
 
+        # TODO: handle this overflow
         # GH 15507
         # bottleneck does not properly upcast during the sum
         # so can overflow
@@ -163,6 +165,9 @@ def _bn_ok_dtype(dt, name):
         # further we also want to preserve NaN when all elements
         # are NaN, unlinke bottleneck/numpy which consider this
         # to be 0
+
+        # https://github.com/kwgoodman/bottleneck/issues/180
+        # No upcast for boolean -> int
         if name in ['nansum', 'nanprod']:
             return False
 
@@ -303,22 +308,21 @@ def nanall(values, axis=None, skipna=True):
 
 
 @disallow('M8')
-@bottleneck_switch()
-def nansum(values, axis=None, skipna=True):
+@bottleneck_switch(empty_is_na=False)
+def nansum(values, axis=None, skipna=True, empty_is_na=False):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
     dtype_sum = dtype_max
     if is_float_dtype(dtype):
         dtype_sum = dtype
     elif is_timedelta64_dtype(dtype):
         dtype_sum = np.float64
     the_sum = values.sum(axis, dtype=dtype_sum)
-    the_sum = _maybe_null_out(the_sum, axis, mask)
+    the_sum = _maybe_null_out(the_sum, axis, mask, empty_is_na)
 
     return _wrap_results(the_sum, dtype)
 
 
 @disallow('M8')
-@bottleneck_switch()
 def nanmean(values, axis=None, skipna=True):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
 
@@ -641,13 +645,15 @@ def nankurt(values, axis=None, skipna=True):
 
 
 @disallow('M8', 'm8')
-def nanprod(values, axis=None, skipna=True):
+@bottleneck_switch(empty_is_na=False)
+def nanprod(values, axis=None, skipna=True, empty_is_na=False):
     mask = isna(values)
     if skipna and not is_any_int_dtype(values):
         values = values.copy()
         values[mask] = 1
     result = values.prod(axis)
-    return _maybe_null_out(result, axis, mask)
+
+    return _maybe_null_out(result, axis, mask, empty_is_na, unit=1.0)
 
 
 def _maybe_arg_null_out(result, axis, mask, skipna):
@@ -683,9 +689,13 @@ def _get_counts(mask, axis, dtype=float):
         return np.array(count, dtype=dtype)
 
 
-def _maybe_null_out(result, axis, mask):
+def _maybe_null_out(result, axis, mask, empty_is_na=True, unit=0.0):
     if axis is not None and getattr(result, 'ndim', False):
         null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
+
+        if not empty_is_na:
+            null_mask[result == unit] = False
+
         if np.any(null_mask):
             if is_numeric_dtype(result):
                 if np.iscomplexobj(result):
@@ -698,7 +708,7 @@ def _maybe_null_out(result, axis, mask):
                 result[null_mask] = None
     elif result is not tslib.NaT:
         null_mask = mask.size - mask.sum()
-        if null_mask == 0:
+        if null_mask == 0.0 and empty_is_na:
             result = np.nan
 
     return result
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -478,10 +478,11 @@ def test_nunique(self):
                                Series({0: 1, 1: 3, 2: 2}))
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum, has_numeric_only=True)
+        self._check_stat_op('sum', np.nansum, has_numeric_only=True,
+                            no_skipna_alternative=np.sum)
 
         # mixed types (with upcasting happening)
-        self._check_stat_op('sum', np.sum,
+        self._check_stat_op('sum', np.nansum,
                             frame=self.mixed_float.astype('float32'),
                             has_numeric_only=True, check_dtype=False,
                             check_less_precise=True)
@@ -753,7 +754,8 @@ def alt(x):
 
     def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
                        has_numeric_only=False, check_dtype=True,
-                       check_dates=False, check_less_precise=False):
+                       check_dates=False, check_less_precise=False,
+                       no_skipna_alternative=None):
         if frame is None:
             frame = self.frame
             # set some NAs
@@ -774,14 +776,20 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
             assert len(result)
 
         if has_skipna:
-            def skipna_wrapper(x):
-                nona = x.dropna()
-                if len(nona) == 0:
-                    return np.nan
-                return alternative(nona)
+            alt = no_skipna_alternative or alternative  # e.g. sum / nansum
+
+            if no_skipna_alternative:
+                def skipna_wrapper(x):
+                    return alternative(x.values)
+            else:
+                def skipna_wrapper(x):
+                    nona = x.dropna()
+                    if len(nona) == 0:
+                        return np.nan
+                    return alt(nona)
 
             def wrapper(x):
-                return alternative(x.values)
+                return alt(x.values)
 
             result0 = f(axis=0, skipna=False)
             result1 = f(axis=1, skipna=False)
@@ -793,7 +801,7 @@ def wrapper(x):
                                    check_dtype=False,
                                    check_less_precise=check_less_precise)
         else:
-            skipna_wrapper = alternative
+            skipna_wrapper =alternative
             wrapper = alternative
 
         result0 = f(axis=0)
@@ -834,6 +842,12 @@ def wrapper(x):
             r0 = getattr(all_na, name)(axis=0)
             r1 = getattr(all_na, name)(axis=1)
             if name in ['sum', 'prod']:
+                tm.assert_numpy_array_equal(r0.values, np.zeros_like(r0))
+                tm.assert_numpy_array_equal(r1.values, np.zeros_like(r1))
+
+            if name in ['sum', 'prod']:
+                r0 = getattr(all_na, name)(axis=0, skipna=False)
+                r1 = getattr(all_na, name)(axis=1, skipna=False)
                 assert np.isnan(r0).all()
                 assert np.isnan(r1).all()
 
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -33,40 +33,46 @@ class TestSeriesAnalytics(TestData):
     @pytest.mark.parametrize("method", ["sum", "prod"])
     def test_empty(self, method, use_bottleneck):
 
+        if method == "sum":
+            unit = 0
+        else:
+            unit = 1
         with pd.option_context("use_bottleneck", use_bottleneck):
-            # GH 9422
-            # treat all missing as NaN
+            # GH 9422 / 18678
+            # treat all missing as the identity: 0 for sum, 1 for prod
             s = Series([])
             result = getattr(s, method)()
-            assert isna(result)
+            assert result == unit
 
             result = getattr(s, method)(skipna=True)
-            assert isna(result)
+            assert result == unit
 
             s = Series([np.nan])
             result = getattr(s, method)()
-            assert isna(result)
+            assert result == unit
 
             result = getattr(s, method)(skipna=True)
-            assert isna(result)
+            assert result == unit
 
             s = Series([np.nan, 1])
             result = getattr(s, method)()
-            assert result == 1.0
+            assert result == 1
 
             s = Series([np.nan, 1])
             result = getattr(s, method)(skipna=True)
             assert result == 1.0
 
             # GH #844 (changed in 9422)
             df = DataFrame(np.empty((10, 0)))
-            assert (df.sum(1).isnull()).all()
+            result = df.sum(1)
+            expected = pd.Series(0, index=df.index, dtype='float64')
+            tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
-        "method", ['sum', 'mean', 'median', 'std', 'var'])
+        "method", ['mean', 'median', 'std', 'var'])
     def test_ops_consistency_on_empty(self, method):
 
-        # GH 7869
+        # GH 7869 / 18678
         # consistency on empty
 
         # float
@@ -77,6 +83,19 @@ def test_ops_consistency_on_empty(self, method):
         result = getattr(Series(dtype='m8[ns]'), method)()
         assert result is pd.NaT
 
+    @pytest.mark.parametrize('method, unit', [
+        ('sum', 0),
+        ('prod', 1),
+    ])
+    def test_ops_consistency_on_empty_sum_prod(self, method, unit):
+        # GH 18678
+        result = getattr(Series(dtype=float), method)()
+        assert result == unit
+
+        if method == 'sum':
+            result = getattr(Series(dtype='m8[ns]'), method)()
+            assert result == pd.Timedelta(0)
+
     def test_nansum_buglet(self):
         s = Series([1.0, np.nan], index=[0, 1])
         result = np.nansum(s)
@@ -111,7 +130,7 @@ def test_sum_overflow(self, use_bottleneck):
                 assert np.allclose(float(result), v[-1])
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum, check_allna=True)
+        self._check_stat_op('sum', np.nansum, check_allna=False)
 
     def test_sum_inf(self):
         s = Series(np.random.randn(10))
diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py
@@ -38,7 +38,7 @@ def test_quantile(self):
 
         # GH7661
         result = Series([np.timedelta64('NaT')]).sum()
-        assert result is pd.NaT
+        assert result == pd.Timedelta(0)
 
         msg = 'percentiles should all be in the interval \\[0, 1\\]'
         for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py