Merge branch 'master' into bug_38183

ivanovmg · ivanovmg · commit 0be8f996b183 · 2020-12-01T21:54:03.000+07:00
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -358,14 +358,6 @@ def time_assign_with_setitem(self):
         for i in range(100):
             self.df[i] = np.random.randn(self.N)
 
-    def time_assign_list_like_with_setitem(self):
-        np.random.seed(1234)
-        self.df[list(range(100))] = np.random.randn(self.N, 100)
-
-    def time_assign_list_of_columns_concat(self):
-        df = DataFrame(np.random.randn(self.N, 100))
-        concat([self.df, df], axis=1)
-
 
 class ChainIndexing:
 
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
@@ -225,6 +225,20 @@ def time_rolling_offset(self, method):
         getattr(self.groupby_roll_offset, method)()
 
 
+class GroupbyLargeGroups:
+    # https://github.com/pandas-dev/pandas/issues/38038
+    # specific example where the rolling operation on a larger dataframe
+    # is relatively cheap (few but large groups), but creating the
+    # MultiIndex of the result can be expensive
+
+    def setup(self):
+        N = 100000
+        self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)})
+
+    def time_rolling_multiindex_creation(self):
+        self.df.groupby("A").rolling(3).mean()
+
+
 class GroupbyEWM:
 
     params = ["cython", "numba"]
diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst
@@ -24,7 +24,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
 - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
 - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
-- Fixed performance regression for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
+- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`)
 - Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)
 
 .. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -196,6 +196,9 @@ Alternatively, you can also use the dtype object:
 
    pd.Series([1.5, None], dtype=pd.Float32Dtype())
 
+Operations with the existing integer or boolean nullable data types that give
+float results now return the nullable floating data types instead of NumPy float arrays (:issue:`38178`).
+
 .. warning::
 
    Experimental: the new floating data types are currently experimental, and their
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -706,10 +706,11 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
         if (is_float_dtype(other) or is_float(other)) or (
             op_name in ["rtruediv", "truediv"]
         ):
-            result[mask] = np.nan
-            return result
+            from pandas.core.arrays import FloatingArray
+
+            return FloatingArray(result, mask, copy=False)
 
-        if is_bool_dtype(result):
+        elif is_bool_dtype(result):
             return BooleanArray(result, mask, copy=False)
 
         elif is_integer_dtype(result):
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -539,13 +539,15 @@ def _cmp_method(self, other, op):
         return BooleanArray(result, mask)
 
     def _arith_method(self, other, op):
+        from pandas.core.arrays import FloatingArray
+
         op_name = op.__name__
         omask = None
 
         if getattr(other, "ndim", 0) > 1:
             raise NotImplementedError("can only perform ops with 1-d structures")
 
-        if isinstance(other, IntegerArray):
+        if isinstance(other, (IntegerArray, FloatingArray)):
             other, omask = other._data, other._mask
 
         elif is_list_like(other):
@@ -636,8 +638,9 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
         if (is_float_dtype(other) or is_float(other)) or (
             op_name in ["rtruediv", "truediv"]
         ):
-            result[mask] = np.nan
-            return result
+            from pandas.core.arrays import FloatingArray
+
+            return FloatingArray(result, mask, copy=False)
 
         if result.dtype == "timedelta64[ns]":
             from pandas.core.arrays import TimedeltaArray
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -672,8 +672,17 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None):
             and not com.is_bool_indexer(key)
             and all(is_hashable(k) for k in key)
         ):
-            keys = self.obj.columns.union(key, sort=False)
-            self.obj._mgr = self.obj._mgr.reindex_axis(keys, 0)
+            for i, k in enumerate(key):
+                if k not in self.obj:
+                    if value is None:
+                        self.obj[k] = np.nan
+                    elif is_array_like(value) and value.ndim == 2:
+                        # GH#37964: for a 2-D array, select the i-th column for key k
+                        self.obj[k] = value[:, i]
+                    elif is_list_like(value):
+                        self.obj[k] = value[i]
+                    else:
+                        self.obj[k] = value
 
     def __setitem__(self, key, value):
         if isinstance(key, tuple):
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -50,7 +50,6 @@
 
 from pandas.core.aggregation import aggregate
 from pandas.core.base import DataError, SelectionMixin
-import pandas.core.common as com
 from pandas.core.construction import extract_array
 from pandas.core.groupby.base import GotItemMixin, ShallowMixin
 from pandas.core.indexes.api import Index, MultiIndex
@@ -791,22 +790,29 @@ def _apply(
             # Our result will have still kept the column in the result
             result = result.drop(columns=column_keys, errors="ignore")
 
-        result_index_data = []
-        for key, values in self._groupby.grouper.indices.items():
-            for value in values:
-                data = [
-                    *com.maybe_make_list(key),
-                    *com.maybe_make_list(
-                        grouped_object_index[value]
-                        if grouped_object_index is not None
-                        else []
-                    ),
-                ]
-                result_index_data.append(tuple(data))
-
-        result_index = MultiIndex.from_tuples(
-            result_index_data, names=result_index_names
+        codes = self._groupby.grouper.codes
+        levels = self._groupby.grouper.levels
+
+        group_indices = self._groupby.grouper.indices.values()
+        if group_indices:
+            indexer = np.concatenate(list(group_indices))
+        else:
+            indexer = np.array([], dtype=np.intp)
+        codes = [c.take(indexer) for c in codes]
+
+        # if the index of the original dataframe needs to be preserved, append
+        # this index (but reordered) to the codes/levels from the groupby
+        if grouped_object_index is not None:
+            idx = grouped_object_index.take(indexer)
+            if not isinstance(idx, MultiIndex):
+                idx = MultiIndex.from_arrays([idx])
+            codes.extend(list(idx.codes))
+            levels.extend(list(idx.levels))
+
+        result_index = MultiIndex(
+            levels, codes, names=result_index_names, verify_integrity=False
         )
+
         result.index = result_index
         return result
 
diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py
@@ -5,6 +5,7 @@
 
 import pandas as pd
 import pandas._testing as tm
+from pandas.arrays import FloatingArray
 
 
 @pytest.fixture
@@ -51,13 +52,15 @@ def test_sub(left_array, right_array):
 
 
 def test_div(left_array, right_array):
-    # for now division gives a float numpy array
     result = left_array / right_array
-    expected = np.array(
-        [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
-        dtype="float64",
+    expected = FloatingArray(
+        np.array(
+            [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
+            dtype="float64",
+        ),
+        np.array([False, False, True, False, False, True, True, True, True]),
     )
-    tm.assert_numpy_array_equal(result, expected)
+    tm.assert_extension_array_equal(result, expected)
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py
@@ -85,6 +85,13 @@ def test_value_counts_na():
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_with_normalize():
+    s = pd.Series([True, False, pd.NA], dtype="boolean")
+    result = s.value_counts(normalize=True)
+    expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2
+    tm.assert_series_equal(result, expected)
+
+
 def test_diff():
     a = pd.array(
         [True, True, False, False, True, None, True, None, False], dtype="boolean"
diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py
@@ -113,6 +113,13 @@ def test_value_counts_empty():
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_with_normalize():
+    s = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
+    result = s.value_counts(normalize=True)
+    expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize("skipna", [True, False])
 @pytest.mark.parametrize("min_count", [0, 4])
 def test_floating_array_sum(skipna, min_count, dtype):
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -7,7 +7,7 @@
 
 import pandas as pd
 import pandas._testing as tm
-from pandas.core.arrays import integer_array
+from pandas.core.arrays import FloatingArray, integer_array
 import pandas.core.ops as ops
 
 # Basic test for the arithmetic array ops
@@ -45,24 +45,26 @@ def test_sub(dtype):
 
 
 def test_div(dtype):
-    # for now division gives a float numpy array
     a = pd.array([1, 2, 3, None, 5], dtype=dtype)
     b = pd.array([0, 1, None, 3, 4], dtype=dtype)
 
     result = a / b
-    expected = np.array([np.inf, 2, np.nan, np.nan, 1.25], dtype="float64")
-    tm.assert_numpy_array_equal(result, expected)
+    expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64")
+    tm.assert_extension_array_equal(result, expected)
 
 
 @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
 def test_divide_by_zero(zero, negative):
     # https://github.com/pandas-dev/pandas/issues/27398
     a = pd.array([0, 1, -1, None], dtype="Int64")
     result = a / zero
-    expected = np.array([np.nan, np.inf, -np.inf, np.nan])
+    expected = FloatingArray(
+        np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"),
+        np.array([False, False, False, True]),
+    )
     if negative:
         expected *= -1
-    tm.assert_numpy_array_equal(result, expected)
+    tm.assert_extension_array_equal(result, expected)
 
 
 def test_floordiv(dtype):
@@ -99,8 +101,11 @@ def test_pow_scalar():
     tm.assert_extension_array_equal(result, expected)
 
     result = a ** np.nan
-    expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64")
-    tm.assert_numpy_array_equal(result, expected)
+    expected = FloatingArray(
+        np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
+        np.array([False, False, False, True, False]),
+    )
+    tm.assert_extension_array_equal(result, expected)
 
     # reversed
     a = a[1:]  # Can't raise integers to negative powers.
@@ -118,8 +123,11 @@ def test_pow_scalar():
     tm.assert_extension_array_equal(result, expected)
 
     result = np.nan ** a
-    expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64")
-    tm.assert_numpy_array_equal(result, expected)
+    expected = FloatingArray(
+        np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
+        np.array([False, False, True, False]),
+    )
+    tm.assert_extension_array_equal(result, expected)
 
 
 def test_pow_array():
@@ -133,10 +141,10 @@ def test_pow_array():
 def test_rpow_one_to_na():
     # https://github.com/pandas-dev/pandas/issues/22022
     # https://github.com/pandas-dev/pandas/issues/29997
-    arr = integer_array([np.nan, np.nan])
+    arr = pd.array([np.nan, np.nan], dtype="Int64")
     result = np.array([1.0, 2.0]) ** arr
-    expected = np.array([1.0, np.nan])
-    tm.assert_numpy_array_equal(result, expected)
+    expected = pd.array([1.0, np.nan], dtype="Float64")
+    tm.assert_extension_array_equal(result, expected)
 
 
 @pytest.mark.parametrize("other", [0, 0.5])
@@ -198,11 +206,19 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators):
 
     result = op(s, other)
     expected = op(s.astype(float), other)
+    expected = expected.astype("Float64")
     # rfloordiv results in nan instead of inf
     if all_arithmetic_operators == "__rfloordiv__" and _np_version_under1p20:
         # for numpy 1.20 https://github.com/numpy/numpy/pull/16161
         #  updated floordiv, now matches our behavior defined in core.ops
-        expected[(expected == np.inf) | (expected == -np.inf)] = np.nan
+        mask = (
+            ((expected == np.inf) | (expected == -np.inf)).fillna(False).to_numpy(bool)
+        )
+        expected.array._data[mask] = np.nan
+    # rmod results in NaN that wasn't NA in original nullable Series -> unmask it
+    elif all_arithmetic_operators == "__rmod__":
+        mask = (s == 0).fillna(False).to_numpy(bool)
+        expected.array._mask[mask] = False
 
     tm.assert_series_equal(result, expected)
 
@@ -215,7 +231,7 @@ def test_arithmetic_conversion(all_arithmetic_operators, other):
 
     s = pd.Series([1, 2, 3], dtype="Int64")
     result = op(s, other)
-    assert result.dtype is np.dtype("float")
+    assert result.dtype == "Float64"
 
 
 def test_cross_type_arithmetic():
diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
@@ -127,6 +127,14 @@ def test_value_counts_empty():
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_with_normalize():
+    # GH 33172
+    s = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
+    result = s.value_counts(normalize=True)
+    expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize("skipna", [True, False])
 @pytest.mark.parametrize("min_count", [0, 4])
 def test_integer_array_sum(skipna, min_count, any_nullable_int_dtype):
diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py
@@ -43,11 +43,7 @@ def test_array_scalar_like_equivalence(data, all_arithmetic_operators):
     for scalar in [scalar, data.dtype.type(scalar)]:
         result = op(data, scalar)
         expected = op(data, scalar_array)
-        if isinstance(expected, ExtensionArray):
-            tm.assert_extension_array_equal(result, expected)
-        else:
-            # TODO div still gives float ndarray -> remove this once we have Float EA
-            tm.assert_numpy_array_equal(result, expected)
+        tm.assert_extension_array_equal(result, expected)
 
 
 def test_array_NA(data, all_arithmetic_operators):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -495,6 +495,18 @@ def test_value_counts_na(dtype, request):
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_with_normalize(dtype, request):
+    if dtype == "arrow_string":
+        reason = "TypeError: boolean value of NA is ambiguous"
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
+
+    s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
+    result = s.value_counts(normalize=True)
+    expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "values, expected",
     [
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py