Skip to content

Commit 0be8f99

Browse files
committed
Merge branch 'master' into bug_38183
2 parents 5970d41 + c2018c1 commit 0be8f99

22 files changed

+240
-76
lines changed

asv_bench/benchmarks/indexing.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -358,14 +358,6 @@ def time_assign_with_setitem(self):
358358
for i in range(100):
359359
self.df[i] = np.random.randn(self.N)
360360

361-
def time_assign_list_like_with_setitem(self):
362-
np.random.seed(1234)
363-
self.df[list(range(100))] = np.random.randn(self.N, 100)
364-
365-
def time_assign_list_of_columns_concat(self):
366-
df = DataFrame(np.random.randn(self.N, 100))
367-
concat([self.df, df], axis=1)
368-
369361

370362
class ChainIndexing:
371363

asv_bench/benchmarks/rolling.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,20 @@ def time_rolling_offset(self, method):
225225
getattr(self.groupby_roll_offset, method)()
226226

227227

228+
class GroupbyLargeGroups:
229+
# https://github.com/pandas-dev/pandas/issues/38038
230+
# specific example where the rolling operation on a larger dataframe
231+
# is relatively cheap (few but large groups), but creation of
232+
# the resulting MultiIndex can be expensive
233+
234+
def setup(self):
235+
N = 100000
236+
self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)})
237+
238+
def time_rolling_multiindex_creation(self):
239+
self.df.groupby("A").rolling(3).mean()
240+
241+
228242
class GroupbyEWM:
229243

230244
params = ["cython", "numba"]

doc/source/whatsnew/v1.1.5.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ Fixed regressions
2424
- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
2525
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
2626
- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
27-
- Fixed performance regression for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
27+
- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`)
2828
- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)
2929

3030
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.2.0.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,9 @@ Alternatively, you can also use the dtype object:
196196
197197
pd.Series([1.5, None], dtype=pd.Float32Dtype())
198198
199+
Operations with the existing integer or boolean nullable data types that
200+
give float results will now also use the nullable floating data types (:issue:`38178`).
201+
199202
.. warning::
200203

201204
Experimental: the new floating data types are currently experimental, and their

pandas/core/arrays/boolean.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -706,10 +706,11 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
706706
if (is_float_dtype(other) or is_float(other)) or (
707707
op_name in ["rtruediv", "truediv"]
708708
):
709-
result[mask] = np.nan
710-
return result
709+
from pandas.core.arrays import FloatingArray
710+
711+
return FloatingArray(result, mask, copy=False)
711712

712-
if is_bool_dtype(result):
713+
elif is_bool_dtype(result):
713714
return BooleanArray(result, mask, copy=False)
714715

715716
elif is_integer_dtype(result):

pandas/core/arrays/integer.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -539,13 +539,15 @@ def _cmp_method(self, other, op):
539539
return BooleanArray(result, mask)
540540

541541
def _arith_method(self, other, op):
542+
from pandas.core.arrays import FloatingArray
543+
542544
op_name = op.__name__
543545
omask = None
544546

545547
if getattr(other, "ndim", 0) > 1:
546548
raise NotImplementedError("can only perform ops with 1-d structures")
547549

548-
if isinstance(other, IntegerArray):
550+
if isinstance(other, (IntegerArray, FloatingArray)):
549551
other, omask = other._data, other._mask
550552

551553
elif is_list_like(other):
@@ -636,8 +638,9 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
636638
if (is_float_dtype(other) or is_float(other)) or (
637639
op_name in ["rtruediv", "truediv"]
638640
):
639-
result[mask] = np.nan
640-
return result
641+
from pandas.core.arrays import FloatingArray
642+
643+
return FloatingArray(result, mask, copy=False)
641644

642645
if result.dtype == "timedelta64[ns]":
643646
from pandas.core.arrays import TimedeltaArray

pandas/core/indexing.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -672,8 +672,17 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None):
672672
and not com.is_bool_indexer(key)
673673
and all(is_hashable(k) for k in key)
674674
):
675-
keys = self.obj.columns.union(key, sort=False)
676-
self.obj._mgr = self.obj._mgr.reindex_axis(keys, 0)
675+
for i, k in enumerate(key):
676+
if k not in self.obj:
677+
if value is None:
678+
self.obj[k] = np.nan
679+
elif is_array_like(value) and value.ndim == 2:
680+
# GH#37964 have to select columnwise in case of array
681+
self.obj[k] = value[:, i]
682+
elif is_list_like(value):
683+
self.obj[k] = value[i]
684+
else:
685+
self.obj[k] = value
677686

678687
def __setitem__(self, key, value):
679688
if isinstance(key, tuple):

pandas/core/window/rolling.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050

5151
from pandas.core.aggregation import aggregate
5252
from pandas.core.base import DataError, SelectionMixin
53-
import pandas.core.common as com
5453
from pandas.core.construction import extract_array
5554
from pandas.core.groupby.base import GotItemMixin, ShallowMixin
5655
from pandas.core.indexes.api import Index, MultiIndex
@@ -791,22 +790,29 @@ def _apply(
791790
# The result still contains the key column(s) at this point, so drop them
792791
result = result.drop(columns=column_keys, errors="ignore")
793792

794-
result_index_data = []
795-
for key, values in self._groupby.grouper.indices.items():
796-
for value in values:
797-
data = [
798-
*com.maybe_make_list(key),
799-
*com.maybe_make_list(
800-
grouped_object_index[value]
801-
if grouped_object_index is not None
802-
else []
803-
),
804-
]
805-
result_index_data.append(tuple(data))
806-
807-
result_index = MultiIndex.from_tuples(
808-
result_index_data, names=result_index_names
793+
codes = self._groupby.grouper.codes
794+
levels = self._groupby.grouper.levels
795+
796+
group_indices = self._groupby.grouper.indices.values()
797+
if group_indices:
798+
indexer = np.concatenate(list(group_indices))
799+
else:
800+
indexer = np.array([], dtype=np.intp)
801+
codes = [c.take(indexer) for c in codes]
802+
803+
# if the index of the original dataframe needs to be preserved, append
804+
# this index (but reordered) to the codes/levels from the groupby
805+
if grouped_object_index is not None:
806+
idx = grouped_object_index.take(indexer)
807+
if not isinstance(idx, MultiIndex):
808+
idx = MultiIndex.from_arrays([idx])
809+
codes.extend(list(idx.codes))
810+
levels.extend(list(idx.levels))
811+
812+
result_index = MultiIndex(
813+
levels, codes, names=result_index_names, verify_integrity=False
809814
)
815+
810816
result.index = result_index
811817
return result
812818

pandas/tests/arrays/boolean/test_arithmetic.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pandas as pd
77
import pandas._testing as tm
8+
from pandas.arrays import FloatingArray
89

910

1011
@pytest.fixture
@@ -51,13 +52,15 @@ def test_sub(left_array, right_array):
5152

5253

5354
def test_div(left_array, right_array):
54-
# for now division gives a float numpy array
5555
result = left_array / right_array
56-
expected = np.array(
57-
[1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
58-
dtype="float64",
56+
expected = FloatingArray(
57+
np.array(
58+
[1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
59+
dtype="float64",
60+
),
61+
np.array([False, False, True, False, False, True, True, True, True]),
5962
)
60-
tm.assert_numpy_array_equal(result, expected)
63+
tm.assert_extension_array_equal(result, expected)
6164

6265

6366
@pytest.mark.parametrize(

pandas/tests/arrays/boolean/test_function.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ def test_value_counts_na():
8585
tm.assert_series_equal(result, expected)
8686

8787

88+
def test_value_counts_with_normalize():
89+
s = pd.Series([True, False, pd.NA], dtype="boolean")
90+
result = s.value_counts(normalize=True)
91+
expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2
92+
tm.assert_series_equal(result, expected)
93+
94+
8895
def test_diff():
8996
a = pd.array(
9097
[True, True, False, False, True, None, True, None, False], dtype="boolean"

pandas/tests/arrays/floating/test_function.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,13 @@ def test_value_counts_empty():
113113
tm.assert_series_equal(result, expected)
114114

115115

116+
def test_value_counts_with_normalize():
117+
s = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
118+
result = s.value_counts(normalize=True)
119+
expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3
120+
tm.assert_series_equal(result, expected)
121+
122+
116123
@pytest.mark.parametrize("skipna", [True, False])
117124
@pytest.mark.parametrize("min_count", [0, 4])
118125
def test_floating_array_sum(skipna, min_count, dtype):

pandas/tests/arrays/integer/test_arithmetic.py

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import pandas as pd
99
import pandas._testing as tm
10-
from pandas.core.arrays import integer_array
10+
from pandas.core.arrays import FloatingArray, integer_array
1111
import pandas.core.ops as ops
1212

1313
# Basic test for the arithmetic array ops
@@ -45,24 +45,26 @@ def test_sub(dtype):
4545

4646

4747
def test_div(dtype):
48-
# for now division gives a float numpy array
4948
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
5049
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
5150

5251
result = a / b
53-
expected = np.array([np.inf, 2, np.nan, np.nan, 1.25], dtype="float64")
54-
tm.assert_numpy_array_equal(result, expected)
52+
expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64")
53+
tm.assert_extension_array_equal(result, expected)
5554

5655

5756
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
5857
def test_divide_by_zero(zero, negative):
5958
# https://github.com/pandas-dev/pandas/issues/27398
6059
a = pd.array([0, 1, -1, None], dtype="Int64")
6160
result = a / zero
62-
expected = np.array([np.nan, np.inf, -np.inf, np.nan])
61+
expected = FloatingArray(
62+
np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"),
63+
np.array([False, False, False, True]),
64+
)
6365
if negative:
6466
expected *= -1
65-
tm.assert_numpy_array_equal(result, expected)
67+
tm.assert_extension_array_equal(result, expected)
6668

6769

6870
def test_floordiv(dtype):
@@ -99,8 +101,11 @@ def test_pow_scalar():
99101
tm.assert_extension_array_equal(result, expected)
100102

101103
result = a ** np.nan
102-
expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64")
103-
tm.assert_numpy_array_equal(result, expected)
104+
expected = FloatingArray(
105+
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
106+
np.array([False, False, False, True, False]),
107+
)
108+
tm.assert_extension_array_equal(result, expected)
104109

105110
# reversed
106111
a = a[1:] # Can't raise integers to negative powers.
@@ -118,8 +123,11 @@ def test_pow_scalar():
118123
tm.assert_extension_array_equal(result, expected)
119124

120125
result = np.nan ** a
121-
expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64")
122-
tm.assert_numpy_array_equal(result, expected)
126+
expected = FloatingArray(
127+
np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
128+
np.array([False, False, True, False]),
129+
)
130+
tm.assert_extension_array_equal(result, expected)
123131

124132

125133
def test_pow_array():
@@ -133,10 +141,10 @@ def test_pow_array():
133141
def test_rpow_one_to_na():
134142
# https://github.com/pandas-dev/pandas/issues/22022
135143
# https://github.com/pandas-dev/pandas/issues/29997
136-
arr = integer_array([np.nan, np.nan])
144+
arr = pd.array([np.nan, np.nan], dtype="Int64")
137145
result = np.array([1.0, 2.0]) ** arr
138-
expected = np.array([1.0, np.nan])
139-
tm.assert_numpy_array_equal(result, expected)
146+
expected = pd.array([1.0, np.nan], dtype="Float64")
147+
tm.assert_extension_array_equal(result, expected)
140148

141149

142150
@pytest.mark.parametrize("other", [0, 0.5])
@@ -198,11 +206,19 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators):
198206

199207
result = op(s, other)
200208
expected = op(s.astype(float), other)
209+
expected = expected.astype("Float64")
201210
# rfloordiv results in nan instead of inf
202211
if all_arithmetic_operators == "__rfloordiv__" and _np_version_under1p20:
203212
# for numpy 1.20 https://github.com/numpy/numpy/pull/16161
204213
# updated floordiv, now matches our behavior defined in core.ops
205-
expected[(expected == np.inf) | (expected == -np.inf)] = np.nan
214+
mask = (
215+
((expected == np.inf) | (expected == -np.inf)).fillna(False).to_numpy(bool)
216+
)
217+
expected.array._data[mask] = np.nan
218+
# rmod yields NaN in positions that were not NA in the original nullable Series -> unmask them
219+
elif all_arithmetic_operators == "__rmod__":
220+
mask = (s == 0).fillna(False).to_numpy(bool)
221+
expected.array._mask[mask] = False
206222

207223
tm.assert_series_equal(result, expected)
208224

@@ -215,7 +231,7 @@ def test_arithmetic_conversion(all_arithmetic_operators, other):
215231

216232
s = pd.Series([1, 2, 3], dtype="Int64")
217233
result = op(s, other)
218-
assert result.dtype is np.dtype("float")
234+
assert result.dtype == "Float64"
219235

220236

221237
def test_cross_type_arithmetic():

pandas/tests/arrays/integer/test_function.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,14 @@ def test_value_counts_empty():
127127
tm.assert_series_equal(result, expected)
128128

129129

130+
def test_value_counts_with_normalize():
131+
# GH 33172
132+
s = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
133+
result = s.value_counts(normalize=True)
134+
expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3
135+
tm.assert_series_equal(result, expected)
136+
137+
130138
@pytest.mark.parametrize("skipna", [True, False])
131139
@pytest.mark.parametrize("min_count", [0, 4])
132140
def test_integer_array_sum(skipna, min_count, any_nullable_int_dtype):

pandas/tests/arrays/masked/test_arithmetic.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,7 @@ def test_array_scalar_like_equivalence(data, all_arithmetic_operators):
4343
for scalar in [scalar, data.dtype.type(scalar)]:
4444
result = op(data, scalar)
4545
expected = op(data, scalar_array)
46-
if isinstance(expected, ExtensionArray):
47-
tm.assert_extension_array_equal(result, expected)
48-
else:
49-
# TODO div still gives float ndarray -> remove this once we have Float EA
50-
tm.assert_numpy_array_equal(result, expected)
46+
tm.assert_extension_array_equal(result, expected)
5147

5248

5349
def test_array_NA(data, all_arithmetic_operators):

pandas/tests/arrays/string_/test_string.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,18 @@ def test_value_counts_na(dtype, request):
495495
tm.assert_series_equal(result, expected)
496496

497497

498+
def test_value_counts_with_normalize(dtype, request):
499+
if dtype == "arrow_string":
500+
reason = "TypeError: boolean value of NA is ambiguous"
501+
mark = pytest.mark.xfail(reason=reason)
502+
request.node.add_marker(mark)
503+
504+
s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
505+
result = s.value_counts(normalize=True)
506+
expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3
507+
tm.assert_series_equal(result, expected)
508+
509+
498510
@pytest.mark.parametrize(
499511
"values, expected",
500512
[

0 commit comments

Comments
 (0)