-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
PERF: block-wise arithmetic for frame-with-frame #32779
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
1697252
a7764d6
30a836d
3559698
4334353
cb40b0c
713a776
95ef3ad
61e5cd6
89c3d7b
e348e46
519c757
2b1ba18
53e93fc
91c86a3
2034084
8aedf35
0c12d35
9727562
6661dd3
42bbbf3
65ab023
0d958a3
7f91e74
56eef51
4baea6f
41a4e7a
b23144e
7f24d57
ae744b7
b14a98c
f42c403
8a2807e
fa046f0
fd10fb6
7ea5d3a
7150e87
bddfbb0
25f83d6
2142d29
0ca2125
1ea0cc0
2bfc308
d5ad2a0
108004b
7ca5f9a
e78570d
33dfbdf
30f655b
65a4eaf
30d6580
fe21f9c
f86deb4
f3dc465
b57f52c
0c46531
463a145
32e70d8
7989251
455e45e
41e8e78
ac8eea8
8c4f951
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ | |
""" | ||
import datetime | ||
import operator | ||
from typing import TYPE_CHECKING, Optional, Set, Tuple | ||
from typing import TYPE_CHECKING, List, Optional, Set, Tuple | ||
|
||
import numpy as np | ||
|
||
|
@@ -58,6 +58,7 @@ | |
|
||
if TYPE_CHECKING: | ||
from pandas import DataFrame # noqa:F401 | ||
from pandas.core.internals.blocks import Block # noqa: F401 | ||
|
||
# ----------------------------------------------------------------------------- | ||
# constants | ||
|
@@ -353,6 +354,70 @@ def fill_binop(left, right, fill_value): | |
# Dispatch logic | ||
|
||
|
||
def operate_blockwise(left, right, array_op): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok much nicer / readable that before. I would consider moving this to a new module to de-nest these functions for blockwise operations (e.g. pandas/core/ops/blockwise.py), but can be later. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think if you move this to ops/blockwise.py would be a +1 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would really like to put this in a new file |
||
assert right._indexed_same(left) | ||
|
||
res_blks: List["Block"] = [] | ||
rmgr = right._data | ||
for n, blk in enumerate(left._data.blocks): | ||
locs = blk.mgr_locs | ||
|
||
blk_vals = blk.values | ||
|
||
if not isinstance(blk_vals, np.ndarray): | ||
# 1D EA | ||
assert len(locs) == 1, locs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might be more clear to make for example this section (368 to 374) as a function so the structure of what you are doing is more clear. |
||
rser = right.iloc[:, locs[0]] | ||
rvals = extract_array(rser, extract_numpy=True) | ||
res_values = array_op(blk_vals, rvals) | ||
nbs = blk._split_op_result(res_values) | ||
res_blks.extend(nbs) | ||
continue | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add some comments here.(general + line comments as needed) |
||
rblks = rmgr._slice_take_blocks_ax0(locs.indexer) | ||
|
||
for k, rblk in enumerate(rblks): | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
lvals = blk_vals[rblk.mgr_locs.indexer, :] | ||
rvals = rblk.values | ||
|
||
if not isinstance(rvals, np.ndarray): | ||
# 1D EA | ||
assert lvals.shape[0] == 1, lvals.shape | ||
lvals = lvals[0, :] | ||
res_values = array_op(lvals, rvals) | ||
nbs = rblk._split_op_result(res_values) | ||
assert len(nbs) == 1 | ||
nb = nbs[0] | ||
nb.mgr_locs = locs.as_array[nb.mgr_locs] | ||
res_blks.append(nb) | ||
continue | ||
|
||
assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) | ||
|
||
res_values = array_op(lvals, rvals) | ||
assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) | ||
nbs = rblk._split_op_result(res_values) | ||
for nb in nbs: | ||
# TODO: maybe optimize by sticking with slices? | ||
nb_mgr_locs = nb.mgr_locs | ||
nblocs = locs.as_array[nb_mgr_locs.indexer] | ||
nb.mgr_locs = nblocs | ||
assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) | ||
assert all(x in locs.as_array for x in nb.mgr_locs.as_array) | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
res_blks.extend(nbs) | ||
|
||
slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} | ||
nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) | ||
assert nlocs == len(left.columns), (nlocs, len(left.columns)) | ||
assert len(slocs) == nlocs, (len(slocs), nlocs) | ||
assert slocs == set(range(nlocs)), slocs | ||
|
||
# TODO: once this is working, pass do_integrity_check=False | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
new_mgr = type(rmgr)(res_blks, axes=rmgr.axes) | ||
return new_mgr | ||
|
||
|
||
def dispatch_to_series(left, right, func, str_rep=None, axis=None): | ||
""" | ||
Evaluate the frame operation func(left, right) by evaluating | ||
|
@@ -385,8 +450,9 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): | |
elif isinstance(right, ABCDataFrame): | ||
assert right._indexed_same(left) | ||
|
||
def column_op(a, b): | ||
return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. An alternative would be to improve the performance of this line, and avoid the complexity that is added in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. regardless this is worth doing on L343/424 and L348/429 |
||
array_op = get_array_op(func, str_rep=str_rep) | ||
bm = operate_blockwise(left, right, array_op) | ||
return type(left)(bm) | ||
|
||
elif isinstance(right, ABCSeries) and axis == "columns": | ||
# We only get here if called via _combine_series_frame, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
from functools import partial | ||
import operator | ||
from typing import Any, Optional | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
|
@@ -132,7 +133,7 @@ def masked_arith_op(x: np.ndarray, y, op): | |
return result | ||
|
||
|
||
def define_na_arithmetic_op(op, str_rep: str): | ||
def define_na_arithmetic_op(op, str_rep: Optional[str]): | ||
def na_op(x, y): | ||
return na_arithmetic_op(x, y, op, str_rep) | ||
|
||
|
@@ -163,15 +164,18 @@ def na_arithmetic_op(left, right, op, str_rep: Optional[str], is_cmp: bool = Fal | |
""" | ||
import pandas.core.computation.expressions as expressions | ||
|
||
try: | ||
result = expressions.evaluate(op, str_rep, left, right) | ||
except TypeError: | ||
if is_cmp: | ||
# numexpr failed on comparison op, e.g. ndarray[float] > datetime | ||
# In this case we do not fall back to the masked op, as that | ||
# will handle complex numbers incorrectly, see GH#32047 | ||
raise | ||
result = masked_arith_op(left, right, op) | ||
with warnings.catch_warnings(): | ||
# suppress warnings from numpy about element-wise comparison | ||
warnings.simplefilter("ignore", DeprecationWarning) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What warning are we suppressing here, and is this future-proof? If this is the usual In [2]: np.array([1, 2]) == 'a'
/Users/taugspurger/.virtualenvs/dask-dev/bin/ipython:1: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
#!/Users/taugspurger/Envs/dask-dev/bin/python
Out[2]: False then in the future the result will be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this warning-catching was necessary to make the npdev build, pass, im going to see if i can revert it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. looks like catching this is needed in some form, but i can move the catching so as to cast a less-wide net. |
||
try: | ||
result = expressions.evaluate(op, str_rep, left, right) | ||
except TypeError: | ||
if is_cmp: | ||
# numexpr failed on comparison op, e.g. ndarray[float] > datetime | ||
# In this case we do not fall back to the masked op, as that | ||
# will handle complex numbers incorrectly, see GH#32047 | ||
raise | ||
result = masked_arith_op(left, right, op) | ||
|
||
if is_cmp and (is_scalar(result) or result is NotImplemented): | ||
# numpy returned a scalar instead of operating element-wise | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -964,7 +964,9 @@ def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture): | |
obj = tm.box_expected(dti, box_with_array) | ||
expected = tm.box_expected(expected, box_with_array) | ||
|
||
warn = PerformanceWarning if box_with_array is not pd.DataFrame else None | ||
warn = None | ||
if box_with_array is not pd.DataFrame or tz_naive_fixture is None: | ||
warn = PerformanceWarning | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you know why this changed? We do / do not raise a warning on the array-level? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't do a warning when operating |
||
with tm.assert_produces_warning(warn): | ||
result = obj - obj.astype(object) | ||
tm.assert_equal(result, expected) | ||
|
@@ -1388,8 +1390,7 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array): | |
s = DatetimeIndex([Timestamp("2000-1-1"), Timestamp("2000-2-1")]) | ||
s = tm.box_expected(s, box_with_array) | ||
|
||
warn = None if box_with_array is pd.DataFrame else PerformanceWarning | ||
with tm.assert_produces_warning(warn): | ||
with tm.assert_produces_warning(PerformanceWarning): | ||
other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) | ||
other = tm.box_expected(other, box_with_array) | ||
result = s + other | ||
|
Uh oh!
There was an error while loading. Please reload this page.