diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py
index 58e0db67d6025..64e067d25a454 100644
--- a/asv_bench/benchmarks/binary_ops.py
+++ b/asv_bench/benchmarks/binary_ops.py
@@ -1,3 +1,5 @@
+import operator
+
 import numpy as np
 
 from pandas import DataFrame, Series, date_range
@@ -9,6 +11,36 @@
     import pandas.computation.expressions as expr
 
 
+class IntFrameWithScalar:
+    params = [
+        [np.float64, np.int64],
+        [2, 3.0, np.int32(4), np.float64(5)],
+        [
+            operator.add,
+            operator.sub,
+            operator.mul,
+            operator.truediv,
+            operator.floordiv,
+            operator.pow,
+            operator.mod,
+            operator.eq,
+            operator.ne,
+            operator.gt,
+            operator.ge,
+            operator.lt,
+            operator.le,
+        ],
+    ]
+    param_names = ["dtype", "scalar", "op"]
+
+    def setup(self, dtype, scalar, op):
+        arr = np.random.randn(20000, 100)
+        self.df = DataFrame(arr.astype(dtype))
+
+    def time_frame_op_with_scalar(self, dtype, scalar, op):
+        op(self.df, scalar)
+
+
 class Ops:
 
     params = [[True, False], ["default", 1]]
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 0d0e9d9a54fff..652d5626417cd 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -656,6 +656,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement in :class:`DataFrame` arithmetic and comparison operations with scalars (:issue:`24990`, :issue:`29853`)
 - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
 - Performance improvement in :attr:`MultiIndex.is_monotonic` (:issue:`27495`)
 - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 66f0ad2500c54..ceeaf018eb5f3 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -325,6 +325,24 @@ class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray)
         _generate_range
     """
 
+    @property
+    def ndim(self) -> int:
+        return self._data.ndim
+
+    @property
+    def shape(self):
+        return self._data.shape
+
+    def reshape(self, *args, **kwargs):
+        # Note: we drop any freq
+        data = self._data.reshape(*args, **kwargs)
+        return type(self)(data, dtype=self.dtype)
+
+    def ravel(self, *args, **kwargs):
+        # Note: we drop any freq
+        data = self._data.ravel(*args, **kwargs)
+        return type(self)(data, dtype=self.dtype)
+
     @property
     def _box_func(self):
         """
@@ -413,7 +431,10 @@ def __getitem__(self, key):
         getitem = self._data.__getitem__
         if is_int:
             val = getitem(key)
-            return self._box_func(val)
+            if lib.is_scalar(val):
+                # i.e. self.ndim == 1
+                return self._box_func(val)
+            return type(self)(val, dtype=self.dtype)
 
         if com.is_bool_indexer(key):
             key = np.asarray(key, dtype=bool)
@@ -823,6 +844,8 @@ def inferred_freq(self):
         generated by infer_freq.  Returns None if it can't autodetect the
         frequency.
""" + if self.ndim != 1: + return None try: return frequencies.infer_freq(self) except ValueError: @@ -968,7 +991,7 @@ def _add_timedeltalike_scalar(self, other): """ if isna(other): # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds - new_values = np.empty(len(self), dtype="i8") + new_values = np.empty(self.shape, dtype="i8") new_values[:] = iNaT return new_values @@ -1014,7 +1037,7 @@ def _add_nat(self): # GH#19124 pd.NaT is treated like a timedelta for both timedelta # and datetime dtypes - result = np.zeros(len(self), dtype=np.int64) + result = np.zeros(self.shape, dtype=np.int64) result.fill(iNaT) return type(self)(result, dtype=self.dtype, freq=None) @@ -1028,7 +1051,7 @@ def _sub_nat(self): # For datetime64 dtypes by convention we treat NaT as a datetime, so # this subtraction returns a timedelta64 dtype. # For period dtype, timedelta64 is a close-enough return dtype. - result = np.zeros(len(self), dtype=np.int64) + result = np.zeros(self.shape, dtype=np.int64) result.fill(iNaT) return result.view("timedelta64[ns]") diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 1d4052ad8b114..eb762a23d684d 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -339,7 +339,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): " those." ) raise ValueError(msg) - if values.ndim != 1: + if values.ndim not in [1, 2]: raise ValueError("Only 1-dimensional input arrays are supported.") if values.dtype == "i8": @@ -788,6 +788,9 @@ def _sub_datetime_arraylike(self, other): return new_values.view("timedelta64[ns]") def _add_offset(self, offset): + if self.ndim == 2: + return self.ravel()._add_offset(offset).reshape(self.shape) + assert not isinstance(offset, Tick) try: if self.tz is not None: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index db4effa608582..b95dfc9ba7580 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -217,7 +217,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): " TimedeltaArray ndarray, or Series or Index containing one of those." 
             )
             raise ValueError(msg)
-        if values.ndim != 1:
+        if values.ndim not in [1, 2]:
             raise ValueError("Only 1-dimensional input arrays are supported.")
 
         if values.dtype == "i8":
@@ -1036,8 +1036,6 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
         raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]")
 
     data = np.array(data, copy=copy)
-    if data.ndim != 1:
-        raise ValueError("Only 1-dimensional input arrays are supported.")
 
     assert data.dtype == "m8[ns]", data
     return data, inferred_freq
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 5b755f509e6b9..664f6ea75a3be 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -368,7 +368,19 @@ def apply(self, func, **kwargs):
         """
         with np.errstate(all="ignore"):
             result = func(self.values, **kwargs)
+
+        if is_extension_array_dtype(result) and result.ndim > 1:
+            # if we get a 2D ExtensionArray, we need to split it into 1D pieces
+            nbs = []
+            for i, loc in enumerate(self.mgr_locs):
+                vals = result[i]
+                nv = _block_shape(vals, ndim=self.ndim)
+                block = self.make_block(values=nv, placement=[loc])
+                nbs.append(block)
+            return nbs
+
         if not isinstance(result, Block):
+            # Exclude the 0-dim case so we can do reductions
             result = self.make_block(values=_block_shape(result, ndim=self.ndim))
 
         return result
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index d15a95191745b..9729f172183e7 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -340,13 +340,13 @@ def _verify_integrity(self):
                 f"tot_items: {tot_items}"
             )
 
-    def apply(self, f: str, filter=None, **kwargs):
+    def apply(self, f, filter=None, **kwargs):
         """
        Iterate over the blocks, collect and create a new BlockManager.
 
         Parameters
         ----------
-        f : str
+        f : str or callable
             Name of the Block method to apply.
         filter : list, if supplied, only call the block if the filter is in
             the block
@@ -411,7 +411,10 @@ def apply(self, f: str, filter=None, **kwargs):
                     axis = obj._info_axis_number
                     kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
 
-            applied = getattr(b, f)(**kwargs)
+            if callable(f):
+                applied = b.apply(f, **kwargs)
+            else:
+                applied = getattr(b, f)(**kwargs)
             result_blocks = _extend_blocks(applied, result_blocks)
 
         if len(result_blocks) == 0:
diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
index 14705f4d22e9b..be5e53eaa6721 100644
--- a/pandas/core/ops/__init__.py
+++ b/pandas/core/ops/__init__.py
@@ -26,6 +26,7 @@
     arithmetic_op,
     comparison_op,
     define_na_arithmetic_op,
+    get_array_op,
     logical_op,
 )
 from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY  # noqa:F401
@@ -372,8 +373,10 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None):
     right = lib.item_from_zerodim(right)
     if lib.is_scalar(right) or np.ndim(right) == 0:
 
-        def column_op(a, b):
-            return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}
+        # Get the appropriate array-op to apply to each block's values.
+        array_op = get_array_op(func, str_rep=str_rep)
+        bm = left._data.apply(array_op, right=right)
+        return type(left)(bm)
 
     elif isinstance(right, ABCDataFrame):
         assert right._indexed_same(left)
@@ -713,7 +716,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
             if fill_value is not None:
                 self = self.fillna(fill_value)
 
-            new_data = dispatch_to_series(self, other, op)
+            new_data = dispatch_to_series(self, other, op, str_rep)
             return self._construct_result(new_data)
 
     f.__name__ = op_name
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
index 40bf19c60e144..e0ddd17335175 100644
--- a/pandas/core/ops/array_ops.py
+++ b/pandas/core/ops/array_ops.py
@@ -2,8 +2,9 @@
 Functions for arithmetic and comparison operations on NumPy arrays and
 ExtensionArrays.
 """
+from functools import partial
 import operator
-from typing import Any, Union
+from typing import Any, Optional, Union
 
 import numpy as np
 
@@ -51,10 +52,10 @@ def comp_method_OBJECT_ARRAY(op, x, y):
         if isinstance(y, (ABCSeries, ABCIndex)):
             y = y.values
 
-        result = libops.vec_compare(x, y, op)
+        result = libops.vec_compare(x.ravel(), y, op)
     else:
-        result = libops.scalar_compare(x, y, op)
-    return result
+        result = libops.scalar_compare(x.ravel(), y, op)
+    return result.reshape(x.shape)
 
 
 def masked_arith_op(x, y, op):
@@ -237,9 +238,9 @@ def comparison_op(
     elif is_scalar(rvalues) and isna(rvalues):
         # numpy does not like comparisons vs None
         if op is operator.ne:
-            res_values = np.ones(len(lvalues), dtype=bool)
+            res_values = np.ones(lvalues.shape, dtype=bool)
         else:
-            res_values = np.zeros(len(lvalues), dtype=bool)
+            res_values = np.zeros(lvalues.shape, dtype=bool)
 
     elif is_object_dtype(lvalues.dtype):
         res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
@@ -367,3 +368,27 @@ def fill_bool(x, left=None):
         res_values = filler(res_values)  # type: ignore
 
     return res_values
+
+
+def get_array_op(op, str_rep: Optional[str] = None):
+    """
+    Return a binary array operation corresponding to the given operator op.
+
+    Parameters
+    ----------
+    op : function
+        Binary operator from operator or roperator module.
+    str_rep : str or None, default None
+        str_rep to pass to arithmetic_op
+
+    Returns
+    -------
+    function
+    """
+    op_name = op.__name__.strip("_")
+    if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}:
+        return partial(comparison_op, op=op)
+    elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}:
+        return partial(logical_op, op=op)
+    else:
+        return partial(arithmetic_op, op=op, str_rep=str_rep)
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
index c3cda22497ecb..d5ec473f4c74d 100644
--- a/pandas/tests/arrays/test_datetimes.py
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -24,8 +24,8 @@ def test_only_1dim_accepted(self):
         arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")
 
         with pytest.raises(ValueError, match="Only 1-dimensional"):
-            # 2-dim
-            DatetimeArray(arr.reshape(2, 2))
+            # 3-dim, we allow 2D to sneak in for ops purposes GH#29853
+            DatetimeArray(arr.reshape(2, 2, 1))
 
         with pytest.raises(ValueError, match="Only 1-dimensional"):
             # 0-dim
diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py
index bb6ef09bad17e..8d54ea564e1c2 100644
--- a/pandas/tests/arrays/test_timedeltas.py
+++ b/pandas/tests/arrays/test_timedeltas.py
@@ -12,8 +12,8 @@ def test_only_1dim_accepted(self):
         arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]")
 
         with pytest.raises(ValueError, match="Only 1-dimensional"):
-            # 2-dim
-            TimedeltaArray(arr.reshape(2, 2))
+            # 3-dim, we allow 2D to sneak in for ops purposes GH#29853
+            TimedeltaArray(arr.reshape(2, 2, 1))
 
         with pytest.raises(ValueError, match="Only 1-dimensional"):
             # 0-dim
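
A minimal usage sketch of the fast path this patch adds (not part of the patch itself; the frame shape and scalars below mirror the new IntFrameWithScalar benchmark and are illustrative only). With these changes, a DataFrame-scalar arithmetic or comparison goes through dispatch_to_series, which looks up a block-level function via get_array_op and applies it with BlockManager.apply, operating on each block's 2D values instead of looping column by column:

import operator

import numpy as np

from pandas import DataFrame

# Single consolidated float64 block, as in the benchmark's setup().
df = DataFrame(np.random.randn(20000, 100).astype(np.float64))

# These scalar ops now dispatch blockwise: arithmetic_op handles the
# addition, comparison_op handles the >= comparison.
added = operator.add(df, 4)  # equivalent to df + 4
mask = operator.ge(df, 0.5)  # equivalent to df >= 0.5

assert added.shape == df.shape
assert (mask.dtypes == bool).all()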