diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index ad5af5df710ba..9f7aff0a30bd3 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -238,6 +238,8 @@ Other enhancements
 - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
 -
 - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
+- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`).
+- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
 - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
 - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
@@ -470,6 +472,7 @@ Deprecations
 - The default value of ``regex`` for :meth:`Series.str.replace` will change from ``True`` to ``False`` in a future release. In addition, single character regular expressions will *not* be treated as literal strings when ``regex=True`` is set. (:issue:`24804`)
 - Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`)
 - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`)
+- Using "outer" ufuncs on DataFrames to return 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`)
 - Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`)
 - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`)
 - Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`)
@@ -750,6 +753,7 @@ Other
 
 - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
 - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`)
+- Fixed metadata propagation in :meth:`Series.abs` and ufuncs called on Series and DataFrames (:issue:`28283`)
 - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly casting from ``PeriodDtype`` to object dtype (:issue:`34871`)
 - Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`)
 - Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`) (:issue:`37381`)
diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py
index da366c9abf0a4..6b28f8f135769 100644
--- a/pandas/core/arraylike.py
+++ b/pandas/core/arraylike.py
@@ -5,8 +5,15 @@
     ExtensionArray
 """
 import operator
+from typing import Any, Callable
+import warnings
 
-from pandas.core.ops import roperator
+import numpy as np
+
+from pandas._libs import lib
+
+from pandas.core.construction import extract_array
+from pandas.core.ops import maybe_dispatch_ufunc_to_dunder_op, roperator
 from pandas.core.ops.common import unpack_zerodim_and_defer
 
 
@@ -140,3 +147,178 @@ def __pow__(self, other):
     @unpack_zerodim_and_defer("__rpow__")
     def __rpow__(self, other):
         return self._arith_method(other, roperator.rpow)
+
+
+def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any):
+    """
+    Compatibility with numpy ufuncs.
+
+    Shared ``__array_ufunc__`` implementation for ``NDFrame`` subclasses.
+
+    Parameters
+    ----------
+    self : NDFrame
+        The object whose ``__array_ufunc__`` was invoked.
+    ufunc : numpy.ufunc
+        The ufunc being applied.
+    method : str
+        Name of the ufunc method being called, e.g. ``"__call__"`` or
+        ``"outer"``.
+    *inputs
+        Positional inputs of the ufunc call. ``NDFrame`` inputs are aligned
+        on the union of their axes before the ufunc is applied.
+    **kwargs
+        Keyword arguments passed through to the ufunc.
+
+    Returns
+    -------
+    object
+        ``NotImplemented`` when another input should handle the call.
+        Otherwise the result of the dunder-method dispatch or of the ufunc
+        (a tuple of results when ``ufunc.nout > 1``). Scalars and arrays
+        whose ``ndim`` differs from ``self.ndim`` are returned as-is; other
+        ufunc results are wrapped with ``self._constructor``.
+
+    Raises
+    ------
+    NotImplementedError
+        If several ``NDFrame`` inputs are passed and the inputs are not all
+        of the same type, or if ``method`` is ``"outer"``, ``self`` is not
+        2-dimensional and the result's ``ndim`` differs from ``self.ndim``.
+
+    See also
+    --------
+    numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__
+    """
+    from pandas.core.generic import NDFrame
+    from pandas.core.internals import BlockManager
+
+    cls = type(self)
+
+    # for binary ops, use our custom dunder methods
+    result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs)
+    if result is not NotImplemented:
+        return result
+
+    # Determine if we should defer.
+    no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__)
+
+    for item in inputs:
+        higher_priority = (
+            hasattr(item, "__array_priority__")
+            and item.__array_priority__ > self.__array_priority__
+        )
+        has_array_ufunc = (
+            hasattr(item, "__array_ufunc__")
+            and type(item).__array_ufunc__ not in no_defer
+            and not isinstance(item, self._HANDLED_TYPES)
+        )
+        if higher_priority or has_array_ufunc:
+            return NotImplemented
+
+    # align all the inputs.
+    types = tuple(type(x) for x in inputs)
+    alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]
+
+    if len(alignable) > 1:
+        # This triggers alignment.
+        # At the moment, there aren't any ufuncs with more than two inputs
+        # so this ends up just being x1.index | x2.index, but we write
+        # it to handle *args.
+
+        if len(set(types)) > 1:
+            # We currently don't handle ufunc(DataFrame, Series)
+            # well. Previously this raised an internal ValueError. We might
+            # support it someday, so raise a NotImplementedError.
+            raise NotImplementedError(
+                "Cannot apply ufunc {} to mixed DataFrame and Series "
+                "inputs.".format(ufunc)
+            )
+        axes = self.axes
+        for obj in alignable[1:]:
+            # this relies on the fact that we aren't handling mixed
+            # series / frame ufuncs.
+            for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
+                axes[i] = ax1.union(ax2)
+
+        reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
+        inputs = tuple(
+            x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
+            for x, t in zip(inputs, types)
+        )
+    else:
+        reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
+
+    if self.ndim == 1:
+        names = [x.name for x in inputs if hasattr(x, "name")]
+        name = names[0] if len(set(names)) == 1 else None
+        reconstruct_kwargs = {"name": name}
+    else:
+        reconstruct_kwargs = {}
+
+    def reconstruct(result):
+        if lib.is_scalar(result):
+            return result
+        if result.ndim != self.ndim:
+            if method == "outer":
+                if self.ndim == 2:
+                    # we already deprecated for Series
+                    msg = (
+                        "outer method for ufunc {} is not implemented on "
+                        "pandas objects. Returning an ndarray, but in the "
+                        "future this will raise a 'NotImplementedError'. "
+                        "Consider explicitly converting the DataFrame "
+                        "to an array with '.to_numpy()' first."
+                    )
+                    warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4)
+                    return result
+                raise NotImplementedError
+            return result
+        if isinstance(result, BlockManager):
+            # we went through BlockManager.apply
+            result = self._constructor(result, **reconstruct_kwargs, copy=False)
+        else:
+            # we converted an array, lost our axes
+            result = self._constructor(
+                result, **reconstruct_axes, **reconstruct_kwargs, copy=False
+            )
+        # TODO: When we support multiple values in __finalize__, this
+        # should pass alignable to `__finalize__` instead of self.
+        # Then `np.add(a, b)` would consider attrs from both a and b
+        # when a and b are NDFrames.
+        if len(alignable) == 1:
+            result = result.__finalize__(self)
+        return result
+
+    if self.ndim > 1 and (
+        len(inputs) > 1
+        or ufunc.nout > 1  # type: ignore[attr-defined]
+        or method != "__call__"
+        or kwargs
+    ):
+        # Just give up on preserving types in the complex case.
+        # In theory we could preserve them for them.
+        # * nout>1 is doable if BlockManager.apply took nout and
+        #   returned a Tuple[BlockManager].
+        # * len(inputs) > 1 is doable when we know that we have
+        #   aligned blocks / dtypes.
+        # * other ufunc methods (e.g. reduce, accumulate) and keyword
+        #   arguments (e.g. dtype, where, axis) cannot safely be applied
+        #   block-by-block, so they take the ndarray path as well.
+        inputs = tuple(np.asarray(x) for x in inputs)
+        result = getattr(ufunc, method)(*inputs, **kwargs)
+    elif self.ndim == 1:
+        # ufunc(series, ...)
+        inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
+        result = getattr(ufunc, method)(*inputs, **kwargs)
+    else:
+        # plain ufunc(dataframe) call without keyword arguments:
+        # apply block-wise through the BlockManager.
+        mgr = inputs[0]._mgr
+        result = mgr.apply(getattr(ufunc, method))
+
+    if ufunc.nout > 1:  # type: ignore[attr-defined]
+        result = tuple(reconstruct(x) for x in result)
+    else:
+        result = reconstruct(result)
+    return result
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 5b87c4ea8b9cc..9c70f3557e339 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -434,6 +434,7 @@ class DataFrame(NDFrame, OpsMixin):
 
     _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
     _typ = "dataframe"
+    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
 
     @property
     def _constructor(self) -> Type[DataFrame]:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 7e8012d76fe1b..e866314f00639 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -87,7 +87,7 @@
 from pandas.core.dtypes.missing import isna, notna
 
 import pandas as pd
-from pandas.core import indexing, missing, nanops
+from pandas.core import arraylike, indexing, missing, nanops
 import pandas.core.algorithms as algos
 from pandas.core.base import PandasObject, SelectionMixin
 import pandas.core.common as com
@@ -1927,6 +1927,11 @@ def __array_wrap__(
             self, method="__array_wrap__"
         )
 
+    def __array_ufunc__(
+        self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
+    ):
+        return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
+
     # ideally we would define this to avoid the getattr checks, but
     # is slower
     # @property
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4c3ad38c8a922..1e4c0e07de403 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -683,81 +683,6 @@ def view(self, dtype=None) -> "Series":
     # NDArray Compat
     _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)
 
-    def __array_ufunc__(
-        self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
-    ):
-        # TODO: handle DataFrame
-        cls = type(self)
-
-        # for binary ops, use our custom dunder methods
-        result = ops.maybe_dispatch_ufunc_to_dunder_op(
-            self, ufunc, method, *inputs, **kwargs
-        )
-        if result is not NotImplemented:
-            return result
-
-        # Determine if we should defer.
-        no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__)
-
-        for item in inputs:
-            higher_priority = (
-                hasattr(item, "__array_priority__")
-                and item.__array_priority__ > self.__array_priority__
-            )
-            has_array_ufunc = (
-                hasattr(item, "__array_ufunc__")
-                and type(item).__array_ufunc__ not in no_defer
-                and not isinstance(item, self._HANDLED_TYPES)
-            )
-            if higher_priority or has_array_ufunc:
-                return NotImplemented
-
-        # align all the inputs.
-        names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
-        types = tuple(type(x) for x in inputs)
-        # TODO: dataframe
-        alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)]
-
-        if len(alignable) > 1:
-            # This triggers alignment.
-            # At the moment, there aren't any ufuncs with more than two inputs
-            # so this ends up just being x1.index | x2.index, but we write
-            # it to handle *args.
-            index = alignable[0].index
-            for s in alignable[1:]:
-                index = index.union(s.index)
-            inputs = tuple(
-                x.reindex(index) if issubclass(t, Series) else x
-                for x, t in zip(inputs, types)
-            )
-        else:
-            index = self.index
-
-        inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
-        result = getattr(ufunc, method)(*inputs, **kwargs)
-
-        name = names[0] if len(set(names)) == 1 else None
-
-        def construct_return(result):
-            if lib.is_scalar(result):
-                return result
-            elif result.ndim > 1:
-                # e.g. np.subtract.outer
-                if method == "outer":
-                    # GH#27198
-                    raise NotImplementedError
-                return result
-            return self._constructor(result, index=index, name=name, copy=False)
-
-        if type(result) is tuple:
-            # multiple return values
-            return tuple(construct_return(x) for x in result)
-        elif method == "at":
-            # no return value
-            return None
-        else:
-            return construct_return(result)
-
     def __array__(self, dtype=None) -> np.ndarray:
         """
         Return the values as a NumPy array.
diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py
new file mode 100644
index 0000000000000..7bc9aa29af3b4
--- /dev/null
+++ b/pandas/tests/frame/test_ufunc.py
@@ -0,0 +1,133 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+dtypes = [
+    "int64",
+    "Int64",
+    dict(A="int64", B="Int64"),
+]
+
+
+@pytest.mark.parametrize("dtype", dtypes)
+def test_unary_unary(dtype):
+    # unary input, unary output
+    values = np.array([[-1, -1], [1, 1]], dtype="int64")
+    df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"]).astype(dtype=dtype)
+    result = np.positive(df)
+    expected = pd.DataFrame(
+        np.positive(values), index=df.index, columns=df.columns
+    ).astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", dtypes)
+def test_unary_binary(dtype):
+    # unary input, binary output
+    if pd.api.types.is_extension_array_dtype(dtype) or isinstance(dtype, dict):
+        pytest.xfail(reason="Extension / mixed with multiple outputs not implemented.")
+
+    values = np.array([[-1, -1], [1, 1]], dtype="int64")
+    df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"]).astype(dtype=dtype)
+    result_pandas = np.modf(df)
+    assert isinstance(result_pandas, tuple)
+    assert len(result_pandas) == 2
+    expected_numpy = np.modf(values)
+
+    for result, b in zip(result_pandas, expected_numpy):
+        expected = pd.DataFrame(b, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", dtypes)
+def test_binary_input_dispatch_binop(dtype):
+    # binop ufuncs are dispatched to our dunder methods.
+    values = np.array([[-1, -1], [1, 1]], dtype="int64")
+    df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"]).astype(dtype=dtype)
+    result = np.add(df, df)
+    expected = pd.DataFrame(
+        np.add(values, values), index=df.index, columns=df.columns
+    ).astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype_a", dtypes)
+@pytest.mark.parametrize("dtype_b", dtypes)
+def test_binary_input_aligns_columns(dtype_a, dtype_b):
+    if (
+        pd.api.types.is_extension_array_dtype(dtype_a)
+        or isinstance(dtype_a, dict)
+        or pd.api.types.is_extension_array_dtype(dtype_b)
+        or isinstance(dtype_b, dict)
+    ):
+        pytest.xfail(reason="Extension / mixed with multiple inputs not implemented.")
+
+    df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}).astype(dtype_a)
+
+    # df2 has columns A and C: remap the dict dtype without mutating the
+    # shared module-level ``dtypes`` entry (only reached once the xfail is lifted).
+    if isinstance(dtype_b, dict):
+        dtype_b = {"A": dtype_b["A"], "C": dtype_b["B"]}
+
+    df2 = pd.DataFrame({"A": [1, 2], "C": [3, 4]}).astype(dtype_b)
+    result = np.heaviside(df1, df2)
+    expected = np.heaviside(
+        np.array([[1, 3, np.nan], [2, 4, np.nan]]),
+        np.array([[1, np.nan, 3], [2, np.nan, 4]]),
+    )
+    expected = pd.DataFrame(expected, index=[0, 1], columns=["A", "B", "C"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", dtypes)
+def test_binary_input_aligns_index(dtype):
+    if pd.api.types.is_extension_array_dtype(dtype) or isinstance(dtype, dict):
+        pytest.xfail(reason="Extension / mixed with multiple inputs not implemented.")
+    df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).astype(dtype)
+    df2 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "c"]).astype(dtype)
+    result = np.heaviside(df1, df2)
+    expected = np.heaviside(
+        np.array([[1, 3], [2, 4], [np.nan, np.nan]]),
+        np.array([[1, 3], [np.nan, np.nan], [2, 4]]),
+    )
+    # TODO(FloatArray): this will be Float64Dtype.
+    expected = pd.DataFrame(expected, index=["a", "b", "c"], columns=["A", "B"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_binary_frame_series_raises():
+    # We don't currently implement ufunc(DataFrame, Series)
+    df = pd.DataFrame({"A": [1, 2]})
+    with pytest.raises(NotImplementedError, match="logaddexp"):
+        np.logaddexp(df, df["A"])
+
+    with pytest.raises(NotImplementedError, match="logaddexp"):
+        np.logaddexp(df["A"], df)
+
+
+def test_frame_outer_deprecated():
+    df = pd.DataFrame({"A": [1, 2]})
+    with tm.assert_produces_warning(FutureWarning):
+        np.subtract.outer(df, df)
+
+
+def test_ufunc_passes_kwargs():
+    # keyword arguments must be forwarded to the ufunc for DataFrame inputs
+    values = np.array([[1.0, 4.0], [9.0, 16.0]])
+    df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"])
+
+    result = np.sqrt(df, dtype="float32")
+    expected = pd.DataFrame(
+        np.sqrt(values, dtype="float32"), index=df.index, columns=df.columns
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = np.logaddexp(df, df, dtype="float32")
+    expected = pd.DataFrame(
+        np.logaddexp(values, values, dtype="float32"),
+        index=df.index,
+        columns=df.columns,
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py
index 3f7bebd86e983..300f4cd72573a 100644
--- a/pandas/tests/generic/test_duplicate_labels.py
+++ b/pandas/tests/generic/test_duplicate_labels.py
@@ -37,8 +37,8 @@ def test_construction_ok(self, cls, data):
             operator.methodcaller("add", 1),
             operator.methodcaller("rename", str.upper),
             operator.methodcaller("rename", "name"),
-            pytest.param(operator.methodcaller("abs"), marks=not_implemented),
-            # TODO: test np.abs
+            operator.methodcaller("abs"),
+            np.abs,
         ],
     )
     def test_preserved_series(self, func):
diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py
index ecd70bb415334..4974d3fff1df4 100644
--- a/pandas/tests/generic/test_finalize.py
+++ b/pandas/tests/generic/test_finalize.py
@@ -302,7 +302,7 @@
     (pd.DataFrame, frame_data, operator.inv),
     (pd.Series, [1], operator.inv),
     (pd.DataFrame, frame_data, abs),
-    pytest.param((pd.Series, [1], abs), marks=not_implemented_mark),
+    (pd.Series, [1], abs),
     pytest.param((pd.DataFrame, frame_data, round), marks=not_implemented_mark),
     (pd.Series, [1], round),
     (pd.DataFrame, frame_data, operator.methodcaller("take", [0, 0])),