-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
API/COMPAT: support axis=None for logical reduction (reduce over all axes) #21486
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
a6114b4
4bb6e53
9636d54
18c1b11
3c7f4e5
1c32b2b
1f94469
50db719
9fd9740
ae759bd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,36 @@ and bug fixes. We recommend that all users upgrade to this version. | |
:local: | ||
:backlinks: none | ||
|
||
.. _whatsnew_0232.enhancements: | ||
|
||
Logical Reductions over Entire DataFrame | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
||
:meth:`DataFrame.all` and :meth:`DataFrame.any` now accept ``axis=None`` to reduce over all axes to a scalar (:issue:`19976`) | ||
|
||
.. ipython:: python | ||
|
||
df = pd.DataFrame({"A": [1, 2], "B": [True, False]}) | ||
df.all(axis=None) | ||
|
||
|
||
This also provides compatibility with NumPy 1.15, which now dispatches to ``DataFrame.all``. | ||
With NumPy 1.15 and pandas 0.23.1 or earlier, :func:`numpy.all` and :func:`numpy.any` will no longer reduce over every axis: | |
|
||
.. code-block:: python | ||
|
||
>>> # NumPy 1.15, pandas 0.23.1 | ||
>>> np.any(pd.DataFrame({"A": [False], "B": [False]})) | ||
A False | ||
B False | ||
dtype: bool | ||
|
||
With pandas 0.23.2, that will correctly return False, as it did with NumPy < 1.15. | |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe add ", as it did before with numpy < 1.15" ? |
||
|
||
.. ipython:: python | ||
|
||
np.any(pd.DataFrame({"A": [False], "B": [False]})) | ||
|
||
|
||
.. _whatsnew_0232.fixed_regressions: | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6844,13 +6844,18 @@ def _count_level(self, level, axis=0, numeric_only=False): | |
|
||
def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, | ||
filter_type=None, **kwds): | ||
axis = self._get_axis_number(axis) | ||
if axis is None and filter_type == 'bool': | ||
labels = None | ||
constructor = None | ||
else: | ||
# TODO: Make other agg func handle axis=None properly | ||
axis = self._get_axis_number(axis) | ||
labels = self._get_agg_axis(axis) | ||
constructor = self._constructor | ||
|
||
def f(x): | ||
return op(x, axis=axis, skipna=skipna, **kwds) | ||
|
||
labels = self._get_agg_axis(axis) | ||
|
||
# exclude timedelta/datetime unless we are uniform types | ||
if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type: | ||
numeric_only = True | ||
|
@@ -6859,6 +6864,13 @@ def f(x): | |
try: | ||
values = self.values | ||
result = f(values) | ||
|
||
if (filter_type == 'bool' and values.dtype.kind == 'O' and | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use is_object_dtype |
||
axis is None): | ||
# work around https://github.com/numpy/numpy/issues/10489 | ||
# TODO: combine with hasattr(result, 'dtype') further down | ||
# hard since we don't have `values` down there. | ||
result = np.bool_(result) | ||
except Exception as e: | ||
|
||
# try by-column first | ||
|
@@ -6925,7 +6937,9 @@ def f(x): | |
if axis == 0: | ||
result = coerce_to_dtypes(result, self.dtypes) | ||
|
||
return Series(result, index=labels) | ||
if constructor is not None: | ||
result = Series(result, index=labels) | ||
return result | ||
|
||
def nunique(self, axis=0, dropna=True): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8728,6 +8728,8 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, | |
return rs | ||
|
||
def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): | ||
if axis is None: | ||
raise ValueError("Must specify 'axis' when aggregating by level.") | ||
grouped = self.groupby(level=level, axis=axis, sort=False) | ||
if hasattr(grouped, name) and skipna: | ||
return getattr(grouped, name)(**kwargs) | ||
|
@@ -9055,8 +9057,16 @@ def _doc_parms(cls): | |
|
||
Parameters | ||
---------- | ||
axis : int, default 0 | ||
Select the axis which can be 0 for indices and 1 for columns. | ||
axis : {0 or 'index', 1 or 'columns', None}, default 0 | |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think |
||
Indicate which axis or axes should be reduced. Pass ``axis=None`` to | |
reduce over all axes and return a scalar. | |
|
||
* None : reduce all axes, return a scalar. | ||
* 0 / 'index' : reduce the index, return a Series whose index is the | ||
original column labels. | ||
* 1 / 'columns' : reduce the columns, return a Series whose index is the | ||
original index. | ||
|
||
skipna : boolean, default True | ||
Exclude NA/null values. If an entire row/column is NA, the result | ||
will be NA. | ||
|
@@ -9078,9 +9088,9 @@ def _doc_parms(cls): | |
%(examples)s""" | ||
|
||
_all_doc = """\ | ||
Return whether all elements are True over series or dataframe axis. | ||
Return whether all elements are True, potentially over an axis. | ||
|
||
Returns True if all elements within a series or along a dataframe | ||
Returns True if all elements within a series or along a DataFrame | |
axis are non-zero, not-empty or not-False.""" | ||
|
||
_all_examples = """\ | ||
|
@@ -9093,7 +9103,7 @@ def _doc_parms(cls): | |
>>> pd.Series([True, False]).all() | ||
False | ||
|
||
Dataframes | ||
DataFrames | ||
|
||
Create a dataframe from a dictionary. | ||
|
||
|
@@ -9110,12 +9120,17 @@ def _doc_parms(cls): | |
col2 False | ||
dtype: bool | ||
|
||
Adding axis=1 argument will check if row-wise values all return True. | ||
Specify ``axis=1`` to check if row-wise values all return True. | ||
|
||
>>> df.all(axis=1) | ||
0 True | ||
1 False | ||
dtype: bool | ||
|
||
Or ``axis=None`` for whether every value is True. | ||
|
||
>>> df.all(axis=None) | ||
False | ||
""" | ||
|
||
_all_see_also = """\ | ||
|
@@ -9481,6 +9496,11 @@ def _doc_parms(cls): | |
1 False | ||
dtype: bool | ||
|
||
Aggregating over the entire DataFrame with ``axis=None``. | ||
|
||
>>> df.any(axis=None) | ||
True | ||
|
||
`any` for an empty DataFrame is an empty Series. | ||
|
||
>>> pd.DataFrame([]).any() | ||
|
@@ -9651,22 +9671,17 @@ def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, | |
@Substitution(outname=name, desc=desc, name1=name1, name2=name2, | ||
axis_descr=axis_descr, examples=examples, see_also=see_also) | ||
@Appender(_bool_doc) | ||
def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, | ||
def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, | ||
**kwargs): | ||
nv.validate_logical_func(tuple(), kwargs, fname=name) | ||
if skipna is None: | ||
skipna = True | ||
if axis is None: | ||
axis = self._stat_axis_number | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't there be an override of this in case of Panel? (which is the only case where the stat_axis differs from 0) |
||
if level is not None: | ||
if bool_only is not None: | ||
raise NotImplementedError("Option bool_only is not " | ||
"implemented with option level.") | ||
return self._agg_by_level(name, axis=axis, level=level, | ||
skipna=skipna) | ||
return self._reduce(f, axis=axis, skipna=skipna, | ||
numeric_only=bool_only, filter_type='bool', | ||
name=name) | ||
return self._reduce(f, name, axis=axis, skipna=skipna, | ||
numeric_only=bool_only, filter_type='bool') | ||
|
||
return set_function_name(logical_func, name, cls) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3238,6 +3238,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, | |
|
||
""" | ||
delegate = self._values | ||
if axis is None: | ||
axis = self._stat_axis_number | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this still needed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you have a look at this one? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, missed this earlier. We still call |
||
if isinstance(delegate, np.ndarray): | ||
# Validate that 'axis' is consistent with Series's single axis. | ||
self._get_axis_number(axis) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1159,11 +1159,34 @@ def test_any_all(self): | |
self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) | ||
self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) | ||
|
||
df = DataFrame(randn(10, 4)) > 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would make a new test function here |
||
df.any(1) | ||
df.all(1) | ||
df.any(1, bool_only=True) | ||
df.all(1, bool_only=True) | ||
df = DataFrame({ | ||
'A': [True, False, False], | ||
'B': [True, True, False], | ||
'C': [True, True, True], | ||
}, index=['a', 'b', 'c']) | ||
result = df[['A', 'B']].any(1) | ||
expected = Series([True, True, False], index=['a', 'b', 'c']) | ||
tm.assert_series_equal(result, expected) | ||
|
||
result = df[['A', 'B']].any(1, bool_only=True) | ||
tm.assert_series_equal(result, expected) | ||
|
||
result = df.all(1) | ||
expected = Series([True, False, False], index=['a', 'b', 'c']) | ||
tm.assert_series_equal(result, expected) | ||
|
||
result = df.all(1, bool_only=True) | ||
tm.assert_series_equal(result, expected) | ||
|
||
# Axis is None | ||
result = df.all(axis=None).item() | ||
assert result is False | ||
|
||
result = df.any(axis=None).item() | ||
assert result is True | ||
|
||
result = df[['C']].all(axis=None).item() | ||
assert result is True | ||
|
||
# skip pathological failure cases | ||
# class CantNonzero(object): | ||
|
@@ -1185,6 +1208,78 @@ def test_any_all(self): | |
# df.any(1, bool_only=True) | ||
# df.all(1, bool_only=True) | ||
|
||
@pytest.mark.parametrize('func, data, expected', [ | ||
(np.any, {}, False), | ||
(np.all, {}, True), | ||
(np.any, {'A': []}, False), | ||
(np.all, {'A': []}, True), | ||
(np.any, {'A': [False, False]}, False), | ||
(np.all, {'A': [False, False]}, False), | ||
(np.any, {'A': [True, False]}, True), | ||
(np.all, {'A': [True, False]}, False), | ||
(np.any, {'A': [True, True]}, True), | ||
(np.all, {'A': [True, True]}, True), | ||
|
||
(np.any, {'A': [False], 'B': [False]}, False), | ||
(np.all, {'A': [False], 'B': [False]}, False), | ||
|
||
(np.any, {'A': [False, False], 'B': [False, True]}, True), | ||
(np.all, {'A': [False, False], 'B': [False, True]}, False), | ||
|
||
# other types | ||
(np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False), | ||
(np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True), | ||
(np.all, {'A': pd.Series([0, 1], dtype=int)}, False), | ||
(np.any, {'A': pd.Series([0, 1], dtype=int)}, True), | ||
(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False), | ||
(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True), | ||
(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True), | ||
(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True), | ||
(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False), | ||
(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True), | ||
(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True), | ||
(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True), | ||
(np.all, {'A': pd.Series([0, 1], dtype='category')}, False), | ||
(np.any, {'A': pd.Series([0, 1], dtype='category')}, True), | ||
(np.all, {'A': pd.Series([1, 2], dtype='category')}, True), | ||
(np.any, {'A': pd.Series([1, 2], dtype='category')}, True), | ||
|
||
# # Mix | ||
# GH-21484 | ||
# (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), | ||
# 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), | ||
]) | ||
def test_any_all_np_func(self, func, data, expected): | ||
# https://github.com/pandas-dev/pandas/issues/19976 | ||
data = DataFrame(data) | ||
result = func(data) | ||
assert isinstance(result, np.bool_) | ||
assert result.item() is expected | ||
|
||
# method version | ||
result = getattr(DataFrame(data), func.__name__)(axis=None) | ||
assert isinstance(result, np.bool_) | ||
assert result.item() is expected | ||
|
||
def test_any_all_object(self): | ||
# https://github.com/pandas-dev/pandas/issues/19976 | ||
result = np.all(DataFrame(columns=['a', 'b'])).item() | ||
assert result is True | ||
|
||
result = np.any(DataFrame(columns=['a', 'b'])).item() | ||
assert result is False | ||
|
||
@pytest.mark.parametrize('method', ['any', 'all']) | ||
def test_any_all_level_axis_none_raises(self, method): | ||
df = DataFrame( | ||
{"A": 1}, | ||
index=MultiIndex.from_product([['A', 'B'], ['a', 'b']], | ||
names=['out', 'in']) | ||
) | ||
xpr = "Must specify 'axis' when aggregating by level." | ||
with tm.assert_raises_regex(ValueError, xpr): | ||
getattr(df, method)(axis=None, level='out') | ||
|
||
def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, | ||
has_bool_only=False): | ||
if frame is None: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not -> no longer ?