diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index c743e38a118f7..df3ec3b9da8b1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -364,10 +364,29 @@ second column is instead renamed to ``a.2``. res -.. _whatsnew_140.notable_bug_fixes.notable_bug_fix3: +.. _whatsnew_140.notable_bug_fixes.unstack_pivot_int32_limit: -notable_bug_fix3 -^^^^^^^^^^^^^^^^ +unstack and pivot_table no longer raise ValueError for results that would exceed int32 limit +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously :meth:`DataFrame.pivot_table` and :meth:`DataFrame.unstack` would raise a ``ValueError`` if the operation +could produce a result with more than ``2**31 - 1`` elements. This operation now emits a :class:`errors.PerformanceWarning` +instead (:issue:`26314`). + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df = DataFrame({"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0}) + In [4]: df.pivot_table(index="ind1", columns="ind2", values="count", aggfunc="count") + ValueError: Unstacked DataFrame is too big, causing int32 overflow + +*New behavior*: + +.. code-block:: ipython + + In [4]: df.pivot_table(index="ind1", columns="ind2", values="count", aggfunc="count") + PerformanceWarning: The following operation may generate 4294967296 cells in the resulting pandas object. .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c2cd73584b7da..a570af1f949d7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -2,6 +2,7 @@ import itertools from typing import TYPE_CHECKING +import warnings import numpy as np @@ -11,6 +12,7 @@ Dtype, npt, ) +from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote @@ -125,10 +127,15 @@ def __init__(self, index: MultiIndex, level=-1, constructor=None): num_columns = self.removed_level.size # GH20601: This forces an overflow if the number of cells is too high. - num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) - - if num_rows > 0 and num_columns > 0 and num_cells <= 0: - raise ValueError("Unstacked DataFrame is too big, causing int32 overflow") + num_cells = num_rows * num_columns + + # GH 26314: Previous ValueError raised was too restrictive for many users. 
+ if num_cells > np.iinfo(np.int32).max: + warnings.warn( + f"The following operation may generate {num_cells} cells " + f"in the resulting pandas object.", + PerformanceWarning, + ) self._make_selectors() diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 689c54b03b507..4b3ddbc6c193c 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import ( DataFrame, @@ -1819,11 +1821,17 @@ def test_unstack_unobserved_keys(self): @pytest.mark.slow def test_unstack_number_of_levels_larger_than_int32(self): # GH#20601 + # GH 26314: Change ValueError to PerformanceWarning df = DataFrame( np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)] ) - with pytest.raises(ValueError, match="int32 overflow"): - df.unstack() + msg = "The following operation may generate" + with tm.assert_produces_warning(PerformanceWarning, match=msg): + try: + df.unstack() + except MemoryError: + # Just checking the warning + return def test_stack_order_with_unsorted_levels(self): # GH#16323 diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 88607f4b036a0..9bfda33956287 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import ( Categorical, @@ -1991,15 +1993,20 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 + # GH 26314: Change ValueError to PerformanceWarning df = DataFrame( {"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0} ) - msg = "Unstacked DataFrame is too big, causing int32 overflow" - with pytest.raises(ValueError, match=msg): - 
df.pivot_table( - index="ind1", columns="ind2", values="count", aggfunc="count" - ) + msg = "The following operation may generate" + with tm.assert_produces_warning(PerformanceWarning, match=msg): + try: + df.pivot_table( + index="ind1", columns="ind2", values="count", aggfunc="count" + ) + except MemoryError: + # Just checking the warning + return def test_pivot_table_aggfunc_dropna(self, dropna): # GH 22159