From 1ed21324db2477e9573d90abe5a8c253ac7f462b Mon Sep 17 00:00:00 2001 From: Austin Au-Yeung Date: Wed, 26 Jul 2023 01:54:01 -0500 Subject: [PATCH 1/6] Remove downcasting and update tests to expect floats --- pandas/core/reshape/pivot.py | 22 ------------ pandas/tests/reshape/test_pivot.py | 58 +++++++++++++++--------------- 2 files changed, 29 insertions(+), 51 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 099bfde7af1d3..3bef55463b036 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -172,28 +172,6 @@ def __internal_pivot_table( if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") - # gh-21133 - # we want to down cast if - # the original values are ints - # as we grouped with a NaN value - # and then dropped, coercing to floats - for v in values: - if ( - v in data - and is_integer_dtype(data[v]) - and v in agged - and not is_integer_dtype(agged[v]) - ): - if not isinstance(agged[v], ABCDataFrame) and isinstance( - data[v].dtype, np.dtype - ): - # exclude DataFrame case bc maybe_downcast_to_dtype expects - # ArrayLike - # e.g. test_pivot_table_multiindex_columns_doctest_case - # agged.columns is a MultiIndex and 'v' is indexing only - # on its first level. - agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) - table = agged # GH17038, this check should only happen if index is defined (not None) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 1e122442cd40c..540494b234691 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -204,7 +204,7 @@ def test_pivot_table_categorical(self): result = pivot_table(df, values="values", index=["A", "B"], dropna=True) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) - expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) + expected = DataFrame({"values": [1.0, 2.0, 3.0, 4.0]}, index=exp_index) tm.assert_frame_equal(result, expected) def test_pivot_table_dropna_categoricals(self, dropna): @@ -225,7 +225,7 @@ def test_pivot_table_dropna_categoricals(self, dropna): expected_columns = expected_columns.astype(CDT(categories, ordered=False)) expected_index = Series([1, 2, 3], name="B") expected = DataFrame( - [[0, 3, 6], [1, 4, 7], [2, 5, 8]], + [[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]], index=expected_index, columns=expected_columns, ) @@ -283,7 +283,7 @@ def test_pivot_with_non_observable_dropna_multi_cat(self, dropna): result = df.pivot_table(index="A", values="B", dropna=dropna) expected = DataFrame( - {"B": [2, 3, 0]}, + {"B": [2.0, 3.0, 0.0]}, index=Index( Categorical.from_codes( [0, 1, 2], categories=["low", "high", "left"], ordered=True @@ -300,7 +300,7 @@ def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 df = DataFrame({"A": interval_values, "B": 1}) result = df.pivot_table(index="A", values="B", dropna=dropna) - expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) + expected = DataFrame({"B": 1.0}, index=Index(interval_values.unique(), name="A")) if not dropna: expected = expected.astype(float) tm.assert_frame_equal(result, expected) @@ -444,7 +444,7 @@ def test_pivot_no_values(self): index=Grouper(freq="A"), columns=Grouper(key="dt", freq="M") ) exp = DataFrame( - [3], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns + [3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns ) tm.assert_frame_equal(res, exp) @@ -1059,7 +1059,7 @@ def test_pivot_table_multiindex_only(self, cols): result = df2.pivot_table(values="v", columns=cols) expected = DataFrame( - [[4, 5, 6]], + [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), index=Index(["v"]), ) @@ -1558,7 +1558,7 @@ def test_pivot_datetime_tz(self): exp_col1 = Index(["value1", "value1"]) exp_col2 = Index(["a", "b"], name="label") exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) - expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) + expected = DataFrame([[0.0, 3.0], [1.0, 4.0], [2.0, 5.0]], index=exp_idx, columns=exp_col) result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"]) tm.assert_frame_equal(result, expected) @@ -1570,18 +1570,30 @@ def test_pivot_datetime_tz(self): name="dt2", ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) - expected = DataFrame( + expected1 = DataFrame( np.array( [ - [0, 3, 1, 2, 0, 3, 1, 2], - [1, 4, 2, 1, 1, 4, 2, 1], - [2, 5, 1, 2, 2, 5, 1, 2], + [0, 3, 1, 2,], + [1, 4, 2, 1], + [2, 5, 1, 2], ], dtype="int64", ), index=exp_idx, - columns=exp_col, + columns=exp_col[:4], + ) + expected2 = DataFrame( + np.array( + [ + [0.0, 3.0, 1.0, 2.0], + [1.0, 4.0, 2.0, 1.0], + [2.0, 5.0, 1.0, 2.0], + ], + ), + index=exp_idx, + columns=exp_col[4:], ) + expected = concat([expected1, expected2], axis=1) result = pivot_table( df, @@ -1628,7 +1640,7 @@ def test_pivot_dtaccessor(self): exp_idx = Index(["a", "b"], name="label") expected = DataFrame( - {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + {7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]}, index=exp_idx, columns=Index([7, 8, 9], dtype=np.int32, name="dt1"), ) @@ -1639,7 +1651,7 @@ def test_pivot_dtaccessor(self): ) expected = DataFrame( - {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + {7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]}, index=Index([1, 2], dtype=np.int32, name="dt2"), columns=Index([7, 8, 9], dtype=np.int32, name="dt1"), ) @@ -1660,7 +1672,7 @@ def test_pivot_dtaccessor(self): names=["dt1", "dt2"], ) expected = DataFrame( - np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), + np.array([[0.0, 3.0, 1.0, 4.0, 2.0, 5.0]]), index=Index([2013], dtype=np.int32), columns=exp_col, ) @@ -1765,12 +1777,6 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): tm.assert_frame_equal(table, expected) def test_categorical_margins(self, observed, request): - if observed: - request.node.add_marker( - pytest.mark.xfail( - reason="GH#17035 (np.mean of ints is casted back to ints)" - ) - ) # GH 10989 df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} @@ -1784,12 +1790,6 @@ def test_categorical_margins(self, observed, request): tm.assert_frame_equal(table, expected) def test_categorical_margins_category(self, observed, request): - if observed: - request.node.add_marker( - pytest.mark.xfail( - reason="GH#17035 (np.mean of ints is casted back to ints)" - ) - ) df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} ) @@ -1816,7 +1816,7 @@ def test_margins_casted_to_float(self): result = pivot_table(df, index="D", margins=True) expected = DataFrame( - {"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]}, + {"A": [3.0, 7.0, 5], "B": [2.5, 6.5, 4.5], "C": [2.0, 5.0, 3.5]}, index=Index(["X", "Y", "All"], name="D"), ) tm.assert_frame_equal(result, expected) @@ -2249,7 +2249,7 @@ def test_pivot_table_sort_false_with_multiple_values(self): index=["lastname", "firstname"], values=["height", "age"], sort=False ) expected = DataFrame( - [[173, 47], [182, 33]], + [[173.0, 47.0], [182.0, 33.0]], columns=["height", "age"], index=MultiIndex.from_tuples( [("Foo", "John"), ("Bar", "Michael")], From 248d8c50882df119ffae04ea50717cd9672fb593 Mon Sep 17 00:00:00 2001 From: Austin Au-Yeung Date: Wed, 26 Jul 2023 02:58:44 -0500 Subject: [PATCH 2/6] Corrections from pre-commit --- pandas/core/reshape/pivot.py | 1 - pandas/tests/reshape/test_pivot.py | 19 ++++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 3bef55463b036..5c2e94735ddc5 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -20,7 +20,6 @@ from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - is_integer_dtype, is_list_like, is_nested_list_like, is_scalar, diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 540494b234691..27d3fdbda1ded 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -300,7 +300,9 @@ def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 df = DataFrame({"A": interval_values, "B": 1}) result = df.pivot_table(index="A", values="B", dropna=dropna) - expected = DataFrame({"B": 1.0}, index=Index(interval_values.unique(), name="A")) + expected = DataFrame( + {"B": 1.0}, index=Index(interval_values.unique(), name="A") + ) if not dropna: expected = expected.astype(float) tm.assert_frame_equal(result, expected) @@ -1558,7 +1560,9 @@ def test_pivot_datetime_tz(self): exp_col1 = Index(["value1", "value1"]) exp_col2 = Index(["a", "b"], name="label") exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) - expected = DataFrame([[0.0, 3.0], [1.0, 4.0], [2.0, 5.0]], index=exp_idx, columns=exp_col) + expected = DataFrame( + [[0.0, 3.0], [1.0, 4.0], [2.0, 5.0]], index=exp_idx, columns=exp_col + ) result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"]) tm.assert_frame_equal(result, expected) @@ -1573,7 +1577,12 @@ def test_pivot_datetime_tz(self): expected1 = DataFrame( np.array( [ - [0, 3, 1, 2,], + [ + 0, + 3, + 1, + 2, + ], [1, 4, 2, 1], [2, 5, 1, 2], ], @@ -1776,7 +1785,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): expected = DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) - def test_categorical_margins(self, observed, request): + def test_categorical_margins(self, observed): # GH 10989 df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} @@ -1789,7 +1798,7 @@ def test_categorical_margins(self, observed, request): table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) - def test_categorical_margins_category(self, observed, request): + def test_categorical_margins_category(self, observed): df = DataFrame( {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} ) From ea844fcc3fe0652d9c8194b90124c20d8fdd5043 Mon Sep 17 00:00:00 2001 From: Austin Au-Yeung Date: Wed, 26 Jul 2023 03:08:55 -0500 Subject: [PATCH 3/6] Add entry to whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d0dae450735a3..e2aac772a520f 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -640,6 +640,7 @@ Other - Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`) - Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`) - Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`) +- Bug in :meth:`DataFrame.pivot_table` with casting the mean of ints back to an int (:issue:`16676`) - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`) - Bug in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`DataFrameGroupBy.shift` when passing both "freq" and "fill_value" silently ignoring "fill_value" instead of raising ``ValueError`` (:issue:`53832`) - Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`) @@ -650,7 +651,6 @@ Other - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - Bug in :meth:`period_range` the default behavior when freq was not passed as an argument was incorrect(:issue:`53687`) - Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`) -- .. ***DO NOT USE THIS SECTION*** From cf0536c146b835ee0202ff1107706cbd7b6ba736 Mon Sep 17 00:00:00 2001 From: Austin Au-Yeung Date: Wed, 26 Jul 2023 10:03:28 -0500 Subject: [PATCH 4/6] Update other tests using pivot_table --- pandas/tests/frame/methods/test_drop.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 79fd48de91ed5..2c9c7c297140c 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -172,7 +172,7 @@ def test_drop_multiindex_not_lexsorted(self): lexsorted_mi = MultiIndex.from_tuples( [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] ) - lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) + lexsorted_df = DataFrame([[1, 3.0, 4.0]], columns=lexsorted_mi) assert lexsorted_df.columns._is_lexsorted() # define the non-lexsorted version diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c8de1cd6785b6..7d3139cab304d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1776,7 +1776,7 @@ def test_groupby_multiindex_not_lexsorted(): lexsorted_mi = MultiIndex.from_tuples( [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] ) - lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) + lexsorted_df = DataFrame([[1, 3.0, 4.0]], columns=lexsorted_mi) assert lexsorted_df.columns._is_lexsorted() # define the non-lexsorted version From ec7e9e6cd509bc8485c5b4f5681f2eee65225bd0 Mon Sep 17 00:00:00 2001 From: Austin Au-Yeung Date: Fri, 28 Jul 2023 17:26:36 -0500 Subject: [PATCH 5/6] Revert "Update other tests using pivot_table" This reverts commit cf0536c146b835ee0202ff1107706cbd7b6ba736. --- pandas/tests/frame/methods/test_drop.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 2c9c7c297140c..79fd48de91ed5 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -172,7 +172,7 @@ def test_drop_multiindex_not_lexsorted(self): lexsorted_mi = MultiIndex.from_tuples( [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] ) - lexsorted_df = DataFrame([[1, 3.0, 4.0]], columns=lexsorted_mi) + lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) assert lexsorted_df.columns._is_lexsorted() # define the non-lexsorted version diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7d3139cab304d..c8de1cd6785b6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1776,7 +1776,7 @@ def test_groupby_multiindex_not_lexsorted(): lexsorted_mi = MultiIndex.from_tuples( [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] ) - lexsorted_df = DataFrame([[1, 3.0, 4.0]], columns=lexsorted_mi) + lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) assert lexsorted_df.columns._is_lexsorted() # define the non-lexsorted version From 852acfdde4cb74510424f6d84871054de4733fa8 Mon Sep 17 00:00:00 2001 From: Austin Au-Yeung Date: Fri, 28 Jul 2023 17:57:33 -0500 Subject: [PATCH 6/6] Update tests without changing test inputs --- pandas/tests/frame/methods/test_drop.py | 5 +---- pandas/tests/groupby/test_groupby.py | 3 --- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 79fd48de91ed5..87b23b4ceba14 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -185,10 +185,7 @@ def test_drop_multiindex_not_lexsorted(self): not_lexsorted_df = not_lexsorted_df.reset_index() assert not not_lexsorted_df.columns._is_lexsorted() - # compare the results - tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - - expected = lexsorted_df.drop("a", axis=1) + expected = lexsorted_df.drop("a", axis=1).astype(float) with tm.assert_produces_warning(PerformanceWarning): result = not_lexsorted_df.drop("a", axis=1) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c8de1cd6785b6..7fda071b6729e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1789,9 +1789,6 @@ def test_groupby_multiindex_not_lexsorted(): not_lexsorted_df = not_lexsorted_df.reset_index() assert not not_lexsorted_df.columns._is_lexsorted() - # compare the results - tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - expected = lexsorted_df.groupby("a").mean() with tm.assert_produces_warning(PerformanceWarning): result = not_lexsorted_df.groupby("a").mean()